Example #1
    def tokenz(self):
        # Load the previously saved token table from CSV; fall back to an
        # empty DataFrame if the file is missing or unreadable.
        # (Assumes `import pandas as pd` at module level.)
        env = Environment()
        df_tokenz = pd.DataFrame()
        file_tokenz = env.filename_tokenz_csv()
        try:
            df_tokenz = pd.read_csv(file_tokenz,
                                    index_col='idcorpus',
                                    encoding='utf-8')
        except (OSError, pd.errors.ParserError):
            env.debug(1, ['Failed to read tokenz file:', file_tokenz])
        else:
            env.debug(1, ['Read tokenz OK:', file_tokenz])
        return df_tokenz
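
The pattern above (read a CSV into a DataFrame, falling back to an empty frame and logging either outcome) can be reproduced without the Environment helper. A minimal standalone sketch, assuming only pandas and a hypothetical tokenz.csv path:

import pandas as pd

def load_tokenz(path='tokenz.csv'):
    # Try to load the token table; a missing or unparsable file yields an
    # empty DataFrame instead of raising.
    try:
        return pd.read_csv(path, index_col='idcorpus', encoding='utf-8')
    except (OSError, pd.errors.ParserError) as err:
        print('Failed to read tokenz file:', path, err)
        return pd.DataFrame()

df = load_tokenz()
print(df.shape)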
Example #2
    def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        t_start = timer()
        # Load the token table if none was supplied; optionally work on a
        # random sample (n_frac < 1) to speed up experiments.
        if dftokenz.empty:
            dftokenz = self.tokenz()
        if n_frac < 1:
            dftokenz = dftokenz.sample(frac=n_frac)
        env.debug(
            1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])

        # Per-POS (gram) counts of the training set, saved for later inspection.
        gmask = dftokenz.groupby(['gram'])
        df_posstat = gmask.count()
        df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
        print('POSTagger', 'train dataset stat:\n', df_posstat)

        # Feature columns produced by Word_Encoder.s2token(): string features
        # ('s_' prefix) are initialised to '', numeric ones to 0.0.
        fields = [
            's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token',
            'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3'
        ]
        for field in fields:
            dftokenz[field] = '' if field.startswith('s') else 0.0

        # One binary column per letter bigram; filled in below when the
        # bigram occurs in a word.
        s_letters = env.list_rus_letters()
        di_letters = env.di_bgm_byletters
        bgm_columns = env.bgm_columns_list(mode=1)
        for column_name in bgm_columns:
            dftokenz[column_name] = None

        t_end = timer()
        env.debug(1, [
            'POStagger', 'Letters bigram columns added',
            env.job_time(t_start, t_end)
        ])

        # Form tokenz: encode every word and write its features back into the frame.
        t_start = timer()
        for index, serie in dftokenz.iterrows():
            a_word = enc.s2token(index, serie)
            # a_word[0] is the word string; its feature values start at index 2.
            for i, field in enumerate(fields, start=2):
                dftokenz.at[index, field] = a_word[i]
            # Letter-bigram binaries: flag every adjacent letter pair that has
            # a dedicated column.
            for n_l in range(len(a_word[0]) - 1):
                di_n = di_letters.get('%s%s' %
                                      (a_word[0][n_l], a_word[0][n_l + 1]))
                if di_n is not None:
                    dftokenz.at[index, bgm_columns[di_n]] = 1
        t_end = timer()
        env.debug(
            1,
            ['Transforming to tokenz: COMPLETE',
             env.job_time(t_start, t_end)])
        if persistent:
            dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
            env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
        return dftokenz
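
The heart of tokenize() is the letter-bigram encoding: each adjacent pair of letters from a reference alphabet gets its own binary column, and every word flags the columns of the bigrams it contains. A self-contained sketch of that idea, with a toy alphabet and sample words standing in for what Environment and the corpus would supply:

import itertools
import pandas as pd

# Toy alphabet; the original code uses env.list_rus_letters() instead.
letters = 'abc'
di_letters = {a + b: i
              for i, (a, b) in enumerate(itertools.product(letters, repeat=2))}
bgm_columns = ['bgm_%s' % bg for bg in di_letters]

words = ['abba', 'cab']
df = pd.DataFrame(0, index=words, columns=bgm_columns)

for word in words:
    # Mark every adjacent letter pair that has a dedicated column.
    for n_l in range(len(word) - 1):
        di_n = di_letters.get(word[n_l:n_l + 2])
        if di_n is not None:
            df.at[word, bgm_columns[di_n]] = 1

print(df)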