# Module-level imports assumed elsewhere in this file (not shown in this
# section): pandas as pd, a timer (e.g. from timeit import default_timer
# as timer), and the project-local Environment and Word_Encoder classes.

def tokenz(self):
    """Load the cached tokenz dataset from CSV; return an empty frame on failure."""
    env = Environment()
    df_tokenz = pd.DataFrame()
    file_tokenz = env.filename_tokenz_csv()
    try:
        df_tokenz = pd.read_csv(file_tokenz, index_col='idcorpus', encoding='utf-8')
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt pass through
        env.debug(1, ['Failed to read tokenz file:', file_tokenz])
    else:
        env.debug(1, ['Read tokenz OK:', file_tokenz])
    return df_tokenz
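# Usage sketch (hedged): assumes these methods live on a POSTagger-style class
# and that a tokenz CSV has already been produced; the names below are
# illustrative, not part of the project's confirmed API.
#
#   tagger = POSTagger()
#   df = tagger.tokenz()        # empty DataFrame if the CSV is missing or unreadable
#   if df.empty:
#       df = tagger.tokenize()  # rebuild the features from scratch (see below)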
def tokenize(self, dftokenz=None, persistent=True, n_frac=1):
    """Build per-word features (affixes, lengths, letter-bigram flags) for the tokenz set."""
    env = Environment()
    enc = Word_Encoder()
    t_start = timer()
    # None default replaces the mutable pd.DataFrame() default; behavior is unchanged
    if dftokenz is None or dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        # Work on a random fraction of the rows for quick experiments
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])

    # Per-POS counts of the training set, persisted for later inspection
    gmask = dftokenz.groupby(['gram'])
    df_posstat = gmask.count()
    df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
    print('POSTagger', 'train dataset stat:\n', df_posstat)

    fields = ['s_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
              'n_token', 'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3']
    # Initialise feature columns: empty string for s_* fields, 0.0 for n_* fields
    for field in fields:
        dftokenz[field] = '' if field[0] == 's' else 0.0

    # One binary column per known letter bigram
    di_letters = env.di_bgm_byletters
    bgm_columns = env.bgm_columns_list(mode=1)
    for column_name in bgm_columns:
        dftokenz[column_name] = None
    t_end = timer()
    env.debug(1, ['POStagger', 'Letters bigram columns added',
                  env.job_time(t_start, t_end)])

    # Form tokenz: fill the feature columns row by row
    t_start = timer()
    for index, serie in dftokenz.iterrows():
        a_word = enc.s2token(index, serie)
        # a_word[0] is the word string; the derived features start at a_word[2]
        for i, field in enumerate(fields, start=2):
            dftokenz.at[index, field] = a_word[i]
        # Letter-bigram binaries: flag every adjacent letter pair found in the word
        for n_l in range(len(a_word[0]) - 1):
            di_n = di_letters.get('%s%s' % (a_word[0][n_l], a_word[0][n_l + 1]))
            if di_n is not None:
                dftokenz.at[index, bgm_columns[di_n]] = 1
    t_end = timer()
    env.debug(1, ['Transforming to tokenz: COMPLETE', env.job_time(t_start, t_end)])

    if persistent:
        dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
        env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
    return dftokenz
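# Usage sketch (hedged): a quick, non-persistent feature pass over a 10% sample.
# POSTagger is an assumed name for the enclosing class; the keyword arguments
# follow the signature above.
#
#   tagger = POSTagger()
#   df_features = tagger.tokenize(n_frac=0.1, persistent=False)
#   print(df_features[['s_suffix2', 'n_len']].head())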