def grammemes_xml2csv(self, persistent=True):
    env = Environment()
    filename_gram = env.filename_grammemes_xml()
    dfcols = ['name', 'alias', 'description']
    df_xml = pd.DataFrame(columns=dfcols)
    try:
        tree = ET.ElementTree(file=filename_gram)
    except:
        env.debug(1, ['Failed to load grammemes from XML:', filename_gram])
    else:
        env.debug(1, ['Read grammemes:', filename_gram])
        for elem in tree.iter('grammeme'):
            # print(elem.tag, elem.attrib)
            sattr = elem.attrib.get('include')
            if sattr == 'on':
                sname = sali = sdesc = ''
                for child in elem:
                    if child.tag.lower() == 'name':
                        sname = child.text.upper()
                    elif child.tag.lower() == 'alias':
                        sali = child.text.upper()
                    elif child.tag.lower() == 'description':
                        sdesc = child.text.lower()
                s = pd.Series(data=[sname, sali, sdesc], index=dfcols)
                df_xml = df_xml.append(s, ignore_index=True)
    df_xml.index.name = 'idgram'
    if persistent:
        filename_csv = env.filename_grammemes_csv()
        env.debug(1, ['Write grammemes to CSV:', filename_csv])
        df_xml.to_csv(filename_csv, encoding='utf-8')
    return df_xml
def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
    env = Environment()
    df_voc = pd.DataFrame()
    # dfgram = self.grammemes()
    for i in range(n_min, n_max + 1):
        file_csv = env.filename_corpus_csv(i)
        try:
            dffile = pd.read_csv(file_csv, index_col='idcorpus', encoding='utf-8')
        except:
            env.debug(1, ['Failed to read corpus file:', file_csv])
        else:
            env.debug(1, ['Read OK:', file_csv])
            if not dffile.empty:
                df_voc = df_voc.append(dffile)
    df_voc = df_voc.drop_duplicates()
    df_voc.columns = ['word', 'gram', 'idgram']
    df_voc = df_voc.reset_index(drop=True)
    df_voc.index.name = 'idcorpus'
    if persistent:
        file_voc = env.filename_vocabulary_csv()
        env.debug(1, ['Write vocabulary to CSV:', file_voc])
        df_voc.to_csv(file_voc, encoding='utf-8')
    return df_voc
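# --- Usage sketch (not part of the original module) ---
# A minimal example of building the vocabulary CSV from the OpenCorpora files.
# Assumptions: OpenCorpus is the class owning grammemes_xml2csv(),
# corpus_xml2csv() and vocabulary_from_corpus(), and the file locations are
# resolved by Environment; adjust the import to the real module name.
# from mlivos_corpus import OpenCorpus  # hypothetical module path
corpus = OpenCorpus()
corpus.grammemes_xml2csv()              # grammemes reference XML -> CSV
for i in range(1, 11):
    corpus.corpus_xml2csv(i)            # each annotated corpus file XML -> CSV
df_voc = corpus.vocabulary_from_corpus(n_min=1, n_max=10)
print(df_voc.head())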
def get_texts_stat(self, mode='train'):
    # Prepare the data
    env = Environment()
    if mode == 'train':
        file_res = env.filename_results_csv()
    if mode == 'test':
        file_res = env.filename_stat_test_csv()
    authors = pd.read_csv(env.filename_authors_csv(), index_col='idauthor', encoding='utf-8')
    data = pd.read_csv(file_res, index_col='idstat', encoding='utf-8')
    data.drop(columns=['file', 'idchunk'], inplace=True)
    columns = data.columns
    group = data.groupby(['idtext', 'idauthor', 'author', 'name'])
    group = group.agg({
        'sentences_text': ['mean'], 'words_text': ['mean'], 'sentence_mean': ['mean'],
        'sentences_chunk': ['mean'], 'words_chunk': ['mean'], 'words_uniq_chunk': ['mean'],
        'uniq_per_sent_chunk': ['mean'], 'uniq_per_words_chunk': ['mean'],
        'NOUN': ['mean'], 'ADJF': ['mean'], 'ADJS': ['mean'], 'COMP': ['mean'],
        'VERB': ['mean'], 'INFN': ['mean'], 'PRTF': ['mean'], 'PRTS': ['mean'],
        'GRND': ['mean'], 'NUMR': ['mean'], 'ADVB': ['mean'], 'NPRO': ['mean'],
        'PRED': ['mean'], 'PREP': ['mean'], 'CONJ': ['mean'], 'PRCL': ['mean'],
        'INTJ': ['mean'],
        'predict': ['sum']
    })
    group.columns = columns[4:]
    group.reset_index(inplace=True)
    data = pd.merge(group, authors, on='idauthor', how='left', suffixes=('', '_author'))
    if mode == 'test':
        data['predict'] = data['predict'].astype(int)
        data = pd.merge(data, authors, left_on='predict', right_on='idauthor',
                        how='left', suffixes=('', '_predict'))
    return data
def word2token(self, s):
    t_start = timer()
    env = Environment()
    bgm_columns = env.bgm_columns_list(mode=1)
    n_shift = 5
    a_result = np.zeros(len(bgm_columns) + n_shift)
    a_result[0] = len(s)
    a_result[1] = self.s_encode(s[-2:])  # ts2: 2-character suffix
    a_result[2] = self.s_encode(s[-3:])  # ts3: 3-character suffix
    a_result[3] = self.s_encode(s[:2])   # tp2: 2-character prefix
    a_result[4] = self.s_encode(s[:3])   # tp3: 3-character prefix
    t_end = timer()
    # env.debug(1, ['WordEncoder', 'word2token', '%s without bgm takes %s sec.' % (s, env.job_time(t_start, t_end))])
    # t_start = timer()
    di_letters = env.di_bgm_byletters
    di_word = {}
    # Binary features: which letter bigrams occur in the word
    for n_l in range(0, len(s) - 1):
        n_l2 = n_l + 1
        di_n = di_letters.get('%s%s' % (s[n_l], s[n_l2]))
        if di_n is not None:
            a_result[di_n + n_shift] = 1
    t_end = timer()
    # env.debug(1, ['WordEncoder', 'word2token', '%s takes %s sec.' % (s, env.job_time(t_start, t_end))])
    return a_result
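# --- Usage sketch (not part of the original module) ---
# Encodes a single word into the numeric feature vector used by the POS model:
# [length, suffix-2, suffix-3, prefix-2, prefix-3, letter-bigram indicators...].
# Assumption: Word_Encoder is the class that owns word2token/s_encode.
enc = Word_Encoder()
features = enc.word2token('паровоз')
print(features[:5])              # the five scalar features
print(int(features[5:].sum()))   # how many known letter bigrams were found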
def model_train(self):
    env = Environment()
    data = self.stat()
    t_start = timer()
    y, X = self.model_prepare_data(data)
    seed = 241
    scoring = 'accuracy'
    n_splits = 4
    frac_test_size = 0.25
    # Cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # clf = DecisionTreeClassifier(criterion='gini', random_state=seed)
    # clf = GradientBoostingClassifier(n_estimators=50)
    model = xgb.XGBClassifier(n_estimators=400, max_depth=24,
                              colsample_bytree=1, subsample=1, seed=seed)
    cv_scores = cross_val_score(model, X, y, cv=kf)
    # Hold-out evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=frac_test_size, random_state=seed)
    eval_set = [(X_train, y_train), (X_test, y_test)]
    f_eval = 'merror'
    # f_eval = 'mlogloss'
    model.fit(X_train, y_train, eval_metric=f_eval, eval_set=eval_set,
              verbose=False, early_stopping_rounds=10)
    ev_scores = model.evals_result()
    cv_mean = np.array(cv_scores.mean())
    ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
    # Inspect the feature importances of the model
    xgb.plot_importance(model)
    plt.show()
    # Retrain the model on the full data set
    model.fit(X, y, verbose=False)
    # Save the model to disk
    pickle.dump(model, open(env.filename_model_texts(), 'wb'))
    print('Cross-validation: mean', cv_mean, 'eval_set mean', ev_mean)
    return model
def visit_CombinatorNode(self, node):
    env = Environment()
    for parameter in node.parameters():
        env.add(str(parameter))
    d = env.index
    self.visit('E', node.body(), env=env)
    self.code.Update(d)
    self.code.Pop(d)
    self.code.Unwind()
    self.symtab[node.name()].code = self.code.clone()
    self.code.clear()
def model_predict(self, df, b_retrain=False):
    env = Environment()
    y, X = self.model_prepare_data(df, mode='test')
    if b_retrain:
        # Retrain the model from scratch for every test run
        model = self.model_train()
    else:
        # Load the previously trained model from disk
        model = pickle.load(open(env.filename_model_texts(), 'rb'))
    # Predict
    y = model.predict(X)
    return y
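# --- Usage sketch (not part of the original module) ---
# model_predict expects rows shaped like the test statistics file (it still
# carries 'file', 'idchunk' and 'predict'), which is how the predict() method
# later in this file calls it. Assumption: mlAnalyzer is the owning class.
analyzer = mlAnalyzer()
env = Environment()
df_stat = pd.read_csv(env.filename_stat_test_csv(), index_col='idstat', encoding='utf-8')
print(analyzer.model_predict(df_stat))  # predicted author ids, one per row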
def pos_word_by_ml(self, awords):
    env = Environment()
    enc = Word_Encoder()
    file_model = env.filename_model_tree()
    clf = pickle.load(open(file_model, 'rb'))
    a_predict = np.array([enc.word2token('')])
    for word in awords:
        a_padd = [enc.word2token(word)]
        # print(word, a_padd)
        a_predict = np.append(a_predict, a_padd, axis=0)
    a_predict = a_predict[1:]
    predictions = clf.predict(a_predict[:, 0:])
    return predictions[0:]
def model_prepare_data(self, df, mode='train'):
    env = Environment()
    data = df.copy()
    data.drop(columns=['file', 'idchunk', 'predict'], inplace=True)
    columns = data.columns
    # idstat,idtext,idchunk,idauthor,author,name,file,words_all,words_chunk,sentences_all,sentence_mean,words_uniq,uniq_per_words,NOUN,ADJF,ADJS,COMP,VERB,INFN,PRTF,PRTS,GRND,NUMR,ADVB,NPRO,PRED,PREP,CONJ,PRCL,INTJ
    columns2drop = [
        'idtext', 'idauthor', 'author', 'name', 'sentences_text', 'words_text',
        'sentences_chunk', 'words_chunk', 'words_uniq_chunk'
    ]
    # New features
    # Extra statistical columns that could help the model:
    # data['words_uniq_per_sentense'] = data['words_uniq'] / data['sentences_all']  # unique words per sentence
    # data['words_uniq_3k'] = data['words_uniq'] / 3000    # unique words per 3k words
    # data['words_uniq_10k'] = data['words_uniq'] / 10000  # unique words per 10k words
    y = None
    if mode == 'train':
        y = data['idauthor']
    X = data.drop(columns=columns2drop)
    # Add PCA features
    n_components = 4
    pca_cols2drop = ['sentence_mean', 'uniq_per_sent_chunk', 'uniq_per_words_chunk']
    if mode == 'train':
        # Fit the projection matrix on the training features
        pca_pos = PCA(n_components=n_components)
        X_new = pca_pos.fit_transform(X.drop(columns=pca_cols2drop), y)
        print('PCA ratio %s components quality: %s' %
              (n_components, round(np.sum(pca_pos.explained_variance_ratio_), 4)),
              pca_pos.explained_variance_ratio_)
        pickle.dump(pca_pos, open(env.filename_model_texts_pca(), 'wb'))
    if mode == 'test':
        # Project the features with the previously fitted PCA matrix
        pca_pos = pickle.load(open(env.filename_model_texts_pca(), 'rb'))
        X_new = pca_pos.transform(X.drop(columns=pca_cols2drop))
    for i in range(0, n_components):
        X['pca_%s' % i] = X_new[:, i]
    return y, X
def main():
    parser = argparse.ArgumentParser(description="RL exercise.")
    ADD = parser.add_argument
    ADD('-e', '--environment', default='CartPole-v1',
        help="Name of the OpenAI gym environment to train on.")
    ADD('-m', '--numEpisodesPerEval', type=int, default=100,
        help="Number of episodes per policy update iteration.")
    ADD('-n', '--numIterations', type=int, default=1000,
        help="Number of policy updates.")
    ADD('-r', '--renderEvery', type=int, default=0,
        help="Render every nth episode. 0 to disable.")
    ADD('-l', '--learningRate', type=float, default=0.1,
        help="Learning rate of policy update.")
    ADD('-z', '--sigma', type=float, default=0.2,
        help="Standard deviation of policy for continuous actions. Exploration noise.")
    ADD('-p', '--populationSize', type=int, default=64,
        help="Population size.")
    args = parser.parse_args()

    # Create the environment, the policy and the training algorithm.
    env = Environment(args.environment, args.numEpisodesPerEval, args.renderEvery)
    policy = DiscretePolicy(env)
    algo = EvolutionStrategies(policy, populationSize=args.populationSize,
                               sigma=args.sigma, learnRate=args.learningRate)

    # Train the policy.
    algo.trainPolicy(policy, env, args.numIterations)
def stat(self):
    env = Environment()
    data = pd.DataFrame()
    file_stat = env.filename_results_csv()
    try:
        data = pd.read_csv(file_stat, index_col='idstat', encoding='utf-8')
    except:
        env.debug(1, ['Failed to read stat file:', file_stat])
    else:
        env.debug(1, ['Read stat file OK:', file_stat])
    # print(data)
    return data
def tokenz(self):
    env = Environment()
    df_tokenz = pd.DataFrame()
    file_tokenz = env.filename_tokenz_csv()
    try:
        df_tokenz = pd.read_csv(file_tokenz, index_col='idcorpus', encoding='utf-8')
    except:
        env.debug(1, ['Failed to read tokenz file:', file_tokenz])
    else:
        env.debug(1, ['Read tokenz OK:', file_tokenz])
    return df_tokenz
def authors(self, mode=0):
    env = Environment()
    df = pd.DataFrame()
    filename = env.filename_authors_csv()
    try:
        df = pd.read_csv(filename, index_col='idauthor', encoding='utf-8')
    except:
        env.debug(1, ['Failed to load authors CSV file', filename])
    else:
        env.debug(1, ['Load authors CSV file', filename])
    if mode == 1:
        return df.to_dict().get('name')
    else:
        return df
def corpus_xml2txt(self, num=1, persistent=True):
    result = True
    env = Environment()
    file_xml = env.filename_corpus_xml(num)
    try:
        tree = ET.ElementTree(file=file_xml)
    except:
        env.debug(1, ['Failed to load XML:', file_xml])
        result = False
    else:
        file_txt = env.filename_corpus_txt(num)
        file = open(file_txt, mode='w')
        for elem in tree.iter('source'):
            # print(elem.text, elem.tag, elem.attrib)
            file.write(elem.text)
            file.write(' ')
        file.close()
        env.debug(1, ['Write corpus file to TXT:', file_txt])
    return result
def grammemes(self, mode=0):
    env = Environment()
    dfgram = pd.DataFrame()
    filename_gram = env.filename_grammemes_csv()
    try:
        dfgram = pd.read_csv(filename_gram, index_col='idgram', encoding='utf-8')
    except:
        env.debug(1, ['Failed to load grammemes CSV file', filename_gram])
    else:
        env.debug(1, ['Load grammemes CSV file', filename_gram])
    if mode == 1:
        return dfgram.to_dict().get('name')
    else:
        return dfgram
def main():
    parser = argparse.ArgumentParser(description="RL exercise.")
    ADD = parser.add_argument
    ADD('-e', '--environment', default='CartPole-v1',
        help="Name of the OpenAI gym environment to train on.")
    ADD('-m', '--numEpisodesPerEval', type=int, default=100,
        help="Number of episodes per policy update iteration.")
    ADD('-n', '--numIterations', type=int, default=1000,
        help="Number of policy updates.")
    ADD('-l', '--learningRate', type=float, default=0.1,
        help="Learning rate of policy update.")
    ADD('-g', '--gamma', type=float, default=0.99,
        help="Rewards discount factor.")
    ADD('-r', '--renderEvery', type=int, default=0,
        help="Render every nth episode. 0 to disable.")
    args = parser.parse_args()

    # Create the environment, the policy and the training algorithm.
    env = Environment(args.environment, args.numEpisodesPerEval, args.renderEvery)
    policy = DiscretePolicy(env)
    algo = ReinforceAlgorithm(policy, gamma=args.gamma, learnRate=args.learningRate)

    # Train the policy.
    algo.trainPolicy(policy, env, args.numIterations)
def main():
    pd.set_option("display.max_columns", 100)
    pd.set_option('display.width', 1000)
    # Helper classes
    env = Environment()
    c = OpenCorpus()
    t = POSTagger()
    a = mlAnalyzer()
    enc = Word_Encoder()
    g = pd.DataFrame()
    g = c.grammemes()
    dg = c.grammemes(mode=1)  # Parts-of-speech reference; mode=1 returns a python dict
    da = c.authors(mode=1)    # Authors reference
    # Example: process texts from texts_train and append their statistics to results
    # a_texts_train = [1, 16]
    # a_texts_train = [48]
    # for i in a_texts_train:
    #     a.process_from_texts_file([i])
    # Example: visualize the statistics from results in 2D
    # a.vizualize2d()
    # Example: visualize part-of-speech statistics
    # t.vizualize2d(n_frac=0.001)
    # Predict the author of a text from text_test
    # [0, 1, 2, 3, 4]  # predicting all texts takes a long time
    text2predict = [3]
    y = a.predict(text2predict)  # predict: pass the text ids
    j = 0
    for i in y:
        print('idtext=%s' % text2predict[j], da.get(i))
        j = j + 1
def tokenz_create_stat(self, dftokenz=pd.DataFrame(), n_frac=1):
    env = Environment()
    enc = Word_Encoder()
    di_letters = Environment.di_bgm_byletters
    bgm_columns = env.bgm_columns_list(mode=1)
    t_start = timer()
    if dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['POStagger', 'create_stat',
                  'Collecting statistic START %s words' % dftokenz.shape[0]])
    di_tokenz_stat = (dftokenz.count()).to_dict()
    di_tokenz_res = {}
    # print('di_letters', di_letters)
    print('di_tokenz_stat', di_tokenz_stat)
    bgm_astat = [['init', 0]]
    bgm_index = []
    for key in di_letters:
        di_n = di_letters.get(key)
        column_stat = di_tokenz_stat.get(bgm_columns[di_n])
        bgm_astat.append([key, column_stat])
        bgm_index.append(di_n)
    bgm_astat = bgm_astat[1:]
    print('column stat', bgm_astat)
    df_bgm_stat = pd.DataFrame(data=bgm_astat, columns=['bigram', 'counts'], index=bgm_index)
    df_bgm_stat.index.name = 'idbigram'
    df_bgm_stat = df_bgm_stat.sort_values(by=['counts'], ascending=False)
    print('bgm_stat\n', df_bgm_stat)
    df_bgm_stat.to_csv(env.filename_stat_bigram_letters_csv(), encoding='utf-8')
def vizualize2d(self, n_frac=0.01, b_annotations=False):
    n_components = 2
    env = Environment()
    c = OpenCorpus()
    di_g = c.grammemes(mode=1)
    data = self.tokenz().sample(frac=n_frac)
    data = data.fillna(0)
    tdf = pd.DataFrame(index=data.index)
    tdf['idgram'] = data['idgram']
    tdf['gram'] = data['gram']
    tdf['word'] = data['word']
    drop_columns = ['word', 'gram', 's_suffix2', 's_suffix3',
                    's_prefix2', 's_prefix3', 'n_token']  # , 'bgm_l_None'
    # drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max())
    #                      if 'bgm_l_%s' % (i) not in bgm_columns])
    env.debug(1, ['POStagger', 'visualize2D', 'Drop colums: %s' % (drop_columns)])
    data = data.drop(columns=drop_columns, axis=1)
    values = data.values
    X = values[:, 1:]
    y = values[:, 0]
    # Scalers
    sc = StandardScaler()
    min_max_scaler = preprocessing.MinMaxScaler()
    max_abs_scaler = preprocessing.MaxAbsScaler()
    # X = sc.fit_transform(X)
    # Dimensionality reduction: PCA or MDS
    b_pca = False
    b_sne = True
    if b_pca:
        model = PCA(n_components=n_components)
    if b_sne:
        model = MDS(n_components=n_components)  # TSNE
    X_new = model.fit_transform(X, y)
    if b_pca:
        print('PCA ratio', n_components, 'components', model.explained_variance_ratio_)
        X_new = max_abs_scaler.fit_transform(X_new)
    tdf['PC1'] = X_new[:, 0]
    tdf['PC2'] = X_new[:, 1]
    df_groups = tdf.groupby('idgram').count()
    tdf['counts'] = 0
    for index, serie in tdf.iterrows():
        n_idgram = tdf.at[index, 'idgram']
        tdf.at[index, 'counts'] = df_groups[df_groups.index == n_idgram]['gram']
    tdf = tdf.sort_values(by=['counts'], ascending=False)
    # Draw
    i = 0
    N = df_groups.shape[0]
    s_title = ''
    if b_pca:
        s_title = '2 component PCA. Точность %s' % (round(
            sum(float(i) for i in model.explained_variance_ratio_), 2))
    if b_sne:
        s_title = 't-SNE'
    # Plotly
    if False:
        py.sign_in('shashmaxus', 'AdfwTulrOoV3cSlbZT3B')
        c = ['hsl(' + str(h) + ',50%' + ',50%)' for h in np.linspace(0, 360, N)]
        data_trace = []
        for index, row in df_groups.iterrows():
            df_trace = tdf[tdf['idgram'] == index]
            g_trace = go.Scatter(
                x=df_trace['PC1'].values,
                y=df_trace['PC2'].values,
                name=df_trace['gram'].values[0],
                mode='markers',  # 'markers+text'
                marker=dict(size=8, color=i, opacity=0.8, colorscale='Viridis'),
                text=df_trace['word'],
                textfont=dict(family='sans serif', size=12))
            data_trace.append(g_trace)
            i += 1
        layout = go.Layout(
            title=s_title,
            xaxis=dict(title=('Component 1. Вклад %s' %
                              (round(model.explained_variance_ratio_[0], 2)))),
            yaxis=dict(title=('Component 2. Вклад %s' %
                              (round(model.explained_variance_ratio_[1], 2)))))
        fig2 = go.Figure(data=data_trace, layout=layout)
        py.image.save_as(fig2, filename='c:/prj/mlivos_data/temp/Words2.png')
    # Bokeh
    if True:
        palette = d3['Category20'][len(tdf['gram'].unique())]
        # palette = all_palettes['Category20'][len(tdf['gram'].unique())]
        # palette = Viridis256
        color_map = CategoricalColorMapper(factors=tdf['gram'].unique(), palette=palette)
        fig = figure(title=s_title, toolbar_location=None)
        source = ColumnDataSource(tdf[['gram', 'PC1', 'PC2']])
        fig.scatter(x='PC1', y='PC2', size=12,
                    color={'field': 'gram', 'transform': color_map},
                    legend='gram', source=source)
        show(fig)
        export_png(fig, filename="c:/prj/mlivos_data/temp/PCA.png")
    return 0
def predict(self, aidtext, b_makestat=False):
    env = Environment()
    # Open the statistics file for the test texts
    df_stat = pd.read_csv(env.filename_stat_test_csv(), index_col='idstat',
                          encoding='utf-8')  # statistics of the test texts
    df_texts = pd.read_csv(env.filename_predict_csv(), index_col='idtext',
                           encoding='utf-8')  # registry of texts
    mask = df_texts.index.isin(aidtext)
    df_texts = df_texts[mask]
    columns = ['idtext', 'idchunk', 'idauthor', 'author', 'name', 'file',
               'sentences_text', 'words_text', 'sentence_mean',
               'sentences_chunk', 'words_chunk', 'words_uniq_chunk',
               'uniq_per_sent_chunk', 'uniq_per_words_chunk',
               'NOUN', 'ADJF', 'ADJS', 'COMP', 'VERB', 'INFN', 'PRTF', 'PRTS',
               'GRND', 'NUMR', 'ADVB', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL',
               'INTJ', 'predict']
    y_result = []
    # If the statistics for the test texts have to be built first
    if b_makestat:
        for index, row in df_texts.iterrows():  # for every text that has to be processed
            file_txt = df_texts.at[index, 'filename']
            # Read the text file
            env.debug(1, ['Analyzer', 'predict', 'START file TXT:', file_txt])
            t_start = timer()
            file = codecs.open(file_txt, "r", "utf_8_sig")
            text = file.read().strip()
            file.close()
            # Generally the author is unknown for the test set
            idauthor = df_texts.at[index, 'idauthor']  # author
            name = df_texts.at[index, 'name']          # title
            # Process the text itself
            df_add = self.analyze_text(columns, text, index, idauthor, name,
                                       file_txt)  # analyze text, get Series
            df_add.reset_index(drop=True, inplace=True)
            df_stat = df_stat.append(df_add, ignore_index=True)  # append to the results
            df_stat.reset_index(drop=True, inplace=True)
            df_stat.index.name = 'idstat'
            t_end = timer()
            env.debug(1, ['END file TXT:', file_txt, 'time:',
                          env.job_time(t_start, t_end)])
        # df_stat now holds the statistics for all requested test texts
        # Cast the integer columns to the proper type
        int_cols = ['idtext', 'idchunk', 'idauthor', 'sentences_text', 'words_text',
                    'sentences_chunk', 'words_chunk', 'words_uniq_chunk']
        for col in int_cols:
            df_stat[col] = df_stat[col].astype(int)
        # Save the result to disk; the statistics are ready
        df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
    # Reopen the statistics file for the test texts
    df_stat = pd.read_csv(env.filename_stat_test_csv(), index_col='idstat',
                          encoding='utf-8')
    # Predict the authors
    y_res = self.model_predict(df_stat.loc[aidtext])
    df_stat.loc[aidtext, 'predict'] = y_res.astype(int)
    # Save the updated file with the predictions
    df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
    return y_res  # return the predictions
def vizualize2d(self, mode='train'):
    n_components = 2
    env = Environment()
    data = self.get_texts_stat(mode=mode)
    columns = data.columns
    columns2drop = ['idtext', 'idauthor', 'author', 'name', 'sentences_text',
                    'words_text', 'sentence_mean', 'sentences_chunk', 'words_chunk',
                    'words_uniq_chunk', 'uniq_per_sent_chunk', 'predict',
                    'shortname', 'name_author']
    y = data['idauthor'].values
    X = data.drop(columns=columns2drop).values
    pca = PCA(n_components=n_components)
    # pca = TSNE(n_components=2)
    X_new = pca.fit_transform(X, y)
    print('PCA ratio 2 components', pca.explained_variance_ratio_)
    tdf = pd.DataFrame(data=X_new, columns=['PC1', 'PC2'])
    finalDf = pd.concat([tdf, data[['idauthor', 'name', 'shortname']]], axis=1)
    print('dataframe ', finalDf)
    mpl.style.use('default')
    rcParams['font.family'] = 'sans-serif'
    rcParams['font.sans-serif'] = ['Tahoma']
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Component 1. Вклад ' + str(round(pca.explained_variance_ratio_[0], 2)),
                  fontsize=12)
    ax.set_ylabel('Component 2. Вклад ' + str(round(pca.explained_variance_ratio_[1], 2)),
                  fontsize=12)
    ax.set_title('2 component PCA. Точность ' +
                 str(round(sum(float(i) for i in pca.explained_variance_ratio_), 2)),
                 fontsize=12)
    targets = data.idauthor.unique()
    print(targets)
    legends = data.shortname.unique()
    print(legends)
    colors = ["#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a",
              "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
              "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d",
              "#17becf", "#9edae5"]
    for target in targets:
        indicesToKeep = finalDf['idauthor'] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'PC1'],
                   finalDf.loc[indicesToKeep, 'PC2'],
                   c=colors[target], s=50)
    for index, row in finalDf.iterrows():
        ax.annotate(finalDf.at[index, 'name'],
                    xy=(finalDf.at[index, 'PC1'], finalDf.at[index, 'PC2']),
                    fontsize=8)
    ax.legend(legends)
    ax.grid()
    plt.show()
def vocabulary(self):
    env = Environment()
    file_voc = env.filename_vocabulary_csv()  # from vocabulary file
    file_dict = env.filename_dict_csv()       # from dictionary file
    try:
        df_voc = pd.read_csv(file_voc, index_col='idcorpus', encoding='utf-8')
    except:
        env.debug(1, ['Failed to read vocabulary file:', file_voc])
    else:
        env.debug(1, ['Read vocabulary OK:', file_voc])
    try:
        df_dict = pd.read_csv(file_dict, index_col='idcorpus', encoding='utf-8')
    except:
        env.debug(1, ['Failed to read dictionary file:', file_dict])
    else:
        env.debug(1, ['Read dictionary OK:', file_dict])
    # Concat
    df_res = pd.concat([df_voc, df_dict])
    df_res = df_res.drop_duplicates()
    # Apply patch words
    df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(), index_col='idcorpus',
                           encoding='utf-8')
    df_res = df_res.drop(df_res[df_res['word'].isin(df_patch['word'])].index, axis=0)
    df_res = pd.concat([df_res, df_patch])
    df_res = df_res.reset_index(drop=True)
    df_res.index.name = 'idcorpus'
    return df_res
def make_report(self):
    env = Environment()
    a = mlAnalyzer()
    # template = Template('Hello {{ name }}!')
    jenv = jinjaEnvironment(loader=FileSystemLoader(env.path_templates()))
    template = jenv.get_template("report_global.tpl.html")
    data = a.get_texts_stat()             # statistics for the training texts
    test = a.get_texts_stat(mode='test')  # statistics for the test texts
    # test['predict'] = test['predict'].astype(int)
    test['validation'] = 0
    test.loc[test.idauthor == test.predict, 'validation'] = 1
    print(data)
    print(test)
    # Summary stat
    group = pd.merge(data, test, on='idauthor', how='left', suffixes=('', '_test'))
    print(group)
    group = group.groupby(['idauthor', 'name_author'], as_index=False).agg({
        'idtext': ['nunique'],
        'words_chunk': ['sum'],
        'name_test': ['nunique'],
        'words_chunk_test': ['sum'],
        'validation': ['mean']
    })
    print(group)
    group.drop(['idauthor'], axis=1, inplace=True)
    group.sort_values('name_author', inplace=True)
    # Rename the columns to their Russian report captions
    group.columns = ['Писатель', 'Кол-во текстов для обучения',
                     'Объём текстов для обучения (кол-во слов)',
                     'Кол-во текстов для проверки',
                     'Объём текстов для проверки (кол-во слов)',
                     'Точность определения']
    n_accuracy = group['Точность определения'].mean()
    # Show integer values without a fractional part
    int_cols = ['Кол-во текстов для обучения', 'Объём текстов для обучения (кол-во слов)',
                'Кол-во текстов для проверки', 'Объём текстов для проверки (кол-во слов)']
    for col in int_cols:
        group[col] = group[col].astype(int)
    group.reset_index(drop=True, inplace=True)
    s = group.style.set_properties(**{'text-align': 'right'})
    group.fillna('', inplace=True)
    s.hide_index().render()
    # Training stat
    group_train = data.groupby(['author'], as_index=False).agg({
        'idauthor': ['count'],
        'sentences_text': ['sum'],
        'words_text': ['sum'],
        'sentence_mean': ['mean'],
        'name': [lambda col: '<br />'.join(col)],
    })
    group_train.reset_index(drop=True, inplace=True)
    s_train = group_train.style.set_properties(**{'text-align': 'right'})
    group_train.fillna('', inplace=True)
    group_train.columns = ['Писатель', 'Кол-во текстов', 'Кол-во предложений',
                           'Кол-во слов', 'Средняя длина предложения', 'Произведения']
    n_train = group_train['Кол-во текстов'].sum()
    s_train.hide_index().render()
    # Testing stat
    group_test = test.groupby(['author'], as_index=False).agg({
        'idauthor': ['count'],
        'sentences_text': ['sum'],
        'words_text': ['sum'],
        'sentence_mean': ['mean'],
        'name': [lambda col: '<br />'.join(col)],
        'validation': ['mean'],
        'shortname_predict': [lambda col: '<br />'.join(col)],
    })
    group_test.reset_index(drop=True, inplace=True)
    s_test = group_test.style.set_properties(**{'text-align': 'right'})
    group_test.fillna('', inplace=True)
    group_test.columns = ['Писатель', 'Кол-во текстов', 'Кол-во предложений',
                          'Кол-во слов', 'Средняя длина предложения', 'Произведения',
                          'Результат проверки', 'Определён автор']
    n_test = group_test['Кол-во текстов'].sum()
    s_test.hide_index().render()
    template_vars = {
        "title": "Отчёт",
        "detection_accuracy": '%s' % (round(n_accuracy, 4) * 100),
        "train_texts_pivot_table_style_render": s.render(),
        "n_train_texts": round(n_train, 0),
        "train_texts_table_style_render": s_train.render(),
        "n_test_texts": round(n_test, 0),
        "test_texts_table_style_render": s_test.render()
    }
    html_out = template.render(template_vars)
    file = codecs.open(env.filename_global_report_html(), "w", "utf-8-sig")
    file.write(html_out)
    file.close()
    return html_out
def test(self, n_min=1, n_max=1):
    t_start = timer()
    env = Environment()
    df_test = pd.DataFrame()
    for i in range(n_min, n_max + 1):
        try:
            dffile = pd.read_csv(env.filename_corpus_csv(i), index_col='idcorpus',
                                 encoding='utf-8')
        except:
            env.debug(1, ['POStagger', 'test', 'Failed to read corpus file:',
                          env.filename_corpus_csv(i)])
        else:
            env.debug(1, ['POStagger', 'test', 'Read OK:', env.filename_corpus_csv(i)])
            if not dffile.empty:
                df_test = df_test.append(dffile)
    df_test = df_test.drop_duplicates()
    df_test.columns = ['word', 'gram', 'idgram']
    df_test = df_test.reset_index(drop=True)
    df_test.index.name = 'idcorpus'
    df_test['gram_valid'] = df_test['gram']
    n_testsize = df_test.shape[0]
    env.debug(1, ['POStagger', 'test', 'START %s words' % n_testsize])
    df_test = self.pos(df_test)
    print('Test result', df_test)
    df_err = df_test[df_test['gram_valid'] != df_test['gram']]
    print('Test errors:', df_err)
    df_err.to_csv(env.filename_test_err_csv(), encoding='utf-8')
    env.debug(1, ['POStagger', 'test',
                  'test accuracy %s' % (1 - df_err.shape[0] / n_testsize)])
    t_end = timer()
    env.debug(1, ['POSTagger', 'test', 'test time:',
                  env.job_time(t_start, t_end), 'sec.'])
def corpus_xml2csv(self, num=1, persistent=True):
    env = Environment()
    file_xml = env.filename_corpus_xml(num)
    df_xml = pd.DataFrame()
    df_gram = self.grammemes()
    dgram = df_gram.to_dict().get('name')
    try:
        tree = ET.ElementTree(file=file_xml)
    except:
        env.debug(1, ['Failed to load XML:', file_xml])
    else:
        t_start = timer()
        env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])
        for elem in tree.iter('token'):
            # print(elem.tag, elem.attrib)
            serie = pd.Series(data=[])
            badd = False
            s_text = elem.attrib.get('text')
            serie[len(serie)] = s_text.lower()
            for elem2 in elem.iter('g'):
                # print(elem2.tag, elem2.attrib)
                sgram = elem2.attrib.get('v')
                sgram = sgram.upper()
                if (df_gram[df_gram['name'].isin([sgram]) == True].size) > 0:
                    serie[len(serie)] = sgram
                    serie[len(serie)] = int(df_gram.index[df_gram['name'] == sgram].tolist()[0])
                    # serie[len(serie)] = list(dgram.keys())[list(dgram.values()).index(sgram)]
                    badd = True
                    break
            if badd:
                df_xml = df_xml.append(serie, ignore_index=True)
        if not df_xml.empty:
            df_xml = df_xml.drop_duplicates()
            df_xml = df_xml.reset_index(drop=True)
            df_xml.index.name = 'idcorpus'
            df_xml.columns = ['word', 'gram', 'idgram']
            df_xml = df_xml.astype({"idgram": int})
            if persistent:
                file_csv = env.filename_corpus_csv(num)
                env.debug(1, ['Write corpus file to CSV:', file_csv])
                df_xml.to_csv(file_csv, encoding='utf-8')
                t_end = timer()
                env.debug(1, ['CORPUS', 'CSV written:', file_csv,
                              'takes %s sec.' % env.job_time(t_start, t_end)])
    return df_xml
def train(self, df=pd.DataFrame(), validation='eval', n_splits=5, b_smoketest=True, n_frac=1):
    env = Environment()
    enc = Word_Encoder()
    df_train = df
    bgm_columns = env.bgm_columns_list(mode=1)
    drop_columns = ['word', 'gram', 's_suffix2', 's_suffix3',
                    's_prefix2', 's_prefix3', 'n_token']  # , 'bgm_l_None'
    # drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max())
    #                      if 'bgm_l_%s' % (i) not in bgm_columns])
    env.debug(1, ['POStagger', 'train', 'Drop colums: %s' % (drop_columns)])
    if df_train.empty:
        t_start = timer()
        df_train = self.tokenz()
        t_end = timer()
        env.debug(1, ['POSTagger', 'train', 'tokenz loaded:', 'time:',
                      env.job_time(t_start, t_end)])
    env.debug(1, ['POStagger', 'train', 'All tokenz set shape %s' % df_train.shape[0]])
    t_start = timer()
    env.debug(1, ['POStagger', 'train', 'Learning: START'])
    if n_frac < 1:
        df_train = df_train.sample(frac=n_frac)
    env.debug(1, ['POStagger', 'train', 'Training tokenz set shape %s' % df_train.shape[0]])
    df_train = df_train.drop(columns=drop_columns, axis=1)
    env.debug(1, ['POStagger', 'Train colums: %s' % (df_train.columns.tolist())])
    # df_train = df_train.drop_duplicates()  # slow-slow
    df_train = df_train.fillna(0)
    file_x = env.filename_xtrain_csv()
    df_train.to_csv(file_x, encoding='utf-8')
    env.debug(1, ['POStagger', 'train', 'Save X', file_x])
    y = df_train['idgram'].values
    df_train.drop(columns=['idgram'], inplace=True)
    X = df_train.values
    # validation_size = 0.20
    seed = 241
    frac_test_size = 0.2
    sc = StandardScaler()
    t2_start = timer()
    if validation == 'cv':  # cross-validation requested
        scoring = 'accuracy'
        # scoring = 'f1_samples'
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        if True:  # Decision tree
            env.debug(1, ['Tree cross-validation'])
            # clf = DecisionTreeClassifier(criterion='gini', random_state=seed)  # 0.79
            # clf = KNeighborsClassifier(n_neighbors=230)
            model = DecisionTreeClassifier(criterion='entropy', random_state=seed)  # 0.81
            env.debug(1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
            scores = cross_val_score(model, X, y, cv=kf)
            print('DTree scores:', scores.mean(), 'raw', scores)
        if False:  # Logistic regression
            env.debug(1, ['LGR cross-validation'])
            n_Cs = [0.01]
            X_sc = sc.fit_transform(X)
            Y = df_train['idgram'].values
            Y[Y > 0] = 1
            print(X_sc, Y)
            for n_c in n_Cs:
                # clf = LogisticRegression(penalty='l2', solver='saga', C=n_c, multi_class='multinomial')
                clf = LogisticRegression(penalty='l2', solver='liblinear', C=n_c)
                # clf = SVC(kernel='linear', C=10000, random_state=241)
                # clf = SVC(kernel='linear', C=0.01, random_state=seed)
                # clf = SVC(random_state=seed)
                # clf = Perceptron()
                env.debug(1, ['Calculate cross_val_score. Splits=%s C=%s' % (n_splits, n_c)])
                scores = cross_val_score(clf, X_sc, Y, cv=kf)
                print(scores)
        if False:  # GBM, RandomForest
            env.debug(1, ['GBM cross-validation'])
            asteps = [20]    # GBM
            # asteps = [100]  # RandomForest
            for i in asteps:
                # clf = RandomForestClassifier(n_estimators=i)
                clf = GradientBoostingClassifier(n_estimators=i, max_depth=8)  # , max_features='sqrt'
                env.debug(1, ['Calculate cross_val_score. Splits=%s Estimators=%s' % (n_splits, i)])
                scores = cross_val_score(clf, X, y, cv=kf)
                print(scores)
    if validation == 'eval':  # hold-out evaluation
        model = xgb.XGBClassifier(n_estimators=140, max_depth=16,
                                  colsample_bytree=1, subsample=0.5, seed=seed)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=frac_test_size, random_state=seed, shuffle=True)
        eval_set = [(X_train, y_train), (X_test, y_test)]
        f_eval = 'merror'
        # f_eval = 'mlogloss'
        model.fit(X_train, y_train, eval_metric=f_eval, eval_set=eval_set,
                  verbose=False, early_stopping_rounds=20)
        ev_scores = model.evals_result()
        ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
        print(ev_mean, ev_scores)
        xgb.plot_importance(model)
        plt.show()
    t2_end = timer()
    t_end = timer()
    env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])
    if validation == 'cv':
        # Train on the full data set
        X_train, y_train = X, y
        # model = SVC()
        # model = DecisionTreeClassifier()          # 79
        # model = LinearDiscriminantAnalysis()      # 47
        # model = LogisticRegression()              # 48
        # model = KNeighborsClassifier(n_neighbors=200)  # 48
        # model = GaussianNB()                      # 43
        # predictions = model.predict(X_validation)
        # print(accuracy_score(Y_validation, predictions))
        # print(confusion_matrix(Y_validation, predictions))
        # print(classification_report(Y_validation, predictions))
        t_start = timer()
        env.debug(1, ['Training: START'])
        model.fit(X_train, y_train)
        t_end = timer()
        env.debug(1, ['Training: END', env.job_time(t_start, t_end)])
    pickle.dump(sc, open(env.filename_scaler(), 'wb'))
    pickle.dump(model, open(env.filename_model_tree(), 'wb'))
    # Smoke test
    if b_smoketest:
        X_smoke_predict = ['съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок']
        a_smoke = np.array([enc.word2token(elem) for elem in X_smoke_predict])
        y_predictions = model.predict(a_smoke[:, 0:])
        y_predictions_proba = model.predict_proba(a_smoke[:, 0:])
        print('Prediction', list(zip(X_smoke_predict, y_predictions)))
        print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
    return model
def dict_xml2csv(self, persistent=True, lines=10000):
    t_start = timer()
    env = Environment()
    dfgram = self.grammemes()
    filename_dict = env.filename_dict_xml()
    dfcols = ['word', 'gram', 'idgram']
    df_xml = pd.DataFrame(columns=dfcols)
    env.debug(1, ['CORPUS', 'Start to load dictionary from XML:', filename_dict])
    try:
        fp = io.open(filename_dict, mode="r", encoding="utf-8")
    except:
        env.debug(1, ['CORPUS', 'Failed to open dictionary file XML:', filename_dict])
    else:
        number_lines = sum(1 for line in fp)
        fp.seek(0)
        t_end = timer()
        env.debug(1, ['CORPUS', 'File opened:', 'lines', '%s' % number_lines,
                      'time:', env.job_time(t_start, t_end)])
        t_start = timer()
        step = number_lines // lines
        env.debug(1, ['CORPUS', 'Read dictionary:', filename_dict,
                      'lines: %s step %s' % (lines, step)])
        n_line = 0
        for i in range(0, number_lines):
            line = fp.readline()
            # Sample roughly `lines` lemma entries evenly across the file
            if (line[5:10] == 'lemma') and (n_line == 0):
                tree = ET.fromstring(line)
                for elem in tree.iter('l'):
                    s_word = elem.attrib.get('t')
                    gram = ['', 0]
                    j = 0
                    for elem2 in elem.iter('g'):
                        gram[j] = elem2.attrib.get('v')
                        break
                    gram[1] = int(dfgram.index[dfgram['name'] == gram[0]].tolist()[0])
                    s = pd.Series(data=[s_word, gram[0], gram[1]], index=dfcols)
                    df_xml = df_xml.append(s, ignore_index=True)
                n_line += 1
            n_line += 1
            if n_line >= step:
                n_line = 0
        fp.close()
        df_xml.index.name = 'idcorpus'
        t_end = timer()
        env.debug(1, ['CORPUS', 'Dictionary loaded:', 'time:', env.job_time(t_start, t_end)])
        if persistent:
            filename_csv = env.filename_dict_csv()
            env.debug(1, ['CORPUS', 'Write dictionary to CSV:', filename_csv])
            df_xml.to_csv(filename_csv, encoding='utf-8')
            env.debug(1, ['CORPUS', 'Dictionary saved:', filename_csv])
    return df_xml
def main():
    pd.set_option("display.max_columns", 100)
    pd.set_option('display.width', 1000)
    env = Environment()
    c = OpenCorpus()
    t = POSTagger()
    a = mlAnalyzer()
    enc = Word_Encoder()
    r = Reporter()
    # c.dict_xml2csv(lines=600000)
    # c.grammemes_xml2csv()
    # c.vocabulary_from_corpus(1, 1000)
    g = pd.DataFrame()
    g = c.grammemes()
    # dg = g.to_dict().get('name')
    dg = c.grammemes(mode=1)  # grammemes by id
    da = c.authors(mode=1)    # authors by id
    # for i in range(2015, 3000):
    #     c.corpus_xml2csv(i)
    # c.corpus_xml2csv(2)
    # for i in range(125, 150):
    #     c.corpus_xml2txt(i)
    # print(c.vocabulary_from_corpus(1, 2000).head())
    # voc = c.vocabulary()
    # print(voc.head())
    # t.tokenize()
    # print(t.tokenize(voc, n_frac=1))
    # t.tokenz_create_stat()
    # print(env.bgm_stat())
    # print(t.tokenz())
    # print(c.vocabulary())
    # print(enc.word2token('паровоз'))
    # print(enc.word2token('аз'))
    # t.train(n_frac=0.8, validation='cv')
    # t.train(n_frac=0.95, validation='eval')
    # t.test(2000, 2048)
    # a.process_from_texts_file([49], mode='chunk_size')
    # a.process_from_texts_file([58], max_words=8000)
    # arrt = [2, 45, 43, 44, 42, 40, 41, 46, 36, 37, 38, 34]
    # arrt = [69]
    # for i in arrt:
    #     a.process_from_texts_file([i], max_words=8000)
    # t.vizualize2d(n_frac=0.01)
    # nltk.download()
    # a.vizualize2d(mode='train')
    # a.vizualize2d(mode='test')
    # a.model_train()
    # y = a.predict([0, 1, 2, 3, 4])
    print(a.predict([16], b_makestat=True))
    a.vizualize2d(mode='test')
    # for i in y:
    #     print('idtext=%s' % i, da.get(i))
    # text2predict = [11, 12, 13, 14, 15]
    # y = a.predict(text2predict, b_makestat=True)  # predict: pass the text ids
    # j = 0
    # for i in y:
    #     print('idtext=%s' % text2predict[j], 'Автор=%s (%s)' % (i, da.get(i)))
    #     j = j + 1
    # predict = t.pos_word_by_voc(['съеште', 'школа', 'господина', 'приехал',
    #                              'глокая', 'куздра', 'штеко', 'будланула',
    #                              'бокра', 'и', 'кудрячит', 'бокрёнка'])
    X_predict = ['съеште', 'школа', 'господина', 'приехал', 'глокая', 'куздра',
                 'штеко', 'будланула', 'бокра', 'и', 'кудрячит', 'бокрёнка',
                 'он', 'видел', 'их', 'семью', 'своими', 'глазами']
    # X_predict = ['символ']
    y_predict = t.pos_word_by_ml(X_predict)
    print(['%s/%s' % (X_predict[i], dg.get(y_predict[i]))
           for i in range(0, len(y_predict))])
    r.make_report()
def pos(self, df, mode_fast=True, use_cache=True):
    env = Environment()
    enc = Word_Encoder()
    df_res = df
    t_start = timer()
    c = OpenCorpus()
    g = c.grammemes()
    dg = g.to_dict().get('name')
    # Cache file
    cache_columns = ['word', 'gram_ml', 'count']
    file_cache = env.filename_mlcache_csv()
    try:
        df_cache = pd.read_csv(file_cache, index_col='idcorpus', encoding='utf-8')
    except:
        env.debug(1, ['POSTagger', 'pos', 'Failed to read cache file:', file_cache])
        df_cache = pd.DataFrame(columns=cache_columns)
    else:
        env.debug(1, ['POSTagger', 'pos', 'Read ML cache OK:', file_cache])
    a_predict = np.array([enc.word2token('')])
    n_words = df_res.shape[0]
    env.debug(1, ['POStagger', 'pos', 'START Vocabulary prediction %s words' % n_words])
    a_words = df_res['word'].tolist()
    a_ml_words = []
    predictions_voc = self.pos_by_voc(a_words)
    p_se = pd.Series(predictions_voc)
    df_res['gram'] = p_se.values
    df_res['gram_voc'] = p_se.values
    df_res['gram_ml'] = ''
    t_end = timer()
    env.debug(1, ['POStagger', 'pos',
                  'END Vocabulary prediction %s sec.' % env.job_time(t_start, t_end)])
    if mode_fast:
        # Only words the vocabulary could not resolve go to the ML model
        df_ni_voc = df_res[df_res['gram_voc'] == '']
        n_words = df_ni_voc.shape[0]
    else:
        df_ni_voc = df_res
    if not df_ni_voc.empty:
        env.debug(1, ['POStagger', 'pos', 'START Encoding %s words' % n_words])
        for index, serie in df_ni_voc.iterrows():
            word = df_ni_voc.at[index, 'word']
            a_padd = np.array([enc.word2token(word)])
            a_predict = np.append(a_predict, a_padd, axis=0)
            a_ml_words.append(word)
        a_predict = a_predict[1:, :]
        t_end = timer()
        env.debug(1, ['POStagger', 'pos', 'END Encoding %s words %s sec.' %
                      (n_words, env.job_time(t_start, t_end))])
        t_start = timer()
        env.debug(1, ['POStagger', 'pos', 'START Model prediction'])
        clf = pickle.load(open(env.filename_model_tree(), 'rb'))
        predictions_ml = clf.predict(a_predict[:, 0:])
        t_end = timer()
        env.debug(1, ['POStagger', 'pos',
                      'END Model prediction %s sec.' % env.job_time(t_start, t_end)])
    t_start = timer()
    i = 0
    s_pvoc = ''
    s_pml = ''
    for index, row in df_res.iterrows():
        word = df_res.at[index, 'word']
        s_pvoc = df_res.at[index, 'gram_voc']
        if s_pvoc == '':
            if mode_fast:
                try:
                    j = a_ml_words.index(word)
                except:
                    pass
                else:
                    s_pml = dg.get(predictions_ml[j])
            else:
                s_pml = dg.get(predictions_ml[i])
            df_res.at[index, 'gram_ml'] = s_pml
            df_res.at[index, 'gram'] = s_pml
        i = i + 1
    t_end = timer()
    env.debug(1, ['POStagger', 'pos',
                  'ML predictions dataframe filled %s sec' % env.job_time(t_start, t_end)])
    df_cache = pd.concat([df_cache,
                          df_res[df_res.gram_ml != ''][['word', 'gram_ml', 'count']]])
    df_cache = df_cache.groupby(['word', 'gram_ml']).agg({'count': ['sum']})
    df_cache.reset_index(inplace=True)
    df_cache.index.name = 'idcorpus'
    df_cache.columns = cache_columns
    df_cache.sort_values(by=['count'], inplace=True, ascending=False)
    env.debug(1, ['POStagger', 'pos', 'Write ML cache to CSV:', file_cache])
    df_cache.to_csv(file_cache, encoding='utf-8')
    return df_res
def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
    env = Environment()
    enc = Word_Encoder()
    t_start = timer()
    if dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])
    gmask = dftokenz.groupby(['gram'])
    df_posstat = gmask.count()
    df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
    print('POSTagger', 'train dataset stat:\n', gmask.count())
    fields = ['s_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token',
              'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3']
    for field in fields:
        val = 0.0
        if field[0] == 's':
            val = ''
        dftokenz[field] = val
    n_letters = 0
    s_letters = env.list_rus_letters()
    di_letters = env.di_bgm_byletters
    # bgm_columns_i = env.bgm_columns_list(mode=0)
    bgm_columns = env.bgm_columns_list(mode=1)
    for column_name in bgm_columns:
        dftokenz[column_name] = None
    t_end = timer()
    env.debug(1, ['POStagger', 'Letters bigram columns added',
                  env.job_time(t_start, t_end)])
    # Form tokenz
    t_start = timer()
    for index, serie in dftokenz.iterrows():
        a_word = enc.s2token(index, serie)
        i = 2
        for field in fields:
            dftokenz.at[index, field] = a_word[i]
            i = i + 1
        # Letters bigram binaries
        for n_l in range(0, len(a_word[0]) - 1):
            n_l2 = n_l + 1
            di_n = di_letters.get('%s%s' % (a_word[0][n_l], a_word[0][n_l2]))
            if di_n is not None:
                dftokenz.at[index, bgm_columns[di_n]] = 1
    t_end = timer()
    env.debug(1, ['Transforming to tokenz: COMPLETE', env.job_time(t_start, t_end)])
    if persistent:
        dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
        env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
    return dftokenz
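# --- Usage sketch (not part of the original module) ---
# End-to-end POS-tagger training, mirroring the calls that appear commented out
# in main(). Assumption: OpenCorpus, POSTagger and Word_Encoder live in this
# module; adjust the import path if they are split across files.
c = OpenCorpus()
t = POSTagger()
voc = c.vocabulary()                   # merged vocabulary + dictionary + patch words
t.tokenize(voc, n_frac=1)              # encode words into token features, persist to CSV
t.train(n_frac=0.8, validation='cv')   # cross-validate and persist the tree model
t.test(2000, 2048)                     # accuracy check on held-out corpus files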