def pos_word_by_ml(self, awords):
    """Predict POS grammeme ids for words using the pickled tree model.

    Args:
        awords: iterable of words to tag.

    Returns:
        numpy array of raw classifier predictions, one per input word
        (empty array when awords is empty).
    """
    env = Environment()
    enc = Word_Encoder()
    file_model = env.filename_model_tree()
    # 'with' guarantees the model file is closed (original leaked the handle).
    with open(file_model, 'rb') as f_model:
        clf = pickle.load(f_model)
    # Encode all words in one pass instead of np.append per word
    # (O(n) instead of O(n^2)); no dummy seed row needed.
    a_predict = np.array([enc.word2token(word) for word in awords])
    if a_predict.size == 0:
        # No words -> nothing to predict.
        return a_predict
    predictions = clf.predict(a_predict)
    return predictions
def pos(self, df, mode_fast=True, use_cache=True):
    """Fill POS columns ('gram', 'gram_voc', 'gram_ml') in df.

    Two-phase tagging: words are first tagged from the vocabulary
    (self.pos_by_voc); words the vocabulary misses are encoded and
    tagged by the pickled ML model. New ML predictions are merged into
    an on-disk CSV cache.

    Args:
        df: DataFrame with at least 'word' and 'count' columns
            (presumably indexed by 'idcorpus' — TODO confirm with callers).
        mode_fast: if True, only words missed by the vocabulary are sent
            to the model; if False, every word is.
        use_cache: unused in this method body — NOTE(review): confirm
            whether callers expect it to gate the cache read/write.

    Returns:
        The same DataFrame (modified in place) with the three columns set.
    """
    env = Environment()
    enc = Word_Encoder()
    df_res = df  # NOTE(review): df is mutated in place, not copied
    t_start = timer()
    c = OpenCorpus()
    g = c.grammemes()
    # Map: grammeme id -> grammeme name, used to decode model output.
    dg = g.to_dict().get('name')
    # ML-prediction cache file.
    cache_columns = ['word', 'gram_ml', 'count']
    file_cache = env.filename_mlcache_csv()
    try:
        df_cache = pd.read_csv(file_cache, index_col='idcorpus', encoding='utf-8')
    except:  # NOTE(review): bare except also hides non-I/O errors
        env.debug(
            1, ['POSTagger', 'pos', 'Failed to read cache file:', file_cache])
        df_cache = pd.DataFrame(columns=cache_columns)
    else:
        env.debug(1, ['POSTagger', 'pos', 'Read ML cache OK:', file_cache])
    # Seed row (empty-word token) so np.append can grow a 2-D array;
    # it is stripped again below with a_predict[1:, :].
    a_predict = np.array([enc.word2token('')])
    n_words = df_res.shape[0]
    env.debug(1, [
        'POStagger', 'pos',
        'START Vocabulary prediction %s words' % n_words
    ])
    a_words = df_res['word'].tolist()
    a_ml_words = []
    # Vocabulary pass: one prediction per word ('' when unknown).
    predictions_voc = self.pos_by_voc(a_words)
    p_se = pd.Series(predictions_voc)
    df_res['gram'] = p_se.values
    df_res['gram_voc'] = p_se.values
    df_res['gram_ml'] = ''
    t_end = timer()
    env.debug(1, [
        'POStagger', 'pos',
        'END Vocabulary prediction %s sec.' % env.job_time(t_start, t_end)
    ])
    if mode_fast:
        # Only words the vocabulary failed to tag go to the model.
        df_ni_voc = df_res[df_res['gram_voc'] == '']
        n_words = df_ni_voc.shape[0]
    else:
        df_ni_voc = df_res
    if not df_ni_voc.empty:
        env.debug(
            1, ['POStagger', 'pos', 'START Encoding %s words' % n_words])
        for index, serie in df_ni_voc.iterrows():
            word = df_ni_voc.at[index, 'word']
            a_padd = np.array([enc.word2token(word)])
            a_predict = np.append(a_predict, a_padd, axis=0)
            a_ml_words.append(word)
        a_predict = a_predict[1:, :]  # drop the seed row
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos', 'END Encoding %s words %s sec.'
            % (n_words, env.job_time(t_start, t_end))
        ])
        t_start = timer()
        env.debug(1, ['POStagger', 'pos', 'START Model prediction'])
        clf = pickle.load(open(env.filename_model_tree(), 'rb'))
        predictions_ml = clf.predict(a_predict[:, 0:])
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Model prediction %s sec.' % env.job_time(t_start, t_end)
        ])
        t_start = timer()
        i = 0
        s_pvoc = ''
        s_pml = ''
        # Merge model predictions back into the rows the vocabulary missed.
        for index, row in df_res.iterrows():
            word = df_res.at[index, 'word']
            s_pvoc = df_res.at[index, 'gram_voc']
            if s_pvoc == '':
                if mode_fast:
                    # Fast mode: predictions_ml is aligned with a_ml_words,
                    # so find the word's position by value.
                    try:
                        j = a_ml_words.index(word)
                    except:
                        pass
                    else:
                        s_pml = dg.get(predictions_ml[j])
                else:
                    # Full mode: row order matches prediction order, so
                    # the running row counter i indexes predictions_ml.
                    s_pml = dg.get(predictions_ml[i])
                df_res.at[index, 'gram_ml'] = s_pml
                df_res.at[index, 'gram'] = s_pml
            i = i + 1
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'ML predictions dataframe filled %s sec'
            % env.job_time(t_start, t_end)
        ])
        # Fold the new ML predictions into the cache, summing counts per
        # (word, gram_ml) pair, then persist the cache to CSV.
        df_cache = pd.concat([
            df_cache,
            df_res[df_res.gram_ml != ''][['word', 'gram_ml', 'count']]
        ])
        df_cache = df_cache.groupby(['word', 'gram_ml']).agg({'count': ['sum']})
        df_cache.reset_index(inplace=True)
        df_cache.index.name = 'idcorpus'
        df_cache.columns = cache_columns
        df_cache.sort_values(by=['count'], inplace=True, ascending=False)
        env.debug(1, ['POStagger', 'pos', 'Write ML cache to CSV:', file_cache])
        df_cache.to_csv(file_cache, encoding='utf-8')
    return df_res
def train(self, df=None, validation='eval', n_splits=5, b_smoketest=True, n_frac=1):
    """Train the POS-tagger classifier on the tokenz feature set.

    Args:
        df: optional training DataFrame. None (default) loads the full
            set via self.tokenz(). (None replaces the former mutable
            ``pd.DataFrame()`` default argument — backward compatible.)
        validation: 'cv' — KFold cross-validation of a decision tree,
            then a final fit on all data; 'eval' — XGBoost fit with a
            train/test eval set and early stopping.
        n_splits: number of KFold splits in 'cv' mode.
        b_smoketest: if True, print predictions for a few sample words
            after training.
        n_frac: fraction of the data to sample for training (1 = all).

    Returns:
        The fitted model. The model and the (unfitted) scaler are also
        pickled to the paths given by Environment.
    """
    env = Environment()
    enc = Word_Encoder()
    # None-sentinel instead of a mutable default argument.
    df_train = pd.DataFrame() if df is None else df
    # Non-feature columns that must not reach the classifier.
    drop_columns = [
        'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
        'n_token'
    ]
    env.debug(1, ['POStagger', 'train', 'Drop colums: %s' % (drop_columns)])
    if df_train.empty:
        t_start = timer()
        df_train = self.tokenz()
        t_end = timer()
        env.debug(1, [
            'POSTagger', 'train', 'tokenz loaded:', 'time:',
            env.job_time(t_start, t_end)
        ])
    env.debug(1, [
        'POStagger', 'train',
        'All tokenz set shape %s' % df_train.shape[0]
    ])
    t_start = timer()
    env.debug(1, ['POStagger', 'train', 'Learning: START'])
    if n_frac < 1:
        df_train = df_train.sample(frac=n_frac)
    env.debug(1, [
        'POStagger', 'train',
        'Training tokenz set shape %s' % df_train.shape[0]
    ])
    df_train = df_train.drop(columns=drop_columns, axis=1)
    env.debug(
        1, ['POStagger', 'Train colums: %s' % (df_train.columns.tolist())])
    df_train = df_train.fillna(0)
    # Persist the training matrix for inspection/reproducibility.
    file_x = env.filename_xtrain_csv()
    df_train.to_csv(file_x, encoding='utf-8')
    env.debug(1, ['POStagger', 'train', 'Save X', file_x])
    y = df_train['idgram'].values
    df_train.drop(columns=['idgram'], inplace=True)
    X = df_train.values
    seed = 241
    frac_test_size = 0.2
    sc = StandardScaler()
    t2_start = timer()
    if validation == 'cv':
        # Cross-validate a decision tree. (Removed unreachable 'if False'
        # experiment branches for LogisticRegression/GBM: they referenced
        # undefined names and could never run.)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        env.debug(1, ['Tree cross-validation'])
        model = DecisionTreeClassifier(criterion='entropy',
                                       random_state=seed)  # 0.81
        env.debug(1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
        scores = cross_val_score(model, X, y, cv=kf)
        print('DTree scores:', scores.mean(), 'raw', scores)
    if validation == 'eval':
        # XGBoost with an explicit eval set and early stopping.
        model = xgb.XGBClassifier(n_estimators=140,
                                  max_depth=16,
                                  colsample=1,
                                  subsample=0.5,
                                  seed=seed)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=frac_test_size, random_state=seed, shuffle=True)
        eval_set = [(X_train, y_train), (X_test, y_test)]
        f_eval = 'merror'  # multiclass classification error rate
        model.fit(X_train,
                  y_train,
                  eval_metric=f_eval,
                  eval_set=eval_set,
                  verbose=False,
                  early_stopping_rounds=20)
        ev_scores = model.evals_result()
        ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
        print(ev_mean, ev_scores)
        xgb.plot_importance(model)
        plt.show()
    t2_end = timer()
    t_end = timer()
    env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])
    if validation == 'cv':
        # Final training on all data after cross-validation.
        X_train, y_train = X, y
    t_start = timer()
    env.debug(1, ['Training: START'])
    model.fit(X_train, y_train)
    t_end = timer()
    env.debug(1, ['Training: END', env.job_time(t_start, t_end)])
    # 'with' ensures the pickle files are closed even on error.
    with open(env.filename_scaler(), 'wb') as f_scaler:
        pickle.dump(sc, f_scaler)
    with open(env.filename_model_tree(), 'wb') as f_model:
        pickle.dump(model, f_model)
    # Smoke test on a handful of known words.
    if b_smoketest:
        X_smoke_predict = [
            'съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок'
        ]
        a_smoke = np.array(
            [enc.word2token(elem) for elem in X_smoke_predict])
        y_predictions = model.predict(a_smoke[:, 0:])
        # Bug fix: was model.predict(...) here too, so 'Proba' printed
        # the same class labels instead of class probabilities.
        y_predictions_proba = model.predict_proba(a_smoke[:, 0:])
        print('Prediction', list(zip(X_smoke_predict, y_predictions)))
        print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
    return model