def word2token(self, s):
    """Encode a word into a fixed-length numeric feature vector:
    [length, suffix-2/3 codes, prefix-2/3 codes, letter-bigram flags...]."""
    env = Environment()
    bgm_columns = env.bgm_columns_list(mode=1)
    n_shift = 5
    a_result = np.zeros(len(bgm_columns) + n_shift)
    a_result[0] = len(s)
    a_result[1] = self.s_encode(s[-2:])  # 2-char suffix
    a_result[2] = self.s_encode(s[-3:])  # 3-char suffix
    a_result[3] = self.s_encode(s[:2])   # 2-char prefix
    a_result[4] = self.s_encode(s[:3])   # 3-char prefix
    # Binary indicator for every letter bigram present in the word
    di_letters = env.di_bgm_byletters
    di_word = {}
    for n_l in range(0, len(s) - 1):
        di_n = di_letters.get('%s%s' % (s[n_l], s[n_l + 1]))
        if di_n is not None:
            a_result[di_n + n_shift] = 1
    return a_result
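
# A minimal, self-contained sketch of the feature layout word2token() builds.
# Everything here is an illustrative assumption: the toy two-letter alphabet,
# the _demo_* name, and the ord()-sum stand-in for s_encode are not the
# Environment/Word_Encoder API, they only mirror its vector layout.
def _demo_word2token_layout():
    import numpy as np
    letters = 'аб'
    # toy bigram -> column index map, analogous to env.di_bgm_byletters
    toy_index = {a + b: i for i, (a, b) in
                 enumerate((a, b) for a in letters for b in letters)}
    s, n_shift = 'абба', 5
    v = np.zeros(len(toy_index) + n_shift)
    v[0] = len(s)
    v[1] = sum(ord(c) for c in s[-2:])  # toy suffix-2 code
    v[2] = sum(ord(c) for c in s[-3:])  # toy suffix-3 code
    v[3] = sum(ord(c) for c in s[:2])   # toy prefix-2 code
    v[4] = sum(ord(c) for c in s[:3])   # toy prefix-3 code
    for a, b in zip(s, s[1:]):          # flag each bigram the word contains
        i = toy_index.get(a + b)
        if i is not None:
            v[i + n_shift] = 1
    return v  # [4., 2145., 3218., 2145., 3218., 0., 1., 1., 1.]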
def tokenz_create_stat(self, dftokenz=pd.DataFrame(), n_frac=1):
    """Count how often each letter bigram occurs in the token set and save the stat to CSV."""
    env = Environment()
    di_letters = env.di_bgm_byletters
    bgm_columns = env.bgm_columns_list(mode=1)
    if dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['POStagger', 'create_stat',
                  'Collecting statistic START %s words' % dftokenz.shape[0]])
    # count() returns non-null counts per column; the bigram columns hold
    # 1 or None, so the count equals the number of words with that bigram
    di_tokenz_stat = dftokenz.count().to_dict()
    bgm_astat = []
    bgm_index = []
    for key in di_letters:
        di_n = di_letters.get(key)
        bgm_astat.append([key, di_tokenz_stat.get(bgm_columns[di_n])])
        bgm_index.append(di_n)
    df_bgm_stat = pd.DataFrame(data=bgm_astat, columns=['bigram', 'counts'],
                               index=bgm_index)
    df_bgm_stat.index.name = 'idbigram'
    df_bgm_stat = df_bgm_stat.sort_values(by=['counts'], ascending=False)
    print('bgm_stat\n', df_bgm_stat)
    df_bgm_stat.to_csv(env.filename_stat_bigram_letters_csv(), encoding='utf-8')
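
# Why DataFrame.count() works as the bigram statistic above: the indicator
# columns are filled with 1 where the bigram occurs and left None elsewhere,
# and count() reports non-null cells per column. A self-contained check with
# toy column names (the real ones come from env.bgm_columns_list()):
def _demo_bigram_counts():
    import pandas as pd
    df = pd.DataFrame({'bgm_l_0': [1, None, 1], 'bgm_l_1': [None, None, 1]})
    stat = df.count().to_dict()
    assert stat == {'bgm_l_0': 2, 'bgm_l_1': 1}
    return stat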
def train(self, df=pd.DataFrame(), validation='eval', n_splits=5,
          b_smoketest=True, n_frac=1):
    """Train a POS model on the token table.

    validation='cv' runs cross-validation and refits on all data;
    validation='eval' trains XGBoost against a hold-out eval set."""
    env = Environment()
    enc = Word_Encoder()
    df_train = df
    bgm_columns = env.bgm_columns_list(mode=1)
    drop_columns = ['word', 'gram', 's_suffix2', 's_suffix3',
                    's_prefix2', 's_prefix3', 'n_token']
    env.debug(1, ['POStagger', 'train', 'Drop columns: %s' % drop_columns])
    if df_train.empty:
        t_start = timer()
        df_train = self.tokenz()
        t_end = timer()
        env.debug(1, ['POSTagger', 'train', 'tokenz loaded:',
                      'time:', env.job_time(t_start, t_end)])
    env.debug(1, ['POStagger', 'train',
                  'All tokenz set shape %s' % df_train.shape[0]])
    t_start = timer()
    env.debug(1, ['POStagger', 'train', 'Learning: START'])
    if n_frac < 1:
        df_train = df_train.sample(frac=n_frac)
    env.debug(1, ['POStagger', 'train',
                  'Training tokenz set shape %s' % df_train.shape[0]])
    df_train = df_train.drop(columns=drop_columns, axis=1)
    env.debug(1, ['POStagger',
                  'Train columns: %s' % df_train.columns.tolist()])
    df_train = df_train.fillna(0)
    file_x = env.filename_xtrain_csv()
    df_train.to_csv(file_x, encoding='utf-8')
    env.debug(1, ['POStagger', 'train', 'Save X', file_x])
    y = df_train['idgram'].values
    df_train.drop(columns=['idgram'], inplace=True)
    X = df_train.values
    seed = 241
    frac_test_size = 0.2
    sc = StandardScaler()
    if validation == 'cv':  # cross-validation
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        if True:  # Decision tree
            env.debug(1, ['Tree cross-validation'])
            # gini scored ~0.79, entropy ~0.81; KNN(230) was also tried
            model = DecisionTreeClassifier(criterion='entropy',
                                           random_state=seed)
            env.debug(1, ['Calculate cross_val_score. Splits=%s' % n_splits])
            scores = cross_val_score(model, X, y, cv=kf)
            print('DTree scores:', scores.mean(), 'raw', scores)
        if False:  # Logistic regression on scaled bigram features
            env.debug(1, ['LGR cross-validation'])
            n_Cs = [0.01]
            X_sc = sc.fit_transform(X[:, 5:])
            y_bin = y.copy()
            y_bin[y_bin > 0] = 1  # binarize: any POS vs none
            for n_c in n_Cs:
                clf = LogisticRegression(penalty='l2', solver='liblinear',
                                         C=n_c)
                # SVC and Perceptron variants were also tried here
                env.debug(1, ['Calculate cross_val_score. Splits=%s C=%s'
                              % (n_splits, n_c)])
                scores = cross_val_score(clf, X_sc, y_bin, cv=kf)
                print(scores)
        if False:  # GBM / RandomForest
            env.debug(1, ['GBM cross-validation'])
            asteps = [20]  # GBM; use e.g. [100] for RandomForest
            for i in asteps:
                clf = GradientBoostingClassifier(n_estimators=i, max_depth=8)
                env.debug(1, ['Calculate cross_val_score. '
                              'Splits=%s Estimators=%s' % (n_splits, i)])
                scores = cross_val_score(clf, X, y, cv=kf)
                print(scores)
    if validation == 'eval':  # hold-out evaluation with XGBoost
        model = xgb.XGBClassifier(n_estimators=140, max_depth=16,
                                  colsample_bytree=1, subsample=0.5,
                                  random_state=seed)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=frac_test_size, random_state=seed, shuffle=True)
        eval_set = [(X_train, y_train), (X_test, y_test)]
        f_eval = 'merror'  # or 'mlogloss'
        model.fit(X_train, y_train, eval_metric=f_eval, eval_set=eval_set,
                  verbose=False, early_stopping_rounds=20)
        ev_scores = model.evals_result()
        ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
        print(ev_mean, ev_scores)
        xgb.plot_importance(model)
        plt.show()
    t_end = timer()
    env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])
    if validation == 'cv':  # train on the full data set
        # Alternatives tried: DecisionTree ~0.79, LDA ~0.47,
        # LogisticRegression ~0.48, KNN(200) ~0.48, GaussianNB ~0.43
        X_train, y_train = X, y
        t_start = timer()
        env.debug(1, ['Training: START'])
        model.fit(X_train, y_train)
        t_end = timer()
        env.debug(1, ['Training: END', env.job_time(t_start, t_end)])
    pickle.dump(sc, open(env.filename_scaler(), 'wb'))
    pickle.dump(model, open(env.filename_model_tree(), 'wb'))
    # Smoke test
    if b_smoketest:
        X_smoke_predict = ['съеште', 'ещё', 'этих', 'мягких',
                           'французских', 'булок']
        a_smoke = np.array([enc.word2token(elem) for elem in X_smoke_predict])
        y_predictions = model.predict(a_smoke)
        y_predictions_proba = model.predict_proba(a_smoke)
        print('Prediction', list(zip(X_smoke_predict, y_predictions)))
        print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
    return model
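
# Hedged usage sketch for the artifacts train() persists: the fitted model is
# pickled to env.filename_model_tree() (the scaler to env.filename_scaler()).
# A minimal sketch, assuming train() has already run so the pickle exists and
# that Environment/Word_Encoder are importable; the words are illustrative.
def _demo_load_and_predict():
    import pickle
    import numpy as np
    env = Environment()
    enc = Word_Encoder()
    with open(env.filename_model_tree(), 'rb') as f:
        model = pickle.load(f)
    X = np.array([enc.word2token(w) for w in ['мягких', 'булок']])
    return model.predict(X)  # idgram class ids, one per word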
def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
    """Expand the raw token table with suffix/prefix codes and letter-bigram
    indicator columns; optionally persist the result to CSV."""
    env = Environment()
    enc = Word_Encoder()
    t_start = timer()
    if dftokenz.empty:
        dftokenz = self.tokenz()
    if n_frac < 1:
        dftokenz = dftokenz.sample(frac=n_frac)
    env.debug(1, ['Transforming to tokenz: START %s words'
                  % dftokenz.shape[0]])
    gmask = dftokenz.groupby(['gram'])
    df_posstat = gmask.count()
    df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
    print('POSTagger', 'train dataset stat:\n', df_posstat)
    fields = ['s_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
              'n_token', 'n_len', 'n_tokens2', 'n_tokens3',
              'n_tokenp2', 'n_tokenp3']
    for field in fields:
        dftokenz[field] = '' if field[0] == 's' else 0.0
    di_letters = env.di_bgm_byletters
    bgm_columns = env.bgm_columns_list(mode=1)
    for column_name in bgm_columns:
        dftokenz[column_name] = None
    t_end = timer()
    env.debug(1, ['POStagger', 'Letters bigram columns added',
                  env.job_time(t_start, t_end)])
    # Form tokenz: fill the new columns row by row
    t_start = timer()
    for index, serie in dftokenz.iterrows():
        a_word = enc.s2token(index, serie)
        # a_word[0] is the word itself; fields start at offset 2
        for i, field in enumerate(fields, start=2):
            dftokenz.at[index, field] = a_word[i]
        # Binary flags for the letter bigrams present in the word
        for n_l in range(0, len(a_word[0]) - 1):
            di_n = di_letters.get('%s%s' % (a_word[0][n_l],
                                            a_word[0][n_l + 1]))
            if di_n is not None:
                dftokenz.at[index, bgm_columns[di_n]] = 1
    t_end = timer()
    env.debug(1, ['Transforming to tokenz: COMPLETE',
                  env.job_time(t_start, t_end)])
    if persistent:
        dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
        env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
    return dftokenz
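
# Hedged end-to-end sketch of how these methods appear to fit together:
# build the feature table once, optionally collect the bigram statistic,
# then train. 'POStagger' as the owning class name is inferred from the
# debug tags above and may differ; the fractions are illustrative.
def _demo_pipeline():
    tagger = POStagger()
    tagger.tokenize(n_frac=0.1)  # writes tokenz CSV with bigram flags
    tagger.tokenz_create_stat()  # writes per-bigram frequency CSV
    return tagger.train(validation='eval', n_frac=0.1)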