Example #1
    def pos_word_by_ml(self, awords):
        env = Environment()
        enc = Word_Encoder()

        # Load the pickled decision-tree model
        file_model = env.filename_model_tree()
        with open(file_model, 'rb') as f:
            clf = pickle.load(f)
        # Encode each word into its numeric token vector and predict in one batch
        a_predict = np.array([enc.word2token(word) for word in awords])
        predictions = clf.predict(a_predict)
        return predictions
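
A minimal usage sketch, not part of the source: it assumes a POSTagger class that owns these methods and a model already trained and pickled at Environment().filename_model_tree() (both inferred from the surrounding examples); the word list is arbitrary.

# Hypothetical usage: tag raw words with the ML model alone,
# bypassing the vocabulary lookup used by pos() in Example #2.
tagger = POSTagger()  # assumed owning class of these methods
words = ['мягких', 'французских', 'булок']
predictions = tagger.pos_word_by_ml(words)
for word, gram_id in zip(words, predictions):
    print(word, gram_id)  # gram_id is the numeric grammeme class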
Example #2
    def pos(self, df, mode_fast=True, use_cache=True):
        env = Environment()
        enc = Word_Encoder()
        df_res = df
        t_start = timer()

        c = OpenCorpus()
        g = c.grammemes()
        # id -> grammeme-name mapping, used to decode numeric model output
        dg = g.to_dict().get('name')

        # Cache of earlier ML predictions (note: the use_cache flag is not consulted here)
        cache_columns = ['word', 'gram_ml', 'count']
        file_cache = env.filename_mlcache_csv()
        try:
            df_cache = pd.read_csv(file_cache,
                                   index_col='idcorpus',
                                   encoding='utf-8')
        except Exception:
            env.debug(
                1,
                ['POSTagger', 'pos', 'Failed to read cache file:', file_cache])
            df_cache = pd.DataFrame(columns=cache_columns)
        else:
            env.debug(1, ['POSTagger', 'pos', 'Read ML cache OK:', file_cache])

        # Dummy seed row (dropped before prediction) gives np.append a
        # correctly shaped array to grow from
        a_predict = np.array([enc.word2token('')])
        n_words = df_res.shape[0]

        env.debug(1, [
            'POStagger', 'pos',
            'START Vocabulary prediction %s words' % n_words
        ])
        a_words = df_res['word'].tolist()
        a_ml_words = []
        # First pass: vocabulary lookup for every word
        predictions_voc = self.pos_by_voc(a_words)
        p_se = pd.Series(predictions_voc)
        df_res['gram'] = p_se.values
        df_res['gram_voc'] = p_se.values
        df_res['gram_ml'] = ''
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Vocabulary prediction %s sec.' % env.job_time(t_start, t_end)
        ])

        if mode_fast:
            # Only words the vocabulary could not tag go to the ML model
            df_ni_voc = df_res[df_res['gram_voc'] == '']
            n_words = df_ni_voc.shape[0]
        else:
            df_ni_voc = df_res
        if not df_ni_voc.empty:
            env.debug(
                1, ['POStagger', 'pos',
                    'START Encoding %s words' % n_words])
            for index, _row in df_ni_voc.iterrows():
                word = df_ni_voc.at[index, 'word']
                a_padd = np.array([enc.word2token(word)])
                a_predict = np.append(a_predict, a_padd, axis=0)
                a_ml_words.append(word)
            # Drop the dummy seed row
            a_predict = a_predict[1:, :]
            t_end = timer()
            env.debug(1, [
                'POStagger', 'pos',
                'END Encoding %s words %s sec.' %
                (n_words, env.job_time(t_start, t_end))
            ])

        t_start = timer()
        env.debug(1, ['POStagger', 'pos', 'START Model prediction'])
        with open(env.filename_model_tree(), 'rb') as f:
            clf = pickle.load(f)
        predictions_ml = clf.predict(a_predict)
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Model prediction %s sec.' % env.job_time(t_start, t_end)
        ])

        t_start = timer()
        i = 0
        for index, row in df_res.iterrows():
            word = df_res.at[index, 'word']
            s_pvoc = df_res.at[index, 'gram_voc']
            if s_pvoc == '':
                # Vocabulary lookup failed: fall back to the ML prediction.
                # Reset s_pml each pass so a failed index lookup cannot
                # silently reuse the previous word's prediction.
                s_pml = ''
                if mode_fast:
                    try:
                        j = a_ml_words.index(word)
                    except ValueError:
                        pass
                    else:
                        s_pml = dg.get(predictions_ml[j])
                else:
                    s_pml = dg.get(predictions_ml[i])
                df_res.at[index, 'gram_ml'] = s_pml
                df_res.at[index, 'gram'] = s_pml
            i = i + 1
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'ML predictions dataframe filled %s sec' %
            env.job_time(t_start, t_end)
        ])
        # Merge new ML predictions into the cache and re-aggregate the
        # per-(word, grammeme) counts before persisting
        df_cache = pd.concat([
            df_cache,
            df_res[df_res.gram_ml != ''][['word', 'gram_ml', 'count']]
        ])
        df_cache = df_cache.groupby(['word',
                                     'gram_ml']).agg({'count': ['sum']})
        df_cache.reset_index(inplace=True)
        df_cache.index.name = 'idcorpus'
        df_cache.columns = cache_columns
        df_cache.sort_values(by=['count'], inplace=True, ascending=False)
        env.debug(1,
                  ['POStagger', 'pos', 'Write ML cache to CSV:', file_cache])
        df_cache.to_csv(file_cache, encoding='utf-8')
        return df_res
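
A minimal usage sketch, assuming the input DataFrame provides the 'word' and 'count' columns that pos() reads and aggregates into its cache (column names taken from the code above; the data and the POSTagger class name are illustrative assumptions):

# Hypothetical usage: vocabulary lookup first, ML fallback for the rest.
df_in = pd.DataFrame({'word': ['съеште', 'ещё', 'булок'],
                      'count': [1, 1, 1]})
df_out = POSTagger().pos(df_in, mode_fast=True, use_cache=True)
# 'gram' holds the final tag; 'gram_voc' and 'gram_ml' show which path set it
print(df_out[['word', 'gram', 'gram_voc', 'gram_ml']])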
Example #3
    def train(self,
              df=pd.DataFrame(),
              validation='eval',
              n_splits=5,
              b_smoketest=True,
              n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        df_train = df
        bgm_columns = env.bgm_columns_list(mode=1)
        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]
        env.debug(1,
                  ['POStagger', 'train',
                   'Drop columns: %s' % (drop_columns)])

        if df_train.empty:
            t_start = timer()
            df_train = self.tokenz()
            t_end = timer()
            env.debug(1, [
                'POSTagger', 'train', 'tokenz loaded:', 'time:',
                env.job_time(t_start, t_end)
            ])

        env.debug(1, [
            'POStagger', 'train',
            'All tokenz set shape %s' % df_train.shape[0]
        ])
        t_start = timer()
        env.debug(1, ['POStagger', 'train', 'Learning: START'])
        if n_frac < 1:
            df_train = df_train.sample(frac=n_frac)
            env.debug(1, [
                'POStagger', 'train',
                'Training tokenz set shape %s' % df_train.shape[0]
            ])

        df_train = df_train.drop(columns=drop_columns, axis=1)
        env.debug(
            1, ['POStagger',
                'Train columns: %s' % (df_train.columns.tolist())])
        # df_train = df_train.drop_duplicates()  # too slow on this data set

        df_train = df_train.fillna(0)
        file_x = env.filename_xtrain_csv()
        df_train.to_csv(file_x, encoding='utf-8')
        env.debug(1, ['POStagger', 'train', 'Save X', file_x])
        # Target is the grammeme id; the remaining columns are the features
        y = df_train['idgram'].values
        df_train.drop(columns=['idgram'], inplace=True)
        X = df_train.values

        seed = 241
        frac_test_size = 0.2

        sc = StandardScaler()
        if validation == 'cv':  # k-fold cross-validation
            scoring = 'accuracy'
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
            if True:  # Decision tree
                env.debug(1, ['Tree cross-validation'])
                model = DecisionTreeClassifier(criterion='entropy',
                                               random_state=seed)  # ~0.81 accuracy
                env.debug(
                    1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
                scores = cross_val_score(model, X, y, cv=kf, scoring=scoring)
                print('DTree scores:', scores.mean(), 'raw', scores)

            if False:  # Logistic regression (disabled experiment)
                env.debug(1, ['LGR cross-validation'])
                n_Cs = [0.01]
                # Skip the first feature columns, as the original slice did
                X_sc = sc.fit_transform(X[:, 4:])
                # Collapse the target to binary: has a grammeme id vs. not
                Y = y.copy()
                Y[Y > 0] = 1
                for n_c in n_Cs:
                    clf = LogisticRegression(penalty='l2',
                                             solver='liblinear',
                                             C=n_c)
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s C=%s' %
                        (n_splits, n_c)
                    ])
                    scores = cross_val_score(clf, X_sc, Y, cv=kf)
                    print(scores)

            if False:  # GBM / RandomForest (disabled experiment)
                env.debug(1, ['GBM cross-validation'])
                asteps = [20]  # GBM; RandomForest used [100]
                for i in asteps:
                    # clf = RandomForestClassifier(n_estimators=i)
                    clf = GradientBoostingClassifier(
                        n_estimators=i, max_depth=8)  # , max_features='sqrt'
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s Estimators=%s' %
                        (n_splits, i)
                    ])
                    scores = cross_val_score(clf, X, y, cv=kf)
                    print(scores)

        if validation == 'eval':
            # Hold-out evaluation with XGBoost
            model = xgb.XGBClassifier(n_estimators=140,
                                      max_depth=16,
                                      colsample_bytree=1,
                                      subsample=0.5,
                                      random_state=seed)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=frac_test_size,
                random_state=seed,
                shuffle=True)
            eval_set = [(X_train, y_train), (X_test, y_test)]
            f_eval = 'merror'  # multiclass error rate; 'mlogloss' also works
            model.fit(X_train,
                      y_train,
                      eval_metric=f_eval,
                      eval_set=eval_set,
                      verbose=False,
                      early_stopping_rounds=20)
            ev_scores = model.evals_result()
            # validation_0 tracks the train split, validation_1 the hold-out
            ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
            print(ev_mean, ev_scores)
            xgb.plot_importance(model)
            plt.show()
        t_end = timer()
        env.debug(1,
                  ['Validation completed:', 'time:', env.job_time(t_start, t_end)])

        if validation == 'cv':
            # Refit the selected model on the full data set
            X_train, y_train = X, y

            t_start = timer()
            env.debug(1, ['Training: START'])
            model.fit(X_train, y_train)
            t_end = timer()
            env.debug(1, ['Training: END', env.job_time(t_start, t_end)])

        # NB: the scaler is only fitted inside the disabled logistic regression
        # branch; it is persisted unchanged for interface compatibility
        with open(env.filename_scaler(), 'wb') as f:
            pickle.dump(sc, f)
        with open(env.filename_model_tree(), 'wb') as f:
            pickle.dump(model, f)

        # Smoke test on a short phrase
        if b_smoketest:
            X_smoke_predict = [
                'съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок'
            ]
            a_smoke = np.array(
                [enc.word2token(elem) for elem in X_smoke_predict])
            y_predictions = model.predict(a_smoke)
            y_predictions_proba = model.predict_proba(a_smoke)
            print('Prediction', list(zip(X_smoke_predict, y_predictions)))
            print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
        return model
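
A minimal training sketch under the same assumptions (a POSTagger class wrapping these methods, an Environment supplying file paths, and self.tokenz() yielding the feature frame when no df is passed); n_frac is lowered here purely to keep the illustration fast:

# Hypothetical usage: hold-out evaluation on a 10% sample, then smoke test.
tagger = POSTagger()
model = tagger.train(validation='eval', n_frac=0.1, b_smoketest=True)
# validation='cv' would run k-fold cross-validation and refit on all data:
# model = tagger.train(validation='cv', n_splits=5)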