# Shared imports for the examples below. `collect_data`, TRAIN_DATA and
# TRAINING_FILE are project-local; `get_top` is sketched after the first
# function.
import os
import pickle
import time

import numpy as np
import pandas as pd
from sklearn import metrics, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

import collect_data


def calc_tfidf_new(category, top=10):
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tmp = data[data['label'] == category]
    top_words = {}
    for index, text in enumerate(tmp['text'].tolist()):
        top_words[index] = get_top([text], top).tolist()

    return top_words
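
get_top is referenced throughout but not defined in this listing; a hypothetical reconstruction, assuming it ranks vocabulary terms by mean TF-IDF weight over the given texts and returns the top n as an array:

def get_top(texts, n=10):
    # Hypothetical helper: rank terms by mean TF-IDF weight across `texts`
    # and return the `n` highest-weighted ones.
    vect = TfidfVectorizer(analyzer='word',
                           token_pattern=r'\w{1,}',
                           stop_words=collect_data.get_stop_word())
    weights = vect.fit_transform(texts)                    # (n_docs, n_terms) sparse
    mean_weight = np.asarray(weights.mean(axis=0)).ravel()
    terms = np.array(vect.get_feature_names_out())
    return terms[np.argsort(mean_weight)[::-1][:n]]        # ndarray, so .tolist() works
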
Example #2
def multi_classifiers():
    # One-vs-rest evaluation: train and score a separate binary classifier
    # for each category against everything else.
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    losses = []
    auc = []

    for category in collect_data.LABELS:
        classifier = df.copy()
        # Collapse every other category into 'Khac' ('Other' in Vietnamese),
        # turning this into a binary task; see the toy illustration after
        # this function.
        classifier.loc[classifier['label'] != category, 'label'] = 'Khac'
        train_x, test_x, train_y, test_y = model_selection.train_test_split(
            classifier['text'], classifier['label'])

        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     stop_words=collect_data.get_stop_word(),
                                     max_features=5000)
        # Fit on the training split only, so test-set vocabulary and IDF
        # statistics do not leak into evaluation.
        tfidf_vect.fit(train_x)
        xtrain_tfidf = tfidf_vect.transform(train_x)
        xtest_tfidf = tfidf_vect.transform(test_x)

        logistic_classifier = LogisticRegression(multi_class='ovr',
                                                 solver='sag',
                                                 C=10)

        # 'neg_log_loss' is the negated log loss, so values closer to 0 are better.
        cv_loss = np.mean(
            cross_val_score(logistic_classifier,
                            xtrain_tfidf,
                            train_y,
                            cv=5,
                            scoring='neg_log_loss'))
        losses.append(cv_loss)
        print('CV Log_loss score for class {} is {}'.format(category, cv_loss))

        cv_score = np.mean(
            cross_val_score(logistic_classifier,
                            xtrain_tfidf,
                            train_y,
                            cv=5,
                            scoring='accuracy'))
        print('CV Accuracy score for class {} is {}'.format(
            category, cv_score))

        logistic_classifier.fit(xtrain_tfidf, train_y)
        y_pred = logistic_classifier.predict(xtest_tfidf)
        # classes_ is sorted alphabetically, so hard-coding column 1 of
        # predict_proba picks the wrong class for some categories; look up
        # the column for `category`, and binarize the labels so
        # roc_auc_score accepts them.
        pos_index = list(logistic_classifier.classes_).index(category)
        y_pred_prob = logistic_classifier.predict_proba(xtest_tfidf)[:, pos_index]
        auc_score = metrics.roc_auc_score((test_y == category).astype(int),
                                          y_pred_prob)
        auc.append(auc_score)
        print("CV ROC_AUC score {}\n".format(auc_score))

        print(confusion_matrix(test_y, y_pred))
        print(classification_report(test_y, y_pred))
    print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
    print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))
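
The 'Khac' relabeling inside the loop is what turns the multi-class data into one binary task per category; a toy illustration of that step (the DataFrame values are made up):

import pandas as pd

toy = pd.DataFrame({'text': ['t1', 't2', 't3'],
                    'label': ['sport', 'news', 'sport']})
binary = toy.copy()
binary.loc[binary['label'] != 'sport', 'label'] = 'Khac'
print(binary['label'].tolist())  # ['sport', 'Khac', 'sport']
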
Example #3
def train_tuning():
    # Grid-search a TF-IDF + one-vs-rest logistic regression pipeline and
    # cache the fitted search on disk so it is only trained once.
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        df['text'], df['label'])
    if os.path.isfile('./data/model_train'):
        # Reuse the previously fitted grid search if it has been cached.
        with open('./data/model_train', 'rb') as vec:
            grid3 = pickle.load(vec)
    else:
        start_time = time.time()
        pipe = make_pipeline(
            TfidfVectorizer(analyzer='word',
                            token_pattern=r'\w{1,}',
                            stop_words=collect_data.get_stop_word()),
            OneVsRestClassifier(LogisticRegression()))
        param_grid = {
            'tfidfvectorizer__max_features': [5000, 10000],
            'onevsrestclassifier__estimator__solver': ['liblinear', 'sag'],
        }
        grid = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy')

        grid3 = grid.fit(train_x, train_y)

        end_time = time.time()
        print("total time", end_time - start_time)

        # Cache the fitted grid search so later runs can skip training.
        with open('./data/model_train', 'wb') as save_classifier:
            pickle.dump(grid3, save_classifier)

    print(grid3.best_estimator_.named_steps['onevsrestclassifier'])
    print(grid3.best_estimator_.named_steps['tfidfvectorizer'])

    print(grid3.best_params_)
    print(grid3.best_score_)
    predicted_y_test = grid3.predict(test_x)

    X_test_list = test_x.tolist()
    predicted_y_test_list = predicted_y_test.tolist()

    save = pd.DataFrame(np.column_stack([X_test_list, predicted_y_test_list]))
    save.to_csv("./data/result_trained.csv",
                sep=',',
                encoding='utf-16',
                header=True,
                index=False)
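
Once train_tuning has cached the fitted grid search, later runs can reload it for inference. A minimal sketch, assuming the same cache path; predict_from_cache is a hypothetical helper, not part of the original code:

def predict_from_cache(texts):
    # Reload the pickled GridSearchCV; the pipeline inside it handles
    # TF-IDF vectorizing, so raw strings can be passed straight in.
    with open('./data/model_train', 'rb') as f:
        grid = pickle.load(f)
    return grid.predict(texts)
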
def clustering_word():
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tfidf_vect = TfidfVectorizer(
        analyzer='word',
        # Keep Latin letters, Vietnamese diacritics, digits and underscores.
        token_pattern=
        r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        lowercase=True,
        ngram_range=(1, 4),
        stop_words=collect_data.get_stop_word(),
        max_features=10000)
    # fit() returns the vectorizer itself, so no separate model object is needed.
    tfidf_vect.fit(data["text"])
    # bag_of_words = tfidf_vect.transform(data["text"])
    # get_feature_names() was removed in scikit-learn 1.2; use the
    # get_feature_names_out() replacement.
    feature_names = tfidf_vect.get_feature_names_out()
    print(feature_names, len(feature_names))
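
clustering_word only fits the vectorizer and inspects the vocabulary; the clustering step itself is not shown anywhere in the listing. A minimal sketch of clustering the TF-IDF vectors with KMeans, where cluster_texts and the cluster count are assumptions:

from sklearn.cluster import KMeans

def cluster_texts(n_clusters=10):
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tfidf_vect = TfidfVectorizer(stop_words=collect_data.get_stop_word(),
                                 max_features=10000)
    X = tfidf_vect.fit_transform(data['text'])
    # KMeans accepts the sparse TF-IDF matrix directly.
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    return km.fit_predict(X)  # one cluster id per document
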
def calc_tfidf_category(category, top=10):
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tmp = data[data['label'] == category]

    return get_top(tmp["text"], top)
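
For contrast with calc_tfidf_new above: calc_tfidf_category ranks terms over the whole category at once rather than per document. A made-up call, where the label value 'the_thao' is an assumption:

category_terms = calc_tfidf_category('the_thao', top=10)  # one ranked list for the category
per_doc_terms = calc_tfidf_new('the_thao', top=10)        # dict: doc index -> its top terms
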
def clustering_word():
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    return data["text"]