import os
import pickle
import time

import numpy as np
import pandas as pd
from sklearn import metrics, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

import collect_data  # project-local module: readData, LABELS, get_stop_word

# TRAIN_DATA, TRAINING_FILE and get_top are assumed to be defined elsewhere
# in this module.


def calc_tfidf_new(category, top=10):
    """Return the top TF-IDF terms of each individual document in `category`."""
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tmp = data[data['label'] == category]
    top_words = {}
    for index, text in enumerate(tmp['text'].tolist()):
        top_words[index] = get_top([text], top).tolist()
    return top_words
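# `get_top` is referenced above and below but defined elsewhere in the
# project; the following hypothetical sketch (`_get_top_sketch` is not part
# of the original code) shows what it is assumed to do: rank the terms of
# the given documents by summed TF-IDF weight and return the top-N terms.
def _get_top_sketch(texts, top=10):
    vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    tfidf = vect.fit_transform(texts)
    scores = np.asarray(tfidf.sum(axis=0)).ravel()
    terms = np.array(vect.get_feature_names_out())
    return terms[np.argsort(scores)[::-1][:top]]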
def multi_classifiers():
    """Train one binary (one-vs-rest) logistic-regression classifier per label."""
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    losses = []
    auc = []
    for category in collect_data.LABELS:
        # Binarize the problem: keep the current category and map every
        # other label to 'Khac' ("Other" in Vietnamese).
        classifier = df.copy()
        classifier.loc[classifier['label'] != category, 'label'] = 'Khac'
        train_x, test_x, train_y, test_y = model_selection.train_test_split(
            classifier['text'], classifier['label'])
        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     stop_words=collect_data.get_stop_word(),
                                     max_features=5000)
        tfidf_vect.fit(classifier['text'])
        xtrain_tfidf = tfidf_vect.transform(train_x)
        xtest_tfidf = tfidf_vect.transform(test_x)
        logistic_classifier = LogisticRegression(multi_class='ovr',
                                                 solver='sag',
                                                 C=10)
        # 5-fold cross-validated log loss (sklearn reports it negated).
        cv_loss = np.mean(
            cross_val_score(logistic_classifier, xtrain_tfidf, train_y,
                            cv=5, scoring='neg_log_loss'))
        losses.append(cv_loss)
        print('CV Log_loss score for class {} is {}'.format(category, cv_loss))
        cv_score = np.mean(
            cross_val_score(logistic_classifier, xtrain_tfidf, train_y,
                            cv=5, scoring='accuracy'))
        print('CV Accuracy score for class {} is {}'.format(category, cv_score))
        logistic_classifier.fit(xtrain_tfidf, train_y)
        y_pred = logistic_classifier.predict(xtest_tfidf)
        # Probability of the positive class (classes_[1]) for ROC AUC.
        y_pred_prob = logistic_classifier.predict_proba(xtest_tfidf)[:, 1]
        auc_score = metrics.roc_auc_score(test_y, y_pred_prob)
        auc.append(auc_score)
        print('CV ROC_AUC score {}\n'.format(auc_score))
        print(confusion_matrix(test_y, y_pred))
        print(classification_report(test_y, y_pred))
    print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
    print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))
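# A self-contained toy illustration (hypothetical data, not part of the
# original code) of the per-class relabeling used in multi_classifiers():
# every row whose label differs from the target category is mapped to
# 'Khac', turning the multi-class problem into a binary one.
def _demo_relabeling():
    toy = pd.DataFrame({'text': ['a', 'b', 'c'],
                        'label': ['Sport', 'News', 'Sport']})
    toy.loc[toy['label'] != 'Sport', 'label'] = 'Khac'
    print(toy['label'].tolist())  # ['Sport', 'Khac', 'Sport']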
def train_tuning():
    """Grid-search a TF-IDF + one-vs-rest logistic-regression pipeline,
    caching the fitted search on disk."""
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        df['text'], df['label'])
    if os.path.isfile('./data/model_train'):
        # Reuse the previously fitted grid search.
        with open('./data/model_train', 'rb') as vec:
            grid3 = pickle.load(vec)
    else:
        start_time = time.time()
        pipe = make_pipeline(
            TfidfVectorizer(analyzer='word',
                            token_pattern=r'\w{1,}',
                            stop_words=collect_data.get_stop_word()),
            OneVsRestClassifier(LogisticRegression()))
        param_grid = {
            'tfidfvectorizer__max_features': [5000, 10000],
            'onevsrestclassifier__estimator__solver': ['liblinear', 'sag'],
        }
        grid = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy')
        grid3 = grid.fit(train_x, train_y)
        end_time = time.time()
        print('total time', end_time - start_time)
        # Persist the fitted grid search (./data/model_train) so later
        # runs can skip training.
        with open('./data/model_train', 'wb') as save_classifier:
            pickle.dump(grid3, save_classifier)
    print(grid3.best_estimator_.named_steps['onevsrestclassifier'])
    print(grid3.best_estimator_.named_steps['tfidfvectorizer'])
    print(grid3.best_params_)
    print(grid3.best_score_)
    predicted_y_test = grid3.predict(test_x)
    save = pd.DataFrame(
        np.column_stack([test_x.tolist(), predicted_y_test.tolist()]))
    save.to_csv('./data/result_trained.csv',
                sep=',',
                encoding='utf-16',
                header=True,
                index=False)
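# A minimal sketch of reusing the grid search persisted by train_tuning();
# `predict_with_saved_model` is not part of the original code and assumes
# ./data/model_train has already been written.
def predict_with_saved_model(texts):
    with open('./data/model_train', 'rb') as f:
        grid = pickle.load(f)
    return grid.predict(texts)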
def clustering_word():
    """Fit a TF-IDF vocabulary over the whole corpus and print it.

    The token pattern spells out the accented Vietnamese alphabet so that
    diacritics are kept inside tokens.
    """
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tfidf_vect = TfidfVectorizer(
        analyzer='word',
        token_pattern=
        r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        lowercase=True,
        ngram_range=(1, 4),
        stop_words=collect_data.get_stop_word(),
        max_features=10000)
    tfidf_vect.fit(data['text'])
    # get_feature_names_out() supersedes get_feature_names(), which was
    # removed in scikit-learn 1.2.
    feature_names = tfidf_vect.get_feature_names_out()
    print(feature_names, len(feature_names))
def calc_tfidf_category(category, top=10):
    """Return the top TF-IDF terms computed over all documents of `category`."""
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tmp = data[data['label'] == category]
    return get_top(tmp['text'], top)
def get_all_texts():
    """Return the raw text column of the training corpus."""
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    return data['text']
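if __name__ == '__main__':
    # Hypothetical entry point; the category passed to calc_tfidf_category()
    # must be one of collect_data.LABELS. Note that multi_classifiers() and
    # train_tuning() both train models and may take a while.
    multi_classifiers()
    train_tuning()
    print(calc_tfidf_category(collect_data.LABELS[0], top=10))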