def model_one_with_all(df):
    classifier_params = {}
    for index, category in enumerate(collect_data.LABELS):
        classifier = df.copy()  # work on a fresh copy of the DataFrame
        classifier.loc[classifier['label'] != category, 'label'] = 'Khac'
        size_polipatics_society = classifier[classifier['label'] == category].shape[0]
        size_others = classifier[classifier['label'] == 'Khac'].shape[0]
        print('Number of politics-society documents: %s' % size_polipatics_society)
        print('Number of other documents: %s' % size_others)

        train_y = classifier['label']
        train_x = classifier['text']
        # split the dataset into training and test datasets
        # print(train_x[165], train_y[165])

        # label encode the target variable, encode labels to 0 or 1
        encoder = preprocessing.LabelEncoder()
        train_y = encoder.fit_transform(train_y)

        # word level tf-idf
        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     stop_words=collect_data.get_stop_word(),
                                     max_features=5000)
        tfidf_vect.fit(train_x)
        xtrain_tfidf = tfidf_vect.transform(train_x)

        # Getting transformed training and testing dataset
        print('Number of training documents: %s' % str(xtrain_tfidf.shape[0]))
        print('Number of features of each document: %s' % str(xtrain_tfidf.shape[1]))
        print('xtrain_tfidf shape: %s' % str(xtrain_tfidf.shape))
        print('train_y shape: %s' % str(train_y.shape))

        ### START CODE HERE ###
        train_y = np.expand_dims(train_y, axis=0)
        # for convenience in this exercise, we also use toarray() to convert
        # the sparse matrix to a dense matrix
        xtrain_tfidf = xtrain_tfidf.T.toarray()
        ### END CODE HERE ###

        # New shapes
        print('xtrain_tfidf shape: %s' % str(xtrain_tfidf.shape))
        print('train_y shape: %s' % str(train_y.shape))

        d = functions.model_one_vs_all(xtrain_tfidf, train_y,
                                       num_iterations=3000,
                                       learning_rate=.5,
                                       print_cost=True)
        classifier_params[category] = {'w': d['w'], 'b': d['b']}
    return classifier_params
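# A minimal prediction sketch for the parameters returned by model_one_with_all().
# It assumes functions.model_one_vs_all() returns a weight column vector d['w'] of
# shape (n_features, 1) and a scalar bias d['b'], as the call above suggests; the
# helper name predict_one_vs_all is ours and is not part of this project.
def predict_one_vs_all(classifier_params, x_tfidf):
    """Pick the label whose binary classifier gives the highest sigmoid score.

    x_tfidf: dense column vector of shape (n_features, 1), i.e. one document
    transformed by the same TfidfVectorizer used at training time.
    """
    best_label, best_score = None, -np.inf
    for label, params in classifier_params.items():
        w, b = params['w'], params['b']
        z = float(np.dot(w.T, x_tfidf) + b)
        score = 1.0 / (1.0 + np.exp(-z))  # sigmoid(w^T x + b)
        if score > best_score:
            best_label, best_score = label, score
    return best_label, best_score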
def multi_classifires():
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    losses = []
    auc = []
    for category in collect_data.LABELS:
        classifier = df.copy()
        classifier.loc[classifier['label'] != category, 'label'] = 'Khac'
        train_x, test_x, train_y, test_y = model_selection.train_test_split(
            classifier['text'], classifier['label'])

        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     stop_words=collect_data.get_stop_word(),
                                     max_features=5000)
        tfidf_vect.fit(classifier['text'])
        xtrain_tfidf = tfidf_vect.transform(train_x)
        xtest_tfidf = tfidf_vect.transform(test_x)

        logistic_classifier = LogisticRegression(multi_class='ovr', solver='sag', C=10)

        cv_loss = np.mean(
            cross_val_score(logistic_classifier, xtrain_tfidf, train_y,
                            cv=5, scoring='neg_log_loss'))
        losses.append(cv_loss)
        print('CV Log_loss score for class {} is {}'.format(category, cv_loss))

        cv_score = np.mean(
            cross_val_score(logistic_classifier, xtrain_tfidf, train_y,
                            cv=5, scoring='accuracy'))
        print('CV Accuracy score for class {} is {}'.format(category, cv_score))

        logistic_classifier.fit(xtrain_tfidf, train_y)
        y_pred = logistic_classifier.predict(xtest_tfidf)
        y_pred_prob = logistic_classifier.predict_proba(xtest_tfidf)[:, 1]
        auc_score = metrics.roc_auc_score(test_y, y_pred_prob)
        auc.append(auc_score)
        print("CV ROC_AUC score {}\n".format(auc_score))
        print(confusion_matrix(test_y, y_pred))
        print(classification_report(test_y, y_pred))

    print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
    print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))
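# Note on the predict_proba[:, 1] slice above: column 1 corresponds to classes_[1],
# the lexicographically larger of the two labels, which may be 'Khac' rather than the
# category itself. Recent scikit-learn versions binarize binary string labels with the
# same convention in roc_auc_score, so the AUC printed above is consistent either way,
# but if the probability of the category itself is needed, it is safer to look the
# column up explicitly. A minimal sketch (the helper name positive_class_proba is
# ours, not part of this project):
def positive_class_proba(clf, X, positive_label):
    """Return P(positive_label) per row, wherever that class landed in clf.classes_."""
    pos_index = list(clf.classes_).index(positive_label)
    return clf.predict_proba(X)[:, pos_index]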
def train_tunning():
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        df['text'], df['label'])

    if os.path.isfile('./data/model_train'):
        vec = open("./data/model_train", 'rb')  # rb = read in bytes
        grid3 = pickle.load(vec)
        vec.close()
    else:
        start_time = time.time()
        pipe = make_pipeline(
            TfidfVectorizer(analyzer='word',
                            token_pattern=r'\w{1,}',
                            stop_words=collect_data.get_stop_word()),
            OneVsRestClassifier(LogisticRegression()))
        param_grid = {
            'tfidfvectorizer__max_features': [5000, 10000],
            'onevsrestclassifier__estimator__solver': ['liblinear', 'sag'],
        }
        grid = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy')
        grid3 = grid.fit(train_x, train_y)
        end_time = time.time()
        print("total time", end_time - start_time)

        save_classifier = open("./data/model_train", 'wb')  # wb = write in bytes
        # use pickle to dump the fitted grid3 to './data/model_train'
        pickle.dump(grid3, save_classifier)
        save_classifier.close()

    print(grid3.best_estimator_.named_steps['onevsrestclassifier'])
    print(grid3.best_estimator_.named_steps['tfidfvectorizer'])
    print(grid3.best_params_)
    print(grid3.best_score_)

    predicted_y_test = grid3.predict(test_x)
    X_test_list = test_x.tolist()
    predicted_y_test_list = predicted_y_test.tolist()
    save = pd.DataFrame(np.column_stack([X_test_list, predicted_y_test_list]))
    save.to_csv("./data/result_trained.csv", sep=',', encoding='utf-16',
                header=True, index=False)
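# A minimal usage sketch, assuming train_tunning() has already pickled the fitted
# GridSearchCV to ./data/model_train and that new documents are word-segmented the
# same way as the training data. The helper name classify_new_documents and the
# example document are ours, not part of this project; pickle is already used by
# train_tunning() above.
def classify_new_documents(docs, model_path='./data/model_train'):
    """Load the pickled grid search and predict labels for segmented texts."""
    with open(model_path, 'rb') as f:
        grid = pickle.load(f)
    return grid.predict(docs)

# Example (hypothetical, word-segmented input):
# print(classify_new_documents(['giá_vàng trong nước tăng mạnh trong phiên giao_dịch']))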
def clustering_word():
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tfidf_vect = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        lowercase=True,
        ngram_range=(1, 4),
        stop_words=collect_data.get_stop_word(),
        max_features=10000)
    count_train = tfidf_vect.fit(data["text"])
    # bag_of_words = tfidf_vect.transform(data)
    # feature_names = np.array(count_train.get_feature_names())
    print(count_train.get_feature_names(), len(count_train.get_feature_names()))
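# clustering_word() above only fits the vectorizer and prints its vocabulary. If
# clustering of the documents themselves is intended, a minimal sketch using
# scikit-learn's KMeans on the TF-IDF matrix could look like the following; the
# helper name cluster_documents and the cluster count are assumptions, not part of
# this project.
def cluster_documents(texts, n_clusters=8):
    """Cluster word-segmented documents by their TF-IDF vectors."""
    from sklearn.cluster import KMeans  # local import for this sketch only
    vect = TfidfVectorizer(analyzer='word',
                           token_pattern=r'\w{1,}',
                           stop_words=collect_data.get_stop_word(),
                           max_features=10000)
    X = vect.fit_transform(texts)
    km = KMeans(n_clusters=n_clusters, random_state=0)
    labels = km.fit_predict(X)
    return labels, km

# Example:
# labels, km = cluster_documents(collect_data.readData(TRAIN_DATA, TRAINING_FILE)['text'])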
def get_top(data, top=10):
    tfidf_vect = TfidfVectorizer(
        analyzer='word',
        token_pattern=r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        lowercase=True,
        ngram_range=(1, 4),
        stop_words=collect_data.get_stop_word(),
        max_features=10000)
    count_train = tfidf_vect.fit(data)
    bag_of_words = tfidf_vect.transform(data)
    feature_names = np.array(count_train.get_feature_names())
    max_val = bag_of_words.max(axis=0).toarray().ravel()
    # sort weights from smallest to biggest and extract their indices
    sort_by_tfidf = max_val.argsort()
    return feature_names[sort_by_tfidf[-top:]]
def get_top(data, top=10):
    # NOTE: this second definition shadows the get_top() above; it uses an
    # ASCII-only token pattern and a max_df cutoff instead of n-grams.
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 token_pattern=r'[a-zA-Z0-9_]+',
                                 lowercase=True,
                                 max_df=0.05,
                                 stop_words=collect_data.get_stop_word(),
                                 max_features=5000)
    count_train = tfidf_vect.fit(data)
    bag_of_words = tfidf_vect.transform(data)
    feature_names = np.array(count_train.get_feature_names())
    print(feature_names)
    max_val = bag_of_words.max(axis=0).toarray().ravel()
    # sort weights from smallest to biggest and extract their indices
    sort_by_tfidf = max_val.argsort()
    return feature_names[sort_by_tfidf[-top:]]
rút danh_sách này xuống còn theo đúng quy_định của fifa và thời_hạn để ông làm_việc này là trước ngày ngoài cầu_thủ được dự_định sẽ đưa đến đức hlv beenhakker cũng đã quyết_định triệu_tập thêm cầu_thủ dự_bị và các cầu_thủ này sẽ được lựa_chọn nếu một trong số cầu_thủ chính_thức bất_ngờ bị chấn_thương danh_sách cầu_thủ của trinidad amp tobago thủ_môn kelvin_jack dundee shaka_hislop west_ham clayton_ince coventry_city hậu_vệ dennis_lawrence wrexham cyd_gray san_juan_jabloteh marvin_andrews rangers brent_sancho gillingham ian_cox gillingham atiba_charles w_connection avery_john new_england_revolution tiền_vệ silvio_spann unattached chris_birchall port_vale aurtis_whitley san_juan_jabloteh anthony_rougier united_petrotrin anthony_wolfe san_juan_jabloteh densill_theobald falkirk carlos_edwards luton dwight_yorke sydney_fc russell_latapy falkirk tiền_đạo stern_john coventry kenwyne_jones southampton collin_samuel dundee jason_scotland st_johnstone cornell_glen la_galaxy dự_bị brent_rahim jabloteh anton_pierre defence_force anthony_warner fulham nigel_henry kiruna_ff ricky_shakes swindon hector_sam port_vale scott_sealy kansas_wizards"""

print(get_top([corpus]))
# print(len(corpus))
print(get_top_n_words([corpus]))
# vectorizer = create_vectorizer([corpus])
vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
    # max_df=0.05,
    # stop_words='english',
    encoding='utf-16',
    stop_words=collect_data.get_stop_word(),
    max_features=5000)
tfidf_result = vectorizer.fit_transform([corpus])
display_scores(vectorizer, tfidf_result)
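# display_scores() is called above but not defined in this section. A minimal sketch
# of what such a helper could do (print features ranked by their summed TF-IDF
# weight); the name display_scores_sketch is ours, chosen so as not to shadow the
# project's own implementation, and the ranking criterion is an assumption.
def display_scores_sketch(vectorizer, tfidf_result, top=20):
    """Print the `top` features with the highest total TF-IDF weight."""
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for term, score in sorted_scores[:top]:
        print('{:30s} {:.4f}'.format(term, score))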