def multinomial_nb(which_comments):
    """Score a multinomial naive Bayes classifier over repeated stratified splits.

    Loads the comment spreadsheet, draws 10 stratified shuffle splits
    (10% of the rows held out each time, stratified on the ``Type`` column),
    fits a fresh ``MultinomialNB`` per split and prints the weighted F1 score
    of every split followed by their mean.

    Args:
        which_comments: Not read anywhere in this function; kept so existing
            callers keep working.
            NOTE(review): presumably meant to select a data file - confirm.

    Returns:
        None. All results are written to standard output.
    """
    # Local import: only needed to build a separator-independent path.
    import os

    n_splits = 10  # used for the splitter and as the divisor of the mean

    print("=> Multinomial Bayes naive classifier")

    # os.path.join yields the same string as the former hard-coded
    # r'preprocessed_data\all_comments.xlsx' on Windows, while also resolving
    # correctly on POSIX systems, where a backslash is not a path separator.
    data_path = os.path.join('preprocessed_data', 'all_comments.xlsx')
    data_frame = pd.read_excel(data_path)

    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.1)
    score_sum = 0
    splits = sss.split(data_frame['Comment'], data_frame['Type'])
    for index, (train_index, test_index) in enumerate(splits, start=1):
        # Presumably rebuilds the train/test data that the preprocessing
        # get_* accessors below hand out - verify against that module.
        preprocessing.preprocess_train_test_data(train_index, test_index)

        mnb = MultinomialNB()
        mnb.fit(preprocessing.get_data_set(), preprocessing.get_data_labels())

        expected = preprocessing.get_test_labels()
        predicted = mnb.predict(preprocessing.get_test_set())
        score = f1_score(expected, predicted, average='weighted')
        score_sum = score_sum + score

        print("Score {}.: {:.2f}%".format(index, score * 100), end=" ")
        if index == 5:
            # Break the output line after the fifth score.
            print()

    print()
    print("Average: {:.2f}%".format(score_sum / n_splits * 100))
def compare_regularisation_functions(data_frame, rf, c=1.0):
    """Print weighted F1 scores of logistic regression for one penalty type.

    Runs 10 stratified shuffle splits (10% held out, stratified on the
    ``Type`` column), trains a new ``LogisticRegression`` on every split and
    prints each split's weighted F1 score followed by the mean.

    NOTE(review): SOURCE contains a second function with this exact name
    (the LinearSVC variant); if both live in the same module the later
    definition replaces the earlier one - confirm which is intended.

    Args:
        data_frame: Table providing the ``Comment`` and ``Type`` columns
            that drive the splitting.
        rf: Penalty name passed to ``LogisticRegression`` (e.g. ``'l1'`` or
            ``'l2'``); also printed upper-cased in the score label.
        c: Value for the ``C`` parameter of the model.

    Returns:
        None. Results are written to standard output.
    """
    # 'l1' gets the 'saga' solver, every other penalty gets 'lbfgs'
    # (scikit-learn's lbfgs solver does not support the L1 penalty).
    solver = 'saga' if rf == 'l1' else 'lbfgs'

    splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.1)
    folds = splitter.split(data_frame['Comment'], data_frame['Type'])

    scores = []
    for position, (train_index, test_index) in enumerate(folds, start=1):
        # Presumably refreshes the data returned by the get_* accessors
        # used below - verify against the preprocessing module.
        preprocessing.preprocess_train_test_data(train_index, test_index)

        model = LogisticRegression(penalty=rf, C=c, solver=solver, max_iter=15000)
        model.fit(preprocessing.get_data_set(), preprocessing.get_data_labels())

        truth = preprocessing.get_test_labels()
        guess = model.predict(preprocessing.get_test_set())
        fold_score = f1_score(truth, guess, average='weighted')
        scores.append(fold_score)

        print("Score({}) {}.: {:.2f}%".format(rf.upper(), position, fold_score * 100), end=" ")
        if position == 5:
            # Wrap the output line after the fifth score.
            print()

    print()
    print("Average: {:.2f}%".format(sum(scores) / 10 * 100))
def optimize_c_parameter():
    """Search for a logistic-regression ``C`` value with nested cross-validation.

    Builds a ``NestedCV`` search (5 outer folds, 5 inner folds) over a fixed
    list of ``C`` candidates, fits it on the data and labels supplied by the
    ``preprocessing`` accessors, and prints the arithmetic mean of the ``C``
    values found in ``best_inner_params_list``.

    Returns:
        None. The result is written to standard output.
    """
    c_candidates = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    models_param = {
        'max_iter': [15000],
        'C': c_candidates,
    }
    search_options = {'sqrt_of_score': True, 'randomized_search_iter': 30}

    nested_cv_search = NestedCV(
        model=LogisticRegression(),
        params_grid=models_param,
        outer_kfolds=5,
        inner_kfolds=5,
        cv_options=search_options,
    )

    features = preprocessing.get_data_set()
    labels = preprocessing.get_data_labels()
    nested_cv_search.fit(features, labels)

    # One parameter dict per entry of best_inner_params_list; average their C.
    chosen_c_values = [params['C'] for params in nested_cv_search.best_inner_params_list]
    optimized_c_value = np.mean(chosen_c_values)
    print("Optimized C: {:.3f}".format(optimized_c_value))
def compare_regularisation_functions(data_frame, rf, c=1):
    """Print weighted F1 scores of a linear SVM for one penalty type.

    Runs 10 stratified shuffle splits (10% held out, stratified on the
    ``Type`` column), trains a new ``LinearSVC`` on every split and prints
    each split's weighted F1 score followed by the mean.

    NOTE(review): SOURCE contains another function with this exact name
    (the LogisticRegression variant); if both live in the same module this
    later definition replaces the earlier one - confirm which is intended.

    Args:
        data_frame: Table providing the ``Comment`` and ``Type`` columns
            that drive the splitting.
        rf: Penalty name passed to ``LinearSVC`` (e.g. ``'l1'`` or ``'l2'``);
            also printed upper-cased in the score label.
        c: Value for the ``C`` parameter of the model.

    Returns:
        None. Results are written to standard output.
    """
    # The dual formulation is requested only for the 'l2' penalty; any other
    # penalty value results in dual=False.
    use_dual = rf == 'l2'

    splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.1)
    comments, types = data_frame['Comment'], data_frame['Type']

    running_total = 0
    run_number = 0
    for train_index, test_index in splitter.split(comments, types):
        run_number += 1

        # Presumably refreshes the data returned by the get_* accessors
        # used below - verify against the preprocessing module.
        preprocessing.preprocess_train_test_data(train_index, test_index)

        classifier = LinearSVC(penalty=rf, C=c, dual=use_dual, max_iter=15000)
        classifier.fit(preprocessing.get_data_set(), preprocessing.get_data_labels())

        actual = preprocessing.get_test_labels()
        predicted = classifier.predict(preprocessing.get_test_set())
        score = f1_score(actual, predicted, average='weighted')
        running_total = running_total + score

        print("Score({}) {}.: {:.2f}%".format(rf.upper(), run_number, score * 100), end=" ")
        if run_number == 5:
            # Wrap the output line after the fifth score.
            print()

    print()
    print("Average: {:.2f}%".format(running_total / 10 * 100))