def test(X, y, rand, i):
    """Hold out 20% of the data, fit an MSHMM with state-space size i and
    return (accuracy, f1) on the held-out split."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=rand)
    clf = MSHMM(2, i).fit(X_train, y_train)
    predicts = clf.predict(X_test)
    _, acc, f1 = model_stats.cm_acc_f1(y_test, predicts)
    return acc, f1
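# Usage sketch (illustrative, not part of the original module): average the
# hold-out scores of test() over several random splits, mirroring the restart
# logic in main() below. The helper name mean_test_score and the default of
# 10 restarts are assumptions.
def mean_test_score(X, y, i, restarts=10):
    scores = [test(X, y, rand=r, i=i)
              for r in random.sample(range(1, 1000), restarts)]
    accs, f1s = zip(*scores)
    return np.mean(accs), np.mean(f1s)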
def save_model(file_name, out_file):
    """Fit HMMs with 1-15 states on the Danish data and dump the model with
    the best F1 score on the held-out split to out_file."""
    danish_data, _ = hmm_data_loader.get_hmm_data(filename=file_name)
    danish_data_X = [x[1] for x in danish_data]
    danish_data_y = [x[0] for x in danish_data]
    X_train, X_test, y_train, y_test = train_test_split(
        danish_data_X, danish_data_y, test_size=0.20, random_state=42,
        stratify=danish_data_y)
    best_model = None
    # Track the best scores across all state-space sizes; resetting these
    # inside the loop would let any model with f1 > 0 overwrite the "best".
    best_acc = 0.0
    best_f1 = 0.0
    for s in range(1, 16):
        clf = HMM(s).fit(X_train, y_train)
        da_predicts = clf.predict(X_test)
        _, acc_t_da, f1_t_da = model_stats.cm_acc_f1(y_test, da_predicts)
        if f1_t_da > best_f1:
            best_acc = acc_t_da
            best_f1 = f1_t_da
            best_model = clf
    dump(best_model, out_file)
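# Usage sketch: restoring a model written by save_model. This assumes `dump`
# above is joblib.dump, so joblib.load is its counterpart; the helper name
# load_model is an illustrative addition.
def load_model(model_file):
    from joblib import load
    return load(model_file)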
def train_eng_test_danish(file_en, file_da, distribution_voter, min_length=1):
    # Load Danish test data; merge label 2 into 1 to match the binary
    # English labels.
    danish_data, emb_size_da = hmm_data_loader.get_hmm_data(filename=file_da)
    danish_data_X = [x[1] for x in danish_data]
    danish_data_y = [x[0] if x[0] != 2 else 1 for x in danish_data]
    # Load English (SemEval) training data.
    data_train, _ = hmm_data_loader.get_semeval_hmm_data(filename=file_en)
    y_train = [x[1] for x in data_train]
    X_train = [x[2] for x in data_train]
    print("%-20s%10s%10s%10s" % ('event', 'components', 'accuracy', 'f1'))
    if distribution_voter:
        clf = DistributionVoter().fit(X_train, y_train)
        predicts = clf.predict(danish_data_X)
        _, acc, f1 = model_stats.cm_acc_f1(danish_data_y, predicts)
        print("%-20s%-10s%10.2f%10.2f" % ('danish', '-', acc, f1))
    else:
        best_acc = 0.0
        best_f1 = 0.0
        best_s = None
        for s in range(1, 16):
            # Try out different random configurations (a single restart for now).
            for c in range(1):
                # Test on the Danish data.
                clf = HMM(s).fit(X_train, y_train)
                da_predicts = clf.predict(danish_data_X)
                cm, acc_t_da, f1_t_da = model_stats.cm_acc_f1(
                    danish_data_y, da_predicts)
                # print(cm)
                if f1_t_da > best_f1:
                    best_acc = acc_t_da
                    best_f1 = f1_t_da
                    best_s = s
        print("%-20s%-10s%10.2f%10.2f" % ('danish', best_s, best_acc, best_f1))
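# Hedged sketch (an assumption, not the project's implementation): judging by
# its name and use above, DistributionVoter could be a baseline that predicts
# by drawing labels from the training label distribution. A minimal version:
class DistributionVoterSketch:
    def fit(self, X, y):
        # Record how often each label occurs in the training data.
        labels, counts = np.unique(y, return_counts=True)
        self.labels_ = labels
        self.probs_ = counts / counts.sum()
        return self

    def predict(self, X):
        # Sample one label per test instance from the training distribution.
        return np.random.choice(self.labels_, size=len(X), p=self.probs_)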
def Loo_event_test(file_en, min_length, print_distribution):
    """Leave-one-event-out evaluation on the SemEval events."""
    # Load data and group it by event.
    data_train, _ = hmm_data_loader.get_semeval_hmm_data(filename=file_en)
    events = loadEvents(data_train, print_dist=print_distribution,
                        min_len=min_length)
    event_list = [(k, v) for k, v in events.items()]
    print("%-20s%10s%10s%10s" % ('event', 'components', 'accuracy', 'f1'))
    for i in range(len(event_list)):
        test_event, test_vec = event_list[i]
        # Train on every event except the held-out one.
        train = [vec for e, vec in event_list if e != test_event]
        train = flatten(train)
        y_train = [x[0] for x in train]
        X_train = [x[1] for x in train]
        test_vec = [x for x in test_vec if len(x[1]) >= min_length]
        # Partition test data and y.
        y_test = [x[0] for x in test_vec]
        X_test = [x[1] for x in test_vec]
        for s in range(1, 16):
            best_acc = 0.0
            best_f1 = 0.0
            # Try out different random configurations (a single restart for now).
            for c in range(1):
                clf = HMM(s).fit(X_train, y_train)
                predicts = clf.predict(X_test)
                assert len(y_test) == len(predicts), \
                    "The length of y_test does not match number of predictions"
                _, acc_t, f1_t = model_stats.cm_acc_f1(y_test, predicts)
                # Save results from the restart with the best F1 score.
                if f1_t > best_f1:
                    best_acc = acc_t
                    best_f1 = f1_t
            print("%-20s%-10s%10.2f%10.2f" % (test_event, s, best_acc, best_f1))
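# Alternative sketch (not from the original code): the hand-rolled event loop
# above mirrors scikit-learn's LeaveOneGroupOut, where each event id acts as
# a group label. The `groups` argument (one event name per sample) is an
# assumption about how the data would be flattened.
def loo_event_split(X, y, groups):
    from sklearn.model_selection import LeaveOneGroupOut
    for train_i, test_i in LeaveOneGroupOut().split(X, y, groups=groups):
        yield ([X[i] for i in train_i], [y[i] for i in train_i],
               [X[i] for i in test_i], [y[i] for i in test_i])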
def CV_sup(clfs, name, X, y, X_sup, y_sup, k_folds=5):
    """Stratified shuffle-split CV where each supplementary sample in X_sup
    is an (original, copy) pair; both members share the label y_sup[i].
    `rand` is a module-level random state."""
    clf = clfs[name]
    rs = StratifiedShuffleSplit(n_splits=k_folds, test_size=.2,
                                random_state=rand)
    f1s = []
    accs = []
    for (train_i, test_i), (suptrain_i, suptest_i) in zip(
            rs.split(np.zeros(len(X)), y),
            rs.split(np.zeros(len(X_sup)), y_sup)):
        X_train = [X[i] for i in train_i]
        for i in suptrain_i:
            x_sup_orig, x_sup_cp = X_sup[i]
            X_train.append(x_sup_orig)
            X_train.append(x_sup_cp)
        X_test = [X[i] for i in test_i]
        for i in suptest_i:
            x_sup_orig, x_sup_cp = X_sup[i]
            X_test.append(x_sup_orig)
            X_test.append(x_sup_cp)
        y_train = [y[i] for i in train_i]
        for i in suptrain_i:
            # The label applies to both the original and its copy.
            y_train.append(y_sup[i])
            y_train.append(y_sup[i])
        y_test = [y[i] for i in test_i]
        for i in suptest_i:
            y_test.append(y_sup[i])
            y_test.append(y_sup[i])
        clf.fit(X_train, y_train)
        y_true, y_pred = y_test, clf.predict(X_test)
        _, acc, f1 = model_stats.cm_acc_f1(y_true, y_pred)
        f1s.append(f1)
        accs.append(acc)
    f1s = np.asarray(f1s)
    accs = np.asarray(accs)
    s = "%-20s%s %0.4f (+/- %0.2f) %s %0.4f (+/- %0.2f)" \
        % (name, 'f1', f1s.mean(), f1s.std() * 2,
           'acc', accs.mean(), accs.std() * 2)
    print(s)
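# Usage sketch (illustrative; the estimator and pair contents are assumptions):
# X_sup pairs each supplementary sample with a copy, e.g. an augmented variant.
#
#   clfs = {'tree': DecisionTreeClassifier()}
#   X_sup = [(orig_vec, augmented_vec), ...]
#   y_sup = [label, ...]              # one label per pair
#   CV_sup(clfs, 'tree', X, y, X_sup, y_sup, k_folds=5)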
def main(argv):
    parser = argparse.ArgumentParser(
        description='Rumour veracity classification via multinomial hmm')
    parser.add_argument('-ftrain', '--file_train',
                        default='../data/hmm/hmm_data_branch_time.csv',
                        help='Train file. Tests on this if no test file is given.')
    parser.add_argument('-ftest', '--file_test', help='Test file')
    parser.add_argument('-sem_tr', '--sem_data_train', action='store_true',
                        default=False, help='Read train in semeval data format')
    parser.add_argument('-sem_te', '--sem_data_test', action='store_true',
                        default=False, help='Read test in semeval data format')
    parser.add_argument('-loo', '--loo', action='store_true', default=False,
                        help='Test leave one out on 5 semeval events')
    parser.add_argument('-mix', '--mix', action='store_true', default=False,
                        help='Test kfold on a mix of danish and pheme data')
    parser.add_argument('-kf', '--k_folds', type=int,
                        help='Number of folds to do in cross validation')
    parser.add_argument('-rs', '--restarts', type=int,
                        help='Number of times to restart with new random state')
    parser.add_argument('-rand', '--rand_state', default=42, type=int,
                        help='Specific random state')
    parser.add_argument('-s', '--state_size', type=int,
                        help='Specify the size of the state space for the model')
    args = parser.parse_args(argv)

    if args.file_train:
        data = get_data(args.file_train, args.sem_data_train)
        y = [x[0] for x in data]
        X = [x[1] for x in data]

    if args.file_test:
        data_test = get_data(args.file_test, args.sem_data_test)
        y_test = [x[0] for x in data_test]
        X_test = [x[1] for x in data_test]
        if args.mix and args.k_folds:
            # Evaluate on the combined (mixed) data set via k-fold CV.
            X.extend(X_test)
            y.extend(y_test)
            if args.state_size:
                cross_val(MSHMM(2, args.state_size), X, y, args.k_folds,
                          args.rand_state, args.state_size)
                skf = StratifiedKFold(n_splits=args.k_folds, shuffle=True,
                                      random_state=args.rand_state)
                predicts = cross_val_predict(MSHMM(2, args.state_size), X, y,
                                             cv=skf)
                # Predictions are aligned with the mixed labels y.
                cm, acc, f1 = model_stats.cm_acc_f1(y, predicts)
                print(cm)
            else:
                for i in range(1, 16):
                    cross_val(MSHMM(2, i), X, y, args.k_folds,
                              args.rand_state, i)
        else:
            # Train on the train file and evaluate once on the test file.
            print("%-10s%10s%10s" % ('state space', 'acc', 'f1'))
            if args.state_size:
                clf = MSHMM(2, args.state_size).fit(X, y)
                predicts = clf.predict(X_test)
                cm, acc, f1 = model_stats.cm_acc_f1(y_test, predicts)
                print("%-10s%10.2f%10.2f" % (args.state_size, acc, f1))
                print("\n")
                print(cm)
            else:
                for i in range(1, 16):
                    clf = MSHMM(2, i).fit(X, y)
                    predicts = clf.predict(X_test)
                    _, acc, f1 = model_stats.cm_acc_f1(y_test, predicts)
                    print("%-10s%10.2f%10.2f" % (i, acc, f1))
    elif args.loo:
        sem_data, _ = hmm_data_loader.get_semeval_hmm_data(
            filename=args.file_train)
        events = loadEvents(sem_data)
    else:
        if not args.k_folds:
            print("%-10s%-10s%10s%10s" % ('state space', 'rand', 'acc', 'f1'))
        if args.state_size:
            # Cross-validated predictions for a single state-space size.
            skf = StratifiedKFold(n_splits=args.k_folds, shuffle=True,
                                  random_state=args.rand_state)
            predicts = cross_val_predict(MSHMM(2, args.state_size), X, y,
                                         cv=skf)
            cm, acc, f1 = model_stats.cm_acc_f1(y, predicts)
            print("%-10s%10.2f%10.2f" % (args.state_size, acc, f1))
            print(cm)
        else:
            for i in range(1, 16):
                if args.k_folds:
                    if args.restarts:
                        # Restart with a fresh random state each time.
                        for r in random.sample(range(1, 1000), args.restarts):
                            cross_val(MSHMM(2, i), X, y, args.k_folds, r, i)
                        print("\n")
                    else:
                        cross_val(MSHMM(2, i), X, y, args.k_folds,
                                  args.rand_state, i)
                else:
                    best_acc = 0.0
                    best_f1 = 0.0
                    best_rand_state = None
                    best_i = None
                    if args.restarts:
                        for r in random.sample(range(1, 1000), args.restarts):
                            acc, f1 = test(X, y, r, i)
                            if f1 > best_f1:
                                best_f1 = f1
                                best_acc = acc
                                best_rand_state = r
                                best_i = i
                    else:
                        acc, f1 = test(X, y, args.rand_state, i)
                        best_acc = acc
                        best_f1 = f1
                        best_rand_state = args.rand_state
                        best_i = i
                    print("%-10s%-10s%10.2f%10.2f" %
                          (best_i, best_rand_state, best_acc, best_f1))
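# Example invocations (illustrative; the script name hmm.py is an assumption):
#
#   python hmm.py -kf 5 -s 4
#       5-fold CV on the default train file with a state-space size of 4.
#   python hmm.py -ftrain train.csv -ftest test.csv -s 4
#       Train on train.csv, evaluate once on test.csv.
#   python hmm.py -rs 10
#       Sweep state sizes 1-15, restarting 10 times with fresh random states.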
def parameter_search_LOO_features():
    """Randomized hyper-parameter search with leave-one-feature-group-out:
    for each feature group, disable it, tune on the remaining features and
    log the scores so each group's contribution can be compared."""
    for name, estimator, tuned_parameters in settings_rand:
        filepath = os.path.join(output_folder, name)
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        print("# Tuning hyper-parameters on F1 macro for %s" % name)
        stats_filename = '%s/parameter_stats_iter%d_k%d' % (filepath,
                                                            rand_iter, folds)
        if args.reduce_features:
            stats_filename += '_vt'
        # Write the CSV header only once per stats file.
        if not os.path.exists('%s.csv' % stats_filename):
            with open('%s.csv' % stats_filename, 'w+', newline='') as statsfile:
                csv_writer = csv.writer(statsfile)
                csv_writer.writerow(['estimator', 'f1_macro', 'acc',
                                     'LOO feature', 'parameters', 'features'])
        for feature_name in feature_names:
            results_filename = '%s/params_%s_iter%d_k%d' % (
                filepath, feature_name, rand_iter, folds)
            if not config[feature_name] or os.path.exists('%s.txt' % results_filename):
                print('Skipping %s since %s exists' % (feature_name,
                                                       results_filename))
                continue
            if feature_name == 'all':
                print('Running with all features enabled')
            else:
                print('Leaving %s features out' % feature_name)
                config[feature_name] = False
            X_train_ = data_loader.select_features(X_train, feature_mapping,
                                                   config)
            X_test_ = data_loader.select_features(X_test, feature_mapping,
                                                  config)
            if args.reduce_features:
                old_len = len(X_train_[0])
                X_train_, X_test_ = data_loader.union_reduce_then_split(
                    X_train_, X_test_)
                new_len = len(X_train_[0])
                print('Reduced features from %d to %d' % (old_len, new_len))
                results_filename += '_vt%d' % old_len
            start = time.time()
            with open('%s.txt' % results_filename, 'a+') as outfile, \
                    open('%s.csv' % stats_filename, 'a', newline='') as statsfile:
                csv_writer = csv.writer(statsfile)
                clf = RandomizedSearchCV(estimator, tuned_parameters,
                                         scoring=scorer, n_jobs=-1,
                                         error_score=0, n_iter=rand_iter,
                                         verbose=1, cv=skf,
                                         iid=False,  # deprecated in scikit-learn 0.22, removed in 0.24
                                         return_train_score=False,
                                         pre_dispatch='2*n_jobs',
                                         random_state=rand)
                clf.fit(X_train_, y_train)
                s = "Best parameters set found on development set for F1 macro:"
                print(s)
                outfile.write(s + '\n')
                print()
                s = "%0.3f for %r" % (clf.best_score_, clf.best_params_)
                print(s)
                outfile.write(s + '\n')
                print()
                s = "Grid scores on development set:"
                print(s)
                outfile.write(s + '\n')
                print()
                results = clf.cv_results_
                means = results['mean_test_score']
                stds = results['std_test_score']
                for mean, std, params in zip(means, stds,
                                             clf.cv_results_['params']):
                    s = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
                    print(s)
                    outfile.write(s + '\n')
                print()
                outfile.write('Classification report for results on '
                              'evaluation set:' + '\n')
                print("Classification report for results on evaluation set:")
                y_true, y_pred = y_test, clf.predict(X_test_)
                outfile.write(classification_report(y_true, y_pred))
                outfile.write('\n')
                cm, acc, f1 = cm_acc_f1(y_true, y_pred)
                outfile.write(np.array2string(cm))
                outfile.write('\n')
                print('acc: %.4f' % acc)
                outfile.write('acc: %.4f\n' % acc)
                print('f1 macro: %.4f' % f1)
                outfile.write('f1 macro: %.4f\n\n' % f1)
                print()
                csv_writer.writerow([name, '%.4f' % f1, '%.4f' % acc,
                                     feature_name, clf.best_params_, config])
            # Re-enable the feature group for the next iteration.
            if not feature_name == 'all':
                config[feature_name] = True
            end = time.time()
            print('Done with %s features' % feature_name)
            print('Took %.1f seconds' % (end - start))
        print('Done with', name)


# if args.v:
#     parameter_search_LOO_features()
# else:
#     X_train_ = data_loader.select_features(X_train, feature_mapping, config)
#     X_test_ = data_loader.select_features(X_test, feature_mapping, config)
#     old_len = len(X_train_[0])
#     X_train_, X_test_ = data_loader.union_reduce_then_split(X_train_, X_test_)
#     new_len = len(X_train_[0])
#     print('Reduced features from %d to %d' % (old_len, new_len))
#     parameter_search_rand_VT(X_train_, X_test_, y_train, y_test)
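# Illustrative sketch of what one settings_rand entry could look like; the
# estimators and parameter distributions actually used are not shown in this
# file, so the names below are assumptions.
#
#   from scipy.stats import loguniform
#   settings_rand = [
#       ('svm', SVC(), {'C': loguniform(1e-3, 1e3),
#                       'gamma': loguniform(1e-4, 1e0)}),
#   ]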
        results = clf.cv_results_
        means = results['mean_test_score']
        stds = results['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            s = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
            print(s)
            outfile.write(s + '\n')
        print()
        outfile.write('Classification report for results on evaluation set:' + '\n')
        print("Classification report for results on evaluation set:")
        y_true, y_pred = y_test, clf.predict(X_test_)
        outfile.write(classification_report(y_true, y_pred))
        outfile.write('\n')
        cm, acc, f1 = cm_acc_f1(y_true, y_pred)
        outfile.write(np.array2string(cm))
        outfile.write('\n')
        print('acc: %.4f' % acc)
        outfile.write('acc: %.4f\n' % acc)
        print('f1 macro: %.4f' % f1)
        outfile.write('f1 macro: %.4f\n\n' % f1)
        print()
        csv_writer.writerow([name, '%.4f' % f1, '%.4f' % acc, folds,
                             rand_iter, clf.best_params_, config])
        # end = time.time()
        # print('Took %.1f seconds' % (end - start))
        print('Done with', name)