def test(X, y, rand, i):
    """Hold out a stratified 20% split, fit an MSHMM(2, i) and score it.

    rand -- random state forwarded to train_test_split
    i    -- state-space size handed to MSHMM
    Returns (accuracy, f1) on the held-out split.
    """
    train_X, holdout_X, train_y, holdout_y = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=rand)

    model = MSHMM(2, i).fit(train_X, train_y)
    holdout_pred = model.predict(holdout_X)
    _, acc, f1 = model_stats.cm_acc_f1(holdout_y, holdout_pred)
    return acc, f1
def save_model(file_name, out_file):
    """Train HMMs with 1-15 hidden states on the danish data and dump the
    model with the best F1 score (on a held-out stratified 20% split) to
    out_file.

    file_name -- path forwarded to hmm_data_loader.get_hmm_data
    out_file  -- destination forwarded to dump()
    """
    danish_data, emb_size_da = hmm_data_loader.get_hmm_data(filename=file_name)

    danish_data_X = [x[1] for x in danish_data]
    danish_data_y = [x[0] for x in danish_data]

    X_train, X_test, y_train, y_test = train_test_split(danish_data_X,
                                                        danish_data_y,
                                                        test_size=0.20,
                                                        random_state=42,
                                                        stratify=danish_data_y)

    best_model = None
    # BUG FIX: best_acc/best_f1 used to be reset to 0.0 *inside* the loop,
    # so every state size with a non-zero F1 overwrote best_model and the
    # LAST model was saved instead of the best one. Initialise once here.
    best_acc = 0.0
    best_f1 = 0.0
    for s in range(1, 16):
        clf = HMM(s).fit(X_train, y_train)
        da_predicts = clf.predict(X_test)

        _, acc_t_da, f1_t_da = model_stats.cm_acc_f1(y_test, da_predicts)
        # keep the model with the best F1 seen so far
        if f1_t_da > best_f1:
            best_acc = acc_t_da
            best_f1 = f1_t_da
            best_model = clf

    dump(best_model, out_file)
def train_eng_test_danish(file_en, file_da, distribution_voter, min_length=1):
    """Train on the English SemEval data and evaluate on the danish data.

    When distribution_voter is truthy a single DistributionVoter is used;
    otherwise HMMs with 1-15 hidden states are tried and the best F1
    (with its accuracy and state size) is printed.
    """
    # danish evaluation set; label 2 is collapsed into label 1
    danish_data, emb_size_da = hmm_data_loader.get_hmm_data(filename=file_da)
    danish_data_X = [entry[1] for entry in danish_data]
    danish_data_y = [1 if entry[0] == 2 else entry[0] for entry in danish_data]

    # English training set: column 1 is the label, column 2 the features
    data_train, _ = hmm_data_loader.get_semeval_hmm_data(filename=file_en)
    y_train = [entry[1] for entry in data_train]
    X_train = [entry[2] for entry in data_train]

    print("%-20s%10s%10s%10s" % ('event', 'components', 'accuracy', 'f1'))

    if distribution_voter:
        clf = DistributionVoter().fit(X_train, y_train)
        predicts = clf.predict(danish_data_X)
        _, acc, f1 = model_stats.cm_acc_f1(danish_data_y, predicts)
        print("%-20s%-10s%10.2f%10.2f" % ('danish', '-', acc, f1))
        return

    best_acc, best_f1, best_s = 0.0, 0.0, None
    for s in range(1, 16):
        # fit an HMM with s states on English, score on danish
        clf = HMM(s).fit(X_train, y_train)
        da_predicts = clf.predict(danish_data_X)

        cm, acc_t_da, f1_t_da = model_stats.cm_acc_f1(
            danish_data_y, da_predicts)

        if f1_t_da > best_f1:
            best_acc, best_f1, best_s = acc_t_da, f1_t_da, s
    print("%-20s%-10s%10.2f%10.2f" % ('danish', best_s, best_acc, best_f1))
def Loo_event_test(file_en, min_length, print_distribution):
    """Leave-one-event-out evaluation over the SemEval events.

    For each event: train HMMs (1-15 states) on the remaining events and
    print the best accuracy/F1 obtained on the held-out event, per state
    size. Test sequences shorter than min_length are discarded.
    """
    data_train, _ = hmm_data_loader.get_semeval_hmm_data(filename=file_en)

    events = loadEvents(data_train,
                        print_dist=print_distribution,
                        min_len=min_length)
    event_list = list(events.items())

    print("%-20s%10s%10s%10s" % ('event', 'components', 'accuracy', 'f1'))
    for test_event, test_vec in event_list:
        # training pool = every event except the held-out one
        train = flatten([vec for e, vec in event_list if e != test_event])
        y_train = [sample[0] for sample in train]
        X_train = [sample[1] for sample in train]

        # held-out event, filtered on minimum sequence length
        kept = [sample for sample in test_vec if len(sample[1]) >= min_length]
        y_test = [sample[0] for sample in kept]
        X_test = [sample[1] for sample in kept]

        for s in range(1, 16):
            best_acc, best_f1 = 0.0, 0.0

            # try out different random configurations
            for c in range(1):
                clf = HMM(s).fit(X_train, y_train)
                predicts = clf.predict(X_test)

                assert len(y_test) == len(
                    predicts
                ), "The length of y_test does not match number of predictions"

                _, acc_t, f1_t = model_stats.cm_acc_f1(y_test, predicts)

                # save results from model with best f1 score
                if f1_t > best_f1:
                    best_acc, best_f1 = acc_t, f1_t
            print("%-20s%-10s%10.2f%10.2f" %
                  (test_event, s, best_acc, best_f1))
def CV_sup(clfs, name, X, y, X_sup, y_sup, k_folds=5):
    """Stratified shuffle-split CV where each fold is augmented with a
    supplementary set: every supplementary sample contributes both its
    original and its copy (X_sup holds (original, copy) pairs), with the
    corresponding label duplicated.

    NOTE(review): depends on a module-level `rand` for random_state —
    not defined in this block; verify it exists at call time.
    Prints mean/std of F1 and accuracy across folds.
    """
    model = clfs[name]
    splitter = StratifiedShuffleSplit(n_splits=k_folds,
                                      test_size=.2,
                                      random_state=rand)
    f1_scores = []
    acc_scores = []

    main_splits = splitter.split(np.zeros(len(X)), y)
    sup_splits = splitter.split(np.zeros(len(X_sup)), y_sup)
    for (tr_idx, te_idx), (sup_tr_idx, sup_te_idx) in zip(main_splits,
                                                          sup_splits):
        # training fold: main samples, then (original, copy) sup pairs
        X_train = [X[j] for j in tr_idx]
        for j in sup_tr_idx:
            sup_orig, sup_copy = X_sup[j]
            X_train.extend((sup_orig, sup_copy))
        # test fold built the same way
        X_test = [X[j] for j in te_idx]
        for j in sup_te_idx:
            sup_orig, sup_copy = X_sup[j]
            X_test.extend((sup_orig, sup_copy))
        # labels mirror the feature lists (sup labels appear twice)
        y_train = [y[j] for j in tr_idx]
        for j in sup_tr_idx:
            y_train.extend((y_sup[j], y_sup[j]))
        y_test = [y[j] for j in te_idx]
        for j in sup_te_idx:
            y_test.extend((y_sup[j], y_sup[j]))

        model.fit(X_train, y_train)
        y_true, y_pred = y_test, model.predict(X_test)
        _, acc, f1 = model_stats.cm_acc_f1(y_true, y_pred)
        f1_scores.append(f1)
        acc_scores.append(acc)

    f1_arr = np.asarray(f1_scores)
    acc_arr = np.asarray(acc_scores)
    s = "%-20s%s %0.4f (+/- %0.2f) %s %0.4f (+/- %0.2f)" \
        % (name, 'f1', f1_arr.mean(), f1_arr.std() * 2,
           'acc', acc_arr.mean(), acc_arr.std() * 2)
    print(s)
def main(argv):
    """CLI entry point for rumour veracity classification with MSHMMs.

    Depending on the flags: evaluates train/test file pairs, runs k-fold
    cross validation (optionally on a mix of both files), or sweeps the
    state-space size 1-15 with optional random restarts.
    """
    parser = argparse.ArgumentParser(description='Rumour veracity classification via multinomial hmm')
    parser.add_argument('-ftrain', '--file_train', default='../data/hmm/hmm_data_branch_time.csv', help='Train file. Tests on this if no test file is given.')
    parser.add_argument('-ftest', '--file_test', help='Test file')
    parser.add_argument('-sem_tr', '--sem_data_train', action='store_true', default=False, help='Read train in semeval data format')
    parser.add_argument('-sem_te', '--sem_data_test', action='store_true', default=False, help='Read test in semeval data format')
    parser.add_argument('-loo', '--loo', action='store_true', default=False, help='Test leave one out on 5 semeval events')
    parser.add_argument('-mix', '--mix', action='store_true', default=False, help='Test kfold on a mix of danish an pheme data')
    parser.add_argument('-kf', '--k_folds', help='Number of folds to do in cross validation', type=int)
    parser.add_argument('-rs', '--restarts', help='Number of times to restart with new random state', type=int)
    parser.add_argument('-rand', '--rand_state', default=42, help='Specific random state', type=int)
    parser.add_argument('-s', '--state_size', help='Specify the size of the space for the model', type=int)

    args = parser.parse_args(argv)

    if args.file_train:
        data = get_data(args.file_train, args.sem_data_train)
        y = [x[0] for x in data]
        X = [x[1] for x in data]

        if args.file_test:
            data_test = get_data(args.file_test, args.sem_data_test)
            y_test = [x[0] for x in data_test]
            X_test = [x[1] for x in data_test]

            if args.mix and args.k_folds:
                # pool train and test data and cross-validate over the mix
                X.extend(X_test)
                y.extend(y_test)
                if args.state_size:
                    # BUG FIX: was `i`, which is undefined in this branch
                    cross_val(MSHMM(2, args.state_size), X, y, args.k_folds, args.rand_state, args.state_size)
                    skf = StratifiedKFold(n_splits=args.k_folds, shuffle=True, random_state=args.rand_state)
                    predicts = cross_val_predict(MSHMM(2, args.state_size), X, y, cv=skf)
                    # BUG FIX: predicts covers the whole merged set, so it
                    # must be scored against y, not the shorter y_test
                    cm, acc, f1 = model_stats.cm_acc_f1(y, predicts)
                    print(cm)
                else:
                    for i in range(1, 16):
                        cross_val(MSHMM(2, i), X, y, args.k_folds, args.rand_state, i)
            else:
                print("%-10s%10s%10s" % ('state space', 'acc', 'f1'))
                if args.state_size:
                    clf = MSHMM(2, args.state_size).fit(X, y)
                    predicts = clf.predict(X_test)
                    cm, acc, f1 = model_stats.cm_acc_f1(y_test, predicts)
                    # BUG FIX: was `i`, which is undefined in this branch
                    print("%-10s%10.2f%10.2f" % (args.state_size, acc, f1))
                    print("\n")
                    print(cm)
                else:
                    # sweep over state-space sizes 1-15
                    for i in range(1, 16):
                        clf = MSHMM(2, i).fit(X, y)
                        predicts = clf.predict(X_test)
                        _, acc, f1 = model_stats.cm_acc_f1(y_test, predicts)
                        print("%-10s%10.2f%10.2f" % (i, acc, f1))

        elif args.loo:
            sem_data = hmm_data_loader.get_semeval_hmm_data(filename=args.file_train)
            events = loadEvents(sem_data)

        else:
            if not args.k_folds:
                print("%-10s%10s%10s" % ('state space', 'acc', 'f1'))

            if args.state_size:
                skf = StratifiedKFold(n_splits=args.k_folds, shuffle=True, random_state=args.rand_state)
                predicts = cross_val_predict(MSHMM(2, args.state_size), X, y, cv=skf)
                cm, acc, f1 = model_stats.cm_acc_f1(y, predicts)
                print("%-10s%10.2f%10.2f" % (args.state_size, acc, f1))
                print(cm)
            else:
                for i in range(1, 16):
                    if args.k_folds:
                        if args.restarts:
                            for r in random.sample(range(1, 1000), args.restarts):
                                # BUG FIX: the sampled restart state r was
                                # ignored; every restart reused rand_state
                                cross_val(MSHMM(2, i), X, y, args.k_folds, r, i)
                            print("\n")
                        else:
                            cross_val(MSHMM(2, i), X, y, args.k_folds, args.rand_state, i)
                    else:
                        best_acc = 0.0
                        best_f1 = 0.0
                        best_rand_state = None
                        best_i = None

                        if args.restarts:
                            for r in random.sample(range(1, 1000), args.restarts):
                                # BUG FIX: use the sampled state r, and
                                # record it when it produces the best F1
                                acc, f1 = test(X, y, r, i)
                                if f1 > best_f1:
                                    best_f1 = f1
                                    best_acc = acc
                                    best_rand_state = r
                                    best_i = i
                        else:
                            acc, f1 = test(X, y, args.rand_state, i)

                            best_acc = acc
                            best_f1 = f1
                            best_rand_state = args.rand_state
                            best_i = i

                        print("%-10s%-10s%10.2f%10.2f" % (best_i, best_rand_state, best_acc, best_f1))
# Esempio n. 7
# 0
def parameter_search_LOO_features():
    """Randomized hyper-parameter search with leave-one-feature-group-out.

    For every estimator in `settings_rand`, and for every feature group in
    `feature_names`, disables that group, runs RandomizedSearchCV on the
    training split and writes a per-feature report (.txt) plus a cumulative
    stats row (.csv) under `output_folder`.

    NOTE(review): relies on many module-level names (settings_rand,
    output_folder, args, rand_iter, folds, feature_names, config,
    data_loader, X_train/X_test/y_train/y_test, scorer, skf, rand) —
    verify they are defined before calling.
    """
    for name, estimator, tuned_parameters in settings_rand:
        filepath = os.path.join(output_folder, name)
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        print("# Tuning hyper-parameters on F1 macro for %s" % name)
        stats_filename = '%s/parameter_stats_iter%d_k%d' % (filepath,
                                                            rand_iter, folds)
        if args.reduce_features:
            stats_filename += '_vt'
        # BUG FIX: the existence check used the extension-less path while
        # the file is written as '<stats_filename>.csv', so the header row
        # was re-written (and the file truncated by 'w+') on every run.
        if not os.path.exists('%s.csv' % stats_filename):
            with open('%s.csv' % stats_filename, 'w+',
                      newline='') as statsfile:
                csv_writer = csv.writer(statsfile)
                csv_writer.writerow([
                    'estimator', 'f1_macro', 'acc', 'LOO feature',
                    'parameters', 'features'
                ])
        for feature_name in feature_names:
            results_filename = '%s/params_%s_iter%d_k%d' % (
                filepath, feature_name, rand_iter, folds)
            # BUG FIX: results are written to '<results_filename>.txt', so
            # the skip check must include the extension.
            # NOTE(review): with --reduce_features a '_vt%d' suffix is added
            # later, so this check misses those files — confirm intent.
            if not config[feature_name] or os.path.exists('%s.txt' % results_filename):
                print('Skipping %s since %s exists' %
                      (feature_name, results_filename))
                continue
            if feature_name == 'all':
                print('Running with all features enabled')
            else:
                print('Leaving %s features out' % feature_name)
            # temporarily disable this feature group (restored at the end)
            config[feature_name] = False
            X_train_ = data_loader.select_features(X_train, feature_mapping,
                                                   config)
            X_test_ = data_loader.select_features(X_test, feature_mapping,
                                                  config)
            if args.reduce_features:
                old_len = len(X_train_[0])
                X_train_, X_test_ = data_loader.union_reduce_then_split(
                    X_train_, X_test_)
                new_len = len(X_train_[0])
                print('Reduced features from %d to %d' % (old_len, new_len))
                results_filename += '_vt%d' % old_len
            start = time.time()
            with open('%s.txt' % results_filename, 'a+') as outfile, \
                    open('%s.csv' % stats_filename, 'a', newline='') as statsfile:
                csv_writer = csv.writer(statsfile)
                clf = RandomizedSearchCV(estimator,
                                         tuned_parameters,
                                         scoring=scorer,
                                         n_jobs=-1,
                                         error_score=0,
                                         n_iter=rand_iter,
                                         verbose=1,
                                         cv=skf,
                                         iid=False,
                                         return_train_score=False,
                                         pre_dispatch='2*n_jobs',
                                         random_state=rand)
                clf.fit(X_train_, y_train)

                s = "Best parameters set found on development set for F1 macro:"
                print(s)
                outfile.write(s + '\n')
                print()
                s = "%0.3f for %r" % (clf.best_score_, clf.best_params_)
                print(s)
                outfile.write(s + '\n')
                print()
                s = "Grid scores on development set:"
                print(s)
                outfile.write(s + '\n')
                print()
                results = clf.cv_results_
                means = results['mean_test_score']
                stds = results['std_test_score']
                for mean, std, params in zip(means, stds,
                                             clf.cv_results_['params']):
                    s = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
                    print(s)
                    outfile.write(s + '\n')
                print()

                outfile.write(
                    'Classification report for results on evaluation set:' +
                    '\n')
                print("Classification report for results on evaluation set:")
                y_true, y_pred = y_test, clf.predict(X_test_)
                outfile.write(classification_report(y_true, y_pred))
                outfile.write('\n')
                cm, acc, f1 = cm_acc_f1(y_true, y_pred)
                outfile.write(np.array2string(cm))
                outfile.write('\n')
                print('acc: %.4f' % acc)
                outfile.write('acc: %.4f\n' % acc)
                print('f1 macro: %.4f' % f1)
                outfile.write('f1 macro: %.4f\n\n' % f1)
                print()
                csv_writer.writerow([
                    name,
                    '%.4f' % f1,
                    '%.4f' % acc, feature_name, clf.best_params_, config
                ])
            # re-enable the feature group for the next iteration
            if not feature_name == 'all':
                config[feature_name] = True
            end = time.time()
            print('Done with %s features' % feature_name)
            print('Took %.1f seconds' % (end - start))
        print('Done with', name)


# if args.v:
#     parameter_search_LOO_features()
# else:
#     X_train_ = data_loader.select_features(X_train, feature_mapping, config)
#     X_test_ = data_loader.select_features(X_test, feature_mapping, config)
#     old_len = len(X_train_[0])
#     X_train_, X_test_ = data_loader.union_reduce_then_split(X_train_, X_test_)
#     new_len = len(X_train_[0])
#     print('Reduced features from %d to %d' % (old_len, new_len))
#     parameter_search_rand_VT(X_train_, X_test_, y_train, y_test)
# Esempio n. 8
# 0
        results = clf.cv_results_
        means = results['mean_test_score']
        stds = results['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            s = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
            print(s)
            outfile.write(s + '\n')
        print()

        outfile.write('Classification report for results on evaluation set:' +
                      '\n')
        print("Classification report for results on evaluation set:")
        y_true, y_pred = y_test, clf.predict(X_test_)
        outfile.write(classification_report(y_true, y_pred))
        outfile.write('\n')
        cm, acc, f1 = cm_acc_f1(y_true, y_pred)
        outfile.write(np.array2string(cm))
        outfile.write('\n')
        print('acc: %.4f' % acc)
        outfile.write('acc: %.4f\n' % acc)
        print('f1 macro: %.4f' % f1)
        outfile.write('f1 macro: %.4f\n\n' % f1)
        print()
        csv_writer.writerow([
            name,
            '%.4f' % f1,
            '%.4f' % acc, folds, rand_iter, clf.best_params_, config
        ])
    # end = time.time()
    # print('Took %.1f seconds' % (end - start))
    print('Done with', name)