dep_posts = new_arr

    y = np.concatenate((np.ones(len(reg_posts)), np.zeros(len(dep_posts))))
    x = np.concatenate((reg_posts, dep_posts))

    print('b. initializing')
    rs = ShuffleSplit(n_splits=10, test_size=.10, random_state=0)
    rs.get_n_splits(x)
    split = 0
    for train_index, test_index in rs.split(x):
        print "split", split

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        new_doc = D2V('w2v_' + str(split), 300)
        train_arrays, test_arrays, train_labels, test_labels = new_doc.build_d2v_vecs(
            x_train, x_test, y_train, y_test)

        print('Logreg')
        logreg.run_logreg(train_arrays, test_arrays, train_labels, test_labels)

        print('SVM')
        svm.train_svm(train_arrays, test_arrays, train_labels, test_labels)

        print('Simple neural network')
        NNet.simpleNN(train_arrays, test_arrays, train_labels, test_labels,
                      0.01, 100, 100)

        split += 1
Beispiel #2
0
    split = 0

    for train_index, test_index in rs.split(x):
        print "split", split
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        feat_model = DeCh("reg")
        feat_model.load_liwc('data/mixed_liwc2007.csv',
                             'data/anxiety_filtered2007.csv')

        print "calculating train"
        train_vecs = feat_model.build_feat(x_train, train_index)

        print "calculating test"

        test_vecs = feat_model.build_feat(x_test, test_index)

        np.save('feat/test_de' + str(split), test_vecs)
        np.save('feat/train_de' + str(split), train_vecs)

        print('Simple NN')
        NNet.simpleNN(train_vecs, test_vecs, y_train, y_test, 0.01, 100, 100)

        print('Logreg')
        logreg.run_logreg(train_vecs, test_vecs, y_train, y_test)

        print('SVM')
        svm.train_svm(train_vecs, test_vecs, y_train, y_test)

        split += 1
    return x_train, x_test, y_train, y_test


if __name__ == "__main__":
    # Script entry point: evaluate logistic regression and an SVM on LIWC
    # features over 10 random 90/10 train/test splits.
    #
    # Both CSVs are read with the same helper; `labels` from the second call
    # overwrites the first, so the two files are presumably expected to share
    # the same header -- TODO confirm.
    labels, anx_liwc = read_liwc_csv('data//anxious_liwc.csv')
    labels, mixed_liwc = read_liwc_csv('data//mixed_liwc.csv')

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(len(anx_liwc))

    # Targets follow the row order of `x`: mixed rows are labelled 1,
    # anxious rows are labelled 0.
    y = np.concatenate((np.ones(len(mixed_liwc)), np.zeros(len(anx_liwc))))
    x = np.concatenate((mixed_liwc, anx_liwc))

    # Fixed random_state keeps the ten splits reproducible between runs.
    rs = ShuffleSplit(n_splits=10, test_size=.10, random_state=0)
    for split, (train_index, test_index) in enumerate(rs.split(x)):
        print(split)
        x_train, x_test, y_train, y_test = build_train_test(
            x, y, train_index, test_index)

        # Append the target as one extra trailing column for the ARFF export.
        train_w_labels = np.concatenate(
            (x_train, y_train.reshape(len(x_train), 1)), axis=1)

        # NOTE(review): the same 'result.arff' path is used on every split, so
        # presumably only the last split's export survives -- confirm intent.
        # NOTE(review): `train_w_labels` has one more column than `x_train`;
        # verify that `labels` carries a name for that target column.
        arff.dump('result.arff', train_w_labels, relation='liwc', names=labels)

        print('log reg ')
        run_logreg(x_train, x_test, y_train, y_test)

        print('svm')
        train_svm(x_train, x_test, y_train, y_test)
Beispiel #4
0
    test_vecs_d = np.load('feat/test_d2v' + str(split) + '.npy')
    train_vecs_d = np.load('feat/train_d2v' + str(split) + '.npy')

    test_vecs_u = np.load('feat/test_unibigram' + str(split) + '.npy')
    train_vecs_u = np.load('feat/train_unibigram' + str(split) + '.npy')

    test_vecs_l = np.load('feat/test_lda' + str(split) + '.npy')
    train_vecs_l = np.load('feat/train_lda' + str(split) + '.npy')

    test_vecs = np.concatenate((test_vecs_w, test_vecs_l), axis=1)

    train_vecs = np.concatenate((train_vecs_w, train_vecs_l), axis=1)

    print('Logreg')
    acc, per, rec = logreg.run_logreg(train_vecs, test_vecs, y_train, y_test)

    results[split][0] = acc
    results[split][1] = per
    results[split][2] = rec

    print('SVM')
    acc, per, rec = svm.train_svm(train_vecs_l, test_vecs_l, y_train, y_test)
    results[split][3] = acc
    results[split][4] = per
    results[split][5] = rec

    print('Simple NN')
    acc, per, rec = NNet.simpleNN(train_vecs, test_vecs, y_train, y_test, 0.01, 10, 100)
    results[split][6] = acc
    results[split][7] = per
def execute(topic1, topic2, test, dump_files):
    """Run the full pipeline: baseline, individual models, then the ensemble.

    Steps, in order: optionally create the cleaned dataset files, dump the
    TF-IDF features, run the Bayes baseline, run the CNN, Fasttext, SVM and
    Logistic Regression models, stack their predictions column-wise as the
    input features of an ``xgb`` model, and print a classification report
    for that model on the test set.

    Args:
        topic1: First topic name (string); joined with ``topic2`` as
            ``"<topic1>-<topic2>"`` to form the category id.
        topic2: Second topic name (string).
        test: Passed through unchanged to ``create_cleaned_files``.
        dump_files: Download/cleaning switch. Either the boolean ``True`` or
            the string ``"True"`` triggers ``create_cleaned_files``; any
            other value skips that step.

    Returns:
        None. All results are reported through ``print`` / ``print_bold``.
    """
    double_rule = "========================================================="
    single_rule = (
        "--------------------------------------------------------------------------"
    )

    # Accept a real boolean as well as the string flag; previously a caller
    # passing True (not "True") silently skipped the download step.
    if dump_files is True or dump_files == "True":
        print_bold("\n" + "Downloading the datasets ..." + "\n")
        create_cleaned_files(topic1, topic2, test)

    print_bold("Dumps TFIDF features ..." + "\n")

    # category is used to specify the unique Id of the dumped model
    category = topic1 + "-" + topic2
    dump_tfidf(category)

    print(double_rule)
    print_bold("Start Running bayes model to establish a baseline")
    print(double_rule)

    print_bold("\n" + "Run Bayes model ..." + "\n")

    # The baseline's predictions are not fed into the ensemble below, so the
    # return value is intentionally not kept.
    run_bayes(category)

    print(double_rule)
    print_bold("Improvement of the baseline")
    print(double_rule)

    print_bold("Run Cnn model ..." + "\n")

    pred_train_cnn, pred_test_cnn = run_cnn()

    print(single_rule)
    print_bold("Run Fasttext model ..." + "\n")
    pred_train_fasttext, pred_test_fasttext = run_fasttext()

    print(single_rule)

    print_bold("Run SVM model ..." + "\n")

    # run_svm also supplies the training targets used to fit the ensemble.
    pred_train_svm, pred_test_svm, y_train = run_svm(category)

    print(single_rule)

    print_bold("Run Logistic Regression model ..." + "\n")

    # run_logreg also supplies the test targets used for the final report.
    pred_train_logreg, pred_test_logreg, y_test = run_logreg(category)

    print(single_rule)

    print_bold("Starting Ensemble Method")

    # using train+val for training the ensemble (training on more dataset == stronger results)
    # Column order must be identical for the train and test matrices.
    train = np.column_stack((pred_train_svm, pred_train_logreg, pred_train_cnn,
                             pred_train_fasttext))
    test = np.column_stack(
        (pred_test_svm, pred_test_logreg, pred_test_cnn, pred_test_fasttext))
    model = xgb().fit(train, y_train)

    print(single_rule)
    print_bold("Final results on the test set : ")
    print(classification_report(y_test, model.predict(test)))