Example #1
import numpy as np

def train_dev_test(w,
                   cv_data,
                   train_data,
                   train_labels,
                   dev_data,
                   test_data,
                   test_labels,
                   n_features,
                   name,
                   params,
                   dynamic=False,
                   average=False,
                   aggressive=False,
                   preprocessor=lambda x: x):
    cv_acc, max_param = cross_validate(w, params, cv_data, dynamic, average,
                                       aggressive)

    max_w, train_acc = train_dev(w, max_param, train_data, train_labels,
                                 dev_data[:, :-1], dev_data[:, -1], dynamic,
                                 average, aggressive)

    test_acc = test(max_w, test_data, test_labels)

    def predictor(row):
        # Map the sign of the margin to the {0, 1} labels used in the output.
        label = np.sign(np.dot(row, max_w))
        if label == -1:
            label = 0
        return label

    write_output(name, max_param, cv_acc, train_acc, test_acc)
    write_predictions(name[:3],
                      predictor,
                      n_features=n_features,
                      neg_labels=True,
                      bias=True,
                      preprocessor=preprocessor)
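
# A minimal usage sketch for train_dev_test. load_data, the file paths, the
# weight initialization, and the hyperparameter grid below are assumptions
# modeled on the other examples in this file, not part of the original code.
n_features = 16
train_data, train_labels = load_data(
    '../data/data-splits/data.train', n_features=n_features)
test_data, test_labels = load_data(
    '../data/data-splits/data.test', n_features=n_features)

# Folds keep the label as the last column, matching how train_dev_test slices
# dev_data into features (dev_data[:, :-1]) and labels (dev_data[:, -1]).
cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

w = np.zeros(train_data.shape[1])
train_dev_test(w, cv_data, train_data, train_labels, cv_data[0],
               test_data, test_labels, n_features,
               'Simple perceptron', params=[1, 0.1, 0.01])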
Example #2
    cv_acc = 0
    if max_param is None:
        cv_acc, max_param = cross_validate(cv_data, weights, update_weights,
                                           params, update_params)

    max_weights = train(train_data,
                        train_labels,
                        weights,
                        update_weights,
                        max_param,
                        update_params,
                        epochs=10)
    train_acc = classify(train_data, train_labels, max_weights)

    test_acc = classify(test_data, test_labels, max_weights)

    def predictor(row):
        label = np.sign(np.dot(row, max_weights))
        if label == -1:
            label = 0
        return label

    write_output('Logistic regression', max_param, cv_acc, train_acc, test_acc)
    write_predictions('logreg',
                      predictor,
                      n_features=n_features,
                      neg_labels=True,
                      bias=True,
                      preprocessor=preprocessor)
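
# update_weights is passed in from elsewhere; a minimal sketch of the
# logistic-regression SGD step it could implement, assuming {-1, +1} labels,
# a learning rate lr, and an L2 term controlled by sigma_sq (the
# hyperparameter chosen by cross-validation). This is an illustration, not
# the repository's actual update rule.
def logreg_update(weights, row, label, lr, sigma_sq):
    # Gradient of log(1 + exp(-y * w.x)) + ||w||^2 / sigma_sq on one example.
    margin = label * np.dot(weights, row)
    grad = -label * row / (1.0 + np.exp(margin)) + 2.0 * weights / sigma_sq
    return weights - lr * grad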
Example #3
    cv_acc = 0
    if max_param is None:
        cv_acc, max_param = cross_validate(cv_data, weights, update_weights,
                                           params, update_params)

    max_weights = train(train_data,
                        train_labels,
                        weights,
                        update_weights,
                        max_param,
                        update_params,
                        epochs=100)
    train_acc = classify(train_data, train_labels, max_weights)

    test_acc = classify(test_data, test_labels, max_weights)

    def predictor(row):
        label = np.sign(np.dot(row, max_weights))
        if label == -1:
            label = 0
        return label

    write_output('SVM', max_param, cv_acc, train_acc, test_acc)
    write_predictions('svm',
                      predictor,
                      n_features=n_features,
                      neg_labels=True,
                      bias=True,
                      preprocessor=preprocessor)
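
# Likewise, a sketch of the hinge-loss sub-gradient step an SVM
# update_weights could implement, assuming {-1, +1} labels, a learning rate
# lr, and a regularization/tradeoff constant C; hypothetical, for
# illustration only.
def svm_update(weights, row, label, lr, C):
    # Sub-gradient of 0.5 * ||w||^2 + C * max(0, 1 - y * w.x) on one example.
    if label * np.dot(weights, row) <= 1:
        return (1 - lr) * weights + lr * C * label * row
    return (1 - lr) * weights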
Example #4
    test_data, test_labels = load_data(
        '../data/data-splits/data.test', n_features=n_features, preprocessor=preprocessor)

    cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

    # Sweep candidate depths and keep the one with the best mean CV accuracy.
    max_acc = 0
    opt_depth = 0
    for i in range(2, n_features + 2):
        acc = []

        for j in range(len(cv_data)):
            # Hold out fold j for validation; train on the remaining folds.
            cv_test = cv_data[j]
            cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])

            tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)

            cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
            acc.append(cv_acc)

        avg_acc = np.mean(acc)
        if avg_acc > max_acc:
            opt_depth = i
            max_acc = avg_acc

    tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
    train_acc = evaluate_tree(train_data, train_labels, tree)
    test_acc = evaluate_tree(test_data, test_labels, tree)

    write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
    write_predictions('id3', lambda row: classify(row, tree),
                      n_features=n_features, preprocessor=preprocessor)
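
    # id3 itself is defined elsewhere in the repository; as a reference, a
    # minimal sketch (hypothetical helper names, binary 0/1 features assumed)
    # of the entropy / information-gain computation an ID3 implementation
    # typically rests on:
    def entropy(labels):
        # H(S) = -sum_c p_c * log2(p_c) over the empirical label distribution.
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return -np.sum(p * np.log2(p))

    def information_gain(data, labels, feature):
        # Expected entropy reduction from splitting on one binary feature.
        gain = entropy(labels)
        for value in (0, 1):
            mask = data[:, feature] == value
            if mask.any():
                gain -= mask.mean() * entropy(labels[mask])
        return gain
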
    for i in range(num_trees):
        # Bootstrap sample of the rows; rf is a random set of features marked
        # as already used (i.e., excluded), leaving feature_size candidates.
        ri = np.random.choice(data_indices, size=data_size, replace=True)
        rf = np.random.choice(feature_indices,
                              size=n_features - feature_size,
                              replace=False)

        tree, depth = id3(train_data[ri],
                          train_labels[ri],
                          used_features=rf,
                          max_depth=max_depth,
                          split_size=split_size)
        trees.append(tree)

        # Progress report every 10% of the trees (integer division avoids a
        # float modulus; assumes num_trees >= 10).
        if (i + 1) % (num_trees // 10) == 0:
            print('{:.0f}%'.format((i + 1) / num_trees * 100))

    np.save('../data/trees', trees)
    train_acc = evaluate_forest(train_data, train_labels, trees,
                                '../data/new_train')
    test_acc = evaluate_forest(test_data, test_labels, trees,
                               '../data/new_test')

    def predictor(row):
        return mode(list(map(lambda tree: classify(row, tree), trees)))[0][0]

    write_output('Random forest', None, None, train_acc, test_acc)
    write_predictions('rf',
                      predictor,
                      n_features=n_features,
                      preprocessor=preprocessor)
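
    # evaluate_forest is defined elsewhere; a sketch of the majority-vote
    # accuracy computation it could perform, assuming classify(row, tree)
    # returns a single 0/1 label (hypothetical, for illustration):
    def forest_accuracy(data, labels, trees):
        votes = np.array([[classify(row, tree) for tree in trees]
                          for row in data])
        # Majority label per row; np.asarray(...).ravel() keeps this robust
        # across scipy versions that changed mode's output shape.
        preds = np.asarray(mode(votes, axis=1)[0]).ravel()
        return np.mean(preds == np.ravel(labels))
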
    # Candidate smoothing hyperparameters for cross-validation.
    lams = [2, 1.5, 1, 0.5]

    train_data, train_labels = load_data(
        '../data/data-splits/data.train', n_features=n_features, pos_labels=True)
    test_data, test_labels = load_data(
        '../data/data-splits/data.test', n_features=n_features, pos_labels=True)
    cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

    cv_acc, lam = cross_validate(cv_data, lams)

    counts = count_pos(train_data, train_labels)
    # like / like_inv hold the per-class likelihood terms for present and
    # absent features respectively; they are combined via dot products below.
    like, like_inv = naive_bayes(train_data.shape[0], counts, lam)
    train_acc = classify(train_data, train_labels, like, like_inv)
    test_acc = classify(test_data, test_labels, like, like_inv)

    write_output('Naive Bayes', lam, cv_acc, train_acc, test_acc)

    # Write predictions
    with open('../data/data-splits/data.eval.id') as file:
        eval_id = [int(line) for line in file]
    eval_data, _ = load_data('../data/data-splits/data.eval.anon', n_features=n_features, pos_labels=True)

    n_samples, n_features = eval_data.shape

    # Complement matrix: 1 where a feature is absent. The prepended bias
    # columns (ones for samples, zeros for the complement) keep the bias
    # term counted exactly once.
    samples_inv = np.ones((n_samples, n_features)) - eval_data
    samples_inv = np.hstack((np.zeros((n_samples, 1)), samples_inv))
    samples = np.hstack((np.ones((n_samples, 1)), eval_data))

    # Per-class score for each row; argmax picks the predicted class index.
    probs = np.dot(samples, like) + np.dot(samples_inv, like_inv)
    probs = probs.argmax(axis=1).reshape((-1, 1))
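
    # The snippet ends with the predicted class index per eval row; a
    # plausible completion (assumed, not from the original; the output path
    # is hypothetical) pairs each index with its example id and writes the
    # submission rows:
    with open('../data/nb_predictions.csv', 'w') as out:
        out.write('Id,Prediction\n')
        for ex_id, label in zip(eval_id, probs.ravel()):
            out.write('{},{}\n'.format(ex_id, int(label)))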