def train_dev_test(w, cv_data, train_data, train_labels, dev_data, test_data,
                   test_labels, n_features, name, params, dynamic=False,
                   average=False, aggressive=False, preprocessor=lambda x: x):
    # Pick the best hyperparameter by cross-validation, retrain with the dev
    # set for early stopping, then report test accuracy and write predictions.
    cv_acc, max_param = cross_validate(w, params, cv_data, dynamic, average,
                                       aggressive)
    max_w, train_acc = train_dev(w, max_param, train_data, train_labels,
                                 dev_data[:, :-1], dev_data[:, -1],
                                 dynamic, average, aggressive)
    test_acc = test(max_w, test_data, test_labels)

    def predictor(row):
        # Map the {-1, +1} sign prediction back to a {0, 1} label.
        label = np.sign(np.dot(row, max_w))
        if label == -1:
            label = 0
        return label

    write_output(name, max_param, cv_acc, train_acc, test_acc)
    write_predictions(name[:3], predictor, n_features=n_features,
                      neg_labels=True, bias=True, preprocessor=preprocessor)
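# The dynamic/average/aggressive flags suggest the standard perceptron
# variants; a minimal sketch of a single training epoch under that reading.
# perceptron_epoch and its signature are hypothetical, not the repo's API.
import numpy as np

def perceptron_epoch(w, data, labels, r=0.1, t=0, dynamic=False,
                     average=False, aggressive=False, margin=1.0):
    a = np.zeros(w.shape)  # running sum of weights for the averaged variant
    for x, y in zip(data, labels):
        rate = r / (1 + t) if dynamic else r  # decaying learning rate
        if aggressive:
            # Aggressive margin perceptron: update on any margin violation,
            # with a step size that closes the margin exactly.
            if y * np.dot(w, x) <= margin:
                eta = (margin - y * np.dot(w, x)) / (np.dot(x, x) + 1)
                w = w + eta * y * x
        elif y * np.dot(w, x) <= 0:  # plain mistake-driven update
            w = w + rate * y * x
        a += w
        t += 1
    return (a / len(labels)) if average else w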
cv_acc = 0
if max_param is None:
    cv_acc, max_param = cross_validate(cv_data, weights, update_weights,
                                       params, update_params)
max_weights = train(train_data, train_labels, weights, update_weights,
                    max_param, update_params, epochs=10)
train_acc = classify(train_data, train_labels, max_weights)
test_acc = classify(test_data, test_labels, max_weights)

def predictor(row):
    label = np.sign(np.dot(row, max_weights))
    if label == -1:
        label = 0
    return label

write_output('Logistic regression', max_param, cv_acc, train_acc, test_acc)
write_predictions('logreg', predictor, n_features=n_features, neg_labels=True,
                  bias=True, preprocessor=preprocessor)
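# update_weights isn't shown; a sketch of the stochastic gradient step a
# logistic-regression update could take, assuming L2 regularization with
# tradeoff sigma2 and labels in {-1, +1}. The name and signature here are
# illustrative, not the repo's.
def logistic_update(w, x, y, rate, sigma2):
    # Gradient of log(1 + exp(-y w.x)) + (1/sigma2) ||w||^2 at one example.
    grad = -y * x / (1 + np.exp(y * np.dot(w, x))) + (2 / sigma2) * w
    return w - rate * grad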
cv_acc = 0
if max_param is None:
    cv_acc, max_param = cross_validate(cv_data, weights, update_weights,
                                       params, update_params)
max_weights = train(train_data, train_labels, weights, update_weights,
                    max_param, update_params, epochs=100)
train_acc = classify(train_data, train_labels, max_weights)
test_acc = classify(test_data, test_labels, max_weights)

def predictor(row):
    label = np.sign(np.dot(row, max_weights))
    if label == -1:
        label = 0
    return label

write_output('SVM', max_param, cv_acc, train_acc, test_acc)
write_predictions('svm', predictor, n_features=n_features, neg_labels=True,
                  bias=True, preprocessor=preprocessor)
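# For the SVM, a sketch of the sub-gradient step update_weights plausibly
# takes: hinge loss with L2 regularization and tradeoff C. Illustrative only.
def svm_update(w, x, y, rate, C):
    if y * np.dot(w, x) <= 1:            # inside the margin: hinge is active
        return (1 - rate) * w + rate * C * y * x
    return (1 - rate) * w                # outside: only the regularizer pulls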
test_data, test_labels = load_data('../data/data-splits/data.test',
                                   n_features=n_features,
                                   preprocessor=preprocessor)
cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

# Sweep tree depths, scoring each with 5-fold cross-validation.
max_acc = 0
opt_depth = 0
for i in range(2, n_features + 2):
    acc = []
    for j in range(len(cv_data)):
        cv_test = cv_data[j]
        cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])
        tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)
        cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
        acc.append(cv_acc)
    avg_acc = np.mean(acc)
    if avg_acc > max_acc:
        opt_depth = i
        max_acc = avg_acc

# Retrain at the best depth and evaluate.
tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
train_acc = evaluate_tree(train_data, train_labels, tree)
test_acc = evaluate_tree(test_data, test_labels, tree)
write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
write_predictions('id3', lambda row: classify(row, tree),
                  n_features=n_features, preprocessor=preprocessor)
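# id3 itself isn't shown; a minimal sketch of the entropy and information-gain
# computation an ID3 split typically maximizes (function names are assumed,
# and numpy is imported as np, as above).
def entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(data, labels, feature):
    # Expected entropy reduction from splitting on one feature.
    gain = entropy(labels)
    for v in np.unique(data[:, feature]):
        mask = data[:, feature] == v
        gain -= mask.mean() * entropy(labels[mask])
    return gain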
for i in range(num_trees):
    # Bootstrap-sample rows and drop a random subset of features per tree.
    ri = np.random.choice(data_indices, size=data_size, replace=True)
    rf = np.random.choice(feature_indices, size=n_features - feature_size,
                          replace=False)
    tree, depth = id3(train_data[ri], train_labels[ri], used_features=rf,
                      max_depth=max_depth, split_size=split_size)
    trees.append(tree)
    if (i + 1) % (num_trees // 10) == 0:
        print('{:.0f}%'.format((i + 1) / num_trees * 100))

np.save('../data/trees', trees)
train_acc = evaluate_forest(train_data, train_labels, trees,
                            '../data/new_train')
test_acc = evaluate_forest(test_data, test_labels, trees, '../data/new_test')

def predictor(row):
    # Majority vote over the ensemble.
    return mode(list(map(lambda tree: classify(row, tree), trees)))[0][0]

write_output('Random forest', None, None, train_acc, test_acc)
write_predictions('rf', predictor, n_features=n_features,
                  preprocessor=preprocessor)
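# evaluate_forest isn't shown; a plausible sketch that takes a majority vote
# of the trees on each row and reports accuracy. The original's path argument
# (presumably for caching per-tree predictions) is ignored here.
from scipy.stats import mode

def evaluate_forest_sketch(data, labels, trees):
    votes = np.array([[classify(row, tree) for tree in trees]
                      for row in data])
    preds = mode(votes, axis=1)[0].ravel()
    return np.mean(preds == labels)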
lams = [2, 1.5, 1, 0.5]
train_data, train_labels = load_data('../data/data-splits/data.train',
                                     n_features=n_features, pos_labels=True)
test_data, test_labels = load_data('../data/data-splits/data.test',
                                   n_features=n_features, pos_labels=True)
cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)
cv_acc, lam = cross_validate(cv_data, lams)

counts = count_pos(train_data, train_labels)
like, like_inv = naive_bayes(train_data.shape[0], counts, lam)
train_acc = classify(train_data, train_labels, like, like_inv)
test_acc = classify(test_data, test_labels, like, like_inv)
write_output('Naive Bayes', lam, cv_acc, train_acc, test_acc)

# Write predictions
with open('../data/data-splits/data.eval.id') as file:
    eval_id = [int(line) for line in file]
eval_data, _ = load_data('../data/data-splits/data.eval.anon',
                         n_features=n_features, pos_labels=True)
n_samples, n_features = eval_data.shape

# Score each example against both the "feature present" (like) and "feature
# absent" (like_inv) likelihood tables, with a bias column prepended, and
# take the argmax class per row.
samples_inv = np.ones((n_samples, n_features)) - eval_data
samples_inv = np.hstack((np.zeros((n_samples, 1)), samples_inv))
samples = np.hstack((np.ones((n_samples, 1)), eval_data))
probs = np.dot(samples, like) + np.dot(samples_inv, like_inv)
probs = probs.argmax(axis=1).reshape((-1, 1))
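# The snippet ends before the predictions are written out; a sketch of the
# likely final step, pairing each eval id with its predicted label in an
# "Id,Prediction" CSV. The output file name is an assumption.
with open('../data/nb_predictions.csv', 'w') as out:
    out.write('Id,Prediction\n')
    for sample_id, label in zip(eval_id, probs.ravel()):
        out.write('{},{}\n'.format(sample_id, int(label)))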