Ejemplo n.º 1
0
    '''

    # NOTE(review): fragment — the enclosing def and the opening of this
    # docstring are above this chunk; review in full context.
    # Load pre-oversampled, normalized Homesite data (ratio-2 oversampling),
    # already split into train / validation / test sets.
    oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(oversampled_path)
    del homesite.test_x  # Deleted to save memory.
    print homesite.train_x.shape

    # Creating classifier.  Alternatives tried during experimentation are
    # kept commented out below.
    # clf = DecisionTreeClassifier()
    clf = RandomForestClassifier(max_features=100)
    # clf = AdaBoostClassifier(n_estimators = 10)
    # clf = svm.SVC(gamma = 0.00005)
    # clf = RandomForestClassifier()
    # clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999)
    # clf = svm.LinearSVC()

    # Train classifier.
    print "Training classifier."
    clf.fit(homesite.train_x, homesite.train_y)

    # Test classifier.  Column 1 of predict_proba is the probability of the
    # positive class.
    print 'Testing classifier.'
    predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1]

    # Show final results: confusion matrix at a 0.5 threshold (np.round),
    # derived accuracy/precision/recall, and AUC on the raw probabilities.
    results = confusion_matrix(homesite.validation_y,
                               np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    auc = compute_auc(homesite.validation_y, predicted_labels)
Ejemplo n.º 2
0
    # NOTE(review): fragment — `c`, `mean_fpr`/`mean_tpr`, `mean_acc`,
    # `mean_recall`, `mean_precision`, the *_history lists and
    # `confusion_matrix_history` are all defined above this chunk; this looks
    # like the interior of a loop over the n_estimators parameter `c` —
    # confirm in full context.
    # 5-fold stratified cross-validation over the training labels.
    cvs = StratifiedKFold(homesite.train_y, n_folds = 5)
    clf = RandomForestClassifier(n_estimators = c, max_features = 100, n_jobs = 4)

    # Train classifier.
    print "\nTraining classifier param %d" % c

    for i, (train, test) in enumerate(cvs):
        # Oversample only the training fold (ratio 2.5) to counter class
        # imbalance; the held-out fold keeps its original distribution.
        sm = OverSampler(verbose = False, ratio = 2.5)
        train_oversampled_x, train_oversampled_train_y = sm.fit_transform(homesite.train_x[train], homesite.train_y[train])
        probas_ = clf.fit(train_oversampled_x, train_oversampled_train_y).predict_proba(homesite.train_x[test])

        # Accumulate the fold's ROC curve interpolated onto the shared
        # mean_fpr grid (for an averaged ROC plot elsewhere).
        fpr, tpr, thresholds = roc_curve(homesite.train_y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = compute_auc(homesite.train_y[test], probas_[:, 1])
        # Confusion matrix at a 0.5 threshold; folds stacked along axis 2.
        fold_cm = confusion_matrix(homesite.train_y[test], np.round(probas_)[:, 1])
        confusion_matrix_history = np.dstack((confusion_matrix_history, fold_cm))

        accuracy, precision, recall = compute_performance_metrics(fold_cm)
        mean_acc += accuracy
        mean_recall += recall
        mean_precision += precision

        accuracy_history.append(accuracy)
        precision_history.append(precision)
        recall_history.append(recall)
        auc_history.append(roc_auc)

        # Persist the per-fold metric histories after every fold so partial
        # results survive an interrupted run.
        save_np_array("../../results/random_forests/rf_accuracy_" + str(c) + ".bin", np.array(accuracy_history))
        save_np_array("../../results/random_forests/rf_precision_" + str(c) + ".bin", np.array(precision_history))
        save_np_array("../../results/random_forests/rf_recall_" + str(c) + ".bin", np.array(recall_history))

if __name__ == '__main__':
    '''
    Train and evaluate a neural network on the oversampled Homesite data.
    '''

    # Load the pre-oversampled, normalized dataset; labels are one-hot
    # encoded to match the network's two-unit output layer.
    # oversampled_path = "../../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
    oversampled_path = "../../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
    homesite_data = Data()
    homesite_data.load_sliptted_data(oversampled_path, one_hot = True)

    # Build the network: 644 input features, 50 hidden units, 2 softmax
    # outputs.  (Backslash continuation replaced by implicit continuation
    # inside the parentheses.)
    clf = NeuralNetwork(input_units = 644, hidden_units = 50, output_units = 2,
                        lr = 0.00005, lamb = 0.)
    # NOTE(review): fit() is commented out, so predict_proba below runs on an
    # untrained network unless the weight-restoring constructor variant is
    # re-enabled — confirm this is intended.
#     clf.fit(homesite_data, batch_size = 128,
#             max_iterations = 100, save_interval = 10,
#             path = "../homesite_data/ann_weights.bin")

    # Alternative: restore a previously trained network from disk.
#     clf = NeuralNetwork(path = "../../homesite_data/ann_weights.bin", lr = 0.05, lamb = 0.000005)

    # Test classifier.  Column 1 of predict_proba is the probability of the
    # positive class.  print() with a single argument is valid in both
    # Python 2 and Python 3.
    print('Testing classifier.')
    predicted_labels = clf.predict_proba(homesite_data.validation_x)[:, 1]

    # Show final results.  Labels are one-hot, so argmax over axis 1 recovers
    # the integer class for the confusion matrix and AUC.
    results = confusion_matrix(np.argmax(homesite_data.validation_y, axis = 1), np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    auc = compute_auc(np.argmax(homesite_data.validation_y, axis = 1), predicted_labels)