def fit(self, data, batch_size = 128,
            max_iterations = 100, save_interval = 10, path = "ann_weights.bin", return_cost = False):
        '''
        Train the neural network with mini-batch gradient descent and
        AUC-based early stopping on the validation set.

        Parameters
        ----------
        data : object exposing train_x, train_y and (optionally)
            validation_x, validation_y (one-hot labels).
        batch_size : samples per training batch. A trailing partial batch
            is intentionally dropped, matching the original batching scheme.
        max_iterations : maximum number of passes over the training data.
        save_interval : checkpoint weights / evaluate validation AUC every
            this many batches.
        path : file path passed to self.save().
        return_cost : when True, the last batch cost of each iteration is
            recorded and returned.

        Returns
        -------
        np.array of recorded costs (empty unless return_cost is True).
        Also returned when early stopping triggers, instead of the
        original's bare `return` (which yielded an inconsistent None).
        '''

        cost_history = []
        # Track the best weights seen so far so they can be restored on early stop.
        self.best_w_h = self.w_h.get_value()
        self.best_w_o = self.w_o.get_value()
        best_auc = 0
        cost = None  # Defined even if the data set is smaller than one batch.

        for iteration in range(max_iterations):
            i = 0

            # NOTE: zip() of the two ranges drops the final partial batch on purpose.
            for start, end in zip(range(0, len(data.train_x), batch_size), range(batch_size, len(data.train_x), batch_size)):

                cost = self._t_train(data.train_x[start:end], data.train_y[start:end])
                i = i + 1

                if i % save_interval == 0:
                    # Checkpoint once per interval (the original saved twice
                    # per interval, at the top and bottom of this branch).
                    self.save(path)

                    if data.validation_y is not None:
                        predicted_labels = self.predict_proba(data.validation_x)[:, 1]

                        auc = compute_auc(np.argmax(data.validation_y, axis = 1), predicted_labels)

                        if auc > best_auc:
                            best_auc = auc
                            self.best_w_h = self.w_h.get_value()
                            self.best_w_o = self.w_o.get_value()
                        elif abs(best_auc - auc) < 0.000005:
                            # Validation AUC has plateaued: restore the best
                            # weights and stop early.
                            self.w_h.set_value(self.best_w_h)
                            self.w_o.set_value(self.best_w_o)
                            return np.array(cost_history)

            # Report the last batch cost of this iteration exactly once
            # (the original printed it a second time when return_cost was set).
            print(cost)

            if return_cost:
                cost_history.append(cost)

        return np.array(cost_history)
# Example #2 (scraper artifact: separator between concatenated code samples)
    cvs = StratifiedKFold(homesite.train_y, n_folds = 5)
    clf = RandomForestClassifier(n_estimators = c, max_features = 100, n_jobs = 4)

    # Train classifier.
    print "\nTraining classifier param %d" % c

    for i, (train, test) in enumerate(cvs):
        sm = OverSampler(verbose = False, ratio = 2.5)
        train_oversampled_x, train_oversampled_train_y = sm.fit_transform(homesite.train_x[train], homesite.train_y[train])
        probas_ = clf.fit(train_oversampled_x, train_oversampled_train_y).predict_proba(homesite.train_x[test])

        fpr, tpr, thresholds = roc_curve(homesite.train_y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = compute_auc(homesite.train_y[test], probas_[:, 1])
        fold_cm = confusion_matrix(homesite.train_y[test], np.round(probas_)[:, 1])
        confusion_matrix_history = np.dstack((confusion_matrix_history, fold_cm))

        accuracy, precision, recall = compute_performance_metrics(fold_cm)
        mean_acc += accuracy
        mean_recall += recall
        mean_precision += precision

        accuracy_history.append(accuracy)
        precision_history.append(precision)
        recall_history.append(recall)
        auc_history.append(roc_auc)

        save_np_array("../../results/random_forests/rf_accuracy_" + str(c) + ".bin", np.array(accuracy_history))
        save_np_array("../../results/random_forests/rf_precision_" + str(c) + ".bin", np.array(precision_history))
    '''

    # Load the pre-oversampled, normalized Homesite data set and evaluate a
    # random forest on the held-out validation split.
    # NOTE(review): the enclosing function's def is outside this chunk; names
    # such as Data, RandomForestClassifier, confusion_matrix,
    # compute_performance_metrics and compute_auc are presumably imported
    # elsewhere in the file — verify against the full source.
    oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin"
    homesite = Data()
    homesite.load_sliptted_data(oversampled_path)
    del homesite.test_x  # Deleted to save memory.
    print homesite.train_x.shape

    # Creating classifier.
    # clf = DecisionTreeClassifier()
    clf = RandomForestClassifier(max_features=100)
    # clf = AdaBoostClassifier(n_estimators = 10)
    # clf = svm.SVC(gamma = 0.00005)
    # clf = RandomForestClassifier()
    # clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999)
    # clf = svm.LinearSVC()

    # Train classifier.
    print "Training classifier."
    clf.fit(homesite.train_x, homesite.train_y)

    # Test classifier.
    print 'Testing classifier.'
    # Probability of the positive class for each validation sample.
    predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1]

    # Show final results.
    # Rounding the probabilities gives hard 0/1 predictions at threshold 0.5.
    results = confusion_matrix(homesite.validation_y,
                               np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    # NOTE(review): metrics are computed here but never printed or returned
    # within this chunk — the consuming code may be outside this view.
    auc = compute_auc(homesite.validation_y, predicted_labels)

if __name__ == '__main__':
    '''
    Train neural network.
    '''

    # Load the oversampled, normalized data with one-hot encoded labels
    # (the NeuralNetwork's output layer has 2 units).
    # oversampled_path = "../../homesite_data/resources/oversampled_normalized_data_ratio_2.5.bin"
    oversampled_path = "../../homesite_data/resources/oversampled_normalized_data_ratio_2.bin"
    homesite_data = Data()
    homesite_data.load_sliptted_data(oversampled_path, one_hot = True)

    # Train neural network.
    clf = NeuralNetwork(input_units = 644, hidden_units = 50, output_units = 2, \
                        lr = 0.00005, lamb = 0.)
    # NOTE(review): the fit call below is commented out, so predict_proba
    # runs on a freshly initialized (untrained) network — confirm this is
    # intentional (e.g. weights loaded inside the constructor) before use.
#     clf.fit(homesite_data, batch_size = 128,
#             max_iterations = 100, save_interval = 10,
#             path = "../homesite_data/ann_weights.bin")

    # Test neural network.
#     clf = NeuralNetwork(path = "../../homesite_data/ann_weights.bin", lr = 0.05, lamb = 0.000005)

    # Test classifier.
    print 'Testing classifier.'
    # Probability of the positive class for each validation sample.
    predicted_labels = clf.predict_proba(homesite_data.validation_x)[:, 1]

    # Show final results.
    # argmax converts the one-hot validation labels back to class indices;
    # rounding the probabilities gives hard predictions at threshold 0.5.
    results = confusion_matrix(np.argmax(homesite_data.validation_y, axis = 1), np.round(predicted_labels))
    accuracy, precision, recall = compute_performance_metrics(results)
    # NOTE(review): computed metrics are not printed within this chunk —
    # the file may continue past this view.
    auc = compute_auc(np.argmax(homesite_data.validation_y, axis = 1), predicted_labels)