import pickle
from itertools import combinations

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# plot_ROC, plot_ROC_style and print_classification_report are project
# helpers defined elsewhere.


def all_pairs_model(df, classes, CONFIG):
    '''
    Classification for a dataset with 3 classes. Using all
    possible pairs of classes, generates three models.

    Keyword arguments:
        df : DataFrame
          DataFrame with a label column specifying the class
          for that row.
        classes: tuple
          Tuple with all class names.
        CONFIG: Namespace
          A namespace with the configuration needed to build the
          model and specifications for the output.
    Returns:
        None
    '''
    models = {"Logistic": LogisticRegression,
              "RandomForest": RandomForestClassifier,
              "ExtraTrees": ExtraTreesClassifier,
              "GradBoost": GradientBoostingClassifier}

    if CONFIG.show_roc:
        fig, ax = plt.subplots()

    for c1, c2 in combinations(classes, 2):
        data = df[(df.label == c1) | (df.label == c2)]

        X = data.drop("label", axis=1).values
        label_map = {c1: 0, c2: 1}  # deterministic class -> integer mapping (avoids shadowing `classes`)
        y = np.array([label_map[i] for i in data.label])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

        if CONFIG.model_type == "Logistic":
            final_model = models[CONFIG.model_type]()
        else:
            final_model = models[CONFIG.model_type](**CONFIG.model_args)

        final_model.fit(X_train, y_train)
        print("Labels:", label_map)
        print("-done training model")

        if CONFIG.serialize_model:
            with open('Parkinson_%s.model' % "_vs_".join(label_map.keys()), 'wb') as fw:
                fw.write(pickle.dumps(final_model))
            print("--done serializing model")
        if CONFIG.show_roc:
            y_pred_probas = final_model.predict_proba(X_test)
            plot_ROC(y_pred_probas, y_test, label_map, ax)

        print_classification_report(final_model,X_train, X_test, y_train, y_test)

    if CONFIG.show_roc:
        plot_ROC_style(ax)
        plt.show()
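# Hypothetical usage sketch (not part of the original module). The Namespace
# fields mirror the attributes all_pairs_model reads; the class names are
# placeholders.
#
#   from argparse import Namespace
#   config = Namespace(model_type="RandomForest",
#                      model_args={"n_estimators": 200},  # unused for "Logistic"
#                      serialize_model=False,
#                      show_roc=True)
#   all_pairs_model(df, ("control", "early", "advanced"), config)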


def two_class_model(df, classes, CONFIG):
    '''
    Classification for a two-class dataset: trains a single model
    on the full DataFrame.

    Keyword arguments:
        df : DataFrame
          DataFrame with a label column specifying the class
          for that row.
        classes: dict
          Maps each class label (key) to its numeric label (value).
        CONFIG: Namespace
          A namespace with the configuration needed to build the
          model and specifications for the output.
    Returns:
        None
    '''
    models = {"Logistic": LogisticRegression,
              "RandomForest": RandomForestClassifier,
              "ExtraTrees": ExtraTreesClassifier,
              "GradBoost": GradientBoostingClassifier}

    X = df.drop("label", axis=1).values
    y = np.array([classes[i] for i in df.label])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    if CONFIG.model_type == "Logistic":
        final_model = models[CONFIG.model_type]()
    else:
        final_model = models[CONFIG.model_type](**CONFIG.model_args)

    final_model.fit(X_train, y_train)
    print("Labels:", classes)
    print("-done training model")

    if CONFIG.serialize_model:
        with open('Parkinson_two_class.model', 'wb') as fw:
            fw.write(pickle.dumps(final_model))
        print("--done serializing model")

    print_classification_report(final_model, X_train, X_test, y_train, y_test)

    if CONFIG.show_roc:
        fig, ax = plt.subplots()
        y_pred_probas = final_model.predict_proba(X_test)
        plot_ROC(y_pred_probas, y_test, classes, ax)
        plot_ROC_style(ax)
        plt.show()
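A minimal call sketch for two_class_model (for illustration only: the label
names and CONFIG fields below are placeholders mirroring what the function
reads; df and the plotting helpers come from the surrounding project):

    from argparse import Namespace

    config = Namespace(model_type="GradBoost",
                       model_args={"n_estimators": 100},
                       serialize_model=True,
                       show_roc=False)
    two_class_model(df, {"healthy": 0, "parkinsons": 1}, config)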
Example #3
# scikit-learn imports assumed by this snippet; load_obj, cross_val, metrics,
# plot_confusion_matrix and plot_ROC are project-specific helpers.
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


def Q2():

    X = load_obj('X_Q2')
    y = load_obj('label_Q2')

    rf = RandomForestClassifier(max_features=50, random_state=20)
    svc = svm.LinearSVC(C=10)
    lr = LogisticRegression(random_state=20)
    knn = KNeighborsClassifier(n_neighbors=3)
    mlp = MLPClassifier(solver='lbfgs',
                        activation="relu",
                        alpha=1e-4,
                        hidden_layer_sizes=(200, 400),
                        random_state=1)
    dt = DecisionTreeClassifier(random_state=20)
    clfs = [rf, svc, lr, knn, mlp, dt]
    clf_names = ['rf', 'svc', 'lr', 'knn', 'mlp', 'dt']

    for clf, clf_name in zip(clfs, clf_names):
        print(clf_name)
        # LinearSVC has no predict_proba, so skip probability-based scoring
        score = clf_name != 'svc'
        y_t_train, y_p_train, y_t_test, y_p_test, y_score_train, y_score_test \
            = cross_val(clf, X, y, shuffle=True, score=score, verbose=True)

        acc_test, rec_test, prec_test = metrics(y_t_test, y_p_test)
        acc_train, rec_train, prec_train = metrics(y_t_train, y_p_train)
        print(
            'Test accuracy %0.4f, recall score %0.4f and precision score %.4f'
            % (acc_test, rec_test, prec_test))
        print(
            'Train accuracy %0.4f, recall score %0.4f and precision score %.4f'
            % (acc_train, rec_train, prec_train))

        classnames = ['Washington', 'Massachusetts']

        plot_confusion_matrix(y_t_test, y_p_test, classnames)

        plot_ROC(y_t_test, y_score_test, no_score=(not score))
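The helper functions above are not shown in this excerpt. As an illustration
only, a minimal sketch of what the metrics helper presumably reduces to,
using the standard scikit-learn scorers:

    # Sketch of the (assumed) metrics helper -- not the original code.
    from sklearn.metrics import accuracy_score, recall_score, precision_score

    def metrics(y_true, y_pred):
        """Return (accuracy, recall, precision) for one set of predictions."""
        return (accuracy_score(y_true, y_pred),
                recall_score(y_true, y_pred),
                precision_score(y_true, y_pred))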
Example #4
# imports assumed by this snippet
from keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score

early_stopping = EarlyStopping(monitor='loss', patience=1)

print('Training')
for i in range(epochs):
    print('Epoch', i+1, '/', epochs)
    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              verbose=1,
              epochs=1,  # named nb_epoch in the legacy Keras 1 API
              shuffle=False, # turn off shuffle to ensure training data patterns remain sequential
              callbacks=[early_stopping])  # stop early if training loss not improving after 1 epoch
    model.reset_states()

# Evaluation 
print('Evaluating results in terms of classification accuracy')

scores = model.evaluate(X_test, y_test, batch_size=batch_size)  # returns [loss, accuracy] on test data, batch-by-batch
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

print('Evaluating results in terms of AUC')

y_probs = model.predict(X_test, batch_size=batch_size, verbose=1)  # sigmoid outputs are probabilities (predict_proba was removed from recent Keras)
print('AUC ' + str(roc_auc_score(y_test, y_probs)))

# Plot ROC curve from the same predicted probabilities
plot_ROC(y_test, y_probs)
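The per-epoch fit/reset_states loop above is the standard pattern for a
stateful recurrent model: hidden state carries across batches within an epoch
and is cleared between epochs. The model definition is not shown; a minimal
sketch of a stateful Keras model compatible with that loop, with placeholder
dimensions (timesteps, n_features):

    # Sketch only -- batch_size must match the one passed to fit()/evaluate().
    from keras.models import Sequential
    from keras.layers import LSTM, Dense

    model = Sequential()
    model.add(LSTM(32, batch_input_shape=(batch_size, timesteps, n_features),
                   stateful=True))  # state persists across batches
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])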
Example #5
    y_preds = tf.argmax(y_preds, axis=1).eval()
    y_true = tf.argmax(y_batch, axis=1).eval()
    # cm.batch_add(y_true, y_preds)
    test_loss.append(_loss)
    test_accuracy.append(_acc)
    # test_writer.add_summary(_summary, data.train.epochs_completed)
print('Test Loss {:6.3f}, Test acc {:6.3f}'.format(
    np.mean(test_loss), np.mean(test_accuracy)))
namestr += ":acc{:.3f}".format(np.mean(test_accuracy))
acc_list.append("{:.3f}".format(np.mean(test_accuracy)))
saver.save(sess, save_dir + 'header_{0}_{1}_units.ckpt'.format(num_header, hidden_units))
feed_dict_test = {x_pl: data.test.payloads, y_pl: data.test.labels}
# Create ROC curve for all classes
y_preds = sess.run(fetches=y, feed_dict=feed_dict_test)
y_true = data.test.labels
utils.plot_ROC(y_true, y_preds, num_classes, labels, micro=False, macro=False)
# Compute different metrics for the confusion matrix and more
y_preds = sess.run(fetches=y_, feed_dict=feed_dict_test)
y_true = tf.argmax(data.test.labels, axis=1).eval()
y_true = [labels[i] for i in y_true]
y_preds = [labels[i] for i in y_preds]
conf = metrics.confusion_matrix(y_true, y_preds, labels=labels)
report = metrics.classification_report(y_true, y_preds, labels=labels)
nostream_dict = ['http', 'https']  # labels treated as non-streaming traffic
y_stream_true = []
y_stream_preds = []
for i, v in enumerate(y_true):
    pred = y_preds[i]
    if v in nostream_dict:
        y_stream_true.append('non-streaming')
    else:
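The excerpt is truncated above. utils.plot_ROC here draws one curve per class
from one-hot labels and per-class scores; a minimal sketch of the per-class
computation such a helper typically performs (an illustration, not the
project's code):

    # Sketch of per-class ROC plotting with scikit-learn.
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc

    def plot_roc_per_class(y_true, y_score, num_classes, labels):
        """y_true: one-hot array (n, k); y_score: class scores (n, k)."""
        for i in range(num_classes):
            fpr, tpr, _ = roc_curve(y_true[:, i], y_score[:, i])
            plt.plot(fpr, tpr,
                     label='%s (AUC = %.3f)' % (labels[i], auc(fpr, tpr)))
        plt.plot([0, 1], [0, 1], 'k--')  # chance line
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.legend()
        plt.show()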
Example #6
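# This excerpt begins after the per-fold loop. The accumulation that produces
# mean_fpr and mean_tpr typically looks like the following sketch (an
# assumption, not the original code):
#
#   mean_fpr = np.linspace(0, 1, 100)
#   mean_tpr = np.zeros_like(mean_fpr)
#   for train_idx, test_idx in cv:
#       probas = clf.fit(X[train_idx], Y[train_idx]).predict_proba(X[test_idx])
#       fpr, tpr, _ = roc_curve(Y[test_idx], probas[:, 1])
#       mean_tpr += np.interp(mean_fpr, fpr, tpr)  # interpolate onto a common grid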
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

fig_ROC = plt.figure(figsize=(8,6))
plt.plot(mean_fpr, mean_tpr, 'r-',
         label='3-fold CV Mean ROC (area = %0.2f)' % mean_auc, lw=2)

#scores = cross_val_score(clf, X, Y, cv=3, scoring='f1')
#print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

# also fit full dataset to get ROC
clf.fit(X, Y)

y_pred = clf.predict_proba(df[predictors])[:,1]
utils.plot_ROC(Y, y_pred)

# make hard predictions on the full data set
yy = clf.predict(df[predictors])
df = df.assign(predicted_label=yy)
check = df[['predicted_label']]
full_res = pd.merge(check, features, left_index=True, right_index=True)

# compute feature importance from impurity
utils.plot_feature_importances(clf.feature_importances_)

# compute feature importance from accuracy
# the idea is to permute the values of each feature and see its impact on the accuracy
scores = defaultdict(list)

for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3,
Example #7
def main():
    # config, extract_data, get_feats, cross_valid, final_result, visualize
    # and plot_ROC are project-specific modules/helpers.
    # load and preprocess data
    train_labels, train_imgs = extract_data(config.train_path)
    test_labels, test_imgs = extract_data(config.test_path)
    f = open(config.output_file, 'w')

    # model selection
    if config.select_model:
        print("Selecting model...")
        f.write("Scores for model selection:\n")
        # original image
        plain = True  # set parameters for feature extraction
        pool = {'take': False, 'class': 'max'}
        hist = {'take': False, 'h': [4], 'w': [4]}
        grad = {'take': False, 'class': 'hist'}
        chain = {'take': False, 'class': 'hist'}
        select_feats1 = get_feats(train_imgs, plain, pool, hist, grad, chain)

        # feature vector
        pool = {'take': False, 'class': 'max'}
        hist = {'take': True, 'h': [4], 'w': [4]}
        grad = {'take': True, 'class': 'hist'}
        chain = {'take': True, 'class': 'hist'}
        select_feats2 = get_feats(train_imgs, plain, pool, hist, grad, chain)

        # get cross-validation scores
        f.write("Baseline (original image):" + '\n')
        print("logistic regression models:")
        scores = cross_valid(config.models_select1, select_feats1,
                             train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names1[i] + ':' + str(s) + '\n')
        print("multi-class logistic regression models:")
        scores = cross_valid(config.models_select2, select_feats1,
                             train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names2[i] + ':' + str(s) + '\n')
        print("k-nearest neighbour models:")
        scores = cross_valid(config.models_select3, select_feats1,
                             train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names3[i] + ':' + str(s) + '\n')

        f.write("\nFeature vector:" + '\n')
        scores = cross_valid(config.models_select1, select_feats2,
                             train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names1[i] + ':' + str(s) + '\n')
        print("multi-class logistic regression models:")
        scores = cross_valid(config.models_select2, select_feats2,
                             train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names2[i] + ':' + str(s) + '\n')
        print("k-nearest neighbour models:")
        scores = cross_valid(config.models_select3, select_feats2,
                             train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names3[i] + ':' + str(s) + '\n')

        f.write("\n######################\n\n")

    # feature selection
    if config.select_feature:
        print("Selecting features...")
        f.write("Scores for feature selection:\n")
        plain = True  # set parameters for feature extraction
        pool = {'take': False, 'class': 'max'}
        hist = {'take': False, 'h': [4], 'w': [4]}
        grad = {'take': False, 'class': 'hist'}
        chain = {'take': False, 'class': 'hist'}

        # histogram
        hist['take'] = True
        print("Extract histogram from training data set...")
        f.write("\nHistogram:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(s) + '\n')

        # gradient histogram
        hist['take'] = False
        grad['take'] = True
        print("Extract gradient histogram from training data set...")
        f.write("\nGradient histogram:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(s) + '\n')

        # gradient image
        grad['class'] = 'plain'
        print("Extract gradient image from training data set...")
        f.write("\nGradient image:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(s) + '\n')

        # chain code histogram
        grad['take'] = False
        chain['take'] = True
        print("Extract chain code histogram from training data set...")
        f.write("\nChain code histogram:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(s) + '\n')

        f.write("\n######################\n\n")

    if config.produce_results or config.draw_ROC:
        # feature extraction
        print("Extract feature from training data set...")
        train_feats = get_feats(train_imgs, config.plain, config.pool,
                                config.hist, config.grad, config.chain)
        print("Extract feature from testing data set...")
        test_feats = get_feats(test_imgs, config.plain, config.pool,
                               config.hist, config.grad, config.chain)
        print("All data processed. Number of features extracted is " +
              str(len(train_feats[0])))

        if config.produce_results:
            print("Producing prediction results...")
            f.write("Prediction results\n")
            f.write('original image: ' + str(config.plain))
            f.write('\n')
            f.write('pooled: ' + str(config.pool))
            f.write('\n')
            f.write('histogram: ' + str(config.hist))
            f.write('\n')
            f.write('gradient: ' + str(config.grad))
            f.write('\n')
            f.write('chain code: ' + str(config.chain))
            f.write('\n\n')
            all_preds = final_result(config.models, config.names, train_feats,
                                     train_labels, test_feats, test_labels, f)

            if config.visualize_error:
                preds = all_preds[2]  # predictions from the third model in config.models
                err_imgs = test_imgs[preds != test_labels]
                err_labels = test_labels[preds != test_labels]
                visualize(err_labels, err_imgs)

        if config.draw_ROC:
            print("Drawing ROC for LDA model...")
            preds_proba = plot_ROC(config.lda, train_feats, train_labels,
                                   test_feats, test_labels)
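The plot_ROC used in this last example takes the model plus both data splits
and returns predicted probabilities, unlike the variants above. A minimal
sketch of what such a helper could look like, inferred only from the call
site and assuming a binary task for simplicity:

    # Sketch of the (assumed) plot_ROC helper -- fit, plot, return probabilities.
    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve, auc

    def plot_ROC(model, train_feats, train_labels, test_feats, test_labels):
        model.fit(train_feats, train_labels)
        proba = model.predict_proba(test_feats)[:, 1]  # positive-class scores
        fpr, tpr, _ = roc_curve(test_labels, proba)
        plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
        plt.plot([0, 1], [0, 1], 'k--')  # chance line
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.legend()
        plt.show()
        return proba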