def all_pairs_model(df, classes, CONFIG):
    '''
    Classification for a dataset with 3 classes. Using all possible pairs,
    generates three models.

    Keyword arguments:
    df : DataFrame
        DataFrame with a label column, specifying the class for that row.
    classes : tuple
        Tuple with all class names.
    CONFIG : Namespace
        A namespace with the configurations needed to build the model and
        specifications for the output.

    Returns: None
    '''
    models = {"Logistic": LogisticRegression,
              "RandomForest": RandomForestClassifier,
              "ExtraTrees": ExtraTreesClassifier,
              "GradBoost": GradientBoostingClassifier}
    if CONFIG.show_roc:
        fig, ax = plt.subplots()
    for c1, c2 in combinations(classes, 2):
        data = df[(df.label == c1) | (df.label == c2)]
        X = data.drop("label", axis=1).values
        # map the two class labels of this pair to 0/1 (renamed to avoid shadowing `classes`)
        label_map = {l: i for i, l in enumerate(set(data.label))}
        y = np.array([label_map[i] for i in data.label])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42)
        if CONFIG.model_type == "Logistic":
            final_model = models[CONFIG.model_type]()
        else:
            final_model = models[CONFIG.model_type](**CONFIG.model_args)
        final_model.fit(X_train, y_train)
        print("Labels: ", label_map)
        print("-done training model")
        if CONFIG.serialize_model:
            with open('Parkinson_%s.model' % "_vs_".join(label_map.keys()), 'wb') as fw:
                fw.write(pickle.dumps(final_model))
            print("--done serializing model")
        if CONFIG.show_roc:
            y_pred_probas = final_model.predict_proba(X_test)
            plot_ROC(y_pred_probas, y_test, label_map, ax)
        print_classification_report(final_model, X_train, X_test, y_train, y_test)
    if CONFIG.show_roc:
        plot_ROC_style(ax)
        plt.show()
def two_class_model(df, classes, CONFIG):
    '''
    Classification with two classes: trains a single model on the full dataset.

    Keyword arguments:
    df : DataFrame
        DataFrame with a label column, specifying the class for that row.
    classes : dict
        Contains the class label as key and the numeric label as value.
    CONFIG : Namespace
        A namespace with the configurations needed to build the model and
        specifications for the output.

    Returns: None
    '''
    models = {"Logistic": LogisticRegression,
              "RandomForest": RandomForestClassifier,
              "ExtraTrees": ExtraTreesClassifier,
              "GradBoost": GradientBoostingClassifier}
    X = df.drop("label", axis=1).values
    y = np.array([classes[i] for i in df.label])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)
    if CONFIG.model_type == "Logistic":
        final_model = models[CONFIG.model_type]()
    else:
        final_model = models[CONFIG.model_type](**CONFIG.model_args)
    final_model.fit(X_train, y_train)
    print("Labels: ", classes)
    print("-done training model")
    if CONFIG.serialize_model:
        with open('Parkinson_two_class.model', 'wb') as fw:
            fw.write(pickle.dumps(final_model))
        print("--done serializing model")
    print_classification_report(final_model, X_train, X_test, y_train, y_test)
    if CONFIG.show_roc:
        fig, ax = plt.subplots()
        y_pred_probas = final_model.predict_proba(X_test)
        plot_ROC(y_pred_probas, y_test, classes, ax)
        plot_ROC_style(ax)
        plt.show()
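# Illustrative usage sketch (not part of the original code). The CONFIG attribute
# names (model_type, model_args, serialize_model, show_roc) are the ones the two
# functions above read; the CSV path and class labels are hypothetical placeholders.
if __name__ == "__main__":
    from argparse import Namespace
    import pandas as pd

    CONFIG = Namespace(
        model_type="RandomForest",          # key into the `models` dict above
        model_args={"n_estimators": 100},   # forwarded as **kwargs to the estimator
        serialize_model=False,
        show_roc=True,
    )
    df = pd.read_csv("parkinson_features.csv")  # hypothetical file with a "label" column

    # one model per pair of the three classes
    all_pairs_model(df, tuple(df.label.unique()), CONFIG)

    # a single model over a numeric label mapping
    two_class_model(df, {label: i for i, label in enumerate(df.label.unique())}, CONFIG)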
def Q2():
    X = load_obj('X_Q2')
    y = load_obj('label_Q2')
    rf = RandomForestClassifier(max_features=50, random_state=20)
    svc = svm.LinearSVC(C=10)
    lr = LogisticRegression(random_state=20)
    knn = KNeighborsClassifier(n_neighbors=3)
    mlp = MLPClassifier(solver='lbfgs', activation="relu", alpha=1e-4,
                        hidden_layer_sizes=(200, 400), random_state=1)
    dt = DecisionTreeClassifier(random_state=20)
    clfs = [rf, svc, lr, knn, mlp, dt]
    clf_names = ['rf', 'svc', 'lr', 'knn', 'mlp', 'dt']
    for clf, clf_name in zip(clfs, clf_names):
        print(clf_name)
        # LinearSVC has no predict_proba, so skip probability scores for it
        score = clf_name != 'svc'
        y_t_train, y_p_train, y_t_test, y_p_test, y_score_train, y_score_test = \
            cross_val(clf, X, y, shuffle=True, score=score, verbose=True)
        acc_test, rec_test, prec_test = metrics(y_t_test, y_p_test)
        acc_train, rec_train, prec_train = metrics(y_t_train, y_p_train)
        print('Test accuracy %0.4f, recall score %0.4f and precision score %.4f'
              % (acc_test, rec_test, prec_test))
        print('Train accuracy %0.4f, recall score %0.4f and precision score %.4f'
              % (acc_train, rec_train, prec_train))
        classnames = ['Washington', 'Massachusetts']
        plot_confusion_matrix(y_t_test, y_p_test, classnames)
        plot_ROC(y_t_test, y_score_test, no_score=(not score))
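# Hedged sketch (assumption, not the project's actual helper): `metrics` is called
# above as metrics(y_true, y_pred) -> (accuracy, recall, precision). A minimal
# scikit-learn version could look like this; binary 0/1 labels are assumed.
from sklearn.metrics import accuracy_score, recall_score, precision_score

def metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)      # default binary averaging
    prec = precision_score(y_true, y_pred)
    return acc, rec, prec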
early_stopping = EarlyStopping(monitor='loss', patience=1)

print('Training')
for i in range(epochs):
    print('Epoch', i + 1, '/', epochs)
    model.fit(X_train, y_train,
              batch_size=batch_size,
              verbose=1,
              nb_epoch=1,
              shuffle=False,               # turn off shuffle to ensure training data patterns remain sequential
              callbacks=[early_stopping])  # stop early if training loss is not improving after 1 epoch
    model.reset_states()

# Evaluation
print('Evaluating results in terms of classification accuracy')
loss = model.evaluate(X_test, y_test, batch_size=batch_size)  # compute loss on test data, batch by batch
print("%s: %.2f%%" % (model.metrics_names[1], loss[1] * 100))

print('Evaluating results in terms of AUC')
y_probs = model.predict_proba(X_test, batch_size=batch_size, verbose=1)
print('AUC ' + str(roc_auc_score(y_test, y_probs)))

y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)  # generate output predictions for test samples, batch by batch

# Plot ROC curve
plot_ROC(y_test, y_pred)
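# Hedged sketch of a minimal plot_ROC(y_true, y_score) for the binary case used
# directly above. The project's own plot_ROC (imported elsewhere) may differ;
# other scripts in this collection call multi-class variants with more arguments.
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def plot_ROC(y_true, y_score):
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--', label='chance')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()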
y_preds = tf.argmax(y_preds, axis=1).eval()
y_true = tf.argmax(y_batch, axis=1).eval()
# cm.batch_add(y_true, y_preds)
test_loss.append(_loss)
test_accuracy.append(_acc)
# test_writer.add_summary(_summary, data.train.epochs_completed)

print('Test Loss {:6.3f}, Test acc {:6.3f}'.format(
    np.mean(test_loss), np.mean(test_accuracy)))
namestr += ":acc{:.3f}".format(np.mean(test_accuracy))
acc_list.append("{:.3f}".format(np.mean(test_accuracy)))
saver.save(sess, save_dir + 'header_{0}_{1}_units.ckpt'.format(num_header, hidden_units))

feed_dict_test = {x_pl: data.test.payloads, y_pl: data.test.labels}

# Create ROC curve for all classes
y_preds = sess.run(fetches=y, feed_dict=feed_dict_test)
y_true = data.test.labels
utils.plot_ROC(y_true, y_preds, num_classes, labels, micro=False, macro=False)

# Compute different metrics for the confusion matrix and more
y_preds = sess.run(fetches=y_, feed_dict=feed_dict_test)
y_true = tf.argmax(data.test.labels, axis=1).eval()
y_true = [labels[i] for i in y_true]
y_preds = [labels[i] for i in y_preds]
conf = metrics.confusion_matrix(y_true, y_preds, labels=labels)
report = metrics.classification_report(y_true, y_preds, labels=labels)

nostream_dict = ['http', 'https']
y_stream_true = []
y_stream_preds = []
for i, v in enumerate(y_true):
    pred = y_preds[i]
    if v in nostream_dict:
        y_stream_true.append('non-streaming')
    else:
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

fig_ROC = plt.figure(figsize=(8, 6))
plt.plot(mean_fpr, mean_tpr, 'r-',
         label='3-fold CV Mean ROC (area = %0.2f)' % mean_auc, lw=2)

# scores = cross_val_score(clf, X, Y, cv=3, scoring='f1')
# print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

# also fit the full dataset to get a ROC curve
clf.fit(X, Y)
y_pred = clf.predict_proba(df[predictors])[:, 1]
utils.plot_ROC(Y, y_pred)

# make hard predictions on the full data set
yy = clf.predict(df[predictors])
df = df.assign(predicted_label=yy)
check = df[['predicted_label']]
full_res = pd.merge(check, features, left_index=True, right_index=True)

# compute feature importance from impurity
utils.plot_feature_importances(clf.feature_importances_)

# compute feature importance from accuracy:
# the idea is to permute the values of each feature and see its impact on the accuracy
scores = defaultdict(list)
for train_idx, test_idx in ShuffleSplit(len(df), n_iter=10, test_size=0.3,
def main():
    # load and preprocess data
    train_labels, train_imgs = extract_data(config.train_path)
    test_labels, test_imgs = extract_data(config.test_path)
    f = open(config.output_file, 'w')

    # model selection
    if config.select_model is True:
        print("Selecting model...")
        f.write("Scores for model selection:\n")

        # original image
        plain = True
        # set parameters for feature extraction
        pool = {'take': False, 'class': 'max'}
        hist = {'take': False, 'h': [4], 'w': [4]}
        grad = {'take': False, 'class': 'hist'}
        chain = {'take': False, 'class': 'hist'}
        select_feats1 = get_feats(train_imgs, plain, pool, hist, grad, chain)

        # feature vector
        pool = {'take': False, 'class': 'max'}
        hist = {'take': True, 'h': [4], 'w': [4]}
        grad = {'take': True, 'class': 'hist'}
        chain = {'take': True, 'class': 'hist'}
        select_feats2 = get_feats(train_imgs, plain, pool, hist, grad, chain)

        # get cross-validation scores
        f.write("Baseline (original image):" + '\n')
        print("logistic regression models:")
        scores = cross_valid(config.models_select1, select_feats1, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names1[i] + ':' + str(scores[i]) + '\n')
        print("multi-class logistic regression models:")
        scores = cross_valid(config.models_select2, select_feats1, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names2[i] + ':' + str(scores[i]) + '\n')
        print("k-nearest neighbour models:")
        scores = cross_valid(config.models_select3, select_feats1, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names3[i] + ':' + str(scores[i]) + '\n')

        f.write("\nFeature vector:" + '\n')
        scores = cross_valid(config.models_select1, select_feats2, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names1[i] + ':' + str(scores[i]) + '\n')
        print("multi-class logistic regression models:")
        scores = cross_valid(config.models_select2, select_feats2, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names2[i] + ':' + str(scores[i]) + '\n')
        print("k-nearest neighbour models:")
        scores = cross_valid(config.models_select3, select_feats2, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names3[i] + ':' + str(scores[i]) + '\n')

        f.write("\n######################\n\n")

    # feature selection
    if config.select_feature is True:
        print("Selecting features...")
        f.write("Scores for feature selection:\n")
        plain = True
        # set parameters for feature extraction
        pool = {'take': False, 'class': 'max'}
        hist = {'take': False, 'h': [4], 'w': [4]}
        grad = {'take': False, 'class': 'hist'}
        chain = {'take': False, 'class': 'hist'}

        # histogram
        hist['take'] = True
        print("Extract histogram from training data set...")
        f.write("\nHistogram:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(scores[i]) + '\n')

        # gradient histogram
        hist['take'] = False
        grad['take'] = True
        print("Extract gradient histogram from training data set...")
        f.write("\nGradient histogram:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(scores[i]) + '\n')

        # gradient image
        grad['class'] = 'plain'
        print("Extract gradient image from training data set...")
        f.write("\nGradient image:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(scores[i]) + '\n')

        # chain code histogram
        grad['take'] = False
        chain['take'] = True
        print("Extract chain code histogram from training data set...")
        f.write("\nChain code histogram:\n")
        select_feats = get_feats(train_imgs, plain, pool, hist, grad, chain)
        scores = cross_valid(config.models, select_feats, train_labels)
        print(scores)
        for i, s in enumerate(scores):
            f.write(config.names[i] + ':' + str(scores[i]) + '\n')

        f.write("\n######################\n\n")

    if config.produce_results is True or config.draw_ROC is True:
        # feature extraction
        print("Extract feature from training data set...")
        train_feats = get_feats(train_imgs, config.plain, config.pool,
                                config.hist, config.grad, config.chain)
        print("Extract feature from testing data set...")
        test_feats = get_feats(test_imgs, config.plain, config.pool,
                               config.hist, config.grad, config.chain)
        print("All data processed. Number of features extracted is " + str(len(train_feats[0])))

    if config.produce_results is True:
        print("Producing prediction results...")
        f.write("Prediction results\n")
        f.write('original image: ' + str(config.plain))
        f.write('\n')
        f.write('pooled:' + str(config.pool))
        f.write('\n')
        f.write('histogram:' + str(config.hist))
        f.write('\n')
        f.write('gradient:' + str(config.grad))
        f.write('\n')
        f.write('chain code:' + str(config.chain))
        f.write('\n\n')
        all_preds = final_result(config.models, config.names, train_feats,
                                 train_labels, test_feats, test_labels, f)

        if config.visualize_error is True:
            preds = all_preds[2]
            err_imgs = test_imgs[preds != test_labels]
            err_labels = test_labels[preds != test_labels]
            visualize(err_labels, err_imgs)

    if config.draw_ROC is True:
        print("Drawing ROC for LDA model...")
        preds_proba = plot_ROC(config.lda, train_feats, train_labels,
                               test_feats, test_labels)
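# Hedged sketch (assumption, not the project's actual helper): `cross_valid(models,
# feats, labels)` above appears to return one cross-validation score per candidate
# model. A minimal scikit-learn version might be:
import numpy as np
from sklearn.model_selection import cross_val_score

def cross_valid(models, feats, labels, cv=5):
    # mean CV accuracy for each candidate model, in the same order as `models`
    return [np.mean(cross_val_score(m, feats, labels, cv=cv)) for m in models]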