def embedFig(target="IBD"):
    """Evaluate a classifier on embedding-weighted features and save its curves.

    Builds train/val/test splits via hf.getMlInput with embed=True, folds the
    validation split back into training, fits/evaluates with hf.predictIBD
    (plotting ROC and PR curves), and writes the figure to fig_dir.
    Relies on module-level globals: otu_train/otu_test, map_train/map_test,
    qual_vecs, fig_dir, hf, plt, pd, os.
    """
    fig = plt.figure(figsize=(15, 5))
    (X_train, X_val, X_test,
     y_train, y_val, y_test) = hf.getMlInput(
        otu_train, otu_test, map_train, map_test,
        target=target, embed=True, qual_vecs=qual_vecs)
    # Fold the validation split back into the training data for the final fit.
    X_train = pd.concat([X_train, X_val], axis=0)
    y_train = y_train + y_val
    plt.subplot(1, 2, 1)
    results = hf.predictIBD(
        X_train, y_train, X_test, y_test,
        graph_title="Embedding weighted by averaging taxa "
        + str(X_train.shape[1]) + " features",
        max_depth=5, n_estimators=95, weight=20,
        plot=True, plot_pr=True)
    (m, auc_embed, auc_train_embed, fpr_embed, tpr_embed,
     prec_embed, f1_embed, f2_embed, _) = results
    fig.savefig(os.path.join(fig_dir, "curves_AGP_test_embed.pdf"))
def pcaFig(target="IBD"):
    """Evaluate a classifier on PCA-reduced features and save its curves.

    Builds train/val/test splits via hf.getMlInput with pca_reduced=True
    (100 components), folds the validation split back into training,
    fits/evaluates with hf.predictIBD (plotting ROC and PR curves), and
    writes the figure to fig_dir.
    Relies on module-level globals: otu_train/otu_test, map_train/map_test,
    fig_dir, hf, plt, pd, os.
    """
    f = plt.figure(figsize=(15, 5))
    X_train, X_val, X_test, y_train, y_val, y_test = hf.getMlInput(
        otu_train, otu_test, map_train, map_test,
        target=target, pca_reduced=True, numComponents=100)
    # Fold the validation split back into the training data for the final fit.
    X_train = pd.concat([X_train, X_val], axis=0)
    y_train = y_train + y_val
    plt.subplot(1, 2, 1)
    m, auc_pca, auc_train_pca, fpr_pca, tpr_pca, prec_pca, f1_pca, f2_pca, _ = hf.predictIBD(
        X_train, y_train, X_test, y_test,
        graph_title="PCA dimensionality reduced " + str(X_train.shape[1]) + " features",
        max_depth=5, n_estimators=50, weight=20, plot=True, plot_pr=True)
    # Fix: was os.path.join(fig_dir + "curves_AGP_test_pca.pdf"), which glues the
    # directory and filename together with no separator. Pass the filename as a
    # separate argument, consistent with embedFig/asinFig.
    f.savefig(os.path.join(fig_dir, "curves_AGP_test_pca.pdf"))
def asinFig(target="IBD"):
    """Evaluate a classifier on asinh-normalized taxa abundances and save its curves.

    Builds train/val/test splits via hf.getMlInput with asinNormalized=True,
    folds the validation split back into training, fits/evaluates with
    hf.predictIBD (plotting ROC and PR curves), and writes the figure to
    fig_dir. Relies on module-level globals: otu_train/otu_test,
    map_train/map_test, fig_dir, hf, plt, pd, os.
    """
    fig = plt.figure(figsize=(15, 5))
    (X_train, X_val, X_test,
     y_train, y_val, y_test) = hf.getMlInput(
        otu_train, otu_test, map_train, map_test,
        target=target, asinNormalized=True)
    # Fold the validation split back into the training data for the final fit.
    X_train = pd.concat([X_train, X_val], axis=0)
    y_train = y_train + y_val
    plt.subplot(1, 2, 1)
    results = hf.predictIBD(
        X_train, y_train, X_test, y_test,
        graph_title="Normalized asinh Taxa Abundances "
        + str(X_train.shape[1]) + " features",
        max_depth=5, n_estimators=170, weight=20,
        plot=True, plot_pr=True)
    (m, auc_asin, auc_train_asin, fpr_asin, tpr_asin,
     prec_asin, f1_asin, f2_asin, _) = results
    fig.savefig(os.path.join(fig_dir, "asin_otu.pdf"))
# Classifying embedded data i.e. 113 features using Naive Bayes X_embed_train, X_embed_test, y_embed_train, y_embed_test = train_test_split(X_embed, y_embed, test_size = 0.2, random_state = 10) # Input data has negative values, MultinomialNB and ComplementNG cannot be used. clf = GaussianNB() model = clf.fit(X_embed_train, y_embed_train) predicted_y = model.predict(X_embed_test) f = plt.figure(figsize=(15,5)) roc_auc, fpr, tpr, average_precision, f1, f2 = hf.computeMLstats(model, X_embed_test, y_embed_test, plot=True, plot_pr=True, graph_title = "Naive Bayes Classifier on embedded data", flipped = False) f.savefig(os.path.join(fig_dir, "naive_bayes_classifier_embed.pdf")) # Classifying OTU data i.e. 26k+ features using Naive Bayes X_train, X_val, X_test, y_train, y_val, y_test = hf.getMlInput(otu_train, otu_test, map_train, map_test, target = "IBD", asinNormalized=True) X_train = pd.concat([X_train, X_val], axis = 0) y_train = y_train + y_val # Input data has negative values, MultinomialNB and ComplementNG cannot be used. clf = GaussianNB() model = clf.fit(X_train, y_train) predicted_y = model.predict(X_test) f = plt.figure(figsize=(15,5)) roc_auc, fpr, tpr, average_precision, f1, f2 = hf.computeMLstats(model, X_test, y_test, plot=True, plot_pr=True, graph_title = "Naive Bayes Classifier on OTU table", flipped = False) f.savefig(os.path.join(fig_dir, "naive_bayes_classifier_otu.pdf"))
weight=20, plot=True, plot_pr=True) f.savefig(os.path.join(fig_dir + "curves_AGP_test_pca.pdf")) pcaFig() importlib.reload(hf) target = "IBD" # hf.getMlInput generates training, validation, and testing data # if embed is True, the data is normalized using asinh i.e. hyperbolic inverse sin function X_train, X_val, X_test, y_train, y_val, y_test = hf.getMlInput( otu_train, otu_test, map_train, map_test, target=target, embed=True, qual_vecs=qual_vecs) X = pd.concat([X_train, X_val, X_test], axis=0) y = y_train + y_val + y_test # These values are never used again (?) auc_crossVal, auc_prec_crossVal, f1_crossVal, feat_imp_embed = hf.crossValPrediction( X, y, max_depth=2, n_estimators=50, weight=20) # weights/importance of 113 features (100 properties + 13 demographic features) feat_imp_df = hf.getFeatImpDf(feat_imp_embed) # mapping of property to function from kegg database pathway_table = pd.read_csv(data_dir + "/property_pathway_dict.txt",