def beta_dist(data_sets, kmer_size, n_factor):
    """Load k-mer counts and return shuffled, L1-normalized features and labels.

    For every entry of ``data_sets`` the k-mer counts are loaded through
    ``load_kmer_cnts_jf.load_kmers``, normalized row-wise with the L1 norm,
    optionally passed through ``stats_utils_AEB.NMF_factor`` and finally
    shuffled together with their labels using a fixed ``random_state`` of 0.

    Args:
        data_sets: Iterable of entries; only element ``[0]`` of each entry is
            used as the data set handed to the loader.
        kmer_size: k-mer length forwarded to the loader and to ``NMF_factor``.
        n_factor: Number of NMF components. ``0`` skips the factorization
            step; any other value is passed on as ``int(n_factor)``.

    Returns:
        Tuple ``(x, y)`` holding the shuffled feature matrix and the shuffled
        integer labels of the last data set that was processed.

    NOTE(review): the original indentation of this function was lost; the
    ``return`` is assumed to follow the loop, so with several data sets only
    the last result is returned and an empty ``data_sets`` leaves ``x``/``y``
    unbound. Confirm against the callers.
    """
    # Loop over all data sets
    for data_set in data_sets:
        data_set = data_set[0]

        # Retrieve the k-mer counts and labels; only samples labelled
        # '0' or '1' are requested from the loader.
        allowed_labels = ['0', '1']
        kmer_cnts, _accessions, labelz, _domain_labels = (
            load_kmer_cnts_jf.load_kmers(kmer_size, data_set, allowed_labels))
        print("LOADED DATASET " + str(data_set[0]) + ": "
              + str(len(kmer_cnts)) + " SAMPLES")

        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy (1.24+); the builtin yields the same dtype.
        labelz = np.asarray(labelz).astype(int)

        # Row-wise L1 normalization is needed with and without NMF.
        data_normalized = normalize(kmer_cnts, axis=1, norm='l1')

        # Conduct NMF and resave to data_normalized
        if n_factor != 0:
            data_normalized = stats_utils_AEB.NMF_factor(
                data_normalized,
                kmer_size,
                n_components=int(n_factor),
                title=(str(data_set) + str(kmer_size) + "mers"
                       + str(n_factor) + "factors"))

        # Fixed seed keeps the sample order reproducible between runs.
        x, y = shuffle(data_normalized, labelz, random_state=0)

    return x, y
# Conduct NMF and resave to data_normalized #enter in desired no.factors for n in [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]: if n == 0: data_normalized = normalize(kmer_cnts, axis=1, norm='l1') data_normalized, labels = shuffle(data_normalized, labelz, random_state=0) x = data_normalized y = labels else: data_normalized = normalize(kmer_cnts, axis=1, norm='l1') data_normalized = stats_utils_AEB.NMF_factor( data_normalized, kmer_size, n_components=int(n), title=(str(data_set) + str(kmer_size) + "mers" + str(n) + "factors")) data_normalized, labels = shuffle(data_normalized, labelz, random_state=0) x = data_normalized y = labels param_grid = param_dict[learn_type] scoring = { 'Acc': 'accuracy', 'AUC': 'roc_auc', 'Precision': 'precision', 'Recall': 'recall',
kmer_size = 7 data = pd.read_pickle( "/pollard/home/abustion/deep_learning_microbiome/data_AEB/NMF_on_all_data/before_NMF_no_norm.pickle" ) data_normalized = pd.read_pickle( "/pollard/home/abustion/deep_learning_microbiome/data_AEB/NMF_on_all_data/before_NMF_with_norm.pickle" ) factors = 30 for n in range(2, factors + 1): data_NMF = stats_utils_AEB.NMF_factor( data, kmer_size, n_components=int(n), title=("ALL_DATA_no_norm_" + str(kmer_size) + "mers" + str(n) + "factors")) data_NMF.to_pickle( "/pollard/home/abustion/deep_learning_microbiome/data_AEB/NMF_on_all_data/after_NMF_no_norm_" + str(n) + "factors.pickle") factors = 30 for n in range(2, factors + 1): data_NMF = stats_utils_AEB.NMF_factor( data_normalized, kmer_size, n_components=int(n), title=("ALL_DATA_no_norm_" + str(kmer_size) + "mers" + str(n) + "factors")) data_NMF.to_pickle(