def test_GMM_Unsup(genes, gene_len):
    """Check that the unsupervised GMM pipeline yields the expected labels.

    Builds character k-mer counts (k from 2 to 6) for ``genes``, divides each
    count row by the matching entry of ``gene_len``, fits a two-component,
    full-covariance mixture with a fixed seed, and compares the predicted
    labels with a hard-coded expectation.

    Args:
        genes: Sequence of gene strings (presumably a pytest fixture with
            four sequences, given the four expected labels -- confirm).
        gene_len: Per-gene lengths, looked up as ``gene_len[i]`` for row ``i``.
    """
    k_min = 2
    k_max = 6
    num_class = 2
    cov_type = 'full'
    seed = 0

    count_vect = CountVectorizer(analyzer='char', ngram_range=(k_min, k_max))
    X = count_vect.fit_transform(genes)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(count_vect, 'get_feature_names_out'):
        chars = count_vect.get_feature_names_out()
    else:
        chars = count_vect.get_feature_names()

    # Normalise raw k-mer counts by gene length, one row per gene.
    kmers = X.toarray()
    kmer_freq = [row / gene_len[i] for i, row in enumerate(kmers)]
    kmer_table = pd.DataFrame(kmer_freq, columns=chars)

    gmm = GMM(n_components=num_class, covariance_type=cov_type,
              random_state=seed).fit(kmer_table)
    # NOTE(review): assigned after fit(), so it does not influence the fitted
    # model; kept so this test keeps mirroring get_predictions().
    gmm.init_params = 'random'
    labels = gmm.predict(kmer_table)

    # Compare as lists so a wrong number of labels fails as a plain assertion
    # instead of erroring inside an element-wise array comparison.
    assert list(labels) == [0, 0, 1, 0]
def get_predictions(path, k_min, k_max, num_class, cov_type, seed):
    """Cluster the k-mer table built from ``path`` and return the labels.

    Args:
        path: Input location forwarded to ``get_kmer_table``.
        k_min: Smallest k-mer length forwarded to ``get_kmer_table``.
        k_max: Largest k-mer length forwarded to ``get_kmer_table``.
        num_class: Number of mixture components.
        cov_type: Covariance type passed to the mixture model.
        seed: Random state passed to the mixture model.

    Returns:
        The component label predicted for each row of the k-mer table.
    """
    features = get_kmer_table(path, k_min, k_max)

    model = GMM(n_components=num_class, covariance_type=cov_type,
                random_state=seed)
    model = model.fit(features)
    # NOTE(review): set after fit(), so it has no effect on the fitted model
    # used below -- confirm whether random initialisation was intended.
    model.init_params = 'random'

    return model.predict(features)
def get_predictions_semi(path, k_min, k_max, num_class, cov_type, seed, labels):
    """Cluster the k-mer table from ``path``, seeding means from known labels.

    The mixture model is configured with ``init_params = 'random'`` before
    fitting. If every class id in ``range(num_class)`` occurs in ``labels``,
    the initial component means are set to the per-class mean row of the
    k-mer table; otherwise no initial means are supplied.

    Args:
        path: Input location forwarded to ``get_kmer_table``.
        k_min: Smallest k-mer length forwarded to ``get_kmer_table``.
        k_max: Largest k-mer length forwarded to ``get_kmer_table``.
        num_class: Number of mixture components / expected class ids.
        cov_type: Covariance type passed to the mixture model.
        seed: Random state passed to the mixture model.
        labels: Per-row class ids; any list, array or Series accepted by
            ``pd.Series`` (presumably aligned row-for-row with the k-mer
            table -- confirm against callers).

    Returns:
        The component label predicted for each row of the k-mer table.
    """
    kmer_table = get_kmer_table(path, k_min, k_max)

    # Name the column explicitly: an unnamed list/array would otherwise land
    # in a column called 0 and the 'Labels' lookup below would fail.
    labelled = pd.concat(
        [kmer_table, pd.Series(labels, name='Labels')], axis=1)
    label_col = labelled['Labels']

    gmm = GMM(n_components=num_class, covariance_type=cov_type,
              random_state=seed)
    gmm.init_params = 'random'

    # Build the membership set once rather than a fresh list per class id.
    known = set(label_col)
    targets = [i for i in range(num_class) if i in known]

    # Seed the means only when every class has at least one labelled row.
    if len(targets) == num_class:
        gmm.means_init = np.array(
            [kmer_table[label_col == i].mean(axis=0) for i in targets])

    gmm.fit(kmer_table)
    return gmm.predict(kmer_table)