# Grid over (num_cluster x mixtures): run DA-NMF transfer clustering for every
# combination and collect 5 accuracy measures per cell of the grid.
accs_names = ['Calinski-Harabaz', 'Silhouette (euc)', 'Silhouette (corr)', 'Silhouette (jacc)', 'ARI']
accs = np.zeros((5, len(mixtures), len(num_cluster)))
for i in range(len(num_cluster)):
    for j in range(len(mixtures)):
        trg_k = num_cluster[i]
        mix = mixtures[j]
        # BUG FIX: the original print had {0}/{1} placeholders but no
        # .format() call, so it printed the literal braces.
        print('Iteration k={0} mix={1}'.format(trg_k, mix))
        # --------------------------------------------------
        # 3.1. SETUP SOURCE DATA NMF CLUSTERING
        # --------------------------------------------------
        src_clustering = None
        if src_data is not None:
            src_clustering = NmfClustering(src_data, src_gene_ids, num_cluster=arguments.src_k)
            src_clustering.add_cell_filter(src_cell_filter_fun)
            src_clustering.add_gene_filter(src_gene_filter_fun)
            src_clustering.set_data_transformation(src_data_transf_fun)
        # --------------------------------------------------
        # 3.2. SETUP TARGET DATA CLUSTERING
        # --------------------------------------------------
        # BUG FIX: the original used "is 'NMF'" which compares object
        # identity, not string equality; use '==' instead.
        if arguments.method == 'NMF' and src_data is not None:
            print('Transfer learning method is NMF.')
            trg_clustering = DaNmfClustering(src_clustering, trg_data, trg_gene_ids,
                                             num_cluster=trg_k)
            trg_clustering.add_cell_filter(trg_cell_filter_fun)
            trg_clustering.add_gene_filter(trg_gene_filter_fun)
            trg_clustering.set_data_transformation(trg_data_transf_fun)
            trg_clustering.apply(mix=mix, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1,
                                 max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
# Load, filter and log-transform the target data set, then train the source
# NMF model and transfer it to the target via DaNmfClustering.
data_src = data_transformation_log2(data_src)

# Load target data
data_trg = np.loadtxt(path_trg)
# BUG FIX: np.str was a deprecated alias removed in NumPy 1.24; the builtin
# str is the documented replacement and behaves identically for loadtxt.
gene_ids_trg = np.loadtxt(path_geneids_trg, dtype=str)

# Delete non-unique genes
data_trg, gene_ids_trg = delete_nonunique_genes(data_trg, gene_ids_trg)

# Apply cell filter
valid_cells = cell_filter(data_trg)
# Apply gene filter
valid_genes = gene_filter(data_trg)

# Create filtered data (cells are columns, genes are rows)
data_trg = data_trg[:, valid_cells]
data_trg = data_trg[valid_genes, :]
gene_ids_trg = gene_ids_trg[valid_genes]

# Log transform data
data_trg = data_transformation_log2(data_trg)

# train source and test performance
source_nmf = NmfClustering(data_src, gene_ids_src, num_cluster=n_source_cluster)
source_nmf.apply(k=n_source_cluster, max_iter=100, rel_err=1e-3)
# Number of repetitions can be changed in line 153 of utils.py
target_nmf = DaNmfClustering(source_nmf, data_trg.copy(), gene_ids_trg,
                             num_cluster=n_target_cluster)
target_nmf.apply(k=n_target_cluster, calc_transferability=True)
# target_nmf.transferability_pvalue
# np.savez(fname, source_ari=source_ari, target_ari=target_ari, n_mix=n_mix, n_source=n_source, n_target=n_target, n_source_cluster=n_source_cluster,
#          n_target_cluster=n_target_cluster)
# --------------------------------------------------
# BUG FIX: in Python 3 map() returns a lazy iterator without len(), so the
# len(num_cluster) calls below would raise TypeError — materialize a list.
# np.int is a removed NumPy alias; the builtin int is equivalent here.
num_cluster = list(map(int, arguments.cluster_range.split(",")))
accs_names = ['KTA (linear)', 'ARI']
accs = np.zeros((2, len(num_cluster)))
for i in range(len(num_cluster)):
    k = num_cluster[i]
    print('Iteration {0}, num-cluster={1}'.format(i, k))
    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    if labels is None:
        # No source labels are provided, generate them via NMF clustering
        # (only on the first iteration — labels is reassigned below).
        nmf_labels = NmfClustering(data, gene_ids, num_cluster=k, labels=[])
        nmf_labels.add_cell_filter(cell_filter_fun)
        nmf_labels.add_gene_filter(gene_filter_fun)
        nmf_labels.set_data_transformation(data_transf_fun)
        nmf_labels.apply(k=k, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1,
                         max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
        labels = nmf_labels.cluster_labels

    # Use perfect number of latent states for nmf and sc3
    src_labels = np.array(labels, dtype=int)  # np.int removed in NumPy >= 1.24
    src_lbl_set = np.unique(src_labels)
    k_now = src_lbl_set.size

    nmf = NmfClustering_initW(data, gene_ids, labels=labels, num_cluster=k_now)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
# Repetition loop: stratified-split the data into source/target, train a
# source NMF model and keep only repetitions with a good source ARI.
# NOTE(review): this chunk was collapsed onto one physical line; the nesting
# below is reconstructed.  The statements after sss.split use split_1/split_2,
# so they are assumed to live inside that for-loop — confirm against upstream.
print np.unique(data.cluster_labels)
source_ari = np.zeros(reps)
# 7 result slots per (repetition, target size, mixture value) combination.
target_ari = np.zeros((7, reps, len(n_target), len(n_mix)))
n = 0
while n < reps:
    # Fraction of all cells to reserve for the source split this repetition.
    source_perc = n_source / float(data.cluster_labels.size)
    sss = StratifiedShuffleSplit(n_splits=2, test_size=source_perc)
    for split_1, split_2 in sss.split(data.pp_data.T, data.cluster_labels):
        print split_1.size, split_2.size
        source_data = data.pp_data[:, split_2]
        source_labels = data.cluster_labels[split_2]
        # train source and test performance
        source_nmf = NmfClustering(source_data, gene_ids, num_cluster=n_source_cluster)
        source_nmf.apply(k=n_source_cluster, max_iter=4000, rel_err=1e-3)
        source_ari[n] = metrics.adjusted_rand_score(source_labels, source_nmf.cluster_labels)
        print 'ITER(', n, '): SOURCE ARI = ', source_ari[n]
        # Reject repetitions whose source clustering is poor (ARI < 0.94);
        # the while-loop retries without advancing n.
        if source_ari[n] < 0.94:
            continue
        for i in range(len(n_target)):
            # Subsample target sets of increasing size from the remaining cells.
            target_perc = n_target[i] / float(split_1.size)
            ttt = StratifiedShuffleSplit(n_splits=2, test_size=target_perc)
            for split_11, split_22 in ttt.split(data.pp_data[:, split_1].T,
                                                data.cluster_labels[split_1]):
                print split_11.size, split_22.size
                # shuffle the gene ids for testing
                perm_inds = np.random.permutation(data.pp_data.shape[0])
                target_gene_ids = gene_ids[perm_inds].copy()
# NOTE(review): this chunk was collapsed onto one physical line; indentation
# below is reconstructed.  The leading print closes an unseen `if` branch
# above this chunk — confirm against the full file.
    print data.shape
else:
    # Cell and gene filter and transformation within the procedure
    cell_filter_fun = partial(sc.cell_filter, num_expr_genes=min_expr_genes,
                              non_zero_threshold=non_zero_threshold)
    gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=perc_consensus_genes,
                              non_zero_threshold=non_zero_threshold)
    data_transf_fun = sc.data_transformation_log2
    # Generating labels from complete dataset
    print "Train complete data"
    complete_nmf = None
    complete_nmf = NmfClustering(data, np.arange(data.shape[0]), num_cluster=num_cluster)
    complete_nmf.add_cell_filter(cell_filter_fun)
    complete_nmf.add_gene_filter(gene_filter_fun)
    complete_nmf.set_data_transformation(data_transf_fun)
    complete_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1,
                       max_iter=nmf_max_iter, rel_err=nmf_rel_err)
    # Get labels
    # NOTE(review): the call below is truncated here in this chunk; its
    # remaining arguments continue past this excerpt.
    desc, target_nmf, trg_lbls_pred, mixed_data = method_sc3_filter(
        complete_nmf, data, [], cell_filter=cell_filter_fun, gene_filter=gene_filter_fun,
def method_nmf(src, src_labels, trg, trg_labels, n_src_cluster, n_trg_cluster):
    """Cluster the target data with plain NMF (no transfer from the source).

    The source-related arguments are accepted but unused — presumably so the
    signature matches the sibling method_* callables (verify against callers).
    Returns a (description, predicted target labels, None) triple.
    """
    gene_inds = np.arange(trg.shape[0])
    model = NmfClustering(trg, gene_inds, num_cluster=n_trg_cluster)
    model.apply()
    return 'NMF', model.cluster_labels, None
else: raise Warning("Within-Filtering is not implemented for R SC3") # Cell and gene filter and transformation within the procedure cell_filter_fun = partial(sc.cell_filter, num_expr_genes=min_expr_genes, non_zero_threshold=non_zero_threshold) gene_filter_fun = partial(sc.gene_filter, perc_consensus_genes=perc_consensus_genes, non_zero_threshold=non_zero_threshold) data_transf_fun = sc.data_transformation_log2 # Generating labels from complete dataset print("Train complete data") complete_nmf = None complete_nmf = NmfClustering(data, np.arange(data.shape[0]), num_cluster=num_cluster, labels=[]) complete_nmf.add_cell_filter(cell_filter_fun) complete_nmf.add_gene_filter(gene_filter_fun) complete_nmf.set_data_transformation(data_transf_fun) complete_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err) # Get labels labels = complete_nmf.cluster_labels label_names, label_counts = np.unique(labels, return_counts=True) print("Labels: ", label_names) print("Counts: ", label_counts)
accs_names = [ 'KTA (linear)', 'Silhouette (euc)', 'Silhouette (pearson)', 'Silhouette (spearman)', 'ARI' ] accs = np.zeros((5, len(num_cluster))) for i in range(len(num_cluster)): k = num_cluster[i] print('Iteration {0}, num-cluster={0}'.format(i, k)) # -------------------------------------------------- # 3.1. SETUP SOURCE DATA NMF CLUSTERING # -------------------------------------------------- nmf = None nmf = NmfClustering(data, gene_ids, num_cluster=k) nmf.add_cell_filter(cell_filter_fun) nmf.add_gene_filter(gene_filter_fun) nmf.set_data_transformation(data_transf_fun) nmf.apply(k=k, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1, max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err) # -------------------------------------------------- # 3.2. EVALUATE CLUSTER ASSIGNMENT # -------------------------------------------------- print('\nUnsupervised evaluation:') accs[0, i] = unsupervised_acc_kta(nmf.pp_data, nmf.cluster_labels,
else: src, trg, src_labels, trg_labels = split_source_target( data, labels, mode=1, target_ncells=n_trg, source_ncells=n_src[s]) src_labels = np.array(src_labels, dtype=np.int) #src_labels_SC3 = np.array(src_labels_SC3, dtype=np.int) # 3.c. train source once per repetition print "Train source data of rep {0}".format(r + 1) source_nmf = None source_nmf = NmfClustering(src, np.arange(src.shape[0]), num_cluster=num_cluster) source_nmf.add_cell_filter(cell_filter_fun) source_nmf.add_gene_filter(gene_filter_fun) source_nmf.set_data_transformation(data_transf_fun) source_nmf.apply(k=num_cluster, alpha=nmf_alpha, l1=nmf_l1, max_iter=nmf_max_iter, rel_err=nmf_rel_err) # Calculate ARIs and KTAs source_aris[s, r] = metrics.adjusted_rand_score( src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels) print 'SOURCE ARI Labels NMF, Method NMF = ', source_aris[s, r]
source_clusters=None, noise_target=False, noise_sd=0.1, common=common[c], cluster_spec=cluster_spec) trg_labels = np.array(trg_labels, dtype=np.int) src_labels = np.array(src_labels, dtype=np.int) # 3.a. Subsampling order for target inds = np.random.permutation(trg_labels.size) # 3.b. Use perfect number of latent states for nmf and sc3 src_lbl_set = np.unique(src_labels) n_trg_cluster = np.unique(trg_labels).size n_src_cluster = src_lbl_set.size # 3.c. train source once per repetition source_nmf = NmfClustering(src, np.arange(src.shape[0]), num_cluster=n_src_cluster) source_nmf.apply(k=n_src_cluster, max_iter=4000, rel_err=1e-3) source_aris[s, g, c, r] = metrics.adjusted_rand_score( src_labels, source_nmf.cluster_labels) print 'ITER(', r, '): SOURCE ARI = ', source_aris[s, g, c, r] if source_aris[s, g, c, r] < 0.94: continue # 3.d. Target data subsampling loop plot_cnt = 1 for i in range(len(percs)): n_trg_perc = np.int(n_trg * percs[i]) p_trg = trg[:, inds[:n_trg_perc]].copy() p_trg_labels = trg_labels[inds[:n_trg_perc]].copy() # 4. MTL/DA mixing parameter loop res_desc = list()