def method_da_nmf(src, src_labels, trg, trg_labels, n_src_cluster, n_trg_cluster, mix=0.0):
    """Cluster target data with DA-NMF using a source NMF clustering.

    A ``NmfClustering`` is built on ``src`` and handed to a
    ``DaNmfClustering`` on ``trg``; both use plain row indices
    (``np.arange(<data>.shape[0])``) as ids. Only the target clustering is
    applied explicitly here.

    :param src: source data array (only ``src.shape[0]`` is read here)
    :param src_labels: unused; kept for a uniform method signature
    :param trg: target data array (only ``trg.shape[0]`` is read here)
    :param trg_labels: unused; kept for a uniform method signature
    :param n_src_cluster: number of clusters for the source NMF
    :param n_trg_cluster: number of clusters for the target DA-NMF
    :param mix: mixing value forwarded to ``DaNmfClustering.apply``
    :return: tuple ``(description, cluster_labels, reject)`` where the last
        two are read from the applied ``DaNmfClustering`` instance
    """
    src_clustering = NmfClustering(src, np.arange(src.shape[0]), num_cluster=n_src_cluster)
    cp = DaNmfClustering(src_clustering, trg, np.arange(trg.shape[0]), num_cluster=n_trg_cluster)
    cp.apply(mix=mix)
    lbls = cp.cluster_labels
    # np.int was only an alias of the builtin int and has been removed from
    # recent NumPy releases; int() truncates exactly the same way.
    desc = 'DA-NMF-Mix {0}'.format(int(mix*100.))
    return desc, lbls, cp.reject
def da_nmf_distances(data, gene_ids, src, metric='euclidean', mixture=0.5, num_cluster=3):
    """Mix plain distances with distances on the DA-NMF reconstruction.

    For ``mixture == 0.0`` the result of ``distances(data, [], metric)`` is
    returned directly and no NMF is computed. Otherwise a ``DaNmfClustering``
    is applied (with ``mix=0.0``) and the distances of ``W.dot(H2)`` are
    rescaled to the maximum of the plain distances before both are combined
    as ``mixture * nmf_dist + (1 - mixture) * plain_dist``.

    :param data: target data passed to ``distances`` and ``DaNmfClustering``
    :param gene_ids: gene ids of ``data`` passed to ``DaNmfClustering``
    :param src: source clustering passed to ``DaNmfClustering``
    :param metric: metric name forwarded to ``distances``
    :param mixture: weight of the NMF-based distances (0.0 = plain only)
    :param num_cluster: number of clusters for the DA-NMF model
        (default 3, the formerly hard-coded value)
    :return: the (mixed) distances
    :raises ValueError: if the NMF-based distances are all (numerically)
        zero and ``mixture == 1.0``
    """
    if mixture == 0.0:
        return distances(data, [], metric=metric)

    cp = DaNmfClustering(src, data, gene_ids, num_cluster=num_cluster)
    cp.apply(mix=0.0)
    W, _, H2 = cp.intermediate_model

    # convex combination of vanilla distance and nmf distance
    dist1 = distances(data, [], metric=metric)
    dist2 = distances(W.dot(H2), [], metric=metric)

    # normalize distance
    max_dist2 = np.max(dist2)
    if max_dist2 < 1e-10:
        if mixture == 1.0:
            raise ValueError('Distances are all zero and mixture=1.0. Seems that source and target'
                             ' data do not go well together.')
        # Rescaling would divide by (almost) zero, so dist2 is used as is.
        print('Warning! Max distance is 0.0.')
    else:
        max_dist1 = np.max(dist1)
        # Single-argument print call: valid on Python 2 and 3 and prints the
        # same text as the former print statement (incl. the double space).
        print('Max dists before normalization:  {0} {1}'.format(max_dist1, max_dist2))
        dist2 *= max_dist1 / max_dist2
    return mixture*dist2 + (1.-mixture)*dist1
# --------------------------------------------------
# 3.1. MIX TARGET & SOURCE DATA
# --------------------------------------------------
# The source model is loaded from disk at this point. The original note
# ("src data gets while applying da_nmf...") is truncated; presumably the
# source object gets modified while applying da_nmf -- TODO confirm.
# NOTE(security): the archive holds a pickled Python object, which NumPy
# >= 1.16.3 refuses to load unless allow_pickle=True. Unpickling can execute
# arbitrary code, so arguments.src_fname must point to a trusted file.
src_data = np.load(arguments.src_fname, allow_pickle=True)
src_nmf = src_data['src'][()]  # unwrap the stored object
print(type(src_nmf))

# Reset the source object's filters/transformation to pass-through versions:
# the filters return every column/row index, the transformation is identity.
src_nmf.cell_filter_list = list()
src_nmf.gene_filter_list = list()
src_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
src_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
src_nmf.set_data_transformation(lambda x: x)

da_nmf = DaNmfClustering(src_nmf, data, gene_ids, k)
da_nmf.add_cell_filter(cell_filter_fun)
da_nmf.add_gene_filter(gene_filter_fun)
da_nmf.set_data_transformation(data_transf_fun)

# Transferability is only requested for the first (i, j) combination.
calc_transf = bool(i == 0 and j == 0)
mix_data, _, _ = da_nmf.get_mixed_data(
    k=k, mix=mix, reject_ratio=0., calc_transferability=calc_transf, max_iter=2000)
mix_gene_ids = da_nmf.common_ids
if calc_transf:
    # The last reject entry is a pair; its second item is stored.
    _, accs_trans[j, i] = da_nmf.reject[-1]
# --------------------------------------------------
# 3.1. SETUP SOURCE DATA NMF CLUSTERING
# --------------------------------------------------
# Without source data no source clustering is built and src_clustering
# stays None.
src_clustering = None
if src_data is not None:
    src_clustering = NmfClustering(src_data, src_gene_ids, num_cluster=arguments.src_k)
    src_clustering.add_cell_filter(src_cell_filter_fun)
    src_clustering.add_gene_filter(src_gene_filter_fun)
    src_clustering.set_data_transformation(src_data_transf_fun)

# --------------------------------------------------
# 3.2. SETUP TARGET DATA CLUSTERING
# --------------------------------------------------
# The method name is compared by value (==). The former identity test
# ("is 'NMF'") depends on string interning and is not reliable for strings
# created at runtime, so these branches could be skipped silently.
if arguments.method == 'NMF' and src_data is not None:
    print('Transfer learning method is NMF.')
    trg_clustering = DaNmfClustering(src_clustering, trg_data, trg_gene_ids, num_cluster=trg_k)
    trg_clustering.add_cell_filter(trg_cell_filter_fun)
    trg_clustering.add_gene_filter(trg_gene_filter_fun)
    trg_clustering.set_data_transformation(trg_data_transf_fun)
    trg_clustering.apply(mix=mix, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1,
                         max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)

if arguments.method == 'NMF' and src_data is None:
    print('Single task clustering method is NMF.')
    trg_clustering = NmfClustering(trg_data, trg_gene_ids, num_cluster=trg_k)
    trg_clustering.add_cell_filter(trg_cell_filter_fun)
    trg_clustering.add_gene_filter(trg_gene_filter_fun)
    trg_clustering.set_data_transformation(trg_data_transf_fun)
    trg_clustering.apply(alpha=arguments.nmf_alpha, l1=arguments.nmf_l1,
                         max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
data_src = data_transformation_log2(data_src)

# Load Target data
data_trg = np.loadtxt(path_trg)
# np.str was only an alias of the builtin str and has been removed from
# recent NumPy releases; dtype=str selects the same dtype.
gene_ids_trg = np.loadtxt(path_geneids_trg, dtype=str)

# Delete non-unique genes
data_trg, gene_ids_trg = delete_nonunique_genes(data_trg, gene_ids_trg)

# Both filters are evaluated on the same (not yet filtered) data and are
# applied only afterwards: cells select columns, genes select rows.
# Apply cell filter
valid_cells = cell_filter(data_trg)

# Apply gene filter
valid_genes = gene_filter(data_trg)

# Create filtered data
data_trg = data_trg[:, valid_cells]
data_trg = data_trg[valid_genes, :]
gene_ids_trg = gene_ids_trg[valid_genes]

# Log transform data
data_trg = data_transformation_log2(data_trg)

# train source and test performance
source_nmf = NmfClustering(data_src, gene_ids_src, num_cluster=n_source_cluster)
source_nmf.apply(k=n_source_cluster, max_iter=100, rel_err=1e-3)

# Number of repetitions can be changed in line 153 of utils.py
# The target clustering receives a copy of the target data.
target_nmf = DaNmfClustering(source_nmf, data_trg.copy(), gene_ids_trg, num_cluster=n_target_cluster)
target_nmf.apply(k=n_target_cluster, calc_transferability=True)
# target_nmf.transferability_pvalue

# np.savez(fname, source_ari=source_ari, target_ari=target_ari, n_mix=n_mix, n_source=n_source, n_target=n_target, n_source_cluster=n_source_cluster,
# n_target_cluster=n_target_cluster)
# NOTE(review): the loop nesting below is reconstructed from a
# whitespace-collapsed line -- confirm it against the original layout.
for i in range(len(n_target)):
    # Fraction of the split_1 samples to use as target data.
    target_perc = n_target[i] / float(split_1.size)
    ttt = StratifiedShuffleSplit(n_splits=2, test_size=target_perc)
    # Presumably only the sizes are reported inside this loop and the indices
    # of the last split are used afterwards -- TODO confirm.
    for split_11, split_22 in ttt.split(data.pp_data[:, split_1].T, data.cluster_labels[split_1]):
        # Single-argument print call: valid on Python 2 and 3.
        print('{0} {1}'.format(split_11.size, split_22.size))

    # shuffle the gene ids for testing
    perm_inds = np.random.permutation(data.pp_data.shape[0])
    target_gene_ids = gene_ids[perm_inds].copy()
    target_data = data.pp_data[:, split_1[split_22]]
    target_data = target_data[perm_inds, :]
    target_labels = data.cluster_labels[split_1[split_22]]

    for m in range(len(n_mix)):
        target_nmf = DaNmfClustering(source_nmf, target_data.copy(), target_gene_ids,
                                     num_cluster=n_target_cluster)
        # target_nmf.apply(k=n_target_cluster, mix=n_mix[m], calc_transferability=False)
        mixed_data, rec_trg_data, _ = target_nmf.get_mixed_data(
            mix=n_mix[m], use_H2=True, calc_transferability=False)
        W, H, H2 = target_nmf.intermediate_model

        # PCA component range: 4% (rounded down) to 7% (rounded up) of the
        # number of target cells. np.int was an alias of the builtin int that
        # recent NumPy releases removed; astype(int) is equivalent.
        num_cells = target_data.shape[1]
        max_pca_comp = np.ceil(num_cells*0.07).astype(int)
        min_pca_comp = np.floor(num_cells*0.04).astype(int)

        # Configure the SC3 pipeline on the mixed data.
        sc3_mix = SC3Clustering(mixed_data, target_gene_ids,
                                pc_range=[min_pca_comp, max_pca_comp],
                                sub_sample=True, consensus_mode=0)
        sc3_mix.add_distance_calculation(partial(sc.distances, metric='euclidean'))
        sc3_mix.add_dimred_calculation(
            partial(sc.transformations, components=max_pca_comp, method='pca'))
        sc3_mix.add_intermediate_clustering(
            partial(sc.intermediate_kmeans_clustering, k=n_target_cluster))
        sc3_mix.set_build_consensus_matrix(sc.build_consensus_matrix)
        sc3_mix.set_consensus_clustering(
            partial(sc.consensus_clustering, n_components=n_target_cluster))
# --------------------------------------------------
# The source model is loaded from disk at this point. The original note
# ("src data gets while applying da_nmf ...") is truncated; presumably the
# source object gets modified while applying da_nmf -- TODO confirm.
# NOTE(security): the archive holds a pickled Python object, which NumPy
# >= 1.16.3 refuses to load unless allow_pickle=True. Unpickling can execute
# arbitrary code, so arguments.src_fname must point to a trusted file.
src_data = np.load(arguments.src_fname, allow_pickle=True)
src_nmf = src_data['src'][()]  # unwrap the stored object
# print type(src_nmf)

src_nmf.cell_filter_list = list()
src_nmf.gene_filter_list = list()
# source data is already filtered and transformed ...
# ... so the filters return every column/row index and the transformation
# is the identity.
src_nmf.add_cell_filter(lambda x: np.arange(x.shape[1]).tolist())
src_nmf.add_gene_filter(lambda x: np.arange(x.shape[0]).tolist())
src_nmf.set_data_transformation(lambda x: x)

#print '\n\n+++++++++++++++++++++++++++++++++++'
#print np.max(data)
#print '+++++++++++++++++++++++++++++++++++\n\n'

# The target clustering works on copies of the data and gene ids.
da_nmf = DaNmfClustering(src_nmf, data.copy(), gene_ids.copy(), k)
da_nmf.add_cell_filter(cell_filter_fun)
da_nmf.add_gene_filter(gene_filter_fun)
da_nmf.set_data_transformation(data_transf_fun)
mixed_data, _, _ = da_nmf.get_mixed_data(mix=mix, calc_transferability=False)
# mix_gene_ids = da_nmf.common_ids
mix_gene_ids = da_nmf.gene_ids

#print '\n\n+++++++++++++++++++++++++++++++++++------'
#print np.max(data)
#print '+++++++++++++++++++++++++++++++++++------\n\n'

# --------------------------------------------------
# 3.2. TARGET DATA CLUSTERING