# Parse the optional source cluster specification. It is only read when
# splitting mode 6 is selected; otherwise source_clusters stays None.
source_clusters = None
if args.splitting_mode == 6:
    try:
        source_clusters = ast.literal_eval(args.source_clusters)
    except (SyntaxError, ValueError, TypeError):
        # literal_eval raises ValueError/TypeError (not SyntaxError) for input
        # that parses as Python but is not a plain literal, e.g. a bare name.
        sys.stderr.write("Error: Invalid source cluster specification.\n")
        # Non-zero status so calling shells/scripts can detect the failure.
        sys.exit(1)

# 1. GENERATE TOY DATA
print('\nGenerate artificial single-cell RNA-seq data.')
data, labels = generate_toy_data(
    num_genes=args.num_genes,
    num_cells=args.num_cells,
    cluster_spec=cluster_spec,
    dirichlet_parameter_cluster_size=args.dir_cluster_size,
    gamma_shape=args.gamma_shape,
    gamma_rate=args.gamma_rate,
    nb_dispersion=args.nb_dispersion,
    min_prop_genes_de=args.min_prop_genes_de,
    max_prop_genes_de=args.max_prop_genes_de,
    mean_de_logfc=args.mean_de_logfc,
    sd_de_logfc=args.sd_de_logfc,
)
# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('Data dimension:  %s' % (data.shape,))

# Format string for the raw data; switched to "%f" below once the data has
# been normalised to floating point.
output_fmt = "%u"

# Perform FPKM and log2 normalisation if required.
# Each column is divided by its column sum expressed in millions, then
# log2(x + 1) is applied.
# NOTE(review): no gene-length term appears here, so this is a per-million
# scaling of each column rather than FPKM in the strict sense - confirm.
# NOTE(review): a column that sums to zero causes a division by zero here.
if args.normalise:
    data = np.log2(data.astype(float) / (np.sum(data, 0) / 1e6) + 1)
    output_fmt = "%f"

# 2. SPLIT TOY DATA IN TARGET AND SOURCE DATA
source_clusters = ast.literal_eval(args.source_clusters) except SyntaxError: sys.stderr.write("Error: Invalid source cluster specification.") sys.exit() # 1. GENERATE TOY DATA print('\nGenerate artificial single-cell RNA-seq data.') data, labels = generate_toy_data( num_genes = args.num_genes, num_cells = args.num_cells, cluster_spec = cluster_spec, dirichlet_parameter_cluster_size = args.dir_cluster_size, gamma_shape = args.gamma_shape, gamma_rate = args.gamma_rate, nb_dispersion = args.nb_dispersion, min_prop_genes_de = args.min_prop_genes_de, max_prop_genes_de = args.max_prop_genes_de, mean_de_logfc = args.mean_de_logfc, sd_de_logfc = args.sd_de_logfc ) print 'Data dimension: ', data.shape output_fmt = "%u" #Perform FPKM and log2 normalisation if required if args.normalise: data = np.log2(data.astype(float) / (np.sum(data, 0) / 1e6) + 1) output_fmt = "%f"
def experiment_loop(fname, methods, acc_funcs, n_src=800, n_trg=800, n_genes=1000, mode=2, reps=10,
                    cluster_mode=False, cluster_spec=None, percs=None):
    """Run repeated experiments on generated toy data and save the scores.

    For each repetition, toy data is generated and split into source and
    target sets. For every entry of ``percs`` a target subsample is drawn,
    every method is run on it and every accuracy function is evaluated on the
    method's output. All results are written to ``fname`` with ``np.savez``.

    Args:
        fname: Output file passed to ``np.savez``.
        methods: Callables invoked as ``method(src, src_labels, trg,
            trg_labels, n_src_cluster=..., n_trg_cluster=...)`` and returning
            ``(desc, lbls, reject)``.
        acc_funcs: Callables invoked as ``acc_func(trg, trg_labels, lbls,
            reject, strat_lbl_inds)`` and returning ``(score, desc)``.
        n_src: Source cell count passed to ``split_source_target``.
        n_trg: Target cell count passed to ``split_source_target``.
        n_genes: Gene count passed to ``generate_toy_data``.
        mode: Splitting mode passed to ``split_source_target``.
        reps: Number of repetitions.
        cluster_mode: If true, every entry of ``percs`` is handed to the
            methods as ``n_trg_cluster`` and the first ``n_trg`` permuted
            target columns are used. Otherwise every entry is a fraction of
            ``n_trg`` to subsample.
        cluster_spec: Nested list describing the clusters. Defaults to
            ``[1, 2, 3, [4, 5], [6, [7, 8]]]``.
        percs: Subsampling fractions (or cluster counts, see
            ``cluster_mode``). Defaults to ``[0.1, 0.4, 0.8]``.

    Returns:
        None. Results are written to ``fname``.
    """
    # Build fresh lists per call instead of sharing mutable default arguments.
    if cluster_spec is None:
        cluster_spec = [1, 2, 3, [4, 5], [6, [7, 8]]]
    if percs is None:
        percs = [0.1, 0.4, 0.8]

    def _flatten(spec):
        """Return the leaves of a nested list; empty lists contribute nothing."""
        if not isinstance(spec, list):
            return [spec]
        leaves = []
        for item in spec:
            leaves.extend(_flatten(item))
        return leaves

    n_cluster = len(_flatten(cluster_spec))
    # Single pre-formatted argument: identical output on Python 2 and 3.
    print('Number of cluster is  %s' % n_cluster)

    accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
    accs_desc = [''] * len(acc_funcs)
    # NOTE(review): num_strat is allocated and saved below but never written
    # to in this function, so it is stored as all zeros - confirm intent.
    num_strat = np.zeros((reps, len(percs), len(methods)))
    res_desc = []
    for r in range(reps):
        # 1. Generate scRNA data
        data, labels = generate_toy_data(num_genes=n_genes,
                                         num_cells=2. * (n_trg + n_src),  # generate more data
                                         cluster_spec=cluster_spec)
        # 2. Split source and target according to specified mode/setting
        src, trg, src_labels, trg_labels = split_source_target(data, labels,
                                                               target_ncells=n_trg, source_ncells=n_src,
                                                               mode=mode, source_clusters=None,
                                                               noise_target=False, noise_sd=0.1)
        # Builtin int replaces the np.int alias, which newer NumPy releases
        # no longer provide.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)
        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)
        # 3.b. Use perfect number of latent states for nmf and sc3
        src_lbl_set = np.unique(src_labels)
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size
        # 3.c. Target data subsampling loop
        for i, perc in enumerate(percs):
            if cluster_mode:
                # The entry is a cluster count; use the first n_trg permuted columns.
                n_trg_cluster = perc
                n_keep = n_trg
            else:
                n_keep = int(n_trg * perc)
            keep = inds[:n_keep]
            p_trg = trg[:, keep].copy()
            p_trg_labels = trg_labels[keep].copy()

            # Positions of subsampled target labels that also occur in the
            # source. This depends only on the subsample, so it is computed
            # once here rather than once per method.
            strat_lbl_inds = [n for n, lbl in enumerate(p_trg_labels) if lbl in src_lbl_set]

            # 4. MTL/DA mixing parameter loop
            res_desc = []
            for m, method in enumerate(methods):
                desc, lbls, reject = method(src.copy(), src_labels.copy(),
                                            p_trg.copy(), p_trg_labels.copy(),
                                            n_src_cluster=n_src_cluster, n_trg_cluster=n_trg_cluster)
                res_desc.append(desc)
                # evaluation
                for f, acc_func in enumerate(acc_funcs):
                    score, acc_desc = acc_func(p_trg.copy(), p_trg_labels.copy(), lbls.copy(),
                                               reject, strat_lbl_inds)
                    accs[f, r, i, m] = score
                    accs_desc[f] = acc_desc

    # save the results
    np.savez(fname, methods=methods, acc_funcs=acc_funcs, accs=accs, accs_desc=accs_desc,
             percs=percs, reps=reps, n_genes=n_genes, n_src=n_src, n_trg=n_trg,
             desc=res_desc, mode=mode, num_strat=num_strat, cluster_mode=cluster_mode)
    print('Done.')
n_cluster = len(flatten(cluster_spec)) #print 'Number of cluster is ', n_cluster accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods))) #accs_mixed = np.zeros((len(acc_funcs), reps, len(percs), len(methods))) accs_desc = list() opt_mix_ind = np.zeros((reps, len(percs))) opt_mix_aris = np.zeros((reps, len(percs))) num_strat = np.zeros((reps, len(percs), len(methods))) res_desc = [] r = 0 while r < reps: # 1. Generate scRNA data data, labels = generate_toy_data(num_genes=genes[g], num_cells=10. * (n_trg + n_src[s]), cluster_spec=cluster_spec) # 2. Split source and target according to specified mode/setting src, trg, src_labels, trg_labels = split_source_target( data, labels, target_ncells=n_trg, source_ncells=n_src[s], mode=7, source_clusters=None, noise_target=False, noise_sd=0.1, common=common[c], cluster_spec=cluster_spec) trg_labels = np.array(trg_labels, dtype=np.int) src_labels = np.array(src_labels, dtype=np.int)