Beispiel #1
0
exp_counter = 1
# One experiment per (source size, repetition, target percentage, method).
num_exps = len(n_src) * reps * len(percs) * len(methods)

# Run jobs
for s in range(len(n_src)):
    # Result buffers are re-created for every source size n_src[s].
    accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
    accs_desc = list()
    opt_mix_ind = np.zeros((reps, len(percs)))
    opt_mix_aris = np.zeros((reps, len(percs)))

    num_strat = np.zeros((reps, len(percs), len(methods)))
    res_desc = []
    r = 0
    while r < reps:
        # Split data in source and target randomly (mode = 1) or randomly stratified (mode = 2)
        src, trg, src_labels, trg_labels = split_source_target(
            data, labels, mode=1, target_ncells=n_trg, source_ncells=n_src[s])

        # Builtin `int` instead of `np.int`: the alias was removed in
        # NumPy 1.24 and was always the same type as the builtin.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)

        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)

        # 3.b. Use perfect number of latent states for nmf and sc3
        src_lbl_set = np.unique(src_labels)
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size

        # 3.c. train source once per repetition
        # Single-argument print call: same output on Python 2 and Python 3.
        print("Train source data of rep {0}".format(r + 1))
        source_nmf = None
Beispiel #2
0
# Fall back to the single command-line value when no range was given.
if len(target_ncells_range) == 0:
    target_ncells_range = [args.target_ncells]

# One split per (source size, target size) combination.
for sidx, source_ncells in enumerate(source_ncells_range):
    for tidx, target_ncells in enumerate(target_ncells_range):

        print(
            '\nSplit artificial single-cell RNA-seq data in target and source data.'
        )
        data_source, data_target, true_labels_source, true_labels_target = \
            split_source_target(
                data,
                labels,
                target_ncells=target_ncells,
                source_ncells=source_ncells,
                source_clusters=source_clusters,
                noise_target=args.noise_target,
                noise_sd=args.noise_sd,
                mode=args.splitting_mode,
                common=args.common
            )
        # Single-argument print calls reproduce the Python 2 statement output
        # (including the double space) and also run on Python 3.
        print('Target data dimension:  {0}'.format(data_target.shape))
        print('Source data dimension:  {0}'.format(data_source.shape))

        # 3. GENERATE GENE AND CELL NAMES
        gene_ids = np.arange(args.num_genes)

        # 4. SAVE RESULTS
        print('Saving target data to \'{0}\'.'.format(args.fout_target_data))
        np.savetxt(os.path.splitext(args.fout_target_data)[0] + "_T" +
                   str(tidx + 1) + "_" + str(target_ncells) + "_S" +
Beispiel #3
0
# Fall back to the single command-line values when no ranges were given.
if len(source_ncells_range) == 0:
    source_ncells_range = [args.source_ncells]

if len(target_ncells_range) == 0:
    target_ncells_range = [args.target_ncells]

# One split per (source size, target size) combination.
for sidx, source_ncells in enumerate(source_ncells_range):
    for tidx, target_ncells in enumerate(target_ncells_range):

        print('\nSplit artificial single-cell RNA-seq data in target and source data.')
        data_source, data_target, true_labels_source, true_labels_target = \
            split_source_target(
                data,
                labels,
                target_ncells=target_ncells,
                source_ncells=source_ncells,
                source_clusters=source_clusters,
                noise_target=args.noise_target,
                noise_sd=args.noise_sd,
                mode=args.splitting_mode
            )
        # Single-argument print calls reproduce the Python 2 statement output
        # (including the double space) and also run on Python 3.
        print('Target data dimension:  {0}'.format(data_target.shape))
        print('Source data dimension:  {0}'.format(data_source.shape))

        # 3. GENERATE GENE AND CELL NAMES
        gene_ids = np.arange(args.num_genes)

        # 4. SAVE RESULTS
        print('Saving target data to \'{0}\'.'.format(args.fout_target_data))
        np.savetxt(
            os.path.splitext(args.fout_target_data)[0] + 
            "_T" + str(tidx+1) + "_" + str(target_ncells) + 
Beispiel #4
0
def experiment_loop(fname, methods, acc_funcs, n_src=800, n_trg=800, n_genes=1000, mode=2, reps=10,
                    cluster_mode=False, cluster_spec=[1, 2, 3, [4, 5], [6, [7, 8]]], percs=[0.1, 0.4, 0.8]):
    """Run repeated source/target clustering experiments and save all scores.

    For each repetition, toy data is generated and split into source and
    target. For every entry of ``percs`` a target subsample is drawn, each
    method is run on it, and each accuracy function scores the result.
    Everything is written to ``fname`` with ``np.savez``.

    Args:
        fname: Output path handed to ``np.savez``.
        methods: Callables invoked as ``method(src, src_labels, trg,
            trg_labels, n_src_cluster=..., n_trg_cluster=...)`` and returning
            ``(desc, lbls, reject)``.
        acc_funcs: Callables invoked as ``f(trg, trg_labels, lbls, reject,
            strat_lbl_inds)`` and returning ``(score, desc)``.
        n_src: Number of source cells requested from the split.
        n_trg: Number of target cells requested from the split.
        n_genes: Number of genes passed to the data generator.
        mode: Splitting mode forwarded to ``split_source_target``.
        reps: Number of repetitions.
        cluster_mode: If true, every entry of ``percs`` is used as
            ``n_trg_cluster`` and the first ``n_trg`` permuted target cells
            are used. Otherwise every entry is the fraction of ``n_trg``
            cells to subsample.
        cluster_spec: Nested list of cluster ids forwarded to the generator.
            The default is a shared list object; this function never
            mutates it.
        percs: Fractions (or cluster counts, see ``cluster_mode``) to loop
            over. The default is a shared list object; this function never
            mutates it.

    Returns:
        None. Results are stored on disk only.
    """
    def count_leaves(spec):
        # Counts non-list leaves of the nested spec; an empty list counts as 0
        # (the former recursive lambda raised IndexError on empty lists).
        if isinstance(spec, list):
            return sum(count_leaves(item) for item in spec)
        return 1

    n_cluster = count_leaves(cluster_spec)
    # Single-argument print call: same output as the former Python 2 print
    # statement (including the double space) and valid on Python 3.
    print('Number of cluster is  {0}'.format(n_cluster))

    accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
    accs_desc = [''] * len(acc_funcs)

    # NOTE(review): num_strat is saved below but never written in this
    # function, so it is stored as all zeros - confirm whether it was meant
    # to hold len(strat_lbl_inds).
    num_strat = np.zeros((reps, len(percs), len(methods)))
    res_desc = []
    for r in range(reps):
        # 1. Generate scRNA data
        data, labels = generate_toy_data(num_genes=n_genes,
                                         num_cells=2.*(n_trg + n_src),  # generate more data
                                         cluster_spec=cluster_spec)
        # 2. Split source and target according to specified mode/setting
        src, trg, src_labels, trg_labels = split_source_target(data, labels,
                                                               target_ncells=n_trg, source_ncells=n_src,
                                                               mode=mode, source_clusters=None,
                                                               noise_target=False, noise_sd=0.1)
        # Builtin `int` instead of `np.int`: the alias was removed in
        # NumPy 1.24 and was always the same type as the builtin.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)
        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)
        # 3.b. Use perfect number of latent states for nmf and sc3
        src_lbl_set = np.unique(src_labels)
        known_src_labels = set(src_lbl_set.tolist())
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size
        # 3.c. Target data subsampling loop
        for i, perc in enumerate(percs):
            if cluster_mode:
                # The entry is a cluster count; it stays in effect for the
                # remaining iterations of this repetition until overwritten.
                n_trg_cluster = perc
                n_keep = n_trg
            else:
                n_keep = int(n_trg * perc)
            # Target cells are selected along the second axis of trg.
            p_trg = trg[:, inds[:n_keep]].copy()
            p_trg_labels = trg_labels[inds[:n_keep]].copy()

            # Positions of subsampled target cells whose label also occurs in
            # the source. Independent of the method, so computed once here.
            strat_lbl_inds = [n for n, lbl in enumerate(p_trg_labels.tolist())
                              if lbl in known_src_labels]

            # 4. MTL/DA mixing parameter loop
            # res_desc is rebuilt per subsample, so the saved value holds the
            # method descriptions of the last subsample only.
            res_desc = list()
            for m, method in enumerate(methods):
                desc, lbls, reject = method(src.copy(), src_labels.copy(),
                                            p_trg.copy(), p_trg_labels.copy(),
                                            n_src_cluster=n_src_cluster, n_trg_cluster=n_trg_cluster)
                res_desc.append(desc)

                # evaluation
                for f, acc_func in enumerate(acc_funcs):
                    accs[f, r, i, m], acc_desc = acc_func(p_trg.copy(), p_trg_labels.copy(),
                                                          lbls.copy(), reject, strat_lbl_inds)
                    accs_desc[f] = acc_desc

    # save the result and then plot
    np.savez(fname, methods=methods, acc_funcs=acc_funcs, accs=accs, accs_desc=accs_desc,
             percs=percs, reps=reps, n_genes=n_genes, n_src=n_src, n_trg=n_trg,
             desc=res_desc, mode=mode, num_strat=num_strat, cluster_mode=cluster_mode)
    print('Done.')
Beispiel #5
0
 num_strat = np.zeros((reps, len(percs), len(methods)))
 res_desc = []
 r = 0
 while r < reps:
     # 1. Generate scRNA data (ten times as many cells as source + target need)
     data, labels = generate_toy_data(num_genes=genes[g],
                                      num_cells=10. *
                                      (n_trg + n_src[s]),
                                      cluster_spec=cluster_spec)
     # 2. Split source and target according to specified mode/setting
     src, trg, src_labels, trg_labels = split_source_target(
         data,
         labels,
         target_ncells=n_trg,
         source_ncells=n_src[s],
         mode=7,
         source_clusters=None,
         noise_target=False,
         noise_sd=0.1,
         common=common[c],
         cluster_spec=cluster_spec)
     # Builtin `int` instead of `np.int`: the alias was removed in
     # NumPy 1.24 and was always the same type as the builtin.
     trg_labels = np.array(trg_labels, dtype=int)
     src_labels = np.array(src_labels, dtype=int)
     # 3.a. Subsampling order for target
     inds = np.random.permutation(trg_labels.size)
     # 3.b. Use perfect number of latent states for nmf and sc3
     src_lbl_set = np.unique(src_labels)
     n_trg_cluster = np.unique(trg_labels).size
     n_src_cluster = src_lbl_set.size
     # 3.c. train source once per repetition
     source_nmf = NmfClustering(src,