Beispiel #1
0
# Parse the optional source-cluster specification. It is only evaluated when
# splitting mode 6 is selected; otherwise source_clusters stays None.
source_clusters = None
if args.splitting_mode == 6:
    try:
        # literal_eval only accepts Python literals, so arbitrary code in the
        # argument is rejected rather than executed.
        source_clusters = ast.literal_eval(args.source_clusters)
    except (SyntaxError, ValueError):
        # literal_eval raises ValueError (not only SyntaxError) for input that
        # parses but is not a plain literal. Passing the message to sys.exit
        # writes it to stderr with a newline and exits with a non-zero status.
        sys.exit("Error: Invalid source cluster specification.")

# 1. GENERATE TOY DATA
print('\nGenerate artificial single-cell RNA-seq data.')
# All simulation settings are forwarded unchanged from the parsed command-line
# arguments; cluster_spec is prepared earlier in the script.
data, labels = generate_toy_data(
    num_genes=args.num_genes,
    num_cells=args.num_cells,
    cluster_spec=cluster_spec,
    dirichlet_parameter_cluster_size=args.dir_cluster_size,
    gamma_shape=args.gamma_shape,
    gamma_rate=args.gamma_rate,
    nb_dispersion=args.nb_dispersion,
    min_prop_genes_de=args.min_prop_genes_de,
    max_prop_genes_de=args.max_prop_genes_de,
    mean_de_logfc=args.mean_de_logfc,
    sd_de_logfc=args.sd_de_logfc,
)
# Single-argument call form: prints the same text as the former Python 2
# statement (label, two spaces, shape) and is also valid on Python 3.
print('Data dimension:  %s' % (data.shape,))

# Default output format string: unsigned integer.
output_fmt = "%u"

# Optional normalisation: divide by the axis-0 totals expressed in millions,
# add a pseudocount of 1 and take log2.
# NOTE(review): the original comment calls this FPKM, but no gene-length term
# appears in this computation - confirm the intended name.
if args.normalise:
    totals_in_millions = np.sum(data, 0) / 1e6
    data = np.log2(data.astype(float) / totals_in_millions + 1)
    # Normalised values are fractional, so switch to a float format.
    output_fmt = "%f"

# 2. SPLIT TOY DATA IN TARGET AND SOURCE DATA
Beispiel #2
0
        source_clusters = ast.literal_eval(args.source_clusters)
except SyntaxError:
    sys.stderr.write("Error: Invalid source cluster specification.")
    sys.exit()


# 1. GENERATE TOY DATA
print('\nGenerate artificial single-cell RNA-seq data.')
# Keyword arguments are taken one-to-one from the parsed command-line options;
# cluster_spec is defined earlier in the script.
data, labels = generate_toy_data(
    num_genes=args.num_genes,
    num_cells=args.num_cells,
    cluster_spec=cluster_spec,
    dirichlet_parameter_cluster_size=args.dir_cluster_size,
    gamma_shape=args.gamma_shape,
    gamma_rate=args.gamma_rate,
    nb_dispersion=args.nb_dispersion,
    min_prop_genes_de=args.min_prop_genes_de,
    max_prop_genes_de=args.max_prop_genes_de,
    mean_de_logfc=args.mean_de_logfc,
    sd_de_logfc=args.sd_de_logfc,
)
# Formatting the line explicitly keeps the output identical to the old
# Python 2 print statement while remaining valid syntax on Python 3.
print('Data dimension:  %s' % (data.shape,))

# Unsigned-integer format string is the default.
output_fmt = "%u"

# When requested, scale by the axis-0 sums (per million), then apply
# log2(x + 1).
# NOTE(review): labelled FPKM in the original comment; no gene-length
# normalisation is visible here - confirm.
if args.normalise:
    scale = np.sum(data, 0) / 1e6
    scaled = data.astype(float) / scale
    data = np.log2(scaled + 1)
    # Float format string for the now non-integer values.
    output_fmt = "%f"
Beispiel #3
0
def experiment_loop(fname, methods, acc_funcs, n_src=800, n_trg=800, n_genes=1000, mode=2, reps=10,
                    cluster_mode=False, cluster_spec=None, percs=None):
    """Run repeated toy-data experiments and save all accuracy scores.

    For every repetition new toy data is generated and split into a source and
    a target set. For every entry of ``percs`` a target subset is drawn, every
    method is applied to it and every accuracy function scores the result.
    Everything is finally stored with ``np.savez``.

    Args:
        fname: Destination passed to ``np.savez``.
        methods: Callables invoked as ``method(src, src_labels, trg,
            trg_labels, n_src_cluster=..., n_trg_cluster=...)`` that return
            ``(description, predicted_labels, reject)``.
        acc_funcs: Callables invoked as ``acc(trg, trg_labels,
            predicted_labels, reject, strat_lbl_inds)`` that return
            ``(score, description)``.
        n_src: Number of source cells requested from ``split_source_target``.
        n_trg: Number of target cells requested from ``split_source_target``.
        n_genes: Number of genes requested from ``generate_toy_data``.
        mode: Splitting mode forwarded to ``split_source_target``.
        reps: Number of repetitions.
        cluster_mode: If true, each entry of ``percs`` is passed to the
            methods as ``n_trg_cluster`` and the first ``n_trg`` permuted
            target cells are used. Otherwise each entry is a fraction of
            ``n_trg`` that determines the subset size.
        cluster_spec: Nested list describing the clusters; defaults to
            ``[1, 2, 3, [4, 5], [6, [7, 8]]]``.
        percs: Values iterated in the subsampling loop; defaults to
            ``[0.1, 0.4, 0.8]``.
    """
    def flatten(spec):
        # Collect the non-list leaves of an arbitrarily nested list, in order.
        # Empty (sub)lists contribute nothing instead of raising IndexError.
        if not isinstance(spec, list):
            return [spec]
        leaves = []
        for item in spec:
            leaves.extend(flatten(item))
        return leaves

    # Fresh lists per call instead of shared mutable default arguments.
    if cluster_spec is None:
        cluster_spec = [1, 2, 3, [4, 5], [6, [7, 8]]]
    if percs is None:
        percs = [0.1, 0.4, 0.8]

    n_cluster = len(flatten(cluster_spec))
    # Same text as the former Python 2 print statement, valid on Python 3 too.
    print('Number of cluster is  %s' % n_cluster)

    accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
    accs_desc = [''] * len(acc_funcs)

    # num_strat is allocated and saved but never written to in this function.
    num_strat = np.zeros((reps, len(percs), len(methods)))
    res_desc = []
    for r in range(reps):
        # 1. Generate scRNA data
        data, labels = generate_toy_data(num_genes=n_genes,
                                         num_cells=2. * (n_trg + n_src),  # generate more data
                                         cluster_spec=cluster_spec)
        # 2. Split source and target according to specified mode/setting
        src, trg, src_labels, trg_labels = split_source_target(data, labels,
                                                               target_ncells=n_trg, source_ncells=n_src,
                                                               mode=mode, source_clusters=None,
                                                               noise_target=False, noise_sd=0.1)
        # The np.int alias was removed in NumPy 1.24; builtin int is the same dtype.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)
        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)
        # 3.b. Use perfect number of latent states for nmf and sc3
        src_lbl_set = np.unique(src_labels)
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size
        # 3.c. Target data subsampling loop
        for i, perc in enumerate(percs):
            if cluster_mode:
                n_trg_cluster = perc
                n_keep = n_trg
            else:
                n_keep = int(n_trg * perc)
            p_trg = trg[:, inds[:n_keep]].copy()
            p_trg_labels = trg_labels[inds[:n_keep]].copy()

            # Positions of target cells whose label also occurs in the source.
            # This depends only on the subset, so it is computed once per
            # subset rather than once per method.
            strat_lbl_inds = [n for n, lbl in enumerate(p_trg_labels) if lbl in src_lbl_set]

            # 4. MTL/DA mixing parameter loop
            res_desc = []
            for m, method in enumerate(methods):
                desc, lbls, reject = method(src.copy(), src_labels.copy(),
                                            p_trg.copy(), p_trg_labels.copy(),
                                            n_src_cluster=n_src_cluster, n_trg_cluster=n_trg_cluster)
                res_desc.append(desc)

                # evaluation
                for f, acc_func in enumerate(acc_funcs):
                    accs[f, r, i, m], accs_desc[f] = acc_func(p_trg.copy(), p_trg_labels.copy(),
                                                              lbls.copy(), reject, strat_lbl_inds)

    # save the result and then plot
    np.savez(fname, methods=methods, acc_funcs=acc_funcs, accs=accs, accs_desc=accs_desc,
             percs=percs, reps=reps, n_genes=n_genes, n_src=n_src, n_trg=n_trg,
             desc=res_desc, mode=mode, num_strat=num_strat, cluster_mode=cluster_mode)
    print('Done.')
Beispiel #4
0
            n_cluster = len(flatten(cluster_spec))
            #print 'Number of cluster is ', n_cluster

            accs = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
            #accs_mixed = np.zeros((len(acc_funcs), reps, len(percs), len(methods)))
            accs_desc = list()
            opt_mix_ind = np.zeros((reps, len(percs)))
            opt_mix_aris = np.zeros((reps, len(percs)))

            num_strat = np.zeros((reps, len(percs), len(methods)))
            res_desc = []
            r = 0
            while r < reps:
                # 1. Generate scRNA data
                data, labels = generate_toy_data(num_genes=genes[g],
                                                 num_cells=10. *
                                                 (n_trg + n_src[s]),
                                                 cluster_spec=cluster_spec)
                # 2. Split source and target according to specified mode/setting
                src, trg, src_labels, trg_labels = split_source_target(
                    data,
                    labels,
                    target_ncells=n_trg,
                    source_ncells=n_src[s],
                    mode=7,
                    source_clusters=None,
                    noise_target=False,
                    noise_sd=0.1,
                    common=common[c],
                    cluster_spec=cluster_spec)
                trg_labels = np.array(trg_labels, dtype=np.int)
                src_labels = np.array(src_labels, dtype=np.int)