Example #1
accs_names = ['Calinski-Harabaz', 'Silhouette (euc)', 'Silhouette (corr)', 'Silhouette (jacc)', 'ARI']
accs = np.zeros((5, len(mixtures), len(num_cluster)))

for i in range(len(num_cluster)):
    for j in range(len(mixtures)):
        print('Iteration k={0} mix={1}'.format(num_cluster[i], mixtures[j]))
        trg_k = num_cluster[i]
        mix = mixtures[j]

        # --------------------------------------------------
        # 3.1. SETUP SOURCE DATA NMF CLUSTERING
        # --------------------------------------------------
        src_clustering = None
        if src_data is not None:
            src_clustering = NmfClustering(src_data, src_gene_ids, num_cluster=arguments.src_k)
            src_clustering.add_cell_filter(src_cell_filter_fun)
            src_clustering.add_gene_filter(src_gene_filter_fun)
            src_clustering.set_data_transformation(src_data_transf_fun)

        # --------------------------------------------------
        # 3.2. SETUP TARGET DATA CLUSTERING
        # --------------------------------------------------
        if arguments.method == 'NMF' and src_data is not None:
            print('Transfer learning method is NMF.')
            trg_clustering = DaNmfClustering(src_clustering, trg_data, trg_gene_ids, num_cluster=trg_k)
            trg_clustering.add_cell_filter(trg_cell_filter_fun)
            trg_clustering.add_gene_filter(trg_gene_filter_fun)
            trg_clustering.set_data_transformation(trg_data_transf_fun)
            trg_clustering.apply(mix=mix, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1,
                                 max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
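The snippet ends right after apply(). A minimal sketch of the evaluation step that could follow inside this branch is shown below (standalone, without the surrounding indentation). It assumes ground-truth target labels trg_labels, which are not part of this snippet, and that DaNmfClustering exposes remain_cell_inds the way NmfClustering does in Example #9; the ARI call mirrors Examples #4 and #9.

# Hedged sketch, not part of the original example: fill the ARI row of `accs`.
from sklearn import metrics  # normally imported at the top of the script

accs[4, j, i] = metrics.adjusted_rand_score(
    trg_labels[trg_clustering.remain_cell_inds],  # trg_labels is an assumed ground truth
    trg_clustering.cluster_labels)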
Example #2
    data_src = data_transformation_log2(data_src)

    # Load Target data
    data_trg = np.loadtxt(path_trg)
    gene_ids_trg = np.loadtxt(path_geneids_trg, dtype=str)
    # Delete non-unique genes
    data_trg, gene_ids_trg = delete_nonunique_genes(data_trg, gene_ids_trg)
    # Apply cell filter
    valid_cells = cell_filter(data_trg)
    # Apply gene filter
    valid_genes = gene_filter(data_trg)

    # Create filtered data
    data_trg = data_trg[:, valid_cells]
    data_trg = data_trg[valid_genes, :]
    gene_ids_trg = gene_ids_trg[valid_genes]
    # Log transform data
    data_trg = data_transformation_log2(data_trg)

    # train source and test performance
    source_nmf = NmfClustering(data_src, gene_ids_src, num_cluster=n_source_cluster)
    source_nmf.apply(k=n_source_cluster, max_iter=100, rel_err=1e-3)

    # Number of repetitions can be changed in line 153 of utils.py
    target_nmf = DaNmfClustering(source_nmf, data_trg.copy(), gene_ids_trg, num_cluster=n_target_cluster)
    target_nmf.apply(k=n_target_cluster, calc_transferability=True)
    # target_nmf.transferability_pvalue

    # np.savez(fname, source_ari=source_ari, target_ari=target_ari, n_mix=n_mix, n_source=n_source, n_target=n_target, n_source_cluster=n_source_cluster,
    # n_target_cluster=n_target_cluster)
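The helpers cell_filter, gene_filter and data_transformation_log2 used in this example are not defined in the snippet (delete_nonunique_genes is likewise a project helper that is not sketched here). A plausible way to set them up, mirroring the functools.partial pattern in Examples #5 and #7, is sketched below; the module path and the parameter values are assumptions, not the original settings.

# Hedged sketch: bind the filter helpers as done in Examples #5 and #7.
from functools import partial
import scRNA.sc3_clustering_impl as sc  # module path assumed; the examples only show the alias `sc`

cell_filter = partial(sc.cell_filter, num_expr_genes=2000, non_zero_threshold=2)
gene_filter = partial(sc.gene_filter, perc_consensus_genes=0.94, non_zero_threshold=2)
data_transformation_log2 = sc.data_transformation_log2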
Example #3
# --------------------------------------------------
num_cluster = list(map(int, arguments.cluster_range.split(",")))

accs_names = ['KTA (linear)',  'ARI']
accs = np.zeros((2, len(num_cluster)))

for i in range(len(num_cluster)):
    k = num_cluster[i]
    print('Iteration {0}, num-cluster={1}'.format(i, k))
    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    if labels is None:
        # No source labels are provided, generate them via NMF clustering
        nmf_labels = None
        nmf_labels = NmfClustering(data, gene_ids, num_cluster=k, labels=[])
        nmf_labels.add_cell_filter(cell_filter_fun)
        nmf_labels.add_gene_filter(gene_filter_fun)
        nmf_labels.set_data_transformation(data_transf_fun)
        nmf_labels.apply(k=k, alpha=arguments.nmf_alpha, l1=arguments.nmf_l1, max_iter=arguments.nmf_max_iter, rel_err=arguments.nmf_rel_err)
        labels = nmf_labels.cluster_labels

    # Use perfect number of latent states for nmf and sc3
    src_labels = np.array(labels, dtype=int)
    src_lbl_set = np.unique(src_labels)
    k_now = src_lbl_set.size

    nmf = None
    nmf = NmfClustering_initW(data, gene_ids, labels=labels, num_cluster=k_now)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
Example #4
    print(np.unique(data.cluster_labels))

    source_ari = np.zeros(reps)
    target_ari = np.zeros((7, reps, len(n_target), len(n_mix)))
    n = 0
    while n < reps:
        source_perc = n_source / float(data.cluster_labels.size)
        sss = StratifiedShuffleSplit(n_splits=2, test_size=source_perc)
        for split_1, split_2 in sss.split(data.pp_data.T, data.cluster_labels):
            print(split_1.size, split_2.size)

        source_data = data.pp_data[:, split_2]
        source_labels = data.cluster_labels[split_2]

        # train source and test performance
        source_nmf = NmfClustering(source_data, gene_ids, num_cluster=n_source_cluster)
        source_nmf.apply(k=n_source_cluster, max_iter=4000, rel_err=1e-3)
        source_ari[n] = metrics.adjusted_rand_score(source_labels, source_nmf.cluster_labels)
        print('ITER(', n, '): SOURCE ARI = ', source_ari[n])
        if source_ari[n] < 0.94:
            continue

        for i in range(len(n_target)):
            target_perc = n_target[i] / float(split_1.size)
            ttt = StratifiedShuffleSplit(n_splits=2, test_size=target_perc)
            for split_11, split_22 in ttt.split(data.pp_data[:, split_1].T, data.cluster_labels[split_1]):
                print(split_11.size, split_22.size)

            # shuffle the gene ids for testing
            perm_inds = np.random.permutation(data.pp_data.shape[0])
            target_gene_ids = gene_ids[perm_inds].copy()
Example #5
    print(data.shape)
else:
    # Cell and gene filter and transformation within the procedure
    cell_filter_fun = partial(sc.cell_filter,
                              num_expr_genes=min_expr_genes,
                              non_zero_threshold=non_zero_threshold)
    gene_filter_fun = partial(sc.gene_filter,
                              perc_consensus_genes=perc_consensus_genes,
                              non_zero_threshold=non_zero_threshold)
    data_transf_fun = sc.data_transformation_log2

# Generating labels from complete dataset
print "Train complete data"
complete_nmf = None
complete_nmf = NmfClustering(data,
                             np.arange(data.shape[0]),
                             num_cluster=num_cluster)
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(k=num_cluster,
                   alpha=nmf_alpha,
                   l1=nmf_l1,
                   max_iter=nmf_max_iter,
                   rel_err=nmf_rel_err)
# Get labels
desc, target_nmf, trg_lbls_pred, mixed_data = method_sc3_filter(
    complete_nmf,
    data, [],
    cell_filter=cell_filter_fun,
    gene_filter=gene_filter_fun,
Example #6
def method_nmf(src, src_labels, trg, trg_labels, n_src_cluster, n_trg_cluster):
    # Baseline: cluster only the target data with plain NMF; the source data,
    # the label arguments and n_src_cluster are not used.
    ids = np.arange(trg.shape[0])
    cp = NmfClustering(trg, ids, num_cluster=n_trg_cluster)
    cp.apply()
    return 'NMF', cp.cluster_labels, None
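A small usage sketch for method_nmf follows; the synthetic count matrices and cluster numbers are illustrative assumptions only (the source arguments are ignored by the function, so None is passed for the labels).

# Hedged usage sketch for method_nmf on synthetic data.
import numpy as np

np.random.seed(0)
trg = np.random.poisson(2.0, size=(200, 60)).astype(np.float64)  # genes x cells
src = np.random.poisson(2.0, size=(200, 80)).astype(np.float64)  # unused by method_nmf

desc, trg_pred, _ = method_nmf(src, None, trg, None, n_src_cluster=3, n_trg_cluster=3)
print(desc, np.unique(trg_pred))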
Example #7
else:
    raise Warning("Within-Filtering is not implemented for R SC3")
    # Cell and gene filter and transformation within the procedure
    cell_filter_fun = partial(sc.cell_filter,
                              num_expr_genes=min_expr_genes,
                              non_zero_threshold=non_zero_threshold)
    gene_filter_fun = partial(sc.gene_filter,
                              perc_consensus_genes=perc_consensus_genes,
                              non_zero_threshold=non_zero_threshold)
    data_transf_fun = sc.data_transformation_log2

# Generating labels from complete dataset
print("Train complete data")
complete_nmf = None
complete_nmf = NmfClustering(data,
                             np.arange(data.shape[0]),
                             num_cluster=num_cluster,
                             labels=[])
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(k=num_cluster,
                   alpha=nmf_alpha,
                   l1=nmf_l1,
                   max_iter=nmf_max_iter,
                   rel_err=nmf_rel_err)

# Get labels
labels = complete_nmf.cluster_labels
label_names, label_counts = np.unique(labels, return_counts=True)
print("Labels: ", label_names)
print("Counts: ", label_counts)
Example #8
accs_names = [
    'KTA (linear)', 'Silhouette (euc)', 'Silhouette (pearson)',
    'Silhouette (spearman)', 'ARI'
]
accs = np.zeros((5, len(num_cluster)))

for i in range(len(num_cluster)):
    k = num_cluster[i]
    print('Iteration {0}, num-cluster={1}'.format(i, k))

    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    nmf = None
    nmf = NmfClustering(data, gene_ids, num_cluster=k)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
    nmf.set_data_transformation(data_transf_fun)
    nmf.apply(k=k,
              alpha=arguments.nmf_alpha,
              l1=arguments.nmf_l1,
              max_iter=arguments.nmf_max_iter,
              rel_err=arguments.nmf_rel_err)

    # --------------------------------------------------
    # 3.2. EVALUATE CLUSTER ASSIGNMENT
    # --------------------------------------------------
    print('\nUnsupervised evaluation:')
    accs[0, i] = unsupervised_acc_kta(nmf.pp_data,
                                      nmf.cluster_labels,
Example #9
        else:
            src, trg, src_labels, trg_labels = split_source_target(
                data,
                labels,
                mode=1,
                target_ncells=n_trg,
                source_ncells=n_src[s])

        src_labels = np.array(src_labels, dtype=int)
        #src_labels_SC3 = np.array(src_labels_SC3, dtype=np.int)

        # 3.c. train source once per repetition
        print "Train source data of rep {0}".format(r + 1)
        source_nmf = None
        source_nmf = NmfClustering(src,
                                   np.arange(src.shape[0]),
                                   num_cluster=num_cluster)
        source_nmf.add_cell_filter(cell_filter_fun)
        source_nmf.add_gene_filter(gene_filter_fun)
        source_nmf.set_data_transformation(data_transf_fun)
        source_nmf.apply(k=num_cluster,
                         alpha=nmf_alpha,
                         l1=nmf_l1,
                         max_iter=nmf_max_iter,
                         rel_err=nmf_rel_err)

        # Calculate ARIs and KTAs
        source_aris[s, r] = metrics.adjusted_rand_score(
            src_labels[source_nmf.remain_cell_inds], source_nmf.cluster_labels)

        print('SOURCE ARI Labels NMF, Method NMF = ', source_aris[s, r])
Example #10
     source_clusters=None,
     noise_target=False,
     noise_sd=0.1,
     common=common[c],
     cluster_spec=cluster_spec)
 trg_labels = np.array(trg_labels, dtype=int)
 src_labels = np.array(src_labels, dtype=int)
 # 3.a. Subsampling order for target
 inds = np.random.permutation(trg_labels.size)
 # 3.b. Use perfect number of latent states for nmf and sc3
 src_lbl_set = np.unique(src_labels)
 n_trg_cluster = np.unique(trg_labels).size
 n_src_cluster = src_lbl_set.size
 # 3.c. train source once per repetition
 source_nmf = NmfClustering(src,
                            np.arange(src.shape[0]),
                            num_cluster=n_src_cluster)
 source_nmf.apply(k=n_src_cluster, max_iter=4000, rel_err=1e-3)
 source_aris[s, g, c, r] = metrics.adjusted_rand_score(
     src_labels, source_nmf.cluster_labels)
 print('ITER(', r, '): SOURCE ARI = ', source_aris[s, g, c, r])
 if source_aris[s, g, c, r] < 0.94:
     continue
 # 3.d. Target data subsampling loop
 plot_cnt = 1
 for i in range(len(percs)):
     n_trg_perc = int(n_trg * percs[i])
     p_trg = trg[:, inds[:n_trg_perc]].copy()
     p_trg_labels = trg_labels[inds[:n_trg_perc]].copy()
     # 4. MTL/DA mixing parameter loop
     res_desc = list()
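The snippet stops just before the mixing loop announced by the last comment. A minimal sketch of what such a loop might contain, modelled on how the mix parameter is used in Example #1, is given below (shown without the surrounding loop indentation; the mix values and the use of res_desc are assumptions).

# Hedged sketch of an MTL/DA mixing loop; not the original code.
mixes = [0.0, 0.25, 0.5, 0.75, 1.0]  # illustrative mixing values
for m in range(len(mixes)):
    da_nmf = DaNmfClustering(source_nmf, p_trg.copy(), np.arange(p_trg.shape[0]),
                             num_cluster=n_trg_cluster)
    da_nmf.apply(mix=mixes[m])
    ari = metrics.adjusted_rand_score(p_trg_labels, da_nmf.cluster_labels)
    res_desc.append('DA-NMF mix={0}: ARI={1:.2f}'.format(mixes[m], ari))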