Beispiel #1
0
# --------------------------------------------------
# Parse the comma-separated cluster counts into a real list. The code below
# takes len() of it and indexes it, which a lazy Python 3 `map` object does
# not support. The builtin `int` is used because `np.int` was only an alias
# for it and has been removed from current NumPy releases.
num_cluster = list(map(int, arguments.cluster_range.split(",")))

# Result table: presumably one row per entry of `accs_names`, one column per
# tested number of clusters.
accs_names = ['KTA (linear)', 'ARI']
accs = np.zeros((len(accs_names), len(num_cluster)))

for i, k in enumerate(num_cluster):
    print('Iteration {0}, num-cluster={1}'.format(i, k))
    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    if labels is None:
        # No source labels are provided, generate them via NMF clustering.
        # NOTE(review): `labels` is assigned at the end of this branch, so
        # (assuming `cluster_labels` is not None) the branch is skipped in
        # all later iterations and the labels generated for the first k are
        # reused for every other k -- confirm that this is intended.
        nmf_labels = None
        nmf_labels = NmfClustering(data, gene_ids, num_cluster=k, labels=[])
        nmf_labels.add_cell_filter(cell_filter_fun)
        nmf_labels.add_gene_filter(gene_filter_fun)
        nmf_labels.set_data_transformation(data_transf_fun)
        nmf_labels.apply(
            k=k,
            alpha=arguments.nmf_alpha,
            l1=arguments.nmf_l1,
            max_iter=arguments.nmf_max_iter,
            rel_err=arguments.nmf_rel_err,
        )
        labels = nmf_labels.cluster_labels

    # Use perfect number of latent states for nmf and sc3: the number of
    # clusters handed to the model below is the number of distinct labels.
    # (`int` replaces the removed `np.int` alias; the resulting dtype is the
    # same.)
    src_labels = np.array(labels, dtype=int)
    src_lbl_set = np.unique(src_labels)
    k_now = src_lbl_set.size

    # Rebinding to None first presumably lets the model of the previous
    # iteration be released before the next one is built.
    nmf = None
    nmf = NmfClustering_initW(data, gene_ids, labels=labels, num_cluster=k_now)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
Beispiel #2
0
    print data.shape
else:
    # Cell and gene filter and transformation within the procedure:
    # only callables are prepared here, nothing is applied to `data` yet.
    # They are registered on the clustering object further down via
    # add_cell_filter / add_gene_filter / set_data_transformation.
    # The cell filter is sc.cell_filter with its thresholds pre-bound.
    cell_filter_fun = partial(sc.cell_filter,
                              num_expr_genes=min_expr_genes,
                              non_zero_threshold=non_zero_threshold)
    # The gene filter is sc.gene_filter with its thresholds pre-bound; it
    # shares `non_zero_threshold` with the cell filter.
    gene_filter_fun = partial(sc.gene_filter,
                              perc_consensus_genes=perc_consensus_genes,
                              non_zero_threshold=non_zero_threshold)
    # The transformation is passed through unchanged (no bound arguments).
    data_transf_fun = sc.data_transformation_log2

# Generating labels from complete dataset
# The parenthesised single-argument form prints the same text under Python 2
# and is also valid Python 3 (the bare print statement is not).
print("Train complete data")
# Rebinding to None first presumably lets a previously held model be released
# before the new one is allocated.
complete_nmf = None
# The second positional argument is a plain 0..n-1 index over the first axis
# of `data` (presumably standing in for gene ids -- confirm against the
# NmfClustering signature).
complete_nmf = NmfClustering(
    data,
    np.arange(data.shape[0]),
    num_cluster=num_cluster,
)
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(
    k=num_cluster,
    alpha=nmf_alpha,
    l1=nmf_l1,
    max_iter=nmf_max_iter,
    rel_err=nmf_rel_err,
)
# Get labels
desc, target_nmf, trg_lbls_pred, mixed_data = method_sc3_filter(
    complete_nmf,
    data, [],
    cell_filter=cell_filter_fun,
    gene_filter=gene_filter_fun,
Beispiel #3
0
    data_src = data_transformation_log2(data_src)

    # Load Target data
    data_trg = np.loadtxt(path_trg)
    # `np.str` was only an alias of the builtin `str` and no longer exists in
    # current NumPy releases, so the builtin is passed directly.
    gene_ids_trg = np.loadtxt(path_geneids_trg, dtype=str)
    # Delete non-unique genes
    data_trg, gene_ids_trg = delete_nonunique_genes(data_trg, gene_ids_trg)
    # Both filters are evaluated on the same, still unfiltered matrix before
    # either selection is applied.
    valid_cells = cell_filter(data_trg)
    valid_genes = gene_filter(data_trg)

    # Create filtered data: the cell filter result selects along the second
    # axis, the gene filter result along the first; the gene ids are reduced
    # with the same gene selection so they stay aligned with the rows.
    data_trg = data_trg[:, valid_cells]
    data_trg = data_trg[valid_genes, :]
    gene_ids_trg = gene_ids_trg[valid_genes]
    # Log transform data
    data_trg = data_transformation_log2(data_trg)

    # Fit the source model first; the target model below is built on top of it.
    source_nmf = NmfClustering(
        data_src,
        gene_ids_src,
        num_cluster=n_source_cluster,
    )
    source_nmf.apply(
        k=n_source_cluster,
        max_iter=100,
        rel_err=1e-3,
    )

    # Number of repetitions can be changed in line 153 of utils.py
    # The target model receives a copy of the target matrix, so `data_trg`
    # itself is not handed over.
    target_nmf = DaNmfClustering(
        source_nmf,
        data_trg.copy(),
        gene_ids_trg,
        num_cluster=n_target_cluster,
    )
    target_nmf.apply(
        k=n_target_cluster,
        calc_transferability=True,
    )
    # target_nmf.transferability_pvalue

    # np.savez(fname, source_ari=source_ari, target_ari=target_ari, n_mix=n_mix, n_source=n_source, n_target=n_target, n_source_cluster=n_source_cluster,
    # n_target_cluster=n_target_cluster)
Beispiel #4
0
    print np.unique(data.cluster_labels)

    # Result buffers: one source ARI per repetition, and a 4-d target table
    # indexed by (7 slots, repetition, target size, mix setting). What the 7
    # slots stand for is not visible in this part of the file.
    source_ari = np.zeros(reps)
    target_ari = np.zeros((7, reps, len(n_target), len(n_mix)))
    n = 0
    # NOTE(review): `n` is not advanced anywhere in the visible part of this
    # loop; presumably it is incremented further down -- confirm.
    while n < reps:
        # Fraction of all labelled cells that goes into the held-out part.
        source_perc = n_source / float(data.cluster_labels.size)
        sss = StratifiedShuffleSplit(n_splits=2, test_size=source_perc)
        # The loop only prints the sizes; after it finishes, split_1/split_2
        # hold the indices of the LAST generated split, which is what the
        # code below uses.
        for split_1, split_2 in sss.split(data.pp_data.T, data.cluster_labels):
            print split_1.size, split_2.size

        # The second index set of the split becomes the source data; cells
        # are selected along the second axis of pp_data.
        source_data = data.pp_data[:, split_2]
        source_labels = data.cluster_labels[split_2]

        # train source and test performance
        source_nmf = NmfClustering(source_data, gene_ids, num_cluster=n_source_cluster)
        source_nmf.apply(k=n_source_cluster, max_iter=4000, rel_err=1e-3)
        source_ari[n] = metrics.adjusted_rand_score(source_labels, source_nmf.cluster_labels)
        print 'ITER(', n,'): SOURCE ARI = ', source_ari[n]
        # Reject this repetition when the source clustering is not good
        # enough: `continue` restarts the while body with the same `n`, i.e.
        # a fresh split is drawn and source_ari[n] is overwritten.
        if source_ari[n] < 0.94:
            continue

        for i in range(len(n_target)):
            # Target cells are drawn from the first index set only, so they
            # do not overlap with the source cells taken from split_2.
            target_perc = n_target[i] / float(split_1.size)
            ttt = StratifiedShuffleSplit(n_splits=2, test_size=target_perc)
            # As above: only the last split survives the loop.
            for split_11, split_22 in ttt.split(data.pp_data[:, split_1].T, data.cluster_labels[split_1]):
                print split_11.size, split_22.size

            # shuffle the gene ids for testing
            # (a random permutation over the first axis of pp_data is used to
            # reorder a copy of gene_ids)
            perm_inds = np.random.permutation(data.pp_data.shape[0])
            target_gene_ids = gene_ids[perm_inds].copy()
        # Normalise both label vectors to integer arrays. The builtin `int`
        # replaces `np.int`, which was only an alias for it and has been
        # removed from current NumPy releases.
        trg_labels = np.array(trg_labels, dtype=int)
        src_labels = np.array(src_labels, dtype=int)

        # 3.a. Subsampling order for target
        inds = np.random.permutation(trg_labels.size)

        # 3.b. Use perfect number of latent states for nmf and sc3
        # NOTE(review): these counts are computed here, but the source model
        # below is built with `num_cluster`, not `n_src_cluster` -- confirm
        # which one is intended.
        src_lbl_set = np.unique(src_labels)
        n_trg_cluster = np.unique(trg_labels).size
        n_src_cluster = src_lbl_set.size

        # 3.c. train source once per repetition
        # The parenthesised single-argument print form produces the same
        # output under Python 2 and is also valid Python 3.
        print("Train source data of rep {0}".format(r + 1))
        # Rebinding to None first presumably lets the model of the previous
        # repetition be released before the next one is built.
        source_nmf = None
        source_nmf = NmfClustering(
            src,
            np.arange(src.shape[0]),
            num_cluster=num_cluster,
        )
        source_nmf.add_cell_filter(cell_filter_fun)
        source_nmf.add_gene_filter(gene_filter_fun)
        source_nmf.set_data_transformation(data_transf_fun)
        source_nmf.apply(
            k=num_cluster,
            alpha=nmf_alpha,
            l1=nmf_l1,
            max_iter=nmf_max_iter,
            rel_err=nmf_rel_err,
        )

        # Calculate ARIs and KTAs
        print("Evaluation of source results")
        # The score is stored per (s, r) pair; it is computed on the model's
        # own preprocessed data and its predicted cluster labels.
        source_ktas[s, r] = unsupervised_acc_kta(
            source_nmf.pp_data,
            source_nmf.cluster_labels,
            kernel='linear',
        )
Beispiel #6
0
else:
    # This branch always aborts: the message states that filtering inside the
    # procedure is not implemented for this variant. No filter or
    # transformation callables are set up here, because any statement placed
    # after the unconditional raise could never execute.
    raise Warning("Within-Filtering is not implemented for R SC3")

# Fit NMF on the complete data set; its cluster assignment is used as labels.
print("Train complete data")
# Rebinding to None first presumably lets an earlier model be released before
# the new one is allocated.
complete_nmf = None
complete_nmf = NmfClustering(
    data,
    np.arange(data.shape[0]),
    num_cluster=num_cluster,
    labels=[],
)
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(
    k=num_cluster,
    alpha=nmf_alpha,
    l1=nmf_l1,
    max_iter=nmf_max_iter,
    rel_err=nmf_rel_err,
)

# Take over the predicted labels and report each distinct label together
# with how often it occurs.
labels = complete_nmf.cluster_labels
label_names, label_counts = np.unique(labels, return_counts=True)
print("Labels: ", label_names)
print("Counts: ", label_counts)
Beispiel #7
0
# Build a name -> position table for the source genes once, instead of
# converting `gene_names_source` to a list and scanning it again for every
# intersecting gene. Iterating in reverse lets the earliest position win for
# duplicated names, which matches what list.index() returned.
_source_gene_pos = {
    name: pos
    for pos, name in reversed(list(enumerate(gene_names_source)))
}
# NOTE: a gene that is missing from the source now raises KeyError instead of
# the ValueError raised by list.index().
data_source_indices = [_source_gene_pos[x] for x in gene_intersection]
# Reorder/restrict the source matrix along its first axis to the
# intersection, in the order given by `gene_intersection`.
data_source = data_source[data_source_indices, ]

# Report the shapes of both matrices after restriction to the shared genes.
print("Target data dimensions after taking source intersection: genes x cells: ",
      data_target.shape)
print("source data dimensions after taking target intersection: genes x cells: ",
      data_source.shape)

# Fit NMF on the source data; its cluster assignment is used as source labels.
print("Train complete data")
# Rebinding to None first presumably lets an earlier model be released before
# the new one is allocated.
complete_nmf = None
complete_nmf = NmfClustering(
    data_source,
    np.arange(data_source.shape[0]),
    num_cluster=num_cluster_source,
    labels=[],
)
complete_nmf.add_cell_filter(cell_filter_fun)
complete_nmf.add_gene_filter(gene_filter_fun)
complete_nmf.set_data_transformation(data_transf_fun)
complete_nmf.apply(
    k=num_cluster_source,
    alpha=nmf_alpha,
    l1=nmf_l1,
    max_iter=nmf_max_iter,
    rel_err=nmf_rel_err,
)

# Take over the predicted labels and determine each distinct label together
# with how often it occurs.
labels_source = complete_nmf.cluster_labels
label_source_names, label_source_counts = np.unique(
    labels_source, return_counts=True)
print("Source labels: ", label_source_names)
Beispiel #8
0
# Names of the evaluation measures; `accs` gets one row per name and one
# column per tested number of clusters.
accs_names = [
    'KTA (linear)',
    'Silhouette (euc)',
    'Silhouette (pearson)',
    'Silhouette (spearman)',
    'ARI',
]
accs = np.zeros((len(accs_names), len(num_cluster)))

for i, k in enumerate(num_cluster):
    # The second placeholder has to be {1}: with {0} used twice the iteration
    # index was printed in both positions and `k` never appeared.
    print('Iteration {0}, num-cluster={1}'.format(i, k))

    # --------------------------------------------------
    # 3.1. SETUP SOURCE DATA NMF CLUSTERING
    # --------------------------------------------------
    # Rebinding to None first presumably lets the model of the previous
    # iteration be released before the next one is built.
    nmf = None
    nmf = NmfClustering(data, gene_ids, num_cluster=k)
    nmf.add_cell_filter(cell_filter_fun)
    nmf.add_gene_filter(gene_filter_fun)
    nmf.set_data_transformation(data_transf_fun)
    nmf.apply(
        k=k,
        alpha=arguments.nmf_alpha,
        l1=arguments.nmf_l1,
        max_iter=arguments.nmf_max_iter,
        rel_err=arguments.nmf_rel_err,
    )

    # --------------------------------------------------
    # 3.2. EVALUATE CLUSTER ASSIGNMENT
    # --------------------------------------------------
    print('\nUnsupervised evaluation:')
    accs[0, i] = unsupervised_acc_kta(nmf.pp_data,
                                      nmf.cluster_labels,