Example #1
def main(args, outs):
    np.random.seed(args.random_seed)

    if args.skip or args.is_multi_genome:
        return

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    pca = cr_pca.load_pca_from_h5(args.pca_h5)
    pca_mat = pca.transformed_pca_matrix

    # Subsample barcodes
    if args.num_bcs is not None:
        use_bcs = np.random.choice(pca_mat.shape[0],
                                   args.num_bcs,
                                   replace=False)
        matrix = matrix.select_barcodes(use_bcs)
        pca_mat = pca_mat[use_bcs, :]

    # Subset principal components
    if args.num_pcs is not None:
        pca_mat = pca_mat[:, np.arange(args.num_pcs)]

    kmeans = cr_kmeans.run_kmeans(pca_mat,
                                  args.n_clusters,
                                  random_state=args.random_seed)

    with cr_io.open_h5_for_writing(outs.kmeans_h5) as f:
        cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)

    cr_clustering.save_clustering_csv(outs.kmeans_csv, clustering_key,
                                      kmeans.clusters, matrix.bcs)
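
The cr_* modules above are cellranger internals. Below is a minimal, self-contained sketch of the same subsample-then-cluster flow, with scikit-learn's KMeans standing in for cr_kmeans.run_kmeans (the data and parameters are illustrative, not the pipeline's):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
pca_mat = rng.randn(1000, 50)               # barcodes x principal components

num_bcs, num_pcs, n_clusters = 500, 10, 8   # illustrative parameters
use_bcs = rng.choice(pca_mat.shape[0], num_bcs, replace=False)
pca_mat = pca_mat[use_bcs, :num_pcs]        # subsample barcodes, subset PCs

kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(pca_mat)
print(kmeans.labels_[:10])                  # one 0-based label per barcode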
Example #2
def main(args, outs):
    np.random.seed(0)

    if args.filtered_matrix is None:
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    matrix_bcs = cr_matrix.CountMatrix.load_bcs_from_h5_file(
        args.filtered_matrix)
    for method in args.factorization:
        transformed_matrix = args.transformed_matrix[method]
        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)
        file_head = CLUSTER_FILE_HEAD[method]
        _h5 = os.path.join(method_dir, file_head + ".h5")
        _csv = os.path.join(method_dir, file_head + "_csv")
        dr_mat = None

        if not os.path.exists(transformed_matrix):
            raise IOError('matrix does not exist: %s' % transformed_matrix)

        if method == 'pca':
            pca = cr_pca.load_pca_from_h5(transformed_matrix)
            dr_mat = pca.transformed_pca_matrix
        elif method == 'lsa':
            # Add a tiny epsilon so all-zero rows survive the L2 normalization
            lsa = cr_lsa.load_lsa_from_h5(transformed_matrix)
            lsa = lsa._replace(
                transformed_lsa_matrix=lsa.transformed_lsa_matrix + 1e-120)
            dr_mat = lsa.transformed_lsa_matrix / np.linalg.norm(
                lsa.transformed_lsa_matrix, axis=1, keepdims=True)
        elif method == 'plsa':
            plsa = cr_plsa.load_plsa_from_h5(transformed_matrix)
            plsa = plsa._replace(
                transformed_plsa_matrix=plsa.transformed_plsa_matrix + 1e-120)
            dr_mat = plsa.transformed_plsa_matrix / np.linalg.norm(
                plsa.transformed_plsa_matrix, axis=1, keepdims=True)

        if args.num_dims is not None:
            if args.num_dims > dr_mat.shape[1]:
                raise ValueError(
                    'requested number of dimensions exceeds the number of '
                    'dimensions in the data')
            dr_mat = dr_mat[:, np.arange(args.num_dims)]

        kmeans = cr_kmeans.run_kmeans(dr_mat,
                                      args.n_clusters,
                                      random_state=args.random_seed)
        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)
        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)
        cr_clustering.save_clustering_csv(_csv, clustering_key,
                                          kmeans.clusters, matrix_bcs)
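
The 1e-120 offset in the 'lsa' and 'plsa' branches exists so that all-zero rows do not produce a 0/0 when each row is divided by its L2 norm. A small sketch of that normalization in isolation (the function name is illustrative):

import numpy as np

def l2_normalize_rows(mat, eps=1e-120):
    # The epsilon keeps zero rows from producing a 0/0 division.
    mat = mat + eps
    return mat / np.linalg.norm(mat, axis=1, keepdims=True)

dr_mat = l2_normalize_rows(np.random.rand(100, 15))
print(np.allclose(np.linalg.norm(dr_mat, axis=1), 1.0))  # True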
Example #3
def main(args, outs):
    if args.skip or args.is_multi_genome:
        return

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)

    clustering = SingleGenomeAnalysis.load_clustering_from_h5(args.clustering_h5, args.clustering_key)

    diffexp = cr_diffexp.run_differential_expression(matrix, clustering.clusters)

    with cr_io.open_h5_for_writing(outs.diffexp_h5) as f:
        cr_diffexp.save_differential_expression_h5(f, args.clustering_key, diffexp)

    cr_diffexp.save_differential_expression_csv(args.clustering_key, diffexp, matrix, outs.diffexp_csv)
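
cr_diffexp.run_differential_expression performs a 1-vs-rest test per cluster (the pipeline's sSeq machinery is visible in Example #7). As a structural sketch only, with Welch's t-test standing in for the actual negative-binomial test:

import numpy as np
from scipy import stats

def one_vs_rest_pvalues(matrix, clusters):
    # matrix: features x cells (dense); clusters: one 1-based label per cell.
    pvals = {}
    for c in np.unique(clusters):
        in_c = clusters == c
        _, p = stats.ttest_ind(matrix[:, in_c], matrix[:, ~in_c],
                               axis=1, equal_var=False)
        pvals[c] = p
    return pvals

pvals = one_vs_rest_pvalues(np.random.rand(50, 30),
                            np.repeat([1, 2, 3], 10))
print(pvals[1].shape)  # (50,): one p-value per feature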
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = (
        cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix)
        if args.filtered_tf_bc_matrix is not None else None)
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}
    # For each method, merge h5 files and copy csv directories into one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv
        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):

            chunk_outs_def_method_clustering = sorted(
                [(chunk_out, chunk_def)
                 for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                 if chunk_def.clustering_key == key],
                key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # Load 1-vs-rest tests in sorted chunk order and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(
                np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3]
                           for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5, [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
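
Each chunk in this join contributes a (features x 3) block of 1-vs-rest statistics for one cluster (the three-column layout is assumed from the [:, 0:3] slice above); the join sorts the chunks by cluster index and stacks the blocks side by side. A self-contained sketch of that stacking, with random stand-in data instead of the per-chunk tmp_diffexp CSVs:

import numpy as np

chunk_blocks = [np.random.rand(200, 3) for _ in range(5)]  # one block per cluster
combined = np.hstack(chunk_blocks)   # features x (3 * n_clusters)
print(combined.shape)                # (200, 15)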
Example #5
def main(args, outs):
    if args.skip:
        return

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)

    # For now, only compute for gene expression features
    matrix = matrix.select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    clustering = SingleGenomeAnalysis.load_clustering_from_h5(args.clustering_h5, args.clustering_key)

    diffexp = cr_diffexp.run_differential_expression(matrix, clustering.clusters)

    with analysis_io.open_h5_for_writing(outs.diffexp_h5) as f:
        cr_diffexp.save_differential_expression_h5(f, args.clustering_key, diffexp)

    cr_diffexp.save_differential_expression_csv(args.clustering_key, diffexp, matrix, outs.diffexp_csv)
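
select_features_by_type restricts the matrix to a single feature type before the DE test. A sketch of the underlying row-mask idea (the names here are illustrative, not the CountMatrix API):

import numpy as np

feature_types = np.array(['Gene Expression', 'Peaks', 'Gene Expression'])
matrix = np.arange(12).reshape(3, 4)             # features x barcodes
gex_matrix = matrix[feature_types == 'Gene Expression', :]
print(gex_matrix.shape)                          # (2, 4)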
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return
    # Merge the neighbor matrices
    with LogPerf('merge_nn'):
        nn = cr_graphclust.merge_nearest_neighbors(
            [chunk.chunked_neighbors for chunk in chunk_outs],
            chunk_defs[0].total_rows)
    print('nn\tnn_nodes\t%d' % nn.shape[0])
    print('nn\tnn_links\t%d' % nn.nnz)
    print('nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn))
    sys.stdout.flush()

    matrix_bin = martian.make_path('matrix.bin')
    matrix_weights = martian.make_path('matrix.weights')
    louvain_out = martian.make_path('louvain.out')

    if args.similarity_type == 'snn':
        snn = cr_graphclust.compute_snn_matrix(nn, chunk_defs[0].k_nearest)

        print('snn\tsnn_nodes\t%d' % snn.shape[0])
        print('snn\tsnn_links\t%d' % (snn.nnz // 2))
        print('snn\tsnn_density\t%0.4f' % (
            snn.nnz / float(snn.shape[0] * (snn.shape[0] - 1))))
        sys.stdout.flush()

        with LogPerf('convert'):
            cr_graphclust.pipe_weighted_edgelist_to_convert(
                snn, matrix_bin, matrix_weights)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_weighted_clustering(
                matrix_bin, matrix_weights, louvain_out)

    else:
        with LogPerf('tocoo'):
            nn = nn.tocoo(copy=False)

        with LogPerf('convert'):
            cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_unweighted_clustering(
                matrix_bin, louvain_out)

    with LogPerf('load_bcs'):
        barcodes = SingleGenomeAnalysis.load_bcs_from_matrix_h5(args.matrix_h5)

    use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs[0].use_bcs, 'use_bcs')

    labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                louvain_out)

    labels = cr_clustering.relabel_by_size(labels)

    # Save cluster results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key,
                                      labels, barcodes)

    outs.chunked_neighbors = None
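
cr_clustering.relabel_by_size renumbers clusters so that label 1 is the largest. A stand-in implementation consistent with how the labels are used here (1-based, with 0 reserved for unused barcodes):

import numpy as np

def relabel_by_size(labels):
    # labels are 1-based; label 0 (unused barcodes) maps to itself.
    counts = np.bincount(labels)
    order = np.argsort(-counts[1:])              # labels 1..max, biggest first
    mapping = np.zeros(counts.size, dtype=int)
    mapping[order + 1] = np.arange(1, order.size + 1)
    return mapping[labels]

print(relabel_by_size(np.array([1, 2, 2, 2, 3, 3])))  # [3 1 1 1 2 2]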
Example #7
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    np.random.seed(0)

    # Load the matrix
    mat = CountMatrix.load_h5_file(args.matrix_h5)
    print(mat.m.shape, mat.m.nnz)

    barcodes = mat.bcs

    # Load graph-based clustering from analysis H5
    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(
        args.clusters_h5, clustering_key)
    labels = clustering.clusters

    # Cluster label 0 marks barcodes that were unused in the clustering analysis
    # (only relevant if the clustering stage was run by itself)
    total_bcs = len(labels)
    use_bcs = np.flatnonzero(labels > 0)
    expr_mat = mat.m[:, use_bcs]

    # Make cluster labels 0-based
    labels = labels[use_bcs] - 1

    # Convert PCA coords to dataframe
    pca = SingleGenomeAnalysis.load_pca_from_h5(
        args.pca_h5).transformed_pca_matrix[use_bcs, :]
    pca_df = pd.DataFrame(pca)
    print(pca_df.shape)

    # 1) Run hierarchical clustering on cluster medoids in PCA-space
    # 2) For each pair of clusters that are sibling leaves,
    #   3) Run a differential expression analysis
    #   4) Merge the clusters if not enough genes are differentially expressed
    #   5) If merged, stop considering cluster-pairs and goto 1)

    # Cache already-checked cluster-pairs
    # set of (frozenset, frozenset)
    checked_cluster_pairs = set()

    while True:
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        sys.stdout.flush()
        if len(np.bincount(labels)) == 1:
            # One cluster remains
            break

        # Compute medoids, perform hierarchical clustering
        pca_df['cluster'] = labels
        medoids = pca_df.groupby('cluster').apply(
            lambda x: x.median(axis=0)).to_numpy()[:, :-1]
        hc = linkage(medoids, 'complete')
        max_label = np.max(labels)

        print(np.bincount(labels))
        print('max=%d' % max_label)

        any_merged = False
        for step in range(hc.shape[0]):
            if hc[step, 0] <= max_label and hc[step, 1] <= max_label:
                leaf0, leaf1 = int(hc[step, 0]), int(hc[step, 1])

                group0 = np.flatnonzero(labels == leaf0)
                group1 = np.flatnonzero(labels == leaf1)

                # Skip this cluster pair if already checked
                set0 = frozenset(group0)
                set1 = frozenset(group1)
                cluster_pair = tuple(sorted([set0, set1]))
                if cluster_pair in checked_cluster_pairs:
                    continue
                checked_cluster_pairs.add(cluster_pair)

                print('Comparing clusters (%d,%d)' % (1 + leaf0, 1 + leaf1))
                submat = expr_mat[:, np.concatenate((group0, group1))]

                print('\tComputing params on (%d,%d) matrix' % submat.shape)
                params = compute_sseq_params(submat)

                print('\tRunning DE on %d vs %d cells' % (len(group0),
                                                          len(group1)))
                group0_submat = np.arange(len(group0))
                group1_submat = np.arange(len(group0),
                                          len(group0) + len(group1))
                de_result = sseq_differential_expression(
                    submat, group0_submat, group1_submat, params)

                n_de_genes = np.sum(de_result.adjusted_p_value <
                                    MERGE_CLUSTERS_DE_ADJ_P_THRESHOLD)
                if n_de_genes == 0:
                    print('\tFound %d DE genes. Merging clusters (%d,%d)' % (
                        n_de_genes, 1 + leaf0, 1 + leaf1))
                    # Relabel as the smaller-index cluster
                    labels[labels == leaf1] = leaf0

                    # Shift all labels above old label down
                    labels[labels > leaf1] = labels[labels > leaf1] - 1

                    any_merged = True
                    break

        sys.stdout.flush()

        if not any_merged:
            break

    # Convert back to one-based cluster labels
    labels += 1

    labels = cr_clustering.relabel_by_size(labels)

    # Convert back into original bc space, with 0s for unused bcs
    final_labels = np.zeros(total_bcs, dtype=int)
    final_labels[use_bcs] = labels

    # Save results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, final_labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key,
                                      final_labels, barcodes)
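
The sibling-leaf test above relies on how scipy encodes a linkage matrix: child ids below the number of observations are original leaves, so a merge row with both ids <= max_label joins two existing clusters directly. A minimal sketch of that scan:

import numpy as np
from scipy.cluster.hierarchy import linkage

medoids = np.random.rand(6, 10)       # one medoid per cluster, in PCA space
hc = linkage(medoids, 'complete')
n = medoids.shape[0]
for step in range(hc.shape[0]):
    a, b = int(hc[step, 0]), int(hc[step, 1])
    if a < n and b < n:               # both children are original clusters
        print('sibling leaf pair:', a, b)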
Example #8
def join(args, outs, chunk_defs, chunk_outs):
    if args.matrix_h5 is None:
        outs.graph_clustering_summary = {}
        return

    outs.graph_clustering_summary = {'h5': {}, 'csv': {}}
    # Merge the neighbor matrices
    for method in args.factorization:
        chunk_outs_def_method = [
            (chunk_out, chunk_def)
            for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
            if chunk_def.method == method]
        chunk_outs_method = [c[0] for c in chunk_outs_def_method]
        chunk_defs_method = [c[1] for c in chunk_outs_def_method]

        with LogPerf('merge_nn'):
            nn = cr_graphclust.merge_nearest_neighbors(
                [chunk.chunked_neighbors for chunk in chunk_outs_method],
                chunk_defs_method[0].total_rows)
        print('nn\tnn_nodes\t%d' % nn.shape[0])
        print('nn\tnn_links\t%d' % nn.nnz)
        print('nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn))
        sys.stdout.flush()

        matrix_bin = martian.make_path('matrix_{}.bin'.format(method))
        matrix_weights = martian.make_path('matrix_{}.weights'.format(method))
        louvain_out = martian.make_path('louvain_{}.out'.format(method))

        if args.similarity_type == 'snn':
            snn = cr_graphclust.compute_snn_matrix(
                nn, chunk_defs_method[0].k_nearest)

            print('snn\tsnn_nodes\t%d' % snn.shape[0])
            print('snn\tsnn_links\t%d' % (snn.nnz // 2))
            print('snn\tsnn_density\t%0.4f' % (
                snn.nnz / float(snn.shape[0] * (snn.shape[0] - 1))))
            sys.stdout.flush()

            with LogPerf('convert'):
                cr_graphclust.pipe_weighted_edgelist_to_convert(
                    snn, matrix_bin, matrix_weights)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_weighted_clustering(
                    matrix_bin, matrix_weights, louvain_out)

        else:
            with LogPerf('tocoo'):
                nn = nn.tocoo(copy=False)

            with LogPerf('convert'):
                cr_graphclust.pipe_unweighted_edgelist_to_convert(
                    nn, matrix_bin)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_unweighted_clustering(
                    matrix_bin, louvain_out)

        with LogPerf('load_bcs'):
            with h5.File(args.matrix_h5, 'r') as f:
                group_name = list(f.keys())[0]
                barcodes = cr_matrix.CountMatrix.load_bcs_from_h5_group(
                    f[group_name])

        use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs_method[0].use_bcs,
                                                'use_bcs')

        labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                    louvain_out)

        labels = cr_clustering.relabel_by_size(labels)

        # Save cluster results
        cr_io.mkdir(outs.knn_clusters, allow_existing=True)
        method_dir = os.path.join(outs.knn_clusters, method)
        cr_io.mkdir(method_dir, allow_existing=True)
        _h5 = os.path.join(method_dir, "clusters.h5")
        _csv = os.path.join(method_dir, "clusters_csv")
        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_graphclust.save_graphclust_h5(f, labels)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)
        cr_clustering.save_clustering_csv(_csv, clustering_key, labels,
                                          barcodes)
        outs.graph_clustering_summary['h5'][method] = _h5
        outs.graph_clustering_summary['csv'][method] = _csv

    outs.chunked_neighbors = None
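
The snn_density figure printed above is the fraction of possible directed edges, nnz / (n * (n - 1)), and snn.nnz // 2 counts undirected links because the symmetric SNN matrix stores each edge twice. A quick check on a toy symmetric sparse matrix (not an actual SNN graph):

import scipy.sparse as sp

m = sp.random(100, 100, density=0.05, format='csr', random_state=0)
snn = m + m.T                         # symmetrize, like the pipeline's SNN graph
n = snn.shape[0]
print('links   %d' % (snn.nnz // 2))  # each undirected edge stored twice
print('density %0.4f' % (snn.nnz / float(n * (n - 1))))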