Ejemplo n.º 1
0
def join(args, outs, chunk_defs, chunk_outs):
    """Gather per-chunk t-SNE outputs into ``outs.tsne``, one set per method.

    For every method in ``args.factorization``, the HDF5 files of the chunks
    whose ``chunk_def.method`` equals that method are combined into
    ``<outs.tsne>/<method>_tsne.h5`` and their CSV directories are copied
    into ``<outs.tsne>/<method>_tsne_csv``. Both destinations are recorded
    in ``outs.tsne_summary`` under the ``'h5'`` and ``'csv'`` keys.

    When ``args.filtered_matrix`` is None nothing is merged: ``outs.tsne``
    is set to None and ``outs.tsne_summary`` to an empty dict.
    """
    if args.filtered_matrix is None:
        outs.tsne = None
        outs.tsne_summary = {}
        return

    if not os.path.exists(outs.tsne):
        os.mkdir(outs.tsne)

    def chunk_paths(method, suffix):
        # <chunk tsne dir>/<method><suffix> for every chunk of this method.
        return [
            os.path.join(chunk_out.tsne, method + suffix)
            for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
            if chunk_def.method == method
        ]

    outs.tsne_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        source_h5s = chunk_paths(method, '_tsne.h5')
        source_csv_dirs = chunk_paths(method, '_tsne_csv')

        merged_h5 = os.path.join(outs.tsne, method + "_tsne.h5")
        analysis_io.combine_h5_files(
            source_h5s,
            merged_h5,
            [analysis_constants.ANALYSIS_H5_TSNE_GROUP],
        )

        # Every chunk's CSV directory is copied into the same destination.
        merged_csv = os.path.join(outs.tsne, method + "_tsne_csv")
        for source_dir in source_csv_dirs:
            cr_io.copytree(source_dir, merged_csv, allow_existing=True)

        # Entries are recorded only after the method has been merged.
        outs.tsne_summary['h5'][method] = merged_h5
        outs.tsne_summary['csv'][method] = merged_csv
Ejemplo n.º 2
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the t-SNE outputs of all chunks into the stage outputs.

    The ``tsne_h5`` file of every chunk is combined into ``outs.tsne_h5``
    and every chunk's ``tsne_csv`` directory is copied into
    ``outs.tsne_csv``. Nothing is done when ``args.skip`` is truthy.
    ``chunk_defs`` is not used.
    """
    if args.skip:
        return

    # Collect both path lists up front, before any file is written.
    source_h5s = [chunk.tsne_h5 for chunk in chunk_outs]
    source_csv_dirs = [chunk.tsne_csv for chunk in chunk_outs]

    h5_groups = [analysis_constants.ANALYSIS_H5_TSNE_GROUP]
    analysis_io.combine_h5_files(source_h5s, outs.tsne_h5, h5_groups)

    for source_dir in source_csv_dirs:
        cr_io.copytree(source_dir, outs.tsne_csv, allow_existing=True)
Ejemplo n.º 3
0
def main(args, outs):
    """Merge k-means and graph-clustering results into one clustering output.

    ``args.kmeans_h5`` and ``args.graphclust_h5`` are combined into
    ``outs.clustering_h5``; the sub-directories of ``args.kmeans_csv`` and
    ``args.graphclust_csv`` are copied into ``outs.clustering_csv``.
    Nothing is done when ``args.skip`` is truthy.
    """
    if args.skip:
        return

    source_h5s = [args.kmeans_h5, args.graphclust_h5]
    h5_groups = [
        analysis_constants.ANALYSIS_H5_KMEANS_GROUP,
        analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
    ]
    analysis_io.combine_h5_files(source_h5s, outs.clustering_h5, h5_groups)

    # os.path.join with a single argument yields the path itself for a
    # plain string.
    dest_dir = os.path.join(outs.clustering_csv)
    cr_io.makedirs(dest_dir, allow_existing=True)
    copy_subdirs(args.kmeans_csv, dest_dir)
    copy_subdirs(args.graphclust_csv, dest_dir)
Ejemplo n.º 4
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the differential-expression outputs of all chunks.

    The ``diffexp_h5`` file of every chunk is combined into
    ``outs.diffexp_h5`` and every chunk's ``diffexp_csv`` directory is
    copied into ``outs.diffexp_csv``. Nothing is done when ``args.skip`` or
    ``args.is_multi_genome`` is truthy. ``chunk_defs`` is not used.
    """
    # Two guards, checked in this order, so ``is_multi_genome`` is only
    # read when ``skip`` is falsy.
    if args.skip:
        return
    if args.is_multi_genome:
        return

    source_h5s = [chunk.diffexp_h5 for chunk in chunk_outs]
    source_csv_dirs = [chunk.diffexp_csv for chunk in chunk_outs]

    h5_groups = [
        cr_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
        cr_constants.ANALYSIS_H5_KMEANS_DIFFERENTIAL_EXPRESSION_GROUP,
    ]
    cr_io.combine_h5_files(source_h5s, outs.diffexp_h5, h5_groups)

    for source_dir in source_csv_dirs:
        cr_utils.copytree(source_dir, outs.diffexp_csv, allow_existing=True)
Ejemplo n.º 5
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the k-means outputs of all chunks.

    The ``kmeans_h5`` file of every chunk is combined into
    ``outs.kmeans_h5`` and every chunk's ``kmeans_csv`` directory is copied
    into ``outs.kmeans_csv``. Nothing is done when ``args.skip`` or
    ``args.is_multi_genome`` is truthy. ``chunk_defs`` is not used.
    """
    if args.skip or args.is_multi_genome:
        return

    # Collect both path lists up front, before any file is written.
    source_h5s = [chunk.kmeans_h5 for chunk in chunk_outs]
    source_csv_dirs = [chunk.kmeans_csv for chunk in chunk_outs]

    h5_groups = [
        cr_constants.ANALYSIS_H5_CLUSTERING_GROUP,
        cr_constants.ANALYSIS_H5_KMEANS_GROUP,
    ]
    cr_io.combine_h5_files(source_h5s, outs.kmeans_h5, h5_groups)

    for source_dir in source_csv_dirs:
        cr_utils.copytree(source_dir, outs.kmeans_csv, allow_existing=True)
Ejemplo n.º 6
0
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk enrichment results into one h5 and CSV set per method.

    If ``args.filtered_peak_bc_matrix`` is None, or the reference at
    ``args.reference_path`` lists more than one species,
    ``outs.enrichment_analysis`` is set to None and
    ``outs.enrichment_analysis_summary`` to an empty dict.

    Otherwise, for every method in ``args.factorization`` and every
    clustering key found in that method's clustering h5, the ``tmp_diffexp``
    files of the matching chunks are loaded in order of
    ``chunk_def.cluster``, stacked, and written to a per-key h5 file and to
    the method's CSV directory. The per-key h5 files are then combined into
    ``<outs.enrichment_analysis>/<method>/<method>_enrichment_h5.h5``.
    Destinations are recorded in ``outs.enrichment_analysis_summary`` under
    the ``'h5'`` and ``'csv'`` keys.
    """
    species = ReferenceManager(args.reference_path).list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(
        args.filtered_peak_bc_matrix)
    if args.filtered_tf_bc_matrix is not None:
        tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(
            args.filtered_tf_bc_matrix)
    else:
        tf_matrix_features = None

    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}
    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        # Destinations are registered in the summary before they are written.
        merged_h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = merged_h5
        per_key_h5s = []

        merged_csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = merged_csv

        # (id, name) of every peak feature, followed by every TF feature
        # when a TF matrix was supplied.
        diffexp_prefixes = [(feature.id, feature.name)
                            for feature in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes.extend((feature.id, feature.name)
                                    for feature in tf_matrix_features.feature_defs)

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            # NOTE(review): chunks are selected by clustering_key only; the
            # method is not consulted here -- confirm keys are unique per method.
            matching = [(chunk_def, chunk_out)
                        for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                        if chunk_def.clustering_key == key]
            matching.sort(key=lambda pair: pair[0].cluster)
            ordered_outs = [chunk_out for _, chunk_out in matching]

            # load 1 vs rest tests in sorted order of chunks and combine into
            # one output per clustering (first three columns of each file)
            per_cluster = [np.loadtxt(chunk_out.tmp_diffexp, delimiter=',')[:, 0:3]
                           for chunk_out in ordered_outs]
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(np.hstack(per_cluster))

            # write out h5
            key_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(key_h5) as h5_file:
                cr_diffexp.save_differential_expression_h5(h5_file, key, diffexp)
            per_key_h5s.append(key_h5)

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(
                key, diffexp, diffexp_prefixes, merged_csv)

        h5_groups = [
            analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
            analysis_constants.ANALYSIS_H5_MAP_DE[method],
        ]
        analysis_io.combine_h5_files(per_key_h5s, merged_h5, h5_groups)
Ejemplo n.º 7
0
def main(args, outs):
    """Merge the two clustering results of every method into ``outs.clustering``.

    For each method key of ``args.clustering_summary['h5']`` the h5 file from
    ``args.clustering_summary`` and the one from
    ``args.graph_clustering_summary`` are combined into
    ``<outs.clustering>/<method>/<method>_clustering.h5``, and the two CSV
    directories (under ``args.clustered_data`` and ``args.knn_clusters``) are
    copied into ``<outs.clustering>/<method>/<method>_csv``. Destinations are
    recorded in ``outs.clustering_summary`` under ``'h5'`` and ``'csv'``.

    When ``args.filtered_matrix`` is None, ``outs.clustering`` is set to None
    and ``outs.clustering_summary`` is left as an empty dict.

    Raises:
        AssertionError: the two summaries do not cover the same methods.
        ValueError: a method is not in ``ALLOWED_FACTORIZATIONS``.
    """
    outs.clustering_summary = {}
    if args.filtered_matrix is None:
        outs.clustering = None
        return

    if not os.path.exists(outs.clustering):
        cr_io.mkdir(outs.clustering)

    normal_h5s = args.clustering_summary['h5']
    graph_h5s = args.graph_clustering_summary['h5']

    # NOTE: both graph clustering and normal clustering should have run for given method
    assert normal_h5s.keys() == graph_h5s.keys()

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in normal_h5s.keys():
        if method not in ALLOWED_FACTORIZATIONS:
            raise ValueError("invalid method")

        source_h5s = [normal_h5s[method], graph_h5s[method]]
        h5_groups = [
            analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method],
            analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
        ]

        method_dir = os.path.join(outs.clustering, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        # The destination is registered in the summary before it is written.
        merged_h5 = os.path.join(method_dir, "{}_clustering.h5".format(method))
        outs.clustering_summary['h5'][method] = merged_h5
        analysis_io.combine_h5_files(source_h5s, merged_h5, h5_groups)

        normal_csv = os.path.join(args.clustered_data, method,
                                  CLUSTER_FILE_HEAD[method] + "_csv")
        graph_csv = os.path.join(args.knn_clusters, method, "clusters_csv")
        merged_csv = os.path.join(method_dir, method + "_csv")

        # Both CSV trees are copied into the same destination directory.
        cr_io.copytree(normal_csv, merged_csv, allow_existing=True)
        cr_io.copytree(graph_csv, merged_csv, allow_existing=True)
        outs.clustering_summary['csv'][method] = merged_csv
Ejemplo n.º 8
0
def join(args, outs, chunk_defs, chunk_outs):
    """Gather per-chunk clustering outputs into ``outs.clustered_data``.

    For every method in ``args.factorization``, the
    ``<method>/<CLUSTER_FILE_HEAD[method]>.h5`` file of every chunk is
    combined into the same relative path under ``outs.clustered_data`` and
    every chunk's ``<method>/<CLUSTER_FILE_HEAD[method]>_csv`` directory is
    copied likewise. Both destinations are recorded in
    ``outs.clustering_summary`` under the ``'h5'`` and ``'csv'`` keys.

    When ``args.filtered_matrix`` is None nothing is merged:
    ``outs.clustered_data`` is set to None and ``outs.clustering_summary``
    to an empty dict. ``chunk_defs`` is not used.
    """
    if args.filtered_matrix is None:
        outs.clustered_data = None
        outs.clustering_summary = {}
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    def chunk_paths(method, suffix):
        # <chunk clustered_data>/<method>/<file head><suffix>, one per chunk.
        return [
            os.path.join(chunk_out.clustered_data, method,
                         CLUSTER_FILE_HEAD[method] + suffix)
            for chunk_out in chunk_outs
        ]

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        source_h5s = chunk_paths(method, ".h5")
        source_csv_dirs = chunk_paths(method, "_csv")

        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        merged_h5 = os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + ".h5")
        h5_groups = [
            analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
            analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method],
        ]
        analysis_io.combine_h5_files(source_h5s, merged_h5, h5_groups)

        # Every chunk's CSV directory is copied into the same destination.
        merged_csv = os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + "_csv")
        for source_dir in source_csv_dirs:
            cr_io.copytree(source_dir, merged_csv, allow_existing=True)

        # Entries are recorded only after the method has been merged.
        outs.clustering_summary['h5'][method] = merged_h5
        outs.clustering_summary['csv'][method] = merged_csv