def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk t-SNE outputs into outs.tsne, one h5/csv pair per method."""
    if args.filtered_matrix is None:
        outs.tsne = None
        outs.tsne_summary = {}
        return

    if not os.path.exists(outs.tsne):
        os.mkdir(outs.tsne)

    outs.tsne_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        # Keep only the chunks that ran this factorization method.
        matching_outs = [c_out for c_def, c_out in zip(chunk_defs, chunk_outs)
                         if c_def.method == method]
        method_h5s = [os.path.join(c_out.tsne, method + '_tsne.h5')
                      for c_out in matching_outs]
        method_csv_dirs = [os.path.join(c_out.tsne, method + '_tsne_csv')
                           for c_out in matching_outs]

        merged_h5 = os.path.join(outs.tsne, method + "_tsne.h5")
        merged_csv = os.path.join(outs.tsne, method + "_tsne_csv")

        analysis_io.combine_h5_files(method_h5s, merged_h5,
                                     [analysis_constants.ANALYSIS_H5_TSNE_GROUP])

        for src_dir in method_csv_dirs:
            cr_io.copytree(src_dir, merged_csv, allow_existing=True)

        outs.tsne_summary['h5'][method] = merged_h5
        outs.tsne_summary['csv'][method] = merged_csv
# Beispiel #2
def main(args, outs):
    """Copy whichever CRISPR analysis files exist into outs.crispr_analysis."""
    cr_io.makedirs(outs.crispr_analysis, allow_existing=True)

    candidate_files = (
        args.protospacer_calls_summary,
        args.protospacer_calls_per_cell,
        args.cells_per_protospacer,
        args.protospacer_umi_thresholds_csv,
        args.protospacer_umi_thresholds_json,
        args.perturbation_efficiencies_by_feature,
        args.perturbations_efficiencies_by_target,
    )

    # Pair each candidate with its canonical output name; skip absent ones.
    for src_path, dest_name in itertools.izip(
            candidate_files, protospacer_calling.CRISPR_ANALYSIS_FILE_NAMES):
        if src_path is None:
            continue
        cr_io.copy(src_path, os.path.join(outs.crispr_analysis, dest_name))

    # The per-feature / per-target perturbation effect trees are optional.
    for src_tree, subdir_name in (
            (args.perturbation_effects_by_feature, 'perturbation_effects_by_feature'),
            (args.perturbation_effects_by_target, 'perturbation_effects_by_target')):
        if not os.path.isdir(src_tree):
            continue
        dest_tree = os.path.join(outs.crispr_analysis, subdir_name)
        cr_io.makedirs(dest_tree, allow_existing=True)
        cr_io.copytree(src_tree, dest_tree, allow_existing=True)
# Beispiel #3
def join(args, outs, chunk_defs, chunk_outs):
    """Promote the single chunk's PCA outputs to the stage outputs."""
    if args.skip:
        return

    first_chunk = chunk_outs[0]
    cr_io.copy(first_chunk.pca_h5, outs.pca_h5)
    cr_io.copytree(first_chunk.pca_csv, outs.pca_csv)
# Beispiel #4
def join(args, outs, chunk_defs, chunk_outs):
    """Promote the single chunk's analysis outputs and write a summary JSON.

    The summary combines optional chemistry-batch-correction scores with the
    multi-genome (barnyard) summary when present.
    """
    if args.skip:
        outs.analysis = None
        outs.analysis_csv = None
        outs.summary = None
        return

    chunk_out = chunk_outs[0]
    cr_io.copytree(chunk_out.analysis, outs.analysis)
    cr_io.copytree(chunk_out.analysis_csv, outs.analysis_csv)

    summary = {}

    # batch correction summary
    if args.chemistry_batch_correction is True:
        summary['batch_effect_score_before_correction'] = \
            args.batch_score_before_correction
        summary['batch_effect_score_after_correction'] = \
            args.batch_score_after_correction

    # Multi-genome runs contribute their own summary metrics.
    # (The original had a dead `else: summary = summary` no-op here; removed.)
    if args.is_multi_genome:
        with open(args.multi_genome_summary) as reader:
            summary.update(json.load(reader))

    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)
# Beispiel #5
def join(args, outs, chunk_defs, chunk_outs):
    """Promote the single chunk's multi-genome outputs; no-op unless a
    multi-genome run that wasn't skipped."""
    if args.skip or not args.is_multi_genome:
        return

    first_chunk = chunk_outs[0]
    cr_io.copy(first_chunk.multi_genome_summary, outs.multi_genome_summary)
    cr_io.copytree(first_chunk.multi_genome_csv, outs.multi_genome_csv)
    cr_io.copytree(first_chunk.multi_genome_json, outs.multi_genome_json)
# Beispiel #6
def main(args, outs):
    """Copy each complete (h5, csv) PCA pair into the stage outputs."""
    if args.skip:
        return

    for h5_path, csv_path in zip(args.pca_h5_list, args.pca_csv_list):
        # Only act on pairs where both halves are present.
        if h5_path is None or csv_path is None:
            continue
        cr_io.copy(h5_path, outs.pca_h5)
        cr_io.copytree(csv_path, outs.pca_csv)
# Beispiel #7
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk t-SNE h5 files and merge their csv directories."""
    if args.skip:
        return

    h5_inputs = []
    csv_inputs = []
    for one_out in chunk_outs:
        h5_inputs.append(one_out.tsne_h5)
        csv_inputs.append(one_out.tsne_csv)

    analysis_io.combine_h5_files(h5_inputs, outs.tsne_h5,
                                 [analysis_constants.ANALYSIS_H5_TSNE_GROUP])
    for one_csv in csv_inputs:
        cr_io.copytree(one_csv, outs.tsne_csv, allow_existing=True)
# Beispiel #8
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk differential-expression h5 files and csv trees."""
    if args.skip:
        return

    target_groups = [
        analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
        analysis_constants.ANALYSIS_H5_KMEANS_DIFFERENTIAL_EXPRESSION_GROUP,
    ]
    analysis_io.combine_h5_files([c.diffexp_h5 for c in chunk_outs],
                                 outs.diffexp_h5, target_groups)

    for src_dir in (c.diffexp_csv for c in chunk_outs):
        cr_io.copytree(src_dir, outs.diffexp_csv, allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    """Gather every chunk's dimensionality-reduction tree under outs.reduced_data
    and record the per-method h5/csv destinations."""
    if args.filtered_matrix is None:
        outs.reduced_data = None
        outs.reduction_summary = {}
        return

    if not os.path.exists(outs.reduced_data):
        cr_io.mkdir(outs.reduced_data)

    # Merge each chunk's output tree into the shared directory.
    for one_chunk in chunk_outs:
        cr_io.copytree(one_chunk.reduced_data, outs.reduced_data,
                       allow_existing=True)

    # Use final destinations to update summary
    summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        method_root = os.path.join(outs.reduced_data, method)
        summary['h5'][method] = os.path.join(method_root, method + ".h5")
        summary['csv'][method] = os.path.join(method_root, method + "_csv")
    outs.reduction_summary = summary
def main(args, outs):
    """Merge standard and graph-based clustering results per method.

    For each factorization method, combines the two clustering h5 files into
    one per-method h5 and both csv trees into one per-method directory under
    outs.clustering, recording destinations in outs.clustering_summary.

    Raises:
        ValueError: if a summary key is not an allowed factorization method.
    """
    outs.clustering_summary = {}
    if args.filtered_matrix is None:
        outs.clustering = None
        return

    if not os.path.exists(outs.clustering):
        cr_io.mkdir(outs.clustering)

    # NOTE: both graph clustering and normal clustering should have run for
    # every method.  Compare key *sets*: Python 2 dict.keys() returns lists
    # whose order is arbitrary, so the original list comparison could fail
    # even when both summaries cover the same methods.
    assert set(args.clustering_summary['h5'].keys()) == \
        set(args.graph_clustering_summary['h5'].keys())

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.clustering_summary['h5'].keys():
        if method not in ALLOWED_FACTORIZATIONS:
            raise ValueError("invalid method")

        # h5 inputs: per-method clustering plus the graph-based clustering.
        merge_h5 = [
            args.clustering_summary['h5'][method],
            args.graph_clustering_summary['h5'][method]
        ]
        groups = [
            analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method],
            analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP
        ]

        out_method_dir = os.path.join(outs.clustering, method)
        cr_io.mkdir(out_method_dir, allow_existing=True)

        out_clustering_h5 = os.path.join(out_method_dir,
                                         "{}_clustering.h5".format(method))
        outs.clustering_summary['h5'][method] = out_clustering_h5
        analysis_io.combine_h5_files(merge_h5, out_clustering_h5, groups)

        # csv inputs: merge both csv trees into one per-method directory.
        _csv1 = os.path.join(args.clustered_data, method,
                             CLUSTER_FILE_HEAD[method] + "_csv")
        _csv2 = os.path.join(args.knn_clusters, method, "clusters_csv")
        out_csv = os.path.join(out_method_dir, method + "_csv")
        cr_io.copytree(_csv1, out_csv, allow_existing=True)
        cr_io.copytree(_csv2, out_csv, allow_existing=True)
        outs.clustering_summary['csv'][method] = out_csv
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk clustering results into one h5 and one csv tree per
    factorization method under outs.clustered_data."""
    if args.filtered_matrix is None:
        outs.clustered_data = None
        outs.clustering_summary = {}
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        file_head = CLUSTER_FILE_HEAD[method]
        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        merged_h5 = os.path.join(method_dir, file_head + ".h5")
        merged_csv = os.path.join(method_dir, file_head + "_csv")

        # Every chunk contributes one h5 and one csv directory per method.
        source_h5s = [os.path.join(c.clustered_data, method, file_head + ".h5")
                      for c in chunk_outs]
        source_csvs = [os.path.join(c.clustered_data, method, file_head + "_csv")
                       for c in chunk_outs]

        analysis_io.combine_h5_files(
            source_h5s, merged_h5,
            [analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
             analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method]])

        for src_csv in source_csvs:
            cr_io.copytree(src_csv, merged_csv, allow_existing=True)

        outs.clustering_summary['h5'][method] = merged_h5
        outs.clustering_summary['csv'][method] = merged_csv
def join(args, outs, chunk_defs, chunk_outs):
    """Assemble the combined analysis outputs for a run.

    Builds a joint Peak + TF feature-barcode matrix (single-genome case),
    saves it to outs.feature_bc_matrix, then copies its matrix group plus the
    chosen factorization's reduction/clustering/tsne/enrichment results into
    outs.analysis, mirroring csv outputs under outs.analysis_csv.
    """
    # Nothing to do if there is no filtered matrix or no reductions ran.
    if args.filtered_peak_bc_matrix is None or not args.reduction_summary[
            'h5'].keys():
        outs.analysis = None
        outs.analysis_csv = None
        outs.feature_bc_matrix = None
        return

    # Make the FBM
    # build joint Peak + TF count matrix for single genomes
    # combine peak annotations for single genome analysis
    peak_annotation = None
    if args.peak_annotation:
        annotations = pd.read_csv(args.peak_annotation,
                                  sep='\t')[['gene', 'peak_type']]
        annotations = annotations.replace(np.nan, '', regex=True)
        annotations = annotations.values.astype(str).tolist()
        peak_annotation = []
        for row in annotations:
            # row: [';'-joined gene names, ';'-joined peak types];
            # the two lists are parallel (asserted below).
            genes = row[0].split(";")
            annotation = row[1].split(";")
            promoter = []
            nearby_gene = []
            assert len(annotation) == len(genes)
            for en, kind in enumerate(annotation):
                if kind == 'promoter':
                    promoter += [genes[en]]
                nearby_gene += [genes[en]]
            peak_annotation += [[';'.join(promoter), ';'.join(nearby_gene)]]
    fbm = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    mapping = None
    if args.filtered_tf_bc_matrix:
        # combine matrices, ensure the barcodes are same and ordered the same way
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(
            args.filtered_tf_bc_matrix)
        assert (fbm.bcs == tf_matrix.bcs).all()
        if peak_annotation is not None:
            fbm.feature_ref = FeatureReference.addtags(
                fbm.feature_ref, ['promoter', 'nearby_gene'], peak_annotation)
            # NOTE(review): unlike the call above, no tag values are passed
            # here -- confirm addtags fills empty values by default.
            tf_matrix.feature_ref = FeatureReference.addtags(
                tf_matrix.feature_ref, ['promoter', 'nearby_gene'])
        combined_feature_defs = FeatureReference.join(fbm.feature_ref,
                                                      tf_matrix.feature_ref)
        # TF features are stacked below the peak features.
        combined_matrix = vstack([fbm.m, tf_matrix.m])
        # explicit map linking rows in diffexp to combined matrix
        mapping = np.zeros((tf_matrix.features_dim, 2))
        for x in range(tf_matrix.features_dim):
            mapping[x, 0] = x
            mapping[x, 1] = x + fbm.features_dim
        fbm = cr_matrix.CountMatrix(combined_feature_defs, fbm.bcs,
                                    combined_matrix)
    fbm.save_h5_file(outs.feature_bc_matrix,
                     sw_version=martian.get_pipelines_version())

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(outs.feature_bc_matrix, 'r') as matrix, \
            h5.File(outs.analysis, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        # NOTE(review): keys()[0] indexing is Python 2 only; Python 3 would
        # need list(matrix.keys())[0].
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    # Prefer the default factorization when it ran; otherwise fall back to
    # whichever factorization appears first.
    factorizations = args.reduction_summary['h5'].keys()
    USE_FACTORIZATION = DEFAULT_FACTORIZATION if DEFAULT_FACTORIZATION in factorizations else factorizations[
        0]
    with tables.open_file(outs.analysis, 'a') as out:
        # Copy each available summary's h5 children into the analysis h5 and
        # mirror its csv tree under outs.analysis_csv/<key>.
        for summary, key in zip([
                args.reduction_summary, args.clustering_summary,
                args.tsne_summary, args.enrichment_analysis_summary
        ], [USE_FACTORIZATION, 'clustering', 'tsne', 'enrichment']):
            if summary is None or not summary:
                continue
            print(key, summary)
            data_h5 = summary['h5'][USE_FACTORIZATION]
            with tables.open_file(data_h5, 'r') as indata:
                indata.copy_children(indata.root, out.root, recursive=True)
            dirname = os.path.join(outs.analysis_csv, key)
            cr_io.copytree(summary['csv'][USE_FACTORIZATION], dirname)

    # if mapping is present (single genome case), so is the coloring matrix
    if mapping is not None:
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('feature_DE_map', data=mapping)
        args.coerce_strings()
        tf_propZ_matrix = np.loadtxt(args.tf_propZ_matrix)
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('diffexp_coloring_matrix', data=tf_propZ_matrix)
# Beispiel #13
def copy_subdirs(src_dir, dest_dir):
    """Mirror each entry of src_dir into dest_dir under the same name."""
    for entry in os.listdir(src_dir):
        source = os.path.join(src_dir, entry)
        target = os.path.join(dest_dir, entry)
        cr_io.copytree(source, target)
# Beispiel #14
def main(args, outs):
    """Aggregate PCA, clustering, diffexp and t-SNE results into the final
    analysis h5 (under outs.analysis) and csv tree (outs.analysis_csv).
    """
    if args.skip:
        return

    # Multi-genome (barnyard) extras are seeded into the output trees first.
    if args.is_multi_genome:
        cr_io.copytree(args.multi_genome_json, outs.analysis)
        cr_io.copytree(args.multi_genome_csv, outs.analysis_csv)

    analysis_h5 = analysis_io.h5_path(outs.analysis)
    cr_io.makedirs(os.path.dirname(analysis_h5), allow_existing=True)

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(args.matrix_h5, 'r') as matrix,\
         h5.File(analysis_h5, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        # NOTE(review): keys()[0] indexing is Python 2 only; Python 3 would
        # need list(matrix.keys())[0].
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    # Append every analysis result's nodes into the same h5 via pytables.
    with tables.open_file(args.pca_h5, 'r') as pca,\
         tables.open_file(args.clustering_h5, 'r') as clustering,\
         tables.open_file(args.diffexp_h5, 'r') as diffexp,\
         tables.open_file(args.tsne_h5, 'r') as tsne,\
         tables.open_file(analysis_h5, 'a') as out:

        pca.copy_children(pca.root, out.root, recursive=True)
        clustering.copy_children(clustering.root, out.root, recursive=True)
        diffexp.copy_children(diffexp.root, out.root, recursive=True)
        tsne.copy_children(tsne.root, out.root, recursive=True)

    # Mirror the csv outputs, one subdirectory per analysis type.
    pca_dir = os.path.join(outs.analysis_csv, 'pca')
    cr_io.copytree(args.pca_csv, pca_dir)

    clustering_dir = os.path.join(outs.analysis_csv, 'clustering')
    cr_io.copytree(args.clustering_csv, clustering_dir)

    diffexp_dir = os.path.join(outs.analysis_csv, 'diffexp')
    cr_io.copytree(args.diffexp_csv, diffexp_dir)

    tsne_dir = os.path.join(outs.analysis_csv, 'tsne')
    cr_io.copytree(args.tsne_csv, tsne_dir)