def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk t-SNE results into one h5 and one csv directory per factorization method."""
    if args.filtered_matrix is None:
        outs.tsne = None
        outs.tsne_summary = {}
        return

    if not os.path.exists(outs.tsne):
        os.mkdir(outs.tsne)

    outs.tsne_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        # get all tsnes for a given method
        chunk_h5s = [os.path.join(chunk_out.tsne, method + '_tsne.h5')
                     for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
                     if chunk_def.method == method]
        chunk_csv_dirs = [os.path.join(chunk_out.tsne, method + '_tsne_csv')
                          for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
                          if chunk_def.method == method]

        analysis_io.combine_h5_files(chunk_h5s,
                                     os.path.join(outs.tsne, method + "_tsne.h5"),
                                     [analysis_constants.ANALYSIS_H5_TSNE_GROUP])

        for csv_dir in chunk_csv_dirs:
            cr_io.copytree(csv_dir,
                           os.path.join(outs.tsne, method + "_tsne_csv"),
                           allow_existing=True)

        outs.tsne_summary['h5'][method] = os.path.join(outs.tsne, method + "_tsne.h5")
        outs.tsne_summary['csv'][method] = os.path.join(outs.tsne, method + "_tsne_csv")
def main(args, outs):
    """Collect the CRISPR analysis outputs into a single crispr_analysis directory."""
    list_of_files = [args.protospacer_calls_summary,
                     args.protospacer_calls_per_cell,
                     args.cells_per_protospacer,
                     args.protospacer_umi_thresholds_csv,
                     args.protospacer_umi_thresholds_json,
                     args.perturbation_efficiencies_by_feature,
                     args.perturbations_efficiencies_by_target]

    cr_io.makedirs(outs.crispr_analysis, allow_existing=True)

    for (file_path, file_name) in itertools.izip(list_of_files,
                                                 protospacer_calling.CRISPR_ANALYSIS_FILE_NAMES):
        if file_path is None:
            continue
        cr_io.copy(file_path, os.path.join(outs.crispr_analysis, file_name))

    if os.path.isdir(args.perturbation_effects_by_feature):
        perturbation_effects_by_feature_dir = os.path.join(outs.crispr_analysis,
                                                           'perturbation_effects_by_feature')
        cr_io.makedirs(perturbation_effects_by_feature_dir, allow_existing=True)
        cr_io.copytree(args.perturbation_effects_by_feature,
                       perturbation_effects_by_feature_dir,
                       allow_existing=True)

    if os.path.isdir(args.perturbation_effects_by_target):
        perturbation_effects_by_target_dir = os.path.join(outs.crispr_analysis,
                                                          'perturbation_effects_by_target')
        cr_io.makedirs(perturbation_effects_by_target_dir, allow_existing=True)
        cr_io.copytree(args.perturbation_effects_by_target,
                       perturbation_effects_by_target_dir,
                       allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    chunk_out = chunk_outs[0]
    cr_io.copy(chunk_out.pca_h5, outs.pca_h5)
    cr_io.copytree(chunk_out.pca_csv, outs.pca_csv)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        outs.analysis = None
        outs.analysis_csv = None
        outs.summary = None
        return

    chunk_out = chunk_outs[0]

    cr_io.copytree(chunk_out.analysis, outs.analysis)
    cr_io.copytree(chunk_out.analysis_csv, outs.analysis_csv)

    summary = {}
    # batch correction summary
    if args.chemistry_batch_correction is True:
        summary['batch_effect_score_before_correction'] = args.batch_score_before_correction
        summary['batch_effect_score_after_correction'] = args.batch_score_after_correction

    if args.is_multi_genome:
        with open(args.multi_genome_summary) as reader:
            multi_genome_summary = json.load(reader)
        summary.update(multi_genome_summary)

    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip or not args.is_multi_genome:
        return

    chunk_out = chunk_outs[0]
    cr_io.copy(chunk_out.multi_genome_summary, outs.multi_genome_summary)
    cr_io.copytree(chunk_out.multi_genome_csv, outs.multi_genome_csv)
    cr_io.copytree(chunk_out.multi_genome_json, outs.multi_genome_json)
def main(args, outs):
    if args.skip:
        return

    # Each complete (h5, csv) pair is copied to the same destination, so in
    # practice at most one non-None pair is expected in the input lists.
    for h5, csv in zip(args.pca_h5_list, args.pca_csv_list):
        if h5 is not None and csv is not None:
            cr_io.copy(h5, outs.pca_h5)
            cr_io.copytree(csv, outs.pca_csv)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    chunk_h5s = [chunk_out.tsne_h5 for chunk_out in chunk_outs]
    chunk_csv_dirs = [chunk_out.tsne_csv for chunk_out in chunk_outs]

    analysis_io.combine_h5_files(chunk_h5s, outs.tsne_h5,
                                 [analysis_constants.ANALYSIS_H5_TSNE_GROUP])

    for csv_dir in chunk_csv_dirs:
        cr_io.copytree(csv_dir, outs.tsne_csv, allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    chunk_h5s = [chunk_out.diffexp_h5 for chunk_out in chunk_outs]
    chunk_csv_dirs = [chunk_out.diffexp_csv for chunk_out in chunk_outs]

    analysis_io.combine_h5_files(chunk_h5s, outs.diffexp_h5,
                                 [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                  analysis_constants.ANALYSIS_H5_KMEANS_DIFFERENTIAL_EXPRESSION_GROUP])

    for csv_dir in chunk_csv_dirs:
        cr_io.copytree(csv_dir, outs.diffexp_csv, allow_existing=True)
def join(args, outs, chunk_defs, chunk_outs):
    if args.filtered_matrix is None:
        outs.reduced_data = None
        outs.reduction_summary = {}
        return

    if not os.path.exists(outs.reduced_data):
        cr_io.mkdir(outs.reduced_data)

    # copy chunk outs
    for chunk_out in chunk_outs:
        cr_io.copytree(chunk_out.reduced_data, outs.reduced_data, allow_existing=True)

    # use final destinations to update the summary
    outs.reduction_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        outs.reduction_summary['h5'][method] = os.path.join(outs.reduced_data, method, method + ".h5")
        outs.reduction_summary['csv'][method] = os.path.join(outs.reduced_data, method, method + "_csv")
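# For reference, with args.factorization == ['pca', 'lsa'] (method names here are
# illustrative), the summary assembled above would look like:
#
#   outs.reduction_summary = {
#       'h5':  {'pca': '<reduced_data>/pca/pca.h5',  'lsa': '<reduced_data>/lsa/lsa.h5'},
#       'csv': {'pca': '<reduced_data>/pca/pca_csv', 'lsa': '<reduced_data>/lsa/lsa_csv'},
#   }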
def main(args, outs):
    """Merge the regular and graph clustering outputs per factorization method."""
    outs.clustering_summary = {}
    if args.filtered_matrix is None:
        outs.clustering = None
        return

    if not os.path.exists(outs.clustering):
        cr_io.mkdir(outs.clustering)

    # NOTE: both graph clustering and normal clustering should have run for a given method
    assert args.clustering_summary['h5'].keys() == args.graph_clustering_summary['h5'].keys()

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.clustering_summary['h5'].keys():
        if method not in ALLOWED_FACTORIZATIONS:
            raise ValueError("invalid method: {}".format(method))

        merge_h5 = [args.clustering_summary['h5'][method],
                    args.graph_clustering_summary['h5'][method]]
        groups = [analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method],
                  analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP]

        out_method_dir = os.path.join(outs.clustering, method)
        cr_io.mkdir(out_method_dir, allow_existing=True)

        out_clustering_h5 = os.path.join(out_method_dir, "{}_clustering.h5".format(method))
        outs.clustering_summary['h5'][method] = out_clustering_h5
        analysis_io.combine_h5_files(merge_h5, out_clustering_h5, groups)

        _csv1 = os.path.join(args.clustered_data, method, CLUSTER_FILE_HEAD[method] + "_csv")
        _csv2 = os.path.join(args.knn_clusters, method, "clusters_csv")
        out_csv = os.path.join(out_method_dir, method + "_csv")
        cr_io.copytree(_csv1, out_csv, allow_existing=True)
        cr_io.copytree(_csv2, out_csv, allow_existing=True)
        outs.clustering_summary['csv'][method] = out_csv
def join(args, outs, chunk_defs, chunk_outs):
    if args.filtered_matrix is None:
        outs.clustered_data = None
        outs.clustering_summary = {}
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    outs.clustering_summary = {'h5': {}, 'csv': {}}
    for method in args.factorization:
        chunk_h5s = [os.path.join(chunk_out.clustered_data, method,
                                  CLUSTER_FILE_HEAD[method] + ".h5")
                     for chunk_out in chunk_outs]
        chunk_csv_dirs = [os.path.join(chunk_out.clustered_data, method,
                                       CLUSTER_FILE_HEAD[method] + "_csv")
                          for chunk_out in chunk_outs]

        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        analysis_io.combine_h5_files(chunk_h5s,
                                     os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + ".h5"),
                                     [analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
                                      analysis_constants.ANALYSIS_H5_MAP_CLUSTERING[method]])

        for csv_dir in chunk_csv_dirs:
            cr_io.copytree(csv_dir,
                           os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + "_csv"),
                           allow_existing=True)

        outs.clustering_summary['h5'][method] = os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + ".h5")
        outs.clustering_summary['csv'][method] = os.path.join(method_dir, CLUSTER_FILE_HEAD[method] + "_csv")
def join(args, outs, chunk_defs, chunk_outs):
    """Build the joint peak + TF feature-barcode matrix and assemble the analysis h5/csv outputs."""
    if args.filtered_peak_bc_matrix is None or not args.reduction_summary['h5'].keys():
        outs.analysis = None
        outs.analysis_csv = None
        outs.feature_bc_matrix = None
        return

    # Make the feature-barcode matrix (FBM):
    # build a joint Peak + TF count matrix for single genomes

    # combine peak annotations for single genome analysis
    peak_annotation = None
    if args.peak_annotation:
        annotations = pd.read_csv(args.peak_annotation, sep='\t')[['gene', 'peak_type']]
        annotations = annotations.replace(np.nan, '', regex=True)
        annotations = annotations.values.astype(str).tolist()
        peak_annotation = []
        for row in annotations:
            genes = row[0].split(";")
            annotation = row[1].split(";")
            promoter = []
            nearby_gene = []
            assert len(annotation) == len(genes)
            for en, kind in enumerate(annotation):
                if kind == 'promoter':
                    promoter += [genes[en]]
                nearby_gene += [genes[en]]
            peak_annotation += [[';'.join(promoter), ';'.join(nearby_gene)]]

    fbm = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    mapping = None
    if args.filtered_tf_bc_matrix:
        # combine matrices; ensure the barcodes are the same and ordered the same way
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix)
        assert (fbm.bcs == tf_matrix.bcs).all()
        if peak_annotation is not None:
            fbm.feature_ref = FeatureReference.addtags(fbm.feature_ref,
                                                       ['promoter', 'nearby_gene'],
                                                       peak_annotation)
            tf_matrix.feature_ref = FeatureReference.addtags(tf_matrix.feature_ref,
                                                             ['promoter', 'nearby_gene'])
        combined_feature_defs = FeatureReference.join(fbm.feature_ref, tf_matrix.feature_ref)
        combined_matrix = vstack([fbm.m, tf_matrix.m])
        # explicit map linking rows in diffexp to rows of the combined matrix
        mapping = np.zeros((tf_matrix.features_dim, 2))
        for x in range(tf_matrix.features_dim):
            mapping[x, 0] = x
            mapping[x, 1] = x + fbm.features_dim
        fbm = cr_matrix.CountMatrix(combined_feature_defs, fbm.bcs, combined_matrix)
    fbm.save_h5_file(outs.feature_bc_matrix, sw_version=martian.get_pipelines_version())

    # PyTables doesn't support variable-length strings, so use h5py first
    with h5.File(outs.feature_bc_matrix, 'r') as matrix, \
            h5.File(outs.analysis, 'w') as out:
        # TODO: copy the first group; fix when we have a key
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    factorizations = args.reduction_summary['h5'].keys()
    USE_FACTORIZATION = (DEFAULT_FACTORIZATION
                         if DEFAULT_FACTORIZATION in factorizations
                         else factorizations[0])
    with tables.open_file(outs.analysis, 'a') as out:
        for summary, key in zip([args.reduction_summary, args.clustering_summary,
                                 args.tsne_summary, args.enrichment_analysis_summary],
                                [USE_FACTORIZATION, 'clustering', 'tsne', 'enrichment']):
            if summary is None or not summary:
                continue
            print(key, summary)
            data_h5 = summary['h5'][USE_FACTORIZATION]
            with tables.open_file(data_h5, 'r') as indata:
                indata.copy_children(indata.root, out.root, recursive=True)
            dirname = os.path.join(outs.analysis_csv, key)
            cr_io.copytree(summary['csv'][USE_FACTORIZATION], dirname)

    # if mapping is present (single genome case), so is the coloring matrix
    if mapping is not None:
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('feature_DE_map', data=mapping)
        args.coerce_strings()
        tf_propZ_matrix = np.loadtxt(args.tf_propZ_matrix)
        with h5.File(outs.analysis, 'a') as out:
            out.create_dataset('diffexp_coloring_matrix', data=tf_propZ_matrix)
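# Note: the row-mapping loop above is equivalent to this vectorized sketch
# (same shape and values, cast to float to match np.zeros):
#
#   idx = np.arange(tf_matrix.features_dim)
#   mapping = np.column_stack([idx, idx + fbm.features_dim]).astype(float)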
def copy_subdirs(src_dir, dest_dir):
    for subdir in os.listdir(src_dir):
        cr_io.copytree(os.path.join(src_dir, subdir),
                       os.path.join(dest_dir, subdir))
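# Example usage (hypothetical paths): mirror every immediate subdirectory of a
# chunk's analysis folder into the joined output folder.
#
#   copy_subdirs('/mro/chunk0/outs/analysis', '/mro/join/outs/analysis')
#
# Since cr_io.copytree is called here without allow_existing, this assumes the
# destination subdirectories do not already exist.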
def main(args, outs):
    if args.skip:
        return

    if args.is_multi_genome:
        cr_io.copytree(args.multi_genome_json, outs.analysis)
        cr_io.copytree(args.multi_genome_csv, outs.analysis_csv)

    analysis_h5 = analysis_io.h5_path(outs.analysis)
    cr_io.makedirs(os.path.dirname(analysis_h5), allow_existing=True)

    # PyTables doesn't support variable-length strings, so use h5py first
    with h5.File(args.matrix_h5, 'r') as matrix, \
            h5.File(analysis_h5, 'w') as out:
        # TODO: copy the first group; fix when we have a key
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    with tables.open_file(args.pca_h5, 'r') as pca, \
            tables.open_file(args.clustering_h5, 'r') as clustering, \
            tables.open_file(args.diffexp_h5, 'r') as diffexp, \
            tables.open_file(args.tsne_h5, 'r') as tsne, \
            tables.open_file(analysis_h5, 'a') as out:
        pca.copy_children(pca.root, out.root, recursive=True)
        clustering.copy_children(clustering.root, out.root, recursive=True)
        diffexp.copy_children(diffexp.root, out.root, recursive=True)
        tsne.copy_children(tsne.root, out.root, recursive=True)

    pca_dir = os.path.join(outs.analysis_csv, 'pca')
    cr_io.copytree(args.pca_csv, pca_dir)

    clustering_dir = os.path.join(outs.analysis_csv, 'clustering')
    cr_io.copytree(args.clustering_csv, clustering_dir)

    diffexp_dir = os.path.join(outs.analysis_csv, 'diffexp')
    cr_io.copytree(args.diffexp_csv, diffexp_dir)

    tsne_dir = os.path.join(outs.analysis_csv, 'tsne')
    cr_io.copytree(args.tsne_csv, tsne_dir)
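# After the copies above, analysis_h5 holds the 'matrix' group plus the
# root-level children of each per-stage h5 (PCA, clustering, differential
# expression, and t-SNE); the exact group names are defined by the upstream
# stages that produced those files.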