def make_scrnaseq_object(samples):
    """ load the initial se.rda object using SingleCellExperiment """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # do.run([rscript, "--vanilla", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError as msg:
                logger.exception()
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small-RNAseq samples, converting from BAM if
    necessary and trimming if necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = dd.get_in_samples(samples, dd.get_trim_reads)
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("cutadapt")
    with prun.start(_wres(parallel, resources), samples, config, dirs,
                    "trimming", max_multicore=1) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                         [x[0]["description"] for x in samples]]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
def detect_fusions(samples):
    """Run fusion with a standalone tool, specified in config as fusion_caller.
    If fusion_mode is True, and no fusion_caller is specified, or
    fusion_caller == 'aligner', it is assumed that gene fusion detection
    was run on the alignment step.
    """
    fusion_mode = dd.get_in_samples(samples, dd.get_fusion_mode)
    if not fusion_mode:
        return samples
    caller = dd.get_in_samples(samples, dd.get_fusion_caller)
    if not caller or caller == 'aligner':
        logger.info("No standalone fusion caller specified in the config.")
        return samples
    STANDALONE_CALLERS = {
        'ericscript': ericscript.run,
    }
    caller_fn = STANDALONE_CALLERS.get(caller)
    if not caller_fn:
        logger.warning("Gene fusion detection with %s is not supported. "
                       "Supported callers:\n%s"
                       % (caller, ', '.join(STANDALONE_CALLERS.keys())))
        return samples
    logger.info("Running gene fusion detection with %s" % caller)
    return [[caller_fn(s)] for s in dd.sample_data_iterator(samples)]
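# Hedged sketch, not pipeline code: a toy stand-in for the configuration the
# dispatch above keys on. The config -> algorithm nesting is an assumption made
# for this sketch; in the real pipeline, dd.get_fusion_mode and dd.get_fusion_caller
# are the accessors that actually read these values from the sample data.
example_data = {"config": {"algorithm": {"fusion_mode": True,
                                         "fusion_caller": "ericscript"}}}
algorithm = example_data["config"]["algorithm"]
if algorithm.get("fusion_mode") and algorithm.get("fusion_caller", "aligner") != "aligner":
    print("standalone fusion caller: %s" % algorithm["fusion_caller"])
else:
    print("fusion detection assumed to have run during alignment (or disabled)")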
def make_scrnaseq_object(samples):
    """ load the initial se.rda object using SingleCellExperiment """
    local_sitelib = R_sitelib()
    counts_dir = os.path.dirname(dd.get_in_samples(samples, dd.get_combined_counts))
    gtf_file = dd.get_in_samples(samples, dd.get_transcriptome_gtf)
    if not gtf_file:
        gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    rda_file = os.path.join(counts_dir, "se.rda")
    if not file_exists(rda_file):
        with file_transaction(rda_file) as tx_out_file:
            rcode = "%s-run.R" % os.path.splitext(rda_file)[0]
            rrna_file = "%s-rrna.txt" % os.path.splitext(rda_file)[0]
            rrna_file = _find_rRNA_genes(gtf_file, rrna_file)
            with open(rcode, "w") as out_handle:
                out_handle.write(_script.format(**locals()))
            rscript = Rscript_cmd()
            try:
                # do.run([rscript, "--no-environ", rcode],
                #        "SingleCellExperiment",
                #        log_error=False)
                rda_file = rcode
            except subprocess.CalledProcessError as msg:
                logger.exception()
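# Hedged sketch, not pipeline code: the pattern make_scrnaseq_object relies on is
# filling an R script template with str.format(**locals()). The module-level
# _script template is not shown here, so the template and paths below are invented
# purely to illustrate the mechanism.
def _example_render_r_script():
    counts_dir = "/path/to/umis"        # invented path
    rrna_file = "/path/to/se-rrna.txt"  # invented path
    template = ('library(SingleCellExperiment)\n'
                'counts_dir = "{counts_dir}"\n'
                'rrna_file = "{rrna_file}"\n')
    # unused locals (such as template itself) are ignored by str.format
    return template.format(**locals())

print(_example_render_r_script())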
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
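# Hedged sketch, not pipeline code: the tidy -> wide reshaping and gene-level
# aggregation done above, replayed on toy data. Column names ("id", "sample",
# "tpm", "gene_id") mirror combine_sailfish; the tx2gene mapping is invented.
# Recent pandas requires keyword arguments to DataFrame.pivot, unlike the
# positional form used in the pipeline code, and "sum" stands in for np.sum.
import pandas as pd

tidy = pd.DataFrame({"id": ["txA", "txB", "txA", "txB"],
                     "sample": ["s1", "s1", "s2", "s2"],
                     "tpm": [10.0, 5.0, 8.0, 7.0]})
# transcript x sample TPM matrix, analogous to combined.isoform.sf.tpm
tx_tpm = tidy.pivot(index="id", columns="sample", values="tpm")
# collapse transcripts to genes, analogous to combined.gene.sf.tpm
tx2gene = pd.DataFrame({"gene_id": ["geneX", "geneX"]}, index=["txA", "txB"])
gene_tpm = tx_tpm.join(tx2gene).groupby("gene_id").agg("sum")
print(gene_tpm)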
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    organizes RNA-seq and small-RNAseq samples, converting from BAM if
    necessary and trimming if necessary
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = any([tz.get_in(["algorithm", "trim_reads"], d)
                          for d in dd.sample_data_iterator(samples)])
    resources = ["picard"]
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    if needs_trimming:
        resources.append("atropos")
    with prun.start(_wres(parallel, resources), samples, config, dirs,
                    "trimming",
                    max_multicore=1 if not needs_trimming else None) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                         [x[0]["description"] for x in samples]]])
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    samples = run_parallel("trim_srna_sample", samples)
                else:
                    samples = run_parallel("trim_sample", samples)
    return samples
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    if not samples:
        logger.error(f"No samples were found matching the supplied sample barcodes. See "
                     f"https://github.com/bcbio/bcbio-nextgen/issues/3428#issuecomment-772609904 "
                     f"for how to debug this issue.")
        sys.exit(1)
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
def combine_spikein(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "spikein")
    dont_combine, to_combine = partition(dd.get_spikein_counts,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(sailfish_dir, "spikein.sf")
    if not file_exists(tidy_file):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_spikein_counts(data)
            samplename = dd.get_sample_name(data)
            new_df = sailfish._sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        logger.info("Finished combining count files into %s." % tidy_file)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_spikein_counts(data, tidy_file)
        updated_samples.append([data])
    return updated_samples
def concatenate_sparse_counts(*samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "tagcounts.mtx")
    if file_exists(out_file):
        return out_file
    files = [dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    descriptions = [dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_counts(data, out_file)])
    return newsamples
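# Hedged sketch, not pipeline code: SparseMatrix above is bcbio's own helper for
# reading MatrixMarket count files, prefixing column (cell) names with the sample
# name and concatenating them. The same column-wise concatenation idea, expressed
# with plain scipy on toy in-memory matrices, looks roughly like this.
import scipy.sparse as sp

sample1 = sp.csr_matrix([[1, 0], [0, 2]])  # genes x cells for one sample
sample2 = sp.csr_matrix([[0, 3], [4, 0]])  # genes x cells for another sample
combined = sp.hstack([sample1, sample2]).tocsr()
print(combined.shape)  # (2, 4): same genes, cells from both samples side by side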
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    out_file = os.path.join(work_dir, "sailfish", "combined.sf")
    if not file_exists(out_file):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(out_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_combined(data, out_file)
        updated_samples.append([data])
    return updated_samples
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples
    tidy_file = os.path.join(work_dir, "sailfish", "combined.sf")
    transcript_tpm_file = os.path.join(work_dir, "sailfish", "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(work_dir, "sailfish", "combined.gene.sf.tpm")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file, transcript_tpm_file]]):
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot(None, "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot(None, "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        updated_samples.append([data])
    return updated_samples
def initialize_watcher(samples):
    """
    check to see if cwl_reporting is set for any samples, and if so, initialize
    a WorldWatcher object from a set of samples
    """
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    ww = WorldWatcher(work_dir,
                      is_on=any([dd.get_cwl_reporting(d[0]) for d in samples]))
    ww.initialize(samples)
    return ww
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["aligner", "picard", "samtools"],
                          ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8}),
                    samples, config, dirs, "alignment",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)
    with prun.start(_wres(parallel, ["dexseq", "express"]), samples, config,
                    dirs, "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)
    samples = rnaseq.combine_files(samples)
    with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config,
                    dirs, "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)
    with prun.start(_wres(parallel, ["samtools", "fastqc", "qualimap",
                                     "kraken", "gatk", "preseq"],
                          ensure_mem={"qualimap": 4}),
                    samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
        with profile.report("bcbioRNAseq loading", dirs):
            tools_on = dd.get_in_samples(samples, dd.get_tools_on)
            bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on
            if bcbiornaseq_on:
                if len(samples) < 3:
                    logger.warn("bcbioRNASeq needs at least three samples total, skipping.")
                elif len(samples) > 100:
                    logger.warn("Over 100 samples, skipping bcbioRNASeq.")
                else:
                    run_parallel("run_bcbiornaseqload", [sample])
        logger.info("Timing: finished")
    return samples
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    samples = run_parallel("run_filter_barcodes", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    return samples
def assemble_transcripts(run_parallel, samples):
    """
    assembly strategy rationale implemented as suggested in
    http://www.nature.com/nprot/journal/v7/n3/full/nprot.2012.016.html

    run Cufflinks without a reference GTF for each individual sample
    merge the assemblies with Cuffmerge using a reference GTF
    """
    assembler = dd.get_in_samples(samples, dd.get_transcript_assembler)
    if assembler:
        if "cufflinks" in assembler:
            samples = run_parallel("cufflinks_assemble", samples)
        if "stringtie" in assembler:
            samples = run_parallel("run_stringtie_expression", samples)
        samples = run_parallel("cufflinks_merge", [samples])
    return samples
def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    # use tools_on: vardict_sv to turn sv calling in vardict on (experimental)
    tools_on = dd.get_in_samples(items, dd.get_tools_on)
    vardict_sv_on = tools_on and "vardict_sv" in tools_on
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    # turn off structural variants
    if ((vardict_cl and version and
         ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
          (vardict_cl == "vardict"))) and
            not vardict_sv_on):
        opts += ["--nosv"]
    if (vardict_cl and version and
            (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    _add_freq_options(config, opts, var2vcf_opts)
    return " ".join(opts), " ".join(var2vcf_opts)
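# Hedged sketch, not pipeline code: the version gating above relies on
# LooseVersion comparing dotted version strings component-wise rather than as
# plain text. distutils (where LooseVersion lives) is deprecated and removed in
# Python 3.12+, where packaging.version is the usual replacement; the import
# below assumes an interpreter that still ships distutils.
from distutils.version import LooseVersion

print(LooseVersion("1.5.6") >= LooseVersion("1.5.5"))  # True
print(LooseVersion("1.10.0") > LooseVersion("1.9.0"))  # True; as plain strings this would be False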
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")
    files = [dd.get_histogram_counts(data) for data in dd.sample_data_iterator(samples)
             if dd.get_histogram_counts(data)]
    files = " ".join(files)
    cmd = "cat {files} > {out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concat cb histograms."
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples
def concatenate_sparse_matrices(samples, deduped=True):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    if deduped:
        out_file = os.path.join(umi_dir, "tagcounts.mtx")
    else:
        out_file = os.path.join(umi_dir, "tagcounts-dupes.mtx")
    if file_exists(out_file):
        if deduped:
            newsamples = []
            for data in dd.sample_data_iterator(samples):
                newsamples.append([dd.set_combined_counts(data, out_file)])
            return newsamples
        else:
            return samples
    files = [dd.get_count_file(data) for data in dd.sample_data_iterator(samples)
             if dd.get_count_file(data)]
    if not deduped:
        files = [os.path.splitext(x)[0] + "-dupes.mtx" for x in files]
    files = [fn for fn in files if file_exists(fn)]
    descriptions = [dd.get_sample_name(data) for data in dd.sample_data_iterator(samples)
                    if dd.get_count_file(data)]
    if not files:
        return samples
    counts = SparseMatrix()
    counts.read(filename=files.pop(), colprefix=descriptions.pop())
    for filename, description in zip(files, descriptions):
        newcounts = SparseMatrix()
        newcounts.read(filename=filename, colprefix=description)
        counts.cat(newcounts)
    counts.write(out_file)
    newsamples = []
    if deduped:
        for data in dd.sample_data_iterator(samples):
            newsamples.append([dd.set_combined_counts(data, out_file)])
        return newsamples
    return samples
def assemble_transcripts(run_parallel, samples):
    """
    assembly strategy rationale implemented as suggested in
    http://www.nature.com/nprot/journal/v7/n3/full/nprot.2012.016.html

    run Cufflinks without a reference GTF for each individual sample
    merge the assemblies with Cuffmerge using a reference GTF
    """
    assembler = dd.get_in_samples(samples, dd.get_transcript_assembler)
    data = samples[0][0]
    if assembler:
        if "cufflinks" in assembler:
            samples = run_parallel("cufflinks_assemble", samples)
        if "stringtie" in assembler:
            samples = run_parallel("run_stringtie_expression", samples)
        if "stringtie" in assembler and stringtie.supports_merge(data):
            samples = run_parallel("stringtie_merge", [samples])
        else:
            samples = run_parallel("cufflinks_merge", [samples])
    return samples
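# Hedged sketch, not pipeline code: the merge-step decision above, isolated on
# toy inputs. supports_merge stands in for stringtie.supports_merge(data), which
# in the real code inspects the installed StringTie version.
def _choose_merge(assemblers, supports_merge):
    if "stringtie" in assemblers and supports_merge:
        return "stringtie_merge"
    return "cufflinks_merge"

print(_choose_merge(["stringtie"], True))                 # stringtie_merge
print(_choose_merge(["cufflinks", "stringtie"], False))   # cufflinks_merge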
def concatenate_cb_histograms(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    umi_dir = os.path.join(work_dir, "umis")
    out_file = os.path.join(umi_dir, "cb-histogram.txt")
    files = [dd.get_histogram_counts(data) for data in dd.sample_data_iterator(samples)
             if dd.get_histogram_counts(data)]
    files = " ".join(files)
    cmd = "cat {files} > {out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Concat cellular barcode histograms: %s." % files
            do.run(cmd.format(**locals()), message)
    newsamples = []
    for data in dd.sample_data_iterator(samples):
        newsamples.append([dd.set_combined_histogram(data, out_file)])
    return newsamples