def run_stringtie_expression(data): """ estimate expression from Stringtie, using the bcbio datadict does not do transcriptome assembly """ bam = dd.get_work_bam(data) sample_name = dd.get_sample_name(data) out_dir = os.path.join("stringtie", sample_name) isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm") gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm") assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf")) if file_exists(isoform_fpkm) and file_exists(gene_fpkm): data = dd.set_stringtie_dir(data, out_dir) data = dd.set_fpkm(data, gene_fpkm) data = dd.set_fpkm_isoform(data, isoform_fpkm) if "stringtie" in dd.get_transcript_assembler(data): assembled_gtfs = dd.get_assembled_gtf(data) assembled_gtfs.append(assembly) data = dd.set_assembled_gtf(data, assembled_gtfs) return data with file_transaction(data, out_dir) as tx_out_dir: transcript_file = _stringtie_expression(bam, data, tx_out_dir) df = _parse_ballgown(transcript_file) _write_fpkms(df, tx_out_dir, sample_name) data = dd.set_stringtie_dir(data, out_dir) data = dd.set_fpkm(data, gene_fpkm) data = dd.set_fpkm_isoform(data, isoform_fpkm) if "stringtie" in dd.get_transcript_assembler(data): assembled_gtfs = dd.get_assembled_gtf(data) assembled_gtfs.append(assembly) data = dd.set_assembled_gtf(data, assembled_gtfs) return data
def run_stringtie_expression(data): """ estimate expression from Stringtie, using the bcbio datadict does not do transcriptome assembly """ bam = dd.get_work_bam(data) gtf_file = dd.get_gtf_file(data) num_cores = dd.get_num_cores(data) sample_name = dd.get_sample_name(data) out_dir = os.path.join("stringtie", sample_name) isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm") gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm") assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf")) if file_exists(isoform_fpkm) and file_exists(gene_fpkm): data = dd.set_cufflinks_dir(data, out_dir) data = dd.set_fpkm(data, gene_fpkm) data = dd.set_fpkm_isoform(data, isoform_fpkm) if "stringtie" in dd.get_transcript_assembler(data): dd.get_assembled_gtf(data).append(assembly) return data with file_transaction(data, out_dir) as tx_out_dir: transcript_file = _stringtie_expression(bam, gtf_file, num_cores, tx_out_dir) df = _parse_ballgown(transcript_file) _write_fpkms(df, tx_out_dir, sample_name) data = dd.set_cufflinks_dir(data, out_dir) data = dd.set_fpkm(data, gene_fpkm) data = dd.set_fpkm_isoform(data, isoform_fpkm) if "stringtie" in dd.get_transcript_assembler(data): dd.get_assembled_gtf(data).append(assembly) return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data): paired = True if pair_file else False hisat2 = config_utils.get_program("hisat2", data) num_cores = dd.get_num_cores(data) quality_flag = _get_quality_flag(data) stranded_flag = _get_stranded_flag(data, paired) rg_flags = _get_rg_flags(names) out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam" if file_exists(out_file): data = dd.set_work_bam(data, out_file) return data cmd = ( "{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} " "{rg_flags} ") if paired: cmd += "-1 {fastq_file} -2 {pair_file} " else: cmd += "-U {fastq_file} " if dd.get_analysis(data).lower() == "smallrna-seq": cmd += "-k 1000 " # if assembling transcripts, set flags that cufflinks/stringtie can use if dd.get_transcript_assembler(data): cmd += "--dta-cufflinks " if dd.get_analysis(data).lower() == "rna-seq": gtf_file = dd.get_gtf_file(data) splicesites = os.path.join(os.path.dirname(gtf_file), "ref-transcripts-splicesites.txt") cmd += "--known-splicesite-infile {splicesites} " message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file) with file_transaction(out_file) as tx_out_file: cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file) do.run(cmd.format(**locals()), message) data = dd.set_work_bam(data, out_file) return data
def _stringtie_expression(bam, data, out_dir="."): """ only estimate expression the Stringtie, do not assemble new transcripts """ gtf_file = dd.get_gtf_file(data) num_cores = dd.get_num_cores(data) error_message = "The %s file for %s is missing. StringTie has an error." stringtie = config_utils.get_program("stringtie", data, default="stringtie") # don't assemble transcripts unless asked exp_flag = ("-e" if "stringtie" not in dd.get_transcript_assembler(data) else "") base_cmd = ( "{stringtie} {exp_flag} -b {out_dir} -p {num_cores} -G {gtf_file} " "-o {out_gtf} {bam}") transcript_file = os.path.join(out_dir, "t_data.ctab") exon_file = os.path.join(out_dir, "e_data.ctab") out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf") if file_exists(transcript_file): return exon_file, transcript_file, out_gtf cmd = base_cmd.format(**locals()) do.run(cmd, "Running Stringtie on %s." % bam) assert file_exists(exon_file), error_message % ("exon", exon_file) assert file_exists(transcript_file), error_message % ("transcript", transcript_file) return transcript_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data): paired = True if pair_file else False hisat2 = config_utils.get_program("hisat2", data) num_cores = dd.get_num_cores(data) quality_flag = _get_quality_flag(data) stranded_flag = _get_stranded_flag(data, paired) rg_flags = _get_rg_flags(names) out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam" if file_exists(out_file): data = dd.set_work_bam(data, out_file) return data cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} " "{rg_flags} ") if paired: cmd += "-1 {fastq_file} -2 {pair_file} " else: cmd += "-U {fastq_file} " if dd.get_analysis(data).lower() == "smallrna-seq": cmd += "-k 1000 " # if assembling transcripts, set flags that cufflinks/stringtie can use if dd.get_transcript_assembler(data): cmd += "--dta-cufflinks " if dd.get_analysis(data).lower() == "rna-seq": gtf_file = dd.get_gtf_file(data) splicesites = os.path.join(os.path.dirname(gtf_file), "ref-transcripts-splicesites.txt") cmd += "--known-splicesite-infile {splicesites} " message = "Aligning %s and %s with hisat2." %(fastq_file, pair_file) with file_transaction(out_file) as tx_out_file: cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file) do.run(cmd.format(**locals()), message) data = dd.set_work_bam(data, out_file) return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data): paired = True if pair_file else False hisat2 = config_utils.get_program("hisat2", data) num_cores = dd.get_num_cores(data) quality_flag = _get_quality_flag(data) stranded_flag = _get_stranded_flag(data, paired) rg_flags = _get_rg_flags(names) out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data))) if data.get("align_split"): final_file = out_file out_file, data = alignprep.setup_combine(final_file, data) fastq_file, pair_file = alignprep.split_namedpipe_cls( fastq_file, pair_file, data) else: final_file = None if not file_exists(out_file) and (final_file is None or not file_exists(final_file)): cmd = ( "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} " "{rg_flags} ") if paired: cmd += "-1 {fastq_file} -2 {pair_file} " else: cmd += "-U {fastq_file} " if dd.get_analysis(data).lower() == "smallrna-seq": cmd += "-k 1000 " # if assembling transcripts, set flags that cufflinks/stringtie can use if dd.get_transcript_assembler(data): cmd += "--dta-cufflinks " if dd.get_analysis(data).lower() == "rna-seq": splicesites = get_known_splicesites_file(align_dir, data) if file_exists(splicesites): cmd += "--known-splicesite-infile {splicesites} " novel_splicesite_file = os.path.join( align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data))) cmd += "--novel-splicesite-outfile {novel_splicesite_file} " # apply additional hisat2 options cmd += " ".join(_get_options_from_config(data)) message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file) with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file): cmd += " | " + tobam_cl do.run(cmd.format(**locals()), message) data = dd.set_work_bam(data, out_file) junctionbed = get_splicejunction_file(align_dir, data) data = dd.set_junction_bed(data, junctionbed) return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data): paired = True if pair_file else False hisat2 = config_utils.get_program("hisat2", data) num_cores = dd.get_num_cores(data) quality_flag = _get_quality_flag(data) stranded_flag = _get_stranded_flag(data, paired) rg_flags = _get_rg_flags(names) out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data))) if data.get("align_split"): final_file = out_file out_file, data = alignprep.setup_combine(final_file, data) fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data) else: final_file = None if not file_exists(out_file) and (final_file is None or not file_exists(final_file)): cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} " "{rg_flags} ") if paired: cmd += "-1 {fastq_file} -2 {pair_file} " else: cmd += "-U {fastq_file} " if dd.get_analysis(data).lower() == "smallrna-seq": cmd += "-k 1000 " # if assembling transcripts, set flags that cufflinks/stringtie can use if dd.get_transcript_assembler(data): cmd += "--dta-cufflinks " if dd.get_analysis(data).lower() == "rna-seq": splicesites = get_known_splicesites_file(align_dir, data) if file_exists(splicesites): cmd += "--known-splicesite-infile {splicesites} " novel_splicesite_file = os.path.join(align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data))) cmd += "--novel-splicesite-outfile {novel_splicesite_file} " # apply additional hisat2 options cmd += " ".join(_get_options_from_config(data)) message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file) with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file): cmd += " | " + tobam_cl do.run(cmd.format(**locals()), message) data = dd.set_work_bam(data, out_file) junctionbed = get_splicejunction_file(align_dir, data) data = dd.set_junction_bed(data, junctionbed) return data
def _stringtie_expression(bam, data, out_dir="."): """ only estimate expression the Stringtie, do not assemble new transcripts """ gtf_file = dd.get_gtf_file(data) num_cores = dd.get_num_cores(data) error_message = "The %s file for %s is missing. StringTie has an error." stringtie = config_utils.get_program("stringtie", data, default="stringtie") # don't assemble transcripts unless asked exp_flag = ("-e" if "stringtie" not in dd.get_transcript_assembler(data) else "") base_cmd = ("{stringtie} {exp_flag} -b {out_dir} -p {num_cores} -G {gtf_file} " "-o {out_gtf} {bam}") transcript_file = os.path.join(out_dir, "t_data.ctab") exon_file = os.path.join(out_dir, "e_data.ctab") out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf") if file_exists(transcript_file): return exon_file, transcript_file, out_gtf cmd = base_cmd.format(**locals()) do.run(cmd, "Running Stringtie on %s." % bam) assert file_exists(exon_file), error_message % ("exon", exon_file) assert file_exists(transcript_file), error_message % ("transcript", transcript_file) return transcript_file
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists( os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({ "path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": "" }) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) if "summary" in sample and sample["summary"].get("metadata"): out.append({"path": sample["summary"]["metadata"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({ "path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check" }) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({ "path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster" }) if sample.get("mirge", {}): for fn in sample["mirge"]: out.append({"path": fn, "dir": "mirge"}) if sample.get("report", None): out.append({ "path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz" }) for x in sample.get("variants", []): if "pop_db" in x: out.append({ "path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"] }) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({ "path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"] }) suffix = "-annotated-decomposed" if tz.get_in( ("population", "decomposed"), x) else "-annotated" vcfs = _get_project_vcf(x, suffix) out.extend([_add_batch(f, sample) for f in vcfs]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break sv_project = set([]) for svcall in sample.get("sv", []): if svcall.get("variantcaller") == "seq2c": if svcall.get( "calls_all") and svcall["calls_all"] not in sv_project: out.append({ "path": svcall["coverage_all"], "batch": "seq2c", "ext": "coverage", "type": "tsv" }) out.append({ "path": svcall["read_mapping"], "batch": "seq2c", "ext": "read_mapping", "type": "txt" }) out.append({ "path": svcall["calls_all"], "batch": "seq2c", "ext": "calls", "type": "tsv" }) sv_project.add(svcall["calls_all"]) if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({ "path": all_coverage, "type": "bed", "ext": "coverage" }) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) out.append({"path": count_file + ".metadata", "type": "metadata"}) umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx" if utils.file_exists(umi_file): out.append({"path": umi_file, "type": "mtx"}) out.append({ "path": umi_file + ".rownames", "type": "rownames" }) out.append({ "path": umi_file + ".colnames", "type": "colnames" }) if dd.get_combined_histogram(sample): out.append({ "path": dd.get_combined_histogram(sample), "type": "txt" }) rda = os.path.join(os.path.dirname(count_file), "se.rda") if utils.file_exists(rda): out.append({"path": rda, "type": "rda"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_tximport(sample): out.append({"path": dd.get_tximport(sample)["gene_tpm"], "dir": "tpm"}) out.append({ "path": dd.get_tximport(sample)["gene_counts"], "dir": "counts" }) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) out.append({"path": "%s.ann" % dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) if tz.get_in(("peaks_files", "consensus", "main"), sample): out.append({ "path": tz.get_in(("peaks_files", "consensus", "main"), sample), "dir": "consensus" }) if tz.get_in(("peak_counts", "peaktable"), sample): out.append({ "path": tz.get_in(("peak_counts", "peaktable"), sample), "dir": "consensus" }) transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs", "transcriptome") if os.path.exists(transcriptome_dir): out.append({ "path": transcriptome_dir, "type": "directory", "ext": "transcriptome" }) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", None): out.append({"path": sample["seqcluster"], "type": "directory", "ext": "seqcluster"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({"path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) suffix = "-annotated-decomposed" if tz.get_in(("population", "decomposed"), x) else "-annotated" out.extend([_add_batch(x, sample) for x in _get_variant_file(x, ("population", "vcf"), suffix=suffix)]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) if "summary" in sample and sample["summary"].get("metadata"): out.append({"path": sample["summary"]["metadata"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({"path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster"}) if sample.get("mirge", {}): for fn in sample["mirge"]: out.append({"path": fn, "dir": "mirge"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) suffix = "-annotated-decomposed" if tz.get_in(("population", "decomposed"), x) else "-annotated" vcfs = _get_project_vcf(x, suffix) out.extend([_add_batch(f, sample) for f in vcfs]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break sv_project = set([]) for svcall in sample.get("sv", []): if svcall.get("variantcaller") == "seq2c": if svcall.get("calls_all") and svcall["calls_all"] not in sv_project: out.append({"path": svcall["coverage_all"], "batch": "seq2c", "ext": "coverage", "type": "tsv"}) out.append({"path": svcall["read_mapping"], "batch": "seq2c", "ext": "read_mapping", "type": "txt"}) out.append({"path": svcall["calls_all"], "batch": "seq2c", "ext": "calls", "type": "tsv"}) sv_project.add(svcall["calls_all"]) if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) out.append({"path": count_file + ".metadata", "type": "metadata"}) umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx" if utils.file_exists(umi_file): out.append({"path": umi_file, "type": "mtx"}) out.append({"path": umi_file + ".rownames", "type": "rownames"}) out.append({"path": umi_file + ".colnames", "type": "colnames"}) if dd.get_combined_histogram(sample): out.append({"path": dd.get_combined_histogram(sample), "type": "txt"}) rda = os.path.join(os.path.dirname(count_file), "se.rda") if utils.file_exists(rda): out.append({"path": rda, "type": "rda"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) out.append({"path": "%s.ann" % dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs", "transcriptome") if os.path.exists(transcriptome_dir): out.append({"path": transcriptome_dir, "type": "directory", "ext": "transcriptome"}) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists( os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({ "path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": "" }) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({ "path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check" }) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) if sample.get("seqcluster", None): out.append({ "path": sample["seqcluster"], "type": "directory", "ext": "seqcluster" }) for x in sample.get("variants", []): if "pop_db" in x: out.append({ "path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"] }) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({ "path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"] }) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({ "path": all_coverage, "type": "bed", "ext": "coverage" }) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) return _add_meta(out, config=upload_config)