def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) if "combined_counts" in sample: out.append({"path": sample["combined_counts"]}) if "annotated_combined_counts" in sample: out.append({"path": sample["annotated_combined_counts"]}) if "combined_fpkm" in sample: out.append({"path": sample["combined_fpkm"]}) if "combined_fpkm_isoform" in sample: out.append({"path": sample["combined_fpkm_isoform"]}) if "assembled_gtf" in sample: out.append({"path": sample["assembled_gtf"]}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "combined_counts" in sample: out.append({"path": sample["combined_counts"]}) if "annotated_combined_counts" in sample: out.append({"path": sample["annotated_combined_counts"]}) if "combined_fpkm" in sample: out.append({"path": sample["combined_fpkm"]}) if "combined_fpkm_isoform" in sample: out.append({"path": sample["combined_fpkm_isoform"]}) if "assembled_gtf" in sample: out.append({"path": sample["assembled_gtf"]}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) return _add_meta(out, config=upload_config)
def combine_files(samples): """ after quantitation, combine the counts/FPKM/TPM/etc into a single table with all samples """ gtf_file = dd.get_gtf_file(samples[0][0], None) dexseq_gff = dd.get_dexseq_gff(samples[0][0]) # combine featureCount files count_files = filter_missing([dd.get_count_file(x[0]) for x in samples]) combined = count.combine_count_files(count_files, ext=".counts") annotated = count.annotate_combined_count_file(combined, gtf_file) # combine eXpress files express_counts_combined = combine_express(samples, combined) # combine Cufflinks files fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples]) if fpkm_files: fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file) else: fpkm_combined = None fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm" isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples]) if isoform_files: fpkm_isoform_combined = count.combine_count_files(isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm") else: fpkm_isoform_combined = None # combine DEXseq files dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples]) if to_combine_dexseq: dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined) else: dexseq_combined = None samples = spikein.combine_spikein(samples) updated_samples = [] for data in dd.sample_data_iterator(samples): data = dd.set_combined_counts(data, combined) if annotated: data = dd.set_annotated_combined_counts(data, annotated) if fpkm_combined: data = dd.set_combined_fpkm(data, fpkm_combined) if fpkm_isoform_combined: data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined) if express_counts_combined: data = dd.set_express_counts(data, express_counts_combined['counts']) data = dd.set_express_tpm(data, express_counts_combined['tpm']) data = dd.set_express_fpkm(data, express_counts_combined['fpkm']) data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene']) if dexseq_combined: data = dd.set_dexseq_counts(data, dexseq_combined_file) updated_samples.append([data]) return updated_samples
def estimate_expression(samples, run_parallel): samples = run_parallel("generate_transcript_counts", samples) count_files = filter_missing([dd.get_count_file(x[0]) for x in samples]) combined = count.combine_count_files(count_files) gtf_file = dd.get_gtf_file(samples[0][0], None) annotated = count.annotate_combined_count_file(combined, gtf_file) samples = run_parallel("run_express", samples) express_counts_combined = combine_express(samples, combined) samples = run_parallel("run_cufflinks", samples) #gene fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples]) fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file) #isoform fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm" isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples]) fpkm_isoform_combined = count.combine_count_files(isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm") dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples]) if to_combine_dexseq: dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") else: dexseq_combined = None updated_samples = [] for data in dd.sample_data_iterator(samples): data = dd.set_combined_counts(data, combined) if annotated: data = dd.set_annotated_combined_counts(data, annotated) if fpkm_combined: data = dd.set_combined_fpkm(data, fpkm_combined) if fpkm_isoform_combined: data = dd.set_combined_fpkm_isoform(data, fpkm_combined) if express_counts_combined: data = dd.set_express_counts(data, express_counts_combined['counts']) data = dd.set_express_tpm(data, express_counts_combined['tpm']) data = dd.set_express_fpkm(data, express_counts_combined['fpkm']) if dexseq_combined: data = dd.set_dexseq_counts(data, dexseq_combined_file) updated_samples.append([data]) return updated_samples
def estimate_expression(samples, run_parallel): samples = run_parallel("generate_transcript_counts", samples) combined = count.combine_count_files( [x[0]["count_file"] for x in samples if "count_file" in x[0]]) gtf_file = dd.get_gtf_file(samples[0][0], None) annotated = count.annotate_combined_count_file(combined, gtf_file) samples = run_parallel("run_cufflinks", samples) #gene fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]] fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file) #isoform fpkm_isoform_combined_file = os.path.splitext( combined)[0] + ".isoform.fpkm" to_combine_isoform = [ x[0]["fpkm_isoform"] for x in samples if "fpkm_isoform" in x[0] ] fpkm_isoform_combined = count.combine_count_files( to_combine_isoform, fpkm_isoform_combined_file, ".isoform.fpkm") dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" to_combine_dexseq = [dd.get_dexseq_counts(data[0]) for data in samples] to_combine_dexseq = filter(lambda x: x, to_combine_dexseq) if to_combine_dexseq: dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") else: dexseq_combined = None for x in samples: x[0]["combined_counts"] = combined if annotated: x[0]["annotated_combined_counts"] = annotated if fpkm_combined: x[0]["combined_fpkm"] = fpkm_combined if fpkm_isoform_combined: x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined if dexseq_combined: x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file) return samples
def estimate_expression(samples, run_parallel): samples = run_parallel("generate_transcript_counts", samples) combined = count.combine_count_files([x[0]["count_file"] for x in samples if "count_file" in x[0]]) gtf_file = dd.get_gtf_file(samples[0][0], None) annotated = count.annotate_combined_count_file(combined, gtf_file) samples = run_parallel("run_cufflinks", samples) #gene fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" to_combine = [x[0]["fpkm"] for x in samples if "fpkm" in x[0]] fpkm_combined = count.combine_count_files(to_combine, fpkm_combined_file) #isoform fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm" to_combine_isoform = [x[0]["fpkm_isoform"] for x in samples if "fpkm_isoform" in x[0]] fpkm_isoform_combined = count.combine_count_files(to_combine_isoform, fpkm_isoform_combined_file, ".isoform.fpkm") dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" to_combine_dexseq = [dd.get_dexseq_counts(data[0]) for data in samples] to_combine_dexseq = filter(lambda x: x, to_combine_dexseq) if to_combine_dexseq: dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") else: dexseq_combined = None for x in samples: x[0]["combined_counts"] = combined if annotated: x[0]["annotated_combined_counts"] = annotated if fpkm_combined: x[0]["combined_fpkm"] = fpkm_combined if fpkm_isoform_combined: x[0]["combined_fpkm_isoform"] = fpkm_isoform_combined if dexseq_combined: x[0] = dd.set_dexseq_counts(x[0], dexseq_combined_file) return samples
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists( os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({ "path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": "" }) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) if "summary" in sample and sample["summary"].get("metadata"): out.append({"path": sample["summary"]["metadata"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({ "path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check" }) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({ "path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster" }) if sample.get("mirge", {}): for fn in sample["mirge"]: out.append({"path": fn, "dir": "mirge"}) if sample.get("report", None): out.append({ "path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz" }) for x in sample.get("variants", []): if "pop_db" in x: out.append({ "path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"] }) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({ "path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"] }) suffix = "-annotated-decomposed" if tz.get_in( ("population", "decomposed"), x) else "-annotated" vcfs = _get_project_vcf(x, suffix) out.extend([_add_batch(f, sample) for f in vcfs]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break sv_project = set([]) for svcall in sample.get("sv", []): if svcall.get("variantcaller") == "seq2c": if svcall.get( "calls_all") and svcall["calls_all"] not in sv_project: out.append({ "path": svcall["coverage_all"], "batch": "seq2c", "ext": "coverage", "type": "tsv" }) out.append({ "path": svcall["read_mapping"], "batch": "seq2c", "ext": "read_mapping", "type": "txt" }) out.append({ "path": svcall["calls_all"], "batch": "seq2c", "ext": "calls", "type": "tsv" }) sv_project.add(svcall["calls_all"]) if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({ "path": all_coverage, "type": "bed", "ext": "coverage" }) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) out.append({"path": count_file + ".metadata", "type": "metadata"}) umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx" if utils.file_exists(umi_file): out.append({"path": umi_file, "type": "mtx"}) out.append({ "path": umi_file + ".rownames", "type": "rownames" }) out.append({ "path": umi_file + ".colnames", "type": "colnames" }) if dd.get_combined_histogram(sample): out.append({ "path": dd.get_combined_histogram(sample), "type": "txt" }) rda = os.path.join(os.path.dirname(count_file), "se.rda") if utils.file_exists(rda): out.append({"path": rda, "type": "rda"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_tximport(sample): out.append({"path": dd.get_tximport(sample)["gene_tpm"], "dir": "tpm"}) out.append({ "path": dd.get_tximport(sample)["gene_counts"], "dir": "counts" }) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) out.append({"path": "%s.ann" % dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) if tz.get_in(("peaks_files", "consensus", "main"), sample): out.append({ "path": tz.get_in(("peaks_files", "consensus", "main"), sample), "dir": "consensus" }) if tz.get_in(("peak_counts", "peaktable"), sample): out.append({ "path": tz.get_in(("peak_counts", "peaktable"), sample), "dir": "consensus" }) transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs", "transcriptome") if os.path.exists(transcriptome_dir): out.append({ "path": transcriptome_dir, "type": "directory", "ext": "transcriptome" }) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", None): out.append({"path": sample["seqcluster"], "type": "directory", "ext": "seqcluster"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) return _add_meta(out, config=upload_config)
def combine_files(samples): """ after quantitation, combine the counts/FPKM/TPM/etc into a single table with all samples """ data = samples[0][0] # prefer the supplied transcriptome gtf file gtf_file = dd.get_transcriptome_gtf(data, None) if not gtf_file: gtf_file = dd.get_gtf_file(data, None) dexseq_gff = dd.get_dexseq_gff(data) # combine featureCount files count_files = filter_missing([dd.get_count_file(x[0]) for x in samples]) combined = count.combine_count_files(count_files, ext=".counts") annotated = count.annotate_combined_count_file(combined, gtf_file) # add tx2gene file tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv") if gtf_file: tx2gene_file = sailfish.create_combined_tx2gene(data) # combine eXpress files express_counts_combined = combine_express(samples, combined) # combine Cufflinks files fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples]) if fpkm_files: fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file) else: fpkm_combined = None isoform_files = filter_missing( [dd.get_fpkm_isoform(x[0]) for x in samples]) if isoform_files: fpkm_isoform_combined_file = os.path.splitext( combined)[0] + ".isoform.fpkm" fpkm_isoform_combined = count.combine_count_files( isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm") else: fpkm_isoform_combined = None # combine DEXseq files to_combine_dexseq = filter_missing( [dd.get_dexseq_counts(data[0]) for data in samples]) if to_combine_dexseq: dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined) else: dexseq_combined = None samples = spikein.combine_spikein(samples) updated_samples = [] for data in dd.sample_data_iterator(samples): if combined: data = dd.set_combined_counts(data, combined) if annotated: data = dd.set_annotated_combined_counts(data, annotated) if fpkm_combined: data = dd.set_combined_fpkm(data, fpkm_combined) if fpkm_isoform_combined: data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined) if express_counts_combined: data = dd.set_express_counts(data, express_counts_combined['counts']) data = dd.set_express_tpm(data, express_counts_combined['tpm']) data = dd.set_express_fpkm(data, express_counts_combined['fpkm']) data = dd.set_isoform_to_gene( data, express_counts_combined['isoform_to_gene']) if dexseq_combined: data = dd.set_dexseq_counts(data, dexseq_combined_file) if gtf_file: data = dd.set_tx2gene(data, tx2gene_file) updated_samples.append([data]) return updated_samples
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({"path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) suffix = "-annotated-decomposed" if tz.get_in(("population", "decomposed"), x) else "-annotated" out.extend([_add_batch(x, sample) for x in _get_variant_file(x, ("population", "vcf"), suffix=suffix)]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) if sample.get("seqcluster", None): out.append({"path": sample["seqcluster"], "type": "directory", "ext": "seqcluster"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_combined_counts(sample): out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_assembled_gtf(sample): out.append({"path": dd.get_assembled_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) if "summary" in sample and sample["summary"].get("metadata"): out.append({"path": sample["summary"]["metadata"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({"path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster"}) if sample.get("mirge", {}): for fn in sample["mirge"]: out.append({"path": fn, "dir": "mirge"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) suffix = "-annotated-decomposed" if tz.get_in(("population", "decomposed"), x) else "-annotated" vcfs = _get_project_vcf(x, suffix) out.extend([_add_batch(f, sample) for f in vcfs]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break sv_project = set([]) for svcall in sample.get("sv", []): if svcall.get("variantcaller") == "seq2c": if svcall.get("calls_all") and svcall["calls_all"] not in sv_project: out.append({"path": svcall["coverage_all"], "batch": "seq2c", "ext": "coverage", "type": "tsv"}) out.append({"path": svcall["read_mapping"], "batch": "seq2c", "ext": "read_mapping", "type": "txt"}) out.append({"path": svcall["calls_all"], "batch": "seq2c", "ext": "calls", "type": "tsv"}) sv_project.add(svcall["calls_all"]) if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) out.append({"path": count_file + ".metadata", "type": "metadata"}) umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx" if utils.file_exists(umi_file): out.append({"path": umi_file, "type": "mtx"}) out.append({"path": umi_file + ".rownames", "type": "rownames"}) out.append({"path": umi_file + ".colnames", "type": "colnames"}) if dd.get_combined_histogram(sample): out.append({"path": dd.get_combined_histogram(sample), "type": "txt"}) rda = os.path.join(os.path.dirname(count_file), "se.rda") if utils.file_exists(rda): out.append({"path": rda, "type": "rda"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) out.append({"path": "%s.ann" % dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs", "transcriptome") if os.path.exists(transcriptome_dir): out.append({"path": transcriptome_dir, "type": "directory", "ext": "transcriptome"}) return _add_meta(out, config=upload_config)
def combine_files(samples): """ after quantitation, combine the counts/FPKM/TPM/etc into a single table with all samples """ data = samples[0][0] # prefer the supplied transcriptome gtf file gtf_file = dd.get_transcriptome_gtf(data, None) if not gtf_file: gtf_file = dd.get_gtf_file(data, None) dexseq_gff = dd.get_dexseq_gff(data) # combine featureCount files count_files = filter_missing([dd.get_count_file(x[0]) for x in samples]) combined = count.combine_count_files(count_files, ext=".counts") annotated = count.annotate_combined_count_file(combined, gtf_file) # add tx2gene file tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv") if gtf_file: tx2gene_file = sailfish.create_combined_tx2gene(data) # combine eXpress files express_counts_combined = combine_express(samples, combined) # combine Cufflinks files fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples]) if fpkm_files and combined: fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file) else: fpkm_combined = None isoform_files = filter_missing([dd.get_fpkm_isoform(x[0]) for x in samples]) if isoform_files and combined: fpkm_isoform_combined_file = os.path.splitext(combined)[0] + ".isoform.fpkm" fpkm_isoform_combined = count.combine_count_files(isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm") else: fpkm_isoform_combined = None # combine DEXseq files to_combine_dexseq = filter_missing([dd.get_dexseq_counts(data[0]) for data in samples]) if to_combine_dexseq and combined: dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") if dexseq_combined: dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined) else: dexseq_combined = None samples = spikein.combine_spikein(samples) updated_samples = [] for data in dd.sample_data_iterator(samples): if combined: data = dd.set_combined_counts(data, combined) if annotated: data = dd.set_annotated_combined_counts(data, annotated) if fpkm_combined: data = dd.set_combined_fpkm(data, fpkm_combined) if fpkm_isoform_combined: data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined) if express_counts_combined: data = dd.set_express_counts(data, express_counts_combined['counts']) data = dd.set_express_tpm(data, express_counts_combined['tpm']) data = dd.set_express_fpkm(data, express_counts_combined['fpkm']) data = dd.set_isoform_to_gene(data, express_counts_combined['isoform_to_gene']) if dexseq_combined: data = dd.set_dexseq_counts(data, dexseq_combined_file) if gtf_file: data = dd.set_tx2gene(data, tx2gene_file) updated_samples.append([data]) return updated_samples