def run_stringtie_expression(data):
    """
    estimate expression from Stringtie, using the bcbio datadict
    does not do transcriptome assembly

    Gene and isoform FPKM tables are written under ``stringtie/<sample name>``.
    When both tables already exist the Stringtie run is skipped and only the
    datadict is updated. If "stringtie" is among the configured transcript
    assemblers, the path of ``stringtie-assembly.gtf`` in the output directory
    is added to the sample's assembled GTFs.

    Returns the updated datadict.
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf"))

    # Only run Stringtie when one of the FPKM tables is missing; the datadict
    # bookkeeping below is the same for cached and fresh results.
    if not (file_exists(isoform_fpkm) and file_exists(gene_fpkm)):
        with file_transaction(data, out_dir) as tx_out_dir:
            transcript_file = _stringtie_expression(bam, gtf_file, num_cores,
                                                    tx_out_dir)
            df = _parse_ballgown(transcript_file)
            _write_fpkms(df, tx_out_dir, sample_name)

    # NOTE(review): the Stringtie output directory is registered through
    # set_cufflinks_dir -- confirm this is what downstream consumers expect.
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    if "stringtie" in dd.get_transcript_assembler(data):
        # Store the list back explicitly: an in-place append alone is not
        # guaranteed to reach the datadict if the getter handed back a
        # default value rather than the stored list.
        assembled_gtfs = dd.get_assembled_gtf(data)
        assembled_gtfs.append(assembly)
        data = dd.set_assembled_gtf(data, assembled_gtfs)
    return data
def run_stringtie_expression(data):
    """
    estimate expression from Stringtie, using the bcbio datadict
    does not do transcriptome assembly

    Skips the Stringtie run when both FPKM tables for the sample already
    exist, then records the output directory, the FPKM tables and (when
    "stringtie" is a configured transcript assembler) the assembly GTF path
    in the datadict, which is returned.
    """
    work_bam = dd.get_work_bam(data)
    name = dd.get_sample_name(data)
    result_dir = os.path.join("stringtie", name)
    isoform_table = os.path.join(result_dir, name + ".isoform.fpkm")
    gene_table = os.path.join(result_dir, name + ".fpkm")
    assembly_gtf = os.path.abspath(
        os.path.join(result_dir, "stringtie-assembly.gtf"))

    have_results = file_exists(isoform_table) and file_exists(gene_table)
    if not have_results:
        # Produce the FPKM tables inside a transactional directory.
        with file_transaction(data, result_dir) as tx_dir:
            ballgown_file = _stringtie_expression(work_bam, data, tx_dir)
            _write_fpkms(_parse_ballgown(ballgown_file), tx_dir, name)

    # Same datadict updates whether the results were cached or just created.
    data = dd.set_stringtie_dir(data, result_dir)
    data = dd.set_fpkm(data, gene_table)
    data = dd.set_fpkm_isoform(data, isoform_table)
    if "stringtie" in dd.get_transcript_assembler(data):
        gtfs = dd.get_assembled_gtf(data)
        gtfs.append(assembly_gtf)
        data = dd.set_assembled_gtf(data, gtfs)
    return data
def cufflinks_assemble(data):
    """Assemble transcripts for one sample with Cufflinks.

    Runs ``cufflinks.assemble`` on the sample's work BAM, writing to the
    ``assembly`` directory under the work directory, and adds the resulting
    GTF to the sample's assembled GTFs.

    Returns the updated datadict wrapped as ``[[data]]``.
    """
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    assembled_gtf = cufflinks.assemble(bam_file, ref_file, num_cores, out_dir,
                                       data)
    # Write the list back through the setter: an in-place append alone is not
    # guaranteed to reach the datadict if the getter returned a default value
    # rather than the stored list.
    assembled_gtfs = dd.get_assembled_gtf(data)
    assembled_gtfs.append(assembled_gtf)
    data = dd.set_assembled_gtf(data, assembled_gtfs)
    return [[data]]
def stringtie_merge(*samples):
    """Merge the assembled GTFs of all samples with Stringtie.

    Collects every sample's assembled GTFs, merges them with
    ``stringtie.merge`` using the reference, annotation GTF and core count of
    the first sample, and records the merged GTF on each sample.

    Returns a list with one single-element list per sample.
    """
    per_sample_gtfs = [dd.get_assembled_gtf(data)
                       for data in dd.sample_data_iterator(samples)]
    to_merge = filter_missing(flatten(per_sample_gtfs))

    # Shared settings are read from the first sample.
    first = samples[0][0]
    merged_gtf = stringtie.merge(to_merge,
                                 dd.get_sam_ref(first),
                                 dd.get_gtf_file(first),
                                 dd.get_num_cores(first),
                                 first)
    return [[dd.set_merged_gtf(data, merged_gtf)]
            for data in dd.sample_data_iterator(samples)]
def cufflinks_merge(*samples):
    """Merge the assembled GTFs of all samples with Cufflinks.

    Collects every sample's assembled GTFs, merges them with
    ``cufflinks.merge`` using the reference, annotation GTF and core count of
    the first sample, and stores the merged GTF as each sample's assembled
    GTF.

    Returns a list with one single-element list per sample.
    """
    # Each sample's assembled_gtf is a list of paths (assemblers append to
    # it), so flatten to one list of GTF files, as stringtie_merge does.
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    # Shared settings are read from the first sample.
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        # Keep the value returned by the setter; the original datadict is not
        # assumed to be modified in place.
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def cufflinks_merge(*samples):
    """Merge the assembled GTFs of all samples with Cufflinks.

    Collects every sample's assembled GTFs, merges them with
    ``cufflinks.merge`` using the reference, annotation GTF and core count of
    the first sample, and stores the merged GTF as each sample's assembled
    GTF.

    Returns a list with one single-element list per sample.
    """
    # Each sample's assembled_gtf is a list of paths (assemblers append to
    # it), so flatten to one list of GTF files, as stringtie_merge does.
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data) for data in
                                       dd.sample_data_iterator(samples)]))
    # Shared settings are read from the first sample.
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.

    Builds a list of ``{"path": ...}`` records (some with "type", "ext" or
    "variantcaller" keys) from the sample dictionary and returns the result
    of passing that list through ``_add_meta`` with the upload configuration.
    """
    out = [{"path": sample["provenance"]["programs"]}]

    # Pipeline log files, included only when present on disk.
    for log_name in ("bcbio-nextgen.log", "bcbio-nextgen-commands.log"):
        if not os.path.exists(os.path.join(log.get_log_dir(sample["config"]),
                                           log_name)):
            continue
        out.append({"path": os.path.join(log.get_log_dir(sample["config"]),
                                         log_name),
                    "type": "external_command_log",
                    "ext": ""})

    # Project-level summary outputs.
    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    if tz.get_in(["summary", "mixup_check"], sample):
        out.append({"path": sample["summary"]["mixup_check"],
                    "type": "directory",
                    "ext": "mixup_check"})

    # Variant outputs: three separate passes so the ordering of entries in
    # `out` stays grouped by kind.
    for variant in sample.get("variants", []):
        if "pop_db" in variant:
            out.append({"path": variant["pop_db"],
                        "type": "sqlite",
                        "variantcaller": variant["variantcaller"]})
    for variant in sample.get("variants", []):
        if "population" not in variant:
            continue
        pop_db = tz.get_in(["population", "db"], variant)
        if pop_db:
            out.append({"path": pop_db,
                        "type": "sqlite",
                        "variantcaller": variant["variantcaller"]})
        out.extend(_get_variant_file(variant, ("population", "vcf")))
    for variant in sample.get("variants", []):
        if variant.get("validate") and variant["validate"].get("grading_summary"):
            out.append({"path": variant["validate"]["grading_summary"]})
            # Only the first grading summary found is reported.
            break

    # Coverage outputs.
    if "coverage" in sample:
        for cov_key, cov_type in (("summary", "sqlite"), ("all", "bed")):
            cov_path = tz.get_in(["coverage", cov_key], sample)
            if cov_path:
                out.append({"path": cov_path,
                            "type": cov_type,
                            "ext": "coverage"})

    # Remaining datadict-backed outputs, added in this order when set.
    path_getters = (dd.get_combined_counts,
                    dd.get_annotated_combined_counts,
                    dd.get_combined_fpkm,
                    dd.get_combined_fpkm_isoform,
                    dd.get_assembled_gtf,
                    dd.get_dexseq_counts,
                    dd.get_express_counts,
                    dd.get_express_fpkm,
                    dd.get_express_tpm,
                    dd.get_isoform_to_gene,
                    dd.get_square_vcf)
    for getter in path_getters:
        if getter(sample):
            out.append({"path": getter(sample)})

    return _add_meta(out, config=upload_config)
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.

    Builds a list of ``{"path": ...}`` records (some with "type", "ext" or
    "variantcaller" keys) from the sample dictionary and returns the result
    of passing that list through ``_add_meta`` with the upload configuration.
    """
    out = [{"path": sample["provenance"]["programs"]}]

    # Pipeline log files, included only when present on disk.
    for log_name in ("bcbio-nextgen.log", "bcbio-nextgen-commands.log"):
        if not os.path.exists(os.path.join(log.get_log_dir(sample["config"]),
                                           log_name)):
            continue
        out.append({"path": os.path.join(log.get_log_dir(sample["config"]),
                                         log_name),
                    "type": "external_command_log",
                    "ext": ""})

    # Project-level summary outputs.
    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    if tz.get_in(["summary", "mixup_check"], sample):
        out.append({"path": sample["summary"]["mixup_check"],
                    "type": "directory",
                    "ext": "mixup_check"})

    # seqcluster output directory, when the sample carries one.
    if sample.get("seqcluster", None):
        out.append({"path": sample["seqcluster"],
                    "type": "directory",
                    "ext": "seqcluster"})

    # Variant outputs: three separate passes so the ordering of entries in
    # `out` stays grouped by kind.
    for variant in sample.get("variants", []):
        if "pop_db" in variant:
            out.append({"path": variant["pop_db"],
                        "type": "sqlite",
                        "variantcaller": variant["variantcaller"]})
    for variant in sample.get("variants", []):
        if "population" not in variant:
            continue
        pop_db = tz.get_in(["population", "db"], variant)
        if pop_db:
            out.append({"path": pop_db,
                        "type": "sqlite",
                        "variantcaller": variant["variantcaller"]})
        out.extend(_get_variant_file(variant, ("population", "vcf")))
    for variant in sample.get("variants", []):
        if variant.get("validate") and variant["validate"].get("grading_summary"):
            out.append({"path": variant["validate"]["grading_summary"]})
            # Only the first grading summary found is reported.
            break

    # Coverage outputs.
    if "coverage" in sample:
        for cov_key, cov_type in (("summary", "sqlite"), ("all", "bed")):
            cov_path = tz.get_in(["coverage", cov_key], sample)
            if cov_path:
                out.append({"path": cov_path,
                            "type": cov_type,
                            "ext": "coverage"})

    # Remaining datadict-backed outputs, added in this order when set.
    path_getters = (dd.get_combined_counts,
                    dd.get_annotated_combined_counts,
                    dd.get_combined_fpkm,
                    dd.get_combined_fpkm_isoform,
                    dd.get_assembled_gtf,
                    dd.get_dexseq_counts,
                    dd.get_express_counts,
                    dd.get_express_fpkm,
                    dd.get_express_tpm,
                    dd.get_isoform_to_gene,
                    dd.get_square_vcf)
    for getter in path_getters:
        if getter(sample):
            out.append({"path": getter(sample)})

    return _add_meta(out, config=upload_config)