def unpack_tarballs(xs, data, use_subdir=True):
    """Unpack workflow tarballs into ready to use directories.
    """
    if isinstance(xs, dict):
        for k, v in xs.items():
            xs[k] = unpack_tarballs(v, data, use_subdir)
    elif isinstance(xs, (list, tuple)):
        xs = [unpack_tarballs(x, data, use_subdir) for x in xs]
    elif isinstance(xs, six.string_types):
        if os.path.isfile(xs.encode("utf-8", "ignore")) and xs.endswith("-wf.tar.gz"):
            if use_subdir:
                tarball_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "wf-inputs"))
            else:
                tarball_dir = dd.get_work_dir(data)
            out_dir = os.path.join(tarball_dir,
                                   os.path.basename(xs).replace("-wf.tar.gz", "").replace("--", os.path.sep))
            if not os.path.exists(out_dir):
                with utils.chdir(tarball_dir):
                    with tarfile.open(xs, "r:gz") as tar:
                        tar.extractall()
            assert os.path.exists(out_dir), out_dir
            # Default to representing output directory
            xs = out_dir
            # Look for aligner indices
            for fname in os.listdir(out_dir):
                if fname.endswith(DIR_TARGETS):
                    xs = os.path.join(out_dir, fname)
                    break
                elif fname.endswith(BASENAME_TARGETS):
                    base = os.path.join(out_dir, utils.splitext_plus(os.path.basename(fname))[0])
                    xs = glob.glob("%s*" % base)
                    break
    return xs
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        if utils.file_exists(data["collapse"]):
            data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data), out_file, data)
            data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config'])
        else:
            logger.debug("Trimmed collapsed file is empty for %s." % names)
    else:
        logger.debug("No annotation file from miRBase.")
    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,
                                              op.join(dd.get_work_dir(data), "mirdeep2", "novel"), data['config'])
    if "trna" in tools:
        data['trna'] = _mint_trna_annotation(data)
    data = spikein.counts_spikein(data)
    return [[data]]
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq",
                                              "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation",
                                                                       "rnaseq", "gatk-haplotype", "regions")),
                                       "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    ready_files = []
    should_gzip = True
    # Bowtie does not accept gzipped fastq
    if 'bowtie' in data['reference'].keys():
        should_gzip = False
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif objectstore.is_remote(fname):
            ready_files.append(fname)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not(dd.get_trim_reads(data)) and dd.get_quality_format(data) != "standard":
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=out_dir))
        else:
            ready_files.append(fname)
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, out_dir) for x in ready_files]
    for in_file in ready_files:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file = dd.get_transcriptome_gtf(data, None)
    if gtf_file:
        gene_map_file = os.path.join(dd.get_work_dir(data), "annotation",
                                     os.path.splitext(gtf_file)[0] + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""
    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
def _symlink_to_workdir(data, key):
    """For CWL support, symlink files into a working directory if in read-only imports.
    """
    orig_file = tz.get_in(key, data)
    if orig_file and not orig_file.startswith(dd.get_work_dir(data)):
        variantcaller = genotype.get_variantcaller(data)
        out_file = os.path.join(dd.get_work_dir(data), variantcaller, os.path.basename(orig_file))
        utils.safe_makedir(os.path.dirname(out_file))
        utils.symlink_plus(orig_file, out_file)
        data = tz.update_in(data, key, lambda x: out_file)
    return data
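# (Illustrative sketch, not part of bcbio) The helper above rewrites a nested key path with
# toolz's get_in/update_in; the same pattern on a small, hypothetical dictionary:
import toolz as tz

example = {"vrn_file_orig": {"path": "/read-only/imports/sample.vcf.gz"}}
key = ["vrn_file_orig", "path"]
orig_file = tz.get_in(key, example)            # -> "/read-only/imports/sample.vcf.gz"
example = tz.update_in(example, key, lambda x: "/work/gatk/sample.vcf.gz")
assert tz.get_in(key, example) == "/work/gatk/sample.vcf.gz"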
def run_cluster(data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    out_dir = os.path.join(dd.get_work_dir(data[0]), "seqcluster", "cluster")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(dd.get_work_dir(data[0]), "seqcluster", "prepare")
    bam_file = op.join(dd.get_work_dir(data[0]), "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(data[0]),
                           dd.get_srna_gtf_file(data[0]))
    for sample in data:
        sample["seqcluster"] = out_dir
    return [data]
def filter_barcodes(data):
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, six.string_types):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def __init__(self, data):
    self._db_location = self._get_ericscript_db(data)
    self._sample_name = dd.get_lane(data)
    self._work_dir = dd.get_work_dir(data)
    self._env = None
    self._output_dir = None
    self._sample_out_dir = None
def run(data): """Quantitaive isoforms expression by eXpress""" name = dd.get_sample_name(data) in_bam = dd.get_transcriptome_bam(data) config = data['config'] if not in_bam: logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.") return data gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data)) out_dir = os.path.join(dd.get_work_dir(data), "express", name) out_file = os.path.join(out_dir, name + ".xprs") express = config_utils.get_program("express", data['config']) strand = _set_stranded_flag(in_bam, data) if not file_exists(out_file): with tx_tmpdir(data) as tmp_dir: with file_transaction(out_dir) as tx_out_dir: bam_file = _prepare_bam_file(in_bam, tmp_dir, config) cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}") do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {}) shutil.move(os.path.join(out_dir, "results.xprs"), out_file) eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7) tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14) fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10) data = dd.set_express_counts(data, eff_count_file) data = dd.set_express_tpm(data, tpm_file) data = dd.set_express_fpkm(data, fpkm_file) return data
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """
    Parse miraligner files to create count matrix.
    """
    work_dir = dd.get_work_dir(data[0][0])
    if not out_dir:
        out_dir = op.join(work_dir, "mirbase")
    out_novel_isomir = append_stem(op.join(out_dir, "counts.tsv"), stem)
    out_novel_mirna = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    logger.debug("Create %s count data at %s." % (srna_type, out_dir))
    if file_exists(out_novel_mirna):
        return [out_novel_mirna, out_novel_isomir]
    out_dts = []
    for sample in data:
        if sample[0].get(srna_type):
            miraligner_fn = sample[0][srna_type]
            reads = _read_miraligner(miraligner_fn)
            if reads:
                out_file, dt, dt_pre = _tab_output(reads, miraligner_fn + ".back",
                                                   dd.get_sample_name(sample[0]))
                out_dts.append(dt)
            else:
                logger.debug("WARNING::%s has no miRNAs annotated for %s. Check whether the fasta file "
                             "is too small or the species value is wrong." % (dd.get_sample_name(sample[0]), srna_type))
    if out_dts:
        out_files = _create_counts(out_dts, out_dir)
        out_files = [move_safe(out_files[0], out_novel_isomir),
                     move_safe(out_files[1], out_novel_mirna)]
        return out_files
    else:
        logger.debug("WARNING::no samples have miRNAs annotated for %s. Check whether the fasta file "
                     "is too small or the species value is wrong." % srna_type)
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(sample),
                           dd.get_srna_gtf_file(sample))
    sample["report"] = _report(sample, dd.get_ref_file(sample))
    sample["seqcluster"] = out_dir
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
def _maybe_add_salmon_files(algorithm, sample, out):
    salmon_dir = os.path.join(dd.get_work_dir(sample), "salmon",
                              dd.get_sample_name(sample), "quant")
    if os.path.exists(salmon_dir):
        out.append({"path": salmon_dir,
                    "type": "directory",
                    "ext": "salmon"})
    return out
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. " % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
def run_prepare(*data):
    """
    Run seqcluster prepare to merge all samples in one file
    """
    out_dir = os.path.join(dd.get_work_dir(data[0][0]), "seqcluster", "prepare")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = os.path.join(out_dir, "prepare")
    tools = dd.get_expression_caller(data[0][0])
    if len(tools) == 0:
        logger.info("You didn't specify any other expression caller tool. "
                    "You can add them to the YAML file: "
                    "expression_caller: [trna, seqcluster, mirdeep2]")
    fn = []
    for sample in data:
        name = sample[0]["rgnames"]['sample']
        fn.append("%s\t%s" % (sample[0]['collapse'], name))
    args = namedtuple('args', 'debug print_debug minc minl maxl out')
    args = args(False, False, 2, 17, 40, out_dir)
    ma_out = op.join(out_dir, "seqs.ma")
    seq_out = op.join(out_dir, "seqs.fastq")
    min_shared = max(int(len(fn) / 10.0), 1)
    if not file_exists(ma_out):
        seq_l, sample_l = prepare._read_fastq_files(fn, args)
        with file_transaction(ma_out) as ma_tx:
            with open(ma_tx, 'w') as ma_handle:
                with open(seq_out, 'w') as seq_handle:
                    prepare._create_matrix_uniq_seq(sample_l, seq_l, ma_handle, seq_handle, min_shared)
    for sample in data:
        sample[0]["seqcluster_prepare_ma"] = ma_out
        sample[0]["seqcluster_prepare_fastq"] = seq_out
    return data
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["work_bam"]
    if "seqcluster" in tools:
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"], out_dir,
                                        dd.get_ref_file(sample), dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
def _merge_fastqc(samples):
    """
    merge all fastqc samples into one by module
    """
    fastqc_list = collections.defaultdict(list)
    seen = set()
    for data in samples:
        name = dd.get_sample_name(data)
        if name in seen:
            continue
        seen.add(name)
        fns = glob.glob(os.path.join(dd.get_work_dir(data), "qc", dd.get_sample_name(data), "fastqc") + "/*")
        for fn in fns:
            if fn.endswith("tsv"):
                metric = os.path.basename(fn)
                fastqc_list[metric].append([name, fn])
    for metric in fastqc_list:
        dt_by_sample = []
        for fn in fastqc_list[metric]:
            dt = pd.read_csv(fn[1], sep="\t")
            dt['sample'] = fn[0]
            dt_by_sample.append(dt)
        dt = utils.rbind(dt_by_sample)
        dt.to_csv(metric, sep="\t", index=False, mode='w')
    return samples
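# (Illustrative sketch, not part of bcbio) _merge_fastqc above row-binds per-sample FastQC
# tables with bcbio's utils.rbind before writing one TSV per metric; plain pandas does the
# same with concat. The column names and output file name below are hypothetical.
import pandas as pd

dt_by_sample = [
    pd.DataFrame({"metric": ["q30"], "value": [0.95], "sample": ["s1"]}),
    pd.DataFrame({"metric": ["q30"], "value": [0.91], "sample": ["s2"]}),
]
merged = pd.concat(dt_by_sample, ignore_index=True)   # row-bind, like utils.rbind
merged.to_csv("Per_base_sequence_quality.tsv", sep="\t", index=False)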
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd.format(**locals()), "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
def coverage(data):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
def report_summary(samples, run_parallel):
    """
    Run coverage report with bcbiocov package
    """
    work_dir = dd.get_work_dir(samples[0][0])
    parent_dir = utils.safe_makedir(os.path.join(work_dir, "report"))
    qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        if qsignature_fn:
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "qsignature.ma")
        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)
        out_dir = utils.safe_makedir("coverage")
        out_dir = utils.safe_makedir("variants")
        samples = run_parallel("coverage_report", samples)
        try:
            import bcbreport.prepare as bcbreport
            bcbreport.report(parent_dir)
        except:
            logger.info("skipping report. No bcbreport installed.")
            pass
        logger.info("summarize metrics")
        samples = _merge_metrics(samples)
    return samples
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])
    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd.format(**locals()), "Running mirdeep2.")
            except:
                logger.warning("mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(sailfish_dir, "quant")
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    build_string = get_build_string(data)
    sailfish_idx = os.path.join(dd.get_work_dir(data), "sailfish", "index", build_string)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, quant_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
        _sleuthify_sailfish(tx_out_dir)
    return out_file
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.
    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [x for x in tz.get_in(hla_key, data, []) if x and x != "None"]
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_files in hla_sample_files:
            for hla_file in hla_files:
                rehla = re.search(r".hla.(?P<hlatype>[\w-]+).fq", hla_file)
                if rehla:
                    hlatype = rehla.group("hlatype")
                    out_files[hlatype].append(hla_file)
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                         dd.get_sample_name(data), "hla"))
            merged_hlas = []
            for hlatype, files in out_files.items():
                out_file = os.path.join(hla_outdir, "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files], out_file, data)
                merged_hlas.append(out_file)
            data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    else:
        data = tz.update_in(data, hla_key, lambda x: None)
    return data
def umi_transform(data): """ transform each read by identifying the barcode and UMI for each read and putting the information in the read name """ fqfiles = data["files"] fqfiles.extend(list(repeat("", 4 - len(fqfiles)))) fq1, fq2, fq3, fq4 = fqfiles umi_dir = os.path.join(dd.get_work_dir(data), "umis") safe_makedir(umi_dir) transform = dd.get_umi_type(data) if not transform: logger.info( "No UMI transform specified, assuming pre-transformed data.") if is_transformed(fq1): logger.info( "%s detected as pre-transformed, passing it on unchanged." % fq1) data["files"] = [fq1] return [[data]] else: logger.error( "No UMI transform was specified, but %s does not look " "pre-transformed." % fq1) sys.exit(1) if file_exists(transform): transform_file = transform else: transform_file = get_transform_file(transform) if not file_exists(transform_file): logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " "or the transform is not supported by bcbio. Supported " "transforms are %s." % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS))) sys.exit(1) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] cellular_barcodes = get_cellular_barcodes(data) if len(cellular_barcodes) > 1: split_option = "--separate_cb" else: split_option = "" if dd.get_demultiplexed(data): demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data) split_option = "" else: demuxed_option = "" cores = dd.get_num_cores(data) # skip transformation if the file already looks transformed with open_fastq(fq1) as in_handle: read = next(in_handle) if "UMI_" in read: data["files"] = [out_file] return [[data]] locale_export = utils.locale_export() umis = _umis_cmd(data) cmd = ("{umis} fastqtransform {split_option} {transform_file} " "--cores {cores} {demuxed_option} " "{fq1} {fq2} {fq3} {fq4}" "| seqtk seq -L 20 - | gzip > {tx_out_file}") message = ( "Inserting UMI and barcode information into the read name of %s" % fq1) with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
def get_kallisto_fusions(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "quant")
    return os.path.join(kallisto_dir, "fusion.txt")
def trim_srna_sample(data): """ Remove 3' adapter for smallRNA-seq Uses cutadapt but with different parameters than for other pipelines. """ data = umi_transform(data) in_file = data["files"][0] names = data["rgnames"]['sample'] work_dir = os.path.join(dd.get_work_dir(data), "trimmed") out_dir = os.path.join(work_dir, names) log_out = os.path.join(out_dir, "%s.log" % names) utils.safe_makedir(out_dir) out_file = replace_directory(append_stem(in_file, ".clean"), out_dir) trim_reads = data["config"]["algorithm"].get("trim_reads", True) if utils.file_exists(out_file): data["files"][0] = out_file data["clean_fastq"] = out_file data["collapse"] = _collapse(data["clean_fastq"]) data["size_stats"] = _summary(data['collapse']) data["log_trimming"] = log_out return [[data]] adapter = dd.get_adapters(data) is_4n = any([a == "4N" for a in adapter]) adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)] if adapter and not trim_reads: trim_reads = True logger.info( "Adapter is set up in config file, but trim_reads is not true." "If you want to skip trimming, skip adapter option from config.") if trim_reads and not adapter and error_dnapi: raise ValueError(error_dnapi) if trim_reads: adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir) times = "" if not trim_reads or len( adapters) == 1 else "--times %s" % len(adapters) if trim_reads and adapters: adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters)) if any([a for a in adapters if re.compile("^N+$").match(a)]): adapter_cmd = "-N %s" % adapter_cmd out_noadapter_file = replace_directory( append_stem(in_file, ".fragments"), out_dir) out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir) # atropos = _get_atropos() atropos = config_utils.get_program("atropos", data, default="atropos") options = " ".join( data.get('resources', {}).get('atropos', {}).get("options", "")) if options.strip() == "-u 4 -u -4": options = "" is_4n = "4N" cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "") if " ".join( data.get('resources', {}).get('cutadapt', {}).get("options", "")): raise ValueError( "Atropos is now used, but cutadapt options found in YAML file." "See https://atropos.readthedocs.io/en/latest/") cmd = _cmd_atropos() if not utils.file_exists(out_file): with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), "remove adapter for %s" % names) if utils.file_exists(log_out): content = open(log_out).read().replace( out_short_file, names) open(log_out, 'w').write(content) if is_4n: options = "-u 4 -u -4" in_file = append_stem(tx_out_file, ".tmp") utils.move_safe(tx_out_file, in_file) cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17" do.run( cmd.format(**locals()), "atropos with this parameters %s for %s" % (options, names)) data["log_trimming"] = log_out else: if not trim_reads: logger.debug("Skip trimming for: %s" % names) elif not adapters: logger.info("No adapter founds in %s, this is an issue related" " to no small RNA enrichment in your sample." % names) symlink_plus(in_file, out_file) data["files"][0] = out_file data["clean_fastq"] = out_file data["collapse"] = _collapse(data["clean_fastq"]) data["size_stats"] = _summary(data['collapse']) return [[data]]
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": ""}) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) if "summary" in sample and sample["summary"].get("metadata"): out.append({"path": sample["summary"]["metadata"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({"path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check"}) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) ataqv = tz.get_in(["ataqv_report"], sample) if ataqv: out.extend(_flatten_file_with_secondary(ataqv, "ataqv")) if sample.get("seqcluster", {}): out.append({"path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster"}) if sample.get("mirge", {}): for fn in sample["mirge"]: out.append({"path": fn, "dir": "mirge"}) if sample.get("report", None): out.append({"path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz"}) for x in sample.get("variants", []): if "pop_db" in x: out.append({"path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"]}) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({"path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"]}) suffix = "-annotated-decomposed" if tz.get_in(("population", "decomposed"), x) else "-annotated" vcfs = _get_project_vcf(x, suffix) out.extend([_add_batch(f, sample) for f in vcfs]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break sv_project = set([]) for svcall in sample.get("sv", []): if svcall.get("variantcaller") == "seq2c": if svcall.get("calls_all") and svcall["calls_all"] not in sv_project: out.append({"path": svcall["coverage_all"], "batch": "seq2c", "ext": "coverage", "type": "tsv"}) out.append({"path": svcall["read_mapping"], "batch": "seq2c", "ext": "read_mapping", "type": "txt"}) out.append({"path": svcall["calls_all"], "batch": "seq2c", "ext": "calls", "type": "tsv"}) sv_project.add(svcall["calls_all"]) if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({"path": all_coverage, "type": "bed", "ext": "coverage"}) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if 
dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) out.append({"path": count_file + ".metadata", "type": "metadata"}) umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx" if utils.file_exists(umi_file): out.append({"path": umi_file, "type": "mtx"}) out.append({"path": umi_file + ".rownames", "type": "rownames"}) out.append({"path": umi_file + ".colnames", "type": "colnames"}) if dd.get_combined_histogram(sample): out.append({"path": dd.get_combined_histogram(sample), "type": "txt"}) rda = os.path.join(os.path.dirname(count_file), "se.rda") if utils.file_exists(rda): out.append({"path": rda, "type": "rda"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_tximport(sample): out.append({"path": dd.get_tximport(sample)["gene_tpm"], "dir": "tpm"}) out.append({"path": dd.get_tximport(sample)["gene_counts"], "dir": "counts"}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) out.append({"path": "%s.ann" % dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) if tz.get_in(("peaks_files", "consensus", "main"), sample): out.append({"path": tz.get_in(("peaks_files", "consensus", "main"), sample), "dir": "consensus"}) if tz.get_in(("peak_counts", "peaktable"), sample): out.append({"path": tz.get_in(("peak_counts", "peaktable"), sample), "dir": "consensus"}) transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs", "transcriptome") if os.path.exists(transcriptome_dir): out.append({"path": transcriptome_dir, "type": "directory", "ext": "transcriptome"}) return _add_meta(out, config=upload_config)
def _get_cache_file(data, target_name):
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    cache_file = prefix + "-" + target_name + "-stats.yaml"
    return cache_file
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s" %
                                 dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"],
                                    "{}-sort.bam".format(os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn't need a bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
    data = _add_hla_files(data)
    return [[data]]
def combine_files(samples): """ after quantitation, combine the counts/FPKM/TPM/etc into a single table with all samples """ data = samples[0][0] # prefer the supplied transcriptome gtf file gtf_file = dd.get_transcriptome_gtf(data, None) if not gtf_file: gtf_file = dd.get_gtf_file(data, None) dexseq_gff = dd.get_dexseq_gff(data) # combine featureCount files count_files = filter_missing([dd.get_count_file(x[0]) for x in samples]) combined = count.combine_count_files(count_files, ext=".counts") annotated = count.annotate_combined_count_file(combined, gtf_file) # add tx2gene file tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation", "tx2gene.csv") if gtf_file: tx2gene_file = sailfish.create_combined_tx2gene(data) # combine eXpress files express_counts_combined = combine_express(samples, combined) # combine Cufflinks files fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples]) if fpkm_files and combined: fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm" fpkm_combined = count.combine_count_files(fpkm_files, fpkm_combined_file) else: fpkm_combined = None isoform_files = filter_missing( [dd.get_fpkm_isoform(x[0]) for x in samples]) if isoform_files and combined: fpkm_isoform_combined_file = os.path.splitext( combined)[0] + ".isoform.fpkm" fpkm_isoform_combined = count.combine_count_files( isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm") else: fpkm_isoform_combined = None # combine DEXseq files to_combine_dexseq = list( filter_missing([dd.get_dexseq_counts(data[0]) for data in samples])) if to_combine_dexseq and combined: dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq" dexseq_combined = count.combine_count_files(to_combine_dexseq, dexseq_combined_file, ".dexseq") if dexseq_combined: dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined) else: dexseq_combined = None samples = spikein.combine_spikein(samples) tximport = load_tximport(data) updated_samples = [] for data in dd.sample_data_iterator(samples): if combined: data = dd.set_combined_counts(data, combined) if annotated: data = dd.set_annotated_combined_counts(data, annotated) if fpkm_combined: data = dd.set_combined_fpkm(data, fpkm_combined) if fpkm_isoform_combined: data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined) if express_counts_combined: data = dd.set_express_counts(data, express_counts_combined['counts']) data = dd.set_express_tpm(data, express_counts_combined['tpm']) data = dd.set_express_fpkm(data, express_counts_combined['fpkm']) data = dd.set_isoform_to_gene( data, express_counts_combined['isoform_to_gene']) if dexseq_combined: data = dd.set_dexseq_counts(data, dexseq_combined_file) if gtf_file: data = dd.set_tx2gene(data, tx2gene_file) data = dd.set_tximport(data, tximport) updated_samples.append([data]) return updated_samples
def summary(*samples): """Summarize all quality metrics together""" samples = list(utils.flatten(samples)) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc")) out_data = os.path.join(out_dir, "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") file_list = os.path.join(out_dir, "list_files.txt") work_samples = cwlutils.unpack_tarballs( [utils.deepish_copy(x) for x in samples], samples[0]) work_samples = _summarize_inputs(work_samples, out_dir) if not utils.file_exists(out_file): with tx_tmpdir(samples[0], work_dir) as tx_out: in_files = _get_input_files(work_samples, out_dir, tx_out) in_files += _merge_metrics(work_samples, out_dir) if _one_exists(in_files): with utils.chdir(out_dir): config_file = _create_config_file(out_dir, work_samples) input_list_file = _create_list_file(in_files, file_list) if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir( samples[0]) else: export_tmp = "" locale_export = utils.locale_export() path_export = utils.local_path_export() other_opts = config_utils.get_resources( "multiqc", samples[0]["config"]).get("options", []) other_opts = " ".join([str(x) for x in other_opts]) cmd = ( "{path_export}{export_tmp}{locale_export} " "{multiqc} -c {config_file} -f -l {input_list_file} {other_opts} -o {tx_out}" ) do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) samples = _group_by_sample_and_batch(samples) if utils.file_exists(out_file) and samples: data_files = set() for i, data in enumerate(samples): data_files.add( os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt")) data_files.add( os.path.join(out_dir, "report", "metrics", "target_info.yaml")) data_files.add(os.path.join(out_dir, "multiqc_config.yaml")) [ data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*")) ] data_files = [f for f in data_files if f and utils.file_exists(f)] if "summary" not in samples[0]: samples[0]["summary"] = {} samples[0]["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json") data_json_final = _save_uploaded_data_json( samples, data_json, os.path.join(out_dir, "multiqc_data")) if data_json_final: samples[0]["summary"]["multiqc"]["secondary"].append( data_json_final) # Prepare final file list and inputs for downstream usage file_list_final = _save_uploaded_file_list(samples, file_list, out_dir) if file_list_final: samples[0]["summary"]["multiqc"]["secondary"].append( file_list_final) if any([cwlutils.is_cwl_run(d) for d in samples]): for indir in ["inputs", "report"]: tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir)) if not utils.file_exists(tarball): with utils.chdir(out_dir): cmd = ["tar", "-czvpf", tarball, indir] do.run(cmd, "Compress multiqc inputs: %s" % indir) samples[0]["summary"]["multiqc"]["secondary"].append( tarball) if any([cwlutils.is_cwl_run(d) for d in samples]): samples = _add_versions(samples) return [[data] for data in samples]
def _maybe_add_salmon_files(algorithm, sample, out):
    salmon_dir = os.path.join(dd.get_work_dir(sample), "salmon", dd.get_sample_name(sample))
    if os.path.exists(salmon_dir):
        out.append({"path": salmon_dir,
                    "type": "directory",
                    "ext": "salmon"})
    return out
def umi_transform(data): """ transform each read by identifying the barcode and UMI for each read and putting the information in the read name """ fq1 = data["files"][0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") safe_makedir(umi_dir) transform = dd.get_umi_type(data) if not transform: logger.info( "No UMI transform specified, assuming pre-transformed data.") if is_transformed(fq1): logger.info( "%s detected as pre-transformed, passing it on unchanged." % fq1) data["files"] = [fq1] return data else: logger.error( "No UMI transform was specified, but %s does not look " "pre-transformed. Assuming non-umi data." % fq1) return data if file_exists(transform): transform_file = transform else: transform_file = get_transform_file(transform) if not file_exists(transform_file): logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " "or the transform is not supported by bcbio. Supported " "transforms are %s." % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS))) sys.exit(1) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return data umis = config_utils.get_program("umis", data, default="umis") cores = dd.get_num_cores(data) # skip transformation if the file already looks transformed with open_fastq(fq1) as in_handle: read = next(in_handle) if "UMI_" in read: data["files"] = [out_file] return data cmd = ("{umis} fastqtransform {transform_file} " "--cores {cores} " "{fq1}" "| seqtk seq -L 20 - | gzip > {tx_out_file}") message = ( "Inserting UMI and barcode information into the read name of %s" % fq1) with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), message) data["files"] = [out_file] return data
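The transform above is skipped when the input already looks processed, which the code detects by finding "UMI_" in the first read name. A self-contained sketch of that heuristic (the gzip handling and the naming convention are assumptions inferred from the check above):

import gzip

def looks_umi_transformed(fq_path):
    # Peek at the first FASTQ header line; umis-style transforms embed the UMI
    # in the read name (e.g. "@read1:UMI_ACGTACGT"), so "UMI_" is a cheap marker
    # for already-transformed input.
    opener = gzip.open if fq_path.endswith(".gz") else open
    with opener(fq_path, "rt") as in_handle:
        return "UMI_" in in_handle.readline()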
def get_kallisto_h5(data): samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "quant") return os.path.join(kallisto_dir, "abundance.h5")
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""): out.append({"path": sample["provenance"]["data"]}) for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists( os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({ "path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": "" }) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({ "path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check" }) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) multiqc = tz.get_in(["summary", "multiqc"], sample) if multiqc: out.extend(_flatten_file_with_secondary(multiqc, "multiqc")) if sample.get("seqcluster", {}): out.append({ "path": sample["seqcluster"].get("out_dir"), "type": "directory", "ext": "seqcluster" }) if sample.get("report", None): out.append({ "path": os.path.dirname(sample["report"]), "type": "directory", "ext": "seqclusterViz" }) for x in sample.get("variants", []): if "pop_db" in x: out.append({ "path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"] }) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({ "path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"] }) suffix = "-annotated-decomposed" if tz.get_in( ("population", "decomposed"), x) else "-annotated" out.extend([ _add_batch(x, sample) for x in _get_variant_file(x, ("population", "vcf"), suffix=suffix) ]) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({ "path": all_coverage, "type": "bed", "ext": "coverage" }) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): count_file = dd.get_combined_counts(sample) if sample["analysis"].lower() == "scrna-seq": out.append({"path": count_file, "type": "mtx"}) out.append({"path": count_file + ".rownames", "type": "rownames"}) out.append({"path": count_file + ".colnames", "type": "colnames"}) else: out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) if dd.get_tx2gene(sample): out.append({"path": dd.get_tx2gene(sample)}) if dd.get_spikein_counts(sample): out.append({"path": dd.get_spikein_counts(sample)}) transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs", "transcriptome") if os.path.exists(transcriptome_dir): out.append({ "path": transcriptome_dir, "type": "directory", "ext": "transcriptome" }) return _add_meta(out, config=upload_config)
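For orientation, the records accumulated above are plain dictionaries keyed by "path", with optional "type", "ext" or "variantcaller" hints for the uploader; the values below are placeholders shown only to illustrate the shapes:

# Illustrative record shapes only; these paths are placeholders, not real outputs.
example_records = [
    {"path": "/work/report", "type": "directory", "ext": "report"},
    {"path": "/work/combined.counts"},
    {"path": "/work/batch1-popdb.sqlite", "type": "sqlite", "variantcaller": "gatk-haplotype"},
]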
def cpg_stats(sample): dtdepth = Counter() dtratio = Counter() work_dir = dd.get_work_dir(sample) sample_name = dd.get_sample_name(sample) depth_out = os.path.join(work_dir, "cpg_split", sample_name, "depth.tsv") ratio_out = os.path.join(work_dir, "cpg_split", sample_name, "ratio.tsv") hmc_out = os.path.join(work_dir, "cpg_split", sample_name, "%s_hmc.tsv.gz" % sample_name) hmc_files = " ".join(sample["hmc_split"]) with file_transaction(hmc_out) as tx_out: header = " ".join(["chr", "pos", "strand", "context", "ratio", "eff_CT_counts", "C_counts", "CT_counts", "rev_G_counts", "rev_GA_counts", "CI_lower", "CI_upper", "ox_ratio", "ox_eff_CT_counts", "ox_C_counts", "ox_CT_counts", "ox_rev_G_counts", "ox_rev_GA_counts", "ox_CI_lower", "ox_CI_upper", "pvalue"]) cmd = "cat <(echo {header} | sed 's/ /\t/g') {hmc_files} | gzip -c > {tx_out}" if not file_exists(hmc_out): do.run(cmd.format(**locals()), "Merging %s" % sample_name) work_bam = dd.get_work_bam(sample) if not file_exists(depth_out): for cpg_file in sample["cpg_split"]: logger.debug("Reading %s of sample %s" % (cpg_file, sample_name)) if file_exists(cpg_file): with open(cpg_file) as in_handle: for line in in_handle: cols = line.strip().split("\t") if cols[3] == "CG": ratio = int(float(cols[4]) * 100) dtratio[ratio] += 1 depth = int(math.ceil(float(cols[5]))) if float(cols[5]) < 50 else 50 dtdepth[depth] += 1 pd.DataFrame(dtdepth, index=[1]).to_csv(depth_out, sep="\t") pd.DataFrame(dtratio, index=[1]).to_csv(ratio_out, sep="\t") # calculate mlml if not hmc_files: return None out_dir = safe_makedir(os.path.join(work_dir, "mlml", sample_name)) mlml_out = os.path.join(out_dir, "%s_mlml.txt.gz" % sample_name) if not file_exists(mlml_out) and file_exists(hmc_out): with chdir(out_dir): with file_transaction(mlml_out) as tx_out: tx_out_1 = "%s_noheader" % tx_out tx_out_2 = "%s_alone" % tx_out cmd = " ".join(["zcat %s | sed -e '1d' | awk " % hmc_out, ''' '{rounded = sprintf("%d", $14);print $1"\\t"$2"\\t"$3"\\tCpG\\t"$13"\\t"rounded}' ''', "> %s_ox.txt" % sample_name]) do.run(cmd, "Creating OX input for %s" % sample_name) cmd = " ".join(["zcat %s | sed -e '1d' | awk " % hmc_out, ''' '{rounded = sprintf("%d", $6);print $1"\\t"$2"\\t"$3"\\tCpG\\t"$5"\\t"rounded}' ''', "> %s_bs.txt" % sample_name]) do.run(cmd, "Creating BS input for %s" % sample_name) cmd = ("mlml -o {tx_out_1} -u {sample_name}_bs.txt -m {sample_name}_ox.txt -v").format(**locals()) do.run(cmd, "Run MLML with %s" % sample_name) cmd = ("cat <(echo chrom start end mC hmC C conflicts | sed 's/ /\t/g') {tx_out_1} | gzip -c > {tx_out_2} ").format(**locals()) do.run(cmd, "Adding header to MLML output for %s" % sample_name) tx_out_1 = "%s.woFDR.gz" % tx_out cmd = ("paste <(zcat {hmc_out}) <(zcat {tx_out_2}) | gzip -c > {tx_out}").format(**locals()) do.run(cmd, "Merge data for %s" % sample_name) merge_out = [os.path.join(out_dir, "%s_merged.txt.gz" % sample_name), os.path.join(out_dir, "%s_merged_pass.txt.gz" % sample_name)] if not file_exists(merge_out[0]): with file_transaction(merge_out) as tx_outs: tx_out, tx_out_pass = tx_outs df = pd.read_csv(mlml_out, sep="\t") import statsmodels.sandbox.stats.multicomp df["fdr"] = statsmodels.sandbox.stats.multicomp.fdrcorrection0(df["pvalue"])[1] df_p_pass = df[df.fdr < 0.05] logger.debug("Pass FDR 5 pct in %s: %s" % (sample_name, float(df_p_pass.shape[0]) / float(df.shape[0]))) df.to_csv(tx_out, sep="\t") df_p_pass.to_csv(tx_out_pass, sep="\t")
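The final filtering step applies Benjamini-Hochberg FDR correction via statsmodels; a standalone illustration of that correction (using the non-sandbox `statsmodels.stats.multitest.fdrcorrection`, assuming statsmodels and numpy are available):

import numpy as np
from statsmodels.stats.multitest import fdrcorrection

# Benjamini-Hochberg correction of per-site p-values; sites with an adjusted
# p-value below 0.05 correspond to what the "_merged_pass" output keeps above.
pvalues = np.array([0.001, 0.01, 0.03, 0.2, 0.8])
rejected, adjusted = fdrcorrection(pvalues, alpha=0.05)
print(adjusted)             # BH-adjusted p-values
print(int(rejected.sum()))  # number of sites passing the 5% FDR cutoff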
def normalize_sv_coverage(*items): """Normalize CNV coverage depths by GC, repeats and background. Provides normalized output based on CNVkit approaches, provides a point for providing additional methods in the future: - reference: calculates reference backgrounds from normals and pools including GC and repeat information - fix: Uses background to normalize coverage estimations http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix """ from bcbio.structural import cnvkit from bcbio.structural import shared as sshared orig_items = items items = [ utils.to_single_data(x) for x in cwlutils.handle_combined_input(items) ] if all(not cnvkit.use_general_sv_bins(x) for x in items): return orig_items out_files = {} for group_id, gitems in itertools.groupby( items, lambda x: tz.get_in(["regions", "bins", "group"], x)): inputs, backgrounds = sshared.find_case_control(list(gitems)) cnns = reduce(operator.add, [[ tz.get_in(["depth", "bins", "target"], x), tz.get_in(["depth", "bins", "antitarget"], x) ] for x in backgrounds], []) assert inputs, "Did not find inputs for sample batch: %s" % (" ".join( dd.get_sample_name(x) for x in items)) for d in inputs: if tz.get_in(["depth", "bins", "target"], d): target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) work_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(inputs[0]), "structural", dd.get_sample_name(inputs[0]), "bins")) back_file = cnvkit.cnvkit_background( cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)), backgrounds or inputs, target_bed, antitarget_bed) for data in inputs: work_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) if tz.get_in(["depth", "bins", "target"], data): fix_file = cnvkit.run_fix( tz.get_in(["depth", "bins", "target"], data), tz.get_in(["depth", "bins", "antitarget"], data), back_file, os.path.join( work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))), data) out_files[dd.get_sample_name(data)] = fix_file out = [] for data in items: if dd.get_sample_name(data) in out_files: data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name( data)] out.append([data]) return out
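The background coverage files are flattened with `reduce(operator.add, ...)` before building the CNVkit reference; a self-contained illustration of that collection step (file names are placeholders):

import operator
from functools import reduce

# Flatten per-background target/antitarget coverage files into a single list;
# an empty background list simply yields an empty result.
backgrounds = [{"target": "b1.targetcoverage.cnn", "antitarget": "b1.antitargetcoverage.cnn"},
               {"target": "b2.targetcoverage.cnn", "antitarget": "b2.antitargetcoverage.cnn"}]
cnns = reduce(operator.add, [[b["target"], b["antitarget"]] for b in backgrounds], [])
print(cnns)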
def chipseq_count(data): """ Count reads mapping to ChIP/ATAC consensus peaks with featureCounts """ method = dd.get_chip_method(data) if method == "chip": in_bam = dd.get_work_bam(data) elif method == "atac": if bam.is_paired(dd.get_work_bam(data)): in_bam = tz.get_in(("atac", "align", "NF"), data) else: in_bam = tz.get_in(("atac", "align", "full"), data) out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)) sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname", out_dir=safe_makedir(out_dir)) consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data) if not consensus_file: return [[data]] saf_file = os.path.splitext(consensus_file)[0] + ".saf" work_dir = dd.get_work_dir(data) out_dir = os.path.join(work_dir, "consensus") safe_makedir(out_dir) count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts" summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary" if file_exists(count_file) and _is_fixed_count_file(count_file): if method == "atac": if bam.is_paired(dd.get_work_bam(data)): data = tz.assoc_in(data, ("peak_counts", "NF"), count_file) else: data = tz.assoc_in(data, ("peak_counts", "full"), count_file) elif method == "chip": data = tz.assoc_in(data, ("peak_counts",), count_file) return [[data]] featureCounts = config_utils.get_program("featureCounts", dd.get_config(data)) paired_flag = _paired_flag(in_bam) strand_flag = _strand_flag(data) cmd = ( "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {sorted_bam}") message = ("Count reads in {sorted_bam} overlapping {saf_file} using " "featureCounts.") with file_transaction(data, [count_file, summary_file]) as tx_files: tx_count_file, tx_summary_file = tx_files do.run(cmd.format(**locals()), message.format(**locals())) fixed_count_file = _format_count_file(count_file, data) fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data) shutil.move(fixed_count_file, count_file) shutil.move(fixed_summary_file, summary_file) if method == "atac": if bam.is_paired(dd.get_work_bam(data)): data = tz.assoc_in(data, ("peak_counts", "NF"), count_file) else: data = tz.assoc_in(data, ("peak_counts", "full"), count_file) elif method == "chip": data = tz.assoc_in(data, ("peak_counts",), count_file) return [[data]]
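The per-method counts are attached with `toolz.assoc_in`, which expects the key path as a sequence of keys; a minimal illustration (the file name is a placeholder):

import toolz as tz

# assoc_in returns a new mapping with the value set at the given key path.
# The path must be a tuple or list of keys; a bare string would be iterated
# character by character, hence the one-element tuple for the single-key case.
data = {}
data = tz.assoc_in(data, ("peak_counts", "NF"), "sample-NF.counts")
print(data)  # {'peak_counts': {'NF': 'sample-NF.counts'}}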
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform, dbsnp_file, intervals, data): """Step 1 of GATK recalibration process, producing table of covariates. For GATK 4 we use local multicore spark runs: https://github.com/broadinstitute/gatk/issues/2345 For GATK3, Large whole genome BAM files take an excessively long time to recalibrate and the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis' plots in the GATK documentation: http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest This identifies large files and calculates the fraction to downsample to. """ target_counts = 1e8 # 100 million reads per read group, 20x the plotted max out_file = os.path.join( dd.get_work_dir(data), "align", dd.get_sample_name(data), "%s-recal.grp" % utils.splitext_plus(os.path.basename(dup_align_bam))[0]) if not utils.file_exists(out_file): if has_aligned_reads(dup_align_bam, intervals): with file_transaction(data, out_file) as tx_out_file: gatk_type = broad_runner.gatk_type() assert gatk_type in ["restricted", "gatk4"], \ "Require full version of GATK 2.4+ or GATK4 for BQSR" params = ["-I", dup_align_bam] cores = dd.get_num_cores(data) if gatk_type == "gatk4": params += [ "-T", "BaseRecalibratorSpark", "--sparkMaster", "local[%s]" % cores, "--output", tx_out_file, "--reference", dd.get_ref_twobit(data) ] else: params += [ "-T", "BaseRecalibrator", "-o", tx_out_file, "-R", ref_file ] downsample_pct = bam.get_downsample_pct( dup_align_bam, target_counts, data) if downsample_pct: params += [ "--downsample_to_fraction", str(downsample_pct), "--downsampling_type", "ALL_READS" ] if platform.lower() == "solid": params += [ "--solid_nocall_strategy", "PURGE_READ", "--solid_recal_mode", "SET_Q_ZERO_BASE_N" ] if dbsnp_file: params += ["--knownSites", dbsnp_file] if intervals: params += [ "-L", intervals, "--interval_set_rule", "INTERSECTION" ] memscale = { "magnitude": 0.9 * cores, "direction": "increase" } if cores > 1 else None broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale) else: with open(out_file, "w") as out_handle: out_handle.write("# No aligned reads") return out_file
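The downsampling decision reduces to a simple fraction of reads to keep; the following is a hedged sketch of what `bam.get_downsample_pct` is assumed to compute here (the real helper is not shown in this section):

def downsample_fraction(total_reads, target_reads=1e8):
    # Hypothetical sketch: if the BAM holds more reads than the 100 million
    # target, return the fraction to keep; otherwise return None so BQSR sees
    # every read.
    if total_reads and total_reads > target_reads:
        return target_reads / float(total_reads)
    return None

print(downsample_fraction(4e8))  # 0.25
print(downsample_fraction(5e7))  # None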
def _get_files_project(sample, upload_config): """Retrieve output files associated with an entire analysis project. """ out = [{"path": sample["provenance"]["programs"]}] for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]: if os.path.exists( os.path.join(log.get_log_dir(sample["config"]), fname)): out.append({ "path": os.path.join(log.get_log_dir(sample["config"]), fname), "type": "external_command_log", "ext": "" }) if "summary" in sample and sample["summary"].get("project"): out.append({"path": sample["summary"]["project"]}) mixup_check = tz.get_in(["summary", "mixup_check"], sample) if mixup_check: out.append({ "path": sample["summary"]["mixup_check"], "type": "directory", "ext": "mixup_check" }) report = os.path.join(dd.get_work_dir(sample), "report") if utils.file_exists(report): out.append({"path": report, "type": "directory", "ext": "report"}) if sample.get("seqcluster", None): out.append({ "path": sample["seqcluster"], "type": "directory", "ext": "seqcluster" }) for x in sample.get("variants", []): if "pop_db" in x: out.append({ "path": x["pop_db"], "type": "sqlite", "variantcaller": x["variantcaller"] }) for x in sample.get("variants", []): if "population" in x: pop_db = tz.get_in(["population", "db"], x) if pop_db: out.append({ "path": pop_db, "type": "sqlite", "variantcaller": x["variantcaller"] }) out.extend(_get_variant_file(x, ("population", "vcf"))) for x in sample.get("variants", []): if x.get("validate") and x["validate"].get("grading_summary"): out.append({"path": x["validate"]["grading_summary"]}) break if "coverage" in sample: cov_db = tz.get_in(["coverage", "summary"], sample) if cov_db: out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"}) all_coverage = tz.get_in(["coverage", "all"], sample) if all_coverage: out.append({ "path": all_coverage, "type": "bed", "ext": "coverage" }) if dd.get_mirna_counts(sample): out.append({"path": dd.get_mirna_counts(sample)}) if dd.get_isomir_counts(sample): out.append({"path": dd.get_isomir_counts(sample)}) if dd.get_novel_mirna_counts(sample): out.append({"path": dd.get_novel_mirna_counts(sample)}) if dd.get_novel_isomir_counts(sample): out.append({"path": dd.get_novel_isomir_counts(sample)}) if dd.get_combined_counts(sample): out.append({"path": dd.get_combined_counts(sample)}) if dd.get_annotated_combined_counts(sample): out.append({"path": dd.get_annotated_combined_counts(sample)}) if dd.get_combined_fpkm(sample): out.append({"path": dd.get_combined_fpkm(sample)}) if dd.get_combined_fpkm_isoform(sample): out.append({"path": dd.get_combined_fpkm_isoform(sample)}) if dd.get_transcript_assembler(sample): out.append({"path": dd.get_merged_gtf(sample)}) if dd.get_dexseq_counts(sample): out.append({"path": dd.get_dexseq_counts(sample)}) if dd.get_express_counts(sample): out.append({"path": dd.get_express_counts(sample)}) if dd.get_express_fpkm(sample): out.append({"path": dd.get_express_fpkm(sample)}) if dd.get_express_tpm(sample): out.append({"path": dd.get_express_tpm(sample)}) if dd.get_isoform_to_gene(sample): out.append({"path": dd.get_isoform_to_gene(sample)}) if dd.get_square_vcf(sample): out.append({"path": dd.get_square_vcf(sample)}) if dd.get_sailfish_tidy(sample): out.append({"path": dd.get_sailfish_tidy(sample)}) if dd.get_sailfish_transcript_tpm(sample): out.append({"path": dd.get_sailfish_transcript_tpm(sample)}) if dd.get_sailfish_gene_tpm(sample): out.append({"path": dd.get_sailfish_gene_tpm(sample)}) return _add_meta(out, config=upload_config)
def normalize_sv_coverage(*items): """Normalize CNV coverage depths by GC, repeats and background. Provides normalized output based on CNVkit approaches, provides a point for providing additional methods in the future: - reference: calculates reference backgrounds from normals and pools including GC and repeat information - fix: Uses background to normalize coverage estimations http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix """ from bcbio.structural import cnvkit from bcbio.structural import shared as sshared items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)] if all(not cnvkit.use_general_sv_bins(x) for x in items): return [[d] for d in items] out_files = {} back_files = {} for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)): # No CNVkit calling for this particular set of samples if group_id is None: continue inputs, backgrounds = sshared.find_case_control(list(gitems)) cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x), tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], []) assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items)) for d in inputs: if tz.get_in(["depth", "bins", "target"], d): target_bed = tz.get_in(["depth", "bins", "target"], d) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d) work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural", dd.get_sample_name(inputs[0]), "bins")) input_backs = set(filter(lambda x: x is not None, [dd.get_background_cnv_reference(d) for d in inputs])) if input_backs: assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs) back_file = list(input_backs)[0] else: back_file = cnvkit.cnvkit_background(cnns, os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)), backgrounds or inputs, target_bed, antitarget_bed) fix_cmd_inputs = [] for data in inputs: work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) if tz.get_in(["depth", "bins", "target"], data): fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data))) fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data), tz.get_in(["depth", "bins", "antitarget"], data), back_file, fix_file, data)) out_files[dd.get_sample_name(data)] = fix_file back_files[dd.get_sample_name(data)] = back_file parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]} run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel) out = [] for data in items: if dd.get_sample_name(data) in out_files: data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)] data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)] out.append([data]) return out
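Samples are batched by their bin group with `itertools.groupby`, which only merges adjacent items, so the inputs are assumed to arrive ordered by group; a self-contained illustration of the idiom (sample dicts are placeholders):

import itertools

# groupby merges only consecutive items sharing the same key, so the samples
# must already be ordered by bin group before this loop runs.
samples = [{"name": "a", "group": "bins1"}, {"name": "b", "group": "bins1"},
           {"name": "c", "group": "bins2"}]
for group_id, gitems in itertools.groupby(samples, key=lambda x: x["group"]):
    print(group_id, [d["name"] for d in gitems])
# bins1 ['a', 'b']
# bins2 ['c']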
def work_dir(data): return utils.safe_makedir( os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data), "sambamba"))
def run_mosdepth(data, target_name, bed_file, per_base=False, quantize=None, thresholds=None): """Run mosdepth generating distribution, region depth and per-base depth. """ MosdepthCov = collections.namedtuple( "MosdepthCov", ("dist", "per_base", "regions", "quantize", "thresholds")) bam_file = dd.get_align_bam(data) or dd.get_work_bam(data) work_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))) prefix = os.path.join(work_dir, "%s-%s" % (dd.get_sample_name(data), target_name)) old_dist_file = "%s.mosdepth.dist.txt" % (prefix) out = MosdepthCov( (old_dist_file if utils.file_uptodate( old_dist_file, bam_file) else "%s.mosdepth.%s.dist.txt" % (prefix, "region" if bed_file else "global")), ("%s.per-base.bed.gz" % prefix) if per_base else None, ("%s.regions.bed.gz" % prefix) if bed_file else None, ("%s.quantized.bed.gz" % prefix) if quantize else None, ("%s.thresholds.bed.gz" % prefix) if thresholds else None) if not utils.file_uptodate(out.dist, bam_file): with file_transaction(data, out.dist) as tx_out_file: tx_prefix = os.path.join(os.path.dirname(tx_out_file), os.path.basename(prefix)) num_cores = dd.get_cores(data) bed_arg = ("--by %s" % bed_file) if bed_file else "" perbase_arg = "" if per_base else "--no-per-base" mapq_arg = "-Q 1" if (per_base or quantize) else "" if quantize: quant_arg = "--quantize %s" % quantize[0] quant_export = " && ".join([ "export MOSDEPTH_Q%s=%s" % (i, x) for (i, x) in enumerate(quantize[1]) ]) quant_export += " && " else: quant_arg, quant_export = "", "" thresholds_cmdl = ( "-T " + ",".join([str(t) for t in thresholds])) if out.thresholds else "" cmd = ( "{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} " "{tx_prefix} {bam_file} {thresholds_cmdl}") message = "Calculating coverage: %s %s" % ( dd.get_sample_name(data), target_name) do.run(cmd.format(**locals()), message.format(**locals())) if out.per_base: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.per_base)), out.per_base) if out.regions: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.regions)), out.regions) if out.quantize: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.quantize)), out.quantize) if out.thresholds: shutil.move( os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.thresholds)), out.thresholds) return out
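As a concrete illustration of the command string assembled above, this is roughly what it expands to for an assumed capture run with quantized coverage bins (the paths, core count and quantize specification are placeholders):

# Mirrors the string building in run_mosdepth with illustrative inputs.
quantize = ("0:1:4:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
quant_export = " && ".join("export MOSDEPTH_Q%s=%s" % (i, x)
                           for i, x in enumerate(quantize[1])) + " && "
cmd = (quant_export +
       "mosdepth -t 4 -F 1804 -Q 1 --no-per-base --by capture.bed "
       "--quantize 0:1:4: work/sample1-capture sample1.bam")
print(cmd)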
def run_peddy(samples, out_dir=None): data = samples[0] batch = dd.get_batch(data) or dd.get_sample_name(data) if isinstance(batch, (list, tuple)): batch = batch[0] if out_dir: peddy_dir = safe_makedir(out_dir) else: peddy_dir = safe_makedir( os.path.join(dd.get_work_dir(data), "qc", batch, "peddy")) peddy_prefix = os.path.join(peddy_dir, batch) peddy_report = peddy_prefix + ".html" vcf_file = None for d in samples: vcinfo = variant.get_active_vcinfo(d, use_ensemble=False) if vcinfo and vcinfo.get("vrn_file") and utils.file_exists( vcinfo["vrn_file"]): if vcinfo["vrn_file"] and dd.get_sample_name( d) in vcfutils.get_samples(vcinfo["vrn_file"]): if vcinfo["vrn_file"] and vcfutils.vcf_has_nonfiltered_variants( vcinfo["vrn_file"]): vcf_file = vcinfo["vrn_file"] break peddy = config_utils.get_program("peddy", data) if config_utils.program_installed( "peddy", data) else None if not peddy or not vcf_file or not vcfanno.is_human(data): if not peddy: reason = "peddy executable not found" elif not vcfanno.is_human(data): reason = "sample is not human" else: assert not vcf_file reason = "no suitable VCF files found with the sample and non-filtered variants" msg = "Skipping peddy QC, %s: %s" % ( reason, [dd.get_sample_name(d) for d in samples]) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write(msg) logger.info(msg) return samples if file_exists(peddy_prefix + "-failed.log"): return samples if not file_exists(peddy_report): ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir) num_cores = dd.get_num_cores(data) with tx_tmpdir(data) as tx_dir: peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix)) # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2 stderr_log = os.path.join(tx_dir, "run-stderr.log") sites_str = "--sites hg38" if dd.get_genome_build( data) == "hg38" else "" cmd = ( "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} " "{vcf_file} {ped_file} 2> {stderr_log}") message = "Running peddy on {vcf_file} against {ped_file}." try: do.run(cmd.format(**locals()), message.format(**locals())) except: to_show = collections.deque(maxlen=100) with open(stderr_log) as in_handle: for line in in_handle: to_show.append(line) def allowed_errors(l): return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0)) def all_line_errors(l): return (l.find("no intervals found for") >= 0) if any([allowed_errors(l) for l in to_show]) or all( [all_line_errors(l) for l in to_show]): logger.info( "Skipping peddy because no variants overlap with checks: %s" % batch) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write( "peddy did not find overlaps with 1kg sites in VCF, skipping" ) return samples else: logger.warning("".join(to_show)) raise for ext in PEDDY_OUT_EXTENSIONS: if os.path.exists(peddy_prefix_tx + ext): shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext) peddyfiles = expected_peddy_files(peddy_report, batch) return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
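When peddy fails, only the tail of its stderr log is inspected for known, ignorable errors; a standalone illustration of the bounded-deque idiom used for that:

import collections

# Keep only the last N lines of a potentially very large log for error triage.
to_show = collections.deque(maxlen=3)
for line in ["line1", "line2", "line3", "line4", "line5"]:
    to_show.append(line)
print(list(to_show))  # ['line3', 'line4', 'line5']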
def summary(*samples): """Summarize all quality metrics together""" samples = utils.unpack_worlds(samples) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) file_fapths = [] opts = "" out_dir = os.path.join(work_dir, "multiqc") out_data = os.path.join(work_dir, "multiqc", "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") samples = _report_summary(samples, os.path.join(out_dir, "report")) for data in samples: for program, pfiles in tz.get_in(["summary", "qc"], data, {}).iteritems(): if isinstance(pfiles, dict): pfiles = [pfiles["base"]] + pfiles["secondary"] elif isinstance(pfiles, basestring): pfiles = [pfiles] file_fapths.extend(pfiles) file_fapths.append( os.path.join(out_dir, "report", "metrics", "target_info.yaml")) # XXX temporary workaround until we can handle larger inputs through MultiQC file_fapths = list(set(file_fapths)) # Back compatible -- to migrate to explicit specifications in input YAML file_fapths += ["trimmed", "htseq-count/*summary"] if not utils.file_exists(out_file): with utils.chdir(work_dir): file_fapths = [ fpath for fpath in file_fapths if _check_multiqc_input(fpath) and _is_good_file_for_multiqc(fpath) ] input_list_file = _create_list_file(file_fapths) export_tmp = "" if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0]) if input_list_file: cmd = "{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out} {opts}" with tx_tmpdir(data, work_dir) as tx_out: do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) out = [] for i, data in enumerate(samples): if i == 0: if utils.file_exists(out_file): data_files = glob.glob( os.path.join(out_dir, "multiqc_data", "*.txt")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.bed")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.txt")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.tsv")) data_files += glob.glob(os.path.join(out_dir, "report", "*.R*")) if "summary" not in data: data["summary"] = {} data["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } out.append(data) return [[fpath] for fpath in out]
def run_peddy(samples, out_dir=None): vcf_file = None for d in samples: vcinfo = variant.get_active_vcinfo(d) if vcinfo and vcinfo.get("vrn_file") and utils.file_exists( vcinfo["vrn_file"]): if vcinfo["vrn_file"] and dd.get_sample_name( d) in vcfutils.get_samples(vcinfo["vrn_file"]): vcf_file = vcinfo["vrn_file"] break data = samples[0] peddy = config_utils.get_program("peddy", data) if config_utils.program_installed( "peddy", data) else None if not peddy or not vcf_file or not is_human(data): logger.info( "peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking " "for %s." % vcf_file) return samples batch = dd.get_batch(data) or dd.get_sample_name(data) if out_dir: peddy_dir = safe_makedir(out_dir) else: peddy_dir = safe_makedir( os.path.join(dd.get_work_dir(data), "qc", batch, "peddy")) ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir) peddy_prefix = os.path.join(peddy_dir, batch) peddy_report = peddy_prefix + ".html" peddyfiles = expected_peddy_files(peddy_report, batch) if file_exists(peddy_report): return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles) if file_exists(peddy_prefix + "-failed.log"): return samples num_cores = dd.get_num_cores(data) with tx_tmpdir(data) as tx_dir: peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix)) # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2 stderr_log = os.path.join(tx_dir, "run-stderr.log") cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}" message = "Running peddy on {vcf_file} against {ped_file}." try: do.run(cmd.format(**locals()), message.format(**locals())) except: to_show = collections.deque(maxlen=100) with open(stderr_log) as in_handle: for line in in_handle: to_show.append(line) if to_show[-1].find("IndexError") >= 0 and to_show[-1].find( "is out of bounds for axis") >= 0: logger.info( "Skipping peddy because no variants overlap with checks: %s" % batch) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write( "peddy did not find overlaps with 1kg sites in VCF, skipping" ) return samples else: logger.warning("".join(to_show)) raise for ext in PEDDY_OUT_EXTENSIONS: if os.path.exists(peddy_prefix_tx + ext): shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext) return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def summary(*samples): """Summarize all quality metrics together""" samples = utils.unpack_worlds(samples) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) folders = [] opts = "" out_dir = os.path.join(work_dir, "multiqc") out_data = os.path.join(work_dir, "multiqc", "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") samples = _report_summary(samples, os.path.join(out_dir, "report")) for data in samples: for program, pfiles in tz.get_in(["summary", "qc"], data, {}).iteritems(): if isinstance(pfiles, dict): pfiles = pfiles["base"] folders.append(os.path.dirname(pfiles)) # XXX temporary workaround until we can handle larger inputs through MultiQC folders = list(set(folders)) if len(folders) > 250: logger.warning( "Too many samples for MultiQC, only using first 250 entries.") folders = folders[:250] opts = "--flat" # Back compatible -- to migrate to explicit specifications in input YAML folders += ["trimmed", "htseq-count/*summary"] if not utils.file_exists(out_file): with utils.chdir(work_dir): input_dir = " ".join([_check_multiqc_input(d) for d in folders]) export_tmp = "" if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0]) if input_dir.strip(): cmd = "{export_tmp} {multiqc} -f {input_dir} -o {tx_out} {opts}" with tx_tmpdir(data, work_dir) as tx_out: do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) out = [] for i, data in enumerate(samples): if i == 0: if utils.file_exists(out_file): data_files = glob.glob( os.path.join(out_dir, "multiqc_data", "*.txt")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.bed")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.txt")) data_files += glob.glob( os.path.join(out_dir, "report", "*", "*.tsv")) data_files += glob.glob(os.path.join(out_dir, "report", "*.R*")) if "summary" not in data: data["summary"] = {} data["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } out.append(data) return [[d] for d in out]
def _report_summary(samples, out_dir): """ Run coverage report with bcbiocov package """ try: import bcbreport.prepare as bcbreport except ImportError: logger.info("skipping report. No bcbreport installed.") return samples # samples = utils.unpack_worlds(samples) work_dir = dd.get_work_dir(samples[0]) parent_dir = utils.safe_makedir(out_dir) with utils.chdir(parent_dir): logger.info("copy qsignature") qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma") if qsignature_fn: # this need to be inside summary/qc dict if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"): shutil.copy(qsignature_fn, "bcbio_qsignature.ma") out_dir = utils.safe_makedir("fastqc") logger.info("summarize fastqc") with utils.chdir(out_dir): _merge_fastqc(samples) logger.info("summarize target information") if samples[0].get("analysis", "").lower() in ["variant", "variant2"]: samples = _merge_target_information(samples) out_dir = utils.safe_makedir("coverage") logger.info("summarize coverage") for data in samples: pfiles = tz.get_in(["summary", "qc", "coverage"], data, []) if isinstance(pfiles, dict): pfiles = [pfiles["base"]] + pfiles["secondary"] elif pfiles: pfiles = [pfiles] for fn in pfiles: if os.path.basename(fn).find("coverage_fixed") > -1: utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn))) out_dir = utils.safe_makedir("variants") logger.info("summarize variants") for data in samples: pfiles = tz.get_in(["summary", "qc", "variants"], data, []) if isinstance(pfiles, dict): pfiles = [pfiles["base"]] + pfiles["secondary"] elif pfiles: pfiles = [pfiles] for fn in pfiles: if os.path.basename(fn).find("gc-depth-parse.tsv") > -1: utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn))) bcbreport.report(parent_dir) out_report = os.path.join(parent_dir, "qc-coverage-report.html") if not utils.file_exists(out_report): rmd_file = os.path.join(parent_dir, "report-ready.Rmd") run_file = "%s-run.R" % (os.path.splitext(out_report)[0]) with open(run_file, "w") as out_handle: out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file) # cmd = "%s %s" % (utils.Rscript_cmd(), run_file) # Skip automated generation of coverage report to avoid error # messages. We need to generalize coverage reporting and re-include. # try: # do.run(cmd, "Prepare coverage summary", log_error=False) # except subprocess.CalledProcessError as msg: # logger.info("Skipping generation of coverage report: %s" % (str(msg))) if utils.file_exists("report-ready.html"): shutil.move("report-ready.html", out_report) return samples
def _sv_workdir(data): return utils.safe_makedir( os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "purecn"))
def merge_split_alignments(samples, run_parallel): """Manage merging split alignments back into a final working BAM file. Perform de-duplication on the final merged file. """ ready = [] file_key = "work_bam" to_merge = collections.defaultdict(list) for data in (xs[0] for xs in samples): if data.get("combine"): out_key = tz.get_in(["combine", file_key, "out"], data) if not out_key: out_key = data["rgnames"]["lane"] to_merge[out_key].append(data) else: ready.append([data]) ready_merge = [] hla_merges = [] for mgroup in to_merge.values(): cur_data = mgroup[0] del cur_data["align_split"] for x in mgroup[1:]: cur_data["combine"][file_key]["extras"].append(x[file_key]) ready_merge.append([cur_data]) cur_hla = None for d in mgroup: hla_files = tz.get_in(["hla", "fastq"], d) if hla_files: if not cur_hla: cur_hla = { "rgnames": { "sample": dd.get_sample_name(cur_data) }, "config": cur_data["config"], "dirs": cur_data["dirs"], "hla": { "fastq": [] } } cur_hla["hla"]["fastq"].append(hla_files) if cur_hla: hla_merges.append([cur_hla]) if not tz.get_in(["config", "algorithm", "kraken"], data): # kraken requires fasta filenames from data['files'] as input. # We don't want to remove those files if kraken qc is required. _save_fastq_space(samples) merged = run_parallel("delayed_bam_merge", ready_merge) hla_merge_raw = run_parallel("merge_split_alignments", hla_merges) hla_merges = {} for hla_merge in [x[0] for x in hla_merge_raw]: hla_merges[dd.get_sample_name(hla_merge)] = tz.get_in(["hla", "fastq"], hla_merge) # Add stable 'align_bam' target to use for retrieving raw alignment out = [] for data in [x[0] for x in merged + ready]: if data.get("work_bam"): data["align_bam"] = data["work_bam"] if dd.get_sample_name(data) in hla_merges: data["hla"]["fastq"] = hla_merges[dd.get_sample_name(data)] else: hla_files = glob.glob( os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data), "hla", "*.fq")) if hla_files: data["hla"]["fastq"] = hla_files out.append([data]) return out
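Split alignments are regrouped by their merge key before the delayed BAM merge; a minimal illustration of that grouping step (sample keys and BAM paths are placeholders):

import collections

# Group split BAMs by their output key; the first entry becomes the merge
# target and the remaining files are queued as combine "extras".
to_merge = collections.defaultdict(list)
for out_key, bam_path in [("s1", "s1-split0.bam"), ("s1", "s1-split1.bam"),
                          ("s2", "s2-split0.bam")]:
    to_merge[out_key].append(bam_path)
print(dict(to_merge))  # {'s1': ['s1-split0.bam', 's1-split1.bam'], 's2': ['s2-split0.bam']}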