def summary(*samples): """Summarize all quality metrics together""" samples = list(utils.flatten(samples)) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.") out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc")) out_data = os.path.join(out_dir, "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") file_list = os.path.join(out_dir, "list_files.txt") work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0]) work_samples = _summarize_inputs(work_samples, out_dir) if not utils.file_exists(out_file): with tx_tmpdir(samples[0], work_dir) as tx_out: in_files = _get_input_files(work_samples, out_dir, tx_out) in_files += _merge_metrics(work_samples, out_dir) if _one_exists(in_files): with utils.chdir(out_dir): _create_config_file(out_dir, work_samples) input_list_file = _create_list_file(in_files, file_list) if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0]) else: export_tmp = "" locale_export = utils.locale_export() path_export = utils.local_path_export() other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", []) other_opts = " ".join([str(x) for x in other_opts]) cmd = ("{path_export}{export_tmp}{locale_export} " "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}") do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")): shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) samples = _group_by_sample_and_batch(samples) if utils.file_exists(out_file) and samples: data_files = set() for i, data in enumerate(samples): data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt")) data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml")) data_files.add(os.path.join(out_dir, "multiqc_config.yaml")) [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))] data_files = [f for f in data_files if f and utils.file_exists(f)] if "summary" not in samples[0]: samples[0]["summary"] = {} samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files} data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json") data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data")) if data_json_final: samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final) # Prepare final file list and inputs for downstream usage file_list_final = _save_uploaded_file_list(samples, file_list, out_dir) if file_list_final: samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final) if any([cwlutils.is_cwl_run(d) for d in samples]): for indir in ["inputs", "report"]: tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir)) if not utils.file_exists(tarball): with utils.chdir(out_dir): cmd = ["tar", "-czvpf", tarball, indir] do.run(cmd, "Compress multiqc inputs: %s" % indir) samples[0]["summary"]["multiqc"]["secondary"].append(tarball) if any([cwlutils.is_cwl_run(d) for d in samples]): samples = _add_versions(samples) return [[data] for data in samples]
def summary(*samples): """Summarize all quality metrics together""" samples = list(utils.flatten(samples)) work_dir = dd.get_work_dir(samples[0]) multiqc = config_utils.get_program("multiqc", samples[0]["config"]) if not multiqc: logger.debug( "multiqc not found. Update bcbio_nextgen.py tools to fix this issue." ) out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc")) out_data = os.path.join(out_dir, "multiqc_data") out_file = os.path.join(out_dir, "multiqc_report.html") file_list = os.path.join(out_dir, "list_files.txt") work_samples = cwlutils.unpack_tarballs( [utils.deepish_copy(x) for x in samples], samples[0]) work_samples = _summarize_inputs(work_samples, out_dir) if not utils.file_exists(out_file): with tx_tmpdir(samples[0], work_dir) as tx_out: in_files = _get_input_files(work_samples, out_dir, tx_out) in_files += _merge_metrics(work_samples, out_dir) if _one_exists(in_files): with utils.chdir(out_dir): config_file = _create_config_file(out_dir, work_samples) input_list_file = _create_list_file(in_files, file_list) if dd.get_tmp_dir(samples[0]): export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir( samples[0]) else: export_tmp = "" locale_export = utils.locale_export() path_export = utils.local_path_export() other_opts = config_utils.get_resources( "multiqc", samples[0]["config"]).get("options", []) other_opts = " ".join([str(x) for x in other_opts]) cmd = ( "{path_export}{export_tmp}{locale_export} " "{multiqc} -c {config_file} -f -l {input_list_file} {other_opts} -o {tx_out}" ) do.run(cmd.format(**locals()), "Run multiqc") if utils.file_exists( os.path.join(tx_out, "multiqc_report.html")): shutil.move( os.path.join(tx_out, "multiqc_report.html"), out_file) shutil.move(os.path.join(tx_out, "multiqc_data"), out_data) samples = _group_by_sample_and_batch(samples) if utils.file_exists(out_file) and samples: data_files = set() for i, data in enumerate(samples): data_files.add( os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt")) data_files.add( os.path.join(out_dir, "report", "metrics", "target_info.yaml")) data_files.add(os.path.join(out_dir, "multiqc_config.yaml")) [ data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*")) ] data_files = [f for f in data_files if f and utils.file_exists(f)] if "summary" not in samples[0]: samples[0]["summary"] = {} samples[0]["summary"]["multiqc"] = { "base": out_file, "secondary": data_files } data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json") data_json_final = _save_uploaded_data_json( samples, data_json, os.path.join(out_dir, "multiqc_data")) if data_json_final: samples[0]["summary"]["multiqc"]["secondary"].append( data_json_final) # Prepare final file list and inputs for downstream usage file_list_final = _save_uploaded_file_list(samples, file_list, out_dir) if file_list_final: samples[0]["summary"]["multiqc"]["secondary"].append( file_list_final) if any([cwlutils.is_cwl_run(d) for d in samples]): for indir in ["inputs", "report"]: tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir)) if not utils.file_exists(tarball): with utils.chdir(out_dir): cmd = ["tar", "-czvpf", tarball, indir] do.run(cmd, "Compress multiqc inputs: %s" % indir) samples[0]["summary"]["multiqc"]["secondary"].append( tarball) if any([cwlutils.is_cwl_run(d) for d in samples]): samples = _add_versions(samples) return [[data] for data in samples]
def run_peddy(samples, out_dir=None): data = samples[0] batch = dd.get_batch(data) or dd.get_sample_name(data) if isinstance(batch, (list, tuple)): batch = batch[0] if out_dir: peddy_dir = safe_makedir(out_dir) else: peddy_dir = safe_makedir( os.path.join(dd.get_work_dir(data), "qc", batch, "peddy")) peddy_prefix = os.path.join(peddy_dir, batch) peddy_report = peddy_prefix + ".html" vcf_file = None for d in samples: vcinfo = None if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [ "tumor" ]: vcinfo = variant.get_active_vcinfo(d, use_ensemble=False) if not vcinfo and dd.get_phenotype(d) in ["tumor"]: vcinfo = variant.extract_germline_vcinfo(d, peddy_dir) if vcinfo: for key in ["germline", "vrn_file"]: if vcinfo and vcinfo.get(key) and utils.file_exists( vcinfo[key]): if vcinfo[key] and dd.get_sample_name( d) in vcfutils.get_samples(vcinfo[key]): if vcinfo[ key] and vcfutils.vcf_has_nonfiltered_variants( vcinfo[key]): vcf_file = vcinfo[key] break peddy = config_utils.get_program("peddy", data) if config_utils.program_installed( "peddy", data) else None config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples]) if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips: if not peddy: reason = "peddy executable not found" elif config_skips: reason = "peddy in tools_off configuration" elif not vcfanno.is_human(data): reason = "sample is not human" else: assert not vcf_file reason = "no suitable VCF files found with the sample and non-filtered variants" msg = "Skipping peddy QC, %s: %s" % ( reason, [dd.get_sample_name(d) for d in samples]) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write(msg) logger.info(msg) return samples if file_exists(peddy_prefix + "-failed.log"): return samples if not file_exists(peddy_report): ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir) num_cores = dd.get_num_cores(data) with tx_tmpdir(data) as tx_dir: peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix)) # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2 stderr_log = os.path.join(tx_dir, "run-stderr.log") sites_str = "--sites hg38" if dd.get_genome_build( data) == "hg38" else "" locale = utils.locale_export() cmd = ( "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} " "{vcf_file} {ped_file} 2> {stderr_log}") message = "Running peddy on {vcf_file} against {ped_file}." try: do.run(cmd.format(**locals()), message.format(**locals())) except: to_show = collections.deque(maxlen=100) with open(stderr_log) as in_handle: for line in in_handle: to_show.append(line) def allowed_errors(l): return ( (l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or (l.find( "Input contains NaN, infinity or a value too large for dtype" ) >= 0)) def all_line_errors(l): return (l.find("no intervals found for") >= 0) if any([allowed_errors(l) for l in to_show]) or all( [all_line_errors(l) for l in to_show]): logger.info( "Skipping peddy because no variants overlap with checks: %s" % batch) with open(peddy_prefix + "-failed.log", "w") as out_handle: out_handle.write( "peddy did not find overlaps with 1kg sites in VCF, skipping" ) return samples else: logger.warning("".join(to_show)) raise for ext in PEDDY_OUT_EXTENSIONS: if os.path.exists(peddy_prefix_tx + ext): shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext) peddyfiles = expected_peddy_files(peddy_report, batch) return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
def umi_transform(data): """ transform each read by identifying the barcode and UMI for each read and putting the information in the read name """ fqfiles = data["files"] fqfiles.extend(list(repeat("", 4 - len(fqfiles)))) fq1, fq2, fq3, fq4 = fqfiles umi_dir = os.path.join(dd.get_work_dir(data), "umis") safe_makedir(umi_dir) transform = dd.get_umi_type(data) if not transform: logger.info( "No UMI transform specified, assuming pre-transformed data.") if is_transformed(fq1): logger.info( "%s detected as pre-transformed, passing it on unchanged." % fq1) data["files"] = [fq1] return [[data]] else: logger.error( "No UMI transform was specified, but %s does not look " "pre-transformed." % fq1) sys.exit(1) if file_exists(transform): transform_file = transform else: transform_file = get_transform_file(transform) if not file_exists(transform_file): logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " "or the transform is not supported by bcbio. Supported " "transforms are %s." % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS))) sys.exit(1) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] cellular_barcodes = get_cellular_barcodes(data) if len(cellular_barcodes) > 1: split_option = "--separate_cb" else: split_option = "" if dd.get_demultiplexed(data): demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data) split_option = "" else: demuxed_option = "" umis = config_utils.get_program("umis", data, default="umis") cores = dd.get_num_cores(data) # skip transformation if the file already looks transformed with open_fastq(fq1) as in_handle: read = next(in_handle) if "UMI_" in read: data["files"] = [out_file] return [[data]] locale_export = utils.locale_export() cmd = ( "{locale_export}{umis} fastqtransform {split_option} {transform_file} " "--cores {cores} {demuxed_option} " "{fq1} {fq2} {fq3} {fq4}" "| seqtk seq -L 20 - | gzip > {tx_out_file}") message = ( "Inserting UMI and barcode information into the read name of %s" % fq1) with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
def _umis_cmd(data): """Return umis command line argument, with correct python and locale. """ return "%s %s %s" % ( utils.locale_export(), utils.get_program_python("umis"), config_utils.get_program("umis", data["config"], default="umis"))
def umi_transform(data): """ transform each read by identifying the barcode and UMI for each read and putting the information in the read name """ fqfiles = data["files"] fqfiles.extend(list(repeat("", 4-len(fqfiles)))) fq1, fq2, fq3, fq4 = fqfiles umi_dir = os.path.join(dd.get_work_dir(data), "umis") safe_makedir(umi_dir) transform = dd.get_umi_type(data) if not transform: logger.info("No UMI transform specified, assuming pre-transformed data.") if is_transformed(fq1): logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1) data["files"] = [fq1] return [[data]] else: logger.error("No UMI transform was specified, but %s does not look " "pre-transformed." % fq1) sys.exit(1) if file_exists(transform): transform_file = transform else: transform_file = get_transform_file(transform) if not file_exists(transform_file): logger.error( "The UMI transform can be specified as either a file or a " "bcbio-supported transform. Either the file %s does not exist " "or the transform is not supported by bcbio. Supported " "transforms are %s." %(dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS))) sys.exit(1) out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] cellular_barcodes = get_cellular_barcodes(data) if len(cellular_barcodes) > 1: split_option = "--separate_cb" else: split_option = "" if dd.get_demultiplexed(data): demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data) split_option = "" else: demuxed_option = "" umis = config_utils.get_program("umis", data, default="umis") cores = dd.get_num_cores(data) # skip transformation if the file already looks transformed with open_fastq(fq1) as in_handle: read = next(in_handle) if "UMI_" in read: data["files"] = [out_file] return [[data]] locale_export = utils.locale_export() cmd = ("{locale_export}{umis} fastqtransform {split_option} {transform_file} " "--cores {cores} {demuxed_option} " "{fq1} {fq2} {fq3} {fq4}" "| seqtk seq -L 20 - | gzip > {tx_out_file}") message = ("Inserting UMI and barcode information into the read name of %s" % fq1) with file_transaction(out_file) as tx_out_file: do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]