Example #1
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs([utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources("multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = ("{path_export}{export_tmp}{locale_export} "
                           "{multiqc} -f -l {input_list_file} {other_opts} -o {tx_out}")
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(os.path.join(out_dir, "report", "metrics", dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [data_files.add(f) for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir, "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]
Example #2
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs(
        [utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    config_file = _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(
                            samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources(
                        "multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = (
                        "{path_export}{export_tmp}{locale_export} "
                        "{multiqc} -c {config_file} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    )
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(
                os.path.join(out_dir, "report", "metrics",
                             dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(
            os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        [
            data_files.add(f)
            for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*"))
        ]
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {
            "base": out_file,
            "secondary": data_files
        }

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(
            samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(
                data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(
                file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir,
                                           "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(
                        tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]
Example #3
0
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return (
                        (l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0)
                        or (l.find("n_components=") >= 0
                            and l.find("must be between 1 and min") >= 0)
                        or (l.find(
                            "Input contains NaN, infinity or a value too large for dtype"
                        ) >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Example #4
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." %
                fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    locale_export = utils.locale_export()
    cmd = (
        "{locale_export}{umis} fastqtransform {split_option} {transform_file} "
        "--cores {cores} {demuxed_option} "
        "{fq1} {fq2} {fq3} {fq4}"
        "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #5
0
def _umis_cmd(data):
    """Return umis command line argument, with correct python and locale.
    """
    return "%s %s %s" % (
        utils.locale_export(), utils.get_program_python("umis"),
        config_utils.get_program("umis", data["config"], default="umis"))
Example #6
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4-len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                %(dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    locale_export = utils.locale_export()
    cmd = ("{locale_export}{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} {demuxed_option} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]