Example #1
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
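For reference, a rough sketch of the mapping this function would dump to multiqc_config.yaml for a qualimap-enabled batch with no germline calling; the values below are hypothetical but follow the keys set above.

import yaml

# Hypothetical illustration of the resulting config dict (not bcbio output verbatim).
example_out = {
    "table_columns_visible": {"bcbio": {"Average_insert_size": False},
                              "FastQC": {"percent_gc": False}},
    "bcftools": {"write_separate_table": True},
    "module_order": ["bcbio", "samtools", "goleft_indexcov", "peddy", "bcftools",
                     "picard", "qualimap", "snpeff", "fastqc", "preseq"],
}
print(yaml.safe_dump(example_out, default_flow_style=False))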
Example #2
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
           or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #3
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
                    "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if any([l.find("IndexError") >=0 and l.find("is out of bounds for axis") >= 0
                    for l in to_show]):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
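As a rough usage sketch, this is the command string the template above would produce with hypothetical paths and core counts (values are illustrative only):

# Hypothetical values standing in for the locals consumed by cmd.format(**locals()) above.
peddy = "peddy"
num_cores = 4
peddy_prefix_tx = "/tmp/tx/batch1"
vcf_file = "/work/gemini/batch1.vcf.gz"
ped_file = "/work/qc/batch1/peddy/batch1.ped"
stderr_log = "/tmp/tx/run-stderr.log"
cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
print(cmd.format(**locals()))
# peddy -p 4 --plot --prefix /tmp/tx/batch1 /work/gemini/batch1.vcf.gz /work/qc/batch1/peddy/batch1.ped 2> /tmp/tx/run-stderr.log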
Example #4
def run(bam_file, data, out_dir):
    out = {}
    vcinfo = variant.get_active_vcinfo(data)
    if vcinfo and vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        out_file = os.path.join(utils.safe_makedir(out_dir),
                                "%s-damage.yaml" % (dd.get_sample_name(data)))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = ["dkfzbiasfilter_summarize.py", "--sample=%s" % dd.get_sample_name(data),
                       "--outfile=%s" % tx_out_file, vcinfo["vrn_file"]]
                do.run(cmd, "Summarize damage filtering")
        if utils.file_exists(out_file):
            out["base"] = out_file
    return out
Example #5
def run(bam_file, data, out_dir):
    out = {}
    vcinfo = variant.get_active_vcinfo(data, use_ensemble=False)
    dkfzbiasfilter = config_utils.get_program("dkfzbiasfilter_summarize.py", data)
    if vcinfo and vcfutils.vcf_has_variants(vcinfo["vrn_file"]):
        out_file = os.path.join(utils.safe_makedir(out_dir),
                                "%s-damage.yaml" % (dd.get_sample_name(data)))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [dkfzbiasfilter, "--sample=%s" % dd.get_sample_name(data),
                       "--outfile=%s" % tx_out_file, vcinfo["vrn_file"]]
                do.run(cmd, "Summarize damage filtering")
        if utils.file_exists(out_file):
            out["base"] = out_file
    return out
Example #6
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples]
        avg_depths = [x for x in avg_depths if x]
        # Picking all thresholds up to the highest sample average depth
        thresholds = [t for t in coverage.DEPTH_THRESHOLDS if not avg_depths or t <= max(avg_depths)]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        # Showing only thresholds surrounding any of average depths
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]}

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {"family_id": False, "sex_het_ratio": False,
                                             "error_sex_check": False}

    # Setting the module order
    module_order = []
    module_order.extend([
        "bcbio",
        "samtools",
        "goleft_indexcov",
        "peddy"
    ])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {}) or  # tumor-only somatic with germline extraction
           dd.get_phenotype(s) == "germline" or           # or paired somatic with germline calling for normal
           _has_bcftools_germline_stats(s)                # CWL organized statistics
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([{
            'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False
            }},
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "salmon",
        "picard",
        "qualimap",
        "snpeff",
        "fastqc",
        "preseq",
    ])
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
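To make the threshold selection above easier to follow, here is a standalone sketch of the same logic with hypothetical depths; DEPTH_THRESHOLDS stands in for coverage.DEPTH_THRESHOLDS, whose real values may differ.

# Hypothetical stand-ins for coverage.DEPTH_THRESHOLDS and per-sample average depths.
DEPTH_THRESHOLDS = [1, 5, 10, 20, 50, 100, 250, 500]
avg_depths = [30, 400]

# Keep thresholds up to the highest average depth, plus one more beyond it.
thresholds = [t for t in DEPTH_THRESHOLDS if t <= max(avg_depths)]
if len(thresholds) < len(DEPTH_THRESHOLDS):
    thresholds.append(DEPTH_THRESHOLDS[len(thresholds)])

# Hide any threshold above 20x that does not bracket a sample's average depth.
thresholds_hidden = []
for i, t in enumerate(thresholds):
    if t > 20:
        surrounds = (any(thresholds[i - 1] <= c < thresholds[i] for c in avg_depths if i - 1 >= 0) or
                     any(thresholds[i] <= c < thresholds[i + 1] for c in avg_depths if i + 1 < len(thresholds)))
        if not surrounds:
            thresholds_hidden.append(t)

print(thresholds)         # [1, 5, 10, 20, 50, 100, 250, 500]
print(thresholds_hidden)  # [100] -- 100x does not bracket either 30x or 400x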
Example #7
def _create_config_file(out_dir, samples):
    """Provide configuration file for multiqc report."""

    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    extra_fn_clean_trim = []
    extra_fn_clean_trim.extend(
        ["coverage.mosdepth.region.dist", "coverage.mosdepth.global.dist"])
    out["extra_fn_clean_trim"] = extra_fn_clean_trim

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d)
            or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        # Hiding metrics duplicated by Qualimap
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

        # Setting up thresholds for Qualimap depth cutoff calculations, based on sample avg depths
        avg_depths = [
            tz.get_in(["summary", "metrics", "Avg_coverage"], s)
            for s in samples
        ]
        avg_depths = [x for x in avg_depths if x]
        # Picking all thresholds up to the highest sample average depth
        thresholds = [
            t for t in coverage.DEPTH_THRESHOLDS
            if not avg_depths or t <= max(avg_depths)
        ]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        # Showing only thresholds surrounding any of average depths
        thresholds_hidden = []
        for i, t in enumerate(thresholds):
            if t > 20:  # Not hiding anything below 20x
                if any(thresholds[i-1] <= c < thresholds[i] for c in avg_depths if c and i-1 >= 0) or \
                   any(thresholds[i] <= c < thresholds[i+1] for c in avg_depths if c and i+1 < len(thresholds)):
                    pass
                else:
                    thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = thresholds + thresholds_hidden
            thresholds_hidden.sort()
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden':
            [str(t) for t in thresholds_hidden]
        }

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    out["table_columns_visible"]["Peddy"] = {
        "family_id": False,
        "sex_het_ratio": False,
        "error_sex_check": False
    }

    # Setting the module order
    module_order = []
    module_order.extend(["bcbio", "samtools", "goleft_indexcov", "peddy"])
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any("germline" in (get_active_vcinfo(s) or {})
           or  # tumor-only somatic with germline extraction
           dd.get_phenotype(s) == "germline"
           or  # or paired somatic with germline calling for normal
           _has_bcftools_germline_stats(s)  # CWL organized statistics
           for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.extend([
            {
                'bcftools': {
                    'name': 'Bcftools (somatic)',
                    'info': 'Bcftools stats for somatic variant calls only.',
                    'path_filters': ['*_bcftools_stats.txt'],
                    'custom_config': {
                        'write_general_stats': True
                    },
                }
            },
            {
                'bcftools': {
                    'name': 'Bcftools (germline)',
                    'info': 'Bcftools stats for germline variant calls only.',
                    'path_filters': ['*_bcftools_stats_germline.txt'],
                    'custom_config': {
                        'write_general_stats': False
                    },
                }
            },
        ])
    else:
        module_order.append("bcftools")
    module_order.extend([
        "salmon", "star", "picard", "qualimap", "snpeff", "bismark", "fastqc",
        "preseq"
    ])
    out["module_order"] = module_order

    preseq_samples = [
        s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)
    ]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out,
                       out_handle,
                       default_flow_style=False,
                       allow_unicode=False)
    return out_file
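For context, extra_fn_clean_trim tells MultiQC to strip these suffixes when deriving sample names from file names; the following is only a rough, hypothetical approximation of that cleanup, not MultiQC's actual implementation.

# Rough approximation only; MultiQC's own filename cleaning is more involved.
extra_fn_clean_trim = ["coverage.mosdepth.region.dist", "coverage.mosdepth.global.dist"]
fname = "sample1.coverage.mosdepth.region.dist"  # hypothetical mosdepth output name
sample = fname
for trim in extra_fn_clean_trim:
    sample = sample.replace(trim, "").rstrip(".")
print(sample)  # sample1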
Example #8
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

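    # Choose a usable VCF for the batch: prefer germline calls (extracting them for
    # tumor samples) and require the sample to be present with non-filtered variants.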
    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[key] and vcfutils.vcf_has_nonfiltered_variants(vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0 and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and n_features=") >= 0) or
                            (l.find("n_components=") >= 0 and l.find("must be between 1 and min") >= 0) or
                            (l.find("Input contains NaN, infinity or a value too large for dtype") >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all([all_line_errors(l) for l in to_show]):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
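The stderr triage above follows a small pattern worth noting: keep only the tail of the log in a bounded deque, then decide whether the failure matches an expected "no overlapping variants" case. A minimal standalone sketch (log path and messages hypothetical, mirroring the substring checks above):

import collections

def is_allowed_error(line):
    # Mirrors the checks above: index-out-of-bounds and PCA component-count errors.
    return (("IndexError" in line and "is out of bounds for axis" in line) or
            ("n_components=" in line and "must be between 1 and n_features=" in line))

to_show = collections.deque(maxlen=100)  # retain only the last 100 stderr lines
with open("run-stderr.log") as in_handle:  # hypothetical stderr log path
    for line in in_handle:
        to_show.append(line)
if any(is_allowed_error(l) for l in to_show):
    print("expected failure: no overlapping variants, skip peddy for this batch")
else:
    print("unexpected failure, re-raise")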