Example #1
0
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    for origin in data:
        if  dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
Example #2
0
def check_paired_problems(items):
    """Check for incorrectly paired tumor/normal samples in a batch.

    Does nothing when ``get_paired(items)`` is falsy. Otherwise raises
    ``ValueError`` when the batch holds more than one tumor sample, when it
    holds a normal sample but no tumor, or when mutect / mutect2 / strelka2
    is among the somatic callers and the pair has neither normal data nor a
    normal panel.
    """
    # ensure we're in a paired batch
    if not get_paired(items):
        return

    def _batch_and_names():
        # Only evaluated when an error message is actually being built.
        return (tz.get_in(["metadata", "batch"], items[0]),
                [dd.get_sample_name(data) for data in items])

    num_tumor = sum(1 for x in items if dd.get_phenotype(x).lower() == "tumor")
    if num_tumor > 1:
        raise ValueError(
            "Unsupported configuration: found multiple tumor samples in batch %s: %s"
            % _batch_and_names())
    if num_tumor == 0 and any(
            dd.get_phenotype(data).lower() == "normal" for data in items):
        raise ValueError("Found normal sample without tumor in batch %s: %s" %
                         _batch_and_names())

    vcs = get_somatic_variantcallers(items)
    if not any(caller in vcs for caller in ("mutect", "mutect2", "strelka2")):
        return
    paired = get_paired(items)
    if not paired.normal_data and not paired.normal_panel:
        raise ValueError(
            "MuTect, MuTect2 and Strelka2 somatic calling requires normal sample or panel: %s"
            % [dd.get_sample_name(data) for data in items])
def parallel_calling(data, run_parallel):
    """Run CpG calling and processing split by chromosome.

    This is needed only if running methylated versus hydroxy-methylated
    samples.

    :param data: list of entries; ``entry[0]`` is the sample dictionary
    :param run_parallel: callable invoked as ``run_parallel(name, items)``
        with ``"cpg_calling"`` and then ``"cpg_processing"``; its return
        value is iterated as the processed items

    :returns: None. Results are attached in place: every ``entry[0]`` in
        ``data`` receives ``"cpg_split"`` and ``"hmc_split"`` lists.
    """
    out = []
    for sample in data:
        work_bam = dd.get_work_bam(sample[0])
        with closing(pysam.Samfile(work_bam, "rb")) as pysam_work_bam:
            for chrom in pysam_work_bam.references:
                # Skip references with an underscore in their name *before*
                # deep-copying, so skipped references cost nothing.
                if "_" in chrom:
                    continue
                new_sample = copy.deepcopy(sample)
                new_sample[0]['chr_to_run'] = chrom
                out.append(new_sample)
    out = run_parallel("cpg_calling", out)

    # Give every mC chunk the cpg_file of the first hmC chunk that matches
    # on batch and chromosome.
    for sample in out:
        if dd.get_phenotype(sample[0]) != "mC":
            continue
        batch = dd.get_batch(sample[0])
        for sample2 in out:
            if (batch in dd.get_batch(sample2[0])
                    and dd.get_phenotype(sample2[0]) == "hmC"
                    and sample[0]["chr_to_run"] == sample2[0]["chr_to_run"]):
                sample[0]["control"] = sample2[0]["cpg_file"]
                break
    out = run_parallel("cpg_processing", out)

    # Collect the per-chromosome outputs back onto the original samples.
    for sample in data:
        sample[0]["cpg_split"] = []
        sample[0]["hmc_split"] = []
        name = dd.get_sample_name(sample[0])
        for chunk in out:
            if name != dd.get_sample_name(chunk[0]):
                continue
            sample[0]["cpg_split"].append(chunk[0]["cpg_file"])
            if "hmc_file" in chunk[0]:
                sample[0]["hmc_split"].append(chunk[0]["hmc_file"])
Example #4
0
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    for origin in data:
        if dd.get_batch(sample) in dd.get_batch(
                origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
Example #5
0
def _check(sample, data):
    """Decide whether a sample is processed and attach its input BAM.

    ATAC samples are returned unchanged as ``[sample]``; samples whose
    phenotype is ``"input"`` yield ``None``. For anything else, the first
    ``"input"`` entry in ``data`` for which
    ``dd.get_batch(sample) in dd.get_batch(entry)`` holds supplies
    ``sample["work_bam_input"]``; ``[sample]`` is returned whether or not
    such an entry exists.
    """
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        candidate = origin[0]
        if dd.get_batch(sample) not in dd.get_batch(candidate):
            continue
        if dd.get_phenotype(candidate) != "input":
            continue
        sample["work_bam_input"] = dd.get_work_bam(candidate)
        break
    return [sample]
Example #6
0
def _check(sample, data):
    """Decide whether a sample is processed and attach its input BAM.

    ATAC samples are returned unchanged as ``[sample]``; samples whose
    phenotype is ``"input"`` yield ``None``. Otherwise the first ``"input"``
    entry of ``data`` whose batches (``dd.get_batches``, treated as empty
    when falsy) include the sample's batch supplies
    ``sample["work_bam_input"]`` from its ``"work_bam"`` key. ``[sample]``
    is returned whether or not such an entry exists.
    """
    if dd.get_chip_method(sample).lower() == "atac":
        return [sample]
    if dd.get_phenotype(sample) == "input":
        return None
    for origin in data:
        candidate = origin[0]
        same_batch = dd.get_batch(sample) in (dd.get_batches(candidate) or [])
        if same_batch and dd.get_phenotype(candidate) == "input":
            sample["work_bam_input"] = candidate.get("work_bam")
            break
    return [sample]
Example #7
0
def _get_replicate_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    rep_bam = ""
    for origin in data:
        if  dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(sample) in dd.get_phenotype(origin[0]) and dd.get_work_bam(sample) != dd.get_work_bam(origin[0]) and dd.get_phenotype(origin[0]) != "control":
            if rep_bam != "":
                rep_bam = rep_bam + "," + dd.get_work_bam(origin[0])
            else:
                rep_bam = dd.get_work_bam(origin[0])
    sample["work_bam_rep"] = dd.get_work_bam(origin[0])
    return [sample]
Example #8
0
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    input_bam = ""
    for origin in data:
        if  dd.get_phenotype(origin[0]) == "control":
            if input_bam != "":
                input_bam = input_bam + "," + dd.get_work_bam(origin[0])
            else:
                input_bam = dd.get_work_bam(origin[0])
    sample["work_bam_input"] = input_bam
    return [sample]
Example #9
0
def check_paired_problems(items):
    """Check for incorrectly paired tumor/normal samples in a batch.

    Does nothing when ``get_paired(items)`` is falsy. Otherwise raises
    ``ValueError`` when the batch holds more than one tumor sample, or when
    it holds a normal sample but no tumor.
    """
    # ensure we're in a paired batch
    if not get_paired(items):
        return

    def _batch_and_names():
        # Built lazily: only needed for the error messages.
        return (tz.get_in(["metadata", "batch"], items[0]),
                [dd.get_sample_name(data) for data in items])

    num_tumor = sum(1 for x in items if dd.get_phenotype(x).lower() == "tumor")
    if num_tumor > 1:
        raise ValueError("Unsupported configuration: found multiple tumor samples in batch %s: %s" %
                         _batch_and_names())
    if num_tumor == 0 and any(dd.get_phenotype(data).lower() == "normal" for data in items):
        raise ValueError("Found normal sample without tumor in batch %s: %s" %
                         _batch_and_names())
Example #10
0
def check_paired_problems(items):
    """Check for incorrectly paired tumor/normal samples in a batch.

    Returns silently when ``get_paired(items)`` is falsy. Raises
    ``ValueError`` for a batch with several tumor samples, or with a normal
    sample and no tumor sample.
    """
    # ensure we're in a paired batch
    if not get_paired(items):
        return
    tumors = [x for x in items if dd.get_phenotype(x).lower() == "tumor"]
    if len(tumors) > 1:
        message = "Unsupported configuration: found multiple tumor samples in batch %s: %s"
    elif not tumors and any(dd.get_phenotype(data).lower() == "normal" for data in items):
        message = "Found normal sample without tumor in batch %s: %s"
    else:
        return
    batch_name = tz.get_in(["metadata", "batch"], items[0])
    sample_names = [dd.get_sample_name(data) for data in items]
    raise ValueError(message % (batch_name, sample_names))
Example #11
0
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Samples are grouped by (``align_bam``, batch). Within a group the ``sv``
    entries are concatenated following the caller order recorded under
    ``svcaller_orig`` and stored on the group's first sample, whose
    ``svcaller`` setting is restored from ``svcaller_orig``. Samples
    belonging to several batches are emitted only once, for their first
    batch. Returns a list of single-item lists.
    """
    def _caller_rank(x):
        # Position of the sample's current caller in its original caller list.
        orig_callers = tz.get_in(["config", "algorithm", "svcaller_orig"], x)
        cur_caller = tz.get_in(["config", "algorithm", "svcaller"], x)
        return orig_callers.index(cur_caller)

    by_bam = collections.OrderedDict()
    for x in samples:
        batch = dd.get_batch(x) or [dd.get_sample_name(x)]
        by_bam.setdefault((x["align_bam"], tuple(batch)), []).append(x)

    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        with_sv = [x for x in grouped_calls if "sv" in x]
        with_sv.sort(key=_caller_rank)
        final = grouped_calls[0]
        if with_sv:
            final["sv"] = reduce(operator.add, [x["sv"] for x in with_sv])
        algorithm = final["config"]["algorithm"]
        algorithm["svcaller"] = algorithm.pop("svcaller_orig")
        batch = dd.get_batch(final) or dd.get_sample_name(final)
        batches = batch if isinstance(batch, (list, tuple)) else [batch]
        if len(batches) > 1:
            lead_key = (dd.get_sample_name(final),
                        dd.get_phenotype(final) == "germline")
            lead_batches[lead_key] = batches[0]
        for batch in batches:
            by_batch.setdefault(batch, []).append(final)

    out = []
    for batch, items in by_batch.items():
        if any("svplots" in dd.get_tools_on(d) for d in items):
            items = plot.by_regions(items)
        for data in items:
            lead_key = (dd.get_sample_name(data),
                        dd.get_phenotype(data) == "germline")
            # Keep the sample for its lead batch, or when it has only one batch.
            if lead_batches.get(lead_key) in [batch, None]:
                out.append([data])
    return out
Example #12
0
def _get_vcf_samples(calls, items):
    """Return the names of samples in ``items`` matched by the files in ``calls``.

    A file matches a sample when its basename starts with ``<name>-`` or
    ``<name>.``, where ``<name>`` is the sample name or one of the sample's
    batches, with ``-germline`` appended for samples of phenotype
    ``"normal"``. Once any sample-name match has been seen, batch-name
    matches are no longer added.
    """
    # NOTE(review): this flag is never set to True in this function, so the
    # branch reading sample names from the file itself is not reached.
    have_full_file = False
    all_samples = set()
    sample_matches = False
    for f in utils.flatten(calls):
        if have_full_file:
            cur = set(vcfutils.get_samples(f))
            if cur:
                if all_samples:
                    all_samples &= set(cur)
                else:
                    all_samples = cur
            continue
        for data in items:
            candidates = [dd.get_sample_name(data)] + dd.get_batches(data)
            for i, test_name in enumerate(candidates):
                # For tumor/normal batches, want to attach germline VCFs to normals
                # Standard somatics go to tumors
                if dd.get_phenotype(data) == "normal":
                    test_name += "-germline"
                prefixes = ("%s-" % test_name, "%s." % test_name)
                if not os.path.basename(f).startswith(prefixes):
                    continue
                # Prefer matches to single samples (gVCF) over joint batches
                if i == 0:
                    sample_matches = True
                if not (sample_matches and i > 0):
                    all_samples.add(dd.get_sample_name(data))
    return list(all_samples)
Example #13
0
def _maybe_add_alignment(algorithm, sample, out):
    """Append alignment files (and their indexes) of ``sample`` to ``out``.

    Nothing is added unless ``_has_alignment_file(algorithm, sample)`` is
    true and the sample phenotype is not ``"germline"``. Each existing
    candidate file yields one record; a second record flagged
    ``"index": True`` is added when ``<file><index extension>`` exists.
    Raises ``ValueError`` for a file that ends in none of ``bam``, ``cram``
    or ``bw``. Returns ``out``.
    """
    if not (_has_alignment_file(algorithm, sample)
            and dd.get_phenotype(sample) != "germline"):
        return out
    # (file name suffix, record type, index extension)
    known_types = (("bam", "bam", ".bai"),
                   ("cram", "cram", ".crai"),
                   ("bw", "bw", ".bw"))
    candidates = [(sample.get("work_bam"), "ready", False),
                  (sample.get("umi_bam"), "umi", False),
                  (sample.get("bigwig"), "ready", False),
                  (dd.get_disc_bam(sample), "disc", True),
                  (dd.get_sr_bam(sample), "sr", True)]
    for fname, ext, isplus in candidates:
        if not fname or not os.path.exists(fname):
            continue
        for suffix, ftype, fext in known_types:
            if fname.endswith(suffix):
                break
        else:
            raise ValueError("Unexpected alignment file type %s" %
                             fname)
        out.append({"path": fname, "type": ftype, "plus": isplus, "ext": ext})
        index_file = fname + fext
        if utils.file_exists(index_file):
            out.append({"path": index_file, "type": ftype + fext,
                        "plus": isplus, "index": True, "ext": ext})
    return out
Example #14
0
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Writes ``multiqc_config.yaml`` into ``out_dir`` and returns its path.
    The file carries column visibility, the module order (bcftools split
    into somatic and germline sub-modules when germline calls are present)
    and, for samples configured with preseq, the preseq section.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    visible_columns = {}
    out = {"table_columns_visible": visible_columns}

    # Avoid duplicated bcbio columns with qualimap
    if any(any(tool in dd.get_tools_on(d) for tool in ("qualimap", "qualimap_full"))
           for d in samples):
        visible_columns["bcbio"] = {"Average_insert_size": False}
        visible_columns["FastQC"] = {"percent_gc": False}

    def _has_germline(s):
        # tumor-only somatic with germline extraction,
        # or paired somatic with germline calling for normal
        return ("germline" in (get_active_vcinfo(s) or {})
                or dd.get_phenotype(s) == "germline")

    # Setting the module order
    module_order = ["bcbio", "samtools", "goleft_indexcov", "peddy"]
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    if any(_has_germline(s) for s in samples):
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        somatic = {'bcftools': {
            'name': 'Bcftools (somatic)',
            'info': 'Bcftools stats for somatic variant calls only.',
            'path_filters': ['*_bcftools_stats.txt'],
            'write_general_stats': True,
        }}
        germline = {'bcftools': {
            'name': 'Bcftools (germline)',
            'info': 'Bcftools stats for germline variant calls only.',
            'path_filters': ['*_bcftools_stats_germline.txt'],
            'write_general_stats': False,
        }}
        module_order += [somatic, germline]
    else:
        module_order.append("bcftools")
    module_order += ["picard", "qualimap", "snpeff", "fastqc", "preseq"]
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #15
0
def _maybe_add_alignment(algorithm, sample, out):
    """Append alignment files (and their indexes) of ``sample`` to ``out``.

    Skipped entirely unless ``_has_alignment_file(algorithm, sample)`` is
    true and the sample phenotype is not ``"germline"``. For every candidate
    file that exists, one record is appended, followed by an
    ``"index": True`` record when the file name plus the index extension
    exists. Raises ``ValueError`` for a file that ends in none of ``bam``,
    ``cram`` or ``bw``. Returns ``out``.
    """
    wanted = _has_alignment_file(algorithm, sample) and dd.get_phenotype(sample) != "germline"
    if not wanted:
        return out
    candidates = [(sample.get("work_bam"), "ready", False),
                  (sample.get("umi_bam"), "umi", False),
                  (sample.get("bigwig"), "ready", False),
                  (dd.get_disc_bam(sample), "disc", True),
                  (dd.get_sr_bam(sample), "sr", True)]
    for fname, ext, isplus in candidates:
        if not (fname and os.path.exists(fname)):
            continue
        if fname.endswith("bam"):
            ftype, fext = "bam", ".bai"
        elif fname.endswith("cram"):
            ftype, fext = "cram", ".crai"
        elif fname.endswith("bw"):
            ftype, fext = "bw", ".bw"
        else:
            raise ValueError("Unexpected alignment file type %s" % fname)
        record = {"path": fname, "type": ftype, "plus": isplus, "ext": ext}
        out.append(record)
        if utils.file_exists(fname + fext):
            index_record = {"path": fname + fext, "type": ftype + fext,
                            "plus": isplus, "index": True, "ext": ext}
            out.append(index_record)
    return out
Example #16
0
def batch(samples):
    """CWL: batch together per sample, joint and germline calls for ensemble combination.

    Sets up groups of same sample/batch variant calls for ensemble calling, as
    long as we have more than one caller per group.

    Returns one deep-copied record per group of more than one call, carrying
    ``batch_id``, ``batch_samples`` and a ``variants`` summary, ordered by
    the earliest position of the group's samples in the input.
    """
    samples = [utils.to_single_data(x) for x in samples]
    sample_order = [dd.get_sample_name(x) for x in samples]

    def by_original_order(d):
        positions = [sample_order.index(s) for s in d["batch_samples"] if s in sample_order]
        return min(positions)

    batch_groups = collections.defaultdict(list)
    for data in samples:
        batch_samples = tuple(data.get("batch_samples", [dd.get_sample_name(data)]))
        batch_groups[(batch_samples, dd.get_phenotype(data))].append(data)

    out = []
    for (batch_samples, phenotype), gsamples in batch_groups.items():
        # A single call has nothing to be combined with.
        if len(gsamples) <= 1:
            continue
        batches = set()
        for d in gsamples:
            batches.update(dd.get_batches(d))
        cur = copy.deepcopy(gsamples[0])
        cur["batch_id"] = sorted(batches)[0] if batches else "_".join(batch_samples)
        cur["batch_samples"] = batch_samples
        cur["variants"] = {"variantcallers": [dd.get_variantcaller(d) for d in gsamples],
                           "calls": [d.get("vrn_file") for d in gsamples]}
        out.append(cur)
    out.sort(key=by_original_order)
    return out
Example #17
0
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["precall", "ensemble"
                             ]:  # no batching for precall or ensemble methods
                    if isinstance(
                            batch,
                            basestring) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch,
                                              (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Example #18
0
def _batch_split_by_sv(samples, stage):
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            background.append(data)
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                batches = batch if isinstance(batch, (list, tuple)) else [batch]
                for b in batches:
                    try:
                        to_process[(svcaller, b)].append(x)
                    except KeyError:
                        to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Example #19
0
def _get_vcf_samples(calls, items):
    """Return the names of samples in ``items`` matched by the files in ``calls``.

    A file is matched against each sample's name and then its batches; a
    candidate matches when the file basename starts with ``<candidate>-`` or
    ``<candidate>.`` (``-germline`` is appended to the candidate for
    ``"normal"`` samples). After the first sample-name match, matches on
    batch names are ignored.
    """
    # NOTE(review): never reassigned below, so the ``if have_full_file``
    # branch is not reached.
    have_full_file = False
    all_samples = set()
    sample_matches = False
    for f in utils.flatten(calls):
        if not have_full_file:
            for data in items:
                names = [dd.get_sample_name(data)] + dd.get_batches(data)
                for i, test_name in enumerate(names):
                    # For tumor/normal batches, want to attach germline VCFs to normals
                    # Standard somatics go to tumors
                    if dd.get_phenotype(data) == "normal":
                        test_name += "-germline"
                    matched = os.path.basename(f).startswith(("%s-" % test_name,
                                                              "%s." % test_name))
                    if not matched:
                        continue
                    # Prefer matches to single samples (gVCF) over joint batches
                    if i == 0:
                        sample_matches = True
                    if i > 0 and sample_matches:
                        continue
                    all_samples.add(dd.get_sample_name(data))
        else:
            cur = set(vcfutils.get_samples(f))
            if not cur:
                continue
            if not all_samples:
                all_samples = cur
            else:
                all_samples &= set(cur)
    return list(all_samples)
Example #20
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with ``"qc"`` (files per tool, seeded from the
            sample's existing QC summary) and ``"metrics"`` (merged metrics
            plus sample name and quality format)
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))

    def _files_from(result):
        """Merge any metrics in ``result`` into ``metrics``; return its file record or None."""
        if not result:
            return None
        if isinstance(result, dict):
            # Two metric layouts:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in result:
                metrics.update(result.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in result:
                metrics.update(result)
            # Files are only reported when a "base" entry is present
            return result if "base" in result else None
        if isinstance(result, basestring) and os.path.exists(result):
            return {"base": result, "secondary": []}
        return None

    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        cur_qc_dir = os.path.join(qc_dir, program_name)
        result = tools[program_name](bam_file, data, cur_qc_dir)
        qc_files = _files_from(result) or _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #21
0
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Writes ``multiqc_config.yaml`` into ``out_dir`` and returns its path.
    The configuration holds column visibility, the module order (with
    bcftools split into somatic and germline sub-modules when germline
    calls are present) and a preseq section for samples configured with
    preseq.

    Future entry point for providing top level configuration of output reports.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": {}}

    # Avoid duplicated bcbio columns with qualimap
    qualimap_names = ("qualimap", "qualimap_full")
    if any(any(name in dd.get_tools_on(d) for name in qualimap_names) for d in samples):
        hidden = out["table_columns_visible"]
        hidden["bcbio"] = {"Average_insert_size": False}
        hidden["FastQC"] = {"percent_gc": False}

    # Setting the module order
    leading = ["bcbio", "samtools", "goleft_indexcov"]
    trailing = ["picard", "qualimap", "snpeff", "fastqc", "preseq"]
    out['bcftools'] = {'write_separate_table': True}
    # if germline calling was performed:
    germline_called = any(
        "germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
        or dd.get_phenotype(s) == "germline"        # or paired somatic with germline calling for normal
        for s in samples)
    if germline_called:
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        bcftools_modules = [
            {'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False,
            }},
        ]
    else:
        bcftools_modules = ["bcftools"]
    out["module_order"] = leading + bcftools_modules + trailing

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #22
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with ``"qc"`` (files per tool) and ``"metrics"``
            (merged metrics plus sample name and quality format)
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        # kraken doesn't need bam; germline samples only get the variants QC
        if (not bam_file and program_name != "kraken") or (
                dd.get_phenotype(data) == "germline" and program_name != "variants"):
            continue
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = tools[program_name](bam_file, data, cur_qc_dir)
        qc_files = None
        if not out:
            pass
        elif isinstance(out, dict):
            has_metrics = "metrics" in out
            if has_metrics:
                # 1. output with {"metrics"} and files ("base")
                metrics.update(out.pop("metrics"))
            if "base" in out:
                # Files (with or without accompanying metrics)
                qc_files = out
            elif not has_metrics:
                # 2. a dictionary of metrics
                metrics.update(out)
        elif isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        qc_files = qc_files or _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Example #23
0
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
    return to_process / len(samples)
Example #24
0
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
    return to_process / len(samples)
Example #25
0
def run(_, data, out_dir):
    """Prepare variants QC analysis: bcftools stats and snpEff output.

    Which bcftools statistics are produced depends on the sample phenotype
    and on whether the active variant call information has a ``"germline"``
    entry. Returns ``{"base": first_output, "secondary": remaining}`` built
    from the non-empty outputs, or ``None`` when there is none.
    """
    stats = []
    vcinfo = get_active_vcinfo(data)
    if vcinfo:
        phenotype = dd.get_phenotype(data)
        if phenotype == "germline":
            stats.append(_bcftools_stats(data, out_dir, germline=True))
        elif phenotype == "normal" and "germline" in vcinfo:
            # Normal sample of a pair: only the germline calls are summarized
            stats.append(_bcftools_stats(data, out_dir, "germline", germline=True))
        else:
            stats.append(_bcftools_stats(data, out_dir))
            if "germline" in vcinfo:
                stats.append(_bcftools_stats(data, out_dir, "germline", germline=True))
    stats.append(_snpeff_stats(data, out_dir))

    found = [item for item in stats if item]
    if not found:
        return None
    return {"base": found[0], "secondary": found[1:]}
Example #26
0
def run(_, data, out_dir):
    """Prepare variants QC analysis: bcftools stats and snpEff output.

    Collects bcftools statistics (somatic and/or germline, depending on the
    sample phenotype and on a ``"germline"`` entry in the active variant
    call information) followed by the snpEff statistics. The non-empty
    outputs are returned as ``{"base": first, "secondary": rest}``; ``None``
    is returned when every output is empty.
    """
    collected = []
    vcinfo = get_active_vcinfo(data)
    if vcinfo:
        phenotype = dd.get_phenotype(data)
        has_germline = "germline" in vcinfo
        if phenotype == "germline":
            collected.append(_bcftools_stats(data, out_dir, germline=True))
        else:
            # Somatic stats for every sample except a normal that carries germline calls
            if not (phenotype == "normal" and has_germline):
                collected.append(_bcftools_stats(data, out_dir))
            if has_germline:
                collected.append(_bcftools_stats(data, out_dir, "germline", germline=True))

    collected.append(_snpeff_stats(data, out_dir))

    kept = [item for item in collected if item]
    if kept:
        return {"base": kept[0], "secondary": kept[1:]}
    return None
Example #27
0
def finalize_sv(samples, config):
    """Combine results from multiple sv callers into a single ordered 'sv' key.

    Samples are grouped by (align_bam, batch). Within a group the 'sv'
    entries are concatenated in the order of the originally configured
    callers and stored on the first item of the group, whose 'svcaller'
    setting is restored from 'svcaller_orig'. Returns a list of
    single-item lists.
    """
    def _caller_position(item):
        # Index of the item's current caller in its original caller list.
        configured = tz.get_in(["config", "algorithm", "svcaller_orig"], item)
        current = tz.get_in(["config", "algorithm", "svcaller"], item)
        return configured.index(current)

    grouped = collections.OrderedDict()
    for sample in samples:
        sample_batch = dd.get_batch(sample) or [dd.get_sample_name(sample)]
        group_key = (sample["align_bam"], tuple(sample_batch))
        grouped.setdefault(group_key, []).append(sample)

    per_batch = collections.OrderedDict()
    lead_batches = {}
    for calls in grouped.values():
        ordered_calls = sorted((c for c in calls if "sv" in c), key=_caller_position)
        merged = calls[0]
        if ordered_calls:
            merged["sv"] = reduce(operator.add, [c["sv"] for c in ordered_calls])
        algorithm = merged["config"]["algorithm"]
        algorithm["svcaller"] = algorithm.pop("svcaller_orig")
        batch_value = dd.get_batch(merged) or dd.get_sample_name(merged)
        if isinstance(batch_value, (list, tuple)):
            batch_names = batch_value
        else:
            batch_names = [batch_value]
        if len(batch_names) > 1:
            # Remember the first batch so a multi-batch sample is emitted once.
            lead_key = (dd.get_sample_name(merged), dd.get_phenotype(merged) == "germline")
            lead_batches[lead_key] = batch_names[0]
        for batch_name in batch_names:
            per_batch.setdefault(batch_name, []).append(merged)

    out = []
    for batch_name, members in per_batch.items():
        if any("svplots" in dd.get_tools_on(member) for member in members):
            members = plot.by_regions(members)
        for member in members:
            lead_key = (dd.get_sample_name(member), dd.get_phenotype(member) == "germline")
            if lead_batches.get(lead_key) in [batch_name, None]:
                out.append([member])
    return out
Example #28
0
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    for sample in samples:
        if dd.get_phenotype(sample[0]) != "control" and dd.get_replicate(sample[0]) == 1:
            to_process += 1.0
    if to_process / len(samples) < 1.0:
        to_process = 1.0
    else:
        to_process = to_process / len(samples)
    return to_process
Example #29
0
def check_paired_problems(items):
    """Check for incorrectly paired tumor/normal samples in a batch.

    Raises ValueError when the batch holds more than one tumor sample, a
    normal sample without any tumor, or uses MuTect/MuTect2/Strelka2 without
    a normal sample or panel. Returns None otherwise.
    """
    # Nothing to validate outside of a paired batch.
    if not get_paired(items):
        return

    def _sample_names():
        return [dd.get_sample_name(data) for data in items]

    def _batch_and_names():
        # Arguments shared by the batch-level error messages.
        return (tz.get_in(["metadata", "batch"], items[0]), _sample_names())

    phenotypes = [dd.get_phenotype(x).lower() for x in items]
    num_tumor = phenotypes.count("tumor")
    if num_tumor > 1:
        raise ValueError("Unsupported configuration: found multiple tumor samples in batch %s: %s" %
                         _batch_and_names())
    if num_tumor == 0 and "normal" in phenotypes:
        raise ValueError("Found normal sample without tumor in batch %s: %s" %
                         _batch_and_names())

    vcs = get_somatic_variantcallers(items)
    if not ("mutect" in vcs or "mutect2" in vcs or "strelka2" in vcs):
        return
    paired = get_paired(items)
    if paired.normal_data or paired.normal_panel:
        return
    raise ValueError("MuTect, MuTect2 and Strelka2 somatic calling requires normal sample or panel: %s" %
                     _sample_names())
Example #30
0
def _select_sample(data, variant_file, work_dir):
    """Select current sample from original call file.

    Writes a bgzipped VCF restricted to the sample into ``work_dir`` using
    ``bcftools view -s`` (skipped when the output is already up to date)
    and returns the result of bgzipping and indexing it.
    """
    sample_name = dd.get_sample_name(data)
    if dd.get_phenotype(data) == "germline":
        variant_file = germline.fix_germline_samplename(variant_file, sample_name, data)

    base_name = utils.splitext_plus(os.path.basename(variant_file))[0]
    out_file = os.path.join(work_dir, "%s-%s.vcf.gz" % (base_name, sample_name))
    if not utils.file_uptodate(out_file, variant_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = "bcftools view -s {sample_name} -O z -o {tx_out_file} {variant_file}"
            filled_cmd = cmd.format(sample_name=sample_name,
                                    tx_out_file=tx_out_file,
                                    variant_file=variant_file)
            do.run(filled_cmd, "Run manta SV analysis")
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #31
0
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    to_skip = 0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
        elif dd.get_chip_method(sample[0]).lower() == "atac":
            to_process += 1.0
        else:
            to_skip += 1.0
    return (to_process - to_skip) / len(samples)
Example #32
0
def extract_germline_vcinfo(data, out_dir):
    """Extract germline VCFs from existing tumor inputs.

    Only tumor samples are handled. The first variant entry produced by a
    supported caller is returned, with its "germline" value filled in via
    ``germline.extract`` when it is not already set. Returns None when the
    sample is not a tumor or no supported caller is present.
    """
    if dd.get_phenotype(data) not in ["tumor"]:
        return None
    supported_germline = {"vardict", "octopus", "freebayes"}
    for variant_info in _get_variants(data):
        if variant_info.get("variantcaller") not in supported_germline:
            continue
        if not variant_info.get("germline"):
            # Run the extraction on a copy pointed at this caller's VCF.
            tumor_copy = utils.deepish_copy(data)
            tumor_copy["vrn_file"] = variant_info["vrn_file"]
            extracted = germline.extract(tumor_copy, [tumor_copy], out_dir)
            variant_info["germline"] = extracted["vrn_file_plus"]["germline"]
        return variant_info
    return None
Example #33
0
def extract_germline_vcinfo(data, out_dir):
    """Extract germline VCFs from existing tumor inputs.

    Returns the first variant entry called by vardict, octopus or freebayes
    for a tumor sample, adding a "germline" file to it when missing.
    Returns None when nothing applies.
    """
    supported_germline = {"vardict", "octopus", "freebayes"}
    is_tumor = dd.get_phenotype(data) in ["tumor"]
    if not is_tumor:
        return None

    def _with_germline(entry):
        # Fill in the germline file by extracting from a copy of the sample.
        if not entry.get("germline"):
            working = utils.deepish_copy(data)
            working["vrn_file"] = entry["vrn_file"]
            result = germline.extract(working, [working], out_dir)
            entry["germline"] = result["vrn_file_plus"]["germline"]
        return entry

    for entry in _get_variants(data):
        if entry.get("variantcaller") in supported_germline:
            return _with_germline(entry)
    return None
Example #34
0
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    The sample dictionary is updated in place and returned.
    """
    if vcfutils.get_paired_phenotype(data):
        if dd.get_batches(data) and len(items) == 1:
            deprioritized = _remove_prioritization(data["vrn_file"], data)
            indexed = vcfutils.bgzip_and_index(deprioritized, data["config"])
            data["vrn_file_plus"] = {"germline": indexed}
        return data

    if dd.get_phenotype(data) != "germline":
        return data

    sample_name = dd.get_sample_name(data)
    vcf_samples = vcfutils.get_samples(data["vrn_file"])
    suffix = "-germline"
    # Rename only when the single VCF sample is this sample minus the suffix.
    needs_rename = (sample_name.endswith(suffix)
                    and len(vcf_samples) == 1
                    and sample_name.replace(suffix, "") == vcf_samples[0])
    if needs_rename:
        data["vrn_file"] = fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
Example #35
0
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #36
0
def _get_multiplier(samples):
    """Get multiplier to get jobs
       only for samples that have input
    """
    to_process = 1.0
    to_skip = 0
    for sample in samples:
        if dd.get_phenotype(sample[0]) == "chip":
            to_process += 1.0
        elif dd.get_chip_method(sample[0]).lower() == "atac":
            to_process += 1.0
        else:
            to_skip += 1.0
    mult = (to_process - to_skip) / len(samples)
    if mult <= 0:
        mult = 1 / len(samples)
    return max(mult, 1)
Example #37
0
def extract(data, items):
    """Extract germline calls for the given sample, if tumor only.

    For germline calling done separately, fix VCF sample naming to match.
    The sample dictionary is modified in place and returned.
    """
    paired_phenotype = vcfutils.get_paired_phenotype(data)
    if paired_phenotype:
        single_item_batch = dd.get_batches(data) and len(items) == 1
        if single_item_batch:
            germline_vcf = vcfutils.bgzip_and_index(
                _remove_prioritization(data["vrn_file"], data), data["config"])
            data["vrn_file_plus"] = {"germline": germline_vcf}
    elif dd.get_phenotype(data) == "germline":
        sample_name = dd.get_sample_name(data)
        vcf_samples = vcfutils.get_samples(data["vrn_file"])
        # Rename only when the VCF holds exactly one sample whose name is
        # this sample name with the "-germline" text removed.
        if not sample_name.endswith("-germline"):
            return data
        if len(vcf_samples) != 1:
            return data
        if sample_name.replace("-germline", "") == vcf_samples[0]:
            data["vrn_file"] = _fix_germline_samplename(data["vrn_file"], sample_name, data)
    return data
Example #38
0
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #39
0
def peakcall_prepare(data, run_parallel):
    """Entry point for doing peak calling.

    Queues each "chip" sample for every configured peak caller that is
    available, runs the "peakcalling" step in parallel on the queued items
    and syncs the results back into ``data``. Returns ``data`` unchanged
    when nothing was queued.
    """
    caller_fns = get_callers()
    queued = []
    for sample in data:
        candidate = copy.copy(sample[0])
        for caller in dd.get_peakcaller(sample[0]):
            if caller not in caller_fns or dd.get_phenotype(candidate) != "chip":
                continue
            candidate["peak_fn"] = caller
            name = dd.get_sample_name(candidate)
            # candidate is rebound to the pairing result here, so later
            # callers for the same sample see that result.
            candidate = _get_paired_samples(candidate, data)
            if candidate:
                queued.append(candidate)
            else:
                logger.info("Skipping peak calling. No input sample for %s" % name)
    if not queued:
        return data
    after_process = run_parallel("peakcalling", queued)
    return _sync(data, after_process)
Example #40
0
def _batch_split_by_sv(samples, stage):
    """Return
    - to_process = svcaller-batch => [svcaller-sample1, svcaller-sample2...] odict
    - extras = samples without sv calling (should there be any?)
    - background - all samples
    """
    to_process = collections.OrderedDict()
    extras = []
    background = []
    for data in (utils.to_single_data(x) for x in samples):
        # data = sample
        ready_data = _handle_multiple_svcallers(data, stage)
        if len(ready_data) > 0:
            # why appending every sample to background?
            background.append(data)
            # x is sample - sv caller pair
            for x in ready_data:
                svcaller = tz.get_in(["config", "algorithm", "svcaller"], x)
                batch = dd.get_batch(x) or dd.get_sample_name(x)
                if stage in ["ensemble"]:  # no batching for ensemble methods
                    if isinstance(batch, six.string_types
                                  ) and batch != dd.get_sample_name(x):
                        batch += "_%s" % dd.get_sample_name(x)
                    else:
                        batch = dd.get_sample_name(x)
                    if dd.get_phenotype(x) == "germline":
                        batch += "_germline"
                elif svcaller in _GLOBAL_BATCHING:  # All samples batched together for analyses
                    batch = "all"
                # just creating PON - no calling
                if stage in ["standard"] and batch in ["pon_build"]:
                    extras.append(x)
                else:
                    batches = batch if isinstance(batch,
                                                  (list, tuple)) else [batch]
                    for b in batches:
                        try:
                            to_process[(svcaller, b)].append(x)
                        except KeyError:
                            to_process[(svcaller, b)] = [x]
        else:
            extras.append([data])
    return to_process, extras, background
Example #41
0
def peakcall_prepare(data, run_parallel):
    """Entry point for doing peak calling.

    Collects "chip" samples for each available configured peak caller, runs
    the "peakcalling" step on them through ``run_parallel`` and merges the
    results into ``data`` with ``_sync``. ``data`` is returned as is when no
    sample qualifies.
    """
    caller_fns = get_callers()
    to_process = []
    for sample in data:
        mimic = copy.copy(sample[0])
        for caller in dd.get_peakcaller(sample[0]):
            wanted = caller in caller_fns and dd.get_phenotype(mimic) == "chip"
            if not wanted:
                continue
            mimic["peak_fn"] = caller
            name = dd.get_sample_name(mimic)
            # mimic now holds the pairing result; the next caller in this
            # loop is checked against it.
            mimic = _get_paired_samples(mimic, data)
            if not mimic:
                logger.info(
                    "Skipping peak calling. No input sample for %s" % name)
                continue
            to_process.append(mimic)
    if to_process:
        data = _sync(data, run_parallel("peakcalling", to_process))
    return data
Example #42
0
def splicecall_prepare(data, run_parallel):
    """Entry point for doing alternative splice callers.

    Queues first-replicate, non-control samples configured for the "rmats"
    caller, runs the "splicecalling" step on them through ``run_parallel``
    and syncs the results back into ``data``.
    """
    # Result is not used below; the call is kept as in the original flow.
    dd.get_gtf_file(data)
    caller_fns = get_callers()
    caller = "rmats"
    queued = []
    for sample in data:
        if dd.get_replicate(sample[0]) != 1:
            continue
        mimic = copy.copy(sample[0])
        if caller not in dd.get_splicecaller(sample[0]):
            continue
        if caller not in caller_fns or dd.get_phenotype(mimic) == "control":
            continue
        mimic["rmats_fn"] = caller
        name = dd.get_sample_name(mimic)
        # Return value is unused; the call is kept because it receives
        # mimic and may update it -- TODO confirm against the helper.
        _get_replicate_samples(mimic, data)
        mimic = _get_paired_samples(mimic, data)
        if mimic:
            queued.append(mimic)
        else:
            logger.info("Skipping alternative splice calling. No input sample for %s" % name)
    if not queued:
        return data
    after_process = run_parallel("splicecalling", queued)
    return _sync(data, after_process)
Example #43
0
def _create_config_file(out_dir, samples):
    """Provide configuration file hiding duplicate columns.

    Future entry point for providing top level configuration of output reports.

    Writes ``multiqc_config.yaml`` into ``out_dir`` and returns its path.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    visible_columns = {}
    out = {"table_columns_visible": visible_columns}

    def _tool_enabled(name):
        return any(name in dd.get_tools_on(d) for d in samples)

    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
           for d in samples):
        # Hiding metrics duplicated by Qualimap
        visible_columns["bcbio"] = {"Average_insert_size": False}
        visible_columns["FastQC"] = {"percent_gc": False}

        # Thresholds for Qualimap depth cutoffs are based on sample average depths
        avg_depths = [depth for depth in
                      (tz.get_in(["summary", "metrics", "Avg_coverage"], s) for s in samples)
                      if depth]
        top_depth = max(avg_depths) if avg_depths else None
        # All thresholds up to the highest sample average depth...
        thresholds = [t for t in coverage.DEPTH_THRESHOLDS
                      if top_depth is None or t <= top_depth]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        # Show only thresholds surrounding any of the average depths
        thresholds_hidden = []
        last_index = len(thresholds) - 1
        for i, t in enumerate(thresholds):
            if not t > 20:  # Not hiding anything below 20x
                continue
            if i >= 1 and any(thresholds[i - 1] <= c < t for c in avg_depths):
                continue
            if i < last_index and any(t <= c < thresholds[i + 1] for c in avg_depths):
                continue
            thresholds_hidden.append(t)

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not _tool_enabled("qualimap_full"):
            thresholds_hidden = sorted(thresholds + thresholds_hidden)
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden]}

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    visible_columns["Peddy"] = {"family_id": False, "sex_het_ratio": False,
                                "error_sex_check": False}

    # Setting the module order
    module_order = ["bcbio", "samtools", "goleft_indexcov", "peddy"]
    out['bcftools'] = {'write_separate_table': True}
    germline_called = any(
        "germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
        or dd.get_phenotype(s) == "germline"        # paired somatic with germline calling for normal
        or _has_bcftools_germline_stats(s)          # CWL organized statistics
        for s in samples)
    if germline_called:
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        somatic_module = {'bcftools': {
            'name': 'Bcftools (somatic)',
            'info': 'Bcftools stats for somatic variant calls only.',
            'path_filters': ['*_bcftools_stats.txt'],
            'write_general_stats': True,
        }}
        germline_module = {'bcftools': {
            'name': 'Bcftools (germline)',
            'info': 'Bcftools stats for germline variant calls only.',
            'path_filters': ['*_bcftools_stats_germline.txt'],
            'write_general_stats': False,
        }}
        module_order += [somatic_module, germline_module]
    else:
        module_order.append("bcftools")
    module_order += ["salmon", "picard", "qualimap", "snpeff", "fastqc", "preseq"]
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
Example #44
0
def run_peddy(samples, out_dir=None):
    """Run peddy QC on a group of samples and attach its outputs to them.

    A VCF is chosen from the samples' variant calls: it must exist, contain
    the sample name and have non-filtered variants. When peddy cannot or
    should not run, the reason is written to ``<prefix>-failed.log`` and the
    samples are returned untouched.

    Args:
        samples: list of sample dictionaries; the first one supplies the
            batch name, work directory, core count and genome build.
        out_dir: optional output directory. Defaults to
            ``<work_dir>/qc/<batch>/peddy``.

    Returns:
        ``samples`` unchanged when peddy is skipped or fails with a
        tolerated error; otherwise the result of ``dd.set_in_samples``
        applied with the files from ``expected_peddy_files``.

    Raises:
        Exception: the original error from the peddy run is re-raised when
            the captured stderr matches none of the tolerated patterns.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in [
                "tumor"
        ]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            # Tumor samples: fall back to germline calls extracted from the tumor.
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            # Leaves only the key loop; a later sample with
                            # a usable VCF replaces this choice.
                            break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    # A failure log left by an earlier run means peddy is not retried.
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            # The templates below are filled from locals(), so the local
            # variable names in this function must match the placeholders.
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except Exception:
                # Catch Exception rather than everything, so interrupts and
                # interpreter exits are never mistaken for a tolerated
                # peddy failure.
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    # Known peddy failures that are treated as a skip.
                    return (
                        (l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0)
                        or (l.find("n_components=") >= 0
                            and l.find("must be between 1 and min") >= 0)
                        or (l.find(
                            "Input contains NaN, infinity or a value too large for dtype"
                        ) >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                # Tolerated when any of the last 100 stderr lines matches a
                # known failure, or every one is a "no intervals" message.
                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            # Move the outputs that exist out of the temporary directory.
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Example #45
0
def _create_config_file(out_dir, samples):
    """Provide configuration file for multiqc report.

    Writes ``multiqc_config.yaml`` into ``out_dir`` and returns its path.
    """
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    visible_columns = {}
    out = {"table_columns_visible": visible_columns}
    out["extra_fn_clean_trim"] = ["coverage.mosdepth.region.dist",
                                  "coverage.mosdepth.global.dist"]

    # Avoid duplicated bcbio columns with qualimap
    runs_qualimap = any(("qualimap" in dd.get_tools_on(d)
                         or "qualimap_full" in dd.get_tools_on(d))
                        for d in samples)
    if runs_qualimap:
        # Hiding metrics duplicated by Qualimap
        visible_columns["bcbio"] = {"Average_insert_size": False}
        visible_columns["FastQC"] = {"percent_gc": False}

        # Thresholds for Qualimap depth cutoffs are based on sample average depths
        all_depths = (tz.get_in(["summary", "metrics", "Avg_coverage"], s)
                      for s in samples)
        avg_depths = [depth for depth in all_depths if depth]
        # All thresholds up to the highest sample average depth...
        if avg_depths:
            highest = max(avg_depths)
            thresholds = [t for t in coverage.DEPTH_THRESHOLDS if t <= highest]
        else:
            thresholds = [t for t in coverage.DEPTH_THRESHOLDS]
        # ...plus one more
        if len(thresholds) < len(coverage.DEPTH_THRESHOLDS):
            thresholds.append(coverage.DEPTH_THRESHOLDS[len(thresholds)])

        def _brackets_a_depth(i):
            # True when an average depth falls in the interval right below
            # or right above threshold i.
            if i - 1 >= 0 and any(thresholds[i - 1] <= c < thresholds[i]
                                  for c in avg_depths):
                return True
            return i + 1 < len(thresholds) and any(
                thresholds[i] <= c < thresholds[i + 1] for c in avg_depths)

        # Showing only thresholds surrounding any of average depths;
        # nothing below 20x is hidden.
        thresholds_hidden = [t for i, t in enumerate(thresholds)
                             if t > 20 and not _brackets_a_depth(i)]

        # Hide coverage unless running full qualimap, downsampled inputs are confusing
        if not any(("qualimap_full" in dd.get_tools_on(d)) for d in samples):
            thresholds_hidden = sorted(thresholds + thresholds_hidden)
            thresholds = []
        out['qualimap_config'] = {
            'general_stats_coverage': [str(t) for t in thresholds],
            'general_stats_coverage_hidden': [str(t) for t in thresholds_hidden],
        }

    # Avoid confusing peddy outputs, sticking to ancestry and sex prediction
    visible_columns["Peddy"] = {
        "family_id": False,
        "sex_het_ratio": False,
        "error_sex_check": False,
    }

    def _bcftools_module(name, info, path_filter, general_stats):
        # One bcftools submodule entry for the module order.
        return {
            'bcftools': {
                'name': name,
                'info': info,
                'path_filters': [path_filter],
                'custom_config': {
                    'write_general_stats': general_stats
                },
            }
        }

    # Setting the module order
    module_order = ["bcbio", "samtools", "goleft_indexcov", "peddy"]
    out['bcftools'] = {'write_separate_table': True}
    germline_called = any(
        "germline" in (get_active_vcinfo(s) or {})  # tumor-only somatic with germline extraction
        or dd.get_phenotype(s) == "germline"        # paired somatic with germline calling for normal
        or _has_bcftools_germline_stats(s)          # CWL organized statistics
        for s in samples)
    if germline_called:
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table:
        module_order.append(_bcftools_module(
            'Bcftools (somatic)',
            'Bcftools stats for somatic variant calls only.',
            '*_bcftools_stats.txt', True))
        module_order.append(_bcftools_module(
            'Bcftools (germline)',
            'Bcftools stats for germline variant calls only.',
            '*_bcftools_stats_germline.txt', False))
    else:
        module_order.append("bcftools")
    module_order += ["salmon", "star", "picard", "qualimap", "snpeff",
                     "bismark", "fastqc", "preseq"]
    out["module_order"] = module_order

    preseq_samples = [s for s in samples
                      if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle,
                       default_flow_style=False, allow_unicode=False)
    return out_file