Ejemplo n.º 1
0
def _prep_config(items, paired, work_dir):
    """Configure a Manta run directory and return the path to its workflow script.

    ``configManta.py`` is only invoked when ``runWorkflow.py`` is absent from
    ``work_dir`` or ``_out_of_date`` flags the existing one.

    items -- sample data dictionaries; each contributes a ``--bam`` input when
        the samples are not paired
    paired -- tumor/normal pairing information, or a false value for
        unpaired samples
    work_dir -- directory handed to Manta as ``--runDir``
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    workflow_script = os.path.join(work_dir, "runWorkflow.py")
    # Reuse an existing, current workflow script instead of reconfiguring.
    if utils.file_exists(workflow_script) and not _out_of_date(workflow_script):
        return workflow_script

    config_script = os.path.realpath(utils.which("configManta.py"))
    cmd = [utils.get_program_python("configManta.py"), config_script]

    # Inputs: tumor (plus normal when available) for paired runs, otherwise
    # one --bam per sample. The sample used for the remaining settings is the
    # tumor for paired runs and the first item otherwise.
    if paired:
        if paired.normal_bam:
            cmd.append("--normalBam=%s" % paired.normal_bam)
        cmd.append("--tumorBam=%s" % paired.tumor_bam)
        data = paired.tumor_data
    else:
        cmd.extend(["--bam=%s" % dd.get_align_bam(item) for item in items])
        data = items[0]

    cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    cmd.append("--runDir=%s" % work_dir)
    # Anything that is not whole genome coverage is run in exome mode.
    if dd.get_coverage_interval(data) != "genome":
        cmd.append("--exome")
    for region in _maybe_limit_chromosomes(data):
        cmd.extend(["--region", region])

    # User supplied command line options from the "manta" resources section.
    extra_options = config_utils.get_resources("manta", data["config"]).get("options")
    if extra_options:
        cmd.extend([str(opt) for opt in extra_options])

    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        cmd.extend(["--config", _prep_streamlined_config(config_script, work_dir)])

    do.run(cmd, "Configure manta SV analysis")
    return workflow_script
Ejemplo n.º 2
0
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    Coverage is calculated for ``bam_file`` and only regions named "CALLABLE"
    are kept. When "noalt_calling" is in the sample's tools_on setting or
    "altcontigs" is in its exclude regions, regions on chromosomes for which
    ``chromhacks.is_nonalt`` is false are dropped as well. If variant regions
    are configured, the result is intersected with them.

    bam_file -- alignment file; its name (minus extension) determines the
        output BED file name
    ref_file -- not used by this function
    data -- sample dictionary read through the ``dd`` accessors

    Returns a ``CovInfo`` namedtuple ``(callable, raw_callable, depth_files)``
    holding the output BED, the BED from ``coverage.calculate`` and its depth
    files.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo",
                                     "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(
        data) or "altcontigs" in dd.get_exclude_regions(data)

    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling
                                         or chromhacks.is_nonalt(r.chrom))

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                if input_regions_bed:
                    # The output is being rebuilt because it is stale relative
                    # to callable_bed, so it must always be written here.
                    # Gating the write on a second up-to-date check against
                    # the regions BED could leave the transaction without any
                    # output and keep the stale file in place.
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    filter_regions.intersect(
                        input_regions,
                        nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Ejemplo n.º 3
0
def _prep_config(items, paired, work_dir):
    """Set up the Manta run directory, returning the generated workflow script.

    Runs ``configManta.py`` with the current Python interpreter unless
    ``runWorkflow.py`` already exists in ``work_dir`` and ``_out_of_date``
    does not flag it.

    items -- sample data dictionaries, supplying ``--bam`` inputs for
        unpaired runs
    paired -- tumor/normal pairing information, or a false value when the
        samples are not paired
    work_dir -- directory passed to Manta as ``--runDir``
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    run_script = os.path.join(work_dir, "runWorkflow.py")
    needs_config = not utils.file_exists(run_script) or _out_of_date(run_script)
    if not needs_config:
        return run_script

    config_script = os.path.realpath(utils.which("configManta.py"))
    cmd = [sys.executable, config_script]

    # Alignment inputs depend on whether this is a tumor/normal analysis.
    if not paired:
        bam_args = ["--bam=%s" % dd.get_align_bam(sample) for sample in items]
    elif paired.normal_bam:
        bam_args = ["--normalBam=%s" % paired.normal_bam,
                    "--tumorBam=%s" % paired.tumor_bam]
    else:
        bam_args = ["--tumorBam=%s" % paired.tumor_bam]
    cmd.extend(bam_args)

    # Remaining settings come from the tumor sample, or the first item when
    # there is no pairing.
    if paired:
        data = paired.tumor_data
    else:
        data = items[0]
    cmd.extend(["--referenceFasta=%s" % dd.get_ref_file(data),
                "--runDir=%s" % work_dir])

    # Only whole genome coverage avoids exome mode.
    if dd.get_coverage_interval(data) != "genome":
        cmd.append("--exome")
    for chrom in _maybe_limit_chromosomes(data):
        cmd.extend(["--region", chrom])

    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        streamlined = _prep_streamlined_config(config_script, work_dir)
        cmd.extend(["--config", streamlined])

    do.run(cmd, "Configure manta SV analysis")
    return run_script
Ejemplo n.º 4
0
def _add_config_regions(nblock_regions, ref_regions, data):
    """Extend nblock regions with the areas outside the configured call regions.

    Anything in ``ref_regions`` not covered by the sample's variant regions is
    treated as a region we should not analyze. Chromosomes rejected by
    ``chromhacks.is_nonalt`` are added too when "noalt_calling" is in the
    tools_on setting or "altcontigs" is in the exclude regions.

    nblock_regions -- BedTool of regions already marked as not analyzed
    ref_regions -- BedTool of the reference regions
    data -- sample dictionary read through the ``dd`` accessors

    Returns the merged BedTool of all regions to skip. Raises ValueError when
    subtracting the variant regions leaves the reference regions unchanged.
    """
    variant_bed = dd.get_variant_regions(data)
    if not variant_bed:
        combined = nblock_regions
    else:
        targets = pybedtools.BedTool(variant_bed)
        # A single region is not subtracted correctly, so supply it twice.
        if len(targets) == 1:
            only_region = str(targets[0]).strip()
            duplicated = "%s\n%s" % (only_region, only_region)
            targets = pybedtools.BedTool(duplicated, from_string=True)
        outside_targets = ref_regions.subtract(targets, nonamecheck=True)
        # Nothing subtracted means no target overlapped the reference.
        if outside_targets == ref_regions:
            message = ("Input variant_region file (%s) "
                       "excludes all genomic regions. Do the chromosome names "
                       "in the BED file match your genome (chr1 vs 1)?" %
                       variant_bed)
            raise ValueError(message)
        combined = _combine_regions([outside_targets, nblock_regions],
                                    ref_regions)

    skip_alts = ("noalt_calling" in dd.get_tools_on(data)
                 or "altcontigs" in dd.get_exclude_regions(data))
    if skip_alts:
        from bcbio.heterogeneity import chromhacks

        def on_alt_chrom(r):
            return not chromhacks.is_nonalt(r.chrom)

        alt_regions = ref_regions.filter(on_alt_chrom)
        combined = _combine_regions([combined, alt_regions], ref_regions)
    return combined.merge()
Ejemplo n.º 5
0
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    Regions named "CALLABLE" from the coverage calculation are kept; with
    "noalt_calling" in tools_on or "altcontigs" in the exclude regions, only
    chromosomes accepted by ``chromhacks.is_nonalt`` are retained. Configured
    variant regions, when present, are intersected with the result.

    bam_file -- alignment file; the output BED is named after it
    ref_file -- not used by this function
    data -- sample dictionary read through the ``dd`` accessors

    Returns a ``CovInfo`` namedtuple ``(callable, raw_callable, depth_files)``.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))
    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                # Every path through the transaction writes tx_out_file: the
                # output is already known to be stale relative to
                # callable_bed, so skipping the write based on a second
                # up-to-date check against the regions BED would leave the
                # stale file in place.
                if input_regions_bed:
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
Ejemplo n.º 6
0
def _add_config_regions(nblock_regions, ref_regions, data):
    """Combine nblock regions with regions excluded by the sample configuration.

    Reference regions not covered by the configured variant regions are
    added to the blocks we should not be analyzing. Chromosomes that fail
    ``chromhacks.is_nonalt`` are added as well when "noalt_calling" is in
    tools_on or "altcontigs" is in the exclude regions.

    Returns the merged BedTool of intervals. Raises ValueError if the variant
    regions remove nothing from ``ref_regions``.
    """
    all_intervals = nblock_regions
    regions_file = dd.get_variant_regions(data)
    if regions_file:
        call_regions = pybedtools.BedTool(regions_file)
        if len(call_regions) == 1:
            # work around problem with single region not subtracted correctly.
            line = str(call_regions[0]).strip()
            call_regions = pybedtools.BedTool("%s\n%s" % (line, line), from_string=True)
        uncalled = ref_regions.subtract(call_regions, nonamecheck=True)
        if uncalled == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % regions_file)
        all_intervals = _combine_regions([uncalled, nblock_regions], ref_regions)

    drop_alts = "noalt_calling" in dd.get_tools_on(data)
    if not drop_alts:
        drop_alts = "altcontigs" in dd.get_exclude_regions(data)
    if drop_alts:
        from bcbio.heterogeneity import chromhacks

        def _is_alt(region):
            return not chromhacks.is_nonalt(region.chrom)

        all_intervals = _combine_regions([all_intervals, ref_regions.filter(_is_alt)], ref_regions)
    return all_intervals.merge()
Ejemplo n.º 7
0
def add_highdepth_genome_exclusion(items):
    """Return copies of items, excluding "highdepth" regions for whole genomes.

    Each item is copied with ``utils.deepish_copy``. Copies whose coverage
    interval is "genome" get "highdepth" added to their exclude regions when
    it is not already listed; this avoids slow runtimes on whole genomes.
    """
    updated = []
    for item in items:
        sample = utils.deepish_copy(item)
        if dd.get_coverage_interval(sample) != "genome":
            updated.append(sample)
            continue
        excludes = dd.get_exclude_regions(sample)
        if "highdepth" not in excludes:
            excludes.append("highdepth")
            sample = dd.set_exclude_regions(sample, excludes)
        updated.append(sample)
    return updated
Ejemplo n.º 8
0
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing.
    A contig is treated as problematic when its name has a ':' after the
    first character, or when alt contigs are being skipped ("noalt_calling"
    in tools_on or "altcontigs" in the exclude regions) and
    ``chromhacks.is_nonalt`` rejects it.

    Returns the names of the remaining contigs if at least one problematic
    contig was found; otherwise an empty list, meaning no limit is applied.
    """
    skip_alts = ("noalt_calling" in dd.get_tools_on(data)
                 or "altcontigs" in dd.get_exclude_regions(data))
    standard = []
    found_problem = False
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        name = contig.name
        if name.find(":") > 0 or (skip_alts and not chromhacks.is_nonalt(name)):
            found_problem = True
        else:
            standard.append(name)
    return standard if found_problem else []
Ejemplo n.º 9
0
def _maybe_limit_chromosomes(data):
    """Restrict calling to standard chromosomes when problematic contigs exist.

    HLA contig names contain ':' characters which confuse downstream
    processing. Contigs with a ':' after the first character are skipped, as
    are contigs rejected by ``chromhacks.is_nonalt`` when "noalt_calling" is
    in tools_on or "altcontigs" is in the exclude regions.

    Returns an empty list when nothing was skipped (no limiting needed),
    otherwise the names of the contigs that were kept.
    """
    exclude_alts = "noalt_calling" in dd.get_tools_on(data)
    if not exclude_alts:
        exclude_alts = "altcontigs" in dd.get_exclude_regions(data)

    kept = []
    n_skipped = 0
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        name = contig.name
        if name.find(":") > 0:
            n_skipped += 1
        elif exclude_alts and not chromhacks.is_nonalt(name):
            n_skipped += 1
        else:
            kept.append(name)

    if n_skipped == 0:
        return []
    return kept
Ejemplo n.º 10
0
 def _get_sample_excludes(d):
     """Return the exclude regions for a sample, honoring the remove_lcr setting.

     "lcr" is appended to the list from ``dd.get_exclude_regions`` when
     ``config -> algorithm -> remove_lcr`` is set to a true value.
     """
     regions_to_skip = dd.get_exclude_regions(d)
     legacy_lcr = tz.get_in(("config", "algorithm", "remove_lcr"), d, False)
     # Kept for backwards compatibility with the remove_lcr option.
     if legacy_lcr:
         regions_to_skip.append("lcr")
     return regions_to_skip