Example 1
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (depth bed, callable bed subset to variant regions,
    high-depth regions bed, average coverage over variant regions).
    """
    # goleft tuning: depth/parallelization window sizes, minimum depth for a
    # position to count as callable, and the multiplier used by _get_max_depth.
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    # Outputs live under <work>/align/<sample>/<sample>-coverage.*
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    # Recompute only when the callable output is older than the input BAM.
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                # goleft names its outputs from --prefix, producing both the
                # depth bed (tracked by file_transaction) and the callable bed.
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                # NOTE(review): presumably builds a FASTA index restricted to
                # contigs present in the BAM so goleft's reference matches the
                # alignment — confirm against bam.fai_from_bam.
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # file_transaction only tracks the depth bed; move the callable
                # bed to its final location manually.
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
Example 2
0
def calculate(bam_file, data):
    """Run mosdepth in parallel to derive per-region coverage and callability.

    Duplicate and secondary reads are excluded from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (callable regions bed subset to variant regions,
    {target name -> {mosdepth output kind -> file}}).
    """
    depth_min = dd.get_coverage_depth_min(data)
    # Fall back to whole-genome regions when no variant regions are configured.
    vr_bed = dd.get_variant_regions_merged(data) or _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    sample = dd.get_sample_name(data)
    align_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample))
    callable_file = os.path.join(align_dir, "%s-coverage.callable.bed" % sample)
    depth_files = {}
    if not utils.file_uptodate(callable_file, bam_file):
        # Quantize depth into NO_COVERAGE (0), LOW_COVERAGE and CALLABLE buckets.
        quantize_spec = ("0:1:%s:" % depth_min, ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        targets = [("variant_regions", vr_bed, quantize_spec, None),
                   ("sv_regions", regions.get_sv_bed(data), None, None),
                   ("coverage", dd.get_coverage(data), None, DEPTH_THRESHOLDS)]
        for name, region_bed, quantize, thresholds in targets:
            if not region_bed:
                continue
            info = run_mosdepth(data, name, region_bed, quantize=quantize, thresholds=thresholds)
            # Record only the mosdepth outputs that were actually produced.
            outputs = {}
            for key in ("dist", "regions", "thresholds"):
                value = getattr(info, key, None)
                if value:
                    outputs[key] = value
            depth_files[name] = outputs
            # The quantized variant-regions output doubles as the callable file.
            if name == "variant_regions":
                callable_file = info.quantize
    final_callable = _subset_to_variant_regions(callable_file, vr_bed, data)
    return final_callable, depth_files
Example 3
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using mosdepth.

    Removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (callable regions bed subset to variant regions,
    {target name -> {mosdepth output kind -> file}}).
    """
    params = {"min": dd.get_coverage_depth_min(data)}  # minimum depth to count as callable
    variant_regions = dd.get_variant_regions_merged(data)
    # Fall back to whole-genome regions when no variant regions are configured.
    if not variant_regions:
        variant_regions = _create_genome_regions(data)
    # Back compatible with previous pre-mosdepth callable files
    callable_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                                 dd.get_sample_name(data))),
                                 "%s-coverage.callable.bed" % (dd.get_sample_name(data)))
    if not utils.file_uptodate(callable_file, bam_file):
        # Quantize depth into NO_COVERAGE (0), LOW_COVERAGE and CALLABLE buckets.
        vr_quantize = ("0:1:%s:" % (params["min"]), ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
        # (target name, regions BED, quantize spec, depth thresholds); sv/coverage
        # BEDs are normalized via bedutils and skipped below when unset.
        to_calculate = [("variant_regions", variant_regions, vr_quantize, None),
                        ("sv_regions", bedutils.clean_file(regions.get_sv_bed(data), data), None, None),
                        ("coverage", bedutils.clean_file(dd.get_coverage(data), data), None, DEPTH_THRESHOLDS)]
        depth_files = {}
        for target_name, region_bed, quantize, thresholds in to_calculate:
            if region_bed:
                cur_depth = {}
                depth_info = run_mosdepth(data, target_name, region_bed, quantize=quantize, thresholds=thresholds)
                # Record only the mosdepth outputs that were actually produced.
                for attr in ("dist", "regions", "thresholds"):
                    val = getattr(depth_info, attr, None)
                    if val:
                        cur_depth[attr] = val
                depth_files[target_name] = cur_depth
                # The quantized variant-regions output doubles as the callable file.
                if target_name == "variant_regions":
                    callable_file = depth_info.quantize
    else:
        depth_files = {}
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return final_callable, depth_files
Example 4
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (depth bed, callable bed, high-depth regions bed,
    average coverage over variant regions).
    """
    # goleft tuning: depth/parallelization window sizes, minimum callable depth
    # and the multiplier feeding the high-depth cutoff (via _get_max_depth).
    params = {
        "window_size": 5000,
        "parallel_window_size": 1e5,
        "min": dd.get_coverage_depth_min(data),
        "high_multiplier": 20
    }
    # Outputs live under <work>/align/<sample>/<sample>-coverage.*
    prefix = os.path.join(
        utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data,
                                                   bam_file,
                                                   variant_regions,
                                                   "variant_regions",
                                                   file_prefix=prefix)
    # Recompute only when the depth output is older than the input BAM.
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = [
            "goleft", "depth", "--windowsize",
            str(params["window_size"]), "--q", "1", "--mincov",
            str(params["min"]), "--reference", ref_file, "--processes",
            str(dd.get_num_cores(data)), "--stats", "--ordered"
        ]
        if variant_regions:
            # Pre-window the variant regions so goleft can parallelize over them.
            window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(
                out_file)[0]
            if not utils.file_uptodate(window_file, bam_file):
                with file_transaction(data, window_file) as tx_out_file:
                    pybedtools.BedTool().window_maker(
                        w=params["parallel_window_size"],
                        b=pybedtools.BedTool(variant_regions)).saveas(
                            tx_out_file)
            cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                # goleft names its outputs from --prefix; file_transaction only
                # tracks the depth bed, so the callable bed is moved manually.
                tx_callable_file = tx_out_file.replace(".depth.bed",
                                                       ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                do.run(cmd,
                       "Calculate coverage: %s" % dd.get_sample_name(data))
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(
        callable_file, data), variant_regions_avg_cov
Example 5
0
def summary(items):
    """Build a chanjo coverage database for a batch of samples.

    Combines configured coverage and priority regions into one cleaned BED,
    builds a chanjo database from it and annotates per-sample coverage at the
    configured minimum depth cutoff.

    items: list of sample data dictionaries belonging to one batch.
    Returns items wrapped as [[data], ...] with a "coverage" entry added when
    the database exists, or the unmodified items when neither coverage nor
    priority regions are configured.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, "Did not find batch for samples: %s" % ",".join([dd.get_sample_name(x) for x in items])
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        if coverage_bed:
            mini_coverage = bed.minimize(coverage_bed).fn
        if priority_bed:
            mini_priority = bed.minimize(priority_bed).fn
        if coverage_bed and priority_bed:
            combined_bed = bed.concat([mini_coverage, mini_priority]).fn
        elif coverage_bed:
            combined_bed = mini_coverage
        elif priority_bed:
            combined_bed = mini_priority
        else:  # no coverage or priority file has been set
            return items
        # combined_bed is a file path (str) in every branch above. The previous
        # `... if len(combined_bed) > 0 else combined_bed.fn` guard measured the
        # length of the *path* (always > 0) and its else branch would have
        # raised AttributeError (str has no .fn), so clean unconditionally.
        clean_bed = bedutils.clean_file(combined_bed, data)
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)

        if bed_file and utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                # Use the chanjo executable installed alongside the interpreter.
                chanjo = os.path.join(os.path.dirname(sys.executable), "chanjo")
                cmd = "{chanjo} --db {tx_out_file} build {bed_file}"
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import"
                    )
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        # Remove the uniquified temporary BED once the database is built.
        if bed_file:
            os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out
Example 6
0
def calc_callable_loci(data, region=None, out_file=None):
    """Identify callable positions for an input BAM within an optional region.
    """
    bam_file = data["work_bam"]
    if out_file is None:
        out_file = "%s-callable.bed" % os.path.splitext(bam_file)[0]
    depth = {"min": dd.get_coverage_depth_min(data)}
    if not utils.file_exists(out_file):
        ref_file = tz.get_in(["reference", "fasta", "base"], data)
        region_file, needs_calc = _regions_for_coverage(data, region, ref_file, out_file)
        if needs_calc:
            coverage_file = _get_coverage_file(bam_file, ref_file, region, region_file,
                                               depth, out_file, data)
            _group_by_ctype(coverage_file, depth, region, region_file, out_file, data)
        else:
            # special case, do not calculate if we are in a chromosome not covered by BED file
            with file_transaction(data, out_file) as tx_out_file:
                shutil.move(region_file, tx_out_file)
    return [{"callable_bed": out_file, "config": data["config"], "work_bam": data["work_bam"]}]
Example 7
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Returns (depth bed, callable bed, high-depth regions bed,
    average coverage over variant regions).
    """
    # goleft tuning: depth/parallelization window sizes, minimum callable depth
    # and the multiplier feeding the high-depth cutoff (via _get_max_depth).
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    # Outputs live under <work>/align/<sample>/<sample>-coverage.*
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    # Recompute only when the depth output is older than the input BAM.
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        # Pre-window the calculation regions so goleft can parallelize over them.
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                # No configured variant regions: window over all non-alt contigs.
                if not variant_regions:
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                # goleft names its outputs from --prefix; file_transaction only
                # tracks the depth bed, so the callable bed is moved manually.
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
Example 8
0
def summary(items):
    """Build a chanjo coverage database for a batch of samples.

    Concatenates configured coverage and priority regions into one cleaned
    BED, builds a chanjo database from it and annotates per-sample coverage
    at the configured minimum depth cutoff.

    items: list of sample data dictionaries belonging to one batch.
    Returns items wrapped as [[data], ...] with a "coverage" entry added
    when the database exists.
    """
    data = items[0]
    cutoff = dd.get_coverage_depth_min(data)
    work_dir = dd.get_work_dir(data)
    out_dir = utils.safe_makedir(os.path.join(work_dir, "coverage"))
    coverage_bed = dd.get_coverage_regions(data)
    priority_bed = dd.get_priority_regions(data)
    batch = _get_group_batch(items)
    assert batch, ("Did not find batch for samples: %s" %
                   ",".join([dd.get_sample_name(x) for x in items]))
    out_file = os.path.join(out_dir, "%s-coverage.db" % batch)
    if not utils.file_exists(out_file):
        combined_bed = bed.concat([coverage_bed, priority_bed])
        # Only clean when there are intervals; otherwise pass the raw file on.
        clean_bed = bedutils.clean_file(
            combined_bed.fn,
            data) if len(combined_bed) > 0 else combined_bed.fn
        bed_file = _uniquify_bed_names(clean_bed, out_dir, data)
        if utils.file_exists(bed_file):
            with file_transaction(data, out_file) as tx_out_file:
                # Use the chanjo executable installed alongside the interpreter.
                chanjo = os.path.join(os.path.dirname(sys.executable),
                                      "chanjo")
                cmd = ("{chanjo} --db {tx_out_file} build {bed_file}")
                do.run(cmd.format(**locals()), "Prep chanjo database")
                for data in items:
                    sample = dd.get_sample_name(data)
                    bam_file = data["work_bam"]
                    cmd = (
                        "{chanjo} annotate -s {sample} -g {batch} -c {cutoff} "
                        "{bam_file} {bed_file} | "
                        "{chanjo} --db {tx_out_file} import")
                    do.run(cmd.format(**locals()), "Chanjo coverage", data)
        # Guarded removal: the previous unconditional os.remove raised
        # FileNotFoundError when the uniquified BED was never created.
        if bed_file and os.path.exists(bed_file):
            os.remove(bed_file)
    coverage = regions_coverage(out_file, batch, out_dir)
    problem_regions = dd.get_problem_region_dir(data)
    if problem_regions:
        coverage = decorate_problem_regions(coverage, problem_regions)
    out = []
    for data in items:
        if utils.file_exists(out_file):
            data["coverage"] = {"summary": out_file, "all": coverage}
        out.append([data])
    return out