Example #1
0
def get_contigs(data):
    """Return the names of the non-alt contigs worth keeping.

    Contigs come from ``shared.get_noalt_contigs(data)``; a name is kept when
    ``chromhacks.is_autosomal`` or ``chromhacks.is_sex`` accepts it. The
    original contig order is preserved.
    """
    selected = []
    for contig in shared.get_noalt_contigs(data):
        name = contig.name
        if chromhacks.is_autosomal(name) or chromhacks.is_sex(name):
            selected.append(name)
    return selected
Example #2
0
def _create_genome_regions(callable_file, data):
    """Write a whole-genome BED file next to ``callable_file``.

    The output path is the extension-stripped ``callable_file`` with a
    ``-genome.bed`` suffix. One ``name<TAB>0<TAB>size`` line is written per
    contig returned by ``shared.get_noalt_contigs``, which is where alt /
    problem contigs (e.g. HLAs) get filtered out.

    Returns the path of the BED file.
    """
    base_name = utils.splitext_plus(callable_file)[0]
    variant_regions = "%s-genome.bed" % base_name
    # The file is written through the transactional path yielded by
    # file_transaction rather than directly to the final location.
    with file_transaction(data, variant_regions) as tx_bed, open(tx_bed, "w") as bed_handle:
        bed_handle.writelines("%s\t%s\t%s\n" % (contig.name, 0, contig.size)
                              for contig in shared.get_noalt_contigs(data))
    return variant_regions
Example #3
0
def _create_genome_regions(callable_file, data):
    """Build a BED file covering each non-alt contig from start to end.

    The file is named after ``callable_file`` (extension removed) plus
    ``-genome.bed``. Contig selection is delegated to
    ``shared.get_noalt_contigs``, so alt / problem contigs such as HLAs are
    not part of the output.

    Returns the path of the generated BED file.
    """
    stem = utils.splitext_plus(callable_file)[0]
    variant_regions = "%s-genome.bed" % stem
    with file_transaction(data, variant_regions) as tx_path:
        with open(tx_path, "w") as handle:
            # Each record spans the full contig: start 0, end = contig size.
            records = ["%s\t%s\t%s\n" % (contig.name, 0, contig.size)
                       for contig in shared.get_noalt_contigs(data)]
            handle.write("".join(records))
    return variant_regions
Example #4
0
def _create_genome_regions(data):
    """Write ``target-genome.bed`` for the sample's coverage directory.

    The directory ``<work_dir>/coverage/<sample_name>`` is created through
    ``utils.safe_makedir``. The BED file holds one ``name<TAB>0<TAB>size``
    line per contig returned by ``shared.get_noalt_contigs``, which is where
    alt / problem contigs (e.g. HLAs) get filtered out.

    Returns the path of the BED file.
    """
    sample_dir = os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data))
    variant_regions = os.path.join(utils.safe_makedir(sample_dir), "target-genome.bed")
    # The file is written through the transactional path yielded by
    # file_transaction rather than directly to the final location.
    with file_transaction(data, variant_regions) as tx_bed, open(tx_bed, "w") as bed_handle:
        bed_handle.writelines("%s\t%s\t%s\n" % (contig.name, 0, contig.size)
                              for contig in shared.get_noalt_contigs(data))
    return variant_regions
Example #5
0
def _create_genome_regions(data):
    """Build a whole-genome BED file under the sample's coverage directory.

    Output goes to ``<work_dir>/coverage/<sample_name>/target-genome.bed``.
    Contig selection is delegated to ``shared.get_noalt_contigs``, so alt /
    problem contigs such as HLAs are not part of the output.

    Returns the path of the generated BED file.
    """
    coverage_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data)))
    variant_regions = os.path.join(coverage_dir, "target-genome.bed")
    with file_transaction(data, variant_regions) as tx_path:
        with open(tx_path, "w") as handle:
            # Each record spans the full contig: start 0, end = contig size.
            records = ["%s\t%s\t%s\n" % (contig.name, 0, contig.size)
                       for contig in shared.get_noalt_contigs(data)]
            handle.write("".join(records))
    return variant_regions
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Outputs live under ``<work_dir>/align/<sample>/`` and are named
    ``<sample>-coverage.depth.bed`` and ``<sample>-coverage.callable.bed``.
    ``goleft depth`` is only invoked when ``utils.file_uptodate`` does not
    report the depth file as current relative to ``bam_file``.

    bam_file -- alignment file handed to ``goleft depth`` as its final argument.
    data -- sample information, read here through the ``dd`` accessors.

    Returns a 4-tuple: the depth BED path, the callable BED path, the result of
    ``_extract_highdepth(callable_file, data)`` and the value returned by
    ``get_average_coverage`` for the variant regions.
    """
    # window_size -> goleft --windowsize; parallel_window_size -> width used by
    # window_maker below; min -> goleft --mincov. high_multiplier is not read
    # directly in this function; the whole dict is handed to _get_max_depth.
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    # May be falsy; that case is handled by the whole-genome fallback below.
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        # BED of windows passed to goleft through --bed.
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    # No target regions supplied: build a whole-genome BED from the
                    # non-alt contigs, written alongside the transactional window file.
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                # Split the regions into windows of parallel_window_size.
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        # --maxmeandepth is only added when _get_max_depth returns a truthy value.
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            # Run from inside the transaction directory.
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                # NOTE: rebinds `prefix` to the transactional prefix, shadowing the
                # final-output prefix computed at the top of the function.
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # The transaction is only given out_file, so the callable BED
                # (presumably produced by goleft at the matching transactional
                # path -- confirm against goleft docs) is moved into place by hand.
                shutil.move(tx_callable_file, callable_file)
    # NOTE(review): callable_file is only (re)created inside the branch above; when
    # out_file is already up to date it is assumed to exist from an earlier run.
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
Example #7
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Outputs live under ``<work_dir>/align/<sample>/`` and are named
    ``<sample>-coverage.depth.bed`` and ``<sample>-coverage.callable.bed``.
    ``goleft depth`` is only invoked when ``utils.file_uptodate`` does not
    report the depth file as current relative to ``bam_file``.

    bam_file -- alignment file handed to ``goleft depth`` as its final argument.
    data -- sample information, read here through the ``dd`` accessors.

    Returns a 4-tuple: the depth BED path, the callable BED path, the result of
    ``_extract_highdepth(callable_file, data)`` and the value returned by
    ``get_average_coverage`` for the variant regions.
    """
    # window_size -> goleft --windowsize; parallel_window_size -> width used by
    # window_maker below; min -> goleft --mincov. high_multiplier is not read
    # directly in this function; the whole dict is handed to _get_max_depth.
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    # May be falsy; that case is handled by the whole-genome fallback below.
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        # BED of windows passed to goleft through --bed.
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    # No target regions supplied: build a whole-genome BED from the
                    # non-alt contigs, written alongside the transactional window file.
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                # Split the regions into windows of parallel_window_size.
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        # --maxmeandepth is only added when _get_max_depth returns a truthy value.
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            # Run from inside the transaction directory.
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                # NOTE: rebinds `prefix` to the transactional prefix, shadowing the
                # final-output prefix computed at the top of the function.
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # The transaction is only given out_file, so the callable BED
                # (presumably produced by goleft at the matching transactional
                # path -- confirm against goleft docs) is moved into place by hand.
                shutil.move(tx_callable_file, callable_file)
    # NOTE(review): callable_file is only (re)created inside the branch above; when
    # out_file is already up to date it is assumed to exist from an earlier run.
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov