Example #1
def _get_larger_chroms(ref_file):
    """Retrieve larger chromosomes, avoiding the smaller ones for plotting.
    """
    from scipy.cluster.vq import kmeans, vq
    all_sizes = []
    for c in ref.file_contigs(ref_file):
        all_sizes.append(float(c.size))
    all_sizes.sort()
    # separate out smaller chromosomes and haplotypes with kmeans
    centroids, _ = kmeans(np.array(all_sizes), 2)
    idx, _ = vq(np.array(all_sizes), centroids)
    little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes)))
    little_sizes = [x[1] for x in little_sizes]
    # create one more cluster with the smaller, removing the haplotypes
    centroids2, _ = kmeans(np.array(little_sizes), 2)
    idx2, _ = vq(np.array(little_sizes), centroids2)
    little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes)))
    little_sizes2 = [x[1] for x in little_sizes2]
    # get any chromosomes not in haplotype/random bin
    thresh = max(little_sizes2)
    larger_chroms = []
    for c in ref.file_contigs(ref_file):
        if c.size > thresh:
            larger_chroms.append(c.name)
    return larger_chroms
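The two-pass kmeans above separates a handful of chromosome-scale contigs from the long tail of haplotype contigs by clustering the sorted sizes twice. A minimal, self-contained sketch of one pass, using synthetic sizes and replacing the tz.partitionby call with itertools.groupby (names and numbers here are illustrative, not from bcbio):

import itertools
import numpy as np
from scipy.cluster.vq import kmeans, vq

def smallest_size_cluster(sizes):
    # cluster sorted sizes into 2 groups; the leading run of identical labels
    # holds the smallest sizes, mirroring the tz.partitionby usage above
    sizes = sorted(float(s) for s in sizes)
    centroids, _ = kmeans(np.array(sizes), 2)
    labels, _ = vq(np.array(sizes), centroids)
    _, group = next(itertools.groupby(zip(labels, sizes), key=lambda x: x[0]))
    return [s for _, s in group]

# four chromosome-scale contigs plus a tail of small haplotype contigs
sizes = [2.4e8, 2.0e8, 1.9e8, 1.5e8, 5e5, 4e5, 3e5, 2e5]
print(smallest_size_cluster(sizes))  # the small-contig cluster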
Example #2
def get_noalt_contigs(data):
    """Retrieve contigs without alternatives as defined in bwa *.alts files.

    If no alt files are present (when we're not aligning with bwa), work around
    with standard set of alts based on hg38 -- anything with HLA, _alt or
    _decoy in the name.
    """
    alts = set([])
    alt_files = [
        f for f in tz.get_in(["reference", "bwa", "indexes"], data, [])
        if f.endswith("alt")
    ]
    if alt_files:
        for alt_file in alt_files:
            with open(alt_file) as in_handle:
                for line in in_handle:
                    if not line.startswith("@"):
                        alts.add(line.split()[0].strip())
    else:
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if ("_alt" in contig.name or "_decoy" in contig.name
                    or contig.name.startswith("HLA-") or ":" in contig.name):
                alts.add(contig.name)
    return [
        c for c in ref.file_contigs(dd.get_ref_file(data))
        if c.name not in alts
    ]
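When no bwa *.alts files are present, the fallback relies purely on hg38 naming conventions. The heuristic in isolation, as a hedged sketch with illustrative contig names:

def looks_like_alt_contig(name):
    # hg38 naming heuristic: alt/decoy contigs, HLA entries, or ':' in the name
    return ("_alt" in name or "_decoy" in name
            or name.startswith("HLA-") or ":" in name)

print(looks_like_alt_contig("chr6_GL000250v2_alt"))  # True
print(looks_like_alt_contig("HLA-A*01:01:01:01"))    # True
print(looks_like_alt_contig("chr1"))                 # False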
Example #3
def _average_genome_coverage(data, bam_file):
    total = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    read_counts = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
    with pysam.Samfile(bam_file, "rb") as pysam_bam:
        read_size = np.median(list(itertools.islice((a.query_length for a in pysam_bam.fetch()), int(1e5))))
    avg_cov = float(read_counts * read_size) / total
    return avg_cov
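The estimate reduces to mapped reads times median read length divided by genome size. A back-of-the-envelope check with invented numbers:

num_reads = 8e8       # hypothetical mapped read count
read_size = 150       # hypothetical median read length (bp)
genome_size = 3.2e9   # roughly the human genome, in bases
print("%.1fx" % (num_reads * read_size / genome_size))  # 37.5x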
Example #4
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
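The subtle step is the header rebuild: svtyper drops ##FILTER lines, so they are captured from the input before piping and fresh ##contig lines are appended. A self-contained sketch of that capture over plain text lines (the example lines are illustrative):

def rebuild_header(vcf_lines, contigs):
    # keep ##FILTER lines from the existing header, stop at the first record,
    # then append ##contig lines built from (name, size) pairs
    out = []
    for line in vcf_lines:
        if not line.startswith("#"):
            break
        if line.startswith("##FILTER"):
            out.append(line)
    out.extend("##contig=<ID=%s,length=%s>\n" % (n, s) for n, s in contigs)
    return out

lines = ["##fileformat=VCFv4.2\n", '##FILTER=<ID=LowQual,Description="low">\n',
         "#CHROM\tPOS\n", "chr1\t100\n"]
print("".join(rebuild_header(lines, [("chr1", 248956422)])))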
Example #5
def _check_bam_contigs(in_bam, ref_file, config):
    """Ensure a pre-aligned BAM file matches the expected reference genome.
    """
    # GATK allows chromosome M to be in multiple locations, skip checking it
    allowed_outoforder = ["chrM", "MT"]
    ref_contigs = [c.name for c in ref.file_contigs(ref_file, config)]
    with pysam.Samfile(in_bam, "rb") as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    extra_bcs = [x for x in bam_contigs if x not in ref_contigs]
    extra_rcs = [x for x in ref_contigs if x not in bam_contigs]
    problems = []
    warnings = []
    for bc, rc in itertools.zip_longest([x for x in bam_contigs if (x not in extra_bcs and
                                                                    x not in allowed_outoforder)],
                                        [x for x in ref_contigs if (x not in extra_rcs and
                                                                    x not in allowed_outoforder)]):
        if bc != rc:
            if bc and rc:
                problems.append("Reference mismatch. BAM: %s Reference: %s" % (bc, rc))
            elif bc:
                warnings.append("Extra BAM chromosomes: %s" % bc)
            elif rc:
                warnings.append("Extra reference chromosomes: %s" % rc)
    for bc in extra_bcs:
        warnings.append("Extra BAM chromosomes: %s" % bc)
    for rc in extra_rcs:
        warnings.append("Extra reference chromosomes: %s" % rc)
    if problems:
        raise ValueError("Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
                         "Setting `bam_clean: picard` in the configuration can often fix this issue."
                         % "\n".join(problems))
    if warnings:
        print("*** Potential problems in input BAM compared to reference:\n%s\n" %
              "\n".join(warnings))
Example #6
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    params = {"min_coverage_for_downsampling": 10,
              "maxcov_downsample_multiplier": dd.get_maxcov_downsample(data)}
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if num_reads and params["maxcov_downsample_multiplier"] and params["maxcov_downsample_multiplier"] > 0:
        vrs = dd.get_variant_regions_merged(data)
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
            genome_cov_pct = callable_size / float(total_size)
        else:
            callable_size = total_size
            genome_cov_pct = 1.0
        if (genome_cov_pct > coverage.GENOME_COV_THRESH
              and dd.get_coverage_interval(data) in ["genome", None, False]):
            total_counts, total_sizes = 0, 0
            for count, size in bwa.fastq_size_output(fastq_file, 5000):
                total_counts += int(count)
                total_sizes += (int(size) * int(count))
            read_size = float(total_sizes) / float(total_counts)
            avg_cov = float(num_reads * read_size) / callable_size
            if avg_cov >= params["min_coverage_for_downsampling"]:
                return int(avg_cov * params["maxcov_downsample_multiplier"])
    return None
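Stripped of the bcbio plumbing, the decision is plain arithmetic: estimate average coverage from read count, read size and callable size, then cap only when coverage is already deep. A hedged sketch with invented numbers:

def maxcov_downsample_target(num_reads, read_size, callable_size,
                             multiplier, min_coverage=10):
    # returns a max-coverage cap for downsampling, or None when not warranted
    if not multiplier or multiplier <= 0:
        return None
    avg_cov = num_reads * read_size / float(callable_size)
    if avg_cov >= min_coverage:
        return int(avg_cov * multiplier)
    return None

print(maxcov_downsample_target(8e8, 150, 3e9, 4))  # 40x average -> cap at 160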
Example #7
def _get_region_size(ref_file, data, region=None):
    """Retrieve size of a region, potentially returning None if not set.
    """
    if region:
        for contig in ref.file_contigs(ref_file, data["config"]):
            if contig.name == region:
                return contig.size
Example #8
def _collapse_transcripts(in_file, window, data, out_dir):
    """Collapse transcripts into min/max coordinates and optionally add windows.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    out_file = os.path.join(out_dir,
                            "%s-transcripts_w%s.bed" % (os.path.splitext(os.path.basename(in_file))[0],
                                                        window))
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(**locals()), "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                x = pybedtools.BedTool(prep_file)
                def gen():
                    for r in x:
                        yield r
                for name, rs in itertools.groupby(gen(), lambda r: (r.name, r.chrom)):
                    rs = list(rs)
                    r = rs[0]
                    for gcoords in _group_coords(rs):
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(max(gcoords) + window, chrom_sizes[r.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" % (r.chrom, min_pos, max_pos, r.name))
    return bedutils.sort_merge(out_file, data)
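The core of the collapse is grouping sorted records by (name, chrom) and emitting one padded min/max interval per group. A simplified, self-contained sketch over plain tuples that folds each name into a single interval (the real code additionally splits coordinate groups via _group_coords):

import itertools

def collapse(rows, window, chrom_size):
    # rows: (chrom, start, end, name) tuples, pre-sorted by name then chrom
    out = []
    for (name, chrom), rs in itertools.groupby(rows, key=lambda r: (r[3], r[0])):
        coords = [c for r in rs for c in (r[1], r[2])]
        out.append((chrom, max(min(coords) - window, 0),
                    min(max(coords) + window, chrom_size), name))
    return out

rows = [("chr1", 100, 200, "TX1"), ("chr1", 180, 400, "TX1"),
        ("chr1", 900, 950, "TX2")]
print(collapse(rows, 50, 1000))
# [('chr1', 50, 450, 'TX1'), ('chr1', 850, 1000, 'TX2')]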
Example #9
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
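Once the percentages are computed, the classification is a small decision table. A sketch of just that logic; the threshold values are assumptions mirroring the GENOME_COV_THRESH/OFFTARGET_THRESH constants referenced above (other examples in this listing use 0.40 with 0.10 or 0.01):

GENOME_COV_THRESH = 0.40   # assumed: fraction of genome callable for "genome"
OFFTARGET_THRESH = 0.10    # assumed: off-target fraction separating capture/amplicon

def classify(genome_cov_pct, have_regions, offtarget_pct):
    if genome_cov_pct > GENOME_COV_THRESH:
        return "genome"
    if not have_regions:
        return "regional"
    return "regional" if offtarget_pct > OFFTARGET_THRESH else "amplicon"

print(classify(0.65, True, 0.0))   # genome
print(classify(0.02, True, 0.25))  # regional
print(classify(0.02, True, 0.01))  # amplicon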
Example #10
def _target_chroms_and_header(bam_file, data):
    """Get a list of chromosomes to target and new updated ref_file header.

    Could potentially handle remapping from chr1 -> 1 but currently disabled due
    to speed issues.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    target_chroms = dict([(x.name, i) for i, x in enumerate(ref.file_contigs(dd.get_ref_file(data)))
                          if chromhacks.is_autosomal_or_sex(x.name)])
    out_chroms = []
    with pysam.Samfile(bam_file, "rb") as bamfile:
        for bami, bam_contig in enumerate([c["SN"] for c in bamfile.header["SQ"]]):
            if bam_contig in target_chroms:
                target_chrom = bam_contig
            elif bam_contig in special_remaps and special_remaps[bam_contig] in target_chroms:
                target_chrom = special_remaps[bam_contig]
            elif bam_contig.startswith("chr") and bam_contig.replace("chr", "") in target_chroms:
                target_chrom = bam_contig.replace("chr", "")
            elif "chr%s" % bam_contig in target_chroms:
                target_chrom = "chr%s" % bam_contig
            else:
                target_chrom = None
            # target_chrom == bam_contig ensures we don't try chr1 -> 1 style remapping
            if target_chrom and target_chrom == bam_contig:
                # Order not required if dealing with SAM file header fixing
                #assert bami == target_chroms[target_chrom], \
                #    ("remove_extracontigs: Non-matching order of standard contig: %s %s (%s vs %s)" %
                #     (bam_file, target_chrom, bami, target_chroms[target_chrom]))
                out_chroms.append(target_chrom)
    assert out_chroms, ("remove_extracontigs: Did not find any chromosomes in reference file: %s %s" %
                        (bam_file, target_chroms))
    return out_chroms
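The matching tries the literal name, the chrM/MT special case, and both chr-prefix variants in order; the final name-equality test is what disables actual renaming. The lookup as a standalone sketch:

SPECIAL_REMAPS = {"chrM": "MT", "MT": "chrM"}

def match_contig(bam_contig, target_chroms):
    # return the reference name a BAM contig corresponds to, or None
    if bam_contig in target_chroms:
        return bam_contig
    if SPECIAL_REMAPS.get(bam_contig) in target_chroms:
        return SPECIAL_REMAPS[bam_contig]
    if bam_contig.startswith("chr") and bam_contig[3:] in target_chroms:
        return bam_contig[3:]
    if "chr%s" % bam_contig in target_chroms:
        return "chr%s" % bam_contig
    return None

targets = {"1", "2", "MT"}
print(match_contig("chr1", targets))  # "1" -- name changed, so it would be skipped
print(match_contig("MT", targets))    # "MT" -- name unchanged, so it is kept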
Example #11
def _gids_to_genes(gids, ssm_locs, cnv_ssms, data):
    """Convert support ids for SNPs and SSMs into associated genes.
    """
    locs = collections.defaultdict(set)
    for gid in gids:
        cur_locs = []
        try:
            cur_locs.append(ssm_locs[gid])
        except KeyError:
            for ssm_loc in cnv_ssms.get(gid, []):
                cur_locs.append(ssm_locs[ssm_loc])
        for chrom, pos in cur_locs:
            locs[chrom].add(pos)
    genes = set([])
    with tx_tmpdir(data) as tmpdir:
        chrom_prefix = "chr" if next(ref.file_contigs(dd.get_ref_file(data))).name.startswith("chr") else ""
        loc_file = os.path.join(tmpdir, "battenberg_find_genes.bed")
        with open(loc_file, "w") as out_handle:
            for chrom in sorted(locs.keys()):
                for loc in sorted(list(locs[chrom])):
                    out_handle.write("%s%s\t%s\t%s\n" % (chrom_prefix, chrom, loc - 1, loc))
        ann_file = annotate.add_genes(loc_file, data, max_distance=10000)
        for r in pybedtools.BedTool(ann_file):
            for gene in r.name.split(","):
                if gene != ".":
                    genes.add(gene)
    return sorted(list(genes))
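Writing the collected loci as BED needs the 1-based to 0-based shift plus the optional chr prefix. That conversion in isolation:

def locs_to_bed_lines(locs, chrom_prefix=""):
    # locs: {chrom: set of 1-based positions} -> sorted single-base BED lines
    for chrom in sorted(locs):
        for pos in sorted(locs[chrom]):
            yield "%s%s\t%s\t%s" % (chrom_prefix, chrom, pos - 1, pos)

print(list(locs_to_bed_lines({"1": {100, 50}}, chrom_prefix="chr")))
# ['chr1\t49\t50', 'chr1\t99\t100']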
Example #12
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [os.path.join(out_dir, "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
                 for ext in ["roc", "ped", "bed.gz"]]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [x.name for x in ref.file_contigs(dd.get_ref_file(data)) if chromhacks.is_sex(x.name)]
            gender_args = "--sex %s" % (",".join(gender_chroms)) if gender_chroms else ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg) or
                        ("indexcov: expected" in str(msg) and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
Example #13
def add_contig_to_header_cl(ref_file, out_file):
    """Add update ##contig lines to VCF header, required for bcftools/GATK compatibility.
    """
    header_file = "%s-contig_header.txt" % utils.splitext_plus(out_file)[0]
    with open(header_file, "w") as out_handle:
        for region in ref.file_contigs(ref_file, {}):
            out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
    return ("grep -v ^##contig | bcftools annotate -h %s" % header_file)
Example #14
def split_by_region(data):
    # out_file, ref_file and config are closure variables from the enclosing scope
    base, ext = utils.splitext_plus(os.path.basename(out_file))
    args = []
    for region in [x.name for x in ref.file_contigs(ref_file, config)]:
        region_out = os.path.join(os.path.dirname(out_file), "%s-regions" % base, "%s-%s%s" % (base, region, ext))
        utils.safe_makedir(os.path.dirname(region_out))
        args.append((region_out, ref_file, config, region))
    return out_file, args
Example #15
def add_contig_to_header(line, ref_file):
    """Streaming target to add contigs to a VCF file header.
    """
    if line.startswith("##fileformat=VCF"):
        out = [line]
        for region in ref.file_contigs(ref_file):
            out.append("##contig=<ID=%s,length=%s>" % (region.name, region.size))
        return "\n".join(out)
    else:
        return line
Example #16
def _prep_priority_filter(gemini_db, data):
    """Prepare tabix indexed file with priority based filters and supporting information
    """
    from gemini import GeminiQuery

    out_file = "%s-priority.tsv" % utils.splitext_plus(gemini_db)[0]
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        ref_chroms = set([x.name for x in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        with file_transaction(data, out_file) as tx_out_file:
            gq = GeminiQuery(gemini_db)
            pops = [
                "aaf_esp_ea",
                "aaf_esp_aa",
                "aaf_esp_all",
                "aaf_1kg_amr",
                "aaf_1kg_eas",
                "aaf_1kg_sas",
                "aaf_1kg_afr",
                "aaf_1kg_eur",
                "aaf_1kg_all",
                "aaf_adj_exac_all",
                "aaf_adj_exac_afr",
                "aaf_adj_exac_amr",
                "aaf_adj_exac_eas",
                "aaf_adj_exac_fin",
                "aaf_adj_exac_nfe",
                "aaf_adj_exac_oth",
                "aaf_adj_exac_sas",
            ]
            attrs = (
                "chrom, start, end, ref, alt, impact_so, impact_severity, in_dbsnp, "
                "cosmic_ids, clinvar_sig, clinvar_origin, fitcons, gt_ref_depths, gt_alt_depths"
            ).split(", ")
            gq.run("SELECT %s FROM variants" % ", ".join(attrs + pops))
            sidx = gq.sample_to_idx[dd.get_sample_name(data)]
            header = attrs[:5] + ["filter"] + attrs[5:-2] + [x for x in pops if x.endswith("_all")] + ["freq"]
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                cheader = header[:]
                cheader[0] = "#" + cheader[0]
                writer.writerow(cheader)
                for row in gq:
                    ref_depth = tz.get_in(["gt_ref_depths", sidx], row, 0)
                    alt_depth = tz.get_in(["gt_alt_depths", sidx], row, 0)
                    out_vals = dict(row.row)
                    try:
                        out_vals["freq"] = "%.2f" % (float(alt_depth) / float(ref_depth + alt_depth))
                    except ZeroDivisionError:
                        out_vals["freq"] = "0.00"
                    out_vals["filter"] = _calc_priority_filter(row, pops)
                    if out_vals["chrom"] not in ref_chroms and _hg19_to_GRCh37(out_vals["chrom"]) in ref_chroms:
                        out_vals["chrom"] = _hg19_to_GRCh37(out_vals["chrom"])
                    out = [out_vals[x] for x in header]
                    writer.writerow(out)
    return vcfutils.bgzip_and_index(out_file, data["config"], tabix_args="-0 -c '#' -s 1 -b 2 -e 3")
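The per-row allele frequency guards against zero total depth; a tiny standalone version:

def allele_freq(ref_depth, alt_depth):
    # alternate allele frequency from depths, "0.00" when there is no coverage
    try:
        return "%.2f" % (float(alt_depth) / float(ref_depth + alt_depth))
    except ZeroDivisionError:
        return "0.00"

print(allele_freq(30, 10))  # 0.25
print(allele_freq(0, 0))    # 0.00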
Example #17
def _average_genome_coverage(data, bam_file):
    """Quickly calculate average coverage for whole genome files using indices.

    Includes all reads, with duplicates. Uses sampling of 10M reads.
    """
    total = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    read_counts = sum(x.aligned for x in bam.idxstats(bam_file, data))
    with pysam.Samfile(bam_file, "rb") as pysam_bam:
        read_size = np.median(list(itertools.islice((a.query_length for a in pysam_bam.fetch()), int(1e7))))
    avg_cov = float(read_counts * read_size) / total
    return avg_cov
Example #18
def fix_header(ref_file):
    added_ref = False
    for line in sys.stdin:
        # skip current read groups, since adding new
        # skip current contigs since adding new sequence dictionary
        if line.startswith(("@RG", "@SQ")):
            pass
        elif not added_ref and not line.startswith("@"):
            for x in ref.file_contigs(ref_file):
                sys.stdout.write("@SQ\tSN:%s\tLN:%s\n" % (x.name, x.size))
            added_ref = True
            # emit the read line that triggered the header flush
            sys.stdout.write(line)
        else:
            sys.stdout.write(line)
Example #19
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.
    """
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    ref_contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if contigs and len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError("Contigs in BED file %s not in reference genome:\n %s\n"
                         % (in_file, list(contigs - ref_contigs)) +
                         "This is typically due to chr1 versus 1 differences in BED file and reference.")
Example #20
def _make_ignore_file(work_dir, ref_file, impute_file, ignore_file):
    chroms = set([])
    with open(impute_file) as in_handle:
        for line in in_handle:
            chrom = line.split()[0]
            chroms.add(chrom)
            if not chrom.startswith("chr"):
                chroms.add("chr%s" % chrom)
    with open(ignore_file, "w") as out_handle:
        for contig in ref.file_contigs(ref_file):
            if contig.name not in chroms:
                out_handle.write("%s\n" % contig.name)
    return ignore_file
Example #21
def _check_ref_files(ref_info, data):
    problems = []
    for contig in ref.file_contigs(ref_info["fasta"]["base"], data["config"]):
        cur_problems = set([])
        for char in list(contig.name):
            if char not in ALLOWED_CONTIG_NAME_CHARS:
                cur_problems.add(char)
        if len(cur_problems) > 0:
            problems.append("Found non-allowed characters in chromosome name %s: %s" %
                            (contig.name, " ".join(list(cur_problems))))
    if len(problems) > 0:
        msg = ("\nProblems with input reference file %s\n" % ref_info["fasta"]["base"])
        raise ValueError(msg + "\n".join(problems) + "\n")
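ALLOWED_CONTIG_NAME_CHARS is defined elsewhere in bcbio; a plausible stand-in whitelist makes the validation self-contained (the exact character set here is an assumption):

import string

# assumed whitelist; the real ALLOWED_CONTIG_NAME_CHARS is defined in bcbio
ALLOWED_CONTIG_NAME_CHARS = set(string.ascii_letters + string.digits + "-_.")

def bad_chars(contig_name):
    return set(contig_name) - ALLOWED_CONTIG_NAME_CHARS

print(bad_chars("HLA-A*01:01"))  # {'*', ':'}
print(bad_chars("chr1"))         # set()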
Example #22
def subset_to_genome(in_file, out_file, data):
    """Subset a BED file to only contain contigs present in the reference genome.
    """
    if not utils.file_uptodate(out_file, in_file):
        contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
        with utils.open_gzipsafe(in_file) as in_handle:
            with file_transaction(data, out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        parts = line.split()
                        if parts and parts[0] in contigs:
                            out_handle.write(line)
    return out_file
Example #23
def _make_ignore_file(work_dir, ref_file, ignore_file, impute_file):
    """Create input files with chromosomes to ignore and gender loci.
    """
    gl_file = os.path.join(work_dir, "gender_loci.txt")
    chroms = set([])
    with open(impute_file) as in_handle:
        for line in in_handle:
            chrom = line.split()[0]
            chroms.add(chrom)
            if not chrom.startswith("chr"):
                chroms.add("chr%s" % chrom)
    with open(ignore_file, "w") as out_handle:
        for contig in ref.file_contigs(ref_file):
            if contig.name not in chroms:
                out_handle.write("%s\n" % contig.name)
    with open(gl_file, "w") as out_handle:
        for contig in ref.file_contigs(ref_file):
            if contig.name in ["Y", "chrY"]:
                # From https://github.com/cancerit/cgpBattenberg/blob/dev/perl/share/gender/GRCh37d5_Y.loci
                positions = [2934912, 4546684, 4549638, 4550107]
                for pos in positions:
                    out_handle.write("%s\t%s\n" % (contig.name, pos))
    return ignore_file, gl_file
Example #24
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            include_chroms = ",".join([c.name for c in ref.file_contigs(ref_file)
                                       if chromhacks.is_autosomal_or_x(c.name)])
            all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
            cmd = ("whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
                   "| bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "WHAM SV caller: %s" % ", ".join(dd.get_sample_name(d) for d in inputs))
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
Example #25
def _check_ref_files(ref_info, data):
    problems = []
    if not data["genome_build"]:
        problems.append("Did not find 'genome_build' for sample: %s" % dd.get_sample_name(data))
    else:
        for contig in ref.file_contigs(ref_info["fasta"]["base"], data["config"]):
            cur_problems = set([])
            for char in list(contig.name):
                if char not in ALLOWED_CONTIG_NAME_CHARS:
                    cur_problems.add(char)
            if len(cur_problems) > 0:
                problems.append("Found non-allowed characters in chromosome name %s: %s" %
                                (contig.name, " ".join(list(cur_problems))))
    if len(problems) > 0:
        msg = ("\nProblems with input reference file %s\n" % tz.get_in(["fasta", "base"], ref_info))
        raise ValueError(msg + "\n".join(problems) + "\n")
Example #26
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                    out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
            cat_cmd = "zcat" if vcf_file.endswith("vcf.gz") else "cat"
            cmd = ("{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                   "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(**locals()), "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
Example #27
def _sort_by_region(fnames, regions, ref_file, config):
    """Sort a set of regionally split files by region for ordered output.
    """
    contig_order = {}
    for i, sq in enumerate(ref.file_contigs(ref_file, config)):
        contig_order[sq.name] = i
    sitems = []
    for region, fname in zip(regions, fnames):
        if isinstance(region, (list, tuple)):
            c, s, e = region
        else:
            c = region
            s, e = 0, 0
        sitems.append(((contig_order[c], s, e), fname))
    sitems.sort()
    return [x[1] for x in sitems]
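The sort key is (contig index, start, end), with the index recording reference order. The same idea as a standalone function over plain tuples:

def sort_files_by_region(fnames, regions, contig_order):
    # order per-region files by reference contig order, then coordinates
    def key(pair):
        region, _ = pair
        if isinstance(region, (list, tuple)):
            c, s, e = region
        else:
            c, s, e = region, 0, 0
        return (contig_order[c], s, e)
    return [fname for _, fname in sorted(zip(regions, fnames), key=key)]

order = {"chr1": 0, "chr2": 1}
print(sort_files_by_region(["b.vcf", "a.vcf"],
                           [("chr2", 0, 10), ("chr1", 0, 10)], order))
# ['a.vcf', 'b.vcf']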
Example #28
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.
    """
    std_chroms = []
    prob_chroms = []
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0:
            prob_chroms.append(contig.name)
        else:
            std_chroms.append(contig.name)
    if len(prob_chroms) > 0:
        return std_chroms
    else:
        return []
Example #29
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.10  # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                if float(stats["mapped"]) > 0:
                    offtarget_pct = stats["offtarget"] / float(stats["mapped"])
                else:
                    offtarget_pct = 0.0
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info(
            "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
            % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
               offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #30
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [
        os.path.join(out_dir,
                     "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
        for ext in ["roc", "ped", "bed.gz"]
    ]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(
                os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [
                x.name for x in ref.file_contigs(dd.get_ref_file(data))
                if chromhacks.is_sex(x.name)
            ]
            gender_args = "--sex %s" % (
                ",".join(gender_chroms)) if gender_chroms else ""
            # XXX Skip gender args until we can correctly specify #1793
            gender_args = ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg) or
                        ("indexcov: expected" in str(msg)
                         and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
Example #31
def _collapse_transcripts(in_file,
                          window,
                          data,
                          out_dir,
                          include_gene_names=True):
    """Collapse transcripts into min/max coordinates and optionally add windows.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    out_file = os.path.join(
        out_dir, "%s-transcripts_w%s.bed" %
        (os.path.splitext(os.path.basename(in_file))[0], window))
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(**locals()), "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                x = pybedtools.BedTool(prep_file)

                def gen():
                    for r in x:
                        yield r

                for name, rs in itertools.groupby(gen(), lambda r:
                                                  (r.name, r.chrom)):
                    rs = list(rs)
                    r = rs[0]
                    for gcoords in _group_coords(rs):
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(
                            max(gcoords) + window, chrom_sizes[r.chrom])
                        if include_gene_names:
                            out_handle.write(
                                "%s\t%s\t%s\t%s\n" %
                                (r.chrom, min_pos, max_pos, r.name))
                        else:
                            out_handle.write("%s\t%s\t%s\n" %
                                             (r.chrom, min_pos, max_pos))
    return bedutils.sort_merge(out_file, data)
Example #32
def to_standardonly(in_file, ref_file, data):
    """Subset a VCF input file to standard chromosomes (1-22,X,Y,MT).
    """
    from bcbio.heterogeneity import chromhacks
    out_file = "%s-stdchrs.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file):
        stds = []
        for c in ref.file_contigs(ref_file):
            if chromhacks.is_nonalt(c.name):
                stds.append(c.name)
        if stds:
            with file_transaction(data, out_file) as tx_out_file:
                stds = ",".join(stds)
                in_file = bgzip_and_index(in_file, data["config"])
                cmd = "bcftools view -o {tx_out_file} -O z {in_file} {stds}"
                do.run(cmd.format(**locals()), "Subset to standard chromosomes")
    return bgzip_and_index(out_file, data["config"]) if utils.file_exists(out_file) else in_file
Example #34
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions, variant regions or chromosomes
    """
    callable_files = data.get("callable_regions") or data.get("variant_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop)) for r in pybedtools.BedTool(callable_files[0])]
    else:
        work_bam = list(tz.take(1, filter(lambda x: x.endswith(".bam"), data["work_bams"])))
        if work_bam:
            with contextlib.closing(pysam.Samfile(work_bam[0], "rb")) as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length) in zip(pysam_bam.references,
                                                                         pysam_bam.lengths)]
        else:
            regions = [(r.name, 0, r.size) for r in
                       ref.file_contigs(dd.get_ref_file(data), data["config"])]
    return regions
Example #35
def _sort_by_region(fnames, regions, ref_file, config):
    """Sort a set of regionally split files by region for ordered output.
    """
    contig_order = {}
    for i, sq in enumerate(ref.file_contigs(ref_file, config)):
        contig_order[sq.name] = i
    sitems = []
    assert len(regions) == len(fnames), (regions, fnames)
    for region, fname in zip(regions, fnames):
        if isinstance(region, (list, tuple)):
            c, s, e = region
        else:
            c = region
            s, e = 0, 0
        sitems.append(((contig_order[c], s, e), fname))
    sitems.sort()
    return [x[1] for x in sitems]
Example #37
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.
    """
    if not dd.get_ref_file(data):
        return
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    ref_contigs = set(
        [x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if contigs and len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError(
            "Contigs in BED file %s not in reference genome:\n %s\n" %
            (in_file, list(contigs - ref_contigs)) +
            "This is typically due to chr1 versus 1 differences in BED file and reference."
        )
Example #38
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions or chromosomes.
    """
    import pybedtools
    callable_files = data.get("callable_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop)) for r in pybedtools.BedTool(callable_files[0])]
    else:
        work_bam = list(tz.take(1, filter(lambda x: x and x.endswith(".bam"), data["work_bams"])))
        if work_bam:
            with pysam.Samfile(work_bam[0], "rb") as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length) in zip(pysam_bam.references,
                                                                         pysam_bam.lengths)]
        else:
            regions = [(r.name, 0, r.size) for r in
                       ref.file_contigs(dd.get_ref_file(data), data["config"])]
    return regions
Example #39
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.
    """
    std_chroms = []
    prob_chroms = []
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(contig.name)):
            prob_chroms.append(contig.name)
        else:
            std_chroms.append(contig.name)
    if len(prob_chroms) > 0:
        return std_chroms
    else:
        return []
Example #40
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
                                                      os.path.dirname(tx_out_file)))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                try:
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
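The exclude list combines smoove-style patterns with remaining non-standard contigs; the pattern test reduces to prefix/suffix matching once the '~' and '^' markers are stripped. In isolation:

STD_EXCLUDES = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]

def is_std_exclude(name, excludes=STD_EXCLUDES):
    # prefix/suffix match against patterns with '~'/'^' markers removed
    clean = [x.replace("~", "").replace("^", "") for x in excludes]
    return any(name.startswith(c) or name.endswith(c) for c in clean)

print(is_std_exclude("chrUn_KI270302v1"))         # True
print(is_std_exclude("chr11_KI270721v1_random"))  # True
print(is_std_exclude("chr1"))                     # False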
Example #41
def _run_svtyper(in_file, full_bam, sr_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable),
                                       "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(
                    tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data),
                                                   data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" %
                                         (region.name, region.size))
                if _older_svtyper_version(svtyper):
                    svtyper_extra_opts = "-M -S {sr_bam}"
                else:
                    svtyper_extra_opts = ""
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} -B {full_bam} " +
                       svtyper_extra_opts + " | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Example #42
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification-based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.01  # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(
                data,
                dd.get_align_bam(data) or dd.get_work_bam(data), vrs
                or callable_file, "variant_regions")
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info(
            "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
            % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
               offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
Example #43
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                for region in ref.file_contigs(dd.get_ref_file(data),
                                               data["config"]):
                    out_handle.write("##contig=<ID=%s,length=%s>\n" %
                                     (region.name, region.size))
            cat_cmd = "zcat" if vcf_file.endswith("vcf.gz") else "cat"
            cmd = (
                "{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(**locals()), "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
Example #44
def _check_ref_files(ref_info, data):
    problems = []
    if not data["genome_build"]:
        problems.append("Did not find 'genome_build' for sample: %s" % dd.get_sample_name(data))
    elif not tz.get_in(["fasta", "base"], ref_info):
        problems.append("Did not find fasta reference file for genome %s.\n" % (data["genome_build"]) +
                        "Check tool-data/*.loc files to ensure paths to reference data are correct.")
    else:
        for contig in ref.file_contigs(ref_info["fasta"]["base"], data["config"]):
            cur_problems = set([])
            for char in list(contig.name):
                if char not in ALLOWED_CONTIG_NAME_CHARS:
                    cur_problems.add(char)
            if len(cur_problems) > 0:
                problems.append("Found non-allowed characters in chromosome name %s: %s" %
                                (contig.name, " ".join(list(cur_problems))))
    if len(problems) > 0:
        msg = ("\nProblems with input reference file %s\n" % tz.get_in(["fasta", "base"], ref_info))
        raise ValueError(msg + "\n".join(problems) + "\n")
Example #45
def _get_alt_chroms(data):
    """Retrieve alternative contigs as defined in bwa *.alts files.

    If no alt files are present (when we're not aligning with bwa), work around
    with standard set of alts based on hg38 -- anything with HLA, _alt or
    _decoy in the name.
    """
    alts = []
    alt_files = [f for f in tz.get_in(["reference", "bwa", "indexes"], data, []) if f.endswith("alt")]
    if alt_files:
        for alt_file in alt_files:
            with open(alt_file) as in_handle:
                for line in in_handle:
                    if not line.startswith("@"):
                        alts.append(line.split()[0].strip())
    else:
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if ("_alt" in contig.name or "_decoy" in contig.name or
                  contig.name.startswith("HLA-") or ":" in contig.name):
                alts.append(contig.name)
    return alts
Example #46
def _check_bam_contigs(in_bam, ref_file, config):
    """Ensure a pre-aligned BAM file matches the expected reference genome.
    """
    # GATK allows chromosome M to be in multiple locations, skip checking it
    allowed_outoforder = ["chrM", "MT"]
    ref_contigs = [c.name for c in ref.file_contigs(ref_file, config)]
    with pysam.Samfile(in_bam, "rb") as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    extra_bcs = [x for x in bam_contigs if x not in ref_contigs]
    extra_rcs = [x for x in ref_contigs if x not in bam_contigs]
    problems = []
    warnings = []
    for bc, rc in zip_longest([
            x for x in bam_contigs
            if (x not in extra_bcs and x not in allowed_outoforder)
    ], [
            x for x in ref_contigs
            if (x not in extra_rcs and x not in allowed_outoforder)
    ]):
        if bc != rc:
            if bc and rc:
                problems.append("Reference mismatch. BAM: %s Reference: %s" %
                                (bc, rc))
            elif bc:
                warnings.append("Extra BAM chromosomes: %s" % bc)
            elif rc:
                warnings.append("Extra reference chromosomes: %s" % rc)
    for bc in extra_bcs:
        warnings.append("Extra BAM chromosomes: %s" % bc)
    for rc in extra_rcs:
        warnings.append("Extra reference chromosomes: %s" % rc)
    if problems:
        raise ValueError(
            "Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
            "Setting `bam_clean: remove_extracontigs` in the configuration can often fix this issue."
            % "\n".join(problems))
    if warnings:
        print(
            "*** Potential problems in input BAM compared to reference:\n%s\n"
            % "\n".join(warnings))
Example #47
def _target_chroms_and_header(bam_file, data):
    """Get a list of chromosomes to target and new updated ref_file header.

    Could potentially handle remapping from chr1 -> 1 but currently disabled due
    to speed issues.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    target_chroms = dict([
        (x.name, i)
        for i, x in enumerate(ref.file_contigs(dd.get_ref_file(data)))
        if chromhacks.is_autosomal_or_sex(x.name)
    ])
    out_chroms = []
    with pysam.Samfile(bam_file, "rb") as bamfile:
        for bami, bam_contig in enumerate(
            [c["SN"] for c in bamfile.header["SQ"]]):
            if bam_contig in target_chroms:
                target_chrom = bam_contig
            elif bam_contig in special_remaps and special_remaps[
                    bam_contig] in target_chroms:
                target_chrom = special_remaps[bam_contig]
            elif bam_contig.startswith("chr") and bam_contig.replace(
                    "chr", "") in target_chroms:
                target_chrom = bam_contig.replace("chr", "")
            elif "chr%s" % bam_contig in target_chroms:
                target_chrom = "chr%s" % bam_contig
            else:
                target_chrom = None
            # target_chrom == bam_contig ensures we don't try chr1 -> 1 style remapping
            if target_chrom and target_chrom == bam_contig:
                # Order not required if dealing with SAM file header fixing
                #assert bami == target_chroms[target_chrom], \
                #    ("remove_extracontigs: Non-matching order of standard contig: %s %s (%s vs %s)" %
                #     (bam_file, target_chrom, bami, target_chroms[target_chrom]))
                out_chroms.append(target_chrom)
    assert out_chroms, (
        "remove_extracontigs: Did not find any chromosomes in reference file: %s %s"
        % (bam_file, target_chroms))
    return out_chroms
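The fallback chain above tries an exact name match, the chrM/MT special case, and adding or stripping a chr prefix, in that order. A condensed, self-contained restatement of that logic (illustrative target set, not the bcbio code):

special_remaps = {"chrM": "MT", "MT": "chrM"}
target_chroms = {"1": 0, "MT": 1}

def match_target(bam_contig):
    """Return the target name a BAM contig maps to, or None."""
    if bam_contig in target_chroms:
        return bam_contig
    if special_remaps.get(bam_contig) in target_chroms:
        return special_remaps[bam_contig]
    if bam_contig.startswith("chr") and bam_contig[3:] in target_chroms:
        return bam_contig[3:]
    if "chr%s" % bam_contig in target_chroms:
        return "chr%s" % bam_contig
    return None

print(match_target("1"))      # -> 1 (exact match, kept by the == check above)
print(match_target("chrM"))   # -> MT (a remap, so the == check skips it)
print(match_target("chrUn"))  # -> None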
Example #48
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve
    speed.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = set([c.name for c in ref.file_contigs(dd.get_ref_file(data))])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
                            "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs:
                            if chromhacks.is_nonalt(r.chrom):
                                out_handle.write(str(r))
    data = dd.set_variant_regions(data, out_file)
    return data
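Stripped of pybedtools and the file transaction machinery, the cleaning step is a per-line filter on the BED chromosome column. A toy sketch with hypothetical contig names (the is_nonalt check is omitted):

contigs = {"chr1", "chr2"}
bed_lines = ["chr1\t100\t200\tgeneA", "chrUn_KI270302v1\t0\t50\tgeneB"]
kept = [line for line in bed_lines if line.split("\t")[0] in contigs]
print(kept)  # -> ['chr1\t100\t200\tgeneA']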
Example #49
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]),
                            "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            cores = dd.get_cores(inputs[0])
            ref_file = dd.get_ref_file(inputs[0])
            include_chroms = ",".join([
                c.name for c in ref.file_contigs(ref_file)
                if chromhacks.is_autosomal_or_x(c.name)
            ])
            all_bams = ",".join([x["align_bam"]
                                 for x in inputs] + background_bams)
            cmd = (
                "whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
                "| bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()),
                   "WHAM SV caller: %s" % ", ".join(dd.get_sample_name(d) for d in inputs))
    return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
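For reference, this is roughly the command line the template above expands to, with made-up file names and counts; the real values come from the pipeline configuration:

cmd = ("whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
       "| bgzip -c > {tx_out_file}")
print(cmd.format(cores=8, ref_file="hg38.fa", all_bams="tumor.bam,normal.bam",
                 include_chroms="chr1,chr2,chrX",
                 tx_out_file="sample-wham.vcf.gz"))
# -> whamg -x 8 -a hg38.fa -f tumor.bam,normal.bam -c chr1,chr2,chrX | bgzip -c > sample-wham.vcf.gz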
Example #50
def _check_prealigned_bam(in_bam, ref_file, config):
    """Ensure a pre-aligned BAM file matches the expected reference genome.
    """
    ref_contigs = [c.name for c in ref.file_contigs(ref_file, config)]
    with contextlib.closing(pysam.Samfile(in_bam, "rb")) as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    problems = []
    warnings = []
    for bc, rc in itertools.zip_longest(bam_contigs, ref_contigs):
        if bc != rc:
            if bc and rc:
                problems.append("Reference mismatch. BAM: %s Reference: %s" % (bc, rc))
            elif bc:
                problems.append("Extra BAM chromosomes: %s" % bc)
            elif rc:
                warnings.append("Extra reference chromosomes: %s" % rc)
    if problems:
        raise ValueError("Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
                         % "\n".join(problems))
    if warnings:
        print("*** Potential problems in input BAM compared to reference:\n%s\n" %
              "\n".join(warnings))
Example #51
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    params = {
        "min_coverage_for_downsampling": 10,
        "maxcov_downsample_multiplier": dd.get_maxcov_downsample(data)
    }
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if (num_reads and params["maxcov_downsample_multiplier"]
            and params["maxcov_downsample_multiplier"] > 0):
        vrs = dd.get_variant_regions_merged(data)
        total_size = sum([
            c.size
            for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
        ])
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
            genome_cov_pct = callable_size / float(total_size)
        else:
            callable_size = total_size
            genome_cov_pct = 1.0
        if (genome_cov_pct > coverage.GENOME_COV_THRESH
                and dd.get_coverage_interval(data) in ["genome", None, False]):
            total_counts, total_sizes = 0, 0
            for count, size in bwa.fastq_size_output(fastq_file, 5000):
                total_counts += int(count)
                total_sizes += (int(size) * int(count))
            read_size = float(total_sizes) / float(total_counts)
            avg_cov = float(num_reads * read_size) / callable_size
            if avg_cov >= params["min_coverage_for_downsampling"]:
                return int(avg_cov * params["maxcov_downsample_multiplier"])
    return None
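The arithmetic reduces to estimated coverage = reads × read length / callable bases, with the returned cap being that coverage times the configured multiplier. A worked example with illustrative numbers:

num_reads = 600000000          # total reads (e.g. from grabix counts)
read_size = 150.0              # average read length in bases
callable_size = 3000000000     # callable genome size in bases
avg_cov = float(num_reads) * read_size / callable_size
print(avg_cov)                    # -> 30.0
multiplier = 10                   # maxcov_downsample_multiplier
print(int(avg_cov * multiplier))  # -> 300, the per-position coverage cap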
Example #52
def _sort_by_region(fnames, regions, ref_file, config):
    """Sort a set of regionally split files by region for ordered output.
    """
    contig_order = {}
    for i, sq in enumerate(ref.file_contigs(ref_file, config)):
        contig_order[sq.name] = i
    sitems = []
    assert len(regions) == len(fnames), (regions, fnames)
    added_fnames = set([])
    for region, fname in zip(regions, fnames):
        if fname not in added_fnames:
            if isinstance(region, (list, tuple)):
                c, s, e = region
            elif isinstance(region, six.string_types) and region.find(":") >= 0:
                c, coords = region.split(":")
                s, e = [int(x) for x in coords.split("-")]
            else:
                c = region
                s, e = 0, 0
            sitems.append(((contig_order[c], s, e), c, fname))
            added_fnames.add(fname)
    sitems.sort()
    return [(x[1], x[2]) for x in sitems]
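The sort key is a (contig rank, start, end) tuple, so files come out ordered by reference position once contigs are ranked. A tiny standalone illustration:

contig_order = {"chr1": 0, "chr2": 1}
sitems = [((contig_order["chr2"], 0, 500), "chr2", "b.vcf"),
          ((contig_order["chr1"], 100, 200), "chr1", "a.vcf")]
sitems.sort()
print([(c, f) for _, c, f in sitems])
# -> [('chr1', 'a.vcf'), ('chr2', 'b.vcf')]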
Example #53
def check_bed_coords(in_file, data):
    """Ensure BED file coordinates match reference genome.

    Catches errors like using a hg38 BED file for an hg19 genome run.
    """
    if dd.get_ref_file(data):
        contig_sizes = {}
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            contig_sizes[contig.name] = contig.size
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(("#", "track", "browser")) and line.strip():
                    parts = line.split()
                    if len(parts) >= 3:  # need at least chrom, start and end columns
                        try:
                            end = int(parts[2])
                        except ValueError:
                            continue
                        contig = parts[0]
                        check_size = contig_sizes.get(contig)
                        if check_size and end > check_size:
                            raise ValueError("Found BED coordinate off the end of the chromosome:\n%s%s\n"
                                             "Is the input BED from the right genome build?" %
                                             (line, in_file))
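The check itself is one comparison of a BED end coordinate against the contig length. A self-contained sketch (the chr1 size matches GRCh38, but any numbers work):

contig_sizes = {"chr1": 248956422}
line = "chr1\t248956000\t249000000\tregionX"
chrom, _, end = line.split("\t")[:3]
if int(end) > contig_sizes.get(chrom, float("inf")):
    print("BED coordinate off the end of %s" % chrom)
# -> BED coordinate off the end of chr1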
Example #54
def _too_many_contigs(ref_file):
    """Check for more contigs than the maximum samblaster deduplication supports.
    """
    max_contigs = 32768
    return len(list(ref.file_contigs(ref_file))) >= max_contigs
Example #55
def _merge_target_information(samples, metrics_dir):
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(
        dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum(c.size for c in ref.file_contigs(dd.get_ref_file(data),
                                                         data["config"])),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in
                        pybedtools.BedTool(dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed,
                                                data,
                                                prefix="cov-",
                                                simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"]
                             for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
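For orientation, the resulting target_info.yaml has roughly this shape; the keys mirror the code above, the values are made up:

import yaml

info = {"genome_info": {"name": "hg38", "size": 3099922541},
        "variants_regions_info": {"bed": "panel-orig.bed", "size": 1200000,
                                  "regions": 800, "genes": 150},
        "coverage_interval": "regional"}
print(yaml.safe_dump(info))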
Example #56
def get_mitochondrial_chroms(data):
    ref_file = dd.get_ref_file(data)
    mito = [
        c.name for c in ref.file_contigs(ref_file) if is_mitochondrial(c.name)
    ]
    return mito
Example #57
def get_hla_chroms(ref_file):
    hla = [c.name for c in ref.file_contigs(ref_file) if is_hla(c.name)]
    return hla
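Both helpers are simple name-predicate filters over the reference contigs. A self-contained sketch with simplified stand-ins for is_mitochondrial and is_hla (the real bcbio predicates cover more naming variants):

def is_mitochondrial(name):  # simplified stand-in
    return name.lower() in ("chrm", "mt", "m")

def is_hla(name):  # simplified stand-in
    return name.startswith("HLA-")

names = ["chr1", "chrM", "HLA-A*01:01:01:01"]
print([n for n in names if is_mitochondrial(n)])  # -> ['chrM']
print([n for n in names if is_hla(n)])            # -> ['HLA-A*01:01:01:01']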
Example #58
def _merge_target_information(samples):
    out_file = os.path.join("metrics", "target_info.yaml")
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    variant_regions = set(dd.get_variant_regions(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name":
            dd.get_genome_build(data),
            "size":
            sum([
                c.size for c in ref.file_contigs(dd.get_ref_file(data),
                                                 data["config"])
            ]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr = None
    if len(variant_regions) == 1:
        vcr = dd.get_variant_regions_orig(data)
        vcr_merged = dd.get_variant_regions_merged(data)
        vcr_ann = annotate.add_genes(vcr, data)
        info["variants_regions_info"] = {
            "bed":
            variant_regions,
            "size":
            sum(len(x) for x in pybedtools.BedTool(vcr_merged)),
            "regions":
            pybedtools.BedTool(vcr).count(),
            "genes":
            len(
                list(
                    set(r.name for r in pybedtools.BedTool(vcr_ann)
                        if r.name and r.name != "."))),
        }
    elif len(variant_regions) == 0:
        info["variants_regions_info"] = {"bed": None}

    # Reporting in MultiQC only if the target is the same across all samples
    if len(coverage_beds) == 1:
        bed = dd.get_coverage(data)
        if vcr and vcr == bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif bed:
            ann_bed = annotate.add_genes(bed, data)
            info["coverage_bed_info"] = {
                "bed":
                bed,
                "size":
                pybedtools.BedTool(bed).total_coverage(),
                "regions":
                pybedtools.BedTool(bed).count(),
                "genes":
                len(
                    list(
                        set(r.name for r in pybedtools.BedTool(ann_bed)
                            if r.name and r.name != "."))),
            }

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
Example #59
def autosomal_or_x_coords(ref_file):
    out = []
    for contig in ref.file_contigs(ref_file):
        if is_autosomal_or_x(contig.name):
            out.append((contig.name, 0, contig.size))
    return out
Example #60
def has_build37_contigs(data):
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.startswith("GL") or contig.name.find("_gl") >= 0:
            if contig.name in naming.GMAP["hg19"] or contig.name in naming.GMAP["GRCh37"]:
                return True
    return False
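The build37 test keys off GL-style unplaced contig names, then confirms them against the hg19/GRCh37 naming maps. A simplified, self-contained version of the name screen (the naming.GMAP confirmation is bcbio-specific and omitted):

def looks_like_build37_contig(name):
    """First-pass screen for GRCh37/hg19 style unplaced contig names."""
    return name.startswith("GL") or "_gl" in name

candidates = ["GL000191.1", "chr1_gl000191_random", "chr1", "chrUn_KI270302v1"]
print([c for c in candidates if looks_like_build37_contig(c)])
# -> ['GL000191.1', 'chr1_gl000191_random']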