def _get_larger_chroms(ref_file):
    """Retrieve larger chromosomes, avoiding the smaller ones for plotting.

    Contig sizes are split into two k-means clusters; the run of sizes that
    shares a cluster with the smallest contig is split once more. Names of
    contigs larger than the biggest size in that final small group are
    returned.
    """
    from scipy.cluster.vq import kmeans, vq

    def _smallest_cluster(sorted_sizes):
        """Return the leading run of sizes assigned to the first size's cluster."""
        size_array = np.array(sorted_sizes)
        centroids, _ = kmeans(size_array, 2)
        labels, _ = vq(np.array(sorted_sizes), centroids)
        leading_run = tz.first(tz.partitionby(lambda pair: pair[0],
                                              zip(labels, sorted_sizes)))
        return [pair[1] for pair in leading_run]

    contig_sizes = sorted(float(contig.size) for contig in ref.file_contigs(ref_file))
    # separate out smaller chromosomes and haplotypes with kmeans
    small_sizes = _smallest_cluster(contig_sizes)
    # create one more cluster with the smaller, removing the haplotypes
    smallest_sizes = _smallest_cluster(small_sizes)
    # get any chromosomes not in haplotype/random bin
    cutoff = max(smallest_sizes)
    return [contig.name for contig in ref.file_contigs(ref_file)
            if contig.size > cutoff]
def get_noalt_contigs(data):
    """Retrieve contigs without alternatives as defined in bwa *.alts files.

    If no alt files present (when we're not aligning with bwa), work around
    with standard set of alts based on hg38 -- anything with HLA, _alt or
    _decoy in the name. Names containing ':' are also treated as alts in
    that fallback.
    """
    def _looks_like_alt(name):
        return ("_alt" in name or "_decoy" in name or
                name.startswith("HLA-") or ":" in name)

    bwa_indexes = tz.get_in(["reference", "bwa", "indexes"], data, [])
    alt_files = [fname for fname in bwa_indexes if fname.endswith("alt")]
    alt_names = set()
    if alt_files:
        for alt_file in alt_files:
            with open(alt_file) as in_handle:
                # first column of every non-header line names an alt contig
                alt_names.update(line.split()[0].strip() for line in in_handle
                                 if not line.startswith("@"))
    else:
        alt_names.update(contig.name
                         for contig in ref.file_contigs(dd.get_ref_file(data))
                         if _looks_like_alt(contig.name))
    return [contig for contig in ref.file_contigs(dd.get_ref_file(data))
            if contig.name not in alt_names]
def _average_genome_coverage(data, bam_file):
    """Estimate average genome coverage for a BAM file.

    Coverage is computed as (mapped read count * median read length) divided
    by the summed size of all reference contigs. The mapped read count comes
    from sambamba with ``keep_dups=False``; the median read length is taken
    from the first 100,000 records returned by ``fetch()``.
    """
    total = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    read_counts = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
    with pysam.Samfile(bam_file, "rb") as pysam_bam:
        read_lengths = (a.query_length for a in pysam_bam.fetch())
        # islice needs an integer stop value; a float such as 1e5 is rejected
        read_size = np.median(list(itertools.islice(read_lengths, int(1e5))))
    avg_cov = float(read_counts * read_size) / total
    return avg_cov
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    Returns the result of ``vcfutils.sort_by_ref`` on the genotyped file.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(in_file):
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                have_excludes = exclude_file and utils.file_exists(exclude_file)
                regions_to_rm = ("-T ^%s" % (exclude_file)) if have_excludes else ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for header_line in in_handle:
                            if not header_line.startswith("#"):
                                break
                            if header_line.startswith("##FILTER"):
                                out_handle.write(header_line)
                    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (contig.name, contig.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm,
                                  python=python, svtyper=svtyper, full_bam=full_bam,
                                  header_file=header_file, tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
            else:
                # nothing to genotype: pass the input through unchanged
                shutil.copy(in_file, out_file)
    return vcfutils.sort_by_ref(out_file, data)
def _check_bam_contigs(in_bam, ref_file, config):
    """Ensure a pre-aligned BAM file matches the expected reference genome.

    Contigs present in both the BAM header and the reference (ignoring chrM
    and MT) are compared pairwise in order. Any pairwise difference raises
    ValueError; contigs found on only one side are printed as warnings.
    """
    # izip_longest (Python 2) was renamed zip_longest in Python 3; support both
    zip_longest = getattr(itertools, "zip_longest", None) or itertools.izip_longest
    # GATK allows chromosome M to be in multiple locations, skip checking it
    allowed_outoforder = set(["chrM", "MT"])
    ref_contigs = [c.name for c in ref.file_contigs(ref_file, config)]
    with pysam.Samfile(in_bam, "rb") as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    # sets give constant-time membership checks; the lists preserve order
    ref_names = set(ref_contigs)
    bam_names = set(bam_contigs)
    extra_bcs = [x for x in bam_contigs if x not in ref_names]
    extra_rcs = [x for x in ref_contigs if x not in bam_names]
    shared_bcs = [x for x in bam_contigs
                  if x in ref_names and x not in allowed_outoforder]
    shared_rcs = [x for x in ref_contigs
                  if x in bam_names and x not in allowed_outoforder]
    problems = []
    warnings = []
    for bc, rc in zip_longest(shared_bcs, shared_rcs):
        if bc != rc:
            if bc and rc:
                problems.append("Reference mismatch. BAM: %s Reference: %s" % (bc, rc))
            elif bc:
                warnings.append("Extra BAM chromosomes: %s" % bc)
            elif rc:
                warnings.append("Extra reference chromosomes: %s" % rc)
    for bc in extra_bcs:
        warnings.append("Extra BAM chromosomes: %s" % bc)
    for rc in extra_rcs:
        warnings.append("Extra reference chromosomes: %s" % rc)
    if problems:
        raise ValueError("Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
                         "Setting `bam_clean: picard` in the configuration can often fix this issue."
                         % "\n".join(problems))
    if warnings:
        print("*** Potential problems in input BAM compared to reference:\n%s\n" % "\n".join(warnings))
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    min_coverage_for_downsampling = 10
    multiplier = dd.get_maxcov_downsample(data)
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if not (num_reads and multiplier and multiplier > 0):
        return None
    vrs = dd.get_variant_regions_merged(data)
    total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    if vrs:
        callable_size = pybedtools.BedTool(vrs).total_coverage()
        genome_cov_pct = callable_size / float(total_size)
    else:
        callable_size, genome_cov_pct = total_size, 1.0
    # only whole-genome style inputs are considered for downsampling
    if not (genome_cov_pct > coverage.GENOME_COV_THRESH and
            dd.get_coverage_interval(data) in ["genome", None, False]):
        return None
    # estimate the mean read length from the size distribution of the fastq
    total_counts, total_sizes = 0, 0
    for count, size in bwa.fastq_size_output(fastq_file, 5000):
        total_counts += int(count)
        total_sizes += (int(size) * int(count))
    read_size = float(total_sizes) / float(total_counts)
    avg_cov = float(num_reads * read_size) / callable_size
    if avg_cov >= min_coverage_for_downsampling:
        return int(avg_cov * multiplier)
    return None
def _get_region_size(ref_file, data, region=None): """Retrieve size of a region, potentially returning None if not set. """ if region: for contig in ref.file_contigs(ref_file, data["config"]): if contig.name == region: return contig.size
def _collapse_transcripts(in_file, window, data, out_dir):
    """Collapse transcripts into min/max coordinates and optionally add windows.

    Output coordinates are padded by `window` on each side and clamped to
    [0, contig size]. Returns the result of ``bedutils.sort_merge``.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    in_base = os.path.splitext(os.path.basename(in_file))[0]
    out_file = os.path.join(out_dir, "%s-transcripts_w%s.bed" % (in_base, window))
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(sort_cmd=sort_cmd, in_file=in_file, prep_file=prep_file),
                   "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                sorted_bed = pybedtools.BedTool(prep_file)

                def _records():
                    for rec in sorted_bed:
                        yield rec

                for _, group in itertools.groupby(_records(), lambda rec: (rec.name, rec.chrom)):
                    members = list(group)
                    first = members[0]
                    for gcoords in _group_coords(members):
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(max(gcoords) + window, chrom_sizes[first.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" % (first.chrom, min_pos, max_pos, first.name))
    return bedutils.sort_merge(out_file, data)
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplication based regional coverage without off-target reads

    Nothing is changed when a coverage interval is already set on `data`.
    """
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions_merged(data)
    callable_file = dd.get_sample_callable(data)
    callable_size = pybedtools.BedTool(vrs if vrs else callable_file).total_coverage()
    total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    genome_cov_pct = callable_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > GENOME_COV_THRESH:
        cov_interval = "genome"
    elif not vrs:
        cov_interval = "regional"
    else:
        offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                         vrs or callable_file, "variant_regions")
        cov_interval = "regional" if offtarget_pct > OFFTARGET_THRESH else "amplicon"
    logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def _target_chroms_and_header(bam_file, data):
    """Get a list of chromosomes to target and new updated ref_file header.

    Could potentially handle remapping from chr1 -> 1 but currently disabled due to speed issues.

    Only BAM contigs whose name is itself an autosomal/sex contig of the
    reference are returned, in BAM header order.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    target_chroms = {}
    for ref_index, ref_contig in enumerate(ref.file_contigs(dd.get_ref_file(data))):
        if chromhacks.is_autosomal_or_sex(ref_contig.name):
            target_chroms[ref_contig.name] = ref_index

    def _resolve(bam_contig):
        """Map a BAM contig name to a reference name, or None if no candidate."""
        if bam_contig in target_chroms:
            return bam_contig
        if bam_contig in special_remaps and special_remaps[bam_contig] in target_chroms:
            return special_remaps[bam_contig]
        if bam_contig.startswith("chr") and bam_contig.replace("chr", "") in target_chroms:
            return bam_contig.replace("chr", "")
        if "chr%s" % bam_contig in target_chroms:
            return "chr%s" % bam_contig
        return None

    out_chroms = []
    with pysam.Samfile(bam_file, "rb") as bamfile:
        bam_contigs = [sq["SN"] for sq in bamfile.header["SQ"]]
        for bam_contig in bam_contigs:
            target_chrom = _resolve(bam_contig)
            # target_chrom == bam_contig ensures we don't try chr1 -> 1 style remapping
            # Order not required if dealing with SAM file header fixing
            if target_chrom and target_chrom == bam_contig:
                out_chroms.append(target_chrom)
    assert out_chroms, ("remove_extracontigs: Did not find any chromosomes in reference file: %s %s" %
                        (bam_file, target_chroms))
    return out_chroms
def _gids_to_genes(gids, ssm_locs, cnv_ssms, data):
    """Convert support ids for SNPs and SSMs into associated genes.

    Returns a sorted list of gene names found by annotating the positions.
    """
    positions_by_chrom = collections.defaultdict(set)
    for gid in gids:
        try:
            gid_locs = [ssm_locs[gid]]
        except KeyError:
            # not a direct SSM id: expand through the SSMs linked to this CNV id
            gid_locs = [ssm_locs[ssm_id] for ssm_id in cnv_ssms.get(gid, [])]
        for chrom, pos in gid_locs:
            positions_by_chrom[chrom].add(pos)
    genes = set()
    with tx_tmpdir(data) as tmpdir:
        first_contig = next(ref.file_contigs(dd.get_ref_file(data)))
        chrom_prefix = "chr" if first_contig.name.startswith("chr") else ""
        loc_file = os.path.join(tmpdir, "battenberg_find_genes.bed")
        with open(loc_file, "w") as out_handle:
            for chrom in sorted(positions_by_chrom.keys()):
                for loc in sorted(positions_by_chrom[chrom]):
                    out_handle.write("%s%s\t%s\t%s\n" % (chrom_prefix, chrom, loc - 1, loc))
        ann_file = annotate.add_genes(loc_file, data, max_distance=10000)
        for rec in pybedtools.BedTool(ann_file):
            # "." marks a position without an annotated gene
            genes.update(gene for gene in rec.name.split(",") if gene != ".")
    return sorted(genes)
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.

    Returns the output files that exist after the run.
    """
    if dd.get_coverage_interval(data) != "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    sample_name = dd.get_sample_name(data)
    out_files = []
    for ext in ["roc", "ped", "bed.gz"]:
        out_files.append(os.path.join(out_dir, "%s-indexcov.%s" % (sample_name, ext)))
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [contig.name for contig in ref.file_contigs(dd.get_ref_file(data))
                             if chromhacks.is_sex(contig.name)]
            if gender_chroms:
                gender_args = "--sex %s" % (",".join(gender_chroms))
            else:
                gender_args = ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(tmp_dir=tmp_dir, gender_args=gender_args, bam_file=bam_file),
                       "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                err_text = str(msg)
                no_usable = "indexcov: no usable" in err_text
                sex_mismatch = ("indexcov: expected" in err_text and
                                "sex chromosomes, found:" in err_text)
                # these two failure messages are tolerated; anything else is re-raised
                if not (no_usable or sex_mismatch):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [fname for fname in out_files if utils.file_exists(fname)]
def add_contig_to_header_cl(ref_file, out_file):
    """Add update ##contig lines to VCF header, required for bcftools/GATK compatibility.

    Writes a header file next to `out_file` and returns a shell command
    fragment that strips existing ##contig lines and annotates with it.
    """
    out_base = utils.splitext_plus(out_file)[0]
    header_file = "%s-contig_header.txt" % out_base
    with open(header_file, "w") as out_handle:
        out_handle.writelines("##contig=<ID=%s,length=%s>\n" % (contig.name, contig.size)
                              for contig in ref.file_contigs(ref_file, {}))
    return ("grep -v ^##contig | bcftools annotate -h %s" % header_file)
def split_by_region(data):
    """Build per-contig output paths and argument tuples for a regional split.

    NOTE(review): `out_file`, `ref_file` and `config` are not defined in this
    block and `data` is unused -- presumably they come from an enclosing
    scope; confirm against the surrounding code.

    Returns `out_file` and a list of (region_out, ref_file, config, region).
    """
    out_dir = os.path.dirname(out_file)
    base, ext = utils.splitext_plus(os.path.basename(out_file))
    region_names = [contig.name for contig in ref.file_contigs(ref_file, config)]
    args = []
    for region in region_names:
        region_out = os.path.join(out_dir, "%s-regions" % base,
                                  "%s-%s%s" % (base, region, ext))
        utils.safe_makedir(os.path.dirname(region_out))
        args.append((region_out, ref_file, config, region))
    return out_file, args
def add_contig_to_header(line, ref_file):
    """Streaming target to add contigs to a VCF file header.

    The ##fileformat line is returned followed by one ##contig line per
    reference contig; every other line is returned unchanged.
    """
    if not line.startswith("##fileformat=VCF"):
        return line
    contig_lines = ["##contig=<ID=%s,length=%s>" % (contig.name, contig.size)
                    for contig in ref.file_contigs(ref_file)]
    return "\n".join([line] + contig_lines)
def _prep_priority_filter(gemini_db, data):
    """Prepare tabix indexed file with priority based filters and supporting information

    Queries every variant in the GEMINI database, writes one tab-delimited
    row per variant and returns the bgzipped, tabix-indexed output.
    """
    from gemini import GeminiQuery
    out_file = "%s-priority.tsv" % utils.splitext_plus(gemini_db)[0]
    already_done = utils.file_exists(out_file) or utils.file_exists(out_file + ".gz")
    if not already_done:
        ref_chroms = set([x.name for x in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        with file_transaction(data, out_file) as tx_out_file:
            gq = GeminiQuery(gemini_db)
            pops = [
                "aaf_esp_ea", "aaf_esp_aa", "aaf_esp_all",
                "aaf_1kg_amr", "aaf_1kg_eas", "aaf_1kg_sas",
                "aaf_1kg_afr", "aaf_1kg_eur", "aaf_1kg_all",
                "aaf_adj_exac_all", "aaf_adj_exac_afr", "aaf_adj_exac_amr",
                "aaf_adj_exac_eas", "aaf_adj_exac_fin", "aaf_adj_exac_nfe",
                "aaf_adj_exac_oth", "aaf_adj_exac_sas",
            ]
            attrs = (
                "chrom, start, end, ref, alt, impact_so, impact_severity, in_dbsnp, "
                "cosmic_ids, clinvar_sig, clinvar_origin, fitcons, gt_ref_depths, gt_alt_depths"
            ).split(", ")
            gq.run("SELECT %s FROM variants" % ", ".join(attrs + pops))
            sidx = gq.sample_to_idx[dd.get_sample_name(data)]
            # output columns: coordinates, filter, annotations (without the
            # per-sample depth arrays), the "_all" populations and the frequency
            all_pops = [pop for pop in pops if pop.endswith("_all")]
            header = attrs[:5] + ["filter"] + attrs[5:-2] + all_pops + ["freq"]
            with open(tx_out_file, "w") as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                commented_header = list(header)
                commented_header[0] = "#" + commented_header[0]
                writer.writerow(commented_header)
                for row in gq:
                    ref_depth = tz.get_in(["gt_ref_depths", sidx], row, 0)
                    alt_depth = tz.get_in(["gt_alt_depths", sidx], row, 0)
                    out_vals = dict(row.row)
                    try:
                        out_vals["freq"] = "%.2f" % (float(alt_depth) / float(ref_depth + alt_depth))
                    except ZeroDivisionError:
                        out_vals["freq"] = "0.00"
                    out_vals["filter"] = _calc_priority_filter(row, pops)
                    # switch to the converted contig name when only that one is in the reference
                    if (out_vals["chrom"] not in ref_chroms and
                            _hg19_to_GRCh37(out_vals["chrom"]) in ref_chroms):
                        out_vals["chrom"] = _hg19_to_GRCh37(out_vals["chrom"])
                    writer.writerow([out_vals[col] for col in header])
    return vcfutils.bgzip_and_index(out_file, data["config"],
                                    tabix_args="-0 -c '#' -s 1 -b 2 -e 3")
def _average_genome_coverage(data, bam_file):
    """Quickly calculate average coverage for whole genome files using indices.

    Includes all reads, with duplicates. Uses sampling of 10M reads.
    """
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    genome_size = sum([contig.size for contig in contigs])
    aligned_reads = sum(stat.aligned for stat in bam.idxstats(bam_file, data))
    max_sampled = int(1e7)
    with pysam.Samfile(bam_file, "rb") as pysam_bam:
        read_lengths = (aln.query_length for aln in pysam_bam.fetch())
        median_read_size = np.median(list(itertools.islice(read_lengths, max_sampled)))
    return float(aligned_reads * median_read_size) / genome_size
def fix_header(ref_file):
    """Stream SAM text from stdin to stdout, replacing the sequence dictionary.

    Existing @RG and @SQ header lines are dropped. Fresh @SQ lines built from
    the contigs of `ref_file` are written once, immediately before the first
    non-header line. All other lines pass through unchanged.

    NOTE(review): input with no non-header lines gets no @SQ lines at all;
    confirm whether header-only streams are ever passed in.
    """
    added_ref = False
    for line in sys.stdin:
        # skip current read groups, since adding new
        # skip current contigs since adding new sequence dictionary
        if line.startswith(("@RG", "@SQ")):
            continue
        if not added_ref and not line.startswith("@"):
            for x in ref.file_contigs(ref_file):
                sys.stdout.write("@SQ\tSN:%s\tLN:%s\n" % (x.name, x.size))
            added_ref = True
        # write the line itself too: previously the first non-header line was
        # consumed to trigger the @SQ output and then lost
        sys.stdout.write(line)
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.

    Raises ValueError when more than 25% of the distinct contig names in the
    BED file are absent from the reference. Does nothing when no reference
    file is configured or the BED file has no data lines.
    """
    if not dd.get_ref_file(data):
        return
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    # an empty BED would otherwise divide by zero below
    if not contigs:
        return
    ref_contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError("Contigs in BED file %s not in reference genome:\n %s\n"
                         % (in_file, list(contigs - ref_contigs)) +
                         "This is typically due to chr1 versus 1 differences in BED file and reference.")
def _make_ignore_file(work_dir, ref_file, impute_file, ignore_file):
    """Write reference contigs that are not listed in the impute file.

    The first column of each `impute_file` line is a chromosome name; names
    without a "chr" prefix are also accepted in their "chr"-prefixed form.
    One ignored contig name is written per line. Returns `ignore_file`.
    """
    with open(impute_file) as in_handle:
        listed = [line.split()[0] for line in in_handle]
    chroms = set(listed)
    chroms.update("chr%s" % chrom for chrom in listed if not chrom.startswith("chr"))
    with open(ignore_file, "w") as out_handle:
        out_handle.writelines("%s\n" % contig.name
                              for contig in ref.file_contigs(ref_file)
                              if contig.name not in chroms)
    return ignore_file
def _check_ref_files(ref_info, data):
    """Validate contig names in the base reference FASTA.

    Raises ValueError listing every contig whose name contains a character
    outside ALLOWED_CONTIG_NAME_CHARS.
    """
    problems = []
    for contig in ref.file_contigs(ref_info["fasta"]["base"], data["config"]):
        bad_chars = set(char for char in contig.name
                        if char not in ALLOWED_CONTIG_NAME_CHARS)
        if bad_chars:
            problems.append("Found non-allowed characters in chromosome name %s: %s" %
                            (contig.name, " ".join(list(bad_chars))))
    if problems:
        msg = ("\nProblems with input reference file %s\n" % ref_info["fasta"]["base"])
        raise ValueError(msg + "\n".join(problems) + "\n")
def subset_to_genome(in_file, out_file, data):
    """Subset a BED file to only contain contigs present in the reference genome.

    Lines whose first whitespace-separated field is not a reference contig
    (including blank lines) are dropped. Returns `out_file`.
    """
    if utils.file_uptodate(out_file, in_file):
        return out_file
    contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])

    def _in_genome(line):
        fields = line.split()
        return bool(fields) and fields[0] in contigs

    with utils.open_gzipsafe(in_file) as in_handle:
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.writelines(line for line in in_handle if _in_genome(line))
    return out_file
def _make_ignore_file(work_dir, ref_file, ignore_file, impute_file):
    """Create input files with chromosomes to ignore and gender loci.

    Returns the ignore file and the gender loci file written into `work_dir`.
    """
    # From https://github.com/cancerit/cgpBattenberg/blob/dev/perl/share/gender/GRCh37d5_Y.loci
    y_positions = [2934912, 4546684, 4549638, 4550107]
    gl_file = os.path.join(work_dir, "gender_loci.txt")
    with open(impute_file) as in_handle:
        listed = [line.split()[0] for line in in_handle]
    # accept both plain and "chr"-prefixed forms of every listed chromosome
    chroms = set(listed)
    chroms.update("chr%s" % chrom for chrom in listed if not chrom.startswith("chr"))
    with open(ignore_file, "w") as out_handle:
        for contig in ref.file_contigs(ref_file):
            if contig.name not in chroms:
                out_handle.write("%s\n" % contig.name)
    with open(gl_file, "w") as out_handle:
        for contig in ref.file_contigs(ref_file):
            if contig.name not in ["Y", "chrY"]:
                continue
            for pos in y_positions:
                out_handle.write("%s\t%s\n" % (contig.name, pos))
    return ignore_file, gl_file
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.

    Settings (work directory, cores, reference, config) come from the first
    input. Returns the bgzipped and indexed output VCF.
    """
    first = inputs[0]
    out_file = os.path.join(_sv_workdir(first), "%s-wham.vcf.gz" % dd.get_sample_name(first))
    if not utils.file_exists(out_file):
        with file_transaction(first, out_file) as tx_out_file:
            cores = dd.get_cores(first)
            ref_file = dd.get_ref_file(first)
            target_names = [contig.name for contig in ref.file_contigs(ref_file)
                            if chromhacks.is_autosomal_or_x(contig.name)]
            include_chroms = ",".join(target_names)
            all_bams = ",".join([x["align_bam"] for x in inputs] + background_bams)
            cmd = ("whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
                   "| bgzip -c > {tx_out_file}")
            sample_names = ", ".join(dd.get_sample_name(d) for d in inputs)
            do.run(cmd.format(cores=cores, ref_file=ref_file, all_bams=all_bams,
                              include_chroms=include_chroms, tx_out_file=tx_out_file),
                   "WHAM SV caller: %s" % sample_names)
    return vcfutils.bgzip_and_index(out_file, first["config"])
def _check_ref_files(ref_info, data):
    """Validate the genome build setting and reference contig names.

    Raises ValueError when `genome_build` is missing for the sample, or when
    a contig name contains a character outside ALLOWED_CONTIG_NAME_CHARS.
    Contig names are only inspected when a genome build is present.
    """
    problems = []
    if data["genome_build"]:
        for contig in ref.file_contigs(ref_info["fasta"]["base"], data["config"]):
            bad_chars = set(char for char in contig.name
                            if char not in ALLOWED_CONTIG_NAME_CHARS)
            if bad_chars:
                problems.append("Found non-allowed characters in chromosome name %s: %s" %
                                (contig.name, " ".join(list(bad_chars))))
    else:
        problems.append("Did not find 'genome_build' for sample: %s" % dd.get_sample_name(data))
    if problems:
        msg = ("\nProblems with input reference file %s\n" % tz.get_in(["fasta", "base"], ref_info))
        raise ValueError(msg + "\n".join(problems) + "\n")
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.

    Existing ##contig header lines are replaced with ones built from the
    reference. Returns the bgzipped and indexed sorted file.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                out_handle.writelines(
                    "##contig=<ID=%s,length=%s>\n" % (contig.name, contig.size)
                    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]))
            if vcf_file.endswith("vcf.gz"):
                cat_cmd = "zcat"
            else:
                cat_cmd = "cat"
            cmd = ("{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                   "vt sort -m full -o {tx_out_file} -")
            # the command is run from the transactional output directory
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(cat_cmd=cat_cmd, vcf_file=vcf_file,
                                  header_file=header_file, tx_out_file=tx_out_file),
                       "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
def _sort_by_region(fnames, regions, ref_file, config):
    """Sort a set of regionally split files by region for ordered output.

    `regions[i]` describes `fnames[i]` and is either a contig name or a
    (contig, start, end) list/tuple. Files are ordered by the contig's
    position in the reference, then start and end. Returns the sorted file
    names.

    Raises AssertionError if `regions` and `fnames` differ in length and
    KeyError if a region's contig is not in the reference.
    """
    contig_order = {sq.name: i
                    for i, sq in enumerate(ref.file_contigs(ref_file, config))}
    # zip() stops at the shorter input, which would silently drop files
    assert len(regions) == len(fnames), (regions, fnames)
    sitems = []
    for region, fname in zip(regions, fnames):
        if isinstance(region, (list, tuple)):
            c, s, e = region
        else:
            c = region
            s, e = 0, 0
        sitems.append(((contig_order[c], s, e), fname))
    sitems.sort()
    return [x[1] for x in sitems]
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.

    Returns the unproblematic contig names, or an empty list for no limit.
    """
    contig_names = [contig.name for contig in ref.file_contigs(dd.get_ref_file(data))]
    std_chroms = [name for name in contig_names if not name.find(":") > 0]
    found_problem = len(std_chroms) < len(contig_names)
    return std_chroms if found_problem else []
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplication based regional coverage without off-target reads

    Nothing is changed when a coverage interval is already set on `data`.
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.10  # percent of offtarget reads required to be capture (not amplification) based
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions(data)
    callable_file = dd.get_sample_callable(data)
    seq_size = pybedtools.BedTool(vrs if vrs else callable_file).total_coverage()
    contigs = ref.file_contigs(dd.get_ref_file(data), data["config"])
    total_size = sum([contig.size for contig in contigs])
    genome_cov_pct = seq_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > genome_cov_thresh:
        cov_interval = "genome"
    else:
        offtarget_stat_file = dd.get_offtarget_stats(data)
        if offtarget_stat_file:
            with open(offtarget_stat_file) as in_handle:
                stats = yaml.safe_load(in_handle)
            # leave the rate at zero when there are no mapped reads
            if float(stats["mapped"]) > 0:
                offtarget_pct = stats["offtarget"] / float(stats["mapped"])
        cov_interval = "regional" if offtarget_pct > offtarget_thresh else "amplicon"
    logger.info(
        "%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
        % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
           offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.

    Returns the output files that exist after the run.
    """
    if dd.get_coverage_interval(data) != "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    sample_name = dd.get_sample_name(data)
    out_files = []
    for ext in ["roc", "ped", "bed.gz"]:
        out_files.append(os.path.join(out_dir, "%s-indexcov.%s" % (sample_name, ext)))
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [contig.name for contig in ref.file_contigs(dd.get_ref_file(data))
                             if chromhacks.is_sex(contig.name)]
            if gender_chroms:
                gender_args = "--sex %s" % (",".join(gender_chroms))
            else:
                gender_args = ""
            # XXX Skip gender args until we can correctly specify #1793
            gender_args = ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(tmp_dir=tmp_dir, gender_args=gender_args, bam_file=bam_file),
                       "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                err_text = str(msg)
                no_usable = "indexcov: no usable" in err_text
                sex_mismatch = ("indexcov: expected" in err_text and
                                "sex chromosomes, found:" in err_text)
                # these two failure messages are tolerated; anything else is re-raised
                if not (no_usable or sex_mismatch):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [fname for fname in out_files if utils.file_exists(fname)]
def _collapse_transcripts(in_file, window, data, out_dir, include_gene_names=True):
    """Collapse transcripts into min/max coordinates and optionally add windows.

    Output coordinates are padded by `window` on each side and clamped to
    [0, contig size]. The name column is written only when
    `include_gene_names` is true. Returns the result of
    ``bedutils.sort_merge``.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    in_base = os.path.splitext(os.path.basename(in_file))[0]
    out_file = os.path.join(out_dir, "%s-transcripts_w%s.bed" % (in_base, window))
    chrom_sizes = {}
    for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]):
        chrom_sizes[contig.name] = contig.size
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            prep_file = "%s-sortprep%s" % os.path.splitext(tx_out_file)
            sort_cmd = bedutils.get_sort_cmd()
            cmd = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(cmd.format(sort_cmd=sort_cmd, in_file=in_file, prep_file=prep_file),
                   "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                sorted_bed = pybedtools.BedTool(prep_file)

                def _records():
                    for rec in sorted_bed:
                        yield rec

                for _, group in itertools.groupby(_records(), lambda rec: (rec.name, rec.chrom)):
                    members = list(group)
                    first = members[0]
                    for gcoords in _group_coords(members):
                        min_pos = max(min(gcoords) - window, 0)
                        max_pos = min(max(gcoords) + window, chrom_sizes[first.chrom])
                        if include_gene_names:
                            bed_line = "%s\t%s\t%s\t%s\n" % (first.chrom, min_pos, max_pos, first.name)
                        else:
                            bed_line = "%s\t%s\t%s\n" % (first.chrom, min_pos, max_pos)
                        out_handle.write(bed_line)
    return bedutils.sort_merge(out_file, data)
def to_standardonly(in_file, ref_file, data):
    """Subset a VCF input file to standard chromosomes (1-22,X,Y,MT).

    Returns the indexed subset file when it exists, otherwise `in_file`
    (bgzipped and indexed if the subset command was attempted).
    """
    from bcbio.heterogeneity import chromhacks
    out_file = "%s-stdchrs.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file):
        std_names = [contig.name for contig in ref.file_contigs(ref_file)
                     if chromhacks.is_nonalt(contig.name)]
        if std_names:
            with file_transaction(data, out_file) as tx_out_file:
                stds = ",".join(std_names)
                # bcftools view by region needs a bgzipped, indexed input
                in_file = bgzip_and_index(in_file, data["config"])
                cmd = "bcftools view -o {tx_out_file} -O z {in_file} {stds}"
                do.run(cmd.format(tx_out_file=tx_out_file, in_file=in_file, stds=stds),
                       "Subset to standard chromosomes")
    if utils.file_exists(out_file):
        return bgzip_and_index(out_file, data["config"])
    return in_file
def to_standardonly(in_file, ref_file, data):
    """Subset a VCF input file to standard chromosomes (1-22,X,Y,MT).

    Returns the indexed subset file when it exists, otherwise `in_file`
    (bgzipped and indexed if the subset command was attempted).
    """
    from bcbio.heterogeneity import chromhacks
    out_file = "%s-stdchrs.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file):
        std_names = [contig.name for contig in ref.file_contigs(ref_file)
                     if chromhacks.is_nonalt(contig.name)]
        if std_names:
            with file_transaction(data, out_file) as tx_out_file:
                stds = ",".join(std_names)
                # bcftools view by region needs a bgzipped, indexed input
                in_file = bgzip_and_index(in_file, data["config"])
                cmd = "bcftools view -o {tx_out_file} -O z {in_file} {stds}"
                do.run(cmd.format(tx_out_file=tx_out_file, in_file=in_file, stds=stds),
                       "Subset to standard chromosomes")
    if utils.file_exists(out_file):
        return bgzip_and_index(out_file, data["config"])
    return in_file
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions, variant regions or chromosomes

    Returns a list of (contig, start, end) tuples taken, in order of
    preference, from the single callable/variant regions BED file, the
    header of the first BAM in `work_bams`, or the reference contigs.
    """
    callable_files = data.get("callable_regions") or data.get("variant_regions")
    if callable_files:
        assert len(callable_files) == 1
        regions = [(r.chrom, int(r.start), int(r.stop)) for r in pybedtools.BedTool(callable_files[0])]
    else:
        # skip empty entries (e.g. None) so endswith is never called on them
        work_bam = list(tz.take(1, filter(lambda x: x and x.endswith(".bam"), data["work_bams"])))
        if work_bam:
            with contextlib.closing(pysam.Samfile(work_bam[0], "rb")) as pysam_bam:
                regions = [(chrom, 0, length) for (chrom, length) in zip(pysam_bam.references,
                                                                         pysam_bam.lengths)]
        else:
            regions = [(r.name, 0, r.size) for r in
                       ref.file_contigs(dd.get_ref_file(data), data["config"])]
    return regions
def _sort_by_region(fnames, regions, ref_file, config):
    """Sort a set of regionally split files by region for ordered output.

    `regions[i]` describes `fnames[i]` and is either a contig name or a
    (contig, start, end) list/tuple. Returns the file names ordered by the
    contig's position in the reference, then start and end.
    """
    contig_order = dict((sq.name, index)
                        for index, sq in enumerate(ref.file_contigs(ref_file, config)))
    assert len(regions) == len(fnames), (regions, fnames)

    def _region_key(region):
        if isinstance(region, (list, tuple)):
            contig, start, end = region
        else:
            # a bare contig name sorts as a whole-contig region
            contig, start, end = region, 0, 0
        return (contig_order[contig], start, end)

    keyed = [(_region_key(region), fname) for region, fname in zip(regions, fnames)]
    return [fname for _, fname in sorted(keyed)]
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.

    Returns the unproblematic contig names, or an empty list for no limit.
    """
    kept = []
    saw_problem = False
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0:
            saw_problem = True
        else:
            kept.append(contig.name)
    if not saw_problem:
        return []
    return kept
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.

    Does nothing when no reference file is configured. Raises ValueError when
    more than 25% of the distinct contig names in the BED file are absent
    from the reference.
    """
    ref_file = dd.get_ref_file(data)
    if not ref_file:
        return
    header_prefixes = ("#", "track", "browser")
    with utils.open_gzipsafe(in_file) as in_handle:
        bed_contigs = {line.split()[0] for line in in_handle
                       if not line.startswith(header_prefixes) and line.strip()}
    ref_contigs = {x.name for x in ref.file_contigs(ref_file)}
    missing = bed_contigs - ref_contigs
    if bed_contigs and len(missing) / float(len(bed_contigs)) > 0.25:
        raise ValueError(
            "Contigs in BED file %s not in reference genome:\n %s\n"
            % (in_file, list(missing)) +
            "This is typically due to chr1 versus 1 differences in BED file and reference.")
def _get_callable_regions(data):
    """Retrieve regions to parallelize by from callable regions or chromosomes.

    Uses the single BED file in ``data["callable_regions"]`` when present,
    otherwise the contigs in the header of the first ``.bam`` entry of
    ``data["work_bams"]``, otherwise the contigs of the reference genome.
    Returns a list of ``(chrom, start, end)`` tuples.
    """
    import pybedtools
    callable_files = data.get("callable_regions")
    if callable_files:
        assert len(callable_files) == 1
        bed = pybedtools.BedTool(callable_files[0])
        return [(r.chrom, int(r.start), int(r.stop)) for r in bed]
    # Only the first usable BAM is needed; empty entries are ignored
    first_bam = next((x for x in data["work_bams"] if x and x.endswith(".bam")), None)
    if first_bam:
        with pysam.Samfile(first_bam, "rb") as pysam_bam:
            return [(chrom, 0, length)
                    for chrom, length in zip(pysam_bam.references, pysam_bam.lengths)]
    return [(r.name, 0, r.size)
            for r in ref.file_contigs(dd.get_ref_file(data), data["config"])]
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing.
    When ``noalt_calling`` is in the enabled tools or ``altcontigs`` is in
    the excluded regions, contigs rejected by ``chromhacks.is_nonalt`` are
    treated as problematic too. Returns the remaining contig names when any
    contig is problematic, otherwise an empty list (no limit).
    """
    noalt_calling = ("noalt_calling" in dd.get_tools_on(data)
                     or "altcontigs" in dd.get_exclude_regions(data))

    def _is_problem(name):
        return name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(name))

    keep, problematic = [], []
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        bucket = problematic if _is_problem(contig.name) else keep
        bucket.append(contig.name)
    return keep if problematic else []
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.

    Returns a tuple of (VCF path, SV exclusion BED path). A ``*-prep.vcf.gz``
    named after the first item's alignment BAM is returned directly when it
    already exists. If smoove fails with an error accepted by
    ``_allowed_errors``, an empty VCF is written instead; any other failure
    is logged and re-raised.
    """
    data = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(data), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    bam_base = os.path.splitext(os.path.basename(data["align_bam"]))[0]
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz" % (bam_base, ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_num_cores(data)
            # The transaction directory serves as output dir, TMPDIR and cwd
            tx_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(data)
            bam_args = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams,
                                                     items, tx_dir))
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            # Patterns with the '~' and '^' markers removed, for prefix/suffix tests
            clean_excludes = tuple(x.replace("~", "").replace("^", "")
                                   for x in std_excludes)

            def _is_std_exclude(n):
                return n.startswith(clean_excludes) or n.endswith(clean_excludes)

            # Name explicitly the non-standard contigs the patterns do not match
            extra_excludes = [c.name for c in ref.file_contigs(ref_file)
                              if not chromhacks.is_nonalt(c.name)
                              and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + extra_excludes)
            if utils.file_exists(sv_exclude_bed):
                exclude_bed = "--exclude %s" % sv_exclude_bed
            else:
                exclude_bed = ""
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tx_dir):
                try:
                    do.run(cmd.format(tempdir=tx_dir, cores=cores, ref_file=ref_file,
                                      name=name, out_dir=tx_dir, exclude_bed=exclude_bed,
                                      exclude_chrs=exclude_chrs, full_bams=bam_args),
                           "smoove lumpy calling", data)
                except subprocess.CalledProcessError as msg:
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=data["config"],
                                                 samples=[dd.get_sample_name(d) for d in items])
                    else:
                        logger.exception()
                        raise
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file, sv_exclude_bed
def _run_svtyper(in_file, full_bam, sr_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    Inputs without variants are copied through unchanged. The result is
    passed through ``vcfutils.sort_by_ref`` and that path is returned.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(in_file):
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                use_exclude = exclude_file and utils.file_exists(exclude_file)
                regions_to_rm = "-T ^%s" % (exclude_file) if use_exclude else ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                # first record line: the header is finished
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n"
                                         % (region.name, region.size))
                # Older svtyper versions get the split-read BAM via extra flags
                svtyper_extra_opts = "-M -S {sr_bam}" if _older_svtyper_version(svtyper) else ""
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} -B {full_bam} " + svtyper_extra_opts + " | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm,
                                  python=python, svtyper=svtyper, full_bam=full_bam,
                                  sr_bam=sr_bam, header_file=header_file,
                                  tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
            else:
                # NOTE(review): this copies straight to out_file, not
                # tx_out_file -- confirm bypassing the transaction is intended.
                shutil.copy(in_file, out_file)
    return vcfutils.sort_by_ref(out_file, data)
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplication based regional coverage without off-target reads

    Does nothing when a coverage interval is already set. Otherwise stores
    the result in ``data["config"]["algorithm"]["coverage_interval"]``.
    Returns ``data``.
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.01  # percent of offtarget reads required to be capture (not amplification) based
    if dd.get_coverage_interval(data):
        return data
    vrs = dd.get_variant_regions_merged(data)
    callable_file = dd.get_sample_callable(data)
    # Size the target from merged variant regions, else the callable regions
    callable_size = pybedtools.BedTool(vrs if vrs else callable_file).total_coverage()
    total_size = sum(c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    genome_cov_pct = callable_size / float(total_size)
    offtarget_pct = 0.0
    if genome_cov_pct > genome_cov_thresh:
        cov_interval = "genome"
    elif not vrs:
        cov_interval = "regional"
    else:
        offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                         vrs or callable_file, "variant_regions")
        cov_interval = "regional" if offtarget_pct > offtarget_thresh else "amplicon"
    logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
    data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.

    Existing ``##contig`` header lines are dropped and replaced with lines
    generated from the reference contigs before sorting with ``vt sort``.
    Returns the bgzipped and indexed ``*-prep.vcf.gz`` output.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                out_handle.writelines(
                    "##contig=<ID=%s,length=%s>\n" % (region.name, region.size)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]))
            if vcf_file.endswith("vcf.gz"):
                cat_cmd = "zcat"
            else:
                cat_cmd = "cat"
            cmd = ("{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                   "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(cat_cmd=cat_cmd, vcf_file=vcf_file,
                                  header_file=header_file, tx_out_file=tx_out_file),
                       "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
def _check_ref_files(ref_info, data):
    """Validate the genome build and reference fasta configured for a sample.

    Collects problems for a missing ``genome_build``, a missing fasta base
    file, or contig names containing characters outside
    ``ALLOWED_CONTIG_NAME_CHARS``. Raises ValueError listing every problem
    found.
    """
    fasta_base = tz.get_in(["fasta", "base"], ref_info)
    problems = []
    if not data["genome_build"]:
        problems.append("Did not find 'genome_build' for sample: %s" % dd.get_sample_name(data))
    elif not fasta_base:
        problems.append("Did not find fasta reference file for genome %s.\n" % (data["genome_build"]) +
                        "Check tool-data/*.loc files to ensure paths to reference data are correct.")
    else:
        for contig in ref.file_contigs(fasta_base, data["config"]):
            bad_chars = {char for char in contig.name
                         if char not in ALLOWED_CONTIG_NAME_CHARS}
            if bad_chars:
                problems.append("Found non-allowed characters in chromosome name %s: %s" %
                                (contig.name, " ".join(bad_chars)))
    if problems:
        header = "\nProblems with input reference file %s\n" % fasta_base
        raise ValueError(header + "\n".join(problems) + "\n")
def _get_alt_chroms(data):
    """Retrieve alternative contigs as defined in bwa *.alts files.

    If no alt files present (when we're not aligning with bwa), work around
    with standard set of alts based on hg38 -- anything with HLA, _alt or
    _decoy in the name. Names containing ':' are also treated as alts in
    that fallback. Returns a list of contig names.
    """
    bwa_indexes = tz.get_in(["reference", "bwa", "indexes"], data, [])
    alt_files = [f for f in bwa_indexes if f.endswith("alt")]
    if not alt_files:
        def _looks_alt(name):
            return ("_alt" in name or "_decoy" in name
                    or name.startswith("HLA-") or ":" in name)
        return [contig.name for contig in ref.file_contigs(dd.get_ref_file(data))
                if _looks_alt(contig.name)]
    alts = []
    for alt_file in alt_files:
        with open(alt_file) as in_handle:
            # Skip '@' header lines; the first column is the contig name
            alts.extend(line.split()[0].strip() for line in in_handle
                        if not line.startswith("@"))
    return alts
def _check_bam_contigs(in_bam, ref_file, config):
    """Ensure a pre-aligned BAM file matches the expected reference genome.

    Contigs present in both the BAM header and the reference must appear in
    the same relative order, apart from the mitochondrial names in
    ``allowed_outoforder``. Contigs found on only one side are printed as
    warnings.

    Raises:
        ValueError: if the shared contigs differ in order or name.
    """
    # GATK allows chromosome M to be in multiple locations, skip checking it
    allowed_outoforder = ["chrM", "MT"]
    ref_contigs = [c.name for c in ref.file_contigs(ref_file, config)]
    with pysam.Samfile(in_bam, "rb") as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    # Sets give constant-time membership tests; references with many
    # alt/decoy contigs made the list scans quadratic.
    ref_names = set(ref_contigs)
    bam_names = set(bam_contigs)
    extra_bcs = [x for x in bam_contigs if x not in ref_names]
    extra_rcs = [x for x in ref_contigs if x not in bam_names]
    shared_bam = [x for x in bam_contigs
                  if x in ref_names and x not in allowed_outoforder]
    shared_ref = [x for x in ref_contigs
                  if x in bam_names and x not in allowed_outoforder]
    problems = []
    warnings = []
    for bc, rc in zip_longest(shared_bam, shared_ref):
        if bc == rc:
            continue
        if bc and rc:
            problems.append("Reference mismatch. BAM: %s Reference: %s" % (bc, rc))
        elif bc:
            warnings.append("Extra BAM chromosomes: %s" % bc)
        elif rc:
            warnings.append("Extra reference chromosomes: %s" % rc)
    warnings.extend("Extra BAM chromosomes: %s" % bc for bc in extra_bcs)
    warnings.extend("Extra reference chromosomes: %s" % rc for rc in extra_rcs)
    if problems:
        raise ValueError(
            "Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
            "Setting `bam_clean: remove_extracontigs` in the configuration can often fix this issue."
            % "\n".join(problems))
    if warnings:
        print("*** Potential problems in input BAM compared to reference:\n%s\n"
              % "\n".join(warnings))
def _target_chroms_and_header(bam_file, data):
    """Get a list of chromosomes to target from a BAM file header.

    Could potentially handle remapping from chr1 -> 1 but currently disabled
    due to speed issues: remapped candidates are computed but only contigs
    whose BAM name already equals the reference name are kept.

    Returns the BAM contig names, in BAM header order, that are also
    autosomal or sex chromosomes in the reference. Asserts that at least one
    is found.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    target_chroms = {x.name: i
                     for i, x in enumerate(ref.file_contigs(dd.get_ref_file(data)))
                     if chromhacks.is_autosomal_or_sex(x.name)}

    def _candidate(bam_contig):
        """Return the reference name a BAM contig could map to, or None."""
        if bam_contig in target_chroms:
            return bam_contig
        if bam_contig in special_remaps and special_remaps[bam_contig] in target_chroms:
            return special_remaps[bam_contig]
        if bam_contig.startswith("chr") and bam_contig.replace("chr", "") in target_chroms:
            return bam_contig.replace("chr", "")
        if "chr%s" % bam_contig in target_chroms:
            return "chr%s" % bam_contig
        return None

    with pysam.Samfile(bam_file, "rb") as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    out_chroms = []
    for bam_contig in bam_contigs:
        target_chrom = _candidate(bam_contig)
        # target_chrom == bam_contig ensures we don't try chr1 -> 1 style remapping.
        # Matching BAM/reference order is not checked: it is not required when
        # dealing with SAM file header fixing.
        if target_chrom and target_chrom == bam_contig:
            out_chroms.append(target_chrom)
    assert out_chroms, ("remove_extracontigs: Did not find any chromosomes in reference file: %s %s" %
                        (bam_file, target_chroms))
    return out_chroms
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Writes a cleaned BED file under ``<work_dir>/bedprep`` holding only the
    regions whose contig is in the reference and is accepted by
    ``chromhacks.is_nonalt``, then sets it as the sample's variant regions.
    Returns the updated ``data``.
    """
    vr_file = (dd.get_variant_regions(data)
               or regions.get_sv_bed(data, "transcripts", out_dir=out_dir))
    contigs = {c.name for c in ref.file_contigs(dd.get_ref_file(data))}
    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    out_file = os.path.join(
        bedprep_dir,
        "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs and chromhacks.is_nonalt(r.chrom):
                            out_handle.write(str(r))
    return dd.set_variant_regions(data, out_file)
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.

    Calls ``whamg`` on the alignment BAMs of ``inputs`` plus
    ``background_bams``, restricted to the reference contigs accepted by
    ``chromhacks.is_autosomal_or_x``. Returns the bgzipped and indexed VCF.
    """
    first = inputs[0]
    out_file = os.path.join(_sv_workdir(first),
                            "%s-wham.vcf.gz" % dd.get_sample_name(first))
    if not utils.file_exists(out_file):
        with file_transaction(first, out_file) as tx_out_file:
            cores = dd.get_cores(first)
            ref_file = dd.get_ref_file(first)
            chrom_names = [c.name for c in ref.file_contigs(ref_file)
                           if chromhacks.is_autosomal_or_x(c.name)]
            bam_paths = [x["align_bam"] for x in inputs] + background_bams
            cmd = ("whamg -x {cores} -a {ref_file} -f {all_bams} -c {include_chroms} "
                   "| bgzip -c > {tx_out_file}")
            sample_names = ", ".join(dd.get_sample_name(d) for d in inputs)
            do.run(cmd.format(cores=cores, ref_file=ref_file,
                              all_bams=",".join(bam_paths),
                              include_chroms=",".join(chrom_names),
                              tx_out_file=tx_out_file),
                   "WHAM SV caller: %s" % sample_names)
    return vcfutils.bgzip_and_index(out_file, first["config"])
def _check_prealigned_bam(in_bam, ref_file, config):
    """Ensure a pre-aligned BAM file matches the expected reference genome.

    Compares the BAM header contig names position by position against the
    reference contigs. Reference contigs missing from the end of the BAM
    header are printed as warnings.

    Raises:
        ValueError: on a name or order mismatch, or when the BAM header has
            more contigs than the reference.
    """
    # ``izip_longest`` only exists on Python 2; Python 3 renamed it to
    # ``zip_longest``. Resolve whichever this interpreter provides.
    longest_zip = getattr(itertools, "zip_longest", None) or itertools.izip_longest
    ref_contigs = [c.name for c in ref.file_contigs(ref_file, config)]
    with contextlib.closing(pysam.Samfile(in_bam, "rb")) as bamfile:
        bam_contigs = [c["SN"] for c in bamfile.header["SQ"]]
    problems = []
    warnings = []
    for bc, rc in longest_zip(bam_contigs, ref_contigs):
        if bc == rc:
            continue
        if bc and rc:
            problems.append("Reference mismatch. BAM: %s Reference: %s" % (bc, rc))
        elif bc:
            problems.append("Extra BAM chromosomes: %s" % bc)
        elif rc:
            warnings.append("Extra reference chromosomes: %s" % rc)
    if problems:
        raise ValueError("Unexpected order, name or contig mismatches between input BAM and reference file:\n%s\n"
                         % "\n".join(problems))
    if warnings:
        print("*** Potential problems in input BAM compared to reference:\n%s\n"
              % "\n".join(warnings))
def _get_maxcov_downsample(data):
    """Calculate maximum coverage downsampling for whole genome samples.

    Estimates average coverage from the read count and the read sizes
    reported by ``bwa.fastq_size_output`` for the first input file. Returns
    that coverage times the configured multiplier, as an int, when it
    reaches the minimum of 10.

    Returns None if we're not doing downsampling.
    """
    from bcbio.bam import ref
    from bcbio.ngsalign import alignprep, bwa
    from bcbio.variation import coverage
    min_coverage_for_downsampling = 10
    multiplier = dd.get_maxcov_downsample(data)
    fastq_file = data["files"][0]
    num_reads = alignprep.total_reads_from_grabix(fastq_file)
    if not (num_reads and multiplier and multiplier > 0):
        return None
    vrs = dd.get_variant_regions_merged(data)
    total_size = sum(c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    if vrs:
        callable_size = pybedtools.BedTool(vrs).total_coverage()
        genome_cov_pct = callable_size / float(total_size)
    else:
        # Without variant regions the whole reference counts as covered
        callable_size = total_size
        genome_cov_pct = 1.0
    whole_genome = (genome_cov_pct > coverage.GENOME_COV_THRESH
                    and dd.get_coverage_interval(data) in ["genome", None, False])
    if not whole_genome:
        return None
    total_counts = 0
    total_sizes = 0
    for count, size in bwa.fastq_size_output(fastq_file, 5000):
        total_counts += int(count)
        total_sizes += int(size) * int(count)
    read_size = float(total_sizes) / float(total_counts)
    avg_cov = float(num_reads * read_size) / callable_size
    if avg_cov >= min_coverage_for_downsampling:
        return int(avg_cov * multiplier)
    return None
def _sort_by_region(fnames, regions, ref_file, config):
    """Sort a set of regionally split files by region for ordered output.

    ``regions`` and ``fnames`` are parallel sequences. Each region is one of:
      - a ``(contig, start, end)`` list or tuple
      - a ``"contig:start-end"`` string
      - a bare contig name, which sorts with start and end of 0

    Each file name is used once; later duplicates are dropped. Items are
    ordered by reference contig order, then start, then end.

    Returns a list of ``(contig, fname)`` tuples.
    """
    contig_order = {sq.name: i
                    for i, sq in enumerate(ref.file_contigs(ref_file, config))}
    assert len(regions) == len(fnames), (regions, fnames)
    sitems = []
    added_fnames = set()
    for region, fname in zip(regions, fnames):
        if fname in added_fnames:
            continue
        if isinstance(region, (list, tuple)):
            c, s, e = region
        elif (isinstance(region, six.string_types) and region not in contig_order
              and region.find(":") >= 0):
            # Contig names can contain ':' themselves (HLA contigs). A string
            # that is exactly a contig name falls through to the bare-name
            # branch, and coordinates are split off at the last ':' only.
            c, coords = region.rsplit(":", 1)
            s, e = [int(x) for x in coords.split("-")]
        else:
            c, s, e = region, 0, 0
        sitems.append(((contig_order[c], s, e), c, fname))
        added_fnames.add(fname)
    sitems.sort()
    return [(c, fname) for _, c, fname in sitems]
def check_bed_coords(in_file, data):
    """Ensure BED file coordinates match reference genome.

    Catches errors like using a hg38 BED file for an hg19 genome run.
    Does nothing when no reference file is configured. Lines whose end
    coordinate is not an integer, or whose contig is not in the reference,
    are skipped.

    Raises:
        ValueError: if a BED end coordinate is past the end of its contig.
    """
    ref_file = dd.get_ref_file(data)
    if not ref_file:
        return
    contig_sizes = {contig.name: contig.size for contig in ref.file_contigs(ref_file)}
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if line.startswith(("#", "track", "browser")) or not line.strip():
                continue
            parts = line.split()
            # chrom, start and end are the three required BED columns, so
            # plain three-column files must be checked too.
            if len(parts) < 3:
                continue
            try:
                end = int(parts[2])
            except ValueError:
                continue
            check_size = contig_sizes.get(parts[0])
            if check_size and end > check_size:
                raise ValueError("Found BED coordinate off the end of the chromosome:\n%s%s\n"
                                 "Is the input BED from the right genome build?" %
                                 (line, in_file))
def _too_many_contigs(ref_file):
    """Check for more contigs than the maximum samblaster deduplication supports.

    Returns True when the reference has 32768 or more contigs.
    """
    max_contigs = 32768
    num_contigs = sum(1 for _ in ref.file_contigs(ref_file))
    return num_contigs >= max_contigs
def _merge_target_information(samples, metrics_dir):
    """Write genome and target region details to ``target_info.yaml``.

    The YAML file is written inside ``metrics_dir`` and holds details shared
    by all samples: genome name and size, variant region BED details,
    coverage BED details and the coverage interval. Nothing is recomputed if
    the file already exists. Per-sample lookups use the first sample.

    Returns ``samples`` unchanged.
    """
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples
    # Distinct values across samples; a set of size 1 means all samples agree
    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(
        dd.get_variant_regions_orig(data) for data in samples)
    data = samples[0]
    info = {}
    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([
                c.size
                for c in ref.file_contigs(dd.get_ref_file(data), data["config"])
            ]),
        }
    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(
            original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(
                len(x) for x in pybedtools.BedTool(
                    dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        # The gene count is only recorded when one is returned
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        # No single shared variant regions file
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                # Coverage BED is the variant regions file: reuse its details
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            # No coverage BED configured: fall back to the variant regions details
            info["coverage_bed_info"] = info["variants_regions_info"]

    # The coverage interval is reported only when all samples share it
    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]
    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)
    return samples
def get_mitochondrial_chroms(data):
    """Return names of reference contigs that ``is_mitochondrial`` accepts."""
    contig_names = (c.name for c in ref.file_contigs(dd.get_ref_file(data)))
    return [name for name in contig_names if is_mitochondrial(name)]
def get_hla_chroms(ref_file):
    """Return names of contigs in ``ref_file`` that ``is_hla`` accepts."""
    contig_names = (c.name for c in ref.file_contigs(ref_file))
    return [name for name in contig_names if is_hla(name)]
def _merge_target_information(samples):
    """Write genome and target region details to ``metrics/target_info.yaml``.

    Holds genome details when all samples share a build, plus variant region
    and coverage BED details when the samples agree on them. Nothing is
    recomputed if the file already exists. Per-sample lookups use the first
    sample. Returns ``samples`` unchanged.
    """
    out_file = os.path.join("metrics", "target_info.yaml")
    if utils.file_exists(out_file):
        return samples

    def _num_named_genes(annotated_bed):
        """Count distinct non-empty, non-'.' names in an annotated BED."""
        return len({r.name for r in pybedtools.BedTool(annotated_bed)
                    if r.name and r.name != "."})

    genomes = {dd.get_genome_build(d) for d in samples}
    coverage_beds = {dd.get_coverage(d) for d in samples}
    variant_regions = {dd.get_variant_regions(d) for d in samples}
    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across samples
    if len(genomes) == 1:
        genome_size = sum(c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"]))
        info["genome_info"] = {"name": dd.get_genome_build(data), "size": genome_size}

    # Reporting in MultiQC only if the target is the same across samples
    vcr = None
    if len(variant_regions) == 1:
        vcr = dd.get_variant_regions_orig(data)
        vcr_merged = dd.get_variant_regions_merged(data)
        vcr_ann = annotate.add_genes(vcr, data)
        # NOTE(review): "bed" stores the set itself, not a path -- confirm intended.
        info["variants_regions_info"] = {
            "bed": variant_regions,
            "size": sum(len(x) for x in pybedtools.BedTool(vcr_merged)),
            "regions": pybedtools.BedTool(vcr).count(),
            "genes": _num_named_genes(vcr_ann),
        }
    elif len(variant_regions) == 0:
        info["variants_regions_info"] = {"bed": None}

    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        bed = dd.get_coverage(data)
        if vcr and vcr == bed:
            info["coverage_bed_info"] = info["variants_regions_info"]
        elif bed:
            ann_bed = annotate.add_genes(bed, data)
            info["coverage_bed_info"] = {
                "bed": bed,
                "size": pybedtools.BedTool(bed).total_coverage(),
                "regions": pybedtools.BedTool(bed).count(),
                "genes": _num_named_genes(ann_bed),
            }

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)
    return samples
def autosomal_or_x_coords(ref_file):
    """Return ``(name, 0, size)`` for each contig that ``is_autosomal_or_x`` accepts."""
    return [(contig.name, 0, contig.size)
            for contig in ref.file_contigs(ref_file)
            if is_autosomal_or_x(contig.name)]
def has_build37_contigs(data):
    """Check whether the reference has GL contigs known to hg19 or GRCh37.

    Returns True if any contig whose name starts with "GL" or contains "_gl"
    is listed in ``naming.GMAP["hg19"]`` or ``naming.GMAP["GRCh37"]``.
    """
    def _is_build37_gl(name):
        if not (name.startswith("GL") or "_gl" in name):
            return False
        return name in naming.GMAP["hg19"] or name in naming.GMAP["GRCh37"]

    return any(_is_build37_gl(contig.name)
               for contig in ref.file_contigs(dd.get_ref_file(data)))