def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Run GATK indel realignment on a BAM file and return the resulting BAM.

    The input BAM and the reference are indexed up front. When ``region`` is
    given, the BAM is first subset to that region and re-indexed. If
    ``has_aligned_reads`` reports nothing to work on, realignment is skipped:
    the BAM is copied to ``out_file`` when one was requested, otherwise the
    (possibly subset) input path is returned as is.
    """
    runner = broad.runner_from_config(config)
    bam.index(align_bam, config)
    runner.run_fn("picard_index_ref", ref_file)
    ref.fasta_idx(ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        bam.index(align_bam, config)

    if not has_aligned_reads(align_bam, region):
        # Nothing to realign: hand the input back, honouring out_file if set.
        if out_file:
            shutil.copy(align_bam, out_file)
            return out_file
        return align_bam

    target_intervals = gatk_realigner_targets(
        runner, align_bam, ref_file, dbsnp, region, out_file, deep_coverage,
        config["algorithm"].get("variant_regions", None))
    # A separate Picard fixmate pass is not needed with GATK releases after
    # Feb 2011; mate information is fixed on the fly during realignment.
    return gatk_indel_realignment(runner, align_bam, ref_file, target_intervals,
                                  region, out_file, deep_coverage)
def _get_coverage_file(in_bam, ref_file, region, region_file, depth, base_file, data):
    """Produce a ``bedtools genomecov`` BED of coverage for one region.

    Reads are pre-filtered with sambamba to mapping quality above zero, in
    line with the defaults of GATK's CallableLoci. The output path is
    ``base_file`` with its extension replaced by ``-genomecov.bed``.
    ``depth`` is accepted but not used by this function.

    Returns the path to the coverage BED file.
    """
    out_file = "%s-genomecov.bed" % utils.splitext_plus(base_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            fai_file = ref.fasta_idx(ref_file, data["config"])
            sambamba = config_utils.get_program("sambamba", data["config"])
            bedtools = config_utils.get_program("bedtools", data["config"])
            template = ("{sambamba} view -F 'mapping_quality > 0' -L {region_file} -f bam -l 1 {in_bam} | "
                        "{bedtools} genomecov -split -ibam stdin -bga -g {fai_file} "
                        "> {tx_out_file}")
            command = template.format(sambamba=sambamba, region_file=region_file,
                                      in_bam=in_bam, bedtools=bedtools,
                                      fai_file=fai_file, tx_out_file=tx_out_file)
            do.run(command, "bedtools genomecov: %s" % (str(region)), data)
    # If the pipeline left no usable output (no coverage on the whole contig),
    # write explicit zero-coverage rows for the reference intervals instead.
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for interval in get_ref_bedtool(ref_file, data["config"], region):
                    row = (interval.chrom, interval.start, interval.end, 0)
                    out_handle.write("%s\t%s\t%s\t%s\n" % row)
    return out_file
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event

    Returns the path of the annotated BED (``in_file`` with its extension
    replaced by ``-annotated.bed``). When no exon BED is available or
    ``utils.file_exists(in_file)`` is false, ``in_file`` is returned unchanged.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # Peek at the first record to learn how many columns the input has.
            # next(iter(...)) works on both Python 2 and 3, unlike .next().
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # list(...) so the result can be appended to on Python 3 as well.
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            # NOTE(review): presumably the 4th column of the exon BED that
            # `bedtools closest` appends after the input columns -- confirm.
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # The template below is filled from locals(); keep the local
                # names in sync with the {placeholders}.
                cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                       "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
def ref_file_from_bam(bam_file, data):
    """Build a reference FASTA limited to the contigs reported for a BAM file.

    Contig names come from ``idxstats`` (the ``*`` entry is dropped) and are
    written to a ``-contigs.txt`` list that ``seqtk subseq`` uses to extract
    the matching sequences from the configured reference. The subset FASTA
    lives under ``<work_dir>/inputs/ref`` and is indexed with both
    ``ref.fasta_idx`` and Picard's ``picard_index_ref``.

    Returns a dictionary with the subset FASTA path under the ``"base"`` key.
    """
    ref_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref"))
    new_ref = os.path.join(ref_dir, "%s-subset.fa" % dd.get_genome_build(data))
    if not utils.file_exists(new_ref):
        with file_transaction(data, new_ref) as tx_out_file:
            contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
            with open(contig_file, "w") as out_handle:
                names = [stat.contig for stat in idxstats(bam_file, data)
                         if stat.contig != "*"]
                out_handle.writelines("%s\n" % name for name in names)
            seqtk_args = (dd.get_ref_file(data), contig_file, tx_out_file)
            do.run("seqtk subseq -l 100 %s %s > %s" % seqtk_args,
                   "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    ref.fasta_idx(new_ref, data["config"])
    picard = broad.runner_from_path("picard", data["config"])
    picard.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
def get_padded_bed_file(out_dir, bed_file, padding, data):
    """Pad the intervals of a BED file on both sides with ``bedtools slop``.

    The padded file is written to ``out_dir`` as ``<bed stem>-padded.bed``
    and is only regenerated when ``utils.file_uptodate`` reports it stale
    relative to ``bed_file``.

    Returns the path to the padded BED file.
    """
    bed_stem = utils.splitext_plus(os.path.basename(bed_file))[0]
    out_file = os.path.join(out_dir, "%s-padded.bed" % (bed_stem))
    if not utils.file_uptodate(out_file, bed_file):
        # bedtools slop needs the genome file (here the FASTA index) to clip
        # padded intervals at the ends of each contig.
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            template = "bedtools slop -i {bed_file} -g {fai_file} -b {padding} > {tx_out_file}"
            do.run(template.format(bed_file=bed_file, fai_file=fai_file,
                                   padding=padding, tx_out_file=tx_out_file),
                   "Pad BED file", data)
    return out_file
def get_padded_bed_file(out_dir, bed_file, padding, data):
    """Pad a BED file with ``bedtools slop`` and merge the padded intervals.

    The bedtools executable is resolved through ``config_utils.get_program``
    (falling back to ``"bedtools"``) and is used for both the ``slop`` and
    the ``merge`` stage, so the two stages always run the same binary.

    The result is written to ``out_dir`` as ``<bed stem>-padded.bed`` and is
    only regenerated when ``utils.file_uptodate`` reports it stale relative
    to ``bed_file``.

    Returns the path to the padded BED file.
    """
    bedtools = config_utils.get_program("bedtools", data, default="bedtools")
    out_file = os.path.join(out_dir, "%s-padded.bed" % (utils.splitext_plus(os.path.basename(bed_file))[0]))
    if utils.file_uptodate(out_file, bed_file):
        return out_file
    # The FASTA index doubles as the genome file bedtools slop requires.
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    with file_transaction(data, out_file) as tx_out_file:
        # Use the resolved {bedtools} for merge as well; previously the merge
        # stage was hard-coded to whatever `bedtools` was first on PATH.
        cmd = ("{bedtools} slop -i {bed_file} -g {fai_file} -b {padding} | "
               "{bedtools} merge -i - > {tx_out_file}")
        do.run(cmd.format(bedtools=bedtools, bed_file=bed_file, fai_file=fai_file,
                          padding=padding, tx_out_file=tx_out_file),
               "Pad BED file", data)
    return out_file
def _subset_bed_by_region(in_file, out_file, regions, ref_file, do_merge=True):
    """Write the parts of a BED file that intersect the given regions.

    ``regions`` is an iterable of ``(chrom, start, end)`` triples. The
    intersection is sorted (using the reference FASTA index as ``faidx`` when
    ``ref_file`` is given), features whose ``len()`` is not greater than 1
    are dropped, and the remainder is optionally merged before being saved
    to ``out_file``.
    """
    def _longer_than_one(feat):
        return len(feat) > 1

    source_bed = pybedtools.BedTool(in_file)
    region_text = "\n".join("%s\t%s\t%s" % (c, s, e) for c, s, e in regions) + "\n"
    wanted_bed = pybedtools.BedTool(region_text, from_string=True)
    sort_kwargs = {}
    if ref_file:
        sort_kwargs = {"faidx": ref.fasta_idx(ref_file)}

    # Intermediate saveas() calls materialise each step before the next runs.
    intersected = source_bed.intersect(wanted_bed, nonamecheck=True).saveas()
    ordered = intersected.sort(**sort_kwargs).saveas()
    kept = ordered.filter(_longer_than_one)
    if do_merge:
        kept.saveas().merge().saveas(out_file)
    else:
        kept.saveas(out_file)
def get_padded_bed_file(bed_file, padding, data, bedprep_dir=None):
    """Pad the intervals of a BED file on both sides with ``bedtools slop``.

    Output goes to ``bedprep_dir`` as ``<bed stem>-padded.bed``; when no
    directory is supplied, a ``bedprep`` directory under the work directory
    (``data["dirs"]["work"]``) is created and used. The file is only
    regenerated when ``utils.file_uptodate`` reports it stale relative to
    ``bed_file``.

    Returns the path to the padded BED file.
    """
    bedprep_dir = bedprep_dir or utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "bedprep"))
    bed_stem = utils.splitext_plus(os.path.basename(bed_file))[0]
    out_file = os.path.join(bedprep_dir, "%s-padded.bed" % (bed_stem))
    if not utils.file_uptodate(out_file, bed_file):
        # The FASTA index serves as the genome file bedtools slop requires.
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            template = "bedtools slop -i {bed_file} -g {fai_file} -b {padding} > {tx_out_file}"
            do.run(template.format(bed_file=bed_file, fai_file=fai_file,
                                   padding=padding, tx_out_file=tx_out_file),
                   "Pad BED file", data)
    return out_file
def fai_from_bam(ref_file, bam_file, out_file, data):
    """Write a FASTA index containing only the contigs reported for a BAM.

    Lines of the reference ``.fai`` are copied to ``out_file`` when their
    first whitespace-separated field is one of the contig names returned by
    ``idxstats``; blank lines are ignored. Nothing is rewritten when
    ``out_file`` is already up to date relative to ``bam_file``.

    Returns ``out_file``.
    """
    wanted = {stat.contig for stat in idxstats(bam_file, data)}
    if utils.file_uptodate(out_file, bam_file):
        return out_file
    with open(ref.fasta_idx(ref_file, data["config"])) as in_handle, \
            file_transaction(data, out_file) as tx_out_file, \
            open(tx_out_file, "w") as out_handle:
        for line in in_handle:
            fields = line.split()
            # An empty split means a blank/whitespace-only line: skip it.
            if fields and fields[0] in wanted:
                out_handle.write(line)
    return out_file
def _prep_callable_bed(in_file, work_dir, stats, data):
    """Sort and merge callable BED regions so SVs are not counted twice.

    The input is ordered with gsort against the reference FASTA index,
    merged with ``bedtools merge -d stats["merge_size"]`` and bgzipped into
    ``<work_dir>/<input stem>-merge.bed.gz``.

    Returns the result of ``vcfutils.bgzip_and_index`` on that file.
    """
    in_stem = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(work_dir, "%s-merge.bed.gz" % in_stem)
    gsort = config_utils.get_program("gsort", data)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            template = ("{gsort} {in_file} {fai_file} | bedtools merge -i - -d {stats[merge_size]} | "
                        "bgzip -c > {tx_out_file}")
            do.run(template.format(gsort=gsort, in_file=in_file, fai_file=fai_file,
                                   stats=stats, tx_out_file=tx_out_file),
                   "Prepare SV callable BED regions")
    return vcfutils.bgzip_and_index(out_file, data["config"])
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    One output is requested from ``subset_vcf`` for every contig listed in
    the reference FASTA index, named after ``in_file`` with ``-<chrom>``
    appended to the stem and a ``.vcf`` suffix. Files go to ``out_dir``,
    which defaults to a ``split`` directory next to ``in_file``.

    Blank lines in the index are skipped rather than raising ``ValueError``
    on unpacking.

    Returns the list of per-chromosome output paths, in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    out_files = []
    with open(ref.fasta_idx(ref_file, config)) as in_handle:
        for line in in_handle:
            fields = line.split()
            if not fields:
                # Tolerate empty/whitespace-only lines in the .fai file.
                continue
            # The first two index columns are the contig name and its length;
            # the length is passed on as read (a string), as before.
            chrom, size = fields[:2]
            out_file = os.path.join(out_dir,
                                    os.path.basename(replace_suffix(append_stem(in_file, "-%s" % chrom), ".vcf")))
            subset_vcf(in_file, (chrom, 0, size), out_file, config)
            out_files.append(out_file)
    return out_files
def subset_by_genes(in_file, data, out_dir, pad):
    """Restrict a BED file to regions within ``pad`` bases of known exons.

    The exon BED from ``regions.get_sv_bed`` is padded by ``pad`` and merged
    into a ``-targetregions`` file stored alongside the output; ``in_file``
    is then intersected with it and sorted. When no exon BED is available or
    ``utils.file_exists(in_file)`` is false, ``in_file`` is returned
    untouched.

    Returns the path to ``<out_dir>/<input stem>-geneonly.bed``, or
    ``in_file`` when no subsetting was possible.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    if not (gene_file and utils.file_exists(in_file)):
        return in_file

    in_stem = utils.splitext_plus(os.path.basename(in_file))[0]
    out_file = os.path.join(out_dir, "%s-geneonly.bed" % in_stem)
    if utils.file_uptodate(out_file, in_file):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        want_region_file = "%s-targetregions%s" % utils.splitext_plus(out_file)
        padded_genes = pybedtools.BedTool(gene_file).slop(g=fai_file, b=pad).merge()
        padded_genes.saveas(want_region_file)
        overlapping = pybedtools.BedTool(in_file).intersect(b=want_region_file)
        overlapping.sort().saveas(tx_out_file)
    return out_file
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Annotate a BED file with genes from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    work_dir -- optional directory for the annotated file; by default it is
                written next to ``in_file``

    Returns the path to the ``-annotated.bed`` file, or ``in_file`` itself
    when no exon BED is available or ``utils.file_exists(in_file)`` is false.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if not (gene_file and utils.file_exists(in_file)):
        return in_file

    out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            _add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file,
                              data, max_distance)
    return out_file
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    For each contig in the reference FASTA index, ``subset_vcf`` is asked to
    write the records of ``in_file`` on that contig to a file named after
    ``in_file`` with ``-<chrom>`` added to the stem and a ``.vcf`` suffix.
    Outputs are placed in ``out_dir`` (default: a ``split`` directory beside
    ``in_file``).

    Blank lines in the index are ignored instead of failing with
    ``ValueError`` during unpacking.

    Returns the list of per-chromosome output paths, in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    out_files = []
    with open(ref.fasta_idx(ref_file, config)) as in_handle:
        for line in in_handle:
            parts = line.split()
            if not parts:
                # Empty or whitespace-only index line: nothing to split on.
                continue
            # Contig name and length are the first two .fai columns; the
            # length stays a string, exactly as it was passed on before.
            chrom, size = parts[:2]
            out_file = os.path.join(
                out_dir,
                os.path.basename(
                    replace_suffix(append_stem(in_file, "-%s" % chrom), ".vcf")))
            subset_vcf(in_file, (chrom, 0, size), out_file, config)
            out_files.append(out_file)
    return out_files
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event

    Returns the path of the annotated BED (``in_file`` with its extension
    replaced by ``-annotated.bed``). When no exon BED is available or
    ``utils.file_exists(in_file)`` is false, ``in_file`` is returned unchanged.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # Inspect the first record to find the input's column count.
            # The builtin next() is used because iterators have no .next()
            # method on Python 3.
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # Materialise as a list: Python 3's range() cannot be appended to.
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            # NOTE(review): presumably the 4th column of the exon BED that
            # `bedtools closest` appends after the input columns -- confirm.
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (
                    r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'"""
                    % (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # The template is filled from locals(), so the local variable
                # names above must match the {placeholders} below.
                cmd = (
                    "{sort_cmd} -k1,1 -k2,2n {in_file} | "
                    "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                    "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                    "{distance_filter} | cut -f 1-{max_column} | "
                    "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}"
                )
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
def _get_coverage_file(in_bam, ref_file, region, region_file, depth, base_file, data):
    """Summarise read coverage over a region as a genomecov BED file.

    sambamba keeps only reads with mapping quality above zero (matching the
    defaults of GATK's CallableLoci) before ``bedtools genomecov`` computes
    coverage. The output is named after ``base_file`` with a
    ``-genomecov.bed`` suffix. ``depth`` is part of the signature but is not
    read here.

    Returns the path to the coverage BED file.
    """
    out_file = "%s-genomecov.bed" % utils.splitext_plus(base_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            programs = {
                "fai_file": ref.fasta_idx(ref_file, data["config"]),
                "sambamba": config_utils.get_program("sambamba", data["config"]),
                "bedtools": config_utils.get_program("bedtools", data["config"]),
            }
            pipeline = ("{sambamba} view -F 'mapping_quality > 0' -L {region_file} -f bam -l 1 {in_bam} | "
                        "{bedtools} genomecov -split -ibam stdin -bga -g {fai_file} "
                        "> {tx_out_file}")
            do.run(pipeline.format(region_file=region_file, in_bam=in_bam,
                                   tx_out_file=tx_out_file, **programs),
                   "bedtools genomecov: %s" % (str(region)), data)
    # Still no usable output means no coverage on the whole contig; emit
    # zero-coverage rows for the reference intervals so callers get a file.
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.writelines(
                    "%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, 0)
                    for feat in get_ref_bedtool(ref_file, data["config"], region))
    return out_file