def _combine_files(tsv_files, work_dir, data):
    """Merge per-caller prioritized TSV files into a single sorted file.

    The result is written to ``<work_dir>/<sample>-prioritize.tsv`` inside a
    file transaction, and only when that file does not exist yet. A single
    header line is emitted first; the concatenated inputs follow, sorted on
    column 3 and then numerically on column 4 (``chrom`` and ``start`` in the
    header below).

    Returns the path of the combined file.
    """
    column_names = ["caller", "sample", "chrom", "start", "end", "svtype",
                    "known", "lof", "annotation",
                    "split_read_support", "paired_end_support"]
    out_file = os.path.join(work_dir, "%s-prioritize.tsv" % dd.get_sample_name(data))
    # Nothing to do when a previous run already produced the output.
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        template = "{{ echo '{header}'; cat {input_files} | {sort_cmd} -k3,3 -k4,4n; }} > {tx_out_file}"
        shell_cmd = template.format(header="\t".join(column_names),
                                    input_files=" ".join(tsv_files),
                                    sort_cmd=bedutils.get_sort_cmd(),
                                    tx_out_file=tx_out_file)
        do.run(shell_cmd, "Combine prioritized from multiple callers")
    return out_file
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED

    in_file -- BED file to annotate; read through zcat when it ends in ``.gz``
    gene_file -- BED file of genes, subset to the reference genome before use
    fai_file -- FASTA index; its first two columns are passed to bedtools as
                the genome file
    out_file -- destination for the annotated BED
    data -- sample data passed on to ``bedutils.subset_to_genome``
    max_distance -- gene names further away than this are replaced with '.'

    An input without records is copied to ``out_file`` unchanged.
    """
    try:
        # next(iter(...)) works on both Python 2 and 3; iterators have no
        # .next() method on Python 3.
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    # keep everything after standard chrom/start/end, 1-based
    # (list() because a Python 3 range object cannot be appended to)
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file), "%s-genomeonly.bed" %
                                   (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    # NOTE: the placeholders below are filled from local variable names via
    # locals(), so renaming a local requires updating the template as well.
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
           "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
           "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {ready_gene_file}) | "
           "{distance_filter} | cut -f 1-{max_column} | "
           "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
def _collapse_transcripts(in_file, window, data, out_dir):
    """Reduce transcript records to padded min/max intervals.

    in_file -- BED file of transcript features
    window -- number of bases added on each side of every collapsed interval
    data -- sample data; supplies the reference file and configuration
    out_dir -- output directory, defaulting to the directory of ``in_file``

    When the output is out of date, the input is sorted on column 4 then
    column 1, records are grouped by (name, chrom) and every coordinate group
    produced by ``_group_coords`` is written as one interval. Interval starts
    are clamped at 0 and ends at the contig size.

    Returns the result of ``bedutils.sort_merge`` on the collapsed file.
    """
    if out_dir is None:
        out_dir = os.path.dirname(in_file)
    base_name = os.path.splitext(os.path.basename(in_file))[0]
    out_file = os.path.join(out_dir, "%s-transcripts_w%s.bed" % (base_name, window))
    chrom_sizes = {contig.name: contig.size
                   for contig in ref.file_contigs(dd.get_ref_file(data), data["config"])}
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            tx_base, tx_ext = os.path.splitext(tx_out_file)
            prep_file = "%s-sortprep%s" % (tx_base, tx_ext)
            sort_template = "{sort_cmd} -k4,4 -k1,1 {in_file} > {prep_file}"
            do.run(sort_template.format(sort_cmd=bedutils.get_sort_cmd(),
                                        in_file=in_file,
                                        prep_file=prep_file),
                   "Sort BED file by transcript name")
            with open(tx_out_file, "w") as out_handle:
                # Work around for segmentation fault issue with groupby
                # https://github.com/daler/pybedtools/issues/131#issuecomment-89832476
                sorted_bed = pybedtools.BedTool(prep_file)

                def _records():
                    for rec in sorted_bed:
                        yield rec

                for _, grouped in itertools.groupby(_records(), lambda rec: (rec.name, rec.chrom)):
                    members = list(grouped)
                    first = members[0]
                    for coords in _group_coords(members):
                        start = max(min(coords) - window, 0)
                        end = min(max(coords) + window, chrom_sizes[first.chrom])
                        out_handle.write("%s\t%s\t%s\t%s\n" % (first.chrom, start, end, first.name))
    return bedutils.sort_merge(out_file, data)
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample data; supplies the exon BED and reference FASTA index
    max_distance -- only keep annotations within this distance of event

    Returns the path of the annotated file (``<in_file base>-annotated.bed``),
    or ``in_file`` unchanged when no gene BED is available or ``in_file`` does
    not exist. The annotated file is only rebuilt when out of date.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # next(iter(...)) works on both Python 2 and 3; iterators have no
            # .next() method on Python 3.
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # (list() because a Python 3 range object cannot be appended to)
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                                   (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # NOTE: placeholders are filled from local variable names via
                # locals(); renaming a local requires updating the template.
                cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
                       "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                       "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                       "{distance_filter} | cut -f 1-{max_column} | "
                       "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
def add_genes_to_bed(in_file, gene_file, fai_file, out_file, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED

    in_file -- BED file to annotate
    gene_file -- BED file of genes to annotate with
    fai_file -- FASTA index; its first two columns are passed to bedtools as
                the genome file
    out_file -- destination for the annotated BED
    max_distance -- gene names further away than this are replaced with '.'

    The first record of ``in_file`` determines the column layout, so an input
    without records raises StopIteration.
    """
    # next(iter(...)) works on both Python 2 and 3; iterators have no
    # .next() method on Python 3.
    input_rec = next(iter(pybedtools.BedTool(in_file)))
    # keep everything after standard chrom/start/end, 1-based
    # (list() because a Python 3 range object cannot be appended to)
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    # NOTE: placeholders are filled from local variable names via locals();
    # renaming a local requires updating the template as well.
    cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
           "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
           "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
           "{distance_filter} | cut -f 1-{max_column} | "
           "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED

    in_file -- BED file to annotate; read through zcat when it ends in ``.gz``.
               Lines starting with ``track``, ``browser`` or ``#`` are dropped.
    gene_file -- BED file of genes to annotate with
    fai_file -- FASTA index; its first two columns are passed to bedtools as
                the genome file
    out_file -- destination for the annotated BED
    max_distance -- gene names further away than this are replaced with '.'

    The first record of ``in_file`` determines the column layout, so an input
    without records raises StopIteration.
    """
    # next(iter(...)) works on both Python 2 and 3; iterators have no
    # .next() method on Python 3.
    input_rec = next(iter(pybedtools.BedTool(in_file)))
    # keep everything after standard chrom/start/end, 1-based
    # (list() because a Python 3 range object cannot be appended to)
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # NOTE: placeholders are filled from local variable names via locals();
    # renaming a local requires updating the template as well.
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
           "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
           "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
           "{distance_filter} | cut -f 1-{max_column} | "
           "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
def _combine_files(tsv_files, work_dir, data):
    """Concatenate prioritized TSV files from several callers, sorted, with a header.

    Output goes to ``<work_dir>/<sample>-prioritize.tsv`` and is produced
    inside a file transaction only if it is not already present. The header
    line is written first, followed by all input rows sorted on column 3 and
    then numerically on column 4 (``chrom`` and ``start`` in the header).

    Returns the path of the combined file.
    """
    fields = ("caller", "sample", "chrom", "start", "end", "svtype",
              "lof", "annotation", "split_read_support",
              "paired_support_PE", "paired_support_PR")
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-prioritize.tsv" % sample_name)
    # Re-use an existing result instead of regenerating it.
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        template = "{{ echo '{header}'; cat {input_files} | {sort_cmd} -k3,3 -k4,4n; }} > {tx_out_file}"
        do.run(template.format(header="\t".join(fields),
                               input_files=" ".join(tsv_files),
                               sort_cmd=bedutils.get_sort_cmd(),
                               tx_out_file=tx_out_file),
               "Combine prioritized from multiple callers")
    return out_file
def _group_by_ctype(bed_file, depth, region, region_file, out_file, data):
    """Collapse neighbouring intervals that share a coverage class.

    Each line of ``bed_file`` is labelled from its fourth column:
    NO_COVERAGE when it is 0, LOW_COVERAGE when below ``depth["min"]`` and
    CALLABLE otherwise. Lines are then grouped with ``bedtools groupby`` on
    columns 1 and 5, intersected with ``region_file`` and sorted by position
    into ``out_file`` (written inside a file transaction).

    ``region`` is only used to label the logged command.

    Uses tips from bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    # awk program tagging every line with its coverage class; raw strings keep
    # the backslash-t sequences literal so that awk interprets them.
    classify_stage = (r"""awk '{{if ($4 == 0) {{print $0"\tNO_COVERAGE"}} """
                      r"""else if ($4 < {min_cov}) {{print $0"\tLOW_COVERAGE"}} """
                      r"""else {{print $0"\tCALLABLE"}} }}'""")
    stages = ["cat {bed_file}",
              classify_stage,
              "bedtools groupby -prec 21 -g 1,5 -c 1,2,3,5 -o first,first,max,first",
              "cut -f 3-6",
              "bedtools intersect -nonamecheck -a - -b {region_file}",
              "{sort_cmd} -k1,1 -k2,2n > {tx_out_file}"]
    with file_transaction(data, out_file) as tx_out_file:
        pipeline = " | ".join(stages).format(bed_file=bed_file,
                                             min_cov=depth["min"],
                                             region_file=region_file,
                                             sort_cmd=bedutils.get_sort_cmd(),
                                             tx_out_file=tx_out_file)
        do.run(pipeline, "bedtools groupby coverage: %s" % (str(region)), data)
def _group_by_ctype(bed_file, depth, region, region_file, out_file, data):
    """Merge adjacent intervals of the same callability class into one interval.

    The fourth column of ``bed_file`` decides the class appended to each line:
    0 gives NO_COVERAGE, a value below ``depth["min"]`` gives LOW_COVERAGE and
    anything else gives CALLABLE. The labelled lines are grouped by
    ``bedtools groupby`` on columns 1 and 5, restricted to ``region_file``
    with ``bedtools intersect`` and written position-sorted to ``out_file``
    through a file transaction.

    ``region`` only appears in the description passed to ``do.run``.

    Uses tips from bedtools discussion:
    https://groups.google.com/d/msg/bedtools-discuss/qYDE6XF-GRA/2icQtUeOX_UJ
    https://gist.github.com/arq5x/b67196a46db5b63bee06
    """
    template = (
        r"""cat {bed_file} | awk '{{if ($4 == 0) {{print $0"\tNO_COVERAGE"}} """
        r"""else if ($4 < {min_cov}) {{print $0"\tLOW_COVERAGE"}} """
        r"""else {{print $0"\tCALLABLE"}} }}' | """
        "bedtools groupby -prec 21 -g 1,5 -c 1,2,3,5 -o first,first,max,first | "
        "cut -f 3-6 | "
        "bedtools intersect -nonamecheck -a - -b {region_file} | "
        "{sort_cmd} -k1,1 -k2,2n > {tx_out_file}")
    description = "bedtools groupby coverage: %s" % (str(region))
    with file_transaction(data, out_file) as tx_out_file:
        coverage_floor = depth["min"]
        sorter = bedutils.get_sort_cmd()
        shell_cmd = template.format(bed_file=bed_file,
                                    min_cov=coverage_floor,
                                    region_file=region_file,
                                    sort_cmd=sorter,
                                    tx_out_file=tx_out_file)
        do.run(shell_cmd, description, data)
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    in_file -- BED file to annotate
    data -- sample data; supplies the exon BED and reference FASTA index
    max_distance -- only keep annotations within this distance of event

    Returns the path of the annotated file (``<in_file base>-annotated.bed``),
    or ``in_file`` unchanged when no gene BED is available or ``in_file`` does
    not exist. The annotated file is only rebuilt when out of date.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # next(iter(...)) works on both Python 2 and 3; iterators have no
            # .next() method on Python 3.
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            # (list() because a Python 3 range object cannot be appended to)
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (
                    r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                    (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                # NOTE: placeholders are filled from local variable names via
                # locals(); renaming a local requires updating the template.
                cmd = (
                    "{sort_cmd} -k1,1 -k2,2n {in_file} | "
                    "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                    "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                    "{distance_filter} | cut -f 1-{max_column} | "
                    "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}"
                )
                do.run(cmd.format(**locals()), "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Annotate the records of one BED file with genes from another BED file.

    in_file -- BED file to annotate; read through zcat when it ends in ``.gz``.
               Lines starting with ``track``, ``browser`` or ``#`` are dropped.
    gene_file -- BED file of genes, subset to the reference genome before use
    fai_file -- FASTA index handed to gsort and to ``bedtools closest -g``
    out_file -- destination for the annotated BED; its directory is also used
                as TMPDIR and for the genome-subset gene file
    data -- sample data used to locate gsort and to subset the gene file
    max_distance -- gene names whose reported distance exceeds this value in
                    either direction are replaced with '.'

    An input without records is copied to ``out_file`` unchanged.
    """
    try:
        first_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    n_fields = len(first_rec.fields)
    # Index of the gene annotation column among the fields added by
    # bedtools closest.
    gene_index = n_fields + 4
    # 1-based columns to keep: everything after chrom/start/end plus the gene.
    extra_fields = list(range(4, n_fields + 1)) + [gene_index]
    merge_columns = ",".join(str(col) for col in extra_fields)
    merge_ops = ",".join(["distinct"] * len(extra_fields))
    # One column past the last kept field; cut drops the trailing distance
    # column, which can break bedtools merge:
    # 'ERROR: illegal character '.' found in integer conversion of string'
    last_column = max(extra_fields) + 1
    # Replace the gene name with '.' when it lies beyond the maximum distance.
    awk_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'"""
                  % (max_distance, max_distance, gene_index))
    work_dir = os.path.dirname(out_file)
    # NOTE(review): the sort command is not referenced by the template below;
    # the call is kept to leave behaviour untouched -- confirm it can be removed.
    sort_cmd = bedutils.get_sort_cmd(work_dir)
    reader = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    gene_base = utils.splitext_plus(os.path.basename(gene_file))[0]
    genome_gene_file = bedutils.subset_to_genome(
        gene_file, os.path.join(os.path.dirname(out_file), "%s-genomeonly.bed" % gene_base), data)
    env_setup = "export TMPDIR=%s && %s" % (os.path.dirname(out_file), utils.local_path_export())
    python_exe = sys.executable
    gsort_exe = config_utils.get_program("gsort", data)
    template = ("{exports}{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
                "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                "{gsort} - {fai_file} | "
                "bedtools closest -g {fai_file} "
                "-D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file}) | "
                "{distance_filter} | cut -f 1-{max_column} | "
                "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    shell_cmd = template.format(exports=env_setup,
                                cat_cmd=reader,
                                in_file=in_file,
                                bcbio_py=python_exe,
                                gsort=gsort_exe,
                                fai_file=fai_file,
                                ready_gene_file=genome_gene_file,
                                distance_filter=awk_filter,
                                max_column=last_column,
                                columns=merge_columns,
                                ops=merge_ops,
                                out_file=out_file)
    do.run(shell_cmd, "Annotate BED file with gene info")