def bed(args):
    """
    %prog bed xmlfile

    Print summary of optical map alignment in BED format.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes live in comments instead.
    #
    # args: list of command-line tokens (options plus exactly one xmlfile).
    # Side effects: writes <xmlfile minus last extension>.bed and, unless
    # --nosort is given, hands that file to sort() with "--inplace".
    from jcvi.formats.bed import sort

    p = OptionParser(bed.__doc__)
    p.add_option("--blockonly", default=False, action="store_true",
                 help="Only print out large blocks, not fragments [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort bed [default: %default]")
    opts, args = p.parse_args(args)

    # Wrong number of positionals: print usage and exit.
    if len(args) != 1:
        sys.exit(not p.print_help())

    xmlfile, = args
    bedfile = xmlfile.rsplit(".", 1)[0] + ".bed"

    # Parse the input before touching the output, so a parse failure does not
    # leave behind a freshly truncated BED file.
    om = OpticalMap(xmlfile)

    # The context manager closes the handle even when write_bed() raises.
    with open(bedfile, "w") as fw:
        om.write_bed(fw, blockonly=opts.blockonly)

    if not opts.nosort:
        sort([bedfile, "--inplace"])
def bed(args):
    """
    %prog bed xmlfile

    Print summary of optical map alignment in BED format.
    """
    # The docstring doubles as the usage banner given to OptionParser.
    # args: command-line tokens; exactly one positional (the xml file) is
    # accepted. The BED file is written next to the input by
    # OpticalMap.write_bed() and sorted in place unless --nosort is set.
    from jcvi.formats.bed import sort

    parser = OptionParser(bed.__doc__)
    parser.add_option(
        "--blockonly",
        default=False,
        action="store_true",
        help="Only print out large blocks, not fragments [default: %default]",
    )
    parser.add_option(
        "--point",
        default=False,
        action="store_true",
        help="Print accesssion as single point instead of interval",
    )
    parser.add_option(
        "--scale",
        type="float",
        help="Scale the OM distance by factor",
    )
    parser.add_option(
        "--switch",
        default=False,
        action="store_true",
        help="Switch reference and aligned map elements [default: %default]",
    )
    parser.add_option(
        "--nosort",
        default=False,
        action="store_true",
        help="Do not sort bed [default: %default]",
    )
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    xmlfile = args[0]
    # Swap the last extension of the input for ".bed".
    outbed = xmlfile.rsplit(".", 1)[0] + ".bed"

    optical_map = OpticalMap(xmlfile)
    optical_map.write_bed(
        outbed,
        point=opts.point,
        scale=opts.scale,
        blockonly=opts.blockonly,
        switch=opts.switch,
    )

    if opts.nosort:
        return
    sort([outbed, "--inplace"])
def bed(args):
    """
    %prog bed xmlfile

    Print summary of optical map alignment in BED format.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes live in comments instead.
    #
    # args: list of command-line tokens (options plus exactly one xmlfile).
    # Side effects: writes <xmlfile minus last extension>.bed and, unless
    # --nosort is given, hands that file to sort() with "--inplace".
    from jcvi.formats.bed import sort

    p = OptionParser(bed.__doc__)
    p.add_option("--blockonly", default=False, action="store_true",
                 help="Only print out large blocks, not fragments [default: %default]")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort bed [default: %default]")
    opts, args = p.parse_args(args)

    # Wrong number of positionals: print usage and exit.
    if len(args) != 1:
        sys.exit(not p.print_help())

    xmlfile, = args
    bedfile = xmlfile.rsplit(".", 1)[0] + ".bed"

    # Parse the input before touching the output, so a parse failure does not
    # leave behind a freshly truncated BED file.
    om = OpticalMap(xmlfile)

    # The context manager closes the handle even when write_bed() raises.
    with open(bedfile, "w") as fw:
        om.write_bed(fw, blockonly=opts.blockonly)

    if not opts.nosort:
        sort([bedfile, "--inplace"])
def tRNAscan(args):
    """
    %prog tRNAscan all.trna > all.trna.gff3

    Convert tRNAscan-SE output into gff3 format.

    Sequence                tRNA            Bounds          tRNA    Anti    Intron Bounds   Cove
    Name            tRNA #  Begin           End             Type    Codon   Begin   End     Score
    --------        ------  ----            ------          ----    -----   -----   ----    ------
    23231           1       335355          335440          Tyr     GTA     335392  335404  69.21
    23231           2       1076190         1076270         Leu     AAG     0       0       66.33

    Conversion based on PERL one-liner in:
    <https://github.com/sujaikumar/assemblage/blob/master/README-annotation.md>
    """
    # NOTE: the docstring is passed to OptionParser as usage text, so it is
    # left as is. NOTE(review): the usage line suggests stdout redirection,
    # but the code below writes straight to "<input>.gff3".
    #
    # args: command-line tokens; exactly one positional (tRNAscan-SE output).
    # Raises StopIteration when the file has fewer than three header lines and
    # AssertionError when the third line is not the dashed separator.
    from jcvi.formats.gff import sort

    p = OptionParser(tRNAscan.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    trnaout, = args
    gffout = trnaout + ".gff3"

    source = "tRNAscan"
    feature = "tRNA"  # was a local named `type`, which shadowed the builtin

    # Both handles are closed on every exit path, including parse errors.
    with open(trnaout) as fp, open(gffout, "w") as fw:
        # Two title lines, then the dashed separator that closes the header.
        # next(fp) works on Python 2.6+ and 3, unlike fp.next().
        next(fp)
        next(fp)
        row = next(fp)
        assert row.startswith("--------")

        for row in fp:
            atoms = [x.strip() for x in row.split("\t")]
            contig, trnanum, start, end, aa, codon, \
                intron_start, intron_end, score = atoms

            start, end = int(start), int(end)
            orientation = '+'
            # Reverse-strand hits are listed with begin > end; normalise them.
            if start > end:
                start, end = end, start
                orientation = '-'

            if codon == "???":
                codon = "XXX"

            comment = "ID={0}.tRNA.{1};Name=tRNA-{2} (anticodon: {3})".\
                format(contig, trnanum, aa, codon)

            fields = (contig, source, feature, start, end, score,
                      orientation, ".", comment)
            fw.write("\t".join(str(x) for x in fields) + "\n")

    sort([gffout, "-i"])
def build(args):
    """
    %prog build input.bed scaffolds.fasta

    Build associated genome FASTA file and CHAIN file that can be used to lift
    old coordinates to new coordinates. The CHAIN file will be used to lift the
    original marker positions to new positions in the reconstructed genome. The
    new positions of the markers will be reported in *.lifted.bed.
    """
    # The docstring is the usage text given to OptionParser.
    # Each step below is skipped when need_update() reports its target as
    # current with respect to the listed inputs.
    parser = OptionParser(build.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    inputbed, scaffolds = args
    prefix = inputbed.rsplit(".", 1)[0]

    # Every file this action reads or writes shares the input's prefix.
    mapbed = prefix + ".bed"
    chr_agp = prefix + ".chr.agp"
    chr_fasta = prefix + ".chr.fasta"
    unplaced_agp = prefix + ".unplaced.agp"
    unplaced_fasta = prefix + ".unplaced.fasta"
    combined_agp = prefix + ".agp"
    combined_fasta = prefix + ".fasta"
    chainfile = prefix + ".chain"
    liftedbed = mapbed.rsplit(".", 1)[0] + ".lifted.bed"

    # Placed scaffolds -> chromosome FASTA.
    if need_update((chr_agp, scaffolds), chr_fasta):
        agp_build([chr_agp, scaffolds, chr_fasta])

    # Remaining scaffolds -> unplaced AGP, then unplaced FASTA.
    if need_update((chr_agp, scaffolds), unplaced_agp):
        write_unplaced_agp(chr_agp, scaffolds, unplaced_agp)
    if need_update((unplaced_agp, scaffolds), unplaced_fasta):
        agp_build([unplaced_agp, scaffolds, unplaced_fasta])

    # Concatenate the chromosome and unplaced parts.
    if need_update((chr_agp, unplaced_agp), combined_agp):
        merger = FileMerger((chr_agp, unplaced_agp), combined_agp)
        merger.merge()
    if need_update((chr_fasta, unplaced_fasta), combined_fasta):
        merger = FileMerger((chr_fasta, unplaced_fasta), combined_fasta)
        merger.merge()

    # presumably fromagp() is what produces `chainfile` - TODO confirm
    if need_update((combined_agp, scaffolds, combined_fasta), chainfile):
        fromagp([combined_agp, scaffolds, combined_fasta])

    # Lift the marker positions onto the rebuilt genome.
    if need_update((mapbed, chainfile), liftedbed):
        liftover_cmd = "liftOver -minMatch=1 {0} {1} {2} unmapped".format(
            mapbed, chainfile, liftedbed)
        sh(liftover_cmd)

    sort([liftedbed, "-i"])  # Sort bed in place
def __init__(self, bedfile, sizesfile):
    """
    Prepare a coverage file for `bedfile` and initialise the parent class
    from it.

    bedfile: path to a BED file; its sorted sibling "<stem>.sorted.bed" is
        (re)built via sort() when need_update() says so.
    sizesfile: genome sizes file, passed to `genomeCoverageBed -g` and
        loaded into `self.sizes` through Sizes(...).mapping.
    """
    from jcvi.formats.bed import sort

    stem = bedfile.rsplit(".", 1)[0]
    sortedbed = stem + ".sorted.bed"
    # presumably sort([bedfile]) writes `sortedbed` - TODO confirm
    if need_update(bedfile, sortedbed):
        sort([bedfile])

    coveragefile = sortedbed + ".coverage"
    if need_update(sortedbed, coveragefile):
        coverage_cmd = "genomeCoverageBed -bg -i {0} -g {1}".format(
            sortedbed, sizesfile)
        sh(coverage_cmd, outfile=coveragefile)

    self.sizes = Sizes(sizesfile).mapping

    assert coveragefile.endswith(".coverage")
    super(Coverage, self).__init__(coveragefile)
def consolidate(nbedfile, obedfile, cbedfile):
    """
    Combine the features of `nbedfile` and `obedfile` into `cbedfile`.

    Same-strand overlapping features from both files are merged (names are
    collected, scores averaged); features of `nbedfile` without such an
    overlap are carried over unchanged. Duplicate names inside a merged,
    ";"-separated accession are collapsed. The result is sorted in place.

    nbedfile, obedfile: paths of the two input BED files.
    cbedfile: path of the consolidated BED file to write.
    """
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    # Features of each file that overlap the other on the same strand.
    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    # New features that do not take part in any merged pile.
    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    # Close the output and drop the scratch file on failure as well.
    try:
        with open(cbedfile, "w") as fw:
            for b in Bed(tmpfile):
                if ";" in b.accn:
                    # Sorted so the joined name is reproducible; iterating a
                    # bare set gives an arbitrary order.
                    b.accn = ";".join(sorted(set(b.accn.split(";"))))
                print(b, file=fw)
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)

    sort([cbedfile, "-i"])
def bed(args):
    """
    %prog bed xmlfile

    Print summary of optical map alignment in BED format.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes live in comments instead.
    #
    # args: list of command-line tokens; exactly one positional (xmlfile).
    # Side effects: writes <xmlfile minus last extension>.bed, then hands it
    # to sort() with "--inplace".
    from jcvi.formats.bed import sort

    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    # Wrong number of positionals: print usage and exit.
    if len(args) != 1:
        sys.exit(not p.print_help())

    xmlfile, = args
    bedfile = xmlfile.rsplit(".", 1)[0] + ".bed"

    # Parse the input before touching the output, so a parse failure does not
    # leave behind a freshly truncated BED file.
    om = OpticalMap(xmlfile)

    # The context manager closes the handle even when write_bed() raises.
    with open(bedfile, "w") as fw:
        om.write_bed(fw)

    sort([bedfile, "--inplace"])
def __init__(self, bedfile, sizesfile):
    """
    Prepare a coverage file for `bedfile` and initialise the parent class
    from it.

    bedfile: path to a BED file, first passed through sort().
    sizesfile: genome sizes file, passed to `genomeCoverageBed -g` and
        loaded into `self.sizes` through Sizes(...).mapping.
    """
    # presumably sort() returns the path of the sorted BED - TODO confirm
    sortedbed = sort([bedfile])

    coveragefile = sortedbed + ".coverage"
    if need_update(sortedbed, coveragefile):
        coverage_cmd = "genomeCoverageBed -bg -i {0} -g {1}".format(
            sortedbed, sizesfile)
        sh(coverage_cmd, outfile=coveragefile)

    self.sizes = Sizes(sizesfile).mapping

    assert coveragefile.endswith(".coverage")
    super(Coverage, self).__init__(coveragefile)
def consolidate(nbedfile, obedfile, cbedfile):
    """
    Combine the features of `nbedfile` and `obedfile` into `cbedfile`.

    Same-strand overlapping features from both files are merged (names are
    collected, scores averaged); features of `nbedfile` without such an
    overlap are carried over unchanged. Duplicate names inside a merged,
    ";"-separated accession are collapsed. The result is sorted in place.

    nbedfile, obedfile: paths of the two input BED files.
    cbedfile: path of the consolidated BED file to write.
    """
    from pybedtools import BedTool

    nbedtool = BedTool(nbedfile)
    obedtool = BedTool(obedfile)

    # Features of each file that overlap the other on the same strand.
    ab = nbedtool.intersect(obedtool, s=True, u=True)
    ba = obedtool.intersect(nbedtool, s=True, u=True)

    cmd = "cat {0} {1} | sort -k1,1 -k2,2n".format(ab.fn, ba.fn)
    fp = popen(cmd)
    ovl = BedTool(fp.readlines())

    abmerge = ovl.merge(s=True, nms=True, scores="mean").sort()
    cmd = "cat {0}".format(abmerge.fn)
    fp = popen(cmd, debug=False)
    ovl = BedTool(fp.readlines())

    # New features that do not take part in any merged pile.
    notovl = nbedtool.intersect(ovl.sort(), s=True, v=True)

    infile = "{0} {1}".format(notovl.fn, ovl.fn)
    tmpfile = "/tmp/reformat.{0}.bed".format(os.getpid())
    cmd = "sort -k1,1 -k2,2n"
    sh(cmd, infile=infile, outfile=tmpfile)

    # Close the output and drop the scratch file on failure as well.
    try:
        with open(cbedfile, "w") as fw:
            for b in Bed(tmpfile):
                if ";" in b.accn:
                    # Sorted so the joined name is reproducible; iterating a
                    # bare set gives an arbitrary order.
                    b.accn = ";".join(sorted(set(b.accn.split(";"))))
                # write() behaves the same with or without print_function.
                fw.write(str(b) + "\n")
    finally:
        if os.path.exists(tmpfile):
            os.remove(tmpfile)

    sort([cbedfile, "-i"])
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # NOTE: the docstring is the usage text given to OptionParser. It is now a
    # raw string only so that "\d", "\." and "\S" are not treated as (invalid)
    # escape sequences; its content is unchanged.
    #
    # args: command-line tokens; two positionals (new.bed, old.bed).
    # Side effects: creates/reuses "consolidated.bed" and the score files in
    # the working directory and writes "<new prefix>.annotated.bed".
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric"
                 + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature"
                 + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n"
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n"
                     + "Use if resolving ambiguities based on `overlap` length\n"
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    # NOTE(review): default=True with store_true means -s can never be turned
    # off from the command line; left untouched to keep the CLI stable.
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".
                    format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".
                            format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".
                          format(scoresfile, pairsfile))
            # A bare sys.exit() would report success (status 0) on this
            # error path; signal the failure to the calling shell instead.
            sys.exit(1)
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".
                    format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Names that share a consolidated feature end up in the same group.
    g = Grouper()
    for c in Bed(cbedfile):
        g.join(*c.accn.split(";"))

    nbed = Bed(nbedfile)
    nbedline = dict((b.accn, b) for b in nbed)

    splits = set()
    for seqid, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(seqid, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    # The handle is closed (and flushed) before the in-place sort runs, even
    # if formatting a line raises.
    with open(abedfile, "w") as afh:
        for accn in abedline:
            afh.write(str(abedline[accn]) + "\n")

    sort([abedfile, "-i"])
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # NOTE: the docstring is the usage text given to OptionParser. It is now a
    # raw string only so that "\d", "\." and "\S" are not treated as (invalid)
    # escape sequences; its content is unchanged.
    #
    # args: command-line tokens; two positionals (new.bed, old.bed).
    # Side effects: creates/reuses "consolidated.bed" and the score files in
    # the working directory and writes "<new prefix>.annotated.bed".
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment", choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric"
                 + " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature"
                 + " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n"
                     + "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n"
                     + "Use if resolving ambiguities based on `overlap` length\n"
                     + "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    # NOTE(review): default=True with store_true means -s can never be turned
    # off from the command line; left untouched to keep the CLI stable.
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".
                    format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".
                            format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".
                          format(scoresfile, pairsfile))
            # A bare sys.exit() would report success (status 0) on this
            # error path; signal the failure to the calling shell instead.
            sys.exit(1)
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".
                    format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Names that share a consolidated feature end up in the same group.
    g = Grouper()
    for c in Bed(cbedfile):
        g.join(*c.accn.split(";"))

    nbed = Bed(nbedfile)
    nbedline = dict((b.accn, b) for b in nbed)

    splits = set()
    for seqid, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(seqid, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    # The handle is closed (and flushed) before the in-place sort runs, even
    # if formatting a line raises.
    with open(abedfile, "w") as afh:
        for accn in abedline:
            afh.write(str(abedline[accn]) + "\n")

    sort([abedfile, "-i"])
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring is the usage text given to OptionParser.
    #
    # args: command-line tokens; two positionals (alignments as BAM or BED,
    # and a BED file of gaps). The action is a chain of file-producing steps;
    # each step is skipped when need_update() reports its output as current.
    # Every output handle is managed by `with`, so it is closed before the
    # next step reads the file and also when a step raises.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        # Convert BAM to a 4-column BED first.
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        with open(ibedfile, "w") as fw:
            # groupby() only merges adjacent records, so this relies on the
            # input being ordered by accession (see the --accn sort above).
            for accn, bb in groupby(bed, key=lambda x: x.accn):
                branges = [(x.seqid, x.start, x.end) for x in bb]
                iranges = range_interleave(branges)
                for seqid, start, end in iranges:
                    if end - start + 1 < opts.minspan:
                        continue
                    fw.write("\t".join(str(x) for x in
                             (seqid, start - 1, end, accn + '-d')) + "\n")

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        with open(countbedfile, "w") as fw:
            for (seqid, start, end), count in registry.items():
                if count < opts.mindepth:
                    continue
                # Name is built only for intervals that pass the depth filter.
                ies_name = "{0:05d}-r{1}".format(ies_id, count)
                fw.write("\t".join(str(x) for x in
                         (seqid, start - 1, end, ies_name)) + "\n")
                ies_id += 1
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([sortedbedfile, countbedfile,
               "--outfile={0}".format(depthbedfile)])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug("Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(lb, ub))
        with open(validbedfile, "w") as fw:
            for b in bed:
                if float(b.score) > ub:
                    continue
                fw.write(str(b) + "\n")

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        with open(flanksbedfile, "w") as fw:
            for b in bed:
                start, end = b.start, b.end
                # Left flank, then right flank, each capped at the interval.
                b.start, b.end = start, min(start + flank - 1, end)
                fw.write(str(b) + "\n")
                b.start, b.end = max(start, end - flank + 1), end
                fw.write(str(b) + "\n")

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([validbedfile, intersectidsfile, "-v",
              "--outfile={0}".format(selectedbedfile)])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # The read count after the last "r" of the name is used as the score.
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".
                      format(iscore, len(iranges)))
        ies_id = 1
        with open(iesbedfile, "w") as fw:
            for seqid, start, end, score, _ in iranges:
                ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
                span = end - start + 1
                fw.write("\t".join(str(x) for x in
                         (seqid, start - 1, end, ies_name, span)) + "\n")
                ies_id += 1
def deletion(args):
    """
    %prog deletion [mac.mic.bam|mac.mic.bed] mic.gaps.bed

    Find IES based on mapping MAC reads to MIC genome.
    """
    # NOTE: the docstring is the usage text given to OptionParser.
    #
    # args: command-line tokens; two positionals (alignments as BAM or BED,
    # and a BED file of gaps). The action is a chain of file-producing steps;
    # each step is skipped when need_update() reports its output as current.
    # Every output handle is managed by `with`, so it is closed before the
    # next step reads the file and also when a step raises.
    p = OptionParser(deletion.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth to call a deletion")
    p.add_option("--minspan", default=30, type="int",
                 help="Minimum span to call a deletion")
    p.add_option("--split", default=False, action="store_true",
                 help="Break at cigar N into separate parts")
    p.set_tmpdir()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, gapsbedfile = args
    if bedfile.endswith(".bam"):
        # Convert BAM to a 4-column BED first.
        bamfile = bedfile
        bedfile = bamfile.replace(".sorted.", ".").replace(".bam", ".bed")
        if need_update(bamfile, bedfile):
            cmd = "bamToBed -i {0}".format(bamfile)
            if opts.split:
                cmd += " -split"
            cmd += " | cut -f1-4"
            sh(cmd, outfile=bedfile)

    sort_tmpdir = "--tmpdir={0}".format(opts.tmpdir)
    if bedfile.endswith(".sorted.bed"):
        pf = bedfile.rsplit(".", 2)[0]
        sortedbedfile = bedfile
    else:
        pf = bedfile.rsplit(".", 1)[0]
        sortedbedfile = pf + ".sorted.bed"
        if need_update(bedfile, sortedbedfile):
            sort([bedfile, "-u", "--accn", sort_tmpdir])

    # Find reads that contain multiple matches
    ibedfile = pf + ".d.bed"
    if need_update(sortedbedfile, ibedfile):
        bed = Bed(sortedbedfile, sorted=False)
        logging.debug("Write deletions to `{0}`.".format(ibedfile))
        with open(ibedfile, "w") as fw:
            # groupby() only merges adjacent records, so this relies on the
            # input being ordered by accession (see the --accn sort above).
            for accn, bb in groupby(bed, key=lambda x: x.accn):
                branges = [(x.seqid, x.start, x.end) for x in bb]
                iranges = range_interleave(branges)
                for seqid, start, end in iranges:
                    if end - start + 1 < opts.minspan:
                        continue
                    print("\t".join(str(x) for x in
                          (seqid, start - 1, end, accn + '-d')), file=fw)

    # Uniqify the insertions and count occurrences
    countbedfile = pf + ".uniq.bed"
    if need_update(ibedfile, countbedfile):
        bed = Bed(ibedfile)
        logging.debug("Write counts to `{0}`.".format(countbedfile))
        registry = Counter((x.seqid, x.start, x.end) for x in bed)
        ies_id = 1
        with open(countbedfile, "w") as fw:
            for (seqid, start, end), count in registry.items():
                if count < opts.mindepth:
                    continue
                # Name is built only for intervals that pass the depth filter.
                ies_name = "{0:05d}-r{1}".format(ies_id, count)
                print("\t".join(str(x) for x in
                      (seqid, start - 1, end, ies_name)), file=fw)
                ies_id += 1
        sort([countbedfile, "-i", sort_tmpdir])

    # Remove deletions that contain some read depth
    depthbedfile = pf + ".depth.bed"
    if need_update((sortedbedfile, countbedfile), depthbedfile):
        depth([
            sortedbedfile, countbedfile, "--outfile={0}".format(depthbedfile)
        ])

    validbedfile = pf + ".valid.bed"
    if need_update(depthbedfile, validbedfile):
        logging.debug("Filter valid deletions to `{0}`.".format(validbedfile))
        bed = Bed(depthbedfile)
        all_scores = [float(b.score) for b in bed]
        lb, ub = outlier_cutoff(all_scores)
        logging.debug(
            "Bounds for depths: LB={0:.2f} (ignored) UB={1:.2f}".format(
                lb, ub))
        with open(validbedfile, "w") as fw:
            for b in bed:
                if float(b.score) > ub:
                    continue
                print(b, file=fw)

    # Remove deletions that contain sequencing gaps on its flanks
    selectedbedfile = pf + ".selected.bed"
    if need_update(validbedfile, selectedbedfile):
        flanksbedfile = pf + ".flanks.bed"
        bed = Bed(validbedfile)
        flank = 100
        logging.debug("Write deletion flanks to `{0}`.".format(flanksbedfile))
        with open(flanksbedfile, "w") as fw:
            for b in bed:
                start, end = b.start, b.end
                # Left flank, then right flank, each capped at the interval.
                b.start, b.end = start, min(start + flank - 1, end)
                print(b, file=fw)
                b.start, b.end = max(start, end - flank + 1), end
                print(b, file=fw)

        intersectidsfile = pf + ".intersect.ids"
        cmd = "intersectBed -a {0} -b {1}".format(flanksbedfile, gapsbedfile)
        cmd += " | cut -f4 | sort -u"
        sh(cmd, outfile=intersectidsfile)
        some([
            validbedfile, intersectidsfile, "-v",
            "--outfile={0}".format(selectedbedfile)
        ])

    # Find best-scoring non-overlapping set
    iesbedfile = pf + ".ies.bed"
    if need_update(selectedbedfile, iesbedfile):
        bed = Bed(selectedbedfile)
        logging.debug("Write IES to `{0}`.".format(iesbedfile))
        # The read count after the last "r" of the name is used as the score.
        branges = [Range(x.seqid, x.start, x.end,
                         int(x.accn.rsplit("r")[-1]), i)
                   for i, x in enumerate(bed)]
        iranges, iscore = range_chain(branges)
        logging.debug("Best chain score: {0} ({1} IES)".
                      format(iscore, len(iranges)))
        ies_id = 1
        with open(iesbedfile, "w") as fw:
            for seqid, start, end, score, _ in iranges:
                ies_name = "IES-{0:05d}-r{1}".format(ies_id, score)
                span = end - start + 1
                print("\t".join(str(x) for x in
                      (seqid, start - 1, end, ies_name, span)), file=fw)
                ies_id += 1