def testSetItem(self):
    """Assigning ``chrom`` must be visible via both key and attribute access."""
    ivf = IntervalFile(self.file)
    # Builtin next() works on Python 2.6+ and 3, unlike the .next() method.
    iv = next(ivf)
    iv.chrom = 'chrfake'
    # Debug output; the single-argument call prints the same on Python 2 and 3.
    print(iv.fields)
    self.assertEqual(iv['chrom'], 'chrfake')
    self.assertEqual(iv.chrom, 'chrfake')
def testAppend(self):
    """A value appended to a feature must become its last field."""
    ivf = IntervalFile(self.file)
    # Builtin next() works on Python 2.6+ and 3, unlike the .next() method.
    iv = next(ivf)
    # Debug output; the single-argument call prints the same on Python 2 and 3.
    print(iv.fields)
    iv.append('asdf')
    print(iv)
    self.assertEqual(iv[-1], 'asdf')
def testGetItemNegative(self):
    """Test negative indexes to feature."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    # Translate each positive column position into its negative equivalent.
    chrom = iv[-self.fieldcount + self.chrpos]
    start = iv[-self.fieldcount + self.startpos]
    stop = iv[-self.fieldcount + self.stoppos]
    # The offending value is passed as the failure message, as before.
    self.assertTrue(chrom.startswith("chr"), chrom)
    self.assertTrue(start.isdigit(), start)
    self.assertTrue(stop.isdigit())
def testStart(self):
    """Re-assigning ``start`` to its own value must not alter the feature.

    Also checks, after each assignment, that ``fields[3]`` holds
    ``start + 1`` (the 1-based GFF coordinate).
    """
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    orig_string = str(iv)

    # 0-based.
    orig_start = iv.start

    # Setting .start always sets 0-based coord.
    iv.start = orig_start

    # But for GFF setting .start should also make the .fields[3] the GFF
    # 1-based coord
    self.assertEqual(iv.start, int(iv.fields[3]) - 1)

    second_string = str(iv)
    second_start = iv.start
    iv.start = second_start

    # Check .start and .fields[3] internal consistency again
    self.assertEqual(iv.start, int(iv.fields[3]) - 1)

    # Debug output; one pre-formatted string prints the same on Python 2 and 3.
    print(' orig: (start=%s) %s' % (orig_start, orig_string))
    print(' second: (start=%s) %s' % (second_start, second_string))
    print('current: (start=%s) %s' % (iv.start, str(iv)))
    self.assertTrue(orig_start == second_start == iv.start)
    self.assertTrue(orig_string == second_string == str(iv))
def testGetItem(self):
    """getitem now supports direct access to the line."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    self.assertTrue(iv[self.chrpos].startswith("chr"))
    self.assertTrue(iv[self.startpos].isdigit())
    # The start column used to be checked twice and the stop column never.
    self.assertTrue(iv[self.stoppos].isdigit())
def testName(self):
    """Setting ``name`` must round-trip; for GFF it must land in ``fields[8]``."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    iv.name = "bart simpson"
    self.assertEqual(iv.name, "bart simpson")
    if iv.file_type == "gff":
        self.assertIn("bart", iv.fields[8])
def testStart(self):
    """Re-assigning ``start`` to its own value must not alter the feature.

    Also checks, after each assignment, that ``fields[3]`` holds
    ``start + 1`` (the 1-based GFF coordinate).
    """
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    orig_string = str(iv)

    # 0-based.
    orig_start = iv.start

    # Setting .start always sets 0-based coord.
    iv.start = orig_start

    # But for GFF setting .start should also make the .fields[3] the GFF
    # 1-based coord
    self.assertEqual(iv.start, int(iv.fields[3]) - 1)

    second_string = str(iv)
    second_start = iv.start
    iv.start = second_start

    # Check .start and .fields[3] internal consistency again
    self.assertEqual(iv.start, int(iv.fields[3]) - 1)

    # Debug output; one pre-formatted string prints the same on Python 2 and 3.
    print(' orig: (start=%s) %s' % (orig_start, orig_string))
    print(' second: (start=%s) %s' % (second_start, second_string))
    print('current: (start=%s) %s' % (iv.start, str(iv)))
    self.assertTrue(orig_start == second_start == iv.start)
    self.assertTrue(orig_string == second_string == str(iv))
def testGetItemNegative(self):
    """Test negative indexes to feature."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    # Translate each positive column position into its negative equivalent.
    chrom = iv[-self.fieldcount + self.chrpos]
    start = iv[-self.fieldcount + self.startpos]
    stop = iv[-self.fieldcount + self.stoppos]
    # The offending value is passed as the failure message, as before.
    self.assertTrue(chrom.startswith("chr"), chrom)
    self.assertTrue(start.isdigit(), start)
    self.assertTrue(stop.isdigit())
def testGetItemSliceNone(self):
    """Test support for funky slices."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    self.assertEqual(len(iv[:3]), 3)
    self.assertEqual(len(iv[3:3]), 0)
    self.assertEqual(len(iv[2:]), self.fieldcount - 2, iv[2:])

    # Debug output; one pre-formatted string prints the same on Python 2 and 3.
    print("%s %s" % (len(iv.fields), iv.fields))
    # An index past the last field must raise rather than wrap or return None.
    self.assertRaises(IndexError, lambda x: iv[x], self.fieldcount + 1)
def testGetItemSlice(self):
    """getitem now supports direct access to the line."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    # One-element slice: the trailing comma unpacks the single value.
    seqid, = iv[self.chrpos:self.chrpos + 1]
    start, end = iv[self.startpos:self.stoppos + 1]
    self.assertTrue(start.isdigit())
    self.assertEqual(int(end), iv.end)
    self.assertEqual(seqid, iv.chrom)
def testGetItemSlice(self):
    """getitem now supports direct access to the line."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    # One-element slice: the trailing comma unpacks the single value.
    seqid, = iv[self.chrpos:self.chrpos + 1]
    start, end = iv[self.startpos:self.stoppos + 1]
    self.assertTrue(start.isdigit())
    self.assertEqual(int(end), iv.end)
    self.assertEqual(seqid, iv.chrom)
def testGetItemSliceNone(self):
    """Test support for funky slices."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    self.assertEqual(len(iv[:3]), 3)
    self.assertEqual(len(iv[3:3]), 0)
    self.assertEqual(len(iv[2:]), self.fieldcount - 2, iv[2:])

    # Debug output; one pre-formatted string prints the same on Python 2 and 3.
    print("%s %s" % (len(iv.fields), iv.fields))
    # An index past the last field must raise rather than wrap or return None.
    self.assertRaises(IndexError, lambda x: iv[x], self.fieldcount + 1)
def testSetAttrs(self):
    """Attribute assignment: rejected for non-GFF, written to field 8 for GFF."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    if iv.file_type != 'gff':
        # Non-GFF features are expected to refuse attribute assignment.
        self.assertRaises(ValueError, iv.attrs.__setitem__, 'a', 'b')
        return
    iv.attrs['ID'] = 'fake'
    iv.attrs['field0'] = 'asdf'
    self.assertEqual(str(iv.attrs), iv[8])
    self.assertIn('field0=asdf', iv[8])
    self.assertIn('ID=fake', iv[8])
def testSetAttrs(self):
    """Attribute assignment: rejected for non-GFF, written to field 8 for GFF."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    if iv.file_type != 'gff':
        # Non-GFF features are expected to refuse attribute assignment.
        self.assertRaises(ValueError, iv.attrs.__setitem__, 'a', 'b')
        return
    iv.attrs['ID'] = 'fake'
    iv.attrs['field0'] = 'asdf'
    self.assertEqual(str(iv.attrs), iv[8])
    self.assertIn('field0=asdf', iv[8])
    self.assertIn('ID=fake', iv[8])
def testStart(self):
    """Re-assigning ``start`` to its own value must not alter the feature."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    orig_string = str(iv)
    orig_start = iv.start
    iv.start = orig_start

    second_string = str(iv)
    second_start = iv.start
    iv.start = second_start

    # Debug output; one pre-formatted string prints the same on Python 2 and 3.
    print(' orig: (start=%s) %s' % (orig_start, orig_string))
    print(' second: (start=%s) %s' % (second_start, second_string))
    print('current: (start=%s) %s' % (iv.start, str(iv)))
    self.assertTrue(orig_start == second_start == iv.start)
    self.assertTrue(orig_string == second_string == str(iv))
def testStart(self):
    """Re-assigning ``start`` to its own value must not alter the feature."""
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    orig_string = str(iv)
    orig_start = iv.start
    iv.start = orig_start

    second_string = str(iv)
    second_start = iv.start
    iv.start = second_start

    # Debug output; one pre-formatted string prints the same on Python 2 and 3.
    print(' orig: (start=%s) %s' % (orig_start, orig_string))
    print(' second: (start=%s) %s' % (second_start, second_string))
    print('current: (start=%s) %s' % (iv.start, str(iv)))
    self.assertTrue(orig_start == second_start == iv.start)
    self.assertTrue(orig_string == second_string == str(iv))
def testStart(self):
    """Re-assigning ``start`` to its own value must not alter the feature.

    Also checks, after each assignment, that ``fields[3]`` holds
    ``start + 1`` (the 1-based GFF coordinate).
    """
    ivf = IntervalFile(self.file)
    iv = next(ivf)
    orig_string = str(iv)

    # 0-based.
    orig_start = iv.start

    # Setting .start always sets 0-based coord.
    iv.start = orig_start

    # But for GFF setting .start should also make the .fields[3] the GFF
    # 1-based coord.  assertEqual (not a bare assert) so the check survives
    # ``python -O`` and reports both values on failure.
    self.assertEqual(iv.start, int(iv.fields[3]) - 1)

    second_string = str(iv)
    second_start = iv.start
    iv.start = second_start

    # Check .start and .fields[3] internal consistency again
    self.assertEqual(iv.start, int(iv.fields[3]) - 1)

    print(" orig:", "(start=%s)" % orig_start, orig_string)
    print(" second:", "(start=%s)" % second_start, second_string)
    print("current:", "(start=%s)" % iv.start, str(iv))
    self.assertTrue(orig_start == second_start == iv.start)
    self.assertTrue(orig_string == second_string == str(iv))
def testAppend(self):
    """Appending a value to a feature makes it the final field."""
    reader = IntervalFile(self.file)
    feature = next(reader)
    extra = 'asdf'

    print(feature.fields)  # fields before the append, for debugging
    feature.append(extra)
    print(feature)  # whole feature after the append

    self.assertEqual(feature[-1], extra)
def testSetItem(self):
    """A new ``chrom`` value is readable by key and by attribute."""
    reader = IntervalFile(self.file)
    feature = next(reader)
    replacement = "chrfake"

    feature.chrom = replacement
    print(feature.fields)  # debugging aid

    # Key-style access first, then attribute access.
    self.assertEqual(feature["chrom"], replacement)
    self.assertEqual(feature.chrom, replacement)
def testFileType(self):
    """The BED fixture and the bundled GFF file must each report their own type."""
    # The tuple message shows the detected type and the path on failure.
    self.assertTrue(self.bed.file_type == "bed",
                    (self.bed.file_type, self.file))

    gff = os.path.join(PATH, "data/c.gff")
    i = IntervalFile(gff)
    self.assertTrue(i.file_type == "gff", (i.file_type, gff))
def testSetAttrs(self):
    """Check attribute assignment for GFF and non-GFF features.

    GFF: the new attributes must appear in field 8.
    Anything else: the assignment is made, and converting the feature to a
    string must then raise ValueError.
    """
    reader = IntervalFile(self.file)
    feature = next(reader)

    if feature.file_type == 'gff':
        feature.attrs['ID'] = 'fake'
        feature.attrs['field0'] = 'asdf'
        self.assertEqual(str(feature.attrs), feature[8])
        for expected in ('field0=asdf', 'ID=fake'):
            self.assertTrue(expected in feature[8])
    else:
        feature.attrs['a'] = 'b'
        self.assertRaises(ValueError, str, feature)
def testSetAttrs(self):
    """Check attribute assignment for GFF and non-GFF features.

    GFF: the new attributes must appear in field 8.
    Anything else: the assignment is made, and converting the feature to a
    string must then raise ValueError.
    """
    reader = IntervalFile(self.file)
    feature = next(reader)

    if feature.file_type == "gff":
        feature.attrs["ID"] = "fake"
        feature.attrs["field0"] = "asdf"
        self.assertEqual(str(feature.attrs), feature[8])
        for expected in ("field0=asdf", "ID=fake"):
            self.assertTrue(expected in feature[8])
    else:
        feature.attrs["a"] = "b"
        self.assertRaises(ValueError, str, feature)
class IntervalFileTest(unittest.TestCase):
    """Tests for IntervalFile lookups and Interval comparisons on a BED fixture."""

    # Fixture path relative to PATH; setUp replaces it with the joined path.
    file = "data/rmsk.hg18.chr21.small.bed"

    def setUp(self):
        """Resolve the fixture path and open it as ``self.bed``."""
        self.file = os.path.join(PATH, self.file)
        self.bed = IntervalFile(self.file)

    def testFileType(self):
        """The BED fixture and the bundled GFF file must each report their own type."""
        # The tuple message shows the detected type and the path on failure.
        self.assertTrue(self.bed.file_type == "bed",
                        (self.bed.file_type, self.file))

        gff = os.path.join(PATH, "data/c.gff")
        i = IntervalFile(gff)
        self.assertTrue(i.file_type == "gff", (i.file_type, gff))

    def testOverlaps(self):
        """``all_hits`` must return the 8 features touching the query range."""
        i = Interval("chr21", 9719768, 9739768)
        hits = self.bed.all_hits(i)
        self.assertEqual(len(hits), 8)
        for hit in hits:
            self.assertTrue(hit.start <= 9739768 and hit.end >= 9719768)

    def testStrands(self):
        """With ``same_strand=True`` every hit must share the query's strand."""
        i = Interval("chr21", 9719768, 9739768, "+")
        hits = self.bed.all_hits(i, same_strand=True)
        for hit in hits:
            self.assertTrue(hit.strand == '+')

        i = Interval("chr21", 9719768, 9739768, "-")
        hits = self.bed.all_hits(i, same_strand=True)
        for hit in hits:
            self.assertTrue(hit.strand == '-')

    def testRichCmp(self):
        """Exercise ``<``, ``==`` and ``!=`` between Interval objects."""
        a = Interval("chr21", 9719768, 9739768)
        b = Interval("chr21", 9719767, 9739768)
        # NOTE(review): both orderings are asserted true for the same pair;
        # presumably this relies on Interval's own comparison semantics -
        # confirm against the Interval implementation.
        self.assertTrue(a < b)
        self.assertTrue(b < a)

        c = Interval("chr21", 9719767, 9739768)
        self.assertTrue(c == b)

        d = Interval("chr22", 9719767, 9739768)
        self.assertTrue(c != d)
def testStart(self):
    """Writing ``start`` back to itself must change neither the value nor the text."""
    reader = IntervalFile(self.file)
    feature = next(reader)

    # First snapshot, then a no-op assignment.
    text_before = str(feature)
    start_before = feature.start
    feature.start = start_before

    # Second snapshot, then another no-op assignment.
    text_after = str(feature)
    start_after = feature.start
    feature.start = start_after

    snapshots = (
        (" orig:", start_before, text_before),
        (" second:", start_after, text_after),
        ("current:", feature.start, str(feature)),
    )
    for label, start, text in snapshots:
        print(label, "(start=%s)" % start, text)

    self.assertTrue(start_before == start_after == feature.start)
    self.assertTrue(text_before == text_after == str(feature))
class IntervalFileTest(unittest.TestCase):
    """Tests for IntervalFile lookups and Interval comparisons on a BED fixture."""

    # Fixture path relative to PATH; setUp replaces it with the joined path.
    file = "data/rmsk.hg18.chr21.small.bed"

    def setUp(self):
        """Resolve the fixture path and open it as ``self.bed``."""
        self.file = os.path.join(PATH, self.file)
        self.bed = IntervalFile(self.file)

    def testFileType(self):
        """The BED fixture and the bundled GFF file must each report their own type."""
        # The tuple message shows the detected type and the path on failure.
        self.assertTrue(self.bed.file_type == "bed",
                        (self.bed.file_type, self.file))

        gff = os.path.join(PATH, "data/c.gff")
        i = IntervalFile(gff)
        self.assertTrue(i.file_type == "gff", (i.file_type, gff))

    def testOverlaps(self):
        """``all_hits`` must return the 8 features touching the query range."""
        i = Interval("chr21", 9719768, 9739768)
        hits = self.bed.all_hits(i)
        self.assertEqual(len(hits), 8)
        for hit in hits:
            self.assertTrue(hit.start <= 9739768 and hit.end >= 9719768)

    def testStrands(self):
        """With ``same_strand=True`` every hit must share the query's strand."""
        i = Interval("chr21", 9719768, 9739768, "+")
        hits = self.bed.all_hits(i, same_strand=True)
        for hit in hits:
            self.assertTrue(hit.strand == '+')

        i = Interval("chr21", 9719768, 9739768, "-")
        hits = self.bed.all_hits(i, same_strand=True)
        for hit in hits:
            self.assertTrue(hit.strand == '-')

    def _assert_cmp(self, a, b, eq, ne, le, ge, lt, gt):
        """Assert the truth value of all six rich comparisons between a and b.

        The operators are evaluated in the order ==, !=, <=, >=, <, > and the
        failure message names the operator that disagreed.
        """
        self.assertEqual(bool(a == b), eq, "a == b")
        self.assertEqual(bool(a != b), ne, "a != b")
        self.assertEqual(bool(a <= b), le, "a <= b")
        self.assertEqual(bool(a >= b), ge, "a >= b")
        self.assertEqual(bool(a < b), lt, "a < b")
        self.assertEqual(bool(a > b), gt, "a > b")

    def testRichCmp(self):
        """Check every comparison operator on a range of Interval pairs."""
        # be obsessive . . .
        #
        # ==
        self._assert_cmp(Interval("chr21", 100, 200),
                         Interval("chr21", 100, 200),
                         eq=True, ne=False, le=True, ge=True,
                         lt=False, gt=False)

        self._assert_cmp(Interval("chr21", 100, 100),
                         Interval("chr21", 100, 100),
                         eq=True, ne=False, le=True, ge=True,
                         lt=False, gt=False)

        # != because of strand
        self._assert_cmp(Interval("chr21", 100, 200, strand='+'),
                         Interval("chr21", 100, 200, strand='-'),
                         eq=False, ne=True, le=False, ge=False,
                         lt=False, gt=False)

        # a >= b
        self._assert_cmp(Interval("chr21", 100, 300),
                         Interval("chr21", 100, 200),
                         eq=False, ne=True, le=False, ge=True,
                         lt=False, gt=False)

        # a <= b
        self._assert_cmp(Interval("chr21", 100, 300),
                         Interval("chr21", 300, 300),
                         eq=False, ne=True, le=True, ge=False,
                         lt=False, gt=False)

        # a <= b
        self._assert_cmp(Interval("chr21", 100, 300),
                         Interval("chr21", 250, 300),
                         eq=False, ne=True, le=True, ge=False,
                         lt=False, gt=False)

        # a < b
        self._assert_cmp(Interval("chr21", 100, 200),
                         Interval("chr21", 201, 300),
                         eq=False, ne=True, le=True, ge=False,
                         lt=True, gt=False)

        # a > b
        self._assert_cmp(Interval("chr21", 201, 300),
                         Interval("chr21", 100, 200),
                         eq=False, ne=True, le=False, ge=True,
                         lt=False, gt=True)

        # a != b
        self._assert_cmp(Interval("none", 1, 100),
                         Interval("chr21", 1, 100),
                         eq=False, ne=True, le=False, ge=False,
                         lt=False, gt=False)

        # nested should raise NotImplementedError
        a = Interval("chr21", 100, 200)
        b = Interval("chr21", 50, 300)
        self.assertRaises(NotImplementedError, a.__eq__, b)
        self.assertRaises(NotImplementedError, a.__ne__, b)
        self.assertRaises(NotImplementedError, a.__le__, b)
        self.assertRaises(NotImplementedError, a.__ge__, b)
        self.assertRaises(NotImplementedError, a.__lt__, b)
        self.assertRaises(NotImplementedError, a.__gt__, b)
# 3. Reached an interval that is AFTER the query (start > query's end) # We add each feature to the cache, and track those that overlap while (curr_db is not None and curr_qy.chrom == curr_db.chrom and not after(curr_db, curr_qy)): if (overlaps(curr_qy, curr_db) > 0): hits.append(curr_db) db_cache.append(curr_db) curr_db = get_next(database) # Report the query's overlaps and move on to the next query report_hits(curr_qy, hits) hits = [] curr_qy = get_next(query) if __name__ == "__main__": if len(sys.argv) < 3: print("Usage:") print("chrom_sweep.py [query] [database]") sys.exit() query_file = sys.argv[1] database_file = sys.argv[2] # open up the BED files. query = IntervalFile(query_file) # The Query File database = IntervalFile(database_file) # The Database File sweep(query, database)
def main():
    '''
    Parse the command line and run the interval analysis.

    Builds the seed, variant and include regions from the parsed options and
    passes them to ``analyze_intervals``, writing results to ``sys.stdout``.
    Exits through ``parser.error`` when several non-VCF variant files are
    given.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--upstream', type=int, default=20000,
        help='distance upstream of seed'
             ' to look for flanking regions to compare with `seed`, '
             'default = 20000')
    parser.add_argument(
        '--downstream', type=int, default=20000,
        help='distance downstream to '
             'look for flanking regions, default = 20000')
    parser.add_argument('--seed', help='regions of interest, e.g. promoters',
                        required=True, metavar="BED")
    parser.add_argument(
        '--exclude', help='regions to be excluded when looking for flanks')
    parser.add_argument(
        '--include', help='regions to be included when looking for flanks')
    parser.add_argument('--test', default='fisher',
                        choices=['fisher', 'permutation', 'both'])
    parser.add_argument(
        '--shuffles', type=int, default=1000,
        help='number of shuffles to do for permutation analysis')
    parser.add_argument('--genome',
                        help='the name of the genome file for BEDTools')
    parser.add_argument('--full',
                        help='output full, dataset with per-sample p-values',
                        default=False, action='store_true')
    parser.add_argument('--score', metavar="BIGWIG/INT",
                        help='score functionality in progress')
    # The two literals used to be joined without a space ("e.g.a list").
    parser.add_argument('variants',
                        help='regions to assign significance e.g. '
                             'a list of variants',
                        metavar="BED/VCF", nargs='+')
    args = parser.parse_args()

    seed_region = BedTool(args.seed)
    if args.variants[0].endswith('.vcf'):
        region_file = vcf_to_long_bed(*args.variants)
    else:
        # Report bad input via argparse rather than an assert, which is
        # stripped under ``python -O`` and prints a bare traceback.
        if len(args.variants) != 1:
            parser.error('exactly one variants file is accepted unless '
                         'the input is VCF')
        region_file = IntervalFile(args.variants[0])

    out_file = sys.stdout
    genome = args.genome
    # --exclude takes precedence over --include; with neither, the whole
    # genome is used.
    if args.exclude:
        include_file = BedTool(args.exclude).complement(g=genome)
    elif args.include:
        include_file = BedTool(args.include)
    else:
        include_file = BedTool(create_genome_bed(genome))

    # BigWig is a binary format, so open in binary mode, and close the handle
    # once the analysis is done.
    score_handle = open(args.score, 'rb') if args.score else None
    try:
        bw = BigWigFile(score_handle) if score_handle is not None else None
        analyze_intervals(include_file, seed_region, region_file,
                          args.upstream, args.downstream, args.test,
                          args.shuffles, out_file, args.full, score=bw)
    finally:
        if score_handle is not None:
            score_handle.close()
1 = start_byte, 2 = end_byte, 3 = num_records, 4 = max_interval size (future, for binary search) """ # what are the BED files A_file = sys.argv[1] B_file = sys.argv[2] # expected index file name A_idx_file = A_file + ".idx" B_idx_file = B_file + ".idx" # open up the BED files. A = IntervalFile(A_file) # The Query File B = IntervalFile(B_file) # The Database File # create index files if they don't yet exist if not os.path.exists(A_idx_file): index_bed.index(A_file) if not os.path.exists(B_idx_file): index_bed.index(B_file) # load the indices for A and B A_map = [] # list of chrom/offset tuples for line in open(A_idx_file): fields = line.strip().split("\t") A_map.append( (fields[0], int(fields[1]), int(fields[2]), int(fields[3]), int(fields[4])))
def testSetItemString(self):
    """Assigning through the ``'chrom'`` key must update key and attribute access."""
    ivf = IntervalFile(self.file)
    # Builtin next() works on Python 2.6+ and 3, unlike the .next() method.
    iv = next(ivf)
    iv['chrom'] = 'fake'
    self.assertEqual(iv['chrom'], 'fake')
    self.assertEqual(iv.chrom, 'fake')
def testGetItemString(self):
    """Lookup by ``'chrom'``/``'start'``/``'end'`` must match the attributes."""
    ivf = IntervalFile(self.file)
    # Builtin next() works on Python 2.6+ and 3, unlike the .next() method.
    iv = next(ivf)
    self.assertEqual(iv['chrom'], iv.chrom)
    self.assertEqual(iv['start'], iv.start)
    self.assertEqual(iv['end'], iv.end)
1 = start_byte, 2 = end_byte, 3 = num_records, 4 = max_interval size (future, for binary search) """ # what are the BED files A_file = sys.argv[1] B_file = sys.argv[2] # expected index file name A_idx_file = A_file + ".idx" B_idx_file = B_file + ".idx" # open up the BED files. A = IntervalFile(A_file) # The Query File B = IntervalFile(B_file) # The Database File # create index files if they don't yet exist if not os.path.exists(A_idx_file): index_bed.index(A_file) if not os.path.exists(B_idx_file): index_bed.index(B_file) # load the indices for A and B A_map = [] # list of chrom/offset tuples for line in open(A_idx_file): fields = line.strip().split("\t") A_map.append((fields[0], int(fields[1]), int(fields[2]), int(fields[3]), int(fields[4]))) B_map = [] # list of chrom/offset tuples for line in open(B_idx_file):
def setUp(self):
    """Join the fixture name onto PATH and open the result as ``self.bed``."""
    resolved = os.path.join(PATH, self.file)
    # Keep the resolved path on the instance; the tests reopen the file from it.
    self.file = resolved
    self.bed = IntervalFile(resolved)
def testSetItemString(self):
    """Writing via the ``"chrom"`` key is reflected in key and attribute reads."""
    reader = IntervalFile(self.file)
    feature = next(reader)
    replacement = "fake"

    feature["chrom"] = replacement

    self.assertEqual(feature["chrom"], replacement)
    self.assertEqual(feature.chrom, replacement)
def testGetItemString(self):
    """Key lookup by field name returns the same value as the attribute."""
    reader = IntervalFile(self.file)
    feature = next(reader)

    # Same order as before: chrom, then start, then end.
    for field_name in ("chrom", "start", "end"):
        self.assertEqual(feature[field_name], getattr(feature, field_name))