def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by Interval.

    While the streaming results are created, the pybedtools tempdir points at
    a directory whose write permission has been removed. The original tempdir
    is restored and the directory removed even if an assertion fails.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    try:
        # ...set that to the new tempdir
        pybedtools.set_tempdir('unwriteable')

        # this should really not be written anywhere
        d = a.intersect(b, stream=True)

        assert_raises(NotImplementedError, c.__eq__, d)
        d_contents = d.fn.read()
        with open(c.fn) as handle:
            c_contents = handle.read()
        assert d_contents == c_contents

        # reconstruct d and check Interval-by-Interval equality
        pybedtools.set_tempdir('unwriteable')
        d = a.intersect(b, stream=True)

        for i, j in zip(c, d):
            assert str(i) == str(j)

        # Now do something similar with GFF files.
        a = pybedtools.example_bedtool('a.bed')
        f = pybedtools.example_bedtool('d.gff')

        # file-based
        pybedtools.set_tempdir(orig_tempdir)
        g1 = f.intersect(a)

        # streaming
        pybedtools.set_tempdir('unwriteable')
        g2 = f.intersect(a, stream=True)

        for i, j in zip(g1, g2):
            assert str(i) == str(j)

        # this was segfaulting at one point, just run to make sure
        g3 = f.intersect(a, stream=True)
        for i in iter(g3):
            print(i)

        for row in f.cut(range(3), stream=True):
            # the first three fields must be accessible...
            row[0], row[1], row[2]
            # ...and nothing beyond them
            assert_raises(IndexError, row.__getitem__, 3)
    finally:
        # always undo the global tempdir change and remove the directory
        pybedtools.set_tempdir(orig_tempdir)
        os.system('rm -fr unwriteable')
def test_tail():
    """
    tail() as a string: default line count, an explicit count of 3, and a
    short file that is returned in full.
    """
    rmsk = pybedtools.example_bedtool('rmsk.hg18.chr21.small.bed')

    default_expected = fix(
        """
        chr21 13355834 13356047 MER58A 892 -
        chr21 13356250 13356290 AT_rich 26 +
        chr21 13356358 13356381 AT_rich 23 +
        chr21 13356571 13356910 L2 333 -
        chr21 13357179 13357987 L1MEc 1264 -
        chr21 13358003 13358300 L1MEc 379 -
        chr21 13358304 13358952 L1MEc 1271 -
        chr21 13358960 13359288 L2 336 +
        chr21 13359444 13359751 AluY 2337 +
        chr21 13360044 13360225 L1M5 284 -""")
    assert rmsk.tail(as_string=True) == default_expected

    # only ask for 3 lines
    three_expected = fix(
        """
        chr21 13358960 13359288 L2 336 +
        chr21 13359444 13359751 AluY 2337 +
        chr21 13360044 13360225 L1M5 284 -""")
    assert rmsk.tail(3, as_string=True) == three_expected

    # For short files, whole thing should be returned
    short = pybedtools.example_bedtool('a.bed')
    whole_file = str(short)
    assert short.tail(as_string=True) == whole_file
def test_cat():
    """
    cat() accepts a BedTool or a filename, and with postmerge=False returns
    every input feature.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    b_fn = pybedtools.example_filename('b.bed')
    # a BedTool and the filename it came from must give the same result
    assert a.cat(b) == a.cat(b_fn)
    expected = fix("""
    chr1 1 500
    chr1 800 950
    """)
    assert a.cat(b) == expected

    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.cat(b, postmerge=False)
    assert len(a) + len(b) == len(c), (len(a), len(b), len(c))

    # print() call form works under both Python 2 and 3
    print(c)
    assert c == fix("""
    chr1 1 100 feature1 0 +
    chr1 100 200 feature2 0 +
    chr1 150 500 feature3 0 -
    chr1 900 950 feature4 0 +
    chr1 155 200 feature5 0 -
    chr1 800 901 feature6 0 +
    """)
def test_history_step():
    """
    History tags can be looked up, and delete_temporary_history() removes
    only the intermediate file, and only when confirmed (or when ask=False).
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)
    d = c.subtract(a)

    tag = c.history[0].result_tag
    assert pybedtools.find_tagged(tag) == c

    assert_raises(ValueError, pybedtools.find_tagged, 'nonexistent')

    # print() call form works under both Python 2 and 3
    print(d.history)

    # answering 'n' must leave every file in place
    d.delete_temporary_history(ask=True, raw_input_func=lambda x: 'n')
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
    assert os.path.exists(c.fn)
    assert os.path.exists(d.fn)

    d.delete_temporary_history(ask=True, raw_input_func=lambda x: 'Yes')
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
    assert not os.path.exists(c.fn)  # this is the only thing that should change
    assert os.path.exists(d.fn)

    # same check without the confirmation prompt
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)
    d = c.subtract(a)
    d.delete_temporary_history(ask=False)
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
    assert not os.path.exists(c.fn)  # this is the only thing that should change
    assert os.path.exists(d.fn)
def test_tuple_creation():
    """
    BedTools built from lists of tuples: all-string BED fields, mixed
    string/int BED fields, and GFF fields.
    """
    reference = 'a.bed'

    # everything as a string
    string_rows = [
        ("chr1", "1", "100", "feature1", "0", "+"),
        ("chr1", "100", "200", "feature2", "0", "+"),
        ("chr1", "150", "500", "feature3", "0", "-"),
        ("chr1", "900", "950", "feature4", "0", "+"),
    ]
    from_strings = pybedtools.BedTool(string_rows).saveas()
    assert pybedtools.example_bedtool(reference) == from_strings

    # coordinates and scores given as ints
    mixed_rows = [
        ("chr1", 1, 100, "feature1", 0, "+"),
        ("chr1", 100, 200, "feature2", 0, "+"),
        ("chr1", 150, 500, "feature3", 0, "-"),
        ("chr1", 900, 950, "feature4", 0, "+"),
    ]
    from_mixed = pybedtools.BedTool(mixed_rows).saveas()
    assert pybedtools.example_bedtool(reference) == from_mixed

    # nine-field GFF rows
    gff_rows = [
        ("chr1", "fake", "gene", "50", "300", ".", "+", ".", "ID=gene1"),
        ("chr1", "fake", "mRNA", "50", "300", ".", "+", ".", "ID=mRNA1;Parent=gene1;"),
        ("chr1", "fake", "CDS", "75", "150", ".", "+", ".", "ID=CDS1;Parent=mRNA1;"),
        ("chr1", "fake", "CDS", "200", "275", ".", "+", ".", "ID=CDS2;Parent=mRNA1;"),
        ("chr1", "fake", "rRNA", "1200", "1275", ".", "+", ".", "ID=rRNA1;"),
    ]
    from_gff = pybedtools.BedTool(gff_rows).saveas()

    # Make sure that the result has actual Intervals and not plain tuples or
    # something
    first = from_gff[0]
    assert isinstance(first, pybedtools.Interval)
    assert repr(from_gff[0]) == "Interval(chr1:49-300)"
    assert from_gff[0]['ID'] == 'gene1'
def test_bam_filetype():
    """The BAM-vs-GFF intersection must report file_type 'bam'."""
    # regression test -- this was segfaulting before because IntervalFile
    # couldn't parse SAM
    bam = pybedtools.example_bedtool('gdc.bam')
    gff = pybedtools.example_bedtool('gdc.gff')
    intersected = bam.intersect(gff)
    assert intersected.file_type == 'bam'
def test_issue_141():
    """
    "Adding" BedTools with `+`: empty operands give zero features and a
    malformed operand raises MalformedBedLineError.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')

    # make an empty file
    empty = pybedtools.BedTool("", from_string=True)

    # invalid file format
    malformed = pybedtools.BedTool('a a a', from_string=True)

    # positive control; works
    a + b

    # "adding" an empty file always gets zero features
    for left, right in ((a, empty), (empty, a), (empty, empty)):
        assert len(left + right) == 0

    # "adding" a malformed file raises MalformedBedLineError
    # (an uncaught exception raised when trying to intersect)
    with pytest.raises(pybedtools.MalformedBedLineError):
        a + malformed

    # a BAM on the left-hand side only needs to run without raising
    bam = pybedtools.example_bedtool('x.bam')
    bam + a
def test_output_kwarg():
    """
    intersect() with `output=` must give the same result as the default call.

    The named output file is removed even if the comparison fails, so a
    failing run does not leave "deleteme.bed" in the working directory.
    """
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    c = a.intersect(b)
    try:
        d = a.intersect(b, output="deleteme.bed")
        assert c == d
    finally:
        # the file may not exist if intersect() itself failed
        if os.path.exists("deleteme.bed"):
            os.unlink("deleteme.bed")
def test_output_kwarg():
    """
    intersect() with `output=` must give the same result as the default call.

    The named output file is removed even if the comparison fails, so a
    failing run does not leave 'deleteme.bed' in the working directory.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)
    try:
        d = a.intersect(b, output='deleteme.bed')
        assert c == d
    finally:
        # the file may not exist if intersect() itself failed
        if os.path.exists('deleteme.bed'):
            os.unlink('deleteme.bed')
def test_annotate_xstream():
    """
    Each add_xstream() call should add one field; the first call must also
    keep the number of features.
    """
    query = pybedtools.example_bedtool('m1.bed')
    annot = pybedtools.example_bedtool('mm9.bed12')

    upstream = annotate.add_xstream(query, annot, dist=1000, updown="up")
    assert query.field_count() == upstream.field_count() - 1
    assert len(query) == len(upstream)

    # annotate the already-annotated result a second time
    both = annotate.add_xstream(upstream, annot, dist=1000, updown="down")
    assert query.field_count() == both.field_count() - 2
def test_jaccard():
    """jaccard() returns the expected dict, with and without stream=True."""
    expected = {
        'intersection': 46,
        'union': 649,
        'jaccard': 0.0708783,
        'n_intersections': 2,
    }
    x = pybedtools.example_bedtool('a.bed')

    results = x.jaccard(pybedtools.example_bedtool('b.bed'))
    assert results == expected, results

    # the streaming call must agree with the file-based one
    results2 = x.jaccard(pybedtools.example_bedtool('b.bed'), stream=True)
    assert results == results2, results2
def test_random_intersection():
    """
    Currently disabled: the function returns immediately, so the
    randomintersection() check below the `return` never runs.
    """
    # TODO:
    return

    n_iterations = 4
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    counts = list(a.randomintersection(b, n_iterations))
    assert len(counts) == n_iterations, counts
def test_annotate_closest():
    """
    add_closest() should keep the number of features and add two fields.
    """
    query = pybedtools.example_bedtool('m1.bed')
    annot = pybedtools.example_bedtool('mm9.bed12')
    annotated = annotate.add_closest(query, annot)

    n_query, n_annotated = len(query), len(annotated)
    assert n_query == n_annotated, (len(query), len(annotated), str(annotated))
    assert query.field_count() == annotated.field_count() - 2

    # in this test-case, the final column should be exon;intron
    # since m1 completely contains both an exon and an intron.
    # NOTE(review): the first feature is fetched but nothing is asserted on it.
    f = next(iter(annotated))
def run(d):
    """
    Run one configured BedTool method call and compare it with its expected
    output, printing diagnostics before re-raising on a mismatch.

    `d` is a dict with these keys:

    * ``method``: name of the BedTool method to call
    * ``bedtool``: example filename for the BedTool the method is called on
    * ``convert``: maps ``'bedtool'`` and kwarg names to keys of `converters`
    * ``kw``: keyword arguments for the method; entries named in ``convert``
      are example filenames that get converted first
    * ``test_case``: dict whose ``'expected'`` entry is the expected text

    Neither ``d['kw']`` nor ``d['convert']`` is modified, so the same `d` can
    be run more than once.

    Raises AssertionError if the stringified result differs from the
    expected text after it has been passed through `fix`.
    """
    method = d['method']
    bedtool = d['bedtool']
    # Work on copies: popping 'bedtool' from the caller's dict would make a
    # second run of the same `d` fail with KeyError.
    convert = dict(d['convert'])
    kwargs = d['kw'].copy()
    expected = d['test_case']['expected']

    bedtool_converter = convert.pop('bedtool')
    bedtool = converters[bedtool_converter](pybedtools.example_bedtool(bedtool))

    for k, converter_name in convert.items():
        kwargs[k] = converters[converter_name](
            pybedtools.example_bedtool(kwargs[k])
        )
    result = getattr(bedtool, method)(**kwargs)
    res = str(result)
    expected = fix(expected)
    try:
        assert res == expected

    except AssertionError:
        print(result.fn)
        print('Method call:')
        args = ', '.join(
            ('%s=%s' % (key, val)).strip() for key, val in kwargs.items()
        )
        print('BedTool.%s(%s)' % (method, args))
        print('Got:')
        print(res)
        print('Expected:')
        print(expected)
        print('Diff:')
        for i in difflib.unified_diff(
            res.splitlines(True), expected.splitlines(True)
        ):
            print(i, end=' ')

        # Make tabs and newlines visible
        spec_res = res.replace('\t', '\\t').replace('\n', '\\n\n')
        spec_expected = expected.replace('\t', '\\t').replace('\n', '\\n\n')

        print('Showing special characters:')
        print('Got:')
        print(spec_res)
        print('Expected:')
        print(spec_expected)
        print('Diff:')
        for i in difflib.unified_diff(
            spec_res.splitlines(True), spec_expected.splitlines(True)
        ):
            print(i, end=' ')
        raise
def test_many_files():
    """regression test to make sure many files can be created
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    # Previously, IntervalFile would leak open files and would cause OSError
    # (too many open files) at iteration 1010 or so.
    #
    # range() rather than xrange(): xrange does not exist on Python 3, and
    # 1100 items is small enough not to matter on Python 2.
    for i in range(1100):
        c = a.intersect(b)
def test_issue_147():
    """
    intersect() must work while sys.stderr is redirected to a temp file.

    sys.stderr is restored in a `finally` so a failure here cannot leave the
    rest of the test run writing to the (closed) temp file.
    """
    # previously this would raise BEDToolsError because of unexpected stderr.
    with open(pybedtools.BedTool._tmp(), 'w') as tmp:
        orig_stderr = sys.stderr
        sys.stderr = tmp
        try:
            v = pybedtools.example_bedtool('vcf-stderr-test.vcf')
            b = pybedtools.example_bedtool('vcf-stderr-test.bed')
            v.intersect(b)
        finally:
            sys.stderr = orig_stderr
def test_chromsizes_in_5prime_3prime():
    """
    five_prime / three_prime applied to a.bed with different `genome`
    chromsizes; features are expected to be truncated at the chromosome end.
    """

    def ends_of_a(func, genome):
        # a.bed with `func` applied to every feature (upstream=1,
        # downstream=10), saved to a file
        return (
            pybedtools.example_bedtool('a.bed')
            .each(func, 1, 10, add_to_name="_TSS", genome=genome)
            .saveas()
        )

    # standard 5'
    a = ends_of_a(featurefuncs.five_prime, pybedtools.chromsizes("hg19"))
    assert a == fix(
        """
        chr1 0 11 feature1_TSS 0 +
        chr1 99 110 feature2_TSS 0 +
        chr1 490 501 feature3_TSS 0 -
        chr1 899 910 feature4_TSS 0 +
        """), str(a)

    # add genomes sizes; last feature should be truncated
    a = ends_of_a(featurefuncs.five_prime, dict(chr1=(0, 900)))
    assert a == fix(
        """
        chr1 0 11 feature1_TSS 0 +
        chr1 99 110 feature2_TSS 0 +
        chr1 490 501 feature3_TSS 0 -
        chr1 899 900 feature4_TSS 0 +
        """), str(a)

    # same thing but for 3'.
    # Note that the last feature chr1:949-960 is completely truncated because
    # it would entirely fall outside of the chromosome
    a = ends_of_a(featurefuncs.three_prime, dict(chr1=(0, 900)))
    assert a == fix(
        """
        chr1 99 110 feature1_TSS 0 +
        chr1 199 210 feature2_TSS 0 +
        chr1 140 151 feature3_TSS 0 -
        chr1 900 900 feature4_TSS 0 +
        """), str(a)

    # be a lot harsher with the chromsizes to ensure features on both strands
    # get truncated correctly
    a = ends_of_a(featurefuncs.three_prime, dict(chr1=(0, 120)))
    assert a == fix(
        """
        chr1 99 110 feature1_TSS 0 +
        chr1 120 120 feature2_TSS 0 +
        chr1 120 120 feature3_TSS 0 -
        chr1 120 120 feature4_TSS 0 +
        """), str(a)
def test_gzipped_files_can_be_intersected():
    """
    Every plain/gzipped combination of a.bed and b.bed must give the same
    intersection.
    """
    gz_a = pybedtools.BedTool(
        _make_temporary_gzip(pybedtools.example_filename('a.bed')))
    gz_b = pybedtools.BedTool(
        _make_temporary_gzip(pybedtools.example_filename('b.bed')))

    plain_a = pybedtools.example_bedtool('a.bed')
    plain_b = pybedtools.example_bedtool('b.bed')

    # plain/plain == gz/gz == plain/gz == gz/plain
    assert (
        plain_a.intersect(plain_b)
        == gz_a.intersect(gz_b)
        == plain_a.intersect(gz_b)
        == gz_a.intersect(plain_b)
    )
def test_repr_and_printing():
    """
    repr() should show the filename, or "MISSING FILE" once the underlying
    file has been deleted.
    """
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    c = a + b
    # delete the result's file so repr() has nothing to point to
    os.unlink(c.fn)
    assert "a.bed" in repr(a)
    assert "b.bed" in repr(b)
    assert "MISSING FILE" in repr(c)
    # print() call form works under both Python 2 and 3
    print(a.head(1))
def test_repr_and_printing():
    """
    repr() should show the filename, or 'MISSING FILE' once the underlying
    file has been deleted.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a + b
    # delete the result's file so repr() has nothing to point to
    os.unlink(c.fn)
    assert 'a.bed' in repr(a)
    assert 'b.bed' in repr(b)
    assert 'MISSING FILE' in repr(c)
    # print() call form works under both Python 2 and 3
    print(a.head(1))
def test_issue_118():
    """
    The process's open file-descriptor count must be the same before and
    after 100 intersect() + field_count() rounds.
    """
    this_process = psutil.Process(os.getpid())
    fds_before = this_process.num_fds()

    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    for _ in range(100):
        a.intersect(b).field_count()

    fds_after = this_process.num_fds()
    assert fds_before == fds_after
def test_stream_of_generator():
    """
    A streamed self-intersection must give the same text whether the BedTool
    is file-based or built from a generator of its features.
    """
    a = pybedtools.example_bedtool('a.bed')
    b1 = a.intersect(a, stream=True)
    b2 = pybedtools.BedTool((i for i in a)).intersect(a, stream=True)
    sb1 = str(b1)
    sb2 = str(b2)
    # print() call form works under both Python 2 and 3
    print(sb1)
    print(sb2)
    assert sb1 == sb2
def test_reldist():
    """
    reldist() returns a summary dict by default and per-feature distances
    with detail=True.
    """
    x = pybedtools.example_bedtool('a.bed')
    results = x.reldist(pybedtools.example_bedtool('b.bed'))
    assert results == {
        'reldist': [0.15, 0.21, 0.28],
        'count': [1, 1, 1],
        'total': [3, 3, 3],
        'fraction': [0.333, 0.333, 0.333],
    }, results

    results2 = x.reldist(pybedtools.example_bedtool('b.bed'), detail=True)
    # print() call form works under both Python 2 and 3
    print(results2)
    assert results2 == fix("""
    chr1 1 100 feature1 0 + 0.282
    chr1 100 200 feature2 0 + 0.153
    chr1 150 500 feature3 0 - 0.220""")
def test_gzip():
    """
    Gzipped BED files are detected as "bed" and intersect identically to
    their uncompressed originals, in every plain/gzipped combination.
    """
    # Local imports keep this block self-contained.
    import gzip
    import shutil

    def gzip_copy(src, dest):
        # Compress in-process rather than via an unquoted shell command,
        # which broke on paths with spaces and ignored failures.
        with open(src, "rb") as fin:
            with gzip.open(dest, "wb") as fout:
                shutil.copyfileobj(fin, fout)

    # make new gzipped files on the fly
    agz = pybedtools.BedTool._tmp()
    bgz = pybedtools.BedTool._tmp()
    gzip_copy(pybedtools.example_filename("a.bed"), agz)
    gzip_copy(pybedtools.example_filename("b.bed"), bgz)
    agz = pybedtools.BedTool(agz)
    bgz = pybedtools.BedTool(bgz)
    assert agz.file_type == bgz.file_type == "bed"
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    assert a.intersect(b) == agz.intersect(bgz) == a.intersect(bgz) == agz.intersect(b)
def test_gzip():
    """
    Gzipped BED files are detected as 'bed' and intersect identically to
    their uncompressed originals, in every plain/gzipped combination.
    """
    # Local imports keep this block self-contained.
    import gzip
    import shutil

    def gzip_copy(src, dest):
        # Compress in-process rather than via an unquoted shell command,
        # which broke on paths with spaces and ignored failures.
        with open(src, 'rb') as fin:
            with gzip.open(dest, 'wb') as fout:
                shutil.copyfileobj(fin, fout)

    # make new gzipped files on the fly
    agz = pybedtools.BedTool._tmp()
    bgz = pybedtools.BedTool._tmp()
    gzip_copy(pybedtools.example_filename('a.bed'), agz)
    gzip_copy(pybedtools.example_filename('b.bed'), bgz)
    agz = pybedtools.BedTool(agz)
    bgz = pybedtools.BedTool(bgz)
    assert agz.file_type == bgz.file_type == 'bed'
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    assert a.intersect(b) == agz.intersect(bgz) == a.intersect(bgz) == agz.intersect(b)
def test_bam_stream_bam():
    """
    A streamed BAM-vs-GFF intersection (u=True) must stringify to the
    expected SAM records.
    """
    bam = pybedtools.example_bedtool('gdc.bam')
    gff = pybedtools.example_bedtool('gdc.gff')
    streamed = bam.intersect(gff, u=True, stream=True)

    expected_sam = fix("""
    None 16 chr2L 71 255 5M * 0 0 TTCTC IIIII NM:i:0 NH:i:1
    None 16 chr2L 141 255 5M * 0 0 CACCA IIIII NM:i:0 NH:i:1
    None 16 chr2L 151 255 5M * 0 0 GTTCA IIIII NM:i:0 NH:i:1
    None 0 chr2L 211 255 5M * 0 0 AAATA IIIII NM:i:0 NH:i:1
    None 0 chr2L 71 255 5M * 0 0 GAGAA IIIII NM:i:0 NH:i:1
    None 0 chr2L 141 255 5M * 0 0 TGGTG IIIII NM:i:0 NH:i:1
    None 0 chr2L 161 255 5M * 0 0 GATAA IIIII NM:i:0 NH:i:1""")
    assert str(streamed) == expected_sam
def test_repr_and_printing():
    """
    repr() must name the file, or flag a deleted file or a stream as such.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    combined = a+b
    streamed = a.intersect(b, stream=True)

    # remove the file behind `combined` so its repr has nothing to point to
    os.unlink(combined.fn)

    checks = (
        ('a.bed', a),
        ('b.bed', b),
        ('MISSING FILE', combined),
        ('stream', streamed),
    )
    for fragment, bedtool in checks:
        assert fragment in repr(bedtool)
def test_create_from_list_long_features():
    """
    Iterator handles extra fields from long features (BED+GFF -wao
    intersection)
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('c.gff')
    # run both the file-based and the streaming intersection; only the
    # streaming one is iterated below
    c = a.intersect(b, wao=True, stream=False)
    d = a.intersect(b, wao=True, stream=True)

    # print() call form works under both Python 2 and 3
    print(b.closest(a))

    for i in d:
        print(i)
def test_randomstats():
    """
    randomstats() of a.bed vs b.bed on a 1-1000 chr1 must report the
    expected actual count, median and percentile. An ImportError is reported
    on stderr instead of failing the test.
    """
    sizes = {"chr1": (1, 1000)}
    a = pybedtools.example_bedtool("a.bed").set_chromsizes(sizes)
    b = pybedtools.example_bedtool("b.bed")
    expected_values = (
        ("actual", 3),
        ("median randomized", 2.0),
        ("percentile", 90.0),
    )
    try:
        results = a.randomstats(b, 100, debug=True)
        for key, value in expected_values:
            assert results[key] == value
    except ImportError:
        # allow doctests to pass if SciPy not installed
        sys.stderr.write(
            "SciPy not installed, so not testing BedTool.randomstats()."
        )
def test_repr_and_printing():
    """
    repr() must name the file, or flag a deleted file or a stream as such.
    """
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    combined = a + b
    streamed = a.intersect(b, stream=True)

    # remove the file behind `combined` so its repr has nothing to point to
    os.unlink(combined.fn)

    checks = (
        ("a.bed", a),
        ("b.bed", b),
        ("MISSING FILE", combined),
        ("stream", streamed),
    )
    for fragment, bedtool in checks:
        assert fragment in repr(bedtool)
def _classifier():
    """
    Classify gdc.bed against gdc.gff and check the three result mappings:
    class_counts, feature_classes and class_features.
    """
    classifier = Classifier(
        bed=pybedtools.example_filename('gdc.bed'),
        annotations=pybedtools.example_filename('gdc.gff'))
    classifier.classify()

    bed = pybedtools.example_bedtool('gdc.bed')

    # the five distinct class combinations used as keys below
    utr_trna = frozenset({'UTR', 'exon', 'mRNA', 'CDS', 'tRNA', 'gene'})
    intronic = frozenset({'intron', 'gene', 'mRNA'})
    unclassified = frozenset()
    coding = frozenset({'gene', 'exon', 'mRNA', 'CDS'})
    intron_trna = frozenset({'exon', 'mRNA', 'CDS', 'tRNA', 'intron', 'gene'})

    assert classifier.class_counts == {
        utr_trna: 1,
        intronic: 3,
        unclassified: 1,
        coding: 2,
        intron_trna: 1,
    }

    assert classifier.feature_classes == {
        bed[0]: {'.'},
        bed[1]: {'gene', 'exon', 'mRNA', 'CDS'},
        bed[2]: {'intron', 'gene', 'mRNA'},
        bed[3]: {'intron', 'gene', 'mRNA'},
        bed[4]: {'tRNA', 'UTR', 'exon', 'mRNA', 'CDS', 'gene'},
        bed[5]: {'gene', 'exon', 'mRNA', 'CDS'},
        bed[6]: {'intron', 'gene', 'mRNA'},
        bed[7]: {'tRNA', 'intron', 'exon', 'mRNA', 'CDS', 'gene'},
    }

    print('use these indexes for debugging')
    for index, feature in enumerate(bed):
        print(index, feature)

    for classes, features in list(classifier.class_features.items()):
        print(classes)
        for feature in features:
            print('\t' + str(feature))

    # the feature order inside each list is part of the expectation
    assert classifier.class_features == {
        unclassified: [bed[0]],
        intronic: [bed[6], bed[2], bed[3]],
        coding: [bed[5], bed[1]],
        utr_trna: [bed[4]],
        intron_trna: [bed[7]],
    }
def test_issue_143():
    """
    Streamed merge() results must behave like file-based ones under each(),
    and iterating them must yield Interval objects.
    """

    def shift_start(feature):
        feature.start += 10
        return feature

    a = pybedtools.example_bedtool('a.bed')

    streamed = a.merge(s=True, stream=True).each(shift_start).saveas()
    file_based = a.merge(s=True).each(shift_start).saveas()
    assert streamed == file_based

    # plain iteration over a stream
    for feature in a.merge(s=True, stream=True):
        assert isinstance(feature, pybedtools.Interval)

    # repeatedly wrapping the stream in iter() must not change what it yields
    merged = a.merge(s=True, stream=True)
    for feature in iter(iter(iter(merged))):
        assert isinstance(feature, pybedtools.Interval)

    # identity each() on a stream
    for feature in a.merge(s=True, stream=True).each(lambda x: x):
        assert isinstance(feature, pybedtools.Interval)
def test_window_maker():
    """window_maker() on a.bed with w=50 yields the expected windows."""
    x = pybedtools.BedTool()
    a = pybedtools.example_bedtool('a.bed')
    result = x.window_maker(b=a.fn, w=50)
    # print() call form works under both Python 2 and 3
    print(result)
    assert result == fix("""
    chr1 1 51
    chr1 51 100
    chr1 100 150
    chr1 150 200
    chr1 150 200
    chr1 200 250
    chr1 250 300
    chr1 300 350
    chr1 350 400
    chr1 400 450
    chr1 450 500
    chr1 900 950
    """)
def test_copy():
    """
    copy.copy() of an Interval gives an equal but independent Interval.
    """
    # Before adding the __copy__ method to Interval class, making a copy would
    # hang and then segfault
    import copy

    original = pybedtools.example_bedtool('a.bed')[0]
    duplicate = copy.copy(original)

    assert duplicate.start == original.start
    assert duplicate.stop == original.stop
    assert duplicate.chrom == original.chrom
    assert duplicate.name == original.name
    assert duplicate.fields == original.fields
    assert duplicate.file_type == original.file_type == 'bed'

    # Make sure it's a real copy (changing something in the duplicate doesn't
    # change something in the original)
    duplicate.start += 1
    assert duplicate.start == original.start + 1
def test_venn_mpl():
    """
    compares output image to expected

    Returns without testing anything (after a note on stderr) when
    matplotlib cannot be imported.
    """
    try:
        import matplotlib
        # imread() lives in the `matplotlib.image` submodule, which a bare
        # `import matplotlib` does not load by itself
        import matplotlib.image
    except ImportError:
        import sys
        sys.stderr.write('Need matplotlib installed to test venn_mpl')
        return

    here = os.path.dirname(__file__)
    expected_fn = os.path.join(here, 'mpl-expected.png')

    # three overlapping subsets of the sorted, merged example file
    original = pybedtools.example_bedtool(
        'rmsk.hg18.chr21.small.bed').sort().merge()
    a = pybedtools.BedTool(original[:300]).saveas()
    b = pybedtools.BedTool(original[:20]).saveas().cat(
        pybedtools.BedTool(original[400:500]).saveas())
    c = pybedtools.BedTool(original[15:30]).saveas().cat(
        pybedtools.BedTool(original[450:650]).saveas())

    outfn = 'mplout.png'
    venn_mpl.venn_mpl(a=a.fn, b=b.fn, c=c.fn, colors=['r', 'b', 'g'],
                      outfn=outfn, labels=['a', 'b', 'c'])

    # On a different machine, the created image is not visibly different but is
    # numerically different. Not sure what a reasonable tolerance is, but this
    # seems to work for now....
    o = matplotlib.image.imread(outfn)
    e = matplotlib.image.imread(expected_fn)

    TOLERANCE = 200
    SUM = abs((o - e).sum())
    assert SUM < TOLERANCE, SUM

    os.unlink(outfn)
def test_issue_157():
    """
    A VCF round-tripped through a pandas DataFrame must intersect the same
    as the original VCF. Marked xfail when pandas cannot be imported.
    """
    # the problem here was that converting to file from dataframe didn't pass
    # through enough options to pandas.
    try:
        import pandas
    except ImportError:
        pytest.xfail("pandas not installed; skipping test")
    vcf = pybedtools.example_bedtool('1000genomes-example.vcf')
    bed = pybedtools.BedTool('20\t14300\t17000', from_string=True)
    non_dataframe = str(vcf.intersect(bed))
    df = vcf.to_dataframe(
        comment='#',
        names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
               'FORMAT', 'NA00001', 'NA00002', 'NA00003'])

    # collect the '#' header lines; the handle is closed as soon as it is read
    with open(vcf.fn) as vcf_lines:
        header = ''.join(line for line in vcf_lines if line.startswith('#'))

    outfile = pybedtools.BedTool._tmp()
    with open(outfile, 'w') as fout:
        fout.write(header)
        vcf_from_df = pybedtools.BedTool.from_dataframe(df, outfile=fout)

    # intersect only after the `with` block has closed (and flushed) the file
    from_dataframe = str(vcf_from_df.intersect(bed))
    assert non_dataframe == from_dataframe
def test_i_methods():
    """
    Generator that yields tests, inserting different versions of `i` as needed
    """
    for method, send_kwargs, expected in parse_yaml(config_fn):
        i_isbam = 'ibam' in send_kwargs
        if i_isbam:
            # resolve the BAM to a full path and also expose it as `i`
            send_kwargs['ibam'] = pybedtools.example_filename(
                send_kwargs['ibam'])
            send_kwargs['i'] = send_kwargs['ibam']

        # two-file (a/b) methods are not handled here
        if 'a' in send_kwargs and 'b' in send_kwargs:
            continue

        # nothing to vary if there is neither `i` nor `ibam`
        if not ('i' in send_kwargs or 'ibam' in send_kwargs):
            continue

        if 'files' in send_kwargs:
            send_kwargs['files'] = [
                pybedtools.example_filename(fn) for fn in send_kwargs['files']
            ]

        orig_i = pybedtools.example_bedtool(send_kwargs['i'])
        if orig_i._isbam:
            i_isbam = True

        # `i` is passed positionally to run(), so drop it from the kwargs
        del send_kwargs['i']

        for kind_i in ('filename', 'generator', 'stream', 'gzip'):
            # BAM input only supports a subset of the input kinds
            if i_isbam and kind_i not in supported_bam:
                continue
            converted = converter[kind_i](orig_i)
            kind = 'i=%s ibam=%s' % (kind_i, i_isbam)
            f = partial(run, method, converted, expected, **send_kwargs)
            f.description = '%s, %s, %s' % (method, kind, send_kwargs)
            yield (f, )
def test_stream_of_stream():
    """
    Second-level streaming using self-intersections
    """
    a = pybedtools.example_bedtool('a.bed')

    def first_level():
        # file-based and streamed self-intersection, in that order
        return a.intersect(a, u=True), a.intersect(a, u=True, stream=True)

    # Ensure non-stream and stream equality of self-intersection
    nonstream1, stream1 = first_level()
    nonstream1_str = str(nonstream1)
    stream1_str = str(stream1)
    a_str = str(a)
    assert nonstream1_str == stream1_str == a_str

    # Have to reconstruct stream1 cause it was consumed in the str() call
    nonstream1, stream1 = first_level()
    nonstream2_str = str(a.intersect(nonstream1, u=True))
    stream2_str = str(a.intersect(stream1, u=True, stream=True))
    assert (
        nonstream2_str == stream2_str == nonstream1_str == stream1_str == a_str
    )
def test_indexing():
    """
    Indexing into BedTools
    """
    a = pybedtools.example_bedtool('a.bed')

    # This is the first line
    interval = pybedtools.Interval('chr1', 1, 100, 'feature1', '0', '+')

    # just to make sure
    # (next() builtin instead of the Python 2-only .next() method)
    assert interval == next(iter(a))

    # test slice behavior
    results = list(a[0:2])
    assert len(results) == 2
    assert results[0] == interval

    # test single-integer indexing
    assert a[0] == interval

    # only slices and integers allowed....
    assert_raises(ValueError, a.__getitem__, 'key')
def test_bedtool_creation():
    """
    BedTools can be created from another BedTool (sharing its file) and from
    a string; a nonexistent filename raises ValueError.
    """
    # make sure we can make a bedtool from a bedtool and that it points to the
    # same file
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.BedTool(a)
    assert b.fn == a.fn
    assert_raises(ValueError, pybedtools.BedTool, 'nonexistent.bed')

    # note that *s* has both tabs and spaces....
    # (tabs are written as explicit \t escapes so they stay visible and
    # survive editors that convert whitespace)
    s = """
    chr1\t1\t100\tfeature1  0\t+
    chr1\t100\t200\tfeature2  0\t+
    chr1\t150\t500\tfeature3  0\t-
    chr1\t900\t950\tfeature4  0\t+
    """
    from_string = pybedtools.BedTool(s, from_string=True)

    # difflib used here to show a bug where a newline was included when using
    # from_string
    # (print() call form works under both Python 2 and 3)
    print(''.join(difflib.ndiff(str(from_string), str(a))))

    assert str(from_string) == str(a)
def test_bed_methods():
    """
    Generator that yields tests, inserting different versions of `bed` as needed
    """
    # methods taking any of these kwargs are not handled here
    ignored_kwargs = ('a', 'b', 'abam', 'i')
    # kwargs holding lists of example filenames to resolve to full paths
    filename_list_kwargs = ('files', 'bams')

    for method, send_kwargs, expected in parse_yaml(config_fn):
        if any(name in send_kwargs for name in ignored_kwargs):
            continue
        if 'bed' not in send_kwargs:
            continue

        for name in filename_list_kwargs:
            if name in send_kwargs:
                send_kwargs[name] = [
                    pybedtools.example_filename(fn)
                    for fn in send_kwargs[name]
                ]

        if 'fi' in send_kwargs:
            send_kwargs['fi'] = pybedtools.example_filename(send_kwargs['fi'])

        orig_bed = pybedtools.example_bedtool(send_kwargs['bed'])

        # `bed` is passed positionally to run(), so drop it from the kwargs
        del send_kwargs['bed']

        for kind_bed in ('filename', 'generator', 'stream', 'gzip'):
            converted = converter[kind_bed](orig_bed)
            kind = 'i=%s' % (kind_bed,)
            f = partial(run, method, converted, expected, **send_kwargs)
            f.description = '%s, %s, %s' % (method, kind, send_kwargs)
            yield (f, )
def test_venn_gchart_data_is_correct():
    """
    venn_gchart() on three overlapping subsets must return the expected
    value for every key in the expected-data dict.
    """
    original = pybedtools.example_bedtool("rmsk.hg18.chr21.small.bed").sort().merge()

    def saved(features):
        # a file-backed BedTool made from a slice of `original`
        return pybedtools.BedTool(features).saveas()

    a = saved(original[:300])
    b = saved(original[:20]).cat(saved(original[400:500]))
    c = saved(original[15:30]).cat(saved(original[450:650]))

    colors = "00FF00,FF0000,0000FF"
    labels = "a,b,c"

    expected_data = {
        "chco": "00FF00,FF0000,0000FF",
        "chd": "t:1.0,0.4,0.7167,0.0667,0.05,0.1833,0.0167",
        "chs": "300x300",
        "cht": "v",
        "chdl": "a|b|c",
    }

    data = venn_gchart.venn_gchart(
        a=a.fn,
        b=b.fn,
        c=c.fn,
        colors=colors.split(","),
        labels=labels.split(","),
        size="300x300",
    )

    # only the keys listed in expected_data are checked
    for key, wanted in expected_data.items():
        assert wanted == data[key]
def read_data(bed_file, fasta_file):
    """
    Extract the sequences of the intervals in `bed_file` from `fasta_file`.

    Both names are resolved with pybedtools' example-file helpers. The FASTA
    output of ``BedTool.sequence`` is read as alternating header/sequence
    lines; every second line is kept, upper-cased.

    Parameters
    ----------
    bed_file : str
        BED file name, e.g.
        '/home/h5li/methylation_DMR/data/DMR_coordinates_extended_b500.bed'.
    fasta_file : str
        FASTA file name, e.g. '/home/h5li/methylation_DMR/data/mm10.fasta'.

    Returns
    -------
    tuple
        ``(DNA_seq, n, m)``: the list of upper-cased sequences, the longest
        sequence length and the shortest sequence length. Both lengths are 0
        when there are no sequences.
    """
    # apply bedtools to read fasta files
    a = pybedtools.example_bedtool(bed_file)
    fasta = pybedtools.example_filename(fasta_file)
    a = a.sequence(fi=fasta)
    with open(a.seqfn) as handle:
        seq = handle.read()

    # read and extract DNA sequences
    DNA_seq_list = seq.split('\n')
    # drop the element after the final newline
    DNA_seq_list.pop()

    # sequences sit on every second line, after their header; a trailing
    # unpaired line is ignored
    n_records = len(DNA_seq_list) // 2
    DNA_seq = [line.upper() for line in DNA_seq_list[1:2 * n_records:2]]

    # Use the real min/max rather than a fixed starting value of 10000,
    # which under-reported the shortest length for long sequences.
    lengths = [len(sequence) for sequence in DNA_seq]
    m = min(lengths) if lengths else 0
    n = max(lengths) if lengths else 0

    print('The shortest length of DNA sequence is {0}bp'.format(m))
    print('The longest length of DNA sequence is {0}bp'.format(n))
    print('Total Number of input sequence is {0}'.format(len(DNA_seq)))
    return DNA_seq, n, m
def test_add_color():
    """
    featurefuncs.add_color appends thickStart, thickEnd and an RGB triple
    based on each feature's score. Returns early (after a note on stdout)
    when matplotlib cannot be imported.
    """
    try:
        from matplotlib import cm
    except ImportError:
        # print() call form works under both Python 2 and 3
        print("matplotlib not installed; skipping test_add_color")
        return

    def modify_scores(f):
        # use each feature's stop coordinate as its score
        fields = f.fields
        fields[4] = str(f[2])
        return pybedtools.create_interval_from_list(fields)

    a = pybedtools.example_bedtool('a.bed')
    a = a.each(modify_scores).saveas()
    cmap = cm.jet
    norm = a.colormap_normalize()
    results = str(a.each(featurefuncs.add_color, cmap=cmap, norm=norm))
    print(results)
    assert results == fix("""
    chr1 1 100 feature1 100 + 1 100 0,0,127
    chr1 100 200 feature2 200 + 100 200 0,0,255
    chr1 150 500 feature3 500 - 150 500 99,255,147
    chr1 900 950 feature4 950 + 900 950 127,0,0""")
def test_tabix():
    """
    A tabixed copy of a.bed can be queried by region string and by Interval.
    The generated .gz and .tbi files are removed at the end.
    """
    a = pybedtools.example_bedtool('a.bed')
    t = a.tabix()
    assert t._tabixed()
    results = str(t.tabix_intervals('chr1:99-200'))
    # print() call form works under both Python 2 and 3
    print(results)
    assert results == fix("""
    chr1 1 100 feature1 0 +
    chr1 100 200 feature2 0 +
    chr1 150 500 feature3 0 -""")

    # querying with an Interval instead of a region string
    assert str(t.tabix_intervals(a[2])) == fix("""
    chr1 100 200 feature2 0 +
    chr1 150 500 feature3 0 -""")

    # clean up
    fns = [
        pybedtools.example_filename('a.bed.gz'),
        pybedtools.example_filename('a.bed.gz.tbi'),
    ]
    for fn in fns:
        if os.path.exists(fn):
            os.unlink(fn)
def test_gff2bed():
    """
    featurefuncs.gff2bed with different `name_field` values: an attribute
    key, the default, a nonexistent key, and an integer field index.
    """
    a = pybedtools.example_bedtool('d.gff')

    # name taken from the 'Parent' attribute
    results = str(a.each(featurefuncs.gff2bed, name_field='Parent'))
    assert results == fix("""
    chr1 49 300 . . +
    chr1 49 300 gene1 . +
    chr1 74 150 mRNA1 . +
    chr1 199 275 mRNA1 . +
    chr1 1199 1275 . . +""")

    # default name_field
    results = str(a.each(featurefuncs.gff2bed))
    assert results == fix("""
    chr1 49 300 gene1 . +
    chr1 49 300 mRNA1 . +
    chr1 74 150 CDS1 . +
    chr1 199 275 CDS2 . +
    chr1 1199 1275 rRNA1 . +
    """)

    # an attribute that no feature has
    results = str(a.each(featurefuncs.gff2bed, name_field="nonexistent"))
    assert results == fix("""
    chr1 49 300 . . +
    chr1 49 300 . . +
    chr1 74 150 . . +
    chr1 199 275 . . +
    chr1 1199 1275 . . +
    """)

    # an integer name_field
    results = str(a.each(featurefuncs.gff2bed, name_field=1))
    # print() call form works under both Python 2 and 3
    print(results)
    assert results == fix("""
    chr1 49 300 fake . +
    chr1 49 300 fake . +
    chr1 74 150 fake . +
    chr1 199 275 fake . +
    chr1 1199 1275 fake . +""")
def circle_coverage_profile(bamfile, bedfile, exon_ind, split_character,
                            platform):
    """
    Collect per-position coverage of `bamfile` over `bedfile`, grouped by
    transcript and exon.

    Coverage is computed with ``coverage(bedfile, d=True, split=True)``. The
    transcript and exon of each output row come from its name field (column
    4), split on `split_character`.

    Parameters
    ----------
    bamfile : str
        Name passed to ``pybedtools.example_bedtool``.
    bedfile : str
        Intervals passed to ``coverage``.
    exon_ind : int
        Index, within the split name field, of the exon number.
    split_character : str
        Separator used inside the name field.
    platform : str
        'refseq' (transcript = first two name parts joined by
        `split_character`) or 'ensembl' (transcript = first name part). Any
        other value files everything under transcript 'NA' and prints a
        warning once.

    Returns
    -------
    dict
        ``{transcript: {exon: info}}`` where ``info`` holds 'chromosome',
        'start' and 'end' (from the first row seen for that exon) plus the
        parallel lists 'relative_positions' (column 7) and
        'position_coverage' (column 8).
    """
    x = pybedtools.example_bedtool(bamfile)
    y = x.coverage(bedfile, d=True, split=True)
    transcriptwise_coverage = {}
    warned = False
    for position in y:
        name_parts = position[3].split(split_character)
        if platform == 'refseq':
            transcript = split_character.join(name_parts[0:2])
        elif platform == 'ensembl':
            transcript = name_parts[0]
        else:
            transcript = 'NA'
            # warn once, not once per covered position
            if not warned:
                print(
                    'you are using an unknown annotation platform, please use refseq or ensembl like formats'
                )
                warned = True
        exon = int(name_parts[exon_ind])
        exons = transcriptwise_coverage.setdefault(transcript, {})
        if exon not in exons:
            exons[exon] = {
                'relative_positions': [],
                'position_coverage': [],
                'chromosome': position[0],
                'start': position[1],
                'end': position[2]
            }
        exons[exon]['position_coverage'].append(position[7])
        exons[exon]['relative_positions'].append(position[6])
    return transcriptwise_coverage
def test_issue_181():
    """
    Querying a tabixed file for an unknown contig is accepted by default but
    raises ValueError with check_coordinates=True.
    """
    region = "none:1-5"
    tabixed = pybedtools.example_bedtool("a.bed").tabix(force=True)

    # default: must simply not raise
    tabixed.tabix_intervals(region)

    with pytest.raises(ValueError):
        tabixed.tabix_intervals(region, check_coordinates=True)
def test_cut():
    """cut() with four column indexes gives a four-field BedTool."""
    columns = [0, 1, 2, 4]
    trimmed = pybedtools.example_bedtool('a.bed').cut(columns)
    assert trimmed.field_count() == 4, trimmed
def test_filter():
    """filter() keeps the two a.bed features with 0 < length < 100."""

    def is_short(feature):
        return 0 < feature.length < 100

    kept = pybedtools.example_bedtool('a.bed').filter(is_short)
    assert len(kept) == 2
def test_name():
    """The first feature of c.gff must have the expected name."""
    # next() builtin instead of the Python 2-only .next() method
    c = next(iter(pybedtools.example_bedtool('c.gff')))
    assert c.name == "thaliana_1_465_805", c.name
def test_kwargs():
    """Passing s=False to intersect() must equal omitting the kwarg."""
    a = pybedtools.example_bedtool('a.bed')
    with_false_flag = a.intersect(a, s=False)
    without_flag = a.intersect(a)
    assert str(with_false_flag) == str(without_flag)
def test_bam_bedtool_creation():
    """_isbam is set for a BAM-backed BedTool and unset for a BED one."""
    bam = pybedtools.example_bedtool('x.bam')
    bed = pybedtools.example_bedtool('a.bed')
    assert bam._isbam
    assert not bed._isbam
def test_bam_iter():
    """
    Indexing and iterating a BAM-backed BedTool must give the same first
    record, rendered as one tab-separated line.
    """
    x = pybedtools.example_bedtool('gdc.bam')
    # Fields are tab-separated; explicit \t escapes keep the separators from
    # being lost to whitespace conversion.
    s = 'None\t0\tchr2L\t11\t255\t5M\t*\t0\t0\tCGACA\tIIIII\tNM:i:0\tNH:i:1\n'
    # next() builtin instead of the Python 2-only .next() method
    assert str(x[0]) == str(next(iter(x))) == s
def test_sam_filetype():
    """
    A BedTool saved from a generator over BAM features must report file_type
    'sam'.
    """
    # file_type was segfaulting cause IntervalFile couldn't parse SAM
    bam = pybedtools.example_bedtool('gdc.bam')
    features = (feature for feature in bam)
    saved = pybedtools.BedTool(features).saveas()
    assert saved.file_type == 'sam'
def test_PR_158():
    """Printing the issue_121.bam BedTool must not raise."""
    # See #121 for original, #122 for follow-up, and #158 for fix.
    #
    # This used to crash with "OverflowError: can't convert negative value to CHRPOS"
    bam = pybedtools.example_bedtool("issue_121.bam")
    print(bam)
def test_issue_169():
    """
    bgzip() of an unsorted VCF must produce a file whose first line is
    still a '#' header line.
    """
    x = pybedtools.example_bedtool("1000genomes-example.vcf")
    fn = x.bgzip(is_sorted=False, force=True)
    # close the gzip handle as soon as the first line has been read
    with gzip.open(fn, "rt") as handle:
        line = handle.readline()
    assert str(line).startswith("#"), line
def test_issue_180():
    """tabix_contigs() on tabixed a.bed must list only chr1."""
    tabixed = pybedtools.example_bedtool("a.bed").tabix(force=True)
    contigs = tabixed.tabix_contigs()
    assert contigs == ["chr1"]
def test_issue_203():
    """truncate_to_chrom() with a genome name must run without raising."""
    bed = pybedtools.example_bedtool("x.bed")
    bed.truncate_to_chrom(genome="hg19")
def pybedtoolsmerge(self, filteringclonescountoutput):
    """
    Merge the intervals of `filteringclonescountoutput` and move the result
    to ``<self.out>_blastnclonesmerge.bed``.

    Stores the loaded BedTool on ``self.input`` and the moved result on
    ``self.output``; returns the destination path.
    """
    self.input = pybedtools.example_bedtool(filteringclonescountoutput)
    merged = self.input.merge()
    destination = self.out + '_blastnclonesmerge.bed'
    self.output = merged.moveto(destination)
    return destination