Example #1
0
def test_stream():
    """
    Stream and file-based equality, both whole-file and Interval by
    Interval.

    While streaming, the temp dir is pointed at a directory whose write
    permission has been removed; streamed results are not expected to be
    written anywhere.  The original temp dir is always restored and the
    scratch directory removed, even when an assertion fails.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)

    # make an unwriteable dir...
    orig_tempdir = pybedtools.get_tempdir()
    if os.path.exists('unwriteable'):
        os.system('rm -rf unwriteable')
    os.system('mkdir unwriteable')
    os.system('chmod -w unwriteable')

    try:
        # ...set that to the new tempdir
        pybedtools.set_tempdir('unwriteable')

        # this should really not be written anywhere
        d = a.intersect(b, stream=True)

        assert_raises(NotImplementedError, c.__eq__, d)
        d_contents = d.fn.read()
        # close the file-based result explicitly instead of leaking the handle
        with open(c.fn) as handle:
            c_contents = handle.read()
        assert d_contents == c_contents

        # reconstruct d and check Interval-by-Interval equality
        pybedtools.set_tempdir('unwriteable')
        d = a.intersect(b, stream=True)

        for i, j in zip(c, d):
            assert str(i) == str(j)

        # Now do something similar with GFF files.
        a = pybedtools.example_bedtool('a.bed')
        f = pybedtools.example_bedtool('d.gff')

        # file-based
        pybedtools.set_tempdir(orig_tempdir)
        g1 = f.intersect(a)

        # streaming
        pybedtools.set_tempdir('unwriteable')
        g2 = f.intersect(a, stream=True)

        for i, j in zip(g1, g2):
            assert str(i) == str(j)

        # this was segfaulting at one point, just run to make sure
        g3 = f.intersect(a, stream=True)
        for i in iter(g3):
            # single-argument print() behaves the same on Python 2 and 3
            print(i)

        for row in f.cut(range(3), stream=True):
            row[0], row[1], row[2]
            assert_raises(IndexError, row.__getitem__, 3)
    finally:
        # never leave the process pointed at the scratch dir
        pybedtools.set_tempdir(orig_tempdir)
        os.system('rm -fr unwriteable')
Example #2
0
def test_tail():
    """
    tail() yields the last 10 lines by default, the last N when asked, and
    the whole file when the file is short.
    """
    rmsk = pybedtools.example_bedtool('rmsk.hg18.chr21.small.bed')

    # default number of lines
    got = rmsk.tail(as_string=True)
    want = fix(
        """
        chr21	13355834	13356047	MER58A	892	-
        chr21	13356250	13356290	AT_rich	26	+
        chr21	13356358	13356381	AT_rich	23	+
        chr21	13356571	13356910	L2	333	-
        chr21	13357179	13357987	L1MEc	1264	-
        chr21	13358003	13358300	L1MEc	379	-
        chr21	13358304	13358952	L1MEc	1271	-
        chr21	13358960	13359288	L2	336	+
        chr21	13359444	13359751	AluY	2337	+
        chr21	13360044	13360225	L1M5	284	-""")
    assert got == want

    # explicitly request the last 3 lines
    got = rmsk.tail(3, as_string=True)
    want = fix(
        """
        chr21	13358960	13359288	L2	336	+
        chr21	13359444	13359751	AluY	2337	+
        chr21	13360044	13360225	L1M5	284	-""")
    assert got == want

    # a file shorter than the default comes back in full
    small = pybedtools.example_bedtool('a.bed')
    whole_file = str(small)
    tail_text = small.tail(as_string=True)
    assert tail_text == whole_file
Example #3
0
def test_cat():
    """
    cat() accepts either a BedTool or a filename and, with postmerge=False,
    keeps every feature of both inputs.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    b_fn = pybedtools.example_filename('b.bed')
    assert a.cat(b) == a.cat(b_fn)
    expected =  fix("""
    chr1 1   500
    chr1 800 950
    """)
    assert a.cat(b) == expected

    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.cat(b, postmerge=False)
    assert len(a) + len(b) == len(c), (len(a), len(b), len(c))

    # single-argument print() behaves the same on Python 2 and 3
    print(c)
    assert c == fix("""
    chr1	1	100	feature1	0	+
    chr1	100	200	feature2	0	+
    chr1	150	500	feature3	0	-
    chr1	900	950	feature4	0	+
    chr1	155	200	feature5	0	-
    chr1	800	901	feature6	0	+
    """)
Example #4
0
def test_history_step():
    """
    History tags can be looked up, and delete_temporary_history() removes
    only the intermediate file (c), whether or not confirmation is asked.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)
    d = c.subtract(a)

    tag = c.history[0].result_tag
    assert pybedtools.find_tagged(tag) == c

    assert_raises(ValueError, pybedtools.find_tagged, 'nonexistent')

    # single-argument print() behaves the same on Python 2 and 3
    print(d.history)

    # answering 'n' at the prompt must leave every file in place
    d.delete_temporary_history(ask=True, raw_input_func=lambda x: 'n')
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
    assert os.path.exists(c.fn)
    assert os.path.exists(d.fn)

    d.delete_temporary_history(ask=True, raw_input_func=lambda x: 'Yes')
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
    assert not os.path.exists(c.fn) # this is the only thing that should change
    assert os.path.exists(d.fn)

    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)
    d = c.subtract(a)
    d.delete_temporary_history(ask=False)
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
    assert not os.path.exists(c.fn) # this is the only thing that should change
    assert os.path.exists(d.fn)
Example #5
0
def test_tuple_creation():
    """
    A BedTool can be built from a list of tuples, with fields given as
    strings or as ints, for both BED-like and GFF-like rows.
    """
    # every field supplied as a string
    string_rows = [
        ("chr1", "1", "100", "feature1", "0", "+"),
        ("chr1", "100", "200", "feature2", "0", "+"),
        ("chr1", "150", "500", "feature3", "0", "-"),
        ("chr1", "900", "950", "feature4", "0", "+"),
    ]
    from_strings = pybedtools.BedTool(string_rows).saveas()
    reference = pybedtools.example_bedtool('a.bed')
    assert reference == from_strings

    # numeric fields supplied as ints
    mixed_rows = [
        ("chr1", 1, 100, "feature1", 0, "+"),
        ("chr1", 100, 200, "feature2", 0, "+"),
        ("chr1", 150, 500, "feature3", 0, "-"),
        ("chr1", 900, 950, "feature4", 0, "+"),
    ]
    from_mixed = pybedtools.BedTool(mixed_rows).saveas()
    reference = pybedtools.example_bedtool('a.bed')
    assert reference == from_mixed

    # nine-column GFF-style rows
    gff_rows = [
        ("chr1", "fake", "gene", "50", "300", ".", "+", ".", "ID=gene1"),
        ("chr1", "fake", "mRNA", "50", "300", ".", "+", ".", "ID=mRNA1;Parent=gene1;"),
        ("chr1", "fake", "CDS", "75", "150", ".", "+", ".", "ID=CDS1;Parent=mRNA1;"),
        ("chr1", "fake", "CDS", "200", "275", ".", "+", ".", "ID=CDS2;Parent=mRNA1;"),
        ("chr1", "fake", "rRNA", "1200", "1275", ".", "+", ".", "ID=rRNA1;"),
    ]
    from_gff = pybedtools.BedTool(gff_rows).saveas()

    # The saved result must hand back real Interval objects, not raw tuples
    assert isinstance(from_gff[0], pybedtools.Interval)
    assert repr(from_gff[0]) == "Interval(chr1:49-300)"
    assert from_gff[0]['ID'] == 'gene1'
Example #6
0
def test_bam_filetype():
    """Regression test: intersecting a BAM with a GFF yields a 'bam' result."""
    # this used to segfault because IntervalFile couldn't parse SAM
    bam = pybedtools.example_bedtool('gdc.bam')
    gff = pybedtools.example_bedtool('gdc.gff')
    intersected = bam.intersect(gff)
    assert intersected.file_type == 'bam'
Example #7
0
def test_issue_141():
    """
    The `+` operator with empty, malformed and BAM operands: empty operands
    give zero features, malformed input raises MalformedBedLineError.
    """
    bed_a = pybedtools.example_bedtool('a.bed')
    bed_b = pybedtools.example_bedtool('b.bed')

    # a BedTool with no content at all
    nothing = pybedtools.BedTool("", from_string=True)

    # content that is not a valid BED line
    broken = pybedtools.BedTool('a	a	a', from_string=True)

    # positive control; works
    bed_a + bed_b

    # whichever side the empty file is on, the result has zero features
    assert len(bed_a + nothing) == 0
    assert len(nothing + bed_a) == 0
    assert len(nothing + nothing) == 0

    # a malformed operand raises MalformedBedLineError
    # (an uncaught exception raised when trying to intersect)
    with pytest.raises(pybedtools.MalformedBedLineError):
        bed_a + broken

    bam = pybedtools.example_bedtool('x.bam')
    bam + bed_a
Example #8
0
def test_output_kwarg():
    """
    intersect(output=...) writes to the named file and yields the same
    result as the default temp-file call.

    The output file is removed even when the comparison fails, so a failing
    run does not leave "deleteme.bed" behind in the working directory.
    """
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    c = a.intersect(b)
    try:
        d = a.intersect(b, output="deleteme.bed")
        # the original relied on os.unlink() failing if the file was missing
        assert os.path.exists("deleteme.bed")
        assert c == d
    finally:
        if os.path.exists("deleteme.bed"):
            os.unlink("deleteme.bed")
Example #9
0
def test_output_kwarg():
    """
    intersect(output=...) writes to the named file and yields the same
    result as the default temp-file call.

    The output file is removed even when the comparison fails, so a failing
    run does not leave 'deleteme.bed' behind in the working directory.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a.intersect(b)
    try:
        d = a.intersect(b, output='deleteme.bed')
        # the original relied on os.unlink() failing if the file was missing
        assert os.path.exists('deleteme.bed')
        assert c == d
    finally:
        if os.path.exists('deleteme.bed'):
            os.unlink('deleteme.bed')
Example #10
0
def test_annotate_xstream():
    """Each add_xstream() call adds one field and keeps the feature count."""
    peaks = pybedtools.example_bedtool('m1.bed')
    genes = pybedtools.example_bedtool('mm9.bed12')

    # upstream annotation: one extra column, same number of features
    with_up = annotate.add_xstream(peaks, genes, dist=1000, updown="up")
    assert peaks.field_count() == with_up.field_count() - 1
    assert len(peaks) == len(with_up)

    # downstream annotation on top of that: two extra columns in total
    with_both = annotate.add_xstream(with_up, genes, dist=1000, updown="down")
    assert peaks.field_count() == with_both.field_count() - 2
Example #11
0
def test_jaccard():
    """jaccard() returns the expected statistics, streamed or not."""
    bed_a = pybedtools.example_bedtool('a.bed')

    file_based = bed_a.jaccard(pybedtools.example_bedtool('b.bed'))
    want = {'intersection': 46, 'union': 649, 'jaccard': 0.0708783, 'n_intersections': 2}
    assert file_based == want, file_based

    # the streaming call must report identical numbers
    streamed = bed_a.jaccard(pybedtools.example_bedtool('b.bed'), stream=True)
    assert file_based == streamed, streamed
Example #12
0
def test_random_intersection():
    """Currently disabled: returns immediately, so the body below never runs."""
    # TODO:
    return
    iterations = 4
    bed_a = pybedtools.example_bedtool('a.bed')
    bed_b = pybedtools.example_bedtool('b.bed')
    counts = list(bed_a.randomintersection(bed_b, iterations))
    assert len(counts) == iterations, counts
Example #13
0
def test_annotate_closest():
    """add_closest() keeps the feature count and appends two fields."""
    peaks = pybedtools.example_bedtool('m1.bed')
    genes = pybedtools.example_bedtool('mm9.bed12')
    annotated = annotate.add_closest(peaks, genes)

    assert len(peaks) == len(annotated), (len(peaks), len(annotated), str(annotated))
    assert peaks.field_count() == annotated.field_count() - 2

    # in this test-case, the final column should be exon;intron
    # since m1 completely contains both an exon and an intron.
    # (the first feature is fetched but nothing is asserted about it here)
    first = next(iter(annotated))
Example #14
0
def run(d):
    """
    Execute one configured BedTool method call and check its string output.

    `d` is a mapping with the keys read below:

    * 'method'    -- name of the BedTool method to call
    * 'bedtool'   -- example filename the method is called on
    * 'convert'   -- maps 'bedtool' (and any kwarg names) to a key of the
                     module-level `converters` table
    * 'kw'        -- keyword arguments for the method
    * 'test_case' -- mapping whose 'expected' entry is the expected output

    `d` itself is left untouched: both 'kw' and 'convert' are copied before
    being modified, so the same configuration can be run more than once.

    On a mismatch, diagnostics (the call, both texts, and diffs with and
    without visible tabs/newlines) are printed and the AssertionError is
    re-raised.
    """
    method = d['method']
    bedtool = d['bedtool']
    # Copy before popping; popping from d['convert'] directly would make a
    # second run() with the same `d` fail with KeyError.
    convert = dict(d['convert'])
    kwargs = d['kw'].copy()
    expected = d['test_case']['expected']

    def show_diff(got, want):
        # keepends=1 so every diff line carries its own newline
        for line in difflib.unified_diff(got.splitlines(1),
                                         want.splitlines(1)):
            print(line, end=' ')

    bedtool_converter = convert.pop('bedtool')
    bedtool = (
        converters[bedtool_converter](pybedtools.example_bedtool(bedtool))
    )

    # the remaining entries name kwargs whose values are example filenames
    for k, converter_name in convert.items():
        kwargs[k] = (
            converters[converter_name](pybedtools.example_bedtool(kwargs[k]))
        )
    result = getattr(bedtool, method)(**kwargs)
    res = str(result)
    expected = fix(expected)
    try:
        assert res == expected

    except AssertionError:
        print(result.fn)
        print('Method call:')
        args = ', '.join(
            ('%s=%s' % (key, val)).strip() for key, val in kwargs.items()
        )
        print('BedTool.%s(%s)' % (method, args))
        print('Got:')
        print(res)
        print('Expected:')
        print(expected)
        print('Diff:')
        show_diff(res, expected)

        # Make tabs and newlines visible
        spec_res = res.replace('\t', '\\t').replace('\n', '\\n\n')
        spec_expected = expected.replace('\t', '\\t').replace('\n', '\\n\n')

        print('Showing special characters:')
        print('Got:')
        print(spec_res)
        print('Expected:')
        print(spec_expected)
        print('Diff:')
        show_diff(spec_res, spec_expected)
        raise
Example #15
0
def test_many_files():
    """regression test to make sure many files can be created
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    # Previously, IntervalFile would leak open files and would cause OSError
    # (too many open files) at iteration 1010 or so.
    # range() instead of the Python-2-only xrange(); 1100 items is small
    # enough that the list built on Python 2 does not matter.
    for i in range(1100):
        c = a.intersect(b)
Example #16
0
def test_issue_147():
    """
    Intersecting the VCF/BED example pair must not raise while sys.stderr is
    redirected to a temp file.

    sys.stderr is restored in a `finally` clause, before the temp file is
    closed, so a failure cannot leave the process writing to a closed file.
    """
    # previously this would raise BEDToolsError because of unexpected stderr.
    orig_stderr = sys.stderr
    with open(pybedtools.BedTool._tmp(), 'w') as tmp:
        sys.stderr = tmp
        try:
            v = pybedtools.example_bedtool('vcf-stderr-test.vcf')
            b = pybedtools.example_bedtool('vcf-stderr-test.bed')
            v.intersect(b)
        finally:
            sys.stderr = orig_stderr
Example #17
0
def test_chromsizes_in_5prime_3prime():
    """
    five_prime / three_prime results for a.bed under several `genome`
    chromsizes settings, compared against the expected coordinates.
    """
    # standard 5'
    five_hg19 = pybedtools.example_bedtool('a.bed').each(
        featurefuncs.five_prime, 1, 10, add_to_name="_TSS",
        genome=pybedtools.chromsizes("hg19")
    ).saveas()
    want = fix(
        """
        chr1	0	11	feature1_TSS	0	+
        chr1	99	110	feature2_TSS	0	+
        chr1	490	501	feature3_TSS	0	-
        chr1	899	910	feature4_TSS	0	+
        """)
    assert five_hg19 == want, str(five_hg19)

    # add genomes sizes; last feature should be truncated
    five_short = pybedtools.example_bedtool('a.bed').each(
        featurefuncs.five_prime, 1, 10, add_to_name="_TSS",
        genome=dict(chr1=(0, 900))
    ).saveas()
    want = fix(
        """
        chr1	0	11	feature1_TSS	0	+
        chr1	99	110	feature2_TSS	0	+
        chr1	490	501	feature3_TSS	0	-
        chr1	899	900	feature4_TSS	0	+
        """)
    assert five_short == want, str(five_short)

    # same thing but for 3'.
    # Note that the last feature chr1:949-960 is completely truncated because
    # it would entirely fall outside of the chromosome
    three_short = pybedtools.example_bedtool('a.bed').each(
        featurefuncs.three_prime, 1, 10, add_to_name="_TSS",
        genome=dict(chr1=(0, 900))
    ).saveas()
    want = fix(
        """
        chr1	99	110	feature1_TSS	0	+
        chr1	199	210	feature2_TSS	0	+
        chr1	140	151	feature3_TSS	0	-
        chr1	900	900	feature4_TSS	0	+
        """)
    assert three_short == want, str(three_short)

    # be a lot harsher with the chromsizes to ensure features on both strands
    # get truncated correctly
    three_tiny = pybedtools.example_bedtool('a.bed').each(
        featurefuncs.three_prime, 1, 10, add_to_name="_TSS",
        genome=dict(chr1=(0, 120))
    ).saveas()
    want = fix(
        """
        chr1	99	110	feature1_TSS	0	+
        chr1	120	120	feature2_TSS	0	+
        chr1	120	120	feature3_TSS	0	-
        chr1	120	120	feature4_TSS	0	+
        """)
    assert three_tiny == want, str(three_tiny)
def test_gzipped_files_can_be_intersected():
    """Every plain/gzipped pairing of a.bed and b.bed intersects identically."""
    gz_a = pybedtools.BedTool(
        _make_temporary_gzip(pybedtools.example_filename('a.bed')))
    gz_b = pybedtools.BedTool(
        _make_temporary_gzip(pybedtools.example_filename('b.bed')))

    plain_a = pybedtools.example_bedtool('a.bed')
    plain_b = pybedtools.example_bedtool('b.bed')

    # plain/plain == gz/gz == plain/gz == gz/plain
    assert (
        plain_a.intersect(plain_b)
        == gz_a.intersect(gz_b)
        == plain_a.intersect(gz_b)
        == gz_a.intersect(plain_b)
    )
Example #19
0
def test_repr_and_printing():
    """
    repr() names the backing file and reports "MISSING FILE" once that file
    has been deleted; printing head() output must not fail.
    """
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    c = a + b
    # delete the temp file behind c so its repr has nothing to point at
    os.unlink(c.fn)
    assert "a.bed" in repr(a)
    assert "b.bed" in repr(b)
    assert "MISSING FILE" in repr(c)

    # single-argument print() behaves the same on Python 2 and 3
    print(a.head(1))
Example #20
0
def test_repr_and_printing():
    """
    repr() names the backing file and reports 'MISSING FILE' once that file
    has been deleted; printing head() output must not fail.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    c = a+b
    # delete the temp file behind c so its repr has nothing to point at
    os.unlink(c.fn)
    assert 'a.bed' in repr(a)
    assert 'b.bed' in repr(b)
    assert 'MISSING FILE' in repr(c)

    # single-argument print() behaves the same on Python 2 and 3
    print(a.head(1))
Example #21
0
def test_issue_118():
    """Repeated intersect + field_count() must not change the open-fd count."""
    proc = psutil.Process(os.getpid())
    fds_before = proc.num_fds()

    bed_a = pybedtools.example_bedtool('a.bed')
    bed_b = pybedtools.example_bedtool('b.bed')
    for _ in range(100):
        hits = bed_a.intersect(bed_b)
        hits.field_count()

    fds_after = proc.num_fds()
    assert fds_before == fds_after
Example #22
0
def test_stream_of_generator():
    """
    A streamed self-intersection gives the same text whether the left-hand
    BedTool is file-based or built from a generator of its intervals.
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    b1 = a.intersect(a, stream=True)
    b2 = pybedtools.BedTool((i for i in a)).intersect(a, stream=True)
    sb1 = str(b1)
    sb2 = str(b2)
    # single-argument print() behaves the same on Python 2 and 3
    print(sb1)
    print(sb2)
    assert sb1 == sb2
Example #23
0
def test_reldist():
    """
    reldist() returns the expected summary dict, and the expected per-feature
    table when detail=True.
    """
    x = pybedtools.example_bedtool('a.bed')
    results = x.reldist(pybedtools.example_bedtool('b.bed'))
    assert results == {'reldist': [0.15, 0.21, 0.28], 'count': [1, 1, 1], 'total': [3, 3, 3], 'fraction': [0.333, 0.333, 0.333]}, results

    results2 = x.reldist(pybedtools.example_bedtool('b.bed'), detail=True)
    # single-argument print() behaves the same on Python 2 and 3
    print(results2)
    assert results2 == fix("""
    chr1	1	100	feature1	0	+	0.282
    chr1	100	200	feature2	0	+	0.153
    chr1	150	500	feature3	0	-	0.220""")
Example #24
0
def test_gzip():
    """
    Gzipped copies of a.bed and b.bed are detected as "bed" and intersect
    identically to the plain files in every plain/gzipped pairing.

    The copies are made by running `gzip -c` with an argument list (no
    shell), so paths containing spaces or shell metacharacters work and a
    failing gzip aborts the test instead of being silently ignored.
    """
    import subprocess

    def gzipped_copy(src):
        # write `gzip -c src` into a fresh temp file and return its path
        dest = pybedtools.BedTool._tmp()
        with open(dest, "wb") as fout:
            subprocess.check_call(["gzip", "-c", src], stdout=fout)
        return dest

    # make new gzipped files on the fly
    agz = gzipped_copy(pybedtools.example_filename("a.bed"))
    bgz = gzipped_copy(pybedtools.example_filename("b.bed"))
    agz = pybedtools.BedTool(agz)
    bgz = pybedtools.BedTool(bgz)
    assert agz.file_type == bgz.file_type == "bed"
    a = pybedtools.example_bedtool("a.bed")
    b = pybedtools.example_bedtool("b.bed")
    assert a.intersect(b) == agz.intersect(bgz) == a.intersect(bgz) == agz.intersect(b)
Example #25
0
def test_gzip():
    """
    Gzipped copies of a.bed and b.bed are detected as 'bed' and intersect
    identically to the plain files in every plain/gzipped pairing.

    The copies are made by running `gzip -c` with an argument list (no
    shell), so paths containing spaces or shell metacharacters work and a
    failing gzip aborts the test instead of being silently ignored.
    """
    import subprocess

    def gzipped_copy(src):
        # write `gzip -c src` into a fresh temp file and return its path
        dest = pybedtools.BedTool._tmp()
        with open(dest, 'wb') as fout:
            subprocess.check_call(['gzip', '-c', src], stdout=fout)
        return dest

    # make new gzipped files on the fly
    agz = gzipped_copy(pybedtools.example_filename('a.bed'))
    bgz = gzipped_copy(pybedtools.example_filename('b.bed'))
    agz = pybedtools.BedTool(agz)
    bgz = pybedtools.BedTool(bgz)
    assert agz.file_type == bgz.file_type == 'bed'
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('b.bed')
    assert a.intersect(b) == agz.intersect(bgz) == a.intersect(bgz) == agz.intersect(b)
Example #26
0
def test_bam_stream_bam():
    """Streamed BAM-vs-GFF intersection (u=True) prints the expected records."""
    bam = pybedtools.example_bedtool('gdc.bam')
    gff = pybedtools.example_bedtool('gdc.gff')
    streamed = bam.intersect(gff, u=True, stream=True)
    want = fix("""
    None	16	chr2L	71	255	5M	*	0	0	TTCTC	IIIII	NM:i:0	NH:i:1
    None	16	chr2L	141	255	5M	*	0	0	CACCA	IIIII	NM:i:0	NH:i:1
    None	16	chr2L	151	255	5M	*	0	0	GTTCA	IIIII	NM:i:0	NH:i:1
    None	0	chr2L	211	255	5M	*	0	0	AAATA	IIIII	NM:i:0	NH:i:1
    None	0	chr2L	71	255	5M	*	0	0	GAGAA	IIIII	NM:i:0	NH:i:1
    None	0	chr2L	141	255	5M	*	0	0	TGGTG	IIIII	NM:i:0	NH:i:1
    None	0	chr2L	161	255	5M	*	0	0	GATAA	IIIII	NM:i:0	NH:i:1""")
    got = str(streamed)
    assert got == want
Example #27
0
def test_repr_and_printing():
    """
    repr() must flag a deleted backing file and a streaming BedTool
    """
    bed_a = pybedtools.example_bedtool('a.bed')
    bed_b = pybedtools.example_bedtool('b.bed')
    combined = bed_a + bed_b
    streamed = bed_a.intersect(bed_b, stream=True)

    # remove the file behind `combined` before looking at any repr
    os.unlink(combined.fn)

    for text, bedtool in (
        ('a.bed', bed_a),
        ('b.bed', bed_b),
        ('MISSING FILE', combined),
        ('stream', streamed),
    ):
        assert text in repr(bedtool)
Example #28
0
def test_create_from_list_long_features():
    """
    Iterator handles extra fields from long features (BED+GFF -wao intersection)
    """
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.example_bedtool('c.gff')
    c = a.intersect(b, wao=True, stream=False)
    d = a.intersect(b, wao=True, stream=True)

    # single-argument print() behaves the same on Python 2 and 3
    print(b.closest(a))

    # iterating the streamed result is the actual check: it must not raise
    for i in d:
        print(i)
Example #29
0
def test_randomstats():
    """randomstats() on a.bed vs b.bed reports the expected summary values."""
    sizes = {"chr1": (1, 1000)}
    bed_a = pybedtools.example_bedtool("a.bed").set_chromsizes(sizes)
    bed_b = pybedtools.example_bedtool("b.bed")
    try:
        stats = bed_a.randomstats(bed_b, 100, debug=True)
        for key, value in (
            ("actual", 3),
            ("median randomized", 2.0),
            ("percentile", 90.0),
        ):
            assert stats[key] == value

    except ImportError:
        # allow doctests to pass if SciPy not installed
        message = "SciPy not installed, so not testing BedTool.randomstats()."
        sys.stderr.write(message)
Example #30
0
def test_repr_and_printing():
    """
    repr() must flag a deleted backing file and a streaming BedTool
    """
    bed_a = pybedtools.example_bedtool("a.bed")
    bed_b = pybedtools.example_bedtool("b.bed")
    combined = bed_a + bed_b
    streamed = bed_a.intersect(bed_b, stream=True)

    # remove the file behind `combined` before looking at any repr
    os.unlink(combined.fn)

    for text, bedtool in (
        ("a.bed", bed_a),
        ("b.bed", bed_b),
        ("MISSING FILE", combined),
        ("stream", streamed),
    ):
        assert text in repr(bedtool)
Example #31
0
def _classifier():
    """
    Classify gdc.bed against gdc.gff and check the Classifier's three result
    tables: class_counts, feature_classes and class_features.
    """
    clf = Classifier(bed=pybedtools.example_filename('gdc.bed'),
                     annotations=pybedtools.example_filename('gdc.gff'))
    clf.classify()

    bed = pybedtools.example_bedtool('gdc.bed')

    # number of features that fall into each combination of classes
    expected_counts = {
        frozenset(['UTR', 'exon', 'mRNA', 'CDS', 'tRNA', 'gene']): 1,
        frozenset(['intron', 'gene', 'mRNA']): 3,
        frozenset(): 1,
        frozenset(['gene', 'exon', 'mRNA', 'CDS']): 2,
        frozenset(['exon', 'mRNA', 'CDS', 'tRNA', 'intron', 'gene']): 1,
    }
    assert clf.class_counts == expected_counts

    # classes assigned to each individual feature
    expected_classes = {
        bed[0]: {'.'},
        bed[1]: {'gene', 'exon', 'mRNA', 'CDS'},
        bed[2]: {'intron', 'gene', 'mRNA'},
        bed[3]: {'intron', 'gene', 'mRNA'},
        bed[4]: {'tRNA', 'UTR', 'exon', 'mRNA', 'CDS', 'gene'},
        bed[5]: {'gene', 'exon', 'mRNA', 'CDS'},
        bed[6]: {'intron', 'gene', 'mRNA'},
        bed[7]: {'tRNA', 'intron', 'exon', 'mRNA', 'CDS', 'gene'},
    }
    assert clf.feature_classes == expected_classes

    print('use these indexes for debugging')
    for index, feature in enumerate(bed):
        print(index, feature)

    for class_key, members in clf.class_features.items():
        print(class_key)
        for member in members:
            print('\t' + str(member))

    # features grouped under each combination of classes
    expected_features = {
        frozenset(): [bed[0]],
        frozenset(['intron', 'gene', 'mRNA']): [bed[6], bed[2], bed[3]],
        frozenset(['gene', 'exon', 'mRNA', 'CDS']): [bed[5], bed[1]],
        frozenset(['UTR', 'exon', 'mRNA', 'CDS', 'tRNA', 'gene']): [bed[4]],
        frozenset(['exon', 'mRNA', 'CDS', 'tRNA', 'intron', 'gene']): [bed[7]],
    }
    assert clf.class_features == expected_features
Example #32
0
def test_issue_143():
    """
    Streamed merge() results yield Interval objects however they are
    iterated, and each() on them matches the file-based result.
    """
    def shift_start(feature):
        feature.start += 10
        return feature

    bed_a = pybedtools.example_bedtool('a.bed')

    # streamed and file-based pipelines must save identical results
    via_stream = bed_a.merge(s=True, stream=True).each(shift_start).saveas()
    via_file = bed_a.merge(s=True).each(shift_start).saveas()
    assert via_stream == via_file

    # plain iteration
    merged = bed_a.merge(s=True, stream=True)
    for item in merged:
        assert isinstance(item, pybedtools.Interval)

    # iterators wrapped in iterators
    merged = bed_a.merge(s=True, stream=True)
    for item in iter(iter(iter(merged))):
        assert isinstance(item, pybedtools.Interval)

    # identity function applied through each()
    passthrough = bed_a.merge(s=True, stream=True).each(lambda x: x)
    for item in passthrough:
        assert isinstance(item, pybedtools.Interval)
Example #33
0
def test_window_maker():
    """window_maker(b=<a.bed>, w=50) produces the expected windows."""
    x = pybedtools.BedTool()
    a = pybedtools.example_bedtool('a.bed')
    result = x.window_maker(b=a.fn, w=50)
    # single-argument print() behaves the same on Python 2 and 3
    print(result)
    assert result == fix("""
    chr1	1	51
    chr1	51	100
    chr1	100	150
    chr1	150	200
    chr1	150	200
    chr1	200	250
    chr1	250	300
    chr1	300	350
    chr1	350	400
    chr1	400	450
    chr1	450	500
    chr1	900	950
    """)
Example #34
0
def test_copy():
    """copy.copy() of an Interval gives an equal but independent Interval."""
    original = pybedtools.example_bedtool('a.bed')[0]

    # Before adding the __copy__ method to Interval class, making a copy would
    # hang and then segfault
    import copy
    duplicate = copy.copy(original)

    # every compared attribute must survive the copy
    for attr in ('start', 'stop', 'chrom', 'name', 'fields'):
        assert getattr(duplicate, attr) == getattr(original, attr)
    assert duplicate.file_type == original.file_type == 'bed'

    # Make sure it's a real copy (changing something in the duplicate doesn't
    # change something in the original)
    duplicate.start += 1
    assert duplicate.start == original.start + 1
Example #35
0
def test_venn_mpl():
    """
    compares output image to expected

    The test is skipped (with a note on stderr) when matplotlib cannot be
    imported.  The generated image is removed even when the comparison
    fails.
    """
    try:
        # Import the submodule explicitly: a bare `import matplotlib` does
        # not guarantee that `matplotlib.image` is available as an attribute.
        import matplotlib.image
    except ImportError:
        import sys
        sys.stderr.write('Need matplotlib installed to test venn_mpl')
        return

    here = os.path.dirname(__file__)
    expected_fn = os.path.join(here, 'mpl-expected.png')

    original = pybedtools.example_bedtool(
        'rmsk.hg18.chr21.small.bed').sort().merge()
    a = pybedtools.BedTool(original[:300]).saveas()
    b = pybedtools.BedTool(original[:20]).saveas().cat(
        pybedtools.BedTool(original[400:500]).saveas())
    c = pybedtools.BedTool(original[15:30]).saveas().cat(
        pybedtools.BedTool(original[450:650]).saveas())

    outfn = 'mplout.png'
    try:
        venn_mpl.venn_mpl(a=a.fn,
                          b=b.fn,
                          c=c.fn,
                          colors=['r', 'b', 'g'],
                          outfn=outfn,
                          labels=['a', 'b', 'c'])

        # On a different machine, the created image is not visibly different
        # but is numerically different.  Not sure what a reasonable tolerance
        # is, but this seems to work for now....
        o = matplotlib.image.imread(outfn)
        e = matplotlib.image.imread(expected_fn)

        TOLERANCE = 200
        SUM = abs((o - e).sum())
        assert SUM < TOLERANCE, SUM
    finally:
        # do not leave the generated image behind on failure
        if os.path.exists(outfn):
            os.unlink(outfn)
Example #36
0
def test_issue_157():
    """
    A VCF round-tripped through to_dataframe()/from_dataframe() intersects
    to the same text as the original file.
    """
    # the problem here was that converting to file from dataframe didn't pass
    # through enough options to pandas.
    try:
        import pandas  # availability check only; the name is not used below
    except ImportError:
        pytest.xfail("pandas not installed; skipping test")
    vcf = pybedtools.example_bedtool('1000genomes-example.vcf')
    bed = pybedtools.BedTool('20\t14300\t17000', from_string=True)
    non_dataframe = str(vcf.intersect(bed))
    df = vcf.to_dataframe(comment='#', names=['CHROM', 'POS', 'ID', 'REF', 'ALT',
                                         'QUAL', 'FILTER', 'INFO', 'FORMAT',
                                         'NA00001', 'NA00002', 'NA00003'])

    # collect the '#' header lines; the handle is closed when the block ends
    with open(vcf.fn) as fin:
        header = ''.join(line for line in fin if line.startswith('#'))
    outfile = pybedtools.BedTool._tmp()
    with open(outfile, 'w') as fout:
        # header first, then the dataframe rows go to the same open handle
        fout.write(header)
        vcf_from_df = pybedtools.BedTool.from_dataframe(df, outfile=fout)
    from_dataframe = str(vcf_from_df.intersect(bed))
    assert non_dataframe == from_dataframe
Example #37
0
def test_i_methods():
    """
    Yield one test callable per (config entry, input kind) pair, feeding the
    `i` argument as a filename, generator, stream or gzip as supported.
    """
    input_kinds = ('filename', 'generator', 'stream', 'gzip')

    for method, send_kwargs, expected in parse_yaml(config_fn):
        i_isbam = 'ibam' in send_kwargs
        if i_isbam:
            # resolve the BAM to a real path and reuse it as `i`
            send_kwargs['ibam'] = pybedtools.example_filename(
                send_kwargs['ibam'])
            send_kwargs['i'] = send_kwargs['ibam']

        # two-input (a/b) methods are not handled by this generator
        if 'a' in send_kwargs and 'b' in send_kwargs:
            continue

        # nothing to do without some form of `i`
        if not ('i' in send_kwargs or 'ibam' in send_kwargs):
            continue

        if 'files' in send_kwargs:
            send_kwargs['files'] = [
                pybedtools.example_filename(name)
                for name in send_kwargs['files']
            ]

        orig_i = pybedtools.example_bedtool(send_kwargs['i'])
        if orig_i._isbam:
            i_isbam = True

        # `i` is passed positionally to run(), so drop it from the kwargs
        del send_kwargs['i']

        for kind_i in input_kinds:
            if i_isbam and kind_i not in supported_bam:
                continue
            converted = converter[kind_i](orig_i)
            kind = 'i=%s ibam=%s' % (kind_i, i_isbam)
            f = partial(run, method, converted, expected, **send_kwargs)
            f.description = '%s, %s, %s' % (method, kind, send_kwargs)
            yield (f, )
Example #38
0
def test_stream_of_stream():
    """
    Self-intersections streamed one and two levels deep must reproduce the
    original file text
    """
    bed = pybedtools.example_bedtool('a.bed')

    # first level: file-based and streamed self-intersection
    level1_file = bed.intersect(bed, u=True)
    level1_stream = bed.intersect(bed, u=True, stream=True)
    level1_file_text = str(level1_file)
    level1_stream_text = str(level1_stream)
    bed_text = str(bed)
    assert level1_file_text == level1_stream_text == bed_text

    # str() consumed the stream above, so build both first-level results again
    level1_file = bed.intersect(bed, u=True)
    level1_stream = bed.intersect(bed, u=True, stream=True)

    # second level: intersect against the first-level results
    level2_file = bed.intersect(level1_file, u=True)
    level2_stream = bed.intersect(level1_stream, u=True, stream=True)
    level2_file_text = str(level2_file)
    level2_stream_text = str(level2_stream)
    assert (
        level2_file_text
        == level2_stream_text
        == level1_file_text
        == level1_stream_text
        == bed_text
    )
Example #39
0
def test_indexing():
    """
    Indexing into BedTools
    """
    a = pybedtools.example_bedtool('a.bed')

    # This is the first line
    interval = pybedtools.Interval('chr1', 1, 100, 'feature1', '0', '+')

    # just to make sure
    # next() builtin instead of the Python-2-only iterator .next() method
    assert interval == next(iter(a))

    # test slice behavior
    results = list(a[0:2])
    assert len(results) == 2
    assert results[0] == interval

    # test single-integer indexing
    assert a[0] == interval

    # only slices and integers allowed....
    assert_raises(ValueError, a.__getitem__, 'key')
Example #40
0
def test_bedtool_creation():
    """
    A BedTool can be created from another BedTool (sharing its file) and
    from a string; a nonexistent filename raises ValueError.
    """
    # make sure we can make a bedtool from a bedtool and that it points to the
    # same file
    a = pybedtools.example_bedtool('a.bed')
    b = pybedtools.BedTool(a)
    assert b.fn == a.fn
    assert_raises(ValueError, pybedtools.BedTool, 'nonexistent.bed')

    # note that *s* has both tabs and spaces....
    s = """
    chr1	1	100	feature1  0	+
    chr1	100	200	feature2  0	+
    chr1	150	500	feature3  0	-
    chr1	900	950	feature4  0	+
    """
    from_string = pybedtools.BedTool(s, from_string=True)

    # difflib used here to show a bug where a newline was included when using
    # from_string
    # (single-argument print() behaves the same on Python 2 and 3)
    print(''.join(difflib.ndiff(str(from_string), str(a))))

    assert str(from_string) == str(a)
Example #41
0
def test_bed_methods():
    """
    Yield one test callable per (config entry, input kind) pair, feeding the
    `bed` argument as a filename, generator, stream and gzip.
    """
    other_inputs = ('a', 'b', 'abam', 'i')
    input_kinds = ('filename', 'generator', 'stream', 'gzip')

    for method, send_kwargs, expected in parse_yaml(config_fn):
        # entries driven by any other input argument are not handled here
        if any(name in send_kwargs for name in other_inputs):
            continue
        if 'bed' not in send_kwargs:
            continue

        # list-valued arguments: resolve every example name to a path
        for list_key in ('files', 'bams'):
            if list_key in send_kwargs:
                send_kwargs[list_key] = [
                    pybedtools.example_filename(name)
                    for name in send_kwargs[list_key]
                ]

        if 'fi' in send_kwargs:
            send_kwargs['fi'] = pybedtools.example_filename(send_kwargs['fi'])

        orig_bed = pybedtools.example_bedtool(send_kwargs['bed'])

        # `bed` is passed positionally to run(), so drop it from the kwargs
        del send_kwargs['bed']

        for kind_bed in input_kinds:
            converted = converter[kind_bed](orig_bed)
            kind = 'i=%s' % (kind_bed,)
            f = partial(run, method, converted, expected, **send_kwargs)
            f.description = '%s, %s, %s' % (method, kind, send_kwargs)
            yield (f, )
Example #42
0
def test_venn_gchart_data_is_correct():
    """venn_gchart() returns the expected chart parameters for three sets."""
    merged = pybedtools.example_bedtool("rmsk.hg18.chr21.small.bed").sort().merge()

    # three overlapping subsets of the merged features
    set_a = pybedtools.BedTool(merged[:300]).saveas()

    b_head = pybedtools.BedTool(merged[:20]).saveas()
    b_tail = pybedtools.BedTool(merged[400:500]).saveas()
    set_b = b_head.cat(b_tail)

    c_head = pybedtools.BedTool(merged[15:30]).saveas()
    c_tail = pybedtools.BedTool(merged[450:650]).saveas()
    set_c = c_head.cat(c_tail)

    colors = "00FF00,FF0000,0000FF"
    labels = "a,b,c"

    expected_data = {
        "chco": "00FF00,FF0000,0000FF",
        "chd": "t:1.0,0.4,0.7167,0.0667,0.05,0.1833,0.0167",
        "chs": "300x300",
        "cht": "v",
        "chdl": "a|b|c",
    }

    data = venn_gchart.venn_gchart(
        a=set_a.fn,
        b=set_b.fn,
        c=set_c.fn,
        colors=colors.split(","),
        labels=labels.split(","),
        size="300x300",
    )

    # only the keys listed in expected_data are checked
    for key, want in expected_data.items():
        got = data[key]
        assert want == got
def read_data(bed_file,fasta_file):
    """
    Extract the sequences for the intervals in `bed_file` from `fasta_file`.

    Both names are resolved through pybedtools' example-file helpers
    (`example_bedtool` / `example_filename`).  The extracted FASTA text is
    assumed to alternate header and sequence lines, one sequence per line --
    TODO confirm for wrapped FASTA output.

    Returns a tuple ``(DNA_seq, n, m)``:

    * ``DNA_seq`` -- list of upper-cased sequences
    * ``n``       -- length of the longest sequence
    * ``m``       -- length of the shortest sequence

    When no sequence is found, both ``n`` and ``m`` are 0.  (The shortest
    length used to be seeded with 10000, which was returned unchanged for
    empty input and whenever every sequence was longer than 10000 bp.)
    """
    # e.g. '/home/h5li/methylation_DMR/data/DMR_coordinates_extended_b500.bed'
    a = pybedtools.example_bedtool( bed_file )
    # e.g. '/home/h5li/methylation_DMR/data/mm10.fasta'
    fasta = pybedtools.example_filename( fasta_file )
    a = a.sequence(fi=fasta)
    # close the sequence file instead of leaking the handle
    with open(a.seqfn) as handle:
        seq = handle.read()

    # read and extract DNA sequences
    DNA_seq_list = seq.split('\n')
    # drop the last piece of the split (empty when the text ends in '\n')
    DNA_seq_list.pop()
    # sequences sit on the odd lines; only complete header/sequence pairs count
    raw_seqs = [DNA_seq_list[index*2 + 1]
                for index in range(len(DNA_seq_list)//2)]
    DNA_seq = [s.upper() for s in raw_seqs]

    lengths = [len(s) for s in raw_seqs]
    m = min(lengths) if lengths else 0
    n = max(lengths) if lengths else 0

    print('The shortest length of DNA sequence is {0}bp'.format(m))
    print('The longest length of DNA sequence is {0}bp'.format(n))
    print('Total Number of input sequence is {0}'.format(len(DNA_seq)))
    return DNA_seq,n,m
Example #44
0
def test_add_color():
    """Check ``featurefuncs.add_color`` output on ``a.bed``.

    Each feature's field 4 is first overwritten with the string form of its
    field 2, then colors are assigned with matplotlib's ``jet`` colormap.
    The test returns early when matplotlib cannot be imported.
    """
    try:
        from matplotlib import cm
    except ImportError:
        # print() with one argument behaves the same on Python 2 and 3.
        print("matplotlib not installed; skipping test_add_color")
        return

    def modify_scores(f):
        # Copy field 2 into field 4 so the normalization has varied values.
        fields = f.fields
        fields[4] = str(f[2])
        return pybedtools.create_interval_from_list(fields)

    a = pybedtools.example_bedtool('a.bed')
    a = a.each(modify_scores).saveas()
    cmap = cm.jet
    norm = a.colormap_normalize()
    results = str(a.each(featurefuncs.add_color, cmap=cmap, norm=norm))
    print(results)
    assert results == fix("""
    chr1	1	100	feature1	100	+	1	100	0,0,127
    chr1	100	200	feature2	200	+	100	200	0,0,255
    chr1	150	500	feature3	500	-	150	500	99,255,147
    chr1	900	950	feature4	950	+	900	950	127,0,0""")
Example #45
0
def test_tabix():
    """Check tabix indexing of ``a.bed`` and interval retrieval from it.

    Intervals are looked up both by a region string and by a feature taken
    from the BedTool. The ``a.bed.gz`` and ``a.bed.gz.tbi`` example files
    are removed afterwards, even when an assertion fails.
    """
    a = pybedtools.example_bedtool('a.bed')
    expected_by_region = fix("""
    chr1	1	100	feature1	0	+
    chr1	100	200	feature2	0	+
    chr1	150	500	feature3	0	-""")
    expected_by_feature = fix("""
    chr1	100	200	feature2	0	+
    chr1	150	500	feature3	0	-""")

    try:
        t = a.tabix()
        assert t._tabixed()
        results = str(t.tabix_intervals('chr1:99-200'))
        print(results)
        assert results == expected_by_region

        assert str(t.tabix_intervals(a[2])) == expected_by_feature
    finally:
        # clean up, so a failed assertion does not leave index files behind
        fns = [
            pybedtools.example_filename('a.bed.gz'),
            pybedtools.example_filename('a.bed.gz.tbi'),
        ]
        for fn in fns:
            if os.path.exists(fn):
                os.unlink(fn)
Example #46
0
def test_gff2bed():
    """Check ``featurefuncs.gff2bed`` on ``d.gff`` for several ``name_field`` values.

    Covers an attribute name ('Parent'), the default, an attribute that is
    not present ('nonexistent') and an integer field index (1).
    """
    a = pybedtools.example_bedtool('d.gff')
    results = str(a.each(featurefuncs.gff2bed, name_field='Parent'))
    assert results == fix("""
    chr1	49	300	.	.	+
    chr1	49	300	gene1	.	+
    chr1	74	150	mRNA1	.	+
    chr1	199	275	mRNA1	.	+
    chr1	1199	1275	.	.	+""")

    results = str(a.each(featurefuncs.gff2bed))
    assert results == fix("""
    chr1	49	300	gene1	.	+
    chr1	49	300	mRNA1	.	+
    chr1	74	150	CDS1	.	+
    chr1	199	275	CDS2	.	+
    chr1	1199	1275	rRNA1	.	+
    """)

    results = str(a.each(featurefuncs.gff2bed, name_field="nonexistent"))
    assert results == fix("""
    chr1	49	300	.	.	+
    chr1	49	300	.	.	+
    chr1	74	150	.	.	+
    chr1	199	275	.	.	+
    chr1	1199	1275	.	.	+
    """)

    results = str(a.each(featurefuncs.gff2bed, name_field=1))
    # print() with one argument behaves the same on Python 2 and 3.
    print(results)
    assert results == fix("""
    chr1	49	300	fake	.	+
    chr1	49	300	fake	.	+
    chr1	74	150	fake	.	+
    chr1	199	275	fake	.	+
    chr1	1199	1275	fake	.	+""")
Example #47
0
def circle_coverage_profile(bamfile, bedfile, exon_ind, split_character,
                            platform):
    '''
    Group the rows of ``coverage(bedfile, d=True, split=True)`` by transcript
    and exon.

    Field 3 of every row is split on ``split_character``. The transcript is
    the first two pieces re-joined for platform 'refseq', the first piece for
    'ensembl', and 'NA' (with a printed warning per row) for anything else.
    The exon number is piece ``exon_ind`` converted to int.

    Returns a dict of transcript -> exon -> dict holding
    'relative_positions' (field 6 of each row), 'position_coverage'
    (field 7 of each row) and the 'chromosome', 'start' and 'end'
    (fields 0-2) of the first row seen for that exon.
    '''
    coverage_rows = pybedtools.example_bedtool(bamfile).coverage(
        bedfile, d=True, split=True)
    profile = {}
    for row in coverage_rows:
        if platform == 'refseq':
            transcript = split_character.join(
                row[3].split(split_character)[:2])
        elif platform == 'ensembl':
            transcript = row[3].split(split_character)[0]
        else:
            transcript = 'NA'
            print(
                'you are using an unknown annotation platform, please use refseq or ensembl like formats'
            )
        exon = int(row[3].split(split_character)[exon_ind])

        exons = profile.setdefault(transcript, {})
        if exon not in exons:
            # The first row seen for an exon fixes its coordinates.
            exons[exon] = {
                'relative_positions': [],
                'position_coverage': [],
                'chromosome': row[0],
                'start': row[1],
                'end': row[2]
            }
        record = exons[exon]
        record['position_coverage'].append(row[7])
        record['relative_positions'].append(row[6])
    return profile
Example #48
0
def test_issue_181():
    """Query a tabixed ``a.bed`` for the region 'none:1-5'.

    The plain call must not raise; with ``check_coordinates=True`` a
    ValueError is expected.
    """
    tabixed = pybedtools.example_bedtool("a.bed").tabix(force=True)
    region = "none:1-5"
    tabixed.tabix_intervals(region)
    with pytest.raises(ValueError):
        tabixed.tabix_intervals(region, check_coordinates=True)
Example #49
0
def test_cut():
    """Cutting four columns out of ``a.bed`` must give a four-field result."""
    columns = [0, 1, 2, 4]
    c = pybedtools.example_bedtool('a.bed').cut(columns)
    observed = c.field_count()
    assert observed == 4, c
Example #50
0
def test_filter():
    """Two features of ``a.bed`` have a length strictly between 0 and 100."""
    def is_short(feature):
        # Keep features that are non-empty and shorter than 100.
        return 0 < feature.length < 100

    short_features = pybedtools.example_bedtool('a.bed').filter(is_short)
    assert len(short_features) == 2
Example #51
0
def test_name():
    """The first feature of ``c.gff`` must report the expected name."""
    # next(iterator) works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists only on Python 2.
    c = next(iter(pybedtools.example_bedtool('c.gff')))
    assert c.name == "thaliana_1_465_805", c.name
Example #52
0
def test_kwargs():
    """Passing ``s=False`` to intersect must match omitting ``s`` entirely."""
    a = pybedtools.example_bedtool('a.bed')
    with_false_flag = str(a.intersect(a, s=False))
    without_flag = str(a.intersect(a))
    assert with_false_flag == without_flag
Example #53
0
def test_bam_bedtool_creation():
    """``_isbam`` must be set for ``x.bam`` and unset for ``a.bed``."""
    bam_tool = pybedtools.example_bedtool('x.bam')
    bed_tool = pybedtools.example_bedtool('a.bed')
    assert bam_tool._isbam
    assert not bed_tool._isbam
Example #54
0
def test_bam_iter():
    """Indexing and iterating ``gdc.bam`` must both give the same first record."""
    x = pybedtools.example_bedtool('gdc.bam')
    s = 'None	0	chr2L	11	255	5M	*	0	0	CGACA	IIIII	NM:i:0	NH:i:1\n'
    # next(iterator) works on Python 2.6+ and Python 3; the iterator's
    # .next() method exists only on Python 2.
    assert str(x[0]) == str(next(iter(x))) == s
Example #55
0
def test_sam_filetype():
    """A BedTool saved from the records of ``gdc.bam`` must report type 'sam'."""
    # file_type was segfaulting cause IntervalFile couldn't parse SAM
    bam = pybedtools.example_bedtool('gdc.bam')
    records = (record for record in bam)
    saved = pybedtools.BedTool(records).saveas()
    assert saved.file_type == 'sam'
Example #56
0
def test_PR_158():
    """Printing the ``issue_121.bam`` example must complete without error."""
    # See #121 for original, #122 for follow-up, and #158 for fix.
    #
    # This used to crash with "OverflowError: can't convert negative value to CHRPOS"
    bam = pybedtools.example_bedtool("issue_121.bam")
    print(bam)
Example #57
0
def test_issue_169():
    """The bgzipped example VCF must still start with a '#' header line."""
    x = pybedtools.example_bedtool("1000genomes-example.vcf")
    fn = x.bgzip(is_sorted=False, force=True)
    # Use a context manager so the gzip handle is closed even if reading fails.
    with gzip.open(fn, "rt") as handle:
        line = handle.readline()
    assert str(line).startswith("#"), line
Example #58
0
def test_issue_180():
    """A tabixed ``a.bed`` must list 'chr1' as its only contig."""
    tabixed = pybedtools.example_bedtool("a.bed").tabix(force=True)
    contigs = tabixed.tabix_contigs()
    assert contigs == ["chr1"]
Example #59
0
def test_issue_203():
    """Smoke test: ``truncate_to_chrom`` with genome 'hg19' runs on ``x.bed``."""
    bed = pybedtools.example_bedtool("x.bed")
    bed.truncate_to_chrom(genome="hg19")
 def pybedtoolsmerge(self, filteringclonescountoutput):
     """Merge the given file's features and move the result beside ``self.out``.

     Stores the loaded BedTool on ``self.input`` and the moved result on
     ``self.output``; returns the destination path,
     ``self.out + '_blastnclonesmerge.bed'``.
     """
     self.input = pybedtools.example_bedtool(filteringclonescountoutput)
     merged = self.input.merge()
     destination = self.out + '_blastnclonesmerge.bed'
     self.output = merged.moveto(destination)
     return destination