Example #1
0
def test_teetsv():
    """teetsv should write the table to a side file while the pipeline
    continues downstream unchanged."""

    t1 = (("foo", "bar"), ("a", 2), ("b", 1), ("c", 3))

    tee_file = NamedTemporaryFile(delete=False)
    out_file = NamedTemporaryFile(delete=False)

    # tee the full table to one file, then filter and write the rest
    pipeline = etl.wrap(t1).teetsv(tee_file.name, encoding="ascii")
    pipeline.selectgt("bar", 1).totsv(out_file.name, encoding="ascii")

    # the teed file holds the full table; the output file holds the filtered rows
    ieq(t1, etl.fromtsv(tee_file.name, encoding="ascii").convertnumbers())
    ieq(
        etl.wrap(t1).selectgt("bar", 1),
        etl.fromtsv(out_file.name, encoding="ascii").convertnumbers(),
    )
Example #2
0
def test_teetsv_unicode():
    """teetsv should round-trip non-ASCII data when an explicit utf-8
    encoding is supplied."""

    t1 = ((u"name", u"id"), (u"Արամ Խաչատրյան", 1), (u"Johann Strauß", 2), (u"Вагиф Сәмәдоғлу", 3), (u"章子怡", 4))

    tee_file = NamedTemporaryFile(delete=False)
    out_file = NamedTemporaryFile(delete=False)

    # tee the full table, then filter downstream and write the remainder
    pipeline = etl.wrap(t1).teetsv(tee_file.name, encoding="utf-8")
    pipeline.selectgt("id", 1).totsv(out_file.name, encoding="utf-8")

    ieq(t1, etl.fromtsv(tee_file.name, encoding="utf-8").convertnumbers())
    ieq(
        etl.wrap(t1).selectgt("id", 1),
        etl.fromtsv(out_file.name, encoding="utf-8").convertnumbers(),
    )
Example #3
0
def test_ZipSource():
    """fromtsv should read a named member of a zip archive via ZipSource."""

    # setup: write a tsv file, then pack it into a zip as 'data.tsv'
    expect = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    totsv(expect, 'tmp/issue_241.tsv')
    with zipfile.ZipFile('tmp/issue_241.zip', mode='w') as archive:
        archive.write('tmp/issue_241.tsv', 'data.tsv')

    # test: reading the zip member yields the original table
    actual = fromtsv(ZipSource('tmp/issue_241.zip', 'data.tsv'))
    ieq(expect, actual)
Example #4
0
def test_issue_231():
    """Round-trip a single-column cut through tsv, csv and pickle
    (regression test for issue #231)."""

    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')

    # each format gets the same write-then-read-back check
    round_trips = (
        (totsv, fromtsv, 'tmp/issue_231.tsv'),
        (tocsv, fromcsv, 'tmp/issue_231.csv'),
        (topickle, frompickle, 'tmp/issue_231.pickle'),
    )
    for write, read, path in round_trips:
        write(t, path)
        u = read(path)
        ieq(t, u)
Example #5
0
def test_fromtsv():
    """fromtsv should parse a tab-delimited file, yielding all values as
    strings, and the resulting table must be iterable more than once."""

    # NamedTemporaryFile defaults to binary mode; csv.writer requires a
    # text-mode file on Python 3, and newline='' prevents the csv module
    # from emitting extra blank rows on Windows.
    f = NamedTemporaryFile(mode="w", delete=False, newline="")
    writer = csv.writer(f, delimiter="\t")
    table = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2))
    for row in table:
        writer.writerow(row)
    f.close()

    actual = fromtsv(f.name)
    # values come back as strings — fromtsv does no type conversion
    expect = (("foo", "bar"), ("a", "1"), ("b", "2"), ("c", "2"))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example #6
0
def test_zipsource():
    """fromtsv should read a named member of a zip archive via ZipSource,
    using temporary file names for both the tsv and the archive."""

    # setup: write a tsv to a temp path, then pack it into a temp zip
    expect = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    tsv_path = NamedTemporaryFile().name
    etl.totsv(expect, tsv_path)
    zip_path = NamedTemporaryFile().name
    with zipfile.ZipFile(zip_path, mode='w') as archive:
        archive.write(tsv_path, 'data.tsv')

    # test: reading the zip member yields the original table
    actual = etl.fromtsv(ZipSource(zip_path, 'data.tsv'))
    ieq(expect, actual)
Example #7
0
def test_fromtsv():
    """fromtsv should parse a tab-delimited file, yielding all values as
    strings, and the resulting table must be iterable more than once."""

    # NamedTemporaryFile defaults to binary mode; csv.writer requires a
    # text-mode file on Python 3, and newline='' prevents the csv module
    # from emitting extra blank rows on Windows.
    f = NamedTemporaryFile(mode='w', delete=False, newline='')
    writer = csv.writer(f, delimiter='\t')
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        writer.writerow(row)
    f.close()

    actual = fromtsv(f.name)
    # values come back as strings — fromtsv does no type conversion
    expect = (('foo', 'bar'),
              ('a', '1'),
              ('b', '2'),
              ('c', '2'))
    ieq(expect, actual)
    ieq(expect, actual) # verify can iterate twice
Example #8
0
def fromgff3(filename, region=None):
    """
    Extract feature rows from a GFF3 file, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    A region query string of the form '[seqid]' or '[seqid]:[start]-[end]'
    may be given for the `region` argument. If given, requires the GFF3
    file to be position sorted, bgzipped and tabix indexed. Requires pysam to be
    installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """

    # With no region, parse the whole file as tab-delimited; otherwise pull
    # just the requested interval via tabix (needs pysam and an indexed file).
    if region is None:
        table = etl.fromtsv(filename)
    else:
        table = etl.fromtabix(filename, region=region)

    result = table.pushheader(GFF3_HEADER)
    result = result.skipcomments('#')
    # ignore any row not 9 values long (e.g., trailing fasta)
    result = result.rowlenselect(9)
    # parse the attributes column into a dict
    result = result.convert('attributes', gff3_parse_attributes)
    # parse start/end coordinates as integers
    return result.convert(('start', 'end'), int)