import csv
import zipfile
from tempfile import NamedTemporaryFile

import petl as etl
from petl import cut, fromcsv, frompickle, fromtsv, tocsv, topickle, totsv
from petl.io.sources import ZipSource
from petl.test.helpers import ieq


def test_teetsv():
    t1 = (("foo", "bar"),
          ("a", 2),
          ("b", 1),
          ("c", 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl
     .wrap(t1)
     .teetsv(f1.name, encoding="ascii")
     .selectgt("bar", 1)
     .totsv(f2.name, encoding="ascii"))
    ieq(t1, etl.fromtsv(f1.name, encoding="ascii").convertnumbers())
    ieq(etl.wrap(t1).selectgt("bar", 1),
        etl.fromtsv(f2.name, encoding="ascii").convertnumbers())

def test_teetsv_unicode():
    t1 = ((u"name", u"id"),
          (u"Արամ Խաչատրյան", 1),
          (u"Johann Strauß", 2),
          (u"Вагиф Сәмәдоғлу", 3),
          (u"章子怡", 4))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl
     .wrap(t1)
     .teetsv(f1.name, encoding="utf-8")
     .selectgt("id", 1)
     .totsv(f2.name, encoding="utf-8"))
    ieq(t1, etl.fromtsv(f1.name, encoding="utf-8").convertnumbers())
    ieq(etl.wrap(t1).selectgt("id", 1),
        etl.fromtsv(f2.name, encoding="utf-8").convertnumbers())

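# Both teetsv tests above rely on the same property: teetsv() writes rows
# to a file as a side effect and passes them through unchanged, so one
# pipeline can persist an intermediate snapshot while continuing to
# transform. A minimal illustrative sketch of that usage (not part of the
# original suite; file names are hypothetical):
def _sketch_teetsv_branching():
    tbl = etl.wrap([('foo', 'bar'), ('a', 2), ('b', 1)])
    (tbl
     .teetsv('snapshot.tsv')    # the full table is written here...
     .selectgt('bar', 1)
     .totsv('filtered.tsv'))    # ...but only filtered rows are written here
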
def test_ZipSource():
    # setup
    table = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    totsv(table, 'tmp/issue_241.tsv')
    z = zipfile.ZipFile('tmp/issue_241.zip', mode='w')
    z.write('tmp/issue_241.tsv', 'data.tsv')
    z.close()
    # test
    actual = fromtsv(ZipSource('tmp/issue_241.zip', 'data.tsv'))
    ieq(table, actual)

def test_issue_231():
    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')
    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)
    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)
    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)

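# The same round-trip check generalizes to petl's other writer/reader
# pairs. An illustrative sketch (not part of the original suite) using
# tojson/fromjson, assuming the same tmp/ scratch directory as above:
def _sketch_issue_231_json_roundtrip():
    t = cut([['foo', 'bar'], ['a', '1'], ['b', '2']], 'foo')
    etl.tojson(t, 'tmp/issue_231.json')
    u = etl.fromjson('tmp/issue_231.json')
    ieq(t, u)
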
def test_fromtsv():
    # open in text mode with newline='' so csv.writer works under Python 3
    f = NamedTemporaryFile(mode="w", delete=False, newline="")
    writer = csv.writer(f, delimiter="\t")
    table = (("foo", "bar"),
             ("a", 1),
             ("b", 2),
             ("c", 2))
    for row in table:
        writer.writerow(row)
    f.close()
    actual = fromtsv(f.name)
    expect = (("foo", "bar"),
              ("a", "1"),
              ("b", "2"),
              ("c", "2"))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice

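# Note what the expectation above encodes: fromtsv() performs no type
# inference, so values written as ints come back as strings. Restoring
# types is an explicit step, e.g. via convertnumbers(), as the teetsv
# tests do. A sketch (the file name is illustrative):
def _sketch_fromtsv_typing():
    tbl = etl.fromtsv('example.tsv').convertnumbers()
    # rows that round-tripped as ('a', '1') now come back as ('a', 1)
    return tbl
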
def test_zipsource():
    # setup
    tbl = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    fn_tsv = NamedTemporaryFile().name
    etl.totsv(tbl, fn_tsv)
    fn_zip = NamedTemporaryFile().name
    z = zipfile.ZipFile(fn_zip, mode='w')
    z.write(fn_tsv, 'data.tsv')
    z.close()
    # test
    actual = etl.fromtsv(ZipSource(fn_zip, 'data.tsv'))
    ieq(tbl, actual)

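# Both ZipSource tests exercise the same idea: ZipSource is an ordinary
# petl source, so it composes with any text-based reader, not just
# fromtsv(). An illustrative sketch (not part of the original suite)
# reading a CSV member from an archive:
def _sketch_zipsource_with_csv():
    fn_csv = NamedTemporaryFile().name
    etl.tocsv([('foo', 'bar'), ('a', '1')], fn_csv)
    fn_zip = NamedTemporaryFile().name
    with zipfile.ZipFile(fn_zip, mode='w') as z:
        z.write(fn_csv, 'data.csv')
    tbl = etl.fromcsv(ZipSource(fn_zip, 'data.csv'))
    ieq([('foo', 'bar'), ('a', '1')], tbl)
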
def fromgff3(filename, region=None):
    """
    Extract feature rows from a GFF3 file, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    A region query string of the form '[seqid]' or '[seqid]:[start]-[end]'
    may be given for the `region` argument. If given, requires the GFF3
    file to be position sorted, bgzipped and tabix indexed. Requires pysam
    to be installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """
    if region is None:
        # parse the file as tab-delimited
        table = etl.fromtsv(filename)
    else:
        # extract rows for the requested region via tabix
        table = etl.fromtabix(filename, region=region)
    return (
        table
        .pushheader(GFF3_HEADER)
        .skipcomments('#')
        # ignore any row not 9 values long (e.g., trailing fasta)
        .rowlenselect(9)
        # parse attributes into a dict
        .convert('attributes', gff3_parse_attributes)
        # parse coordinates
        .convert(('start', 'end'), int)
    )

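# fromgff3() depends on two helpers defined elsewhere in petlx.bio.gff3:
# GFF3_HEADER and gff3_parse_attributes. A minimal sketch of what they
# need to provide, inferred from the column layout in the docstring and
# the standard GFF3 'key=value;key=value' attribute syntax (this is an
# illustrative reimplementation, not the library's actual code):
from urllib.parse import unquote

GFF3_HEADER = ('seqid', 'source', 'type', 'start', 'end',
               'score', 'strand', 'phase', 'attributes')


def gff3_parse_attributes(attributes_string):
    """Parse a GFF3 attributes string like 'ID=gene1;Name=foo' into a dict."""
    attributes = dict()
    for field in attributes_string.split(';'):
        if '=' in field:
            key, value = field.split('=', 1)
            # GFF3 attribute keys and values are percent-encoded
            attributes[unquote(key).strip()] = unquote(value).strip()
        elif field.strip():
            # tolerate bare flags, though strict GFF3 requires key=value
            attributes[unquote(field).strip()] = True
    return attributes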