Beispiel #1
0
def test_fromtabix_noheader():
    """Region query against the header-less BED fixture.

    The only row expected back is the single record overlapping
    ``Pf3D7_02_v3:110000-120000``; no header row precedes it.
    """
    expected_rows = (
        ('Pf3D7_02_v3', '105800', '447300', 'Core'),
    )
    result = etl.fromtabix(
        'fixture/test_noheader.bed.gz',
        region='Pf3D7_02_v3:110000-120000',
    )
    ieq(expected_rows, result)
Beispiel #2
0
# -*- coding: utf-8 -*-
# Example snippets for the fromtabix() and fromgff3() readers.
# Each section below is written to stand on its own, which is why the
# imports are repeated per section.
# NOTE(review): these statements match the examples embedded in the
# fromgff3() docstring - presumably the docstrings are generated from this
# script, so keep statement text and order stable; confirm before editing.
from __future__ import absolute_import, print_function, division


# fromtabix()
#############

import petl as etl
# activate bio extensions
# (presumably this import is what makes etl.fromtabix available - confirm)
import petlx.bio
# query by sequence name only
table1 = etl.fromtabix('fixture/test.bed.gz',
                       region='Pf3D7_02_v3')
# bare expression: has no effect when run as a script; presumably meant to
# display the table in an interactive session
table1
# query by sequence name plus a start-end coordinate range
table2 = etl.fromtabix('fixture/test.bed.gz',
                       region='Pf3D7_02_v3:110000-120000')
table2


# fromgff3()
############

import petl as etl
# activate bio extensions
import petlx.bio
# no region given: read the GFF3 file directly
table1 = etl.fromgff3('fixture/sample.gff')
# preview the table, cutting each displayed value at 30 characters
table1.look(truncate=30)
# extract from a specific genome region via tabix
table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
                      region='apidb|MAL5:1289593-1289595')
table2.look(truncate=30)
Beispiel #3
0
def test_fromtabix():
    """Region query against the BED fixture that carries a header line.

    The expected table is the ``#chrom`` header row followed by the single
    record overlapping ``Pf3D7_02_v3:110000-120000``.
    """
    header_row = ('#chrom', 'start', 'end', 'region')
    data_row = ('Pf3D7_02_v3', '105800', '447300', 'Core')
    expected_rows = (header_row, data_row)
    result = etl.fromtabix(
        'fixture/test.bed.gz',
        region='Pf3D7_02_v3:110000-120000',
    )
    ieq(expected_rows, result)
Beispiel #4
0
def fromgff3(filename, region=None):
    """
    Load the feature rows of a GFF3 file as a table.

    The rows are read either directly as tab-separated text or, when
    `region` is given, through a tabix query. The GFF3 header is then pushed
    on top, rows whose first value starts with ``#`` are dropped, only rows
    with exactly nine values are kept, the ``attributes`` column is parsed
    with `gff3_parse_attributes`, and ``start``/``end`` are converted to
    ``int``. E.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    The `region` argument accepts a query string of the form '[seqid]' or
    '[seqid]:[start]-[end]'. Using it requires the GFF3 file to be position
    sorted, bgzipped and tabix indexed, and requires pysam to be installed.
    E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """

    # pick the row source: tabix lookup when a region is requested,
    # otherwise a plain tab-delimited read of the whole file
    if region is not None:
        raw_rows = etl.fromtabix(filename, region=region)
    else:
        raw_rows = etl.fromtsv(filename)

    # put the GFF3 column names on top, then discard '#' comment rows
    with_header = raw_rows.pushheader(GFF3_HEADER)
    without_comments = with_header.skipcomments('#')

    # keep only rows with exactly 9 values (drops e.g. trailing fasta)
    features = without_comments.rowlenselect(9)

    # turn the attributes column into a dict
    features = features.convert('attributes', gff3_parse_attributes)

    # coordinates become integers
    return features.convert(('start', 'end'), int)