Ejemplo n.º 1
0
def test_fromgff3_region():
    """Check row counts of GFF3 tables loaded with a ``region`` argument.

    The fixture is queried twice: once with a bare sequence identifier and
    once with an identifier plus a coordinate range. The expected
    ``nrows()`` values are 7 and 4 respectively.
    """
    gff_path = 'fixture/sample.sorted.gff.gz'

    # region given as a sequence identifier only
    by_seqid = etl.fromgff3(gff_path, region='apidb|MAL5')
    eq_(7, by_seqid.nrows())

    # region given as identifier plus coordinate range
    by_interval = etl.fromgff3(gff_path, region='apidb|MAL5:1289593-1289595')
    eq_(4, by_interval.nrows())
Ejemplo n.º 2
0
    def __init__(self, fasta_path, gff3_path, seqid=None):
        """
        Set up an annotated reference genome.

        Opens the reference sequence, loads the annotations into a cached
        feature table, and builds lookups by feature ID, by parent ID and
        by genomic location.

        Parameters
        ----------

        fasta_path : string
            Path to reference genome FASTA file.
        gff3_path : string, or list/tuple of strings
            Path to genome annotations GFF3 file. When a list or tuple is
            given, every element is loaded and the tables are concatenated.
        seqid : optional
            When not None, only features whose 'seqid' field equals this
            value are kept.

        """

        # keep the arguments this instance was built from
        self._fasta_path = fasta_path
        self._gff3_path = gff3_path
        self._seqid = seqid

        # open the reference sequence
        self._fasta = pyfasta.Fasta(fasta_path)

        # load the annotations; several files are concatenated into one table
        if not isinstance(gff3_path, (list, tuple)):
            features = etl.fromgff3(gff3_path)
        else:
            features = etl.cat(*(etl.fromgff3(path) for path in gff3_path))

        # pull ID and Parent out of the attributes dict into their own fields
        features = features.unpackdict('attributes', ['ID', 'Parent'])
        features = features.rename({
            'ID': 'feature_id',
            'Parent': 'parent_id',
            'end': 'stop',
        })

        # keep only rows whose stop is strictly greater than their start
        features = features.select(lambda rec: (rec.stop - rec.start) > 0)

        # optionally restrict the table to a single sequence
        if seqid is not None:
            features = features.eq('seqid', seqid)
        self._tbl_features = features.cache()

        # build the lookups from the cached table
        cached = self._tbl_features
        self._idx_feature_id = cached.recordlookupone('feature_id')
        self._idx_parent_id = cached.recordlookup('parent_id')
        self._idx_location = cached.facetintervalrecordlookup(
            'seqid', 'start', 'stop', include_stop=True)
Ejemplo n.º 3
0
def get_geneset_features(geneset_fn, chrom, start=None, stop=None):
    """Load geneset features for a specific genome region via petl.

    Parameters
    ----------
    geneset_fn
        File passed to ``etl.fromgff3``.
    chrom
        Sequence identifier; used on its own as the region when no
        interval is supplied.
    start, stop : optional
        Interval bounds. They are only used when *both* are truthy, in
        which case the region is ``'<chrom>:<start>-<stop>'``. Note that a
        value of 0 or None for either one selects the whole ``chrom``.

    Returns
    -------
    The table returned by ``etl.fromgff3`` for the chosen region.
    """
    # without a complete interval, query the whole sequence
    if not (start and stop):
        return etl.fromgff3(geneset_fn, region=chrom)

    interval = '%s:%s-%s' % (chrom, start, stop)
    return etl.fromgff3(geneset_fn, region=interval)
Ejemplo n.º 4
0
def test_fromgff3_trailing_semicolon():
    """Check the item at index 2 of the table loaded from the sample GFF3.

    The first eight positions are compared against literal values and the
    ninth is checked as a mapping of attribute names to values.
    """
    table = etl.fromgff3(sample_gff3_filename)
    record = list(table)[2]

    # expected values for positions 0..7, in order
    positional = ('apidb|MAL2', 'ApiDB', 'supercontig', 1, 947102,
                  '.', '+', '.')
    for position, wanted in enumerate(positional):
        eq_(wanted, record[position])

    # position 8 holds the parsed attributes
    attrs = record[8]
    eq_('apidb|MAL2', attrs['ID'])
    eq_('MAL2', attrs['Name'])
    eq_('Plasmodium falciparum', attrs['organism_name'])
Ejemplo n.º 5
0
def test_fromgff3_trailing_semicolon():
    """Verify every field of the item at index 2 of the sample GFF3 table.

    Positions 0-7 are compared with literal values; position 8 is treated
    as a mapping and three of its keys are checked.
    """
    rows = list(etl.fromgff3(sample_gff3_filename))
    target = rows[2]

    # identifying fields (positions 0-2)
    eq_("apidb|MAL2", target[0])
    eq_("ApiDB", target[1])
    eq_("supercontig", target[2])

    # integer coordinates (positions 3-4)
    eq_(1, target[3])
    eq_(947102, target[4])

    # remaining scalar fields (positions 5-7)
    eq_(".", target[5])
    eq_("+", target[6])
    eq_(".", target[7])

    # attribute mapping (position 8)
    attributes = target[8]
    eq_("apidb|MAL2", attributes["ID"])
    eq_("MAL2", attributes["Name"])
    eq_("Plasmodium falciparum", attributes["organism_name"])
Ejemplo n.º 6
0
def test_fromgff3():
    """Check the header and the item at index 1 of the sample GFF3 table.

    The header must equal ``GFF3_HEADER``. Positions 0-7 of the checked
    item are compared against literal values, and position 8 is checked as
    a mapping of attribute names to values.
    """
    table = etl.fromgff3(sample_gff3_filename)
    eq_(GFF3_HEADER, table.header())

    record = list(table)[1]

    # expected values for positions 0..7, in order
    positional = ('apidb|MAL1', 'ApiDB', 'supercontig', 1, 643292,
                  '.', '+', '.')
    for position, wanted in enumerate(positional):
        eq_(wanted, record[position])

    # expected attribute values, checked in this order
    attrs = record[8]
    for key, wanted in (('ID', 'apidb|MAL1'),
                        ('Name', 'MAL1'),
                        ('organism_name', 'Plasmodium falciparum')):
        eq_(wanted, attrs[key])
Ejemplo n.º 7
0
def test_fromgff3():
    """Verify the header and the item at index 1 of the sample GFF3 table.

    The table header is compared with ``GFF3_HEADER``; the checked item is
    then verified position by position, with position 8 treated as a
    mapping.
    """
    gff_table = etl.fromgff3(sample_gff3_filename)

    # header check comes first, before the table is materialised
    eq_(GFF3_HEADER, gff_table.header())

    rows = list(gff_table)
    target = rows[1]

    # identifying fields (positions 0-2)
    eq_("apidb|MAL1", target[0])
    eq_("ApiDB", target[1])
    eq_("supercontig", target[2])

    # integer coordinates (positions 3-4)
    eq_(1, target[3])
    eq_(643292, target[4])

    # remaining scalar fields (positions 5-7)
    eq_(".", target[5])
    eq_("+", target[6])
    eq_(".", target[7])

    # attribute mapping (position 8)
    attributes = target[8]
    eq_("apidb|MAL1", attributes["ID"])
    eq_("MAL1", attributes["Name"])
    eq_("Plasmodium falciparum", attributes["organism_name"])
Ejemplo n.º 8
0
# fromtabix()
############

# `etl` must be bound before its first use below; without this import the
# snippet raises NameError when run on its own
import petl as etl
# activate bio extensions
import petlx.bio
# load the rows of a BED fixture that fall on one named sequence
table1 = etl.fromtabix('fixture/test.bed.gz',
                       region='Pf3D7_02_v3')
# bare expression: displays the table in an interactive session
table1
# same fixture, region given as name plus coordinate range
table2 = etl.fromtabix('fixture/test.bed.gz',
                       region='Pf3D7_02_v3:110000-120000')
table2


# fromgff3()
############

import petl as etl
# activate bio extensions
import petlx.bio
table1 = etl.fromgff3('fixture/sample.gff')
table1.look(truncate=30)
# extract from a specific genome region via tabix
table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
                      region='apidb|MAL5:1289593-1289595')
table2.look(truncate=30)


# fromvcf()
###########

import petl as etl
# activate bio extensions
import petlx.bio
table1 = etl.fromvcf('fixture/sample.vcf')
table1.look(truncate=20)
Ejemplo n.º 9
0
# Usage examples for the petlx.bio readers: fromtabix(), fromgff3() and
# fromvcf(). Each section repeats its imports so it can be copied alone.

# fromtabix()
############

import petl as etl
# activate bio extensions
import petlx.bio
# load the rows of a BED fixture that fall on one named sequence
table1 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3')
# bare expression: displays the table in an interactive session and has no
# effect when run as a plain script
table1
# same fixture, region given as name plus coordinate range
table2 = etl.fromtabix('fixture/test.bed.gz',
                       region='Pf3D7_02_v3:110000-120000')
table2

# fromgff3()
############

import petl as etl
# activate bio extensions
import petlx.bio
# load a whole GFF3 file (no region argument)
table1 = etl.fromgff3('fixture/sample.gff')
# NOTE(review): look(truncate=30) presumably previews the table with values
# cut to 30 characters; confirm against the petl documentation
table1.look(truncate=30)
# extract from a specific genome region via tabix
table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
                      region='apidb|MAL5:1289593-1289595')
table2.look(truncate=30)

# fromvcf()
###########

import petl as etl
# activate bio extensions
import petlx.bio
# load a VCF fixture as a table
table1 = etl.fromvcf('fixture/sample.vcf')
table1.look(truncate=20)
Ejemplo n.º 10
0
def test_fromgff3_region():
    """Check ``nrows()`` of GFF3 tables loaded with a ``region`` argument.

    Each case pairs a region string with the row count expected for it; the
    cases run in the listed order against the same fixture file.
    """
    cases = (
        ("apidb|MAL5", 7),
        ("apidb|MAL5:1289593-1289595", 4),
    )
    for region, expected_nrows in cases:
        tbl_features = etl.fromgff3("fixture/sample.sorted.gff.gz", region=region)
        eq_(expected_nrows, tbl_features.nrows())