def test_fromgff3_region():
    """Check the number of rows returned by ``fromgff3`` for two region queries.

    The same fixture file is queried once with a bare sequence identifier
    and once with a sequence identifier plus a coordinate range.
    """
    gff_path = 'fixture/sample.sorted.gff.gz'

    # (region string, expected row count), checked in this order
    expectations = (
        ('apidb|MAL5', 7),
        ('apidb|MAL5:1289593-1289595', 4),
    )

    for region, expected_nrows in expectations:
        tbl_features = etl.fromgff3(gff_path, region=region)
        eq_(expected_nrows, tbl_features.nrows())
def __init__(self, fasta_path, gff3_path, seqid=None):
    """
    Set up an annotated reference genome.

    Parameters
    ----------

    fasta_path : string
        Path to reference genome FASTA file.
    gff3_path : string, or list/tuple of strings
        Path to genome annotations GFF3 file. When a list or tuple is
        given, every path is loaded and the resulting tables are
        concatenated.
    seqid : optional
        When not None, only features whose 'seqid' field equals this
        value are kept.

    """

    # remember how this instance was constructed
    self._fasta_path = fasta_path
    self._gff3_path = gff3_path
    self._seqid = seqid

    # open the reference sequence
    self._fasta = pyfasta.Fasta(fasta_path)

    # load the annotations as one table, whether one or many files are given
    if not isinstance(gff3_path, (list, tuple)):
        raw_features = etl.fromgff3(gff3_path)
    else:
        raw_features = etl.cat(*[etl.fromgff3(p) for p in gff3_path])

    # promote ID/Parent out of the attributes dict into their own fields
    unpacked = raw_features.unpackdict('attributes', ['ID', 'Parent'])

    # use this class's field names
    renamed = unpacked.rename({
        'ID': 'feature_id',
        'Parent': 'parent_id',
        'end': 'stop'
    })

    # keep only rows where stop - start is strictly positive
    features = renamed.select(lambda row: (row.stop - row.start) > 0)

    # optionally restrict to a single seqid
    if seqid is not None:
        features = features.eq('seqid', seqid)

    cached = features.cache()
    self._tbl_features = cached

    # lookup: feature ID -> one record
    self._idx_feature_id = cached.recordlookupone('feature_id')

    # lookup: parent ID -> records
    self._idx_parent_id = cached.recordlookup('parent_id')

    # lookup by genomic location, built per seqid with include_stop=True
    self._idx_location = cached.facetintervalrecordlookup(
        'seqid', 'start', 'stop', include_stop=True)
def get_geneset_features(geneset_fn, chrom, start=None, stop=None):
    """Load geneset features for a specific genome region via petl.

    Parameters
    ----------
    geneset_fn
        Passed as the first argument to ``etl.fromgff3``.
    chrom
        Sequence identifier used as the region, or as its prefix.
    start, stop : optional
        When both are truthy the region is ``'<chrom>:<start>-<stop>'``;
        otherwise the region is just ``chrom``. Note that a value of 0
        (like None) counts as "not given" under this check.

    Returns
    -------
    The table returned by ``etl.fromgff3`` for the chosen region.
    """
    has_window = bool(start and stop)

    # no usable coordinate window: query the whole sequence
    if not has_window:
        return etl.fromgff3(geneset_fn, region=chrom)

    window = '%s:%s-%s' % (chrom, start, stop)
    return etl.fromgff3(geneset_fn, region=window)
def test_fromgff3_trailing_semicolon():
    """Check the parsed fields of the item at index 2 of the sample GFF3 table.

    Judging by the test name, this is presumably the fixture line whose
    attributes column ends with a trailing semicolon; confirm against the
    fixture file.
    """
    table = etl.fromgff3(sample_gff3_filename)
    record = list(table)[2]

    # positional fields 0..7, compared in order
    expected_columns = (
        'apidb|MAL2',
        'ApiDB',
        'supercontig',
        1,
        947102,
        '.',
        '+',
        '.',
    )
    for position, expected in enumerate(expected_columns):
        eq_(expected, record[position])

    # field 8 holds the parsed attributes, accessed by key
    attributes = record[8]
    expected_attributes = (
        ('ID', 'apidb|MAL2'),
        ('Name', 'MAL2'),
        ('organism_name', 'Plasmodium falciparum'),
    )
    for key, expected in expected_attributes:
        eq_(expected, attributes[key])
def test_fromgff3_trailing_semicolon():
    """Verify every field of the item at index 2 of the sample GFF3 table.

    The test name suggests this item comes from a fixture line with a
    trailing semicolon in its attributes column; verify against the fixture.
    """
    parsed = list(etl.fromgff3(sample_gff3_filename))
    target = parsed[2]

    # the eight leading positional fields
    leading_expected = [
        "apidb|MAL2",
        "ApiDB",
        "supercontig",
        1,
        947102,
        ".",
        "+",
        ".",
    ]
    for index, wanted in enumerate(leading_expected):
        eq_(wanted, target[index])

    # the attributes mapping stored in the ninth field
    attrs = target[8]
    eq_("apidb|MAL2", attrs["ID"])
    eq_("MAL2", attrs["Name"])
    eq_("Plasmodium falciparum", attrs["organism_name"])
def test_fromgff3():
    """Check the header and the item at index 1 of the sample GFF3 table."""
    table = etl.fromgff3(sample_gff3_filename)

    # the table must expose the standard GFF3 header
    eq_(GFF3_HEADER, table.header())

    record = list(table)[1]

    # positional fields 0..7, compared in order
    expected_columns = (
        'apidb|MAL1',
        'ApiDB',
        'supercontig',
        1,
        643292,
        '.',
        '+',
        '.',
    )
    for position, expected in enumerate(expected_columns):
        eq_(expected, record[position])

    # field 8 holds the parsed attributes, accessed by key
    attributes = record[8]
    expected_attributes = (
        ('ID', 'apidb|MAL1'),
        ('Name', 'MAL1'),
        ('organism_name', 'Plasmodium falciparum'),
    )
    for key, expected in expected_attributes:
        eq_(expected, attributes[key])
def test_fromgff3():
    """Verify the header and every field of the item at index 1 of the sample table."""
    source = etl.fromgff3(sample_gff3_filename)
    eq_(GFF3_HEADER, source.header())

    target = list(source)[1]

    # the eight leading positional fields
    leading_expected = [
        "apidb|MAL1",
        "ApiDB",
        "supercontig",
        1,
        643292,
        ".",
        "+",
        ".",
    ]
    for index, wanted in enumerate(leading_expected):
        eq_(wanted, target[index])

    # the attributes mapping stored in the ninth field
    attrs = target[8]
    eq_("apidb|MAL1", attrs["ID"])
    eq_("MAL1", attrs["Name"])
    eq_("Plasmodium falciparum", attrs["organism_name"])
# Usage examples for the petlx.bio readers. Each section below rebinds
# `table1` / `table2`, so the sections are meant to be read independently.
#
# NOTE(review): `etl` is used in this first section before any
# `import petl as etl` visible in this excerpt -- presumably it is imported
# just above this point in the file; confirm.
import petlx.bio
# whole named sequence
table1 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3')
# bare expression: has no effect when run as a script; presumably kept so the
# table is displayed when the example is run interactively
table1
# same file, region narrowed to a coordinate range on that sequence
table2 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3:110000-120000')
table2


# fromgff3()
############

import petl as etl
# activate bio extensions
import petlx.bio
# read an entire GFF3 file
table1 = etl.fromgff3('fixture/sample.gff')
table1.look(truncate=30)
# extract from a specific genome region via tabix
table2 = etl.fromgff3('fixture/sample.sorted.gff.gz', region='apidb|MAL5:1289593-1289595')
table2.look(truncate=30)


# fromvcf()
###########

import petl as etl
# activate bio extensions
import petlx.bio
# read a VCF file
table1 = etl.fromvcf('fixture/sample.vcf')
table1.look(truncate=20)
import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3') table1 table2 = etl.fromtabix('fixture/test.bed.gz', region='Pf3D7_02_v3:110000-120000') table2 # fromgff3() ############ import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromgff3('fixture/sample.gff') table1.look(truncate=30) # extract from a specific genome region via tabix table2 = etl.fromgff3('fixture/sample.sorted.gff.gz', region='apidb|MAL5:1289593-1289595') table2.look(truncate=30) # fromvcf() ########### import petl as etl # activate bio extensions import petlx.bio table1 = etl.fromvcf('fixture/sample.vcf') table1.look(truncate=20)
def test_fromgff3_region():
    """Assert the row counts ``fromgff3`` yields for two region strings.

    One query names only the sequence; the other adds a coordinate range
    on that sequence. Both read the same fixture file.
    """
    fixture = "fixture/sample.sorted.gff.gz"

    # region string -> expected number of rows, evaluated in this order
    cases = [
        ("apidb|MAL5", 7),
        ("apidb|MAL5:1289593-1289595", 4),
    ]

    for region_spec, wanted_rows in cases:
        selected = etl.fromgff3(fixture, region=region_spec)
        eq_(wanted_rows, selected.nrows())