def t_unknown_seq(self):
    """Prepare unknown base sequences with the correct length.
    """
    rec_dict = GFFAddingIterator().get_all_features(self._test_gff_file)
    # each chromosome gets a placeholder sequence of the declared length
    for rec_id, expected_len in [("I", 12766937), ("X", 17718531)]:
        assert len(rec_dict[rec_id].seq) == expected_len
def t_fasta_directive(self):
    """Parse FASTA sequence information contained in a GFF3 file.
    """
    recs = GFFAddingIterator().get_all_features(self._gff_file)
    assert len(recs) == 1
    chr_rec = recs['chr17']
    # the embedded ##FASTA section supplies the record's sequence
    assert str(chr_rec.seq) == "GATTACAGATTACA"
def t_ensembl_nested_features(self):
    """Test nesting of features with GFF2 files using transcript_id.
    """
    rec_dict = GFFAddingIterator().get_all_features(self._ensembl_file)
    chrom_rec = rec_dict["I"]
    assert len(chrom_rec.features) == 2
    # all children should be collected under the first top-level feature
    first_feature = chrom_rec.features[0]
    assert len(first_feature.sub_features) == 32
def t_gff3_noval_attrib(self):
    """Parse GFF3 file from NCBI with a key/value pair with no value.
    """
    recs = GFFAddingIterator().get_all_features(self._test_ncbi)
    assert len(recs) == 1
    first_feature = recs.values()[0].features[0]
    # a value-less attribute is normalized to the string "true"
    assert first_feature.qualifiers["pseudo"] == ["true"]
def t_gff2_iteration(self):
    """Test iterated features with GFF2 files, breaking without parents.
    """
    iterator = GFFAddingIterator()
    # materialize every chunk the iterator yields at this break size
    chunks = list(iterator.get_features(self._wormbase_file,
                                        target_lines=15))
    assert len(chunks) == 3
def t_solid_iterator(self):
    """Iterated parsing in a flat file without nested features.
    """
    iterator = GFFAddingIterator()
    per_chunk_counts = []
    for rec_dict in iterator.get_features(self._test_gff_file,
                                          target_lines=5):
        per_chunk_counts.append([len(rec.features)
                                 for rec in rec_dict.values()])
    # no chunk should exceed the requested target of 5 features
    chunk_totals = [sum(counts) for counts in per_chunk_counts]
    assert max(chunk_totals) == 5
    assert len(per_chunk_counts) == 26, len(per_chunk_counts)
def t_gff3_multiple_ids(self):
    """Deal with GFF3 with non-unique ID attributes, using NCBI example.
    """
    recs = GFFAddingIterator().get_all_features(self._test_ncbi)
    assert len(recs) == 1
    # 4 feature sets, same ID, different positions, different attributes
    dup_features = recs.values()[0].features[1:]
    assert len(dup_features) == 4
    for feature in dup_features:
        assert len(feature.sub_features) == 3
def t_gff3_iterator(self):
    """Iterated parsing in GFF3 files with nested features.
    """
    iterator = GFFAddingIterator()
    counts = [[len(rec.features) for rec in rec_dict.values()]
              for rec_dict in iterator.get_features(self._test_gff_file,
                                                    target_lines=70)]
    # should be one big set because we don't have a good place to split
    assert len(counts) == 1
    assert counts[0][0] == 59
def t_basic_directives(self):
    """Parse out top level meta-data supplied in a GFF3 file.
    """
    recs = GFFAddingIterator().get_all_features(self._gff_file)
    anns = recs['chr17'].annotations
    # every ## directive should surface as a record annotation
    expected = [
        ('gff-version', ['3']),
        ('attribute-ontology', ['baz']),
        ('feature-ontology', ['bar']),
        ('source-ontology', ['boo']),
        ('sequence-region', [('foo', '1', '100'),
                             ('chr17', '62467934', '62469545')]),
    ]
    for key, value in expected:
        assert anns[key] == value
def t_wormbase_nested_features(self):
    """Test nesting of features with GFF2 files using Transcript only.
    """
    rec_dict = GFFAddingIterator().get_all_features(self._wormbase_file)
    assert len(rec_dict) == 3
    chrom_features = rec_dict["I"].features
    parents = [f for f in chrom_features if f.type == "Transcript"]
    assert len(parents) == 1
    # nothing should have fallen back to a synthesized parent
    inferred = [f for f in chrom_features if f.type == "inferred_parent"]
    assert len(inferred) == 0
    transcript = parents[0]
    assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
    assert len(transcript.sub_features) == 46
def t_line_adjust(self):
    """Adjust lines during parsing to fix potential GFF problems.
    """
    def swap_id_and_index(results):
        # move the original record ID into a read_name qualifier and
        # promote the 'i' qualifier to be the record ID instead
        original_id = results['rec_id']
        results['rec_id'] = results['quals']['i'][0]
        results['quals']['read_name'] = [original_id]
        return results
    gff_iterator = GFFAddingIterator(line_adjust_fn=swap_id_and_index)
    rec_dict = gff_iterator.get_all_features(self._test_gff_file)
    assert len(rec_dict) == 1
    assert rec_dict.keys() == ['1']
    only_rec = rec_dict.values()[0]
    assert len(only_rec.features) == 112
    assert only_rec.features[0].qualifiers['read_name'] == \
        ['3_336_815_F3']
def main(seq_file, gff_file):
    """Load GFF features, attached to FASTA sequences, into a BioSQL database.

    seq_file -- FASTA file providing the base sequences for each record.
    gff_file -- GFF file with the features to parse onto those records.

    Re-creates the target BioSQL sub-database when it already exists, and
    rolls back the transaction on any failure before re-raising.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    # restrict the parse to the (source, type) pairs listed above
    limit_info = dict(gff_source_type = rnai_types + gene_types)
    # --
    print "Parsing FASTA sequence file..."
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    print "Parsing GFF data file..."
    # seed the iterator with the FASTA records so features attach to them
    feature_adder = GFFAddingIterator(seq_dict)
    rec_dict = feature_adder.get_all_features(gff_file, limit_info)
    recs = rec_dict.values()
    print "Writing to BioSQL database..."
    server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
            passwd=passwd, host=host, db=db_name)
    try:
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            # drop any existing copy so the load starts from a clean slate
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)
        db = server[biodb_name]
        db.load(recs)
        server.adaptor.commit()
    except:
        # undo the partial load, then propagate the original error
        # NOTE(review): bare except is deliberate here — it re-raises, only
        # guaranteeing the rollback happens first
        server.adaptor.rollback()
        raise
def t_gff3_iterator_limit(self):
    """Iterated interface using a limit query on GFF3 files.
    """
    gff_iterator = GFFAddingIterator()
    # only Coding_transcript features on chromosome I
    cds_limit_info = dict(
        gff_source_type=[('Coding_transcript', 'gene'),
                         ('Coding_transcript', 'mRNA'),
                         ('Coding_transcript', 'CDS')],
        gff_id=['I'],
    )
    it_recs = list(gff_iterator.get_features(self._test_gff_file,
                                             limit_info=cds_limit_info))
    assert len(it_recs) == 1
    first_mrna = it_recs[0]["I"].features[0].sub_features[0]
    for child in first_mrna.sub_features:
        assert child.type == "CDS", child
def t_gff3_to_gff3(self):
    """Read in and write out GFF3 without any loss of information.
    """
    gff_iterator = GFFAddingIterator()
    recs = gff_iterator.get_all_features(self._test_gff_file)
    out_handle = StringIO.StringIO()
    writer = GFF3Writer()
    writer.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())
    # round-trip through a real file so the parser sees the written GFF3
    (tmp_fd, tmp_file) = tempfile.mkstemp(dir=self._test_dir)
    # fix: close the descriptor mkstemp opened; it was discarded and leaked
    os.close(tmp_fd)
    try:
        tmp_handle = open(tmp_file, "w")
        try:
            tmp_handle.write(wrote_handle.read())
        finally:
            tmp_handle.close()
        recs_two = gff_iterator.get_all_features(tmp_file)
    finally:
        os.remove(tmp_file)
    orig_rec = recs.values()[0]
    # fix: compare against the re-parsed records (was recs.values()[0],
    # which made the test compare the original record to itself and
    # pass unconditionally)
    re_rec = recs_two.values()[0]
    assert len(orig_rec.features) == len(re_rec.features)
    for i, orig_f in enumerate(orig_rec.features):
        assert str(orig_f) == str(re_rec.features[i])