def t_unknown_seq(self):
     """Records parsed without sequence data get correctly sized unknown seqs.
     """
     parser = GFFAddingIterator()
     records = parser.get_all_features(self._test_gff_file)
     expected_lengths = {"I": 12766937, "X": 17718531}
     for rec_id, size in expected_lengths.items():
         assert len(records[rec_id].seq) == size
 def t_fasta_directive(self):
     """Inline ##FASTA sequence in a GFF3 file is attached to the record.
     """
     parser = GFFAddingIterator()
     rec_map = parser.get_all_features(self._gff_file)
     assert len(rec_map) == 1
     assert str(rec_map['chr17'].seq) == "GATTACAGATTACA"
 def t_ensembl_nested_features(self):
     """GFF2 features from Ensembl nest correctly via transcript_id.
     """
     parser = GFFAddingIterator()
     by_id = parser.get_all_features(self._ensembl_file)
     chrom_features = by_id["I"].features
     assert len(chrom_features) == 2
     assert len(chrom_features[0].sub_features) == 32
 def t_gff3_noval_attrib(self):
     """NCBI GFF3 attributes with a key but no value parse as ["true"].
     """
     parser = GFFAddingIterator()
     rec_map = parser.get_all_features(self._test_ncbi)
     assert len(rec_map) == 1
     first_feature = rec_map.values()[0].features[0]
     assert first_feature.qualifiers["pseudo"] == ["true"]
 def t_gff2_iteration(self):
     """Chunked GFF2 parsing finds break points even without parent info.
     """
     parser = GFFAddingIterator()
     # materialize the iterator; each item is one record dictionary chunk
     chunks = list(parser.get_features(self._wormbase_file,
             target_lines=15))
     assert len(chunks) == 3
 def t_solid_iterator(self):
     """Chunked parsing of a flat GFF file with no nested features.
     """
     parser = GFFAddingIterator()
     counts = []
     for chunk in parser.get_features(self._test_gff_file,
             target_lines=5):
         counts.append([len(rec.features) for rec in chunk.values()])
     # flat files split exactly at the requested chunk size
     assert max(sum(per_chunk) for per_chunk in counts) == 5
     assert len(counts) == 26, len(counts)
 def t_gff3_multiple_ids(self):
     """Non-unique ID attributes (NCBI-style GFF3) are kept distinct.
     """
     parser = GFFAddingIterator()
     rec_map = parser.get_all_features(self._test_ncbi)
     assert len(rec_map) == 1
     repeated = rec_map.values()[0].features[1:]
     # 4 feature sets, same ID, different positions, different attributes
     assert len(repeated) == 4
     assert all(len(f.sub_features) == 3 for f in repeated)
 def t_gff3_iterator(self):
     """Chunked GFF3 parsing with nesting yields one large batch.
     """
     parser = GFFAddingIterator()
     counts = [[len(rec.features) for rec in chunk.values()]
               for chunk in parser.get_features(self._test_gff_file,
                   target_lines=70)]
     # should be one big set because we don't have a good place to split
     assert len(counts) == 1
     assert counts[0][0] == 59
 def t_basic_directives(self):
     """Top-level GFF3 meta-data directives land in record annotations.
     """
     parser = GFFAddingIterator()
     annotations = parser.get_all_features(self._gff_file)['chr17'].annotations
     expected = {
         'gff-version': ['3'],
         'attribute-ontology': ['baz'],
         'feature-ontology': ['bar'],
         'source-ontology': ['boo'],
         'sequence-region': [('foo', '1', '100'),
                             ('chr17', '62467934', '62469545')],
     }
     for key, value in expected.items():
         assert annotations[key] == value
 def t_wormbase_nested_features(self):
     """GFF2 nesting driven solely by Transcript parent features.
     """
     parser = GFFAddingIterator()
     rec_map = parser.get_all_features(self._wormbase_file)
     assert len(rec_map) == 3
     chrom_features = rec_map["I"].features
     transcripts = [f for f in chrom_features
                    if f.type == "Transcript"]
     assert len(transcripts) == 1
     # no placeholder parents should have been fabricated
     inferred = [f for f in chrom_features
                 if f.type == "inferred_parent"]
     assert len(inferred) == 0
     transcript = transcripts[0]
     assert transcript.qualifiers["WormPep"][0] == "WP:CE40797"
     assert len(transcript.sub_features) == 46
 def t_line_adjust(self):
     """Rewrite lines on the fly to repair problematic GFF input.
     """
     def swap_id_and_index(results):
         # key records by the 'i' qualifier and keep the original
         # record name as a 'read_name' qualifier instead
         index = results['quals']['i'][0]
         results['quals']['read_name'] = [results['rec_id']]
         results['rec_id'] = index
         return results
     parser = GFFAddingIterator(line_adjust_fn=swap_id_and_index)
     rec_map = parser.get_all_features(self._test_gff_file)
     assert len(rec_map) == 1
     assert rec_map.keys() == ['1']
     only_rec = rec_map.values()[0]
     assert len(only_rec.features) == 112
     assert only_rec.features[0].qualifiers['read_name'] == \
             ['3_336_815_F3']
# Example #12
def main(seq_file, gff_file):
    """Load GFF features, attached to FASTA sequences, into a BioSQL database.

    seq_file -- path to a FASTA file providing the underlying sequences.
    gff_file -- path to the GFF file with features to attach to them.

    Writes the resulting records to a MySQL-backed BioSQL sub-database,
    replacing any existing sub-database of the same name.
    """
    # -- To be customized
    # You need to update these parameters to point to your local database
    user = "******"
    passwd = "cdev"
    host = "localhost"
    db_name = "wb199_gff"
    biodb_name = "wb199_gff_cds_pcr"
    # These need to be updated to reflect what you would like to parse
    # out of the GFF file. Set limit_info=None to parse everything, but
    # be sure the file is small or you may deal with memory issues.
    rnai_types = [('Orfeome', 'PCR_product'),
                ('GenePair_STS', 'PCR_product'),
                ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    # restrict parsing to (source, type) pairs listed above
    limit_info = dict(gff_source_type = rnai_types + gene_types)
    # --
    print "Parsing FASTA sequence file..."
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    print "Parsing GFF data file..."
    # seeding the iterator with the FASTA dictionary attaches features
    # to the corresponding SeqRecord objects
    feature_adder = GFFAddingIterator(seq_dict)
    rec_dict = feature_adder.get_all_features(gff_file, limit_info)
    recs = rec_dict.values()

    print "Writing to BioSQL database..."
    server = BioSeqDatabase.open_database(driver="MySQLdb", user=user,
            passwd=passwd, host=host, db=db_name)
    try:
        # drop and recreate the sub-database so reruns start from scratch;
        # the intermediate commit makes the removal permanent before reload
        if biodb_name not in server.keys():
            server.new_database(biodb_name)
        else:
            server.remove_database(biodb_name)
            server.adaptor.commit()
            server.new_database(biodb_name)
        db = server[biodb_name]
        db.load(recs)
        server.adaptor.commit()
    except:
        # deliberately broad: roll back partial writes on any failure
        # (including KeyboardInterrupt), then re-raise
        server.adaptor.rollback()
        raise
 def t_gff3_iterator_limit(self):
     """Iterated parsing honours a (source, type) and record ID limit query.
     """
     parser = GFFAddingIterator()
     limits = dict(
             gff_source_type = [('Coding_transcript', 'gene'),
                          ('Coding_transcript', 'mRNA'),
                          ('Coding_transcript', 'CDS')],
             gff_id = ['I']
             )
     chunks = [chunk for chunk in parser.get_features(self._test_gff_file,
             limit_info=limits)]
     assert len(chunks) == 1
     # gene -> mRNA -> CDS: everything below the first mRNA must be a CDS
     mrna_feature = chunks[0]["I"].features[0].sub_features[0]
     for sub in mrna_feature.sub_features:
         assert sub.type == "CDS", sub
    def t_gff3_to_gff3(self):
        """Read in and write out GFF3 without any loss of information.

        Round-trips: parse -> write with GFF3Writer -> re-parse from a
        temporary file, then compare the re-parsed features against the
        originals string-for-string.
        """
        gff_iterator = GFFAddingIterator()
        recs = gff_iterator.get_all_features(self._test_gff_file)
        out_handle = StringIO.StringIO()
        writer = GFF3Writer()
        writer.write(recs.values(), out_handle)
        (_, tmp_file) = tempfile.mkstemp(dir=self._test_dir)
        try:
            tmp_handle = open(tmp_file, "w")
            try:
                tmp_handle.write(out_handle.getvalue())
            finally:
                tmp_handle.close()
            recs_two = gff_iterator.get_all_features(tmp_file)
        finally:
            os.remove(tmp_file)

        orig_rec = recs.values()[0]
        # BUG FIX: compare against the re-parsed records (recs_two), not
        # recs again -- previously both sides read from recs, so the
        # round-trip check was vacuous and could never fail.
        re_rec = recs_two.values()[0]
        assert len(orig_rec.features) == len(re_rec.features)
        for i, orig_f in enumerate(orig_rec.features):
            assert str(orig_f) == str(re_rec.features[i])