def t_gff_annotations(self):
     """Verify annotations loaded from the annotation GFF test file.

     Features from ``self._test_gff_ann_file`` are added to an empty base
     dictionary; the annotations stored on record 'I' must hold exactly
     the two expected entries.
     """
     adder = GFFMapReduceFeatureAdder(dict())
     adder.add_features(self._test_gff_ann_file)
     annotations = adder.base['I'].annotations
     assert len(annotations.keys()) == 2
     assert annotations['source'] == ['Expr_profile']
     assert annotations['expr_profile'] == ['B0019.1']
 def t_no_dict_error(self):
     """Check that a KeyError is raised when there is nothing to map onto.

     The adder is built with an empty base dictionary and
     ``create_missing=False``; ``add_features`` must then raise
     ``KeyError``.  Completing without that error fails the test.
     """
     adder = GFFMapReduceFeatureAdder(dict(), create_missing=False)
     try:
         adder.add_features(self._test_gff_file)
     except KeyError:
         # expected outcome
         return
     # no error -- problem
     raise AssertionError('Did not complain with missing dictionary')
 def t_local_map_reduce(self):
     """Run the general map reduce framework without parallelization.

     Restricts parsing to gene, mRNA and CDS lines with id 'I' and checks
     the number of features attached to record 'I'.
     """
     limits = {
         'gff_type': ["gene", "mRNA", "CDS"],
         'gff_id': ['I'],
     }
     adder = GFFMapReduceFeatureAdder(dict(), None)
     adder.add_features(self._test_gff_file, limits)
     record = adder.base['I']
     assert len(record.features) == 32
 def t_tricky_semicolons(self):
     """Parse tricky semi-colon positions in WormBase GFF2.

     Only ('Genomic_canonical', 'region') lines are read; the single
     resulting feature on record 'I' must keep the semi-colons inside its
     'Note' qualifier values.
     """
     limits = {'gff_source_type': [('Genomic_canonical', 'region')]}
     adder = GFFMapReduceFeatureAdder(dict())
     adder.add_features(self._wormbase_file, limits)
     assert len(adder.base['I'].features) == 1
     region = adder.base['I'].features[0]
     expected_note = ['Clone cTel33B; Genbank AC199162',
                      'Clone cTel33B; Genbank AC199162']
     assert region.qualifiers['Note'] == expected_note
 def t_jgi_gff(self):
     """Parse JGI formatted GFF2, nested using transcriptId and proteinID.

     Checks the location, type and sub-feature count of the first feature
     on record 'chr_1', then two qualifiers of its second sub-feature.
     """
     adder = GFFMapReduceFeatureAdder(dict())
     adder.add_features(self._jgi_file)
     parent = adder.base['chr_1'].features[0]
     assert parent.location.nofuzzy_start == 37060
     assert parent.location.nofuzzy_end == 38216
     assert parent.type == 'inferred_parent'
     assert len(parent.sub_features) == 6
     child_quals = parent.sub_features[1].qualifiers
     assert child_quals['proteinId'] == ['873']
     assert child_quals['phase'] == ['0']
 def t_basic_solid_parse(self):
     """Check basic parsing of a SOLiD GFF results file.

     Inspects the first feature stored under record '3_341_424_F3':
     its location, qualifiers, strand and type.
     """
     adder = GFFMapReduceFeatureAdder(dict())
     adder.add_features(self._test_gff_file)
     read = adder.base['3_341_424_F3'].features[0]
     assert read.location.nofuzzy_start == 102716
     assert read.location.nofuzzy_end == 102736
     assert len(read.qualifiers) == 7
     assert read.qualifiers['score'] == ['10.6']
     assert read.qualifiers['source'] == ['solid']
     assert read.strand == -1
     assert read.type == 'read'
     assert read.qualifiers['g'] == ['T2203031313223113212']
     assert len(read.qualifiers['q']) == 20
 def t_basic_attributes(self):
     """Parse out basic attributes of GFF2 from Ensembl GTF.

     Only ('snoRNA', 'exon') lines are read from ``self._ensembl_file``.
     The single resulting feature on record 'I' must expose exactly the
     expected qualifier names, along with selected qualifier values.
     """
     limit_info = dict(
             gff_source_type = [('snoRNA', 'exon')]
             )
     feature_adder = GFFMapReduceFeatureAdder(dict())
     feature_adder.add_features(self._ensembl_file, limit_info)
     assert len(feature_adder.base['I'].features) == 1
     test_feature = feature_adder.base['I'].features[0]
     # sorted() accepts any iterable, so this works whether keys() returns
     # a list (Python 2) or a view without a .sort() method (Python 3).
     qual_keys = sorted(test_feature.qualifiers.keys())
     assert qual_keys == ['Parent', 'exon_number', 'gene_id', 'gene_name',
             'source', 'transcript_id', 'transcript_name']
     assert test_feature.qualifiers['source'] == ['snoRNA']
     assert test_feature.qualifiers['transcript_name'] == ['NR_001477.2']
     assert test_feature.qualifiers['exon_number'] == ['1']
 def t_disco_map_reduce(self):
     """Run the map reduce framework parallelized using disco.

     When ``disco`` or ``simplejson`` cannot be imported the test prints a
     notice and returns without checking anything.  Otherwise features on
     record 'I' matching the listed source/type pairs are added using the
     host in ``self._disco_host`` and the top-level feature count is
     checked.
     """
     # this needs to be more generalized but fails okay with no disco
     try:
         import disco
         import simplejson
     except ImportError:
         # Parenthesized single-argument form prints the same text under
         # the Python 2 print statement and the Python 3 print function;
         # the bare statement form is a SyntaxError on Python 3.
         print("Skipping -- disco and json not found")
         return
     cds_limit_info = dict(
             gff_source_type = [('Non_coding_transcript', 'gene'),
                          ('Coding_transcript', 'gene'),
                          ('Coding_transcript', 'mRNA'),
                          ('Coding_transcript', 'CDS')],
             gff_id = ['I']
             )
     feature_adder = GFFMapReduceFeatureAdder(dict(),
             disco_host=self._disco_host)
     feature_adder.add_features(self._test_gff_file, cds_limit_info)
     final_rec = feature_adder.base['I']
     # second gene feature is multi-parent
     assert len(final_rec.features) == 2 # two gene feature