def t_gff_annotations(self):
    """Verify GFF annotations that apply to a whole sequence.

    Loads ``self._test_gff_ann_file`` into an initially empty base
    dictionary and checks that record 'I' carries exactly two annotation
    keys, 'source' and 'expr_profile', each holding the expected
    single-item list.
    """
    adder = GFFMapReduceFeatureAdder(dict())
    adder.add_features(self._test_gff_ann_file)
    annotations = adder.base['I'].annotations
    assert len(annotations.keys()) == 2
    assert annotations['source'] == ['Expr_profile']
    assert annotations['expr_profile'] == ['B0019.1']
def t_no_dict_error(self):
    """Confirm a KeyError is raised when there is no record to map onto.

    The adder is built with an empty base dictionary and
    ``create_missing=False``; adding features from ``self._test_gff_file``
    is then expected to fail with ``KeyError``.

    Raises:
        AssertionError: if ``add_features`` completes without a KeyError.
    """
    adder = GFFMapReduceFeatureAdder(dict(), create_missing=False)
    try:
        adder.add_features(self._test_gff_file)
    except KeyError:
        # The expected outcome: the missing record was reported.
        pass
    else:
        # no error -- problem
        raise AssertionError('Did not complain with missing dictionary')
def t_local_map_reduce(self):
    """Exercise the map reduce framework without parallelization.

    Restricts parsing of ``self._test_gff_file`` to gene, mRNA and CDS
    entries on record 'I' and checks that 32 top-level features result.
    """
    limits = {
        "gff_type": ["gene", "mRNA", "CDS"],
        "gff_id": ['I'],
    }
    # Second positional argument is passed explicitly as None, matching
    # the non-parallel configuration this test targets.
    adder = GFFMapReduceFeatureAdder(dict(), None)
    adder.add_features(self._test_gff_file, limits)
    record = adder.base['I']
    assert len(record.features) == 32
def t_tricky_semicolons(self):
    """Check parsing of awkward semi-colon positions in WormBase GFF2.

    Limits ``self._wormbase_file`` to ('Genomic_canonical', 'region')
    entries, expects exactly one feature on record 'I', and checks that
    its 'Note' qualifier keeps the embedded semi-colon in both values.
    """
    limits = {'gff_source_type': [('Genomic_canonical', 'region')]}
    adder = GFFMapReduceFeatureAdder(dict())
    adder.add_features(self._wormbase_file, limits)
    features = adder.base['I'].features
    assert len(features) == 1
    region = features[0]
    # The same note text is expected twice in the qualifier list.
    note = 'Clone cTel33B; Genbank AC199162'
    assert region.qualifiers['Note'] == [note, note]
def t_jgi_gff(self):
    """Check parsing of JGI formatted GFF2, nested via transcriptId/proteinId.

    Loads ``self._jgi_file`` and inspects the first feature on 'chr_1':
    its coordinates, its 'inferred_parent' type, its six sub-features,
    and the 'proteinId' and 'phase' qualifiers of the second sub-feature.
    """
    adder = GFFMapReduceFeatureAdder(dict())
    adder.add_features(self._jgi_file)
    parent = adder.base['chr_1'].features[0]
    location = parent.location
    assert location.nofuzzy_start == 37060
    assert location.nofuzzy_end == 38216
    assert parent.type == 'inferred_parent'
    children = parent.sub_features
    assert len(children) == 6
    second_child = children[1]
    assert second_child.qualifiers['proteinId'] == ['873']
    assert second_child.qualifiers['phase'] == ['0']
def t_basic_solid_parse(self):
    """Check basic parsing of a SOLiD GFF results file.

    Loads ``self._test_gff_file`` and inspects the first feature of
    record '3_341_424_F3': coordinates, strand, type and the expected
    contents of its seven qualifiers.
    """
    adder = GFFMapReduceFeatureAdder(dict())
    adder.add_features(self._test_gff_file)
    read = adder.base['3_341_424_F3'].features[0]
    location = read.location
    qualifiers = read.qualifiers
    assert location.nofuzzy_start == 102716
    assert location.nofuzzy_end == 102736
    assert len(qualifiers) == 7
    assert qualifiers['score'] == ['10.6']
    assert qualifiers['source'] == ['solid']
    assert read.strand == -1
    assert read.type == 'read'
    assert qualifiers['g'] == ['T2203031313223113212']
    assert len(qualifiers['q']) == 20
def t_basic_attributes(self):
    """Parse out basic attributes of GFF2 from Ensembl GTF.

    Limits ``self._ensembl_file`` to ('snoRNA', 'exon') entries, expects
    exactly one feature on record 'I', and checks both the full set of
    qualifier keys and a few specific qualifier values.
    """
    limit_info = dict(
        gff_source_type=[('snoRNA', 'exon')]
    )
    feature_adder = GFFMapReduceFeatureAdder(dict())
    feature_adder.add_features(self._ensembl_file, limit_info)
    assert len(feature_adder.base['I'].features) == 1
    test_feature = feature_adder.base['I'].features[0]
    # sorted() rather than keys() + in-place .sort(): on Python 3 keys()
    # returns a view without .sort(); sorted() works on both 2 and 3.
    qual_keys = sorted(test_feature.qualifiers.keys())
    assert qual_keys == ['Parent', 'exon_number', 'gene_id', 'gene_name',
                         'source', 'transcript_id', 'transcript_name']
    assert test_feature.qualifiers['source'] == ['snoRNA']
    assert test_feature.qualifiers['transcript_name'] == ['NR_001477.2']
    assert test_feature.qualifiers['exon_number'] == ['1']
def t_disco_map_reduce(self):
    """Exercise the map reduce framework parallelized using disco.

    When ``disco`` or ``simplejson`` cannot be imported the test prints
    a notice and returns without checking anything. Otherwise it parses
    ``self._test_gff_file`` through ``self._disco_host``, limited to the
    listed source/type pairs on record 'I', and expects two top-level
    features.
    """
    # this needs to be more generalized but fails okay with no disco
    try:
        # Imported only to probe availability; the names are not used here.
        import disco
        import simplejson
    except ImportError:
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Skipping -- disco and json not found")
        return
    cds_limit_info = dict(
        gff_source_type=[
            ('Non_coding_transcript', 'gene'),
            ('Coding_transcript', 'gene'),
            ('Coding_transcript', 'mRNA'),
            ('Coding_transcript', 'CDS'),
        ],
        gff_id=['I']
    )
    feature_adder = GFFMapReduceFeatureAdder(dict(),
                                             disco_host=self._disco_host)
    feature_adder.add_features(self._test_gff_file, cds_limit_info)
    final_rec = feature_adder.base['I']
    # second gene feature is multi-parent
    assert len(final_rec.features) == 2  # two gene feature