Exemple #1
0
 def classify(self,db,fastain):
     """Label every record of a FASTA file with the trained classifier.

     For each record the protein ID is taken from the sixth
     ``|``-separated field of the record ID, its text is fetched with
     ``genbank.proteinQuery``, tokenised with ``word_reg`` and turned into
     features by ``self.gene_features``; the features are then passed to
     ``self.classifier.batch_classify``.  Proteins without any text (no
     rows returned, or only empty text) are labelled ``'na'``.

     Args:
         db: Passed unchanged to ``genbank.proteinQuery``.
         fastain: FASTA input handed to ``SeqIO.parse(fastain, "fasta")``.

     Returns:
         A list of ``(proteinID, label)`` pairs, in input order.

     Raises:
         IndexError: If a record ID has fewer than six ``|``-separated
             fields.
         AssertionError: If two consecutive classified records produce
             identical tokens or identical feature sets.
     """
     protein_ids, labels = [], []
     prev_featureset = ''
     prev_text = ''
     for seq_record in SeqIO.parse(fastain, "fasta"):
         protein_id = seq_record.id.split("|")[5]
         query_rows = genbank.proteinQuery(protein_id, db)
         # Concatenate the second column of every row.  Building the text
         # this way (rather than unpacking zip(*rows)) lets an empty result
         # set yield '' and fall through to the 'na' label instead of
         # raising ValueError.
         text = ''.join(str(row[1]) for row in query_rows)
         if not text:
             label = ['na']
         else:
             tokens = word_reg.findall(text)
             featureset = self.gene_features(tokens)
             # NOTE(review): these look like debugging sanity checks that
             # the query is not returning the previous record's data; they
             # also fire on genuinely identical neighbours and vanish
             # under ``python -O`` -- confirm whether they are still wanted.
             assert tokens != prev_text
             assert featureset != prev_featureset
             prev_featureset = featureset
             prev_text = tokens
             label = self.classifier.batch_classify([featureset])

         protein_ids.append(protein_id)
         labels.extend(label)
     # list(...) keeps the Python 2 return type and avoids handing callers
     # a one-shot iterator on Python 3.
     return list(zip(protein_ids, labels))