Code Example #1
 def extract(self, text):
     """Extract candidate terms and noun phrases from ``text``.

     Returns a ``(terms, np_terms)`` tuple of dicts: single-word terms
     and multi-word noun-phrase terms, each reduced to the entries that
     pass ``self.filter``.
     """
     tokens = self.tokenizer.tokenize(text)
     tagged_terms = self.tagger.tag(tokens)
     # Single-word terms and multi-word noun-phrase terms are collected
     # separately via self._add.
     terms = {}
     np_terms = {}
     
     # Chunked noun phrases come back as Tree nodes; plain (word, tag)
     # tuples are tokens that fell outside any NP chunk.
     noun_phrases = [
         node
         for node in self.np_finder.parse(tagged_terms)
         if not isinstance(node, tuple)]
     
     for node in noun_phrases:
         # Flatten the chunk into (word, tag, IOB-label) triples.
         coll_tag = tree2conlltags(node)
         # Chunks spanning more than one token become noun-phrase terms.
         if len(coll_tag) > 1:
             mterm = [
                 term.lower()
                 for (term, tag, temp) in coll_tag
                 if len(term) > 1
                 ]
             
             mterm = ' '.join(mterm)
             self._add(mterm, np_terms)
         # Every noun token (tag starting with 'N') also becomes a
         # single-word term, with plurals reduced to their singular form.
         for (term, tag, temp) in coll_tag:
             if tag.startswith('N') and len(term) > 1:
                 if tag in ['NNS', 'NNPS']:
                     term = singularize(term)
                 self._add(term.lower(), terms)
     
     # Iterate over a snapshot of the keys so entries can be deleted
     # while filtering; mutating a dict during iteration raises a
     # RuntimeError in Python 3.
     for term in list(terms.keys()):
         if not self.filter(term, terms[term]):
             del terms[term]

     for term in list(np_terms.keys()):
         if not self.filter(term, np_terms[term]):
             del np_terms[term]
     
     return (terms, np_terms)
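
The extract method above depends on several collaborators that are not shown: self.tokenizer, self.tagger, self.np_finder, self._add, self.filter, plus a module-level singularize. Below is a minimal sketch of how such a class could be wired up with NLTK; the class name, the NP grammar, the count-based _add, and the filter threshold are illustrative assumptions, not the original implementation, and singularize would come from an inflector library (see Code Example #2).

import nltk
from nltk.chunk import tree2conlltags  # used by extract() above

class TermExtractor:  # hypothetical name; the original class is not shown
    def __init__(self):
        # Requires the NLTK 'averaged_perceptron_tagger' data package.
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.tagger = nltk.tag.PerceptronTagger()
        # Toy grammar: an optional run of adjectives followed by nouns.
        self.np_finder = nltk.RegexpParser('NP: {<JJ>*<NN.*>+}')

    def _add(self, term, terms):
        # Count how often each candidate term occurs.
        terms[term] = terms.get(term, 0) + 1

    def filter(self, term, occurrences):
        # Illustrative threshold: keep terms that occur more than once.
        return occurrences > 1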
Code Example #2
 def test_singularization(self):
     """This is really tested elsewhere;
     see http://www.bermi.org/inflector/
     """
     # failUnless is deprecated (and removed in Python 3.12); assertEqual
     # also reports the mismatching values when the test fails.
     self.assertEqual(singularize("axes"), "axis")
     self.assertEqual(singularize(":"), ":")