def extract(self, text):
    """Extract terms and multi-word noun phrases from *text*.

    Tokenizes and POS-tags the text, then chunks it with ``self.np_finder``.
    Returns a ``(terms, np_terms)`` tuple of dicts mapping each extracted
    term to its occurrence data (as recorded by ``self._add``):

    * ``terms`` -- individual nouns, lowercased, with plural tags
      (``NNS``/``NNPS``) singularized.
    * ``np_terms`` -- multi-word noun phrases, lowercased and joined
      with single spaces.

    Both dicts are post-filtered through ``self.filter``.
    """
    tokens = self.tokenizer.tokenize(text)
    tagged_terms = self.tagger.tag(tokens)
    terms = {}
    np_terms = {}

    # The parse tree yields plain (word, tag) tuples for tokens outside
    # any chunk; only subtree nodes are noun phrases.
    noun_phrases = [
        node for node in self.np_finder.parse(tagged_terms)
        if not isinstance(node, tuple)]

    for node in noun_phrases:
        coll_tag = tree2conlltags(node)
        if len(coll_tag) > 1:
            # Multi-word phrase: join lowercased words, skipping
            # single-character tokens.
            mterm = ' '.join(
                term.lower() for (term, tag, temp) in coll_tag
                if len(term) > 1)
            self._add(mterm, np_terms)
        for (term, tag, temp) in coll_tag:
            if tag.startswith('N') and len(term) > 1:
                if tag in ('NNS', 'NNPS'):
                    term = singularize(term)
                self._add(term.lower(), terms)

    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live keys view raises RuntimeError on Python 3.
    for term in list(terms):
        if not self.filter(term, terms[term]):
            del terms[term]
    for term in list(np_terms):
        if not self.filter(term, np_terms[term]):
            del np_terms[term]
    return (terms, np_terms)
def test_singularization(self):
    """Spot-check ``singularize``.

    This is really tested elsewhere, see
    http://www.bermi.org/inflector/
    """
    # assertEqual replaces failUnless(a == b): failUnless has been a
    # deprecated alias since Python 2.7, and assertEqual reports both
    # values on failure instead of a bare boolean.
    self.assertEqual(singularize("axes"), "axis")
    # Non-word input must pass through unchanged.
    self.assertEqual(singularize(":"), ":")