def test_filter_insignificant(self):
    '''Check that filter_insignificant drops DT-tagged leaves of a chunk.'''
    parsed = self.extractor._parse_sentence(self.sentence)

    # Precondition: the unfiltered parse carries at least one 'DT' tag,
    # otherwise the check after filtering would pass trivially.
    assert_true(any(pos == 'DT' for _token, pos in parsed.leaves()))

    # After filtering, no leaf may still be tagged 'DT'.
    remaining = filter_insignificant(parsed.leaves())
    assert_true(all(pos != "DT" for _token, pos in remaining))
def extract(self, text):
    '''Return a list of noun phrases (strings) for body of text.

    The text is split into sentences with ``nltk.tokenize.sent_tokenize``
    and each sentence is parsed with ``self._parse_sentence``. Every
    direct child of the parse that is an ``nltk.tree.Tree`` labelled
    ``'NP'`` is a candidate; it is kept when it is still non-empty after
    ``filter_insignificant`` is applied with ``self.INSIGNIFICANT_SUFFIXES``
    and when ``_is_match`` accepts it for ``self.CFG``.

    :param text: The body of text to extract noun phrases from.
    :returns: A list with one string (built by ``tree2str``) per accepted
        noun phrase, in sentence order.
    '''
    noun_phrases = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        for subtree in self._parse_sentence(sentence):
            # Only noun-phrase subtrees are candidates; anything else
            # at the top level of the parse is skipped.
            if not isinstance(subtree, nltk.tree.Tree):
                continue
            if subtree.label() != 'NP':
                continue
            # Filter once, with the suffixes configured on this
            # extractor, and reuse the result for both the emptiness
            # guard and the output. Guarding with a different suffix set
            # than the one used for the result could let an
            # empty phrase through when the two sets disagree.
            significant = filter_insignificant(
                subtree, self.INSIGNIFICANT_SUFFIXES)
            if not significant:
                continue
            # The grammar check runs on the unfiltered subtree.
            if not _is_match(subtree, cfg=self.CFG):
                continue
            noun_phrases.append(tree2str(_normalize_tags(significant)))
    return noun_phrases