Esempio n. 1
0
 def test_filter_insignificant(self):
     '''filter_insignificant removes 'DT'-tagged tokens from parsed leaves.'''
     parsed = self.extractor._parse_sentence(self.sentence)
     # Precondition: the unfiltered leaves contain at least one 'DT' tag.
     tags_before = [pos for _, pos in parsed.leaves()]
     assert_true('DT' in tags_before)
     # After filtering, no (word, tag) pair tagged 'DT' may remain.
     significant = filter_insignificant(parsed.leaves())
     tags_after = [pos for _, pos in significant]
     assert_true("DT" not in tags_after)
Esempio n. 2
0
 def extract(self, text):
     '''Return a list of noun phrases (strings) for body of text.

     The text is split into sentences and each sentence is parsed with
     ``self._parse_sentence``. Every element of the parse that is an
     ``nltk.tree.Tree`` labelled ``'NP'``, still has at least one token
     after insignificant-token filtering, and matches ``self.CFG`` is
     normalized and rendered as a string.

     :param text: the text to extract noun phrases from.
     :returns: list of noun-phrase strings, in the order they are found.
     '''
     noun_phrases = []
     for sentence in nltk.tokenize.sent_tokenize(text):
         for subtree in self._parse_sentence(sentence):
             # Only NP-labelled trees are noun-phrase candidates.
             if not isinstance(subtree, nltk.tree.Tree):
                 continue
             if subtree.label() != 'NP':
                 continue
             # Filter once, using this extractor's own suffix list, and
             # reuse the result for both the emptiness check and the
             # output so the two can never disagree (the guard previously
             # relied on the helper's default suffixes instead).
             significant = filter_insignificant(
                 subtree, self.INSIGNIFICANT_SUFFIXES)
             if not significant:
                 continue
             # The grammar match is made against the unfiltered subtree.
             if not _is_match(subtree, cfg=self.CFG):
                 continue
             noun_phrases.append(tree2str(_normalize_tags(significant)))
     return noun_phrases