def extract(self, document, count=5):
    """Extract the top keywords from *document*.

    Args:
        document: Raw text to analyze.
        count: Maximum number of keywords to return (default 5,
            preserving the previous hard-coded behavior).

    Returns:
        A tuple of the highest-rated keywords.
    """
    # Split the document into sentences, then each sentence into word tokens.
    sentences = stringUtils.sent_tokenize(document)
    tokens = [stringUtils.word_tokenize(s) for s in sentences]
    # TODO: need to pos tag words for picking only nouns
    # TODO: need to stem tokens for improving accuracy
    ratings = self.compute_ratings(tokens)
    result = self.pick_keywords(ratings, count)
    return tuple(result)
def test_word_tokenize_with_stem(self):
    """Verify that word tokenization applies stemming when requested."""
    tokens = stringUtils.word_tokenize(
        "crying buying", filter_stopwords=False, stem=True
    )
    self.assertTupleEqual(("cry", "buy"), tokens)
def summarize(self, document, summaryLength):
    """Produce an extractive summary of *document*.

    Args:
        document: Raw text to summarize.
        summaryLength: Number of sentences to include in the summary.

    Returns:
        A tuple of the best-rated sentences.
    """
    # Break the document into sentences and stemmed word tokens.
    all_sentences = stringUtils.sent_tokenize(document)
    stemmed = [
        stringUtils.word_tokenize(sentence, stem=True)
        for sentence in all_sentences
    ]
    # Build and normalize the sentence-similarity matrix, then rate
    # each sentence and keep the top-scoring ones.
    similarity = self.compute_cosine(stemmed, self._treshold)
    normalized = self.normalize_matrix(similarity)
    scores = self.compute_ratings(normalized, self._epsilon)
    chosen = self.pick_best_sentences(all_sentences, scores, summaryLength)
    return tuple(chosen)
def test_word_tokenize(self):
    """Verify basic word tokenization with stemming and stopwords off."""
    tokens = stringUtils.word_tokenize(
        "This is a sample.", filter_stopwords=False, stem=False
    )
    self.assertTupleEqual(("this", "is", "a", "sample"), tokens)
def test_word_tokenize_with_stopwords_filter(self):
    """Verify that tokenization drops stopwords when the filter is enabled."""
    tokens = stringUtils.word_tokenize(
        "How do you choose the article that's listed on the site.",
        filter_stopwords=True,
    )
    self.assertTupleEqual(("choose", "article", "listed", "site"), tokens)