def extract(self, document, keyword_count=5):
    """Return the keywords picked from ``document`` as a tuple.

    The text is split into sentences and every sentence into word tokens.
    The tokens are rated by ``self.compute_ratings`` and the keywords are
    then chosen from those ratings by ``self.pick_keywords``.

    Args:
        document: Text to extract keywords from.
        keyword_count: Value forwarded to ``self.pick_keywords`` as its
            second argument (presumably the number of keywords to pick;
            confirm against ``pick_keywords``). Defaults to 5.

    Returns:
        tuple: The result of ``self.pick_keywords`` converted to a tuple.
    """
    # tokenize text
    sentences = stringUtils.sent_tokenize(document)
    tokens = [stringUtils.word_tokenize(s) for s in sentences]

    # TODO: need to pos tag words for picking only nouns
    # TODO: need to stem tokens for improving accuracy

    ratings = self.compute_ratings(tokens)
    # The count is a parameter so callers are not locked to five keywords.
    result = self.pick_keywords(ratings, keyword_count)
    return tuple(result)
def summarize(self, document, summaryLength):
    """Return the sentences selected for the summary, as a tuple.

    Pipeline: sentence-tokenize ``document``, word-tokenize each sentence
    (with ``stem=True``), build a cosine matrix using ``self._treshold``,
    normalize it, rate it using ``self._epsilon`` and finally let
    ``self.pick_best_sentences`` choose from the original sentences.

    Args:
        document: Text to summarize.
        summaryLength: Forwarded to ``self.pick_best_sentences``
            (presumably the number of sentences to keep; confirm there).

    Returns:
        tuple: The result of ``self.pick_best_sentences`` as a tuple.
    """
    # Break the text into sentences, then each sentence into word tokens.
    sentence_list = stringUtils.sent_tokenize(document)
    tokenized = []
    for sentence in sentence_list:
        tokenized.append(stringUtils.word_tokenize(sentence, stem=True))

    # Similarity matrix -> normalized matrix -> per-sentence ratings.
    similarity = self.compute_cosine(tokenized, self._treshold)
    normalized = self.normalize_matrix(similarity)
    scores = self.compute_ratings(normalized, self._epsilon)

    chosen = self.pick_best_sentences(sentence_list, scores, summaryLength)
    return tuple(chosen)
def test_sent_tokenize(self):
    """Two newline-separated sentences should come back as a 2-tuple."""
    text = (
        "A first line.\n"
        "Next line."
    )
    actual = stringUtils.sent_tokenize(text)

    wanted = (
        "A first line.",
        "Next line.",
    )
    self.assertTupleEqual(wanted, actual)
def test_sent_tokenize_string_with_single_quotation_marks(self):
    """Sentences with a typographic single quote (’) should tokenize.

    The expected output carries an ASCII apostrophe where the input has
    the typographic one.
    """
    text = """A first line. I’ve next line here."""
    actual = stringUtils.sent_tokenize(text)

    wanted = (
        "A first line.",
        "I've next line here.",
    )
    self.assertTupleEqual(wanted, actual)
def test_sent_tokenize_string_with_double_qutation_marks(self):
    """Sentences with typographic double quotes (“ ”) should tokenize.

    The expected output carries ASCII double quotes where the input has
    the typographic ones.
    """
    text = (
        "A first line.\n"
        "Next line is, "
        "“inside non ascii double quotes.”"
    )
    actual = stringUtils.sent_tokenize(text)

    wanted = (
        "A first line.",
        "Next line is, \"inside non ascii double quotes.\"",
    )
    self.assertTupleEqual(wanted, actual)
def test_sent_tokenize_string_with_tabs(self):
    """Tabs inside and around sentences should not appear in the output."""
    text = (
        "A first \t\tline. \t\t\n"
        "\t\t\n"
        "Next line."
    )
    actual = stringUtils.sent_tokenize(text)

    wanted = (
        "A first line.",
        "Next line.",
    )
    self.assertTupleEqual(wanted, actual)