Example #1
0
    def extract(self, document, count=5):
        """Return a tuple of the best keywords found in *document*.

        Args:
            document: The raw text to extract keywords from.
            count: Maximum number of keywords to return (default 5;
                previously hard-coded).

        Returns:
            A tuple of keyword strings selected by ``pick_keywords``.
        """
        # Split the text into sentences, then each sentence into word tokens.
        sentences = stringUtils.sent_tokenize(document)
        tokens = [stringUtils.word_tokenize(s) for s in sentences]

        # TODO: POS-tag the tokens so only nouns are considered as keywords.
        # TODO: stem the tokens to improve rating accuracy.
        ratings = self.compute_ratings(tokens)
        result = self.pick_keywords(ratings, count)

        return tuple(result)
Example #2
0
    def summarize(self, document, summaryLength):
        """Return a tuple of the *summaryLength* best sentences of *document*.

        The document is sentence-tokenized, each sentence is stemmed and
        word-tokenized, and sentences are rated via a normalized cosine
        similarity matrix before the top ones are picked.
        """
        # Sentence and (stemmed) word tokenization of the input text.
        sentences = stringUtils.sent_tokenize(document)
        stemmed_tokens = [
            stringUtils.word_tokenize(sentence, stem=True)
            for sentence in sentences
        ]

        # Rate every sentence from the normalized similarity matrix.
        similarity = self.compute_cosine(stemmed_tokens, self._treshold)
        normalized = self.normalize_matrix(similarity)
        scores = self.compute_ratings(normalized, self._epsilon)

        return tuple(self.pick_best_sentences(sentences, scores, summaryLength))
    def test_sent_tokenize(self):
        """Does it successfully tokenize sentences?"""
        text = "A first line.\nNext line."

        tokenized = stringUtils.sent_tokenize(text)

        self.assertTupleEqual(("A first line.", "Next line."), tokenized)
    def test_sent_tokenize_string_with_single_quotation_marks(self):
        """Does it successfully tokenize a sentence containing ‘ and ’ ?"""
        # Same value as the original triple-quoted literal, including the
        # twelve leading spaces on the second line.
        text = "A first line.\n            I’ve next line here."

        tokenized = stringUtils.sent_tokenize(text)

        self.assertTupleEqual(
            ("A first line.", "I've next line here."),
            tokenized,
        )
    def test_sent_tokenize_string_with_double_qutation_marks(self):
        """Does it successfully tokenize sentences containing “ and ” ?"""
        text = "A first line.\nNext line is, “inside non ascii double quotes.”"

        tokenized = stringUtils.sent_tokenize(text)

        self.assertTupleEqual(
            ("A first line.", 'Next line is, "inside non ascii double quotes."'),
            tokenized,
        )
    def test_sent_tokenize_string_with_tabs(self):
        """Does it successfully tokenize sentences containing tabs?"""
        text = "A first \t\tline. \t\t\n\t\t\nNext line."

        tokenized = stringUtils.sent_tokenize(text)

        self.assertTupleEqual(("A first line.", "Next line."), tokenized)