Example #1
    def test_tf_metrics(self):
        summarizer = LexRankSummarizer()

        sentences = [
            ("this", "sentence", "is", "simple", "sentence"),
            ("this", "is", "simple", "sentence", "yes", "is", "too", "too",
             "too"),
        ]
        metrics = summarizer._compute_tf(sentences)

        expected = [
            {
                "this": 1 / 2,
                "is": 1 / 2,
                "simple": 1 / 2,
                "sentence": 1.0
            },
            {
                "this": 1 / 3,
                "is": 2 / 3,
                "yes": 1 / 3,
                "simple": 1 / 3,
                "sentence": 1 / 3,
                "too": 1.0
            },
        ]
        self.assertEqual(expected, metrics)
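The expected values pin down what _compute_tf returns: each term's count is divided by the largest term count in the same sentence, so the most frequent term ("sentence" in the first tuple, "too" in the second) always scores 1.0. A minimal re-implementation of that behaviour, inferred from the test rather than copied from sumy:

from collections import Counter

def compute_tf(sentences):
    # Max-frequency-normalised term frequency, computed per sentence.
    metrics = []
    for words in sentences:
        counts = Counter(words)
        max_count = max(counts.values())  # count of the most frequent term
        metrics.append({term: n / max_count for term, n in counts.items()})
    return metrics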
Example #2
from tqdm import tqdm
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer


class LexRankSumy(Baseline):  # Baseline is defined elsewhere in the project
    """Description from https://arxiv.org/pdf/1905.13164.pdf:
    LexRank (Erkan and Radev, 2004) is a widely-used graph-based extractive summarizer;
    we build a graph with paragraphs as nodes and edges weighted by tf-idf cosine similarity;
    we run a PageRank-like algorithm on this graph to rank and select paragraphs until
    the length of the ground-truth summary is reached.

    Implementation: wrapper of https://github.com/miso-belica/sumy
    """
    def __init__(self, name, language):
        super().__init__(name)
        self.language = language
        self.summarizer = LexRankSummarizer()

    def rank_sentences(self, dataset, document_column_name, **kwargs):
        all_sentences = []
        all_scores = []
        for document in tqdm(dataset[document_column_name]):
            sentences, scores = self.run_single(document)
            all_sentences.append(sentences)
            all_scores.append(scores)

        data = [{
            "sentences": sentences,
            "scores": scores
        } for sentences, scores in zip(all_sentences, all_scores)]
        return Baseline.append_column(dataset, data, self.name)

    def run_single(self, document):
        parser = PlaintextParser.from_string(document,
                                             Tokenizer(self.language))
        document = parser.document

        self.summarizer._ensure_dependencies_installed()

        sentences_words = [
            self.summarizer._to_words_set(s) for s in document.sentences
        ]
        if not sentences_words:
            # rank_sentences unpacks two values, so return an empty pair
            return [], []

        # tf-idf statistics feed the sentence-similarity matrix
        tf_metrics = self.summarizer._compute_tf(sentences_words)
        idf_metrics = self.summarizer._compute_idf(sentences_words)

        matrix = self.summarizer._create_matrix(sentences_words,
                                                self.summarizer.threshold,
                                                tf_metrics, idf_metrics)
        # PageRank-style power iteration yields one centrality score per sentence
        scores = self.summarizer.power_method(matrix, self.summarizer.epsilon)

        return list(map(str, document.sentences)), list(scores)
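run_single returns the raw LexRank scores rather than a finished summary, so sentence selection is left to the caller. One possible way to consume its output; the helper below is an illustrative sketch, not part of the original wrapper:

import numpy as np

def top_k_sentences(sentences, scores, k=3):
    # Pick the k highest-scoring sentences, then restore document
    # order so the extract reads naturally.
    ranked = np.argsort(scores)[::-1][:k]
    return [sentences[i] for i in sorted(ranked)]

# e.g. summary = top_k_sentences(*baseline.run_single(document))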
Example #3
from typing import List, Tuple

import numpy as np
import spacy
from sumy.summarizers.lex_rank import LexRankSummarizer

# Assumed pipeline: the original targets Japanese text, but any
# spaCy model with a POS tagger works here.
nlp = spacy.load("ja_core_news_sm")

def lexrank_scoring(text: str) -> Tuple[List[str], np.ndarray]:
    """
    LexRankアルゴリズムによって文に点数をつける。
    この点数は文の重要度とみなすことができる。

    Parameters
    ----------
    text : str
        分析対象のテキスト。

    Returns
    -------
    List[str]
        text を文のリストに分解したもの。
    np.ndarray
        文のリストに対応する重要度のリスト。
    """
    doc = nlp(text)

    # Build the list of sentences and the per-sentence word lists
    sentences = []
    corpus = []
    for sent in doc.sents:
        sentences.append(sent.text)
        tokens = []
        for token in sent:
            # Keep only nouns, adverbs, adjectives and verbs
            if token.pos_ in ('NOUN', 'ADV', 'ADJ', 'VERB'):
                # Use the lemma (Token.lemma_) to normalise inflected forms
                tokens.append(token.lemma_)
        corpus.append(tokens)
    # sentences = [sentence 0, sentence 1, ...]
    # corpus = [[word 0 of sentence 0, word 1 of sentence 0, ...],
    #           [word 0 of sentence 1, word 1 of sentence 1, ...], ...]

    # LexRank scoring with the sumy library
    lxr = LexRankSummarizer()
    tf_metrics = lxr._compute_tf(corpus)
    idf_metrics = lxr._compute_idf(corpus)
    matrix = lxr._create_matrix(corpus, lxr.threshold, tf_metrics, idf_metrics)
    scores = lxr.power_method(matrix, lxr.epsilon)
    # scores = [importance of sentence 0, importance of sentence 1, ...]

    return sentences, scores
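A possible call site, assuming nlp was loaded as above; the ranking loop is illustrative rather than taken from the original source:

text = "..."  # any document to score
sentences, scores = lexrank_scoring(text)

# Print sentences from most to least central.
for score, sentence in sorted(zip(scores, sentences), reverse=True):
    print(f"{score:.4f}  {sentence}")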