def test_idf_metrics(self):
    """Check the IDF table produced for a small hand-built corpus.

    Every expected value equals the corpus size (six sentences, the empty
    one included) divided by the number of sentences containing the term.
    """
    corpus = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
        ("not", "every", "sentence", "makes", "me", "happy"),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day"),
    ]

    # Number of sentences in which each term occurs at least once.
    sentence_hits = {
        "this": 2,
        "is": 3,
        "yes": 2,
        "simple": 2,
        "sentence": 3,
        "too": 1,
        "not": 1,
        "every": 2,
        "makes": 1,
        "me": 1,
        "happy": 2,
        "day": 1,
    }
    expected = {term: 6 / hits for term, hits in sentence_hits.items()}

    actual = LexRankSummarizer()._compute_idf(corpus)

    self.assertEqual(expected, actual)
def test_idf_metrics():
    """Check the log-scaled IDF table produced for a small hand-built corpus.

    Every expected value is ``log(6 / d)`` where 6 is the number of
    sentences (the empty one included) and ``d`` is one more than the number
    of sentences containing the term.
    """
    corpus = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
        ("not", "every", "sentence", "makes", "me", "happy"),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day"),
    ]

    # Denominator used for each term: 1 + number of sentences containing it.
    denominators = {
        "this": 3,
        "is": 4,
        "yes": 3,
        "simple": 3,
        "sentence": 4,
        "too": 2,
        "not": 2,
        "every": 3,
        "makes": 2,
        "me": 2,
        "happy": 3,
        "day": 2,
    }
    expected = {
        term: math.log(6 / denominator)
        for term, denominator in denominators.items()
    }

    actual = LexRankSummarizer()._compute_idf(corpus)

    assert expected == actual
def test_idf_metrics():
    """Verify ``_compute_idf`` against hand-computed log-scaled values.

    Terms are grouped by the denominator they share; the numerator is always
    6, the number of sentences in the corpus (the empty one included).
    """
    summarizer = LexRankSummarizer()
    corpus = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
        ("not", "every", "sentence", "makes", "me", "happy"),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day"),
    ]

    # (denominator, terms sharing it)
    groups = (
        (2, ("too", "not", "makes", "me", "day")),
        (3, ("this", "yes", "simple", "every", "happy")),
        (4, ("is", "sentence")),
    )
    expected = {}
    for denominator, terms in groups:
        expected.update(dict.fromkeys(terms, math.log(6 / denominator)))

    computed = summarizer._compute_idf(corpus)

    assert expected == computed
def test_idf_metrics(self):
    """Verify ``_compute_idf`` against hand-computed ratios.

    Terms are grouped by how many sentences contain them; each expected
    value is 6 (the number of sentences, the empty one included) divided by
    that count.
    """
    summarizer = LexRankSummarizer()
    corpus = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
        ("not", "every", "sentence", "makes", "me", "happy"),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day"),
    ]

    # (number of sentences containing the term, terms with that count)
    terms_by_sentence_hits = (
        (1, ("too", "not", "makes", "me", "day")),
        (2, ("this", "yes", "simple", "every", "happy")),
        (3, ("is", "sentence")),
    )
    expected = {}
    for hits, terms in terms_by_sentence_hits:
        for term in terms:
            expected[term] = 6 / hits

    computed = summarizer._compute_idf(corpus)

    self.assertEqual(expected, computed)
class LexRankSumy(Baseline):
    """LexRank sentence-ranking baseline built on the sumy library.

    Description from https://arxiv.org/pdf/1905.13164.pdf:
    LexRank (Erkan and Radev, 2004) is a widely-used graph-based extractive
    summarizer; we build a graph with paragraphs as nodes and edges weighted
    by tf-idf cosine similarity; we run a PageRank-like algorithm on this
    graph to rank and select paragraphs until the length of the ground-truth
    summary is reached.

    Implementation wrapper of https://github.com/miso-belica/sumy
    """

    def __init__(self, name, language):
        """Store the tokenizer language and create the sumy summarizer.

        Args:
            name: Baseline name, forwarded to ``Baseline.__init__`` and used
                as the column name in ``rank_sentences``.
            language: Language passed to sumy's ``Tokenizer``.
        """
        super().__init__(name)
        self.language = language
        self.summarizer = LexRankSummarizer()

    def rank_sentences(self, dataset, document_column_name, **kwargs):
        """Score the sentences of every document in ``dataset``.

        Args:
            dataset: Object indexable by column name; the selected column is
                iterated document by document.
            document_column_name: Name of the column holding the documents.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            Whatever ``Baseline.append_column`` returns after being given one
            ``{"sentences": ..., "scores": ...}`` record per document under
            the column ``self.name``.
        """
        data = []
        for document in tqdm(dataset[document_column_name]):
            sentences, scores = self.run_single(document)
            data.append({"sentences": sentences, "scores": scores})
        return Baseline.append_column(dataset, data, self.name)

    def run_single(self, document):
        """Split one document into sentences and score each with LexRank.

        Args:
            document: Plain-text document.

        Returns:
            A ``(sentences, scores)`` pair: the sentences as strings and a
            list with one score per sentence. Both lists are empty when the
            parser finds no sentences.
        """
        parser = PlaintextParser.from_string(document,
                                             Tokenizer(self.language))
        parsed = parser.document
        summarizer = self.summarizer
        summarizer._ensure_dependencies_installed()

        sentences_words = [
            summarizer._to_words_set(sentence)
            for sentence in parsed.sentences
        ]
        if not sentences_words:
            # Keep the two-element shape so that callers unpacking the
            # result (as rank_sentences does) do not fail on an empty
            # document.
            return [], []

        tf_metrics = summarizer._compute_tf(sentences_words)
        idf_metrics = summarizer._compute_idf(sentences_words)
        matrix = summarizer._create_matrix(sentences_words,
                                           summarizer.threshold,
                                           tf_metrics, idf_metrics)
        scores = summarizer.power_method(matrix, summarizer.epsilon)
        return list(map(str, parsed.sentences)), list(scores)
def lexrank_scoring(text: str) -> Tuple[List[str], np.ndarray]:
    """
    Score the sentences of a text with the LexRank algorithm.

    The score can be regarded as the importance of the sentence.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    List[str]
        ``text`` split into a list of sentences.
    np.ndarray
        Importance scores corresponding to the list of sentences; empty when
        no sentence was found in ``text``.
    """
    doc = nlp(text)

    # Build the sentence list and the per-sentence word lists.
    sentences = []
    corpus = []
    for sent in doc.sents:
        sentences.append(sent.text)
        # Keep only nouns, adverbs, adjectives and verbs, and use the lemma
        # (Token.lemma_) so inflected forms of a word are counted together.
        corpus.append([
            token.lemma_
            for token in sent
            if token.pos_ in ('NOUN', 'ADV', 'ADJ', 'VERB')
        ])
    # sentences = [sentence 0, sentence 1, ...]
    # corpus = [[word 0 of sentence 0, word 1 of sentence 0, ...],
    #           [word 0 of sentence 1, word 1 of sentence 1, ...], ...]

    if not corpus:
        # Nothing to rank: skip the summarizer, whose power iteration is
        # expected to fail on a 0x0 similarity matrix, and return an empty
        # score array instead.
        return sentences, np.zeros(0)

    # LexRank scoring with the sumy library.
    lxr = LexRankSummarizer()
    tf_metrics = lxr._compute_tf(corpus)
    idf_metrics = lxr._compute_idf(corpus)
    matrix = lxr._create_matrix(corpus, lxr.threshold, tf_metrics, idf_metrics)
    scores = lxr.power_method(matrix, lxr.epsilon)
    # scores = [importance of sentence 0, importance of sentence 1, ...]

    return sentences, scores