def _vectorize_sentence(self, corpus, isi_penting, important_words=10, retry=5, **kwargs):
    """
    Vectorize sentences with the fitted bag-of-words/TF-IDF vectorizer plus a
    decomposition model, then rank sentences with PageRank over pairwise
    cosine similarity.

    Parameters
    ----------
    corpus : str / list of str
        text to split into sentences and rank.
    isi_penting : str or None
        "important statement" used to bias the ranking; if falsy, plain
        TextRank (similarity-only) ranking is used.
    important_words : int, (default=10)
        number of top words to extract; <= 0 skips extraction.
    retry : int, (default=5)
        retry count passed to pagerank.

    Returns
    -------
    tuple: (original_strings, ranked_sentences, top_words, cluster_words(top_words))
    """
    corpus = corpus_checker(corpus)
    # summary_textcleaning returns (original, cleaned) pairs per sentence
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    if isi_penting:
        isi_penting = [summary_textcleaning(isi_penting)[1]]
        # fit vocabulary on sentences + isi_penting together so both
        # transform into the same feature space
        t = cleaned_strings + isi_penting
    else:
        t = cleaned_strings
    self.vectorizer.fit(t)
    freq = self.vectorizer.transform(cleaned_strings)
    if isi_penting:
        freq_isi_penting = self.vectorizer.transform(isi_penting)
    if important_words > 0:
        if hasattr(self.vectorizer, 'idf_'):
            # TF-IDF vectorizer: rank features by inverse document frequency
            indices = np.argsort(self.vectorizer.idf_)[::-1]
        else:
            # count-based vectorizer: rank features by total corpus frequency
            indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
        features = self.vectorizer.get_feature_names()
        top_words = [features[i] for i in indices[:important_words]]
    else:
        top_words = []
    if isi_penting:
        # stack so the decomposition model sees isi_penting's features too
        t = vstack([freq, freq_isi_penting])
    else:
        t = freq
    # self.model is presumably a decomposition model (LSA/LDA-like) with a
    # fit/transform interface — TODO confirm against the class constructor
    self.model.fit(t)
    vectors = self.model.transform(freq)
    if isi_penting:
        vectors_isi_penting = self.model.transform(freq_isi_penting)
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        # weight sentence-to-sentence similarity by similarity to isi_penting
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        # zero out (near-)self-similarity entries before PageRank
        similar[similar >= 0.99] = 0
    # +1e-6 keeps the adjacency matrix strictly positive for pagerank
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
def vectorize(self, strings):
    """
    Vectorize string inputs using bert attention.

    Parameters
    ----------
    strings : str / list of str

    Returns
    -------
    array: vectorized strings
    """
    # validate: either a string or a list whose first element is a string
    if isinstance(strings, list):
        valid = isinstance(strings[0], str)
    else:
        valid = isinstance(strings, str)
    if not valid:
        raise ValueError('input must be a list of strings or a string')
    batch = [strings] if isinstance(strings, str) else strings
    cleaned = [summary_textcleaning(s)[1] for s in batch]
    # pad/encode the cleaned sentences into id sequences for the session run
    sequences = skip_thought.batch_sequence(
        cleaned, self.dictionary, maxlen=self._maxlen
    )
    return self._sess.run(
        self._logits, feed_dict={self._X: np.array(sequences)}
    )
def _vectorize_sentence(self, corpus, isi_penting, important_words=10, batch_size=10, retry=5, **kwargs):
    """
    Encode sentences with the deep vectorizer (batched) and rank them with
    PageRank over pairwise cosine similarity.

    Parameters
    ----------
    corpus : str / list of str
        text to split into sentences and rank.
    isi_penting : str or None
        "important statement" used to bias the ranking; if falsy, plain
        TextRank (similarity-only) ranking is used.
    important_words : int, (default=10)
        number of top words to extract via attention; <= 0 skips extraction.
    batch_size : int, (default=10)
        batch size for self._batching.
    retry : int, (default=5)
        retry count passed to pagerank.

    Returns
    -------
    tuple: (original_strings, ranked_sentences, top_words, cluster_words(top_words))
    """
    corpus = corpus_checker(corpus)
    # summary_textcleaning returns (original, cleaned) pairs per sentence
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    vectors = self._batching(cleaned_strings, batch_size=batch_size)
    if isi_penting:
        vectors_isi_penting = self._batching([isi_penting], batch_size=batch_size)
    if 'DeepSkipThought' in str(self.vectorizer):
        # skip-thought models expose no attention, so no top words
        top_words = []
    else:
        # Fix: also require important_words > 0 (consistent with
        # _vectorize_word), so callers can skip the attention pass entirely.
        if hasattr(self.vectorizer, 'attention') and important_words > 0:
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            # aggregate attention weight per cleaned token, dropping stopwords
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]
        else:
            top_words = []
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        # weight sentence-to-sentence similarity by similarity to isi_penting
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        # zero out (near-)self-similarity entries before PageRank
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
def _vectorize_word(self, corpus, isi_penting, window_size, important_words=10, **kwargs):
    """
    Vectorize sliding word n-grams with the bag-of-words/TF-IDF vectorizer
    plus a decomposition model, then score each word by its window's cosine
    similarity to isi_penting.

    Parameters
    ----------
    corpus : str / list of str
        text to split into words and score.
    isi_penting : str or None
        "important statement"; if falsy, the whole cleaned corpus is used
        as the reference instead.
    window_size : int
        n-gram window size passed to create_ngram.
    important_words : int, (default=10)
        number of top words to extract; <= 0 skips extraction.

    Returns
    -------
    tuple: (splitted, ranked_sentences, top_words, cluster_words(top_words))
    """
    corpus = corpus_checker(corpus)
    # summary_textcleaning returns (original, cleaned) pairs per sentence
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    ngram_list, splitted = create_ngram(' '.join(cleaned_strings), ngram=window_size)
    # NOTE: `splitted` from create_ngram is immediately replaced by the
    # original (uncleaned) tokens so ranked output shows original words
    splitted = ' '.join(original_strings).split()
    if isi_penting:
        isi_penting = [summary_textcleaning(isi_penting)[1]]
    else:
        # no reference statement: compare windows against the full corpus
        isi_penting = [' '.join(cleaned_strings)]
    # fit vocabulary on n-grams + reference so both share a feature space
    t = ngram_list + isi_penting
    self.vectorizer.fit(t)
    freq = self.vectorizer.transform(ngram_list)
    freq_isi_penting = self.vectorizer.transform(isi_penting)
    if important_words > 0:
        if hasattr(self.vectorizer, 'idf_'):
            # TF-IDF vectorizer: rank features by inverse document frequency
            indices = np.argsort(self.vectorizer.idf_)[::-1]
        else:
            # count-based vectorizer: rank features by total frequency
            indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
        features = self.vectorizer.get_feature_names()
        top_words = [features[i] for i in indices[:important_words]]
    else:
        top_words = []
    t = vstack([freq, freq_isi_penting])
    # decomposition model fitted on windows + reference, applied separately
    self.model.fit(t)
    vectors = self.model.transform(freq)
    vectors_isi_penting = self.model.transform(freq_isi_penting)
    similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
    # one similarity score per n-gram window; assumes windows align 1:1 with
    # `splitted` tokens — relies on create_ngram's contract, TODO confirm
    scores = similar_isi_penting[:, 0]
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
    return (splitted, ranked_sentences, top_words, cluster_words(top_words))
def _vectorize_word(self, corpus, isi_penting, window_size=10, important_words=10, batch_size=10, **kwargs):
    """
    Encode sliding word n-grams with the deep vectorizer (batched) and score
    each word by its window's cosine similarity to the isi_penting centroid.

    Parameters
    ----------
    corpus : str / list of str
    isi_penting : str or None
        reference statement; if falsy, the original sentences are used.
    window_size : int, (default=10)
    important_words : int, (default=10)
    batch_size : int, (default=10)

    Returns
    -------
    tuple: (splitted, ranked_sentences, top_words, cluster_words(top_words))
    """
    corpus = corpus_checker(corpus)
    pairs = [summary_textcleaning(sentence) for sentence in corpus]
    original_strings = [pair[0] for pair in pairs]
    cleaned_strings = [pair[1] for pair in pairs]
    ngram_list, splitted = create_ngram(' '.join(cleaned_strings), ngram=window_size)
    # replace with original (uncleaned) tokens for the ranked output
    splitted = ' '.join(original_strings).split()
    queries = [isi_penting] if isi_penting else original_strings
    vectors = self._batching(ngram_list, batch_size=batch_size)
    vectors_isi_penting = self._batching(queries, batch_size=batch_size)
    top_words = []
    if 'DeepSkipThought' not in str(self.vectorizer):
        # skip-thought models expose no attention; others may
        if hasattr(self.vectorizer, 'attention') and important_words > 0:
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flat = list(itertools.chain(*attentions))
            weight_by_word = {}
            for pair in flat:
                word = simple_textcleaning(pair[0])
                if word in STOPWORDS:
                    continue
                if word in weight_by_word:
                    weight_by_word[word] += pair[1]
                else:
                    weight_by_word[word] = pair[1]
            ordered = sorted(weight_by_word, key=weight_by_word.get, reverse=True)
            top_words = ordered[:important_words]
    # collapse the reference vectors to a single centroid row
    centroid = np.expand_dims(np.mean(vectors_isi_penting, axis=0), axis=0)
    scores = cosine_similarity(vectors, centroid)[:, 0]
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
    return (splitted, ranked_sentences, top_words, cluster_words(top_words))
def summarize(self, corpus, top_k: int = 3, important_words: int = 3, **kwargs):
    """
    Summarize list of strings / corpus

    Parameters
    ----------
    corpus: str, list

    top_k: int, (default=3)
        number of summarized strings.

    important_words: int, (default=3)
        number of important words.

    Returns
    -------
    dictionary: {'summary', 'top-words', 'cluster-top-words'}
    """
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a list')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        # re-join then re-split so list input is normalized the same way
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    # summary_textcleaning returns (original, cleaned) pairs per sentence
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):
        # skip-thought path: run the TF session directly for both logits
        # and attention weights
        sequences = skip_thought.batch_sequence(
            cleaned_strings,
            self._vectorizer.dictionary,
            maxlen=self._vectorizer._maxlen,
        )
        vectors, attention = self._vectorizer._sess.run(
            [self._vectorizer._logits, self._vectorizer._attention],
            feed_dict={self._vectorizer._X: np.array(sequences)},
        )
        # sum attention over the batch axis, then rank vocabulary ids;
        # assumes axis 0 is the batch dimension — TODO confirm
        attention = attention.sum(axis=0)
        indices = np.argsort(attention)[::-1]
        top_words = [
            self._vectorizer._rev_dictionary[i]
            for i in indices
            if self._vectorizer._rev_dictionary[i] not in STOPWORDS
        ][:important_words]
    else:
        vectors = self._vectorizer.vectorize(corpus)
        attentions = self._vectorizer.attention(corpus, **kwargs)
        flatten = list(itertools.chain(*attentions))
        # aggregate attention weight per cleaned token, dropping stopwords
        r = {}
        for f in flatten:
            c = simple_textcleaning(f[0])
            if c in STOPWORDS:
                continue
            if c not in r:
                r[c] = f[1]
            else:
                r[c] += f[1]
        top_words = sorted(r, key=r.get, reverse=True)[:important_words]
    similar = cosine_similarity(vectors, vectors)
    # zero out (near-)self-similarity entries before PageRank
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
def doc2vec(
    vectorizer,
    corpus,
    top_k: int = 3,
    aggregation: str = 'mean',
    soft: bool = True,
):
    """
    summarize a list of strings using doc2vec, scoring using TextRank.

    Parameters
    ----------
    vectorizer : object
        fast-text or word2vec interface object.
    corpus: list
    top_k: int, (default=3)
        number of summarized strings.
    aggregation : str, optional (default='mean')
        Aggregation supported. Allowed values:

        * ``'mean'`` - mean.
        * ``'min'`` - min.
        * ``'max'`` - max.
        * ``'sum'`` - sum.
        * ``'sqrt'`` - square root.
    soft: bool, optional (default=True)
        word not inside vectorizer will replace with nearest word if True, else, will skip.

    Returns
    -------
    dictionary: result
    """
    if not hasattr(vectorizer, 'get_vector_by_name'):
        raise ValueError('vectorizer must has `get_vector_by_name` method')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a list')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        # normalize list input through the same sentence splitter
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    aggregation = aggregation.lower()
    if aggregation == 'mean':
        aggregation_function = np.mean
    elif aggregation == 'min':
        aggregation_function = np.min
    elif aggregation == 'max':
        aggregation_function = np.max
    elif aggregation == 'sum':
        aggregation_function = np.sum
    elif aggregation == 'sqrt':
        # NOTE(review): np.sqrt is an elementwise ufunc, not a reduction;
        # calling it with axis= will raise — kept for interface compatibility
        aggregation_function = np.sqrt
    else:
        raise ValueError(
            'aggregation only supports `mean`, `min`, `max`, `sum` and `sqrt`')
    vectors = []
    for string in cleaned_strings:
        inside = []
        for token in string.split():
            try:
                inside.append(vectorizer.get_vector_by_name(token))
            except Exception:
                # OOV token: skip it unless soft-matching is requested
                if soft:
                    # Fix: the original referenced undefined `self` here
                    # (NameError for every OOV token, since soft defaults to
                    # True); the word-vector interface itself carries the
                    # Jaro-Winkler matcher.
                    arr = np.array([
                        vectorizer._jarowinkler.similarity(token, k)
                        for k in vectorizer.words
                    ])
                    idx = (-arr).argsort()[0]
                    inside.append(
                        vectorizer.get_vector_by_name(vectorizer.words[idx]))
        vectors.append(aggregation_function(inside, axis=0))
    similar = cosine_similarity(vectors, vectors)
    # zero out (near-)self-similarity entries before PageRank
    similar[similar >= 0.999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return ' '.join(summary)
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):
    """
    Shared summarizer: vectorize stemmed sentences, reduce with the given
    decomposition, rank with PageRank over cosine similarity.
    """
    vectorizer = vectorizer.lower()
    # dispatch table replaces the if/elif chain; same three options
    vectorizer_classes = {
        'tfidf': TfidfVectorizer,
        'bow': CountVectorizer,
        'skip-gram': SkipGramVectorizer,
    }
    if vectorizer not in vectorizer_classes:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a list')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        # normalize list input through the same sentence splitter
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    pairs = [summary_textcleaning(sentence) for sentence in corpus]
    original_strings = [pair[0] for pair in pairs]
    cleaned_strings = [pair[1] for pair in pairs]
    stemmed = [sastrawi(sentence) for sentence in cleaned_strings]
    tf_vectorizer = vectorizer_classes[vectorizer](
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)
    if hasattr(tf_vectorizer, 'idf_'):
        # TF-IDF: rank features by inverse document frequency
        ranking = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        # count-based: rank features by total corpus frequency
        ranking = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]
    features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in ranking[:important_words]]
    # reduce to half the feature dimensionality before similarity
    vectors = decomposition(tf.shape[1] // 2).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    # zero out (near-)self-similarity entries before PageRank
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
def _vectorize_sentence(self, corpus, isi_penting, aggregation=np.mean, soft=False, retry=5, **kwargs):
    """
    Vectorize sentences by aggregating per-token word vectors, then rank
    sentences with PageRank over pairwise cosine similarity.

    Parameters
    ----------
    corpus : str / list of str
        text to split into sentences and rank.
    isi_penting : str or None
        "important statement" used to bias the ranking; if falsy, plain
        TextRank (similarity-only) ranking is used.
    aggregation : callable, (default=np.mean)
        reduction applied over token vectors (must accept axis=0).
    soft : bool, (default=False)
        if True, OOV tokens are replaced by the nearest known word by
        Jaro-Winkler similarity; otherwise they become zero vectors.
    retry : int, (default=5)
        retry count passed to pagerank.

    Returns
    -------
    tuple: (original_strings, ranked_sentences)
    """
    corpus = corpus_checker(corpus)
    # summary_textcleaning returns (original, cleaned) pairs per sentence
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    vectors = []
    for string in cleaned_strings:
        inside = []
        for token in string.split():
            if token in self.wordvector._dictionary:
                v = self.wordvector.get_vector_by_name(token)
            else:
                if not soft:
                    # OOV and no soft matching: contribute a zero vector
                    v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                else:
                    # nearest known word by Jaro-Winkler similarity
                    arr = np.array([
                        self.wordvector._jarowinkler.similarity(token, k)
                        for k in self.wordvector.words
                    ])
                    idx = (-arr).argsort()[0]
                    v = self.wordvector.get_vector_by_name(
                        self.wordvector.words[idx])
            inside.append(v)
        vectors.append(aggregation(inside, axis=0))
    vectors = np.array(vectors)
    if isi_penting:
        # same token-vector lookup for the reference statement
        cleaned_isi_penting = summary_textcleaning(isi_penting)[1]
        vectors_isi_penting = []
        for token in cleaned_isi_penting.split():
            if token in self.wordvector._dictionary:
                v = self.wordvector.get_vector_by_name(token)
            else:
                if not soft:
                    v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                else:
                    arr = np.array([
                        self.wordvector._jarowinkler.similarity(token, k)
                        for k in self.wordvector.words
                    ])
                    idx = (-arr).argsort()[0]
                    v = self.wordvector.get_vector_by_name(
                        self.wordvector.words[idx])
            vectors_isi_penting.append(v)
        vectors_isi_penting = aggregation(vectors_isi_penting, axis=0)
        # reshape to a single row for cosine_similarity
        vectors_isi_penting = np.expand_dims(vectors_isi_penting, 0)
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        # weight sentence-to-sentence similarity by similarity to isi_penting
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        # zero out (near-)self-similarity entries before PageRank
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (original_strings, ranked_sentences)
def _vectorize_word(self, corpus, isi_penting, window_size, aggregation=np.mean, soft=False, **kwargs):
    """
    Vectorize sliding word n-grams by aggregating per-token word vectors and
    score each word by its window's cosine similarity to isi_penting.

    Parameters
    ----------
    corpus : str / list of str
        text to split into words and score.
    isi_penting : str or None
        reference statement; if falsy, the whole cleaned corpus is used.
    window_size : int
        n-gram window size passed to create_ngram.
    aggregation : callable, (default=np.mean)
        reduction applied over token vectors (must accept axis=0).
    soft : bool, (default=False)
        if True, OOV tokens are replaced by the nearest known word by
        Jaro-Winkler similarity; otherwise they become zero vectors.

    Returns
    -------
    tuple: (splitted, ranked_sentences)
    """
    corpus = corpus_checker(corpus)
    # summary_textcleaning returns (original, cleaned) pairs per sentence
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    ngram_list, splitted = create_ngram(' '.join(cleaned_strings), ngram=window_size)
    # replace with original (uncleaned) tokens for the ranked output
    splitted = ' '.join(original_strings).split()
    if isi_penting:
        isi_penting = summary_textcleaning(isi_penting)[1]
    else:
        # no reference statement: compare windows against the full corpus
        isi_penting = ' '.join(cleaned_strings)
    vectors = []
    for string in ngram_list:
        inside = []
        for token in string.split():
            if token in self.wordvector._dictionary:
                v = self.wordvector.get_vector_by_name(token)
            else:
                if not soft:
                    # OOV and no soft matching: contribute a zero vector
                    v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                else:
                    # nearest known word by Jaro-Winkler similarity
                    arr = np.array([
                        self.wordvector._jarowinkler.similarity(token, k)
                        for k in self.wordvector.words
                    ])
                    idx = (-arr).argsort()[0]
                    v = self.wordvector.get_vector_by_name(
                        self.wordvector.words[idx])
            inside.append(v)
        vectors.append(aggregation(inside, axis=0))
    vectors = np.array(vectors)
    # same token-vector lookup for the reference text
    cleaned_isi_penting = isi_penting
    vectors_isi_penting = []
    for token in cleaned_isi_penting.split():
        if token in self.wordvector._dictionary:
            vectors_isi_penting.append(
                self.wordvector.get_vector_by_name(token))
        else:
            if not soft:
                vectors_isi_penting.append(
                    np.zeros((self.wordvector._embed_matrix.shape[1])))
            else:
                arr = np.array([
                    self.wordvector._jarowinkler.similarity(token, k)
                    for k in self.wordvector.words
                ])
                idx = (-arr).argsort()[0]
                vectors_isi_penting.append(
                    self.wordvector.get_vector_by_name(
                        self.wordvector.words[idx]))
    vectors_isi_penting = aggregation(vectors_isi_penting, axis=0)
    # reshape to a single row for cosine_similarity
    vectors_isi_penting = np.expand_dims(vectors_isi_penting, 0)
    similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
    # one similarity score per n-gram window; assumes windows align 1:1 with
    # `splitted` tokens — relies on create_ngram's contract, TODO confirm
    scores = similar_isi_penting[:, 0]
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
    return (splitted, ranked_sentences)