import itertools
import re

import numpy as np
from scipy.sparse import vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple

# Project-internal helpers (`pagerank`, `cluster_words`, `corpus_checker`,
# `split_into_sentences`, `summary_textcleaning`, `simple_textcleaning`,
# `sastrawi`, `skip_thought`, `SkipGramVectorizer`, `STOPWORDS`,
# `get_stopwords`, `validator`, `_auto_ngram`, `_base`, `methods`) are assumed
# to be importable from the surrounding package.


def similarity_transformer(string, model, vectorizer=None, top_k: int = 5,
                           atleast: int = 1, stopwords=get_stopwords, **kwargs):
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, '_tree_plot'):
        raise ValueError('model must have `_tree_plot` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    # candidates are generated automatically when no vectorizer is supplied
    auto_ngram = vectorizer is None
    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)
    similar = model._tree_plot(list(vocab.keys()))
    # zero out self-similarity so a node does not vote for itself
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        [(scores[i], s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
def similarity_transformer(string, model, top_k: int = 5,
                           auto_ngram: bool = True, ngram_method: str = 'bow',
                           ngram: Tuple[int, int] = (1, 1), atleast: int = 1,
                           stop_words: List[str] = STOPWORDS, **kwargs):
    if not hasattr(model, '_tree_plot'):
        raise ValueError('model must have `_tree_plot` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")
    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(string, ngram_method=ngram_method, ngram=ngram,
                      stop_words=stop_words, **kwargs)
    similar = model._tree_plot(list(vocab.keys()))
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        [(scores[i], s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
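# Every ranker in this module delegates scoring to a project-internal
# `pagerank` helper that is not shown in this excerpt. Below is a minimal
# sketch of such a helper, assuming plain power iteration over the
# row-normalised similarity matrix; the real implementation may differ
# (e.g. some callers below also pass a `retry` argument).


def pagerank_sketch(similarity, damping: float = 0.85,
                    max_iter: int = 10000, tol: float = 1e-6):
    """Illustrative power-iteration PageRank over a dense similarity matrix."""
    n = similarity.shape[0]
    # row-normalise so each row becomes a probability distribution
    row_sums = similarity.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    transition = similarity / row_sums
    scores = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        updated = (1 - damping) / n + damping * transition.T.dot(scores)
        if np.abs(updated - scores).sum() < tol:
            break
        scores = updated
    return scores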
def _vectorize_sentence(self, corpus, isi_penting, important_words=10,
                        batch_size=10, retry=5, **kwargs):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    vectors = self._batching(cleaned_strings, batch_size=batch_size)
    if isi_penting:
        vectors_isi_penting = self._batching([isi_penting], batch_size=batch_size)
    if 'DeepSkipThought' in str(self.vectorizer):
        # skip-thought vectorizer exposes no attention, so no top words
        top_words = []
    else:
        if hasattr(self.vectorizer, 'attention'):
            # aggregate attention weight per cleaned token, skipping stopwords
            attentions = self.vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]
        else:
            top_words = []
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        # bias the graph toward sentences similar to `isi_penting`
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
def _vectorize_sentence(self, corpus, isi_penting, important_words=10,
                        retry=5, **kwargs):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    if isi_penting:
        isi_penting = [summary_textcleaning(isi_penting)[1]]
        t = cleaned_strings + isi_penting
    else:
        t = cleaned_strings
    self.vectorizer.fit(t)
    freq = self.vectorizer.transform(cleaned_strings)
    if isi_penting:
        freq_isi_penting = self.vectorizer.transform(isi_penting)
    if important_words > 0:
        if hasattr(self.vectorizer, 'idf_'):
            # tf-idf vectorizer: highest-idf features first
            indices = np.argsort(self.vectorizer.idf_)[::-1]
        else:
            # bag-of-words vectorizer: most frequent features first
            indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
        features = self.vectorizer.get_feature_names()
        top_words = [features[i] for i in indices[:important_words]]
    else:
        top_words = []
    if isi_penting:
        t = vstack([freq, freq_isi_penting])
    else:
        t = freq
    self.model.fit(t)
    vectors = self.model.transform(freq)
    if isi_penting:
        vectors_isi_penting = self.model.transform(freq_isi_penting)
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (
        original_strings,
        ranked_sentences,
        top_words,
        cluster_words(top_words),
    )
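# `corpus_checker` is another internal helper not shown here. A plausible
# sketch, assuming it mirrors the inline validation that `summarize`,
# `doc2vec`, and `_base_summarizer` below perform verbatim:


def corpus_checker_sketch(corpus):
    """Illustrative stand-in for the internal `corpus_checker` helper."""
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list) and not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if isinstance(corpus, list):
        corpus = '. '.join(corpus)
    # resplit so every element is exactly one sentence
    return split_into_sentences(corpus)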
def summarize(self, corpus, top_k: int = 3, important_words: int = 3, **kwargs):
    """
    Summarize a list of strings / corpus.

    Parameters
    ----------
    corpus: str, list
    top_k: int, (default=3)
        number of summarized strings.
    important_words: int, (default=3)
        number of important words.

    Returns
    -------
    dictionary: result
    """
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list) and not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):
        # skip-thought path: run the session directly to get attention weights
        sequences = skip_thought.batch_sequence(
            cleaned_strings,
            self._vectorizer.dictionary,
            maxlen=self._vectorizer._maxlen,
        )
        vectors, attention = self._vectorizer._sess.run(
            [self._vectorizer._logits, self._vectorizer._attention],
            feed_dict={self._vectorizer._X: np.array(sequences)},
        )
        attention = attention.sum(axis=0)
        indices = np.argsort(attention)[::-1]
        top_words = [
            self._vectorizer._rev_dictionary[i]
            for i in indices
            if self._vectorizer._rev_dictionary[i] not in STOPWORDS
        ][:important_words]
    else:
        vectors = self._vectorizer.vectorize(corpus)
        attentions = self._vectorizer.attention(corpus, **kwargs)
        flatten = list(itertools.chain(*attentions))
        r = {}
        for f in flatten:
            c = simple_textcleaning(f[0])
            if c in STOPWORDS:
                continue
            if c not in r:
                r[c] = f[1]
            else:
                r[c] += f[1]
        top_words = sorted(r, key=r.get, reverse=True)[:important_words]
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
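# The attention-aggregation loop above is duplicated verbatim in the first
# `_vectorize_sentence` method. A hedged refactor sketch of the shared logic;
# the helper name is hypothetical:


def _aggregate_attention_sketch(attentions, important_words):
    """Sum attention weight per cleaned token, skipping stopwords (illustrative)."""
    flatten = list(itertools.chain(*attentions))
    r = {}
    for token, weight in flatten:
        c = simple_textcleaning(token)
        if c in STOPWORDS:
            continue
        r[c] = r.get(c, 0) + weight
    return sorted(r, key=r.get, reverse=True)[:important_words]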
def doc2vec(
    vectorizer,
    corpus,
    top_k: int = 3,
    aggregation: str = 'mean',
    soft: bool = True,
):
    """
    Summarize a list of strings using doc2vec, scoring using TextRank.

    Parameters
    ----------
    vectorizer : object
        fast-text or word2vec interface object.
    corpus: list
    top_k: int, (default=3)
        number of summarized strings.
    aggregation : str, optional (default='mean')
        Aggregation supported. Allowed values:

        * ``'mean'`` - mean.
        * ``'min'`` - min.
        * ``'max'`` - max.
        * ``'sum'`` - sum.
        * ``'sqrt'`` - square root.
    soft: bool, optional (default=True)
        if True, a word not inside the vectorizer will be replaced with the
        nearest word; else, it will be skipped.

    Returns
    -------
    string: summarized string
    """
    if not hasattr(vectorizer, 'get_vector_by_name'):
        raise ValueError('vectorizer must have `get_vector_by_name` method')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list) and not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    aggregation = aggregation.lower()
    if aggregation == 'mean':
        aggregation_function = np.mean
    elif aggregation == 'min':
        aggregation_function = np.min
    elif aggregation == 'max':
        aggregation_function = np.max
    elif aggregation == 'sum':
        aggregation_function = np.sum
    elif aggregation == 'sqrt':
        aggregation_function = np.sqrt
    else:
        raise ValueError(
            'aggregation only supports `mean`, `min`, `max`, `sum` and `sqrt`')
    vectors = []
    for string in cleaned_strings:
        inside = []
        for token in string.split():
            try:
                inside.append(vectorizer.get_vector_by_name(token))
            except Exception:
                if soft:
                    # replace an out-of-vocabulary token with its nearest
                    # neighbour by Jaro-Winkler similarity; `_jarowinkler` is
                    # assumed to be a module-level scorer, see the sketch
                    # after this function
                    arr = np.array([
                        _jarowinkler.similarity(token, k)
                        for k in vectorizer.words
                    ])
                    idx = (-arr).argsort()[0]
                    inside.append(
                        vectorizer.get_vector_by_name(vectorizer.words[idx]))
        vectors.append(aggregation_function(inside, axis=0))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return ' '.join(summary)
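# `_jarowinkler` above is assumed to be a module-level scorer exposing a
# `.similarity(a, b)` call; it is not part of this excerpt. One plausible
# stand-in uses the `textdistance` package (an assumption, not necessarily
# the project's actual dependency):

import textdistance


class _JaroWinklerSketch:
    """Illustrative scorer exposing the `.similarity(a, b)` call used above."""

    def similarity(self, a: str, b: str) -> float:
        # textdistance's jaro_winkler returns a similarity in [0, 1]
        return textdistance.jaro_winkler(a, b)


_jarowinkler = _JaroWinklerSketch()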
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list) and not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    stemmed = [sastrawi(i) for i in cleaned_strings]
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)
    if hasattr(tf_vectorizer, 'idf_'):
        indices = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        indices = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]
    features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in indices[:important_words]]
    # project the term matrix to half its feature count before scoring
    vectors = decomposition(tf.shape[1] // 2).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
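# `_base_summarizer` receives the decomposition class itself. Hypothetical
# thin wrappers showing how LSA- and LDA-style summarizers could be built on
# top of it; the names and defaults are illustrative, not the project's
# confirmed API:

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD


def lsa_sketch(corpus, top_k: int = 3, **kwargs):
    # latent semantic analysis: TruncatedSVD over the term matrix
    return _base_summarizer(corpus, TruncatedSVD, top_k=top_k, **kwargs)


def lda_sketch(corpus, top_k: int = 3, **kwargs):
    # topic-model variant: LDA over the term matrix
    return _base_summarizer(corpus, LatentDirichletAllocation, top_k=top_k, **kwargs)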
def textrank(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using the Textrank algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        model must have a `fit_transform` or `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        minimum count appearing in the string to accept as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, 'fit_transform') and not hasattr(model, 'vectorize'):
        raise ValueError(
            'model must have `fit_transform` or `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')
    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)
    if hasattr(model, 'fit_transform'):
        vectors = model.fit_transform(list(vocab.keys()))
    if hasattr(model, 'vectorize'):
        vectors = model.vectorize(list(vocab.keys()))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    total = sum(scores)
    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
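# A hedged usage sketch for the signature above, assuming an sklearn
# vectorizer as the `model` (its `fit_transform` output feeds
# `cosine_similarity` directly); the sample sentence is illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer has `fit_transform`, so it satisfies the `model` contract;
# leaving `vectorizer=None` triggers automatic ngram candidate generation.
keywords = textrank(
    'Kerajaan mengumumkan bantuan baharu untuk rakyat yang terkesan.',
    model=TfidfVectorizer(),
    top_k=5,
)
for score, phrase in keywords:
    print(f'{score:.4f}\t{phrase}')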
def textrank(
    string: str,
    vectorizer,
    top_k: int = 5,
    auto_ngram: bool = True,
    ngram_method: str = 'bow',
    ngram: Tuple[int, int] = (1, 1),
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using the Textrank algorithm.

    Parameters
    ----------
    string: str
    vectorizer: Object
        vectorizer must have a `fit_transform` or `vectorize` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using suitable ngrams.
        Else, use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. Supported ngram generators:

        * ``'bow'`` - bag-of-words.
        * ``'skip-gram'`` - bag-of-words with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-gram sizes.
    atleast: int, optional (default=1)
        minimum count appearing in the string to accept as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str].

    Returns
    -------
    result: Tuple[float, str]
    """
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(vectorizer, 'fit_transform') and not hasattr(
            vectorizer, 'vectorize'):
        raise ValueError(
            'vectorizer must have `fit_transform` or `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')
    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, ngram_method=ngram_method, ngram=ngram,
                      stopwords=stopwords, **kwargs)
    if hasattr(vectorizer, 'fit_transform'):
        vectors = vectorizer.fit_transform(list(vocab.keys()))
    if hasattr(vectorizer, 'vectorize'):
        vectors = vectorizer.vectorize(list(vocab.keys()))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    total = sum(scores)
    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
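# Both `textrank` variants lean on an internal `_auto_ngram` that is not
# shown here. A hypothetical illustration of RAKE-style candidate generation
# (split the text at stopwords, count each surviving phrase); the real
# helper may well differ:

from collections import Counter


def _auto_ngram_sketch(string: str, stopwords):
    """Hypothetical candidate generator: phrases between stopwords, with counts."""
    stopset = set(stopwords)
    tokens = re.findall(r'\w+', string.lower())
    phrases, current = [], []
    for token in tokens:
        if token in stopset:
            if current:
                phrases.append(' '.join(current))
                current = []
        else:
            current.append(token)
    if current:
        phrases.append(' '.join(current))
    # map candidate phrase -> occurrence count, matching how `vocab` is used
    return dict(Counter(phrases))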
def _vectorize_sentence(self, corpus, isi_penting, aggregation=np.mean,
                        soft=False, retry=5, **kwargs):
    corpus = corpus_checker(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    vectors = []
    for string in cleaned_strings:
        inside = []
        for token in string.split():
            if token in self.wordvector._dictionary:
                v = self.wordvector.get_vector_by_name(token)
            else:
                if not soft:
                    v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                else:
                    arr = np.array([
                        self.wordvector._jarowinkler.similarity(token, k)
                        for k in self.wordvector.words
                    ])
                    idx = (-arr).argsort()[0]
                    v = self.wordvector.get_vector_by_name(
                        self.wordvector.words[idx])
            inside.append(v)
        vectors.append(aggregation(inside, axis=0))
    vectors = np.array(vectors)
    if isi_penting:
        cleaned_isi_penting = summary_textcleaning(isi_penting)[1]
        vectors_isi_penting = []
        for token in cleaned_isi_penting.split():
            if token in self.wordvector._dictionary:
                v = self.wordvector.get_vector_by_name(token)
            else:
                if not soft:
                    v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                else:
                    arr = np.array([
                        self.wordvector._jarowinkler.similarity(token, k)
                        for k in self.wordvector.words
                    ])
                    idx = (-arr).argsort()[0]
                    v = self.wordvector.get_vector_by_name(
                        self.wordvector.words[idx])
            vectors_isi_penting.append(v)
        vectors_isi_penting = aggregation(vectors_isi_penting, axis=0)
        vectors_isi_penting = np.expand_dims(vectors_isi_penting, 0)
    similar = cosine_similarity(vectors, vectors)
    if isi_penting:
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        similar = similar * similar_isi_penting
    else:
        similar[similar >= 0.99] = 0
    scores = pagerank(similar + 1e-6, retry)
    ranked_sentences = sorted(
        ((scores[i], s, i) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    return (original_strings, ranked_sentences)
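# A toy numpy illustration of the pooling performed in the loops above: each
# sentence vector is the elementwise aggregation (here np.mean) of its token
# vectors, and an out-of-vocabulary token contributes a zero vector when
# `soft=False`. The embeddings are made up.

embed = {'makan': np.array([1.0, 0.0]), 'nasi': np.array([0.0, 1.0])}
sentence = 'saya makan nasi'
inside = [embed.get(tok, np.zeros(2)) for tok in sentence.split()]  # 'saya' is OOV -> zeros
vector = np.mean(inside, axis=0)
print(vector)  # [0.33333333 0.33333333]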