Example #1
 def _vectorize_sentence(self,
                         corpus,
                         isi_penting,
                         important_words=10,
                         retry=5,
                         **kwargs):
     corpus = corpus_checker(corpus)
     splitted_fullstop = [summary_textcleaning(i) for i in corpus]
     original_strings = [i[0] for i in splitted_fullstop]
     cleaned_strings = [i[1] for i in splitted_fullstop]
     if isi_penting:
         isi_penting = [summary_textcleaning(isi_penting)[1]]
         t = cleaned_strings + isi_penting
     else:
         t = cleaned_strings
     self.vectorizer.fit(t)
     freq = self.vectorizer.transform(cleaned_strings)
     if isi_penting:
         freq_isi_penting = self.vectorizer.transform(isi_penting)
     if important_words > 0:
         if hasattr(self.vectorizer, 'idf_'):
             indices = np.argsort(self.vectorizer.idf_)[::-1]
         else:
             indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
         features = self.vectorizer.get_feature_names()
         top_words = [features[i] for i in indices[:important_words]]
     else:
         top_words = []
     if isi_penting:
         t = vstack([freq, freq_isi_penting])
     else:
         t = freq
     self.model.fit(t)
     vectors = self.model.transform(freq)
     if isi_penting:
         vectors_isi_penting = self.model.transform(freq_isi_penting)
     similar = cosine_similarity(vectors, vectors)
     if isi_penting:
         similar_isi_penting = cosine_similarity(vectors,
                                                 vectors_isi_penting)
         similar = similar * similar_isi_penting
     else:
         similar[similar >= 0.99] = 0
     scores = pagerank(similar + 1e-6, retry)
     ranked_sentences = sorted(
         ((scores[i], s, i) for i, s in enumerate(original_strings)),
         reverse=True,
     )
     return (
         original_strings,
         ranked_sentences,
         top_words,
         cluster_words(top_words),
     )
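
Example #1 is essentially TextRank: fit a term vectorizer, project the counts with a decomposition model, build a cosine-similarity graph and rank sentences with PageRank. Below is a minimal self-contained sketch of that pipeline, assuming scikit-learn and a naive power-iteration `pagerank_power` as a stand-in for the library's `pagerank`; the sentences and parameter choices are illustrative only.

    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.metrics.pairwise import cosine_similarity

    def pagerank_power(similar, damping=0.85, iters=50):
        # naive power iteration over the row-normalized similarity graph
        n = similar.shape[0]
        transition = similar / similar.sum(axis=1, keepdims=True)
        scores = np.full(n, 1.0 / n)
        for _ in range(iters):
            scores = (1 - damping) / n + damping * transition.T.dot(scores)
        return scores

    sentences = [
        'the cat sat on the mat',
        'dogs chase cats in the yard',
        'stock prices rose sharply today',
        'the cat chased a mouse',
    ]
    vectorizer = TfidfVectorizer()
    freq = vectorizer.fit_transform(sentences)
    vectors = TruncatedSVD(n_components=2).fit_transform(freq)
    similar = cosine_similarity(vectors, vectors)
    similar = np.maximum(similar, 0)   # guard against negative cosines from the SVD projection
    similar[similar >= 0.99] = 0       # drop self-similarity, as in the snippet
    scores = pagerank_power(similar + 1e-6)
    ranked = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)
    print(ranked[0])                   # highest-scoring (score, sentence, index) triple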
Example #2
    def vectorize(self, strings):
        """
        Vectorize string inputs using the skip-thought encoder.

        Parameters
        ----------
        strings : str / list of str

        Returns
        -------
        array: vectorized strings
        """

        if isinstance(strings, list):
            if not isinstance(strings[0], str):
                raise ValueError('input must be a list of strings or a string')
        else:
            if not isinstance(strings, str):
                raise ValueError('input must be a list of strings or a string')
        if isinstance(strings, str):
            strings = [strings]

        splitted_fullstop = [summary_textcleaning(i) for i in strings]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        sequences = skip_thought.batch_sequence(cleaned_strings,
                                                self.dictionary,
                                                maxlen=self._maxlen)
        return self._sess.run(self._logits,
                              feed_dict={self._X: np.array(sequences)})
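
`skip_thought.batch_sequence` is a Malaya internal that is not shown here; judging by how its output is fed to the session, it presumably maps each token to an id via the model dictionary and pads every row to `maxlen`. Below is a rough, hypothetical stand-in (the dictionary and helper name are made up):

    import numpy as np

    # hypothetical word -> id mapping; 0 is padding, 1 is the unknown-word id
    dictionary = {'PAD': 0, 'UNK': 1, 'saya': 2, 'suka': 3, 'makan': 4}

    def batch_sequence_sketch(strings, dictionary, maxlen=50):
        # one fixed-length row of token ids per cleaned string
        batch = np.zeros((len(strings), maxlen), dtype=np.int32)
        for row, string in enumerate(strings):
            ids = [dictionary.get(token, dictionary['UNK']) for token in string.split()][:maxlen]
            batch[row, :len(ids)] = ids
        return batch

    print(batch_sequence_sketch(['saya suka makan', 'saya makan'], dictionary, maxlen=5))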
Example #3
    def _vectorize_sentence(self,
                            corpus,
                            isi_penting,
                            important_words=10,
                            batch_size=10,
                            retry=5,
                            **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

        vectors = self._batching(cleaned_strings, batch_size=batch_size)
        if isi_penting:
            vectors_isi_penting = self._batching([isi_penting],
                                                 batch_size=batch_size)

        if 'DeepSkipThought' in str(self.vectorizer):
            top_words = []
        else:
            if hasattr(self.vectorizer, 'attention'):
                attentions = self.vectorizer.attention(corpus, **kwargs)
                flatten = list(itertools.chain(*attentions))
                r = {}
                for f in flatten:
                    c = simple_textcleaning(f[0])
                    if c in STOPWORDS:
                        continue
                    if c not in r:
                        r[c] = f[1]
                    else:
                        r[c] += f[1]
                top_words = sorted(r, key=r.get,
                                   reverse=True)[:important_words]
            else:
                top_words = []

        similar = cosine_similarity(vectors, vectors)
        if isi_penting:
            similar_isi_penting = cosine_similarity(vectors,
                                                    vectors_isi_penting)
            similar = similar * similar_isi_penting
        else:
            similar[similar >= 0.99] = 0
        scores = pagerank(similar + 1e-6, retry)
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        return (
            original_strings,
            ranked_sentences,
            top_words,
            cluster_words(top_words),
        )
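
The attention branch only does one thing: sum per-token attention weights into a word-importance table and keep the heaviest words. The same loop isolated with toy `(word, weight)` pairs, which stand in for whatever `self.vectorizer.attention` returns; `STOPWORDS` and the cleaning step are stand-ins for the library's versions:

    import itertools

    STOPWORDS = {'yang', 'dan', 'di'}

    # one list of (word, attention weight) pairs per sentence
    attentions = [
        [('kerajaan', 0.4), ('yang', 0.1), ('menaikkan', 0.3)],
        [('kerajaan', 0.5), ('cukai', 0.6), ('dan', 0.05)],
    ]
    flatten = list(itertools.chain(*attentions))
    r = {}
    for word, weight in flatten:
        word = word.lower()          # stands in for simple_textcleaning
        if word in STOPWORDS:
            continue
        r[word] = r.get(word, 0.0) + weight

    top_words = sorted(r, key=r.get, reverse=True)[:2]
    print(top_words)                 # ['kerajaan', 'cukai']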
Example #4
 def _vectorize_word(self,
                     corpus,
                     isi_penting,
                     window_size,
                     important_words=10,
                     **kwargs):
     corpus = corpus_checker(corpus)
     splitted_fullstop = [summary_textcleaning(i) for i in corpus]
     original_strings = [i[0] for i in splitted_fullstop]
     cleaned_strings = [i[1] for i in splitted_fullstop]
     ngram_list, splitted = create_ngram(' '.join(cleaned_strings),
                                         ngram=window_size)
     splitted = ' '.join(original_strings).split()
     if isi_penting:
         isi_penting = [summary_textcleaning(isi_penting)[1]]
     else:
         isi_penting = [' '.join(cleaned_strings)]
     t = ngram_list + isi_penting
     self.vectorizer.fit(t)
     freq = self.vectorizer.transform(ngram_list)
     freq_isi_penting = self.vectorizer.transform(isi_penting)
     if important_words > 0:
         if hasattr(self.vectorizer, 'idf_'):
             indices = np.argsort(self.vectorizer.idf_)[::-1]
         else:
             indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
         features = self.vectorizer.get_feature_names()
         top_words = [features[i] for i in indices[:important_words]]
     else:
         top_words = []
     t = vstack([freq, freq_isi_penting])
     self.model.fit(t)
     vectors = self.model.transform(freq)
     vectors_isi_penting = self.model.transform(freq_isi_penting)
     similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
     scores = similar_isi_penting[:, 0]
     ranked_sentences = sorted(
         ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
     return (splitted, ranked_sentences, top_words,
             cluster_words(top_words))
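
`create_ngram` is not defined in this excerpt. From the way its output is consumed (one score per word in `splitted`), it presumably yields one overlapping word window per token, so each word can be scored by the similarity of its surrounding context to `isi_penting`. A hedged approximation:

    def create_ngram_sketch(text, ngram=3):
        # one overlapping window of up to `ngram` words on each side of every token
        splitted = text.split()
        ngram_list = [
            ' '.join(splitted[max(0, i - ngram):i + ngram + 1])
            for i in range(len(splitted))
        ]
        return ngram_list, splitted

    ngram_list, splitted = create_ngram_sketch('kerajaan menaikkan cukai minyak hari ini', ngram=2)
    print(len(ngram_list) == len(splitted))   # True: one window per word
    print(ngram_list[2])                      # 'kerajaan menaikkan cukai minyak hari'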
Example #5
    def _vectorize_word(self,
                        corpus,
                        isi_penting,
                        window_size=10,
                        important_words=10,
                        batch_size=10,
                        **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        ngram_list, splitted = create_ngram(' '.join(cleaned_strings),
                                            ngram=window_size)
        splitted = ' '.join(original_strings).split()
        if isi_penting:
            isi_penting = [isi_penting]
        else:
            isi_penting = original_strings

        vectors = self._batching(ngram_list, batch_size=batch_size)
        vectors_isi_penting = self._batching(isi_penting,
                                             batch_size=batch_size)

        if 'DeepSkipThought' in str(self.vectorizer):
            top_words = []
        else:
            if hasattr(self.vectorizer, 'attention') and important_words > 0:
                attentions = self.vectorizer.attention(corpus, **kwargs)
                flatten = list(itertools.chain(*attentions))
                r = {}
                for f in flatten:
                    c = simple_textcleaning(f[0])
                    if c in STOPWORDS:
                        continue
                    if c not in r:
                        r[c] = f[1]
                    else:
                        r[c] += f[1]
                top_words = sorted(r, key=r.get,
                                   reverse=True)[:important_words]
            else:
                top_words = []

        vectors_isi_penting = np.mean(vectors_isi_penting, axis=0)
        vectors_isi_penting = np.expand_dims(vectors_isi_penting, axis=0)
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        scores = similar_isi_penting[:, 0]
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
        return (splitted, ranked_sentences, top_words,
                cluster_words(top_words))
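
The scoring step at the end collapses the `isi_penting` vectors to a single centroid and ranks every window by cosine similarity against it. The same arithmetic with random arrays, purely to show the shapes involved:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    vectors = rng.normal(size=(6, 8))              # 6 word windows, 8-dim embeddings
    vectors_isi_penting = rng.normal(size=(3, 8))  # 3 query sentences

    centroid = np.mean(vectors_isi_penting, axis=0)
    centroid = np.expand_dims(centroid, axis=0)    # shape (1, 8) for cosine_similarity
    scores = cosine_similarity(vectors, centroid)[:, 0]

    splitted = ['kerajaan', 'menaikkan', 'cukai', 'minyak', 'hari', 'ini']
    ranked = sorted(((scores[i], w, i) for i, w in enumerate(splitted)), reverse=True)
    print(ranked[:3])                              # words whose context best matches the query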
Example #6
    def summarize(self,
                  corpus,
                  top_k: int = 3,
                  important_words: int = 3,
                  **kwargs):
        """
        Summarize a list of strings / corpus.

        Parameters
        ----------
        corpus: str, list

        top_k: int, (default=3)
            number of summarized strings.
        important_words: int, (default=3)
            number of important words.

        Returns
        -------
        dictionary: result
        """
        if not isinstance(corpus, list) and not isinstance(corpus, str):
            raise ValueError('corpus must be a string or a list of strings')
        if isinstance(corpus, list):
            if not isinstance(corpus[0], str):
                raise ValueError('corpus must be list of strings')

        if isinstance(corpus, str):
            corpus = split_into_sentences(corpus)
        else:
            corpus = '. '.join(corpus)
            corpus = split_into_sentences(corpus)

        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

        if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):

            sequences = skip_thought.batch_sequence(
                cleaned_strings,
                self._vectorizer.dictionary,
                maxlen=self._vectorizer._maxlen,
            )
            vectors, attention = self._vectorizer._sess.run(
                [self._vectorizer._logits, self._vectorizer._attention],
                feed_dict={self._vectorizer._X: np.array(sequences)},
            )
            attention = attention.sum(axis=0)
            indices = np.argsort(attention)[::-1]
            top_words = [
                self._vectorizer._rev_dictionary[i] for i in indices
                if self._vectorizer._rev_dictionary[i] not in STOPWORDS
            ][:important_words]

        else:
            vectors = self._vectorizer.vectorize(corpus)
            attentions = self._vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]

        similar = cosine_similarity(vectors, vectors)
        similar[similar >= 0.99999] = 0
        scores = pagerank(similar)
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        summary = [r[1] for r in ranked_sentences[:top_k]]

        return {
            'summary': ' '.join(summary),
            'top-words': top_words,
            'cluster-top-words': cluster_words(top_words),
        }
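
In the skip-thought branch, the important words come from summing attention over the batch, walking the indices in descending order through the reverse dictionary and skipping stopwords. The indexing mirrored with toy arrays (the dictionary, stopword set and attention values are made up):

    import numpy as np

    STOPWORDS = {'yang', 'dan'}
    rev_dictionary = {0: 'kerajaan', 1: 'yang', 2: 'cukai', 3: 'minyak', 4: 'dan'}

    attention = np.array([
        [0.2, 0.5, 0.9, 0.1, 0.3],   # one row of attention weights per sentence
        [0.4, 0.2, 0.1, 0.6, 0.1],
    ])
    summed = attention.sum(axis=0)
    indices = np.argsort(summed)[::-1]
    top_words = [
        rev_dictionary[i] for i in indices
        if rev_dictionary[i] not in STOPWORDS
    ][:3]
    print(top_words)                 # ['cukai', 'minyak', 'kerajaan']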
Example #7
def doc2vec(
    vectorizer,
    corpus,
    top_k: int = 3,
    aggregation: str = 'mean',
    soft: bool = True,
):
    """
    Summarize a list of strings using doc2vec, scored with TextRank.

    Parameters
    ----------
    vectorizer : object
        fast-text or word2vec interface object.
    corpus: list
    top_k: int, (default=3)
        number of summarized strings.
    aggregation : str, optional (default='mean')
        Aggregation supported. Allowed values:

        * ``'mean'`` - mean.
        * ``'min'`` - min.
        * ``'max'`` - max.
        * ``'sum'`` - sum.
        * ``'sqrt'`` - square root.
    soft: bool, optional (default=True)
        if True, a word not found in the vectorizer is replaced with the nearest known word; otherwise it is skipped.

    Returns
    -------
    dictionary: result
    """
    if not hasattr(vectorizer, 'get_vector_by_name'):
        raise ValueError('vectorizer must have a `get_vector_by_name` method')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]

    aggregation = aggregation.lower()
    if aggregation == 'mean':
        aggregation_function = np.mean
    elif aggregation == 'min':
        aggregation_function = np.min
    elif aggregation == 'max':
        aggregation_function = np.max
    elif aggregation == 'sum':
        aggregation_function = np.sum
    elif aggregation == 'sqrt':
        aggregation_function = np.sqrt
    else:
        raise ValueError(
            'aggregation only supports `mean`, `min`, `max`, `sum` and `sqrt`')

    vectors = []
    for string in cleaned_strings:
        inside = []
        for token in string.split():
            try:
                inside.append(vectorizer.get_vector_by_name(token))
            except Exception:
                if not soft:
                    # skip tokens missing from the vectorizer vocabulary
                    pass
                else:
                    # assumption: the word-vector interface exposes a Jaro-Winkler
                    # scorer (as `self.wordvector._jarowinkler` does in the later
                    # examples); the original snippet referenced an undefined `self`
                    arr = np.array([
                        vectorizer._jarowinkler.similarity(token, k)
                        for k in vectorizer.words
                    ])
                    idx = (-arr).argsort()[0]
                    inside.append(
                        vectorizer.get_vector_by_name(vectorizer.words[idx]))
        vectors.append(aggregation_function(inside, axis=0))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return ' '.join(summary)
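
When `soft=True`, an out-of-vocabulary token is mapped to the closest known word by string similarity before its vector is looked up (the snippet assumes a Jaro-Winkler scorer). The lookup pattern in isolation, with `difflib.SequenceMatcher` swapped in as the similarity measure since it ships with the standard library:

    import numpy as np
    from difflib import SequenceMatcher

    known_words = ['kerajaan', 'menaikkan', 'cukai', 'minyak']

    def nearest_known_word(token, words):
        # pick the vocabulary word with the highest string similarity to `token`
        arr = np.array([SequenceMatcher(None, token, k).ratio() for k in words])
        idx = (-arr).argsort()[0]
        return words[idx]

    print(nearest_known_word('kerajaannya', known_words))   # 'kerajaan'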
Example #8
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):

    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    stemmed = [sastrawi(i) for i in cleaned_strings]

    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    elif vectorizer == 'skip-gram':
        Vectorizer = SkipGramVectorizer
    else:
        raise Exception("vectorizer must be in  ['tfidf', 'bow', 'skip-gram']")
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)
    if hasattr(tf_vectorizer, 'idf_'):
        indices = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        indices = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]

    features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in indices[:important_words]]
    vectors = decomposition(tf.shape[1] // 2).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
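
`decomposition` is passed in as a class, not an instance: `_base_summarizer` instantiates it with `tf.shape[1] // 2` components and fits it on the term matrix, so any scikit-learn decomposition (TruncatedSVD, NMF, LatentDirichletAllocation) fits the contract. A small sketch of that wiring with a toy corpus; it uses a fixed `n_components=2` because the toy matrix is tiny, and it skips the stemming and stopword handling of the real helper:

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import TruncatedSVD
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = [
        'kerajaan menaikkan cukai minyak',
        'rakyat membantah kenaikan cukai',
        'harga barang naik selepas cukai minyak naik',
        'pasukan bola sepak menang malam tadi',
    ]
    decomposition = TruncatedSVD        # NMF or LatentDirichletAllocation also work here

    tf_vectorizer = CountVectorizer(ngram_range=(1, 2))
    tf = tf_vectorizer.fit_transform(corpus)
    vectors = decomposition(n_components=2).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    print(similar.round(2))             # sentence-to-sentence similarity graph fed to pagerank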
Example #9
    def _vectorize_sentence(self,
                            corpus,
                            isi_penting,
                            aggregation=np.mean,
                            soft=False,
                            retry=5,
                            **kwargs):

        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        vectors = []
        for string in cleaned_strings:
            inside = []
            for token in string.split():
                if token in self.wordvector._dictionary:
                    v = self.wordvector.get_vector_by_name(token)
                else:
                    if not soft:
                        v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                    else:
                        arr = np.array([
                            self.wordvector._jarowinkler.similarity(token, k)
                            for k in self.wordvector.words
                        ])
                        idx = (-arr).argsort()[0]
                        v = self.wordvector.get_vector_by_name(
                            self.wordvector.words[idx])

                inside.append(v)
            vectors.append(aggregation(inside, axis=0))
        vectors = np.array(vectors)

        if isi_penting:
            cleaned_isi_penting = summary_textcleaning(isi_penting)[1]
            vectors_isi_penting = []
            for token in cleaned_isi_penting.split():
                if token in self.wordvector._dictionary:
                    v = self.wordvector.get_vector_by_name(token)
                else:
                    if not soft:
                        v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                    else:
                        arr = np.array([
                            self.wordvector._jarowinkler.similarity(token, k)
                            for k in self.wordvector.words
                        ])
                        idx = (-arr).argsort()[0]
                        v = self.wordvector.get_vector_by_name(
                            self.wordvector.words[idx])
                vectors_isi_penting.append(v)
            vectors_isi_penting = aggregation(vectors_isi_penting, axis=0)
            vectors_isi_penting = np.expand_dims(vectors_isi_penting, 0)

        similar = cosine_similarity(vectors, vectors)
        if isi_penting:
            similar_isi_penting = cosine_similarity(vectors,
                                                    vectors_isi_penting)
            similar = similar * similar_isi_penting
        else:
            similar[similar >= 0.99] = 0
        scores = pagerank(similar + 1e-6, retry)
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        return (original_strings, ranked_sentences)
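
The sentence embedding in Example #9 is just an aggregation (by default the mean) of word vectors, with a zero vector standing in for unknown words when `soft=False`. The same logic against a toy embedding table, which replaces `self.wordvector._dictionary` / `_embed_matrix`:

    import numpy as np

    embedding_dim = 4
    table = {
        'kerajaan': np.array([0.1, 0.3, 0.2, 0.4]),
        'cukai': np.array([0.5, 0.1, 0.0, 0.2]),
        'minyak': np.array([0.2, 0.2, 0.6, 0.1]),
    }

    def sentence_vector(sentence, aggregation=np.mean):
        inside = []
        for token in sentence.split():
            # unknown words fall back to a zero vector (the soft=False branch)
            inside.append(table.get(token, np.zeros(embedding_dim)))
        return aggregation(inside, axis=0)

    vectors = np.array([
        sentence_vector('kerajaan menaikkan cukai'),
        sentence_vector('cukai minyak naik'),
    ])
    print(vectors.shape)   # (2, 4)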
Example #10
    def _vectorize_word(self,
                        corpus,
                        isi_penting,
                        window_size,
                        aggregation=np.mean,
                        soft=False,
                        **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        ngram_list, splitted = create_ngram(' '.join(cleaned_strings),
                                            ngram=window_size)
        splitted = ' '.join(original_strings).split()
        if isi_penting:
            isi_penting = summary_textcleaning(isi_penting)[1]
        else:
            isi_penting = ' '.join(cleaned_strings)
        vectors = []
        for string in ngram_list:
            inside = []
            for token in string.split():
                if token in self.wordvector._dictionary:
                    v = self.wordvector.get_vector_by_name(token)
                else:
                    if not soft:
                        v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                    else:
                        arr = np.array([
                            self.wordvector._jarowinkler.similarity(token, k)
                            for k in self.wordvector.words
                        ])
                        idx = (-arr).argsort()[0]
                        v = self.wordvector.get_vector_by_name(
                            self.wordvector.words[idx])
                inside.append(v)
            vectors.append(aggregation(inside, axis=0))
        vectors = np.array(vectors)

        cleaned_isi_penting = isi_penting
        vectors_isi_penting = []
        for token in cleaned_isi_penting.split():
            if token in self.wordvector._dictionary:
                vectors_isi_penting.append(
                    self.wordvector.get_vector_by_name(token))
            else:
                if not soft:
                    vectors_isi_penting.append(
                        np.zeros((self.wordvector._embed_matrix.shape[1])))
                else:
                    arr = np.array([
                        self.wordvector._jarowinkler.similarity(token, k)
                        for k in self.wordvector.words
                    ])
                    idx = (-arr).argsort()[0]
                    vectors_isi_penting.append(
                        self.wordvector.get_vector_by_name(
                            self.wordvector.words[idx]))
        vectors_isi_penting = aggregation(vectors_isi_penting, axis=0)
        vectors_isi_penting = np.expand_dims(vectors_isi_penting, 0)
        similar_isi_penting = cosine_similarity(vectors, vectors_isi_penting)
        scores = similar_isi_penting[:, 0]
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(splitted)), reverse=True)
        return (splitted, ranked_sentences)