Example #1
    def vectorize(self, strings):
        """
        Vectorize string inputs using the skip-thought encoder.

        Parameters
        ----------
        strings : str / list of str

        Returns
        -------
        array: vectorized strings
        """

        if isinstance(strings, str):
            strings = [strings]
        if not isinstance(strings, list) or not all(
            isinstance(s, str) for s in strings
        ):
            raise ValueError('input must be a list of strings or a string')

        # summary_textcleaning returns (original, cleaned) pairs
        splitted_fullstop = [summary_textcleaning(i) for i in strings]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        # map tokens to ids and pad to a fixed length before feeding the graph
        sequences = skip_thought.batch_sequence(cleaned_strings,
                                                self.dictionary,
                                                maxlen=self._maxlen)
        return self._sess.run(self._logits,
                              feed_dict={self._X: np.array(sequences)})
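Before the session call, the real work is turning cleaned strings into a fixed-width matrix of token ids. `skip_thought.batch_sequence` itself is not shown in this snippet, so the sketch below is an assumption of what such a helper does: look tokens up in the dictionary with an unknown-id fallback, then right-pad each row to `maxlen`.

import numpy as np

def batch_sequence(strings, dictionary, maxlen=50, unk=1, pad=0):
    # assumed behaviour: token -> id lookup with an UNK fallback, then
    # right-padding every row to a fixed width; the real helper may differ
    batch = np.full((len(strings), maxlen), pad, dtype=np.int32)
    for row, string in enumerate(strings):
        ids = [dictionary.get(t, unk) for t in string.split()[:maxlen]]
        batch[row, :len(ids)] = ids
    return batch

dictionary = {'machine': 2, 'learning': 3, 'is': 4, 'fun': 5}
print(batch_sequence(['machine learning is fun'], dictionary, maxlen=6))
# [[2 3 4 5 0 0]]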
Example #2
    def summarize(self,
                  corpus,
                  top_k: int = 3,
                  important_words: int = 3,
                  **kwargs):
        """
        Summarize list of strings / corpus

        Parameters
        ----------
        corpus : str / list of str

        top_k: int, (default=3)
            number of top-ranked sentences to include in the summary.
        important_words: int, (default=3)
            number of important words to return.

        Returns
        -------
        dictionary: result with 'summary', 'top-words' and 'cluster-top-words' keys
        """
        if not isinstance(corpus, (list, str)):
            raise ValueError('corpus must be a string or a list of strings')
        if isinstance(corpus, list) and not all(
            isinstance(s, str) for s in corpus
        ):
            raise ValueError('corpus must be a list of strings')

        # normalise the input into a list of sentences
        if isinstance(corpus, str):
            corpus = split_into_sentences(corpus)
        else:
            corpus = split_into_sentences('. '.join(corpus))

        # summary_textcleaning returns (original, cleaned) pairs
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

        if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):
            # skip-thought path: run the encoder and read the attention
            # weights directly from the graph
            sequences = skip_thought.batch_sequence(
                cleaned_strings,
                self._vectorizer.dictionary,
                maxlen=self._vectorizer._maxlen,
            )
            vectors, attention = self._vectorizer._sess.run(
                [self._vectorizer._logits, self._vectorizer._attention],
                feed_dict={self._vectorizer._X: np.array(sequences)},
            )
            # aggregate attention over the batch and keep the most
            # attended non-stopword tokens
            attention = attention.sum(axis=0)
            indices = np.argsort(attention)[::-1]
            top_words = [
                self._vectorizer._rev_dictionary[i] for i in indices
                if self._vectorizer._rev_dictionary[i] not in STOPWORDS
            ][:important_words]

        else:
            vectors = self._vectorizer.vectorize(corpus)
            attentions = self._vectorizer.attention(corpus, **kwargs)
            # sum attention weight per cleaned token across every sentence,
            # skipping stopwords
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                word = simple_textcleaning(f[0])
                if word in STOPWORDS:
                    continue
                r[word] = r.get(word, 0) + f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]

        # TextRank-style ranking: build a sentence-similarity graph, zero
        # out (near-)self-similarity, then score sentences with pagerank
        similar = cosine_similarity(vectors, vectors)
        similar[similar >= 0.99999] = 0
        scores = pagerank(similar)
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        summary = [r[1] for r in ranked_sentences[:top_k]]

        return {
            'summary': ' '.join(summary),
            'top-words': top_words,
            'cluster-top-words': cluster_words(top_words),
        }
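The ranking core of `summarize` is plain TextRank: score sentences by pagerank over their cosine-similarity graph and keep the top `top_k`. A self-contained sketch of that step, using networkx's pagerank in place of the library's own `pagerank` helper and random non-negative vectors as stand-ins for real sentence embeddings:

import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

sentences = ['First sentence.', 'Second sentence.', 'Third sentence.']
vectors = np.random.default_rng(0).random((len(sentences), 8))  # stand-in embeddings

# similarity graph; zero the (near-)diagonal so a sentence cannot vote for itself
similar = cosine_similarity(vectors, vectors)
similar[similar >= 0.99999] = 0
scores = nx.pagerank(nx.from_numpy_array(similar))

ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
print(' '.join(s for _, s in ranked[:2]))  # top-2 'summary'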