Code example #1
def similarity_transformer(string,
                           model,
                           top_k: int = 5,
                           auto_ngram: bool = True,
                           ngram_method: str = 'bow',
                           ngram: Tuple[int, int] = (1, 1),
                           atleast: int = 1,
                           stopwords=get_stopwords,
                           **kwargs):
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, '_tree_plot'):
        raise ValueError('model must have `_tree_plot` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    # `methods` is the module-level list of supported ngram generators.
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stopwords=stopwords,
                      **kwargs)

    similar = model._tree_plot(list(vocab.keys()))
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        [(scores[i], s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
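All of these examples lean on a shared `pagerank` helper that is not included in the excerpts. As a rough, illustrative stand-in (not the library's actual implementation, and omitting the `retry` argument some call sites pass), a textbook power-iteration PageRank over the row-normalised similarity matrix could look like this:

import numpy as np

def pagerank(similar, damping=0.85, max_iter=100, tol=1e-6):
    # Power iteration: each sentence's score is redistributed along its
    # similarity edges until the score vector stops changing.
    n = similar.shape[0]
    row_sums = similar.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    transition = similar / row_sums  # rows become probability distributions
    scores = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        updated = (1 - damping) / n + damping * (transition.T @ scores)
        if np.abs(updated - scores).sum() < tol:
            return updated
        scores = updated
    return scores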
Code example #2
def similarity_transformer(string,
                           model,
                           top_k: int = 5,
                           auto_ngram: bool = True,
                           ngram_method: str = 'bow',
                           ngram: Tuple[int, int] = (1, 1),
                           atleast: int = 1,
                           stop_words: List[str] = STOPWORDS,
                           **kwargs):
    if not hasattr(model, '_tree_plot'):
        raise ValueError('model must have `_tree_plot` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")

    if auto_ngram:
        vocab = _auto_ngram(string, stop_words)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stop_words=stop_words,
                      **kwargs)

    similar = model._tree_plot(list(vocab.keys()))
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        [(scores[i], s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
Code example #3
    def _vectorize_sentence(self,
                            corpus,
                            isi_penting,
                            important_words=10,
                            batch_size=10,
                            retry=5,
                            **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

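        # Vectorise the cleaned sentences in batches; `isi_penting` (Malay
        # for "important content") is vectorised separately so each
        # sentence can later be scored against it.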
        vectors = self._batching(cleaned_strings, batch_size=batch_size)
        if isi_penting:
            vectors_isi_penting = self._batching([isi_penting],
                                                 batch_size=batch_size)

        if 'DeepSkipThought' in str(self.vectorizer):
            top_words = []
        else:
            if hasattr(self.vectorizer, 'attention'):
                attentions = self.vectorizer.attention(corpus, **kwargs)
                flatten = list(itertools.chain(*attentions))
                r = {}
                for f in flatten:
                    c = simple_textcleaning(f[0])
                    if c in STOPWORDS:
                        continue
                    if c not in r:
                        r[c] = f[1]
                    else:
                        r[c] += f[1]
                top_words = sorted(r, key=r.get,
                                   reverse=True)[:important_words]
            else:
                top_words = []

        similar = cosine_similarity(vectors, vectors)
        if isi_penting:
            similar_isi_penting = cosine_similarity(vectors,
                                                    vectors_isi_penting)
            similar = similar * similar_isi_penting
        else:
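            # Zero out (near-)self similarities so PageRank does not reward
            # sentences for matching themselves.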
            similar[similar >= 0.99] = 0
        scores = pagerank(similar + 1e-6, retry)
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        return (
            original_strings,
            ranked_sentences,
            top_words,
            cluster_words(top_words),
        )
Code example #4
    def _vectorize_sentence(self,
                            corpus,
                            isi_penting,
                            important_words=10,
                            retry=5,
                            **kwargs):
        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        if isi_penting:
            isi_penting = [summary_textcleaning(isi_penting)[1]]
            t = cleaned_strings + isi_penting
        else:
            t = cleaned_strings
        self.vectorizer.fit(t)
        freq = self.vectorizer.transform(cleaned_strings)
        if isi_penting:
            freq_isi_penting = self.vectorizer.transform(isi_penting)
        if important_words > 0:
            if hasattr(self.vectorizer, 'idf_'):
                indices = np.argsort(self.vectorizer.idf_)[::-1]
            else:
                indices = np.argsort(np.asarray(freq.sum(axis=0))[0])[::-1]
            features = self.vectorizer.get_feature_names()
            top_words = [features[i] for i in indices[:important_words]]
        else:
            top_words = []
        if isi_penting:
            t = vstack([freq, freq_isi_penting])
        else:
            t = freq
        self.model.fit(t)
        vectors = self.model.transform(freq)
        if isi_penting:
            vectors_isi_penting = self.model.transform(freq_isi_penting)
        similar = cosine_similarity(vectors, vectors)
        if isi_penting:
            similar_isi_penting = cosine_similarity(vectors,
                                                    vectors_isi_penting)
            similar = similar * similar_isi_penting
        else:
            similar[similar >= 0.99] = 0
        scores = pagerank(similar + 1e-6, retry)
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        return (
            original_strings,
            ranked_sentences,
            top_words,
            cluster_words(top_words),
        )
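For orientation, here is a minimal, self-contained sketch of the same fit/transform/decompose flow using stock scikit-learn pieces. `TfidfVectorizer` and `TruncatedSVD` are assumptions standing in for `self.vectorizer` and `self.model`, which the class configures elsewhere:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

sentences = [
    'kerajaan mengumumkan bantuan baharu',
    'rakyat menyambut baik pengumuman kerajaan',
    'cuaca hari ini sangat panas',
]
vectorizer = TfidfVectorizer()          # plays the role of self.vectorizer
vectorizer.fit(sentences)
freq = vectorizer.transform(sentences)
model = TruncatedSVD(n_components=2)    # plays the role of self.model
model.fit(freq)
vectors = model.transform(freq)
similar = cosine_similarity(vectors, vectors)
similar[similar >= 0.99] = 0            # suppress self/near-duplicate matches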
Code example #5
    def summarize(self,
                  corpus,
                  top_k: int = 3,
                  important_words: int = 3,
                  **kwargs):
        """
        Summarize a list of strings / corpus

        Parameters
        ----------
        corpus: str, list

        top_k: int, (default=3)
            number of summarized strings.
        important_words: int, (default=3)
            number of important words.

        Returns
        -------
        dictionary: result
        """
        if not isinstance(corpus, list) and not isinstance(corpus, str):
            raise ValueError('corpus must be a string or a list of strings')
        if isinstance(corpus, list):
            if not isinstance(corpus[0], str):
                raise ValueError('corpus must be list of strings')

        if isinstance(corpus, str):
            corpus = split_into_sentences(corpus)
        else:
            corpus = '. '.join(corpus)
            corpus = split_into_sentences(corpus)

        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]

        if 'DEEP_SKIPTHOUGHT' in str(self._vectorizer):

            sequences = skip_thought.batch_sequence(
                cleaned_strings,
                self._vectorizer.dictionary,
                maxlen=self._vectorizer._maxlen,
            )
            vectors, attention = self._vectorizer._sess.run(
                [self._vectorizer._logits, self._vectorizer._attention],
                feed_dict={self._vectorizer._X: np.array(sequences)},
            )
            attention = attention.sum(axis=0)
            indices = np.argsort(attention)[::-1]
            top_words = [
                self._vectorizer._rev_dictionary[i] for i in indices
                if self._vectorizer._rev_dictionary[i] not in STOPWORDS
            ][:important_words]

        else:
            vectors = self._vectorizer.vectorize(corpus)
            attentions = self._vectorizer.attention(corpus, **kwargs)
            flatten = list(itertools.chain(*attentions))
            r = {}
            for f in flatten:
                c = simple_textcleaning(f[0])
                if c in STOPWORDS:
                    continue
                if c not in r:
                    r[c] = f[1]
                else:
                    r[c] += f[1]
            top_words = sorted(r, key=r.get, reverse=True)[:important_words]

        similar = cosine_similarity(vectors, vectors)
        similar[similar >= 0.99999] = 0
        scores = pagerank(similar)
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        summary = [r[1] for r in ranked_sentences[:top_k]]

        return {
            'summary': ' '.join(summary),
            'top-words': top_words,
            'cluster-top-words': cluster_words(top_words),
        }
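The attention branch above condenses per-sentence `(token, weight)` pairs into corpus-level keywords. A stripped-down sketch of that aggregation, with `.lower()` standing in for `simple_textcleaning`, could look like this:

import itertools

def top_attention_words(attentions, stopwords, k=3):
    # `attentions` is a list of [(token, weight), ...] per sentence;
    # weights for the same cleaned token are summed across the corpus.
    scores = {}
    for token, weight in itertools.chain(*attentions):
        token = token.lower()
        if token in stopwords:
            continue
        scores[token] = scores.get(token, 0.0) + weight
    return sorted(scores, key=scores.get, reverse=True)[:k]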
Code example #6
def doc2vec(
    vectorizer,
    corpus,
    top_k: int = 3,
    aggregation: str = 'mean',
    soft: bool = True,
):
    """
    Summarize a list of strings using doc2vec, scoring with TextRank.

    Parameters
    ----------
    vectorizer : object
        fast-text or word2vec interface object.
    corpus: list
    top_k: int, (default=3)
        number of summarized strings.
    aggregation : str, optional (default='mean')
        Aggregation supported. Allowed values:

        * ``'mean'`` - mean.
        * ``'min'`` - min.
        * ``'max'`` - max.
        * ``'sum'`` - sum.
        * ``'sqrt'`` - square root.
    soft: bool, optional (default=True)
        if True, a word not found inside the vectorizer is replaced with the nearest in-vocabulary word; if False, it is skipped.

    Returns
    -------
    string: summarized string
    """
    if not hasattr(vectorizer, 'get_vector_by_name'):
        raise ValueError('vectorizer must have `get_vector_by_name` method')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]

    aggregation = aggregation.lower()
    if aggregation == 'mean':
        aggregation_function = np.mean
    elif aggregation == 'min':
        aggregation_function = np.min
    elif aggregation == 'max':
        aggregation_function = np.max
    elif aggregation == 'sum':
        aggregation_function = np.sum
    elif aggregation == 'sqrt':
        aggregation_function = np.sqrt
    else:
        raise ValueError(
            'aggregation only supports `mean`, `min`, `max`, `sum` and `sqrt`')

    vectors = []
    for string in cleaned_strings:
        inside = []
        for token in string.split():
            try:
                inside.append(vectorizer.get_vector_by_name(token))
            except Exception:
                if not soft:
                    # skip out-of-vocabulary tokens entirely
                    continue
                # Fall back to the vocabulary word nearest by Jaro-Winkler
                # similarity; the word-vector interface carries its own
                # `_jarowinkler` helper (compare code example #10).
                arr = np.array([
                    vectorizer._jarowinkler.similarity(token, k)
                    for k in vectorizer.words
                ])
                idx = (-arr).argsort()[0]
                inside.append(
                    vectorizer.get_vector_by_name(vectorizer.words[idx]))
        vectors.append(aggregation_function(inside, axis=0))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return ' '.join(summary)
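To experiment with `doc2vec` without a trained fast-text or word2vec model, a toy object only needs the two members the function touches: a `words` list and `get_vector_by_name`. The stub below is a hypothetical illustration, not the library's real interface, and the commented-out call still requires the module's own helpers (`split_into_sentences`, `pagerank`, and so on) to be importable:

import numpy as np

class ToyWordVector:
    # Minimal object exposing the two members doc2vec touches:
    # a `words` list and a `get_vector_by_name` lookup.
    def __init__(self, table):
        self._table = table
        self.words = list(table)

    def get_vector_by_name(self, word):
        return self._table[word]

table = {w: np.random.rand(32) for w in ['saya', 'makan', 'nasi']}
# summary = doc2vec(ToyWordVector(table), 'Saya makan nasi. Saya suka nasi.',
#                   top_k=1, soft=False)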
Code example #7
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):

    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    stemmed = [sastrawi(i) for i in cleaned_strings]

    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    elif vectorizer == 'skip-gram':
        Vectorizer = SkipGramVectorizer
    else:
        raise Exception("vectorizer must be in  ['tfidf', 'bow', 'skip-gram']")
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)
    if hasattr(tf_vectorizer, 'idf_'):
        indices = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        indices = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]

    features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in indices[:important_words]]
    vectors = decomposition(tf.shape[1] // 2).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }
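Since `_base_summarizer` instantiates `decomposition(tf.shape[1] // 2)`, any estimator whose first positional argument is the number of components should slot in. The pairings below are illustrative assumptions rather than the library's documented wrappers, and `corpus` is assumed to hold a reasonably long input text:

from sklearn.decomposition import NMF, TruncatedSVD

# LSA-style summarisation via TruncatedSVD over TF-IDF, and an NMF
# variant over bag-of-words counts.
lsa_result = _base_summarizer(corpus, TruncatedSVD, vectorizer='tfidf')
nmf_result = _base_summarizer(corpus, NMF, vectorizer='bow')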
Code example #8
def textrank(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Textrank algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        model must have a `fit_transform` or `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        minimum number of occurrences in the string to accept as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'fit_transform') and not hasattr(model, 'vectorize'):
        raise ValueError(
            'model must have `fit_transform` or `vectorize` method')

    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    if hasattr(model, 'vectorize'):
        vectors = model.vectorize(list(vocab.keys()))
    else:
        vectors = model.fit_transform(list(vocab.keys()))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    total = sum(scores)
    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )

    return ranked_sentences[:top_k]
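A hedged usage sketch: per the docstring, anything exposing `fit_transform` qualifies as `model`, so a plain scikit-learn TF-IDF vectorizer works, and leaving `vectorizer=None` routes candidate generation through `_auto_ngram` (this still needs the surrounding module's helpers importable):

from sklearn.feature_extraction.text import TfidfVectorizer

string = 'Kerajaan mengumumkan bantuan baharu untuk rakyat.'
keywords = textrank(string, model=TfidfVectorizer())
# returns up to top_k (normalised score, phrase) pairs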
Code example #9
File: keyword_extraction.py, project: tx-qi/malaya
def textrank(string: str,
             vectorizer,
             top_k: int = 5,
             auto_ngram: bool = True,
             ngram_method: str = 'bow',
             ngram: Tuple[int, int] = (1, 1),
             atleast: int = 1,
             stopwords=get_stopwords,
             **kwargs):
    """
    Extract keywords using Textrank algorithm.

    Parameters
    ----------
    string: str
    vectorizer: Object
        must have a `fit_transform` or `vectorize` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates from suitable n-grams automatically. Else, use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. supported ngram generator:

        * ``'bow'`` - bag-of-word.
        * ``'skip-gram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        minimum number of occurrences in the string to accept as a candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returns a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(vectorizer, 'fit_transform') and not hasattr(
            vectorizer, 'vectorize'):
        raise ValueError(
            'vectorizer must have `fit_transform` or `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stopwords=stopwords,
                      **kwargs)

    if hasattr(vectorizer, 'vectorize'):
        vectors = vectorizer.vectorize(list(vocab.keys()))
    else:
        vectors = vectorizer.fit_transform(list(vocab.keys()))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    total = sum(scores)
    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )

    return ranked_sentences[:top_k]
Code example #10
    def _vectorize_sentence(self,
                            corpus,
                            isi_penting,
                            aggregation=np.mean,
                            soft=False,
                            retry=5,
                            **kwargs):

        corpus = corpus_checker(corpus)
        splitted_fullstop = [summary_textcleaning(i) for i in corpus]
        original_strings = [i[0] for i in splitted_fullstop]
        cleaned_strings = [i[1] for i in splitted_fullstop]
        vectors = []
        for string in cleaned_strings:
            inside = []
            for token in string.split():
                if token in self.wordvector._dictionary:
                    v = self.wordvector.get_vector_by_name(token)
                else:
                    if not soft:
                        v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                    else:
                        arr = np.array([
                            self.wordvector._jarowinkler.similarity(token, k)
                            for k in self.wordvector.words
                        ])
                        idx = (-arr).argsort()[0]
                        v = self.wordvector.get_vector_by_name(
                            self.wordvector.words[idx])

                inside.append(v)
            vectors.append(aggregation(inside, axis=0))
        vectors = np.array(vectors)

        if isi_penting:
            cleaned_isi_penting = summary_textcleaning(isi_penting)[1]
            vectors_isi_penting = []
            for token in cleaned_isi_penting.split():
                if token in self.wordvector._dictionary:
                    v = self.wordvector.get_vector_by_name(token)
                else:
                    if not soft:
                        v = np.zeros((self.wordvector._embed_matrix.shape[1]))
                    else:
                        arr = np.array([
                            self.wordvector._jarowinkler.similarity(token, k)
                            for k in self.wordvector.words
                        ])
                        idx = (-arr).argsort()[0]
                        v = self.wordvector.get_vector_by_name(
                            self.wordvector.words[idx])
                vectors_isi_penting.append(v)
            vectors_isi_penting = aggregation(vectors_isi_penting, axis=0)
            vectors_isi_penting = np.expand_dims(vectors_isi_penting, 0)

        similar = cosine_similarity(vectors, vectors)
        if isi_penting:
            similar_isi_penting = cosine_similarity(vectors,
                                                    vectors_isi_penting)
            similar = similar * similar_isi_penting
        else:
            similar[similar >= 0.99] = 0
        scores = pagerank(similar + 1e-6, retry)
        ranked_sentences = sorted(
            ((scores[i], s, i) for i, s in enumerate(original_strings)),
            reverse=True,
        )
        return (original_strings, ranked_sentences)
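The soft-mode fallback in this example (and in code example #6) maps an out-of-vocabulary token to its nearest vocabulary word by Jaro-Winkler similarity. A standalone sketch of that lookup, using the `jellyfish` package as an assumed stand-in for the bundled `_jarowinkler` helper:

import numpy as np
import jellyfish  # assumed stand-in for the wordvector's `_jarowinkler`

def nearest_in_vocab(token, words):
    # Score every vocabulary word against the out-of-vocabulary token
    # and keep the best match.
    scores = np.array([jellyfish.jaro_winkler_similarity(token, w)
                       for w in words])
    return words[int(scores.argmax())]

nearest_in_vocab('makanan2', ['makanan', 'minuman', 'kenderaan'])  # 'makanan'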