Example 1
def similarity_transformer(string,
                           model,
                           top_k: int = 5,
                           auto_ngram: bool = True,
                           ngram_method: str = 'bow',
                           ngram: Tuple[int, int] = (1, 1),
                           atleast: int = 1,
                           stopwords=get_stopwords,
                           **kwargs):
    stopwords = validator.validate_stopwords(stopwords)
    if not hasattr(model, '_tree_plot'):
        raise ValueError('model must have `_tree_plot` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stopwords=stopwords,
                      **kwargs)

    similar = model._tree_plot(list(vocab.keys()))
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    ranked_sentences = sorted(
        [(scores[i], s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
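
The `pagerank` call above relies on a helper that is not shown in this snippet. A minimal sketch of such a scorer, assuming it takes a square similarity matrix and returns one score per candidate (the library's actual helper may differ):

import numpy as np

def pagerank_sketch(similarity, damping=0.85, iterations=100, tol=1e-6):
    # Row-normalize the similarity matrix into a transition matrix.
    n = similarity.shape[0]
    row_sums = similarity.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0
    transition = similarity / row_sums
    # Power iteration: scores converge to the stationary distribution.
    scores = np.full(n, 1.0 / n)
    for _ in range(iterations):
        new_scores = (1 - damping) / n + damping * transition.T @ scores
        if np.abs(new_scores - scores).sum() < tol:
            break
        scores = new_scores
    return scores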
Example 2
def sklearn(
    corpus: List[str],
    model,
    vectorizer,
    n_topics: int,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Train a SKlearn model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    model : object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.decomposition.TruncatedSVD`` - LSA algorithm.
        * ``sklearn.decomposition.LatentDirichletAllocation`` - LDA algorithm.
        * ``sklearn.decomposition.NMF`` - NMF algorithm.
    vectorizer : object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int
        number of topics for the decomposition.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: malaya.topic_modelling.Topic class
    """
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if not hasattr(vectorizer, 'fit_transform'):
        raise ValueError('vectorizer must have `fit_transform` method')

    if len(corpus) < n_topics:
        raise ValueError(
            'length corpus must be bigger than or equal to n_topics')

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    tf = vectorizer.fit_transform(corpus)
    tf_features = vectorizer.get_feature_names()
    compose = model(n_topics).fit(tf)
    return Topic(tf_features, compose, corpus, compose.transform(tf),
                 vectorizer, tf)
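
A self-contained toy run of the same vectorize-then-decompose pipeline using plain scikit-learn; the corpus, names, and topic count below are illustrative only, and newer scikit-learn exposes `get_feature_names_out()` where the function above calls `get_feature_names()`:

# Toy topic modelling: CountVectorizer + LatentDirichletAllocation.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    'kerajaan umum bajet baru untuk rakyat',
    'pasukan bola sepak menang perlawanan akhir',
    'bajet kerajaan fokus kepada ekonomi rakyat',
    'penyokong bola sepak raikan kemenangan pasukan',
]
vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(tf)
features = vectorizer.get_feature_names_out()
for topic_no, topic in enumerate(lda.components_):
    top = [features[i] for i in topic.argsort()[-5:][::-1]]
    print('topic', topic_no, ':', ' '.join(top))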
Example 3
def lsa(
    corpus,
    n_topics,
    max_df=0.95,
    min_df=2,
    ngram=(1, 3),
    vectorizer='bow',
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Train a LSA model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        number of topics for the decomposition.
    max_df: float, (default=0.95)
        maximum document frequency for a word to be selected.
    min_df: int, (default=2)
        minimum document frequency for a word to be selected.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    TOPIC: malaya.topic_modelling.TOPIC class
    """
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)
    return _base_topic_modelling(
        corpus,
        n_topics,
        TruncatedSVD,
        max_df=max_df,
        min_df=min_df,
        ngram=ngram,
        vectorizer=vectorizer,
        cleaning=cleaning,
        stopwords=stopwords,
        **kwargs,
    )
Example 4
def attention(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has an `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """

    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in attention:
        d[k] += v

    scores = []
    for k in vocab.keys():
        scores.append(sum([d.get(w, 0) for w in k.split()]))

    total = sum(scores)

    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
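
A compact illustration of the scoring step above: per-token attention weights are accumulated, each candidate phrase is scored by the sum of its tokens' weights, and the scores are normalised. The attention pairs and candidate phrases below are made up for the example.

from collections import defaultdict

# Made-up (token, attention weight) pairs, as `model.attention` would return.
attention_pairs = [('harga', 0.30), ('minyak', 0.25), ('naik', 0.20),
                   ('harga', 0.15), ('lagi', 0.10)]
candidates = ['harga minyak', 'minyak naik', 'lagi']

d = defaultdict(float)
for token, weight in attention_pairs:
    d[token] += weight

scores = [sum(d.get(w, 0) for w in c.split()) for c in candidates]
total = sum(scores)
ranked = sorted(((s / total, c) for s, c in zip(scores, candidates)), reverse=True)
print(ranked)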
Example 5
def textrank(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Textrank algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        model that has a `fit_transform` or `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or, 
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'fit_transform') and not hasattr(model, 'vectorize'):
        raise ValueError(
            'model must have `fit_transform` or `vectorize` method')

    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    if hasattr(model, 'fit_transform'):
        vectors = model.fit_transform(list(vocab.keys()))
    if hasattr(model, 'vectorize'):
        vectors = model.vectorize(list(vocab.keys()))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    total = sum(scores)
    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )

    return ranked_sentences[:top_k]
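
A rough end-to-end sketch of the same TextRank idea with off-the-shelf components, using networkx's `pagerank` in place of the library's internal helper; the candidate phrases are illustrative only.

# TextRank-style ranking of candidate phrases with scikit-learn + networkx.
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

candidates = ['harga minyak', 'kerajaan umum', 'minyak naik', 'rakyat malaysia']
vectors = TfidfVectorizer().fit_transform(candidates)
similar = cosine_similarity(vectors)
similar[similar >= 0.99999] = 0  # drop self-similarity, as in the code above

graph = nx.from_numpy_array(similar)
scores = nx.pagerank(graph)
ranked = sorted(((score, candidates[node]) for node, score in scores.items()),
                reverse=True)
print(ranked[:3])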
Example 6
def cluster_entity_linking(
        corpus: List[str],
        vectorizer,
        entity_model,
        topic_modeling_model,
        threshold: float = 0.3,
        topic_decomposition: int = 2,
        topic_length: int = 10,
        fuzzy_ratio: int = 70,
        accepted_entities: List[str] = [
            'law',
            'location',
            'organization',
            'person',
            'event',
        ],
        cleaning=simple_textcleaning,
        colors: List[str] = None,
        stopwords=get_stopwords,
        max_df: float = 1.0,
        min_df: int = 1,
        ngram: Tuple[int, int] = (2, 3),
        figsize: Tuple[int, int] = (17, 9),
        batch_size: int = 20,
):
    """
    plot undirected graph for Entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
    vectorizer: class
    colors: list
        list of colors, length must be same as len(accepted_entities) + 1.
    threshold: float, (default=0.3)
        0.3 means, 30% above absolute pearson correlation.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        size of topic models.
    fuzzy_ratio: int, (default=70)
        size of ratio for fuzzywuzzy.
    max_df: float, (default=1.0)
        maximum document frequency for a word to be selected.
    min_df: int, (default=1)
        minimum document frequency for a word to be selected.
    ngram: tuple, (default=(2,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str] or List[str] or Tuple[str]

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """

    import inspect

    validator.validate_object_methods(vectorizer, ['vectorize', 'fit'],
                                      'vectorizer')
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if 'max_df' not in inspect.getfullargspec(topic_modeling_model)[0]:
        raise ValueError('topic_modeling_model must have `max_df` parameter')

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not (fuzzy_ratio > 0 and fuzzy_ratio <= 100):
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100')
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
        from fuzzywuzzy import fuzz

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx, fuzzywuzzy not installed. Please install it and try again.'
        )

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    corpus = [string for string in corpus if len(string) > 5]

    if not colors:
        colors = sns.color_palette(n_colors=len(accepted_entities) + 1)
    else:
        if len(colors) != (len(accepted_entities) + 1):
            raise ValueError('len of colors must same as %d' %
                             (len(accepted_entities) + 1))

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    topics = []
    for no, topic in enumerate(topic_model.comp.components_):
        for i in topic.argsort()[:-topic_length - 1:-1]:
            topics.append(topic_model.features[i])

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity])))

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    topics_corpus = []
    for topic in topics:
        nested_corpus = []
        for string in corpus:
            if (topic in string
                    or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio):
                nested_corpus.append(string)
        topics_corpus.append(' '.join(nested_corpus))

    corpus = topics_corpus

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                attentions.extend(text_clean[i:index])
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(color_dict[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(G,
                                               k=1.5 / np.sqrt(len(G.nodes())))
    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
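
The topic-to-sentence grouping step above (building `topics_corpus`) can be illustrated in isolation; the topics and sentences below are made up, and only `fuzz.token_set_ratio` from fuzzywuzzy is assumed, as in the function itself.

# Group sentences under topic/entity labels with fuzzy matching.
from fuzzywuzzy import fuzz

topics = ['harga minyak', 'pilihan raya']
sentences = [
    'harga minyak dijangka naik minggu depan',
    'keputusan pilihan raya diumumkan malam tadi',
    'rakyat bimbang kenaikan harga barang',
]
fuzzy_ratio = 70
topics_corpus = []
for topic in topics:
    matched = [s for s in sentences
               if topic in s or fuzz.token_set_ratio(topic, s) >= fuzzy_ratio]
    topics_corpus.append(' '.join(matched))
print(topics_corpus)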
Example 7
def cluster_graph(
    corpus: List[str],
    vectorizer,
    threshold: float = 0.9,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stopwords=get_stopwords,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    clustering=KMeans,
    figsize: Tuple[int, int] = (17, 9),
    with_labels: bool = True,
    batch_size: int = 20,
):
    """
    plot undirected graph with similar texts.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class.
    threshold: float, (default=0.9)
        0.9 means, 90% above absolute pearson correlation.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must be same as corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str] or List[str] or Tuple[str].
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    validator.validate_object_methods(vectorizer, ['vectorize', 'fit'],
                                      'vectorizer')
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be same with number of clusters')
    if not (threshold <= 1 and threshold > 0):
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib, seaborn, networkx not installed. Please install it and try again.'
        )

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values
    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):

            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(np.array(
                    transformed_text_clean[i])[0])[::-1]
                titles.append(' '.join(
                    [features[i] for i in indices[:ngram[1]]]))
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(' '.join(
                    [i[0] for i in attentions[i][-ngram[1]:]]))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)
    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)
    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(colors[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(G,
                                               k=1.5 / np.sqrt(len(G.nodes())))
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
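
A minimal sketch of how the document graph is built: vectorize, take the absolute pairwise correlation between document vectors, and connect documents whose correlation clears the threshold. The corpus and threshold are illustrative only.

# Correlation-threshold graph over TF-IDF document vectors.
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['harga minyak naik', 'minyak naik lagi',
        'pilihan raya umum', 'keputusan pilihan raya']
DxT = TfidfVectorizer().fit_transform(docs).toarray()
DxD = np.abs(pd.DataFrame(DxT.T).corr()).values  # document-to-document correlation

threshold = 0.5
G = nx.Graph()
for i in range(len(docs)):
    G.add_node(i, text=docs[i])
for i in range(len(docs)):
    for j in range(i + 1, len(docs)):
        if DxD[i, j] >= threshold:
            G.add_edge(i, j, weight=DxD[i, j])
print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges')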
Example 8
def cluster_dendogram(
        corpus: List[str],
        vectorizer,
        titles: List[str] = None,
        stopwords=get_stopwords,
        cleaning=simple_textcleaning,
        random_samples: float = 0.3,
        ngram: Tuple[int, int] = (1, 3),
        figsize: Tuple[int, int] = (17, 9),
        batch_size: int = 20,
):
    """
    plot hierarchical dendrogram with similar texts.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class.
    titles: List[str], (default=None)
        list of titles, length must be same as corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """

    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')

    validator.validate_object_methods(vectorizer, ['vectorize', 'fit'],
                                      'vectorizer')
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    corpus = random.sample(corpus, k=int(random_samples * len(corpus)))

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):

            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0])[::-1]
                titles.append(' '.join(
                    [features[i] for i in indices[:ngram[1]]]))
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(' '.join(
                    [i[0] for i in attentions[i][-ngram[1]:]]))
    plt.figure(figsize=figsize)
    ax = dendrogram(linkage_matrix, orientation='right', labels=titles)
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
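
A minimal, self-contained version of the clustering step, assuming TF-IDF vectors as input; note it passes the observation vectors straight to `scipy.cluster.hierarchy.ward`, whereas the function above passes a cosine-distance matrix.

# Hierarchical clustering: TF-IDF vectors -> Ward linkage -> dendrogram.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['harga minyak naik', 'minyak naik lagi',
        'pilihan raya umum', 'keputusan pilihan raya']
vectors = TfidfVectorizer().fit_transform(docs).toarray()
linkage_matrix = ward(vectors)  # Ward linkage on the observation vectors
plt.figure(figsize=(8, 4))
dendrogram(linkage_matrix, orientation='right', labels=docs)
plt.tight_layout()
plt.show()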
Example 9
def cluster_scatter(
        corpus: List[str],
        vectorizer,
        num_clusters: int = 5,
        titles: List[str] = None,
        colors: List[str] = None,
        stopwords=get_stopwords,
        cleaning=simple_textcleaning,
        clustering=KMeans,
        decomposition=MDS,
        ngram: Tuple[int, int] = (1, 3),
        figsize: Tuple[int, int] = (17, 9),
        batch_size: int = 20,
):
    """
    plot scatter plot on similar text clusters.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=None)
        list of titles, length must be same as corpus.
    colors: List[str], (default=None)
        list of colors, length must be same as num_clusters.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'X': X, 'Y': Y, 'labels': clusters, 'vector': transformed_text_clean, 'titles': titles}
    """

    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be same with number of clusters')

    validator.validate_object_methods(vectorizer, ['vectorize', 'fit'],
                                      'vectorizer')
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        sns.set()
    except BaseException:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(text_clean[i:index]))
            else:
                t = []
                for s in text_clean[i:index]:
                    t.append([(w, 1.0) for w in s.split()])
                attentions.extend(t)
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)
    km = clustering(n_clusters=num_clusters)
    dist = 1 - cosine_similarity(transformed_text_clean)
    km.fit(transformed_text_clean)
    clusters = km.labels_.tolist()
    if decomposition is MDS:
        decomposed = decomposition(n_components=2, dissimilarity='precomputed')
    else:
        decomposed = decomposition(n_components=2)
    pos = decomposed.fit_transform(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):

            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0])[::-1]
                titles.append(' '.join(
                    [features[i] for i in indices[:ngram[1]]]))
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(' '.join(
                    [i[0] for i in attentions[i][-ngram[1]:]]))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)
    X, Y = pos[:, 0], pos[:, 1]
    plt.figure(figsize=figsize)
    for i in np.unique(clusters):
        plt.scatter(
            X[clusters == i],
            Y[clusters == i],
            color=colors[i],
            label='cluster %d' % (i),
        )
    for i in range(len(X)):
        plt.text(X[i], Y[i], titles[i], size=8)
    plt.legend()
    plt.show()
    return {
        'X': X,
        'Y': Y,
        'labels': clusters,
        'vector': transformed_text_clean,
        'titles': titles,
    }
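
A stripped-down sketch of the scatter step: KMeans labels plus a 2-D MDS projection of the cosine-distance matrix. The corpus and cluster count are illustrative only.

# Cluster documents with KMeans, then project cosine distances to 2D with MDS.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

docs = ['harga minyak naik', 'minyak naik lagi', 'pilihan raya umum',
        'keputusan pilihan raya', 'harga barang naik']
vectors = TfidfVectorizer().fit_transform(docs).toarray()
clusters = KMeans(n_clusters=2, random_state=0, n_init=10).fit_predict(vectors)
dist = 1 - cosine_similarity(vectors)
pos = MDS(n_components=2, dissimilarity='precomputed', random_state=0).fit_transform(dist)

for label in np.unique(clusters):
    mask = clusters == label
    plt.scatter(pos[mask, 0], pos[mask, 1], label='cluster %d' % label)
plt.legend()
plt.show()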
Example 10
def lda2vec(
    corpus: List[str],
    n_topics: int,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    vectorizer: str = 'bow',
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 3,
    skip: int = 5,
    **kwargs,
):
    """
    Train a LDA2Vec model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        number of topics for the decomposition.
    max_df: float, (default=0.95)
        maximum document frequency for a word to be selected.
    min_df: int, (default=2)
        minimum document frequency for a word to be selected.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        number of training iterations.
    switch_loss: int, (default=3)
        baseline to switch from document based loss to document + word based loss.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.
    skip: int, (default=5)
        skip value if vectorizer = 'skip-gram'

    Returns
    -------
    result: malaya.topic_modelling.DEEP_TOPIC class
    """
    validator.validate_function(cleaning, 'cleaning')
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)

    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in  ['tfidf', 'bow', 'skip-gram']")

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    Vectorizer = vectorizer_mapping.get(vectorizer)
    if not Vectorizer:
        raise ValueError(
            'vectorizer is not supported, please check supported vectorizers from `malaya.topic_model.available_vectorizer()`'
        )
    tf_vectorizer = Vectorizer(
        ngram_range=ngram,
        min_df=min_df,
        max_df=max_df,
        stop_words=stopwords,
    )

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))
    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))
    dictionary = {
        i: no
        for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    reversed_dictionary = {
        no: i
        for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()
    doc_ids = np.arange(len(idx_text_clean))
    num_unique_documents = doc_ids.max()
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pair in pairs:
            temp_data = pair
            pivot_words.append(temp_data[0])
            target_words.append(temp_data[1])
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(pivot_words,
                                                 target_words,
                                                 doc_ids,
                                                 random_state=10)
    num_unique_documents = len(idx_text_clean)

    model = LDA2VEC(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(pivot_words,
                target_words,
                doc_ids,
                epoch,
                switch_loss=switch_loss)
    return DEEP_TOPIC(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
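
The `skipgrams` helper used above is imported elsewhere in the module; assuming it behaves like Keras's `skipgrams`, the construction of the (pivot, target, doc_id) training triples can be sketched with toy word-index documents:

# Build skip-gram training triples from word-index documents.
from tensorflow.keras.preprocessing.sequence import skipgrams

idx_text_clean = [[1, 4, 2, 7], [3, 2, 5]]  # toy documents, already as word indices
vocabulary_size = 8
window_size = 2

pivot_words, target_words, doc_ids = [], [], []
for doc_id, token_ids in enumerate(idx_text_clean):
    pairs, _ = skipgrams(token_ids,
                         vocabulary_size=vocabulary_size,
                         window_size=window_size,
                         shuffle=True,
                         negative_samples=0)
    for pivot, target in pairs:
        pivot_words.append(pivot)
        target_words.append(target)
        doc_ids.append(doc_id)
print(len(pivot_words), 'skip-gram pairs generated')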
Example 11
def rake(string: str,
         model=None,
         top_k: int = 5,
         auto_ngram: bool = True,
         ngram_method: str = 'bow',
         ngram: Tuple[int, int] = (1, 1),
         atleast: int = 1,
         stopwords=get_stopwords,
         **kwargs):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using N suitable ngram. Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. supported ngram generator:

        * ``'bow'`` - bag-of-word.
        * ``'skipgram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if model is not None:
        if not hasattr(model, 'attention'):
            raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if model:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v

    else:
        d = None

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stopwords=stopwords,
                      **kwargs)
    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores)

    sortedKeywords = sorted(keywordcandidates.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

    total = sum([i[1] for i in sortedKeywords])

    ranked_sentences = [(i[1] / total, i[0]) for i in sortedKeywords
                        if vocab[i[0]] >= atleast]
    return ranked_sentences[:top_k]
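
`rake_function.calculate_word_scores` is not shown here; a sketch of the classic RAKE scoring it is based on (word score = degree / frequency, phrase score = sum of word scores), ignoring the optional attention weighting:

# Classic RAKE scoring over candidate phrases.
from collections import defaultdict

phrase_list = ['harga minyak', 'pilihan raya umum', 'rakyat']
freq, degree = defaultdict(int), defaultdict(int)
for phrase in phrase_list:
    words = phrase.split()
    for word in words:
        freq[word] += 1
        degree[word] += len(words) - 1  # co-occurrences within the phrase

word_scores = {w: (degree[w] + freq[w]) / freq[w] for w in freq}
phrase_scores = {p: sum(word_scores[w] for w in p.split()) for p in phrase_list}
print(sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True))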
Example 12
def attention(string: str,
              model,
              top_k: int = 5,
              auto_ngram: bool = True,
              ngram_method: str = 'bow',
              ngram: Tuple[int, int] = (1, 1),
              atleast: int = 1,
              stopwords=get_stopwords,
              **kwargs):
    """
    Extract keywords using Attention mechanism.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has an `attention` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using N suitable ngram. Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. supported ngram generator:

        * ``'bow'`` - bag-of-word.
        * ``'skipgram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """

    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'attention'):
        raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stopwords=stopwords,
                      **kwargs)

    attention = model.attention([string])[0]
    d = defaultdict(float)
    for k, v in attention:
        d[k] += v

    scores = []
    for k in vocab.keys():
        scores.append(sum([d.get(w, 0) for w in k.split()]))

    total = sum(scores)

    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )
    return ranked_sentences[:top_k]
Example 13
def textrank(string: str,
             vectorizer,
             top_k: int = 5,
             auto_ngram: bool = True,
             ngram_method: str = 'bow',
             ngram: Tuple[int, int] = (1, 1),
             atleast: int = 1,
             stopwords=get_stopwords,
             **kwargs):
    """
    Extract keywords using Textrank algorithm.

    Parameters
    ----------
    string: str
    vectorizer: Object
        vectorizer that has a `fit_transform` or `vectorize` method.
    top_k: int, optional (default=5)
        return top-k results.
    auto_ngram: bool, optional (default=True)
        If True, will generate keyword candidates using N suitable ngram. Else use `ngram_method`.
    ngram_method: str, optional (default='bow')
        Only usable if `auto_ngram` is False. supported ngram generator:

        * ``'bow'`` - bag-of-word.
        * ``'skipgram'`` - bag-of-word with skip technique.
    ngram: tuple, optional (default=(1,1))
        n-grams size.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(vectorizer, 'fit_transform') and not hasattr(
            vectorizer, 'vectorize'):
        raise ValueError(
            'vectorizer must have `fit_transform` or `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if ngram_method not in methods:
        raise ValueError("ngram_method must be in ['bow', 'skip-gram']")
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string,
                      ngram_method=ngram_method,
                      ngram=ngram,
                      stopwords=stopwords,
                      **kwargs)

    if hasattr(vectorizer, 'fit_transform'):
        vectors = vectorizer.fit_transform(list(vocab.keys()))
    if hasattr(vectorizer, 'vectorize'):
        vectors = vectorizer.vectorize(list(vocab.keys()))
    similar = cosine_similarity(vectors, vectors)
    similar[similar >= 0.99999] = 0
    scores = pagerank(similar)
    total = sum(scores)
    ranked_sentences = sorted(
        [(scores[i] / total, s)
         for i, s in enumerate(vocab.keys()) if vocab[s] >= atleast],
        reverse=True,
    )

    return ranked_sentences[:top_k]
Example 14
def attention(
        corpus: List[str],
        n_topics: int,
        vectorizer,
        cleaning=simple_textcleaning,
        stopwords=get_stopwords,
        ngram: Tuple[int, int] = (1, 3),
        batch_size: int = 10,
):
    """
    Use attention from transformer model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    n_topics: int
        number of topics for the decomposition.
    vectorizer: object
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=10)
        size of strings for each vectorization and attention.

    Returns
    -------
    result: malaya.topic_modelling.AttentionTopic class
    """

    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(vectorizer, 'attention') or not hasattr(
            vectorizer, 'vectorize'):
        raise ValueError(
            'vectorizer must have `attention` and `vectorize` methods')
    validator.validate_function(cleaning, 'cleaning')

    if len(corpus) < n_topics:
        raise ValueError(
            'length corpus must be bigger than or equal to n_topics')

    from sklearn.cluster import KMeans

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])

    def generate_ngram(seq, ngram=(1, 3)):
        g = []
        for i in range(ngram[0], ngram[-1] + 1):
            g.extend(list(ngrams_generator(seq, i)))
        return g

    rows, attentions = [], []
    for i in range(0, len(corpus), batch_size):
        index = min(i + batch_size, len(corpus))
        rows.append(vectorizer.vectorize(corpus[i:index]))
        attentions.extend(vectorizer.attention(corpus[i:index]))

    concat = np.concatenate(rows, axis=0)
    kmeans = KMeans(n_clusters=n_topics, random_state=0).fit(concat)
    labels = kmeans.labels_

    overall, filtered_a = [], []
    for a in attentions:
        f = [i for i in a if i[0] not in stopwords]
        overall.extend(f)
        filtered_a.append(f)

    o_ngram = generate_ngram(overall, ngram)
    features = []
    for i in o_ngram:
        features.append(' '.join([w[0] for w in i]))
    features = list(set(features))

    components = np.zeros((n_topics, len(features)))
    for no, i in enumerate(labels):
        f = generate_ngram(filtered_a[no], ngram)
        for w in f:
            word = ' '.join([r[0] for r in w])
            score = np.mean([r[1] for r in w])
            if word in features:
                components[i, features.index(word)] += score

    return AttentionTopic(features, components)
Example 15
def lda2vec(
    corpus: List[str],
    vectorizer,
    n_topics: int = 10,
    cleaning=simple_textcleaning,
    stopwords=get_stopwords,
    window_size: int = 2,
    embedding_size: int = 128,
    epoch: int = 10,
    switch_loss: int = 1000,
    **kwargs,
):
    """
    Train a LDA2Vec model to do topic modelling based on corpus / list of strings given.

    Parameters
    ----------
    corpus: list
    vectorizer : object
        Should have `fit_transform` method. Commonly:

        * ``sklearn.feature_extraction.text.TfidfVectorizer`` - TFIDF algorithm.
        * ``sklearn.feature_extraction.text.CountVectorizer`` - Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramCountVectorizer`` - Skip Gram Bag-of-Word algorithm.
        * ``malaya.text.vectorizer.SkipGramTfidfVectorizer`` - Skip Gram TFIDF algorithm.
    n_topics: int, (default=10)
        size of decomposition column.
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    embedding_size: int, (default=128)
        embedding size of lda2vec tensors.
    epoch: int, (default=10)
        training iteration, how many loop need to train.
    switch_loss: int, (default=1000)
        baseline to switch from document based loss to document + word based loss.

    Returns
    -------
    result: malaya.topic_modelling.DeepTopic class
    """
    validator.validate_function(cleaning, 'cleaning')
    stopwords = validator.validate_stopwords(stopwords)
    stopwords = list(stopwords)

    tf_vectorizer = vectorizer

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stopwords]))
    tf_vectorizer.fit(text_clean)
    idx_text_clean, len_idx_text_clean = [], []
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    for text in transformed_text_clean:
        splitted = text.nonzero()[1]
        idx_text_clean.append(splitted)
        len_idx_text_clean.append(len(splitted))
    dictionary = {
        i: no
        for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    reversed_dictionary = {
        no: i
        for no, i in enumerate(tf_vectorizer.get_feature_names())
    }
    freqs = transformed_text_clean.toarray().sum(axis=0).tolist()
    doc_ids = np.arange(len(idx_text_clean))
    num_unique_documents = doc_ids.max()
    pivot_words, target_words, doc_ids = [], [], []
    for i, t in enumerate(idx_text_clean):
        pairs, _ = skipgrams(
            t,
            vocabulary_size=len(dictionary),
            window_size=window_size,
            shuffle=True,
            negative_samples=0,
        )
        for pair in pairs:
            temp_data = pair
            pivot_words.append(temp_data[0])
            target_words.append(temp_data[1])
            doc_ids.append(i)
    pivot_words, target_words, doc_ids = shuffle(pivot_words,
                                                 target_words,
                                                 doc_ids,
                                                 random_state=10)
    num_unique_documents = len(idx_text_clean)

    model = LDA2Vec(
        num_unique_documents,
        len(dictionary),
        n_topics,
        freqs,
        embedding_size=embedding_size,
        **kwargs,
    )
    model.train(pivot_words,
                target_words,
                doc_ids,
                epoch,
                switch_loss=switch_loss)
    return DeepTopic(
        model,
        dictionary,
        reversed_dictionary,
        freqs,
        len_idx_text_clean,
        text_clean,
    )
Example 16
def similarity_transformer(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    use_maxsum: bool = False,
    use_mmr: bool = False,
    diversity: float = 0.5,
    nr_candidates: int = 20,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.
    https://github.com/MaartenGr/KeyBERT/blob/master/keybert/model.py

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model that has a `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngrams automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    use_maxsum: bool, optional (default=False) 
        Whether to use Max Sum Similarity.
    use_mmr: bool, optional (default=False) 
        Whether to use MMR.
    diversity: float, optional (default=0.5)
        The diversity of results between 0 and 1 if use_mmr is True.
    nr_candidates: int, optional (default=20) 
        The number of candidates to consider if use_maxsum is set to True.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if nr_candidates < top_k:
        raise ValueError('nr_candidates must be bigger than or equal to top_k')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    if use_mmr:
        # https://github.com/MaartenGr/KeyBERT/blob/master/keybert/mmr.py

        word_doc_similarity = cosine_similarity(vectors_keywords,
                                                vectors_string)
        word_similarity = cosine_similarity(vectors_keywords)
        keywords_idx = [np.argmax(word_doc_similarity)]
        candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
        for _ in range(top_k - 1):
            candidate_similarities = word_doc_similarity[candidates_idx, :]
            target_similarities = np.max(
                word_similarity[candidates_idx][:, keywords_idx], axis=1)

            mmr = (
                1 - diversity
            ) * candidate_similarities - diversity * target_similarities.reshape(
                -1, 1)
            mmr_idx = candidates_idx[np.argmax(mmr)]

            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)
        ranked_sentences = [(word_doc_similarity.reshape(1, -1)[0][idx],
                             words[idx]) for idx in keywords_idx]

    elif use_maxsum:
        # https://github.com/MaartenGr/KeyBERT/blob/master/keybert/maxsum.py

        distances = cosine_similarity(vectors_string, vectors_keywords)
        distances_words = cosine_similarity(vectors_keywords, vectors_keywords)
        words_idx = list(distances.argsort()[0][-nr_candidates:])
        words_vals = [words[index] for index in words_idx]
        candidates = distances_words[np.ix_(words_idx, words_idx)]
        min_sim = 100_000
        candidate = None
        for combination in itertools.combinations(range(len(words_idx)),
                                                  top_k):
            sim = sum([
                candidates[i][j] for i in combination for j in combination
                if i != j
            ])
            if sim < min_sim:
                candidate = combination
                min_sim = sim

        ranked_sentences = [(distances[0][idx], words_vals[idx])
                            for idx in candidate]

    else:
        distances = cosine_similarity(vectors_string, vectors_keywords)
        ranked_sentences = [(distances[0][index], words[index])
                            for index in distances.argsort()[0]][::-1]

    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]
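
The MMR branch above can be exercised on its own; a self-contained sketch with random placeholder embeddings standing in for the output of `model.vectorize`:

# Maximal Marginal Relevance: trade off relevance to the document against
# similarity to keywords already selected.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(0)
words = ['harga minyak', 'minyak', 'pilihan raya', 'kerajaan', 'bola sepak']
vectors_keywords = rng.normal(size=(len(words), 16))  # stand-in for model.vectorize(words)
vectors_string = rng.normal(size=(1, 16))             # stand-in for model.vectorize([string])
top_k, diversity = 3, 0.5

word_doc_similarity = cosine_similarity(vectors_keywords, vectors_string)
word_similarity = cosine_similarity(vectors_keywords)

keywords_idx = [int(np.argmax(word_doc_similarity))]
candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
for _ in range(top_k - 1):
    candidate_similarities = word_doc_similarity[candidates_idx, :]
    target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
    mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
    mmr_idx = candidates_idx[int(np.argmax(mmr))]
    keywords_idx.append(mmr_idx)
    candidates_idx.remove(mmr_idx)

print([words[idx] for idx in keywords_idx])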
Example 17
def rake(
    string: str,
    model=None,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Rake algorithm.

    Parameters
    ----------
    string: str
    model: Object, optional (default=None)
        Transformer model or any model has `attention` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
        For automatic Ngram generator.

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if model is not None:
        if not hasattr(model, 'attention'):
            raise ValueError('model must have `attention` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    if model:
        string = transformer_textcleaning(string)
        attention = model.attention([string])[0]
        d = defaultdict(float)
        for k, v in attention:
            d[k] += v

    else:
        d = None

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)
    phrase_list = list(vocab.keys())
    scores = rake_function.calculate_word_scores(phrase_list, attentions=d)
    keywordcandidates = rake_function.generate_candidate_keyword_scores(
        phrase_list, scores)

    sortedKeywords = sorted(keywordcandidates.items(),
                            key=operator.itemgetter(1),
                            reverse=True)

    total = sum([i[1] for i in sortedKeywords])

    ranked_sentences = [(i[1] / total, i[0]) for i in sortedKeywords
                        if vocab[i[0]] >= atleast]
    return ranked_sentences[:top_k]
Example 18
def similarity(
    string: str,
    model,
    vectorizer=None,
    top_k: int = 5,
    atleast: int = 1,
    stopwords=get_stopwords,
    **kwargs,
):
    """
    Extract keywords using Sentence embedding VS keyword embedding similarity.

    Parameters
    ----------
    string: str
    model: Object
        Transformer model or any model has `vectorize` method.
    vectorizer: Object, optional (default=None)
        Prefer `sklearn.feature_extraction.text.CountVectorizer` or,
        `malaya.text.vectorizer.SkipGramCountVectorizer`.
        If None, will generate ngram automatically based on `stopwords`.
    top_k: int, optional (default=5)
        return top-k results.
    atleast: int, optional (default=1)
        at least count appeared in the string to accept as candidate.
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]

    Returns
    -------
    result: List[Tuple[float, str]]
    """
    stopwords = validator.validate_stopwords(stopwords)

    if not hasattr(model, 'vectorize'):
        raise ValueError('model must have `vectorize` method')
    if top_k < 1:
        raise ValueError('top_k must be bigger than 0')
    if atleast < 1:
        raise ValueError('atleast must be bigger than 0')
    if not vectorizer:
        auto_ngram = True
    else:
        auto_ngram = False
        if not hasattr(vectorizer, 'fit'):
            raise ValueError('vectorizer must have `fit` method')
    if auto_ngram and not len(stopwords):
        raise ValueError('insert stopwords if auto_ngram')

    string = transformer_textcleaning(string)

    if auto_ngram:
        vocab = _auto_ngram(string, stopwords)
    else:
        vocab = _base(string, vectorizer=vectorizer, **kwargs)

    words = list(vocab.keys())
    vectors_keywords = model.vectorize(words)
    vectors_string = model.vectorize([string])

    distances = cosine_similarity(vectors_string, vectors_keywords)
    ranked_sentences = [(distances[0][index], words[index])
                        for index in distances.argsort()[0]][::-1]

    ranked_sentences = [i for i in ranked_sentences if vocab[i[1]] >= atleast]
    return ranked_sentences[:top_k]
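
A toy version of the final ranking step, with TF-IDF vectors standing in for the transformer's `vectorize` output; the document and candidate phrases are illustrative only.

# Rank candidates by cosine similarity between a document vector and candidate vectors.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

string = 'harga minyak dijangka naik dan rakyat bimbang kenaikan harga barang'
words = ['harga minyak', 'rakyat', 'kenaikan harga barang', 'bola sepak']
tfidf = TfidfVectorizer().fit([string] + words)
vectors_string = tfidf.transform([string])
vectors_keywords = tfidf.transform(words)

distances = cosine_similarity(vectors_string, vectors_keywords)
ranked = [(distances[0][i], words[i]) for i in distances.argsort()[0]][::-1]
print(ranked)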