Example #1
    def draw_graph(self, colors=None):
        pos = fruchterman_reingold_layout(self.graph)
        node_sizes = [
            100000 * x * x + 50 for x in list(self.betweenness.values())
        ]

        colors = colors if colors else self.colors

        nx.draw_networkx_nodes(self.graph,
                               pos,
                               node_size=node_sizes,
                               node_color=colors,
                               alpha=0.5,
                               linewidths=0)  # `line_color` is not a valid kwarg; linewidths=0 hides node outlines
        nx.draw_networkx_edges(self.graph, pos, alpha=0.05, style='solid')
        nx.draw_networkx_labels(self.graph,
                                pos,
                                {node: node for node in self.nodes},
                                font_size=8)
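
A minimal, self-contained sketch of the same drawing pattern, assuming only networkx and matplotlib are installed; `self.graph`, `self.betweenness`, and `self.colors` come from the surrounding class (not shown), so a toy graph stands in for them here:

import networkx as nx
import matplotlib.pyplot as plt

# Toy graph plus betweenness centrality per node, standing in for
# the class attributes used above.
graph = nx.karate_club_graph()
betweenness = nx.betweenness_centrality(graph)

# Quadratic scaling makes high-betweenness hubs visually dominant.
node_sizes = [100000 * x * x + 50 for x in betweenness.values()]

pos = nx.fruchterman_reingold_layout(graph)
nx.draw_networkx_nodes(graph, pos, node_size=node_sizes, alpha=0.5)
nx.draw_networkx_edges(graph, pos, alpha=0.05, style='solid')
nx.draw_networkx_labels(graph, pos, font_size=8)
plt.show()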
Example #2
def cluster_entity_linking(
        corpus: List[str],
        vectorizer,
        entity_model,
        topic_modeling_model,
        threshold: float = 0.3,
        topic_decomposition: int = 2,
        topic_length: int = 10,
        fuzzy_ratio: int = 70,
        accepted_entities: List[str] = [
            'law',
            'location',
            'organization',
            'person',
            'event',
        ],
        cleaning=simple_textcleaning,
        stemming=sastrawi,
        colors: List[str] = None,
        stop_words: List[str] = STOPWORDS,
        max_df: float = 1.0,
        min_df: int = 1,
        ngram: Tuple[int, int] = (2, 3),
        figsize: Tuple[int, int] = (17, 9),
        batch_size: int = 20,
):
    """
    plot undirected graph for Entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
    vectorizer: class
    titles: list
        list of titles, length must same with corpus.
    colors: list
        list of colors, length must same with num_clusters.
    threshold: float, (default=0.3)
        0.3 means, 30% above absolute pearson correlation.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        size of topic models.
    fuzzy_ratio: int, (default=70)
        size of ratio for fuzzywuzzy.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will apply.
    max_df: float, (default=0.95)
        maximum of a word selected based on document frequency.
    min_df: int, (default=2)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """

    import inspect

    if not isinstance(stemming, Callable) and stemming is not None:
        raise ValueError('stemming must be a callable type or None')
    if not isinstance(cleaning, Callable) and cleaning is not None:
        raise ValueError('cleaning must be a callable type or None')

    if not hasattr(vectorizer, 'vectorize') and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have a `fit` or a `vectorize` method')
    # getargspec was removed in Python 3.11; getfullargspec is the modern form.
    if 'max_df' not in inspect.getfullargspec(topic_modeling_model)[0]:
        raise ValueError('topic_modeling_model must have a `max_df` parameter')

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not 0 < max_df <= 1:
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not 0 < fuzzy_ratio <= 100:
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100')
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not 0 < threshold <= 1:
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd
        from fuzzywuzzy import fuzz

        sns.set()
    except ImportError:
        raise Exception(
            'matplotlib, seaborn, networkx, pandas, fuzzywuzzy not installed. Please install them and try again.'
        )

    if isinstance(corpus, str):
        corpus = split_into_sentences(corpus)
    else:
        corpus = '. '.join(corpus)
        corpus = split_into_sentences(corpus)

    corpus = [string for string in corpus if len(string) > 5]

    if not colors:
        colors = sns.color_palette(n_colors=len(accepted_entities) + 1)
    else:
        if len(colors) != (len(accepted_entities) + 1):
            raise ValueError('length of colors must be %d' %
                             (len(accepted_entities) + 1))

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    topics = []
    for no, topic in enumerate(topic_model.comp.components_):
        for i in topic.argsort()[:-topic_length - 1:-1]:
            topics.append(topic_model.features[i])

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity])))

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    topics_corpus = []
    for topic in topics:
        nested_corpus = []
        for string in corpus:
            if (topic in string
                    or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio):
                nested_corpus.append(string)
        topics_corpus.append(' '.join(nested_corpus))

    corpus = topics_corpus

    if cleaning:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = stemming(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stop_words]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            attentions.extend(vectorizer.attention(text_clean[i:index]))
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)

    node_colors, node_labels = [], {}
    for node in G:
        # networkx 2.4 removed `G.node`; `G.nodes` is the supported accessor.
        node_colors.append(color_dict[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(G,
                                               k=1.5 / np.sqrt(len(G.nodes())))
    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
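
The graph construction at the heart of this function is independent of the NLP machinery: documents become nodes, and an edge is added wherever the absolute Pearson correlation between two document vectors clears `threshold`. A standalone sketch of just that step, with random vectors standing in for the vectorized corpus:

import numpy as np
import pandas as pd
import networkx as nx

DxT = np.random.rand(6, 20)  # 6 documents, 20 features

# pandas corr() works column-wise, so transpose to correlate documents.
DxD = np.abs(pd.DataFrame(DxT.T).corr()).values

threshold = 0.3
G = nx.Graph()
G.add_nodes_from(range(DxT.shape[0]))
for i in range(len(DxD)):
    for j in range(i + 1, len(DxD)):
        if DxD[i, j] >= threshold:
            G.add_edge(i, j, weight=DxD[i, j])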
Example #3
def cluster_graph(
    corpus: List[str],
    vectorizer,
    threshold: float = 0.9,
    num_clusters: int = 5,
    titles: List[str] = None,
    colors: List[str] = None,
    stop_words: List[str] = STOPWORDS,
    stemming=sastrawi,
    ngram: Tuple[int, int] = (1, 3),
    cleaning=simple_textcleaning,
    clustering=KMeans,
    figsize: Tuple[int, int] = (17, 9),
    with_labels: bool = True,
    batch_size: int = 20,
):
    """
    plot undirected graph with similar texts.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class.
    threshold: float, (default=0.9)
        0.9 means, 90% above absolute pearson correlation.
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    titles: List[str], (default=True)
        list of titles, length must same with corpus.
    stemming: function, (default=malaya.stem.sastrawi)
        function to stem the corpus.
    stop_words: List[str], (default=malaya.texts.function.STOPWORDS)
        list of stop words to remove.
    cleaning: function, (default=malaya.texts.function.simple_textcleaning)
        function to clean the corpus.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'G': G, 'pos': pos, 'node_colors': node_colors, 'node_labels': node_labels}
    """
    if not isinstance(stemming, Callable) and stemming is not None:
        raise ValueError('stemming must be a callable type or None')
    if not isinstance(cleaning, Callable) and cleaning is not None:
        raise ValueError('cleaning must be a callable type or None')

    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be same with number of clusters')
    if not hasattr(vectorizer, 'vectorize') and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have a `fit` or a `vectorize` method')
    if not 0 < threshold <= 1:
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout
        import pandas as pd

        sns.set()
    except ImportError:
        raise Exception(
            'matplotlib, seaborn, networkx, pandas not installed. Please install them and try again.'
        )

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = stemming(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stop_words]))

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean).todense()
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index]))
            attentions.extend(vectorizer.attention(text_clean[i:index]))
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    DxT = transformed_text_clean
    DxD = np.abs(pd.DataFrame(DxT.T).corr()).values
    km = clustering(n_clusters=num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()

    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):

            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(np.array(
                    transformed_text_clean[i])[0])[::-1]
                titles.append(' '.join(
                    [features[i] for i in indices[:ngram[1]]]))
            else:
                attentions[i].sort(key=lambda x: x[1])
                titles.append(' '.join(
                    [i[0] for i in attentions[i][-ngram[1]:]]))

    if not colors:
        colors = sns.color_palette(n_colors=num_clusters)
    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=titles[i], label=clusters[i])

    len_dense = len(DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if DxD[i, j] >= threshold:
                weight = DxD[i, j]
                G.add_edge(i, j, weight=weight)
    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(colors[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(G,
                                               k=1.5 / np.sqrt(len(G.nodes())))
    plt.figure(figsize=figsize)
    if with_labels:
        nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels)
    else:
        nx.draw(G, node_color=node_colors, pos=pos)

    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
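
The coloring scheme here comes from unsupervised clustering: KMeans labels over the document vectors index into a seaborn palette. A small sketch of that core, assuming scikit-learn, seaborn, and networkx are installed (the toy corpus is illustrative only):

import networkx as nx
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

texts = [
    'the cat sat on the mat', 'a cat and a dog', 'dogs chase cats',
    'stock markets fell today', 'markets rallied on strong earnings',
]
DxT = TfidfVectorizer().fit_transform(texts).toarray()

# Cluster labels drive the node palette, as in cluster_graph above.
km = KMeans(n_clusters=2, n_init=10).fit(DxT)
colors = sns.color_palette(n_colors=2)

G = nx.Graph()
for i, label in enumerate(km.labels_):
    G.add_node(i, text=texts[i], label=int(label))
node_colors = [colors[G.nodes[n]['label']] for n in G]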
Example #4
def cluster_graph(
    corpus,
    titles = None,
    colors = None,
    threshold = 0.3,
    stemming = True,
    max_df = 0.95,
    min_df = 2,
    ngram = (1, 3),
    cleaning = simple_textcleaning,
    vectorizer = 'bow',
    stop_words = STOPWORDS,
    num_clusters = 5,
    clustering = KMeans,
    figsize = (17, 9),
    with_labels = True,
):
    """
    plot undirected graph with similar texts

    corpus: list
    titles: list
        list of titles, length must same with corpus
    colors: list
        list of colors, length must same with num_clusters
    num_clusters: int, (default=5)
        size of unsupervised clusters.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will apply
    max_df: float, (default=0.95)
        maximum of a word selected based on document frequency.
    min_df: int, (default=2)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word
        * ``'tfidf'`` - Term frequency inverse Document Frequency
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams
    """
    if not isinstance(corpus, list):
        raise ValueError('corpus must be a list')
    if not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if not isinstance(titles, list) and titles is not None:
        raise ValueError('titles must be a list or None')
    if not isinstance(colors, list) and colors is not None:
        raise ValueError('colors must be a list or None')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if colors:
        if len(colors) != num_clusters:
            raise ValueError(
                'size of colors must be same with number of clusters'
            )
    if not isinstance(vectorizer, str):
        raise ValueError('vectorizer must be a string')
    if not isinstance(stemming, bool):
        raise ValueError('stemming must be a boolean')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError("vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if not isinstance(ngram, tuple):
        raise ValueError('ngram must be a tuple')
    if len(ngram) != 2:
        raise ValueError('ngram size must equal to 2')
    if not isinstance(min_df, int):
        raise ValueError('min_df must be an integer')
    if not isinstance(max_df, (int, float)):
        raise ValueError('max_df must be an integer or a float')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not 0 < max_df < 1:
        raise ValueError('max_df must be bigger than 0, less than 1')
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout

        sns.set()
    except ImportError:
        raise Exception(
            'matplotlib, seaborn, networkx not installed. Please install them and try again.'
        )

    tf_vectorizer = Vectorizer(
        ngram_range = ngram,
        min_df = min_df,
        max_df = max_df,
        stop_words = stop_words,
    )
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = sastrawi(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stop_words])
        )
    tf_vectorizer.fit(text_clean)
    DxT = tf_vectorizer.transform(text_clean)
    DxD = np.dot(DxT, DxT.T)
    km = clustering(n_clusters = num_clusters)
    km.fit(DxT)
    clusters = km.labels_.tolist()
    features = tf_vectorizer.get_feature_names()
    if not titles:
        titles = []
        for i in range(DxT.shape[0]):
            indices = np.argsort(np.array(DxT[i].todense())[0])[::-1]
            titles.append(' '.join([features[i] for i in indices[: ngram[1]]]))
    if not colors:
        colors = sns.color_palette(n_colors = num_clusters)
    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text = titles[i], label = clusters[i])
    dense_DxD = DxD.toarray()
    len_dense = len(dense_DxD)
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            if dense_DxD[i, j] >= threshold:
                weight = dense_DxD[i, j]
                G.add_edge(i, j, weight = weight)
    for node, degree in list(dict(G.degree()).items()):
        if degree == 0:
            G.remove_node(node)
    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(colors[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(
        G, k = 1.5 / np.sqrt(len(G.nodes()))
    )
    plt.figure(figsize = figsize)
    if with_labels:
        nx.draw(G, node_color = node_colors, pos = pos, labels = node_labels)
    else:
        nx.draw(G, node_color = node_colors, pos = pos)
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
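
One detail worth noting in this variant: when vectorizer='tfidf', scikit-learn's TfidfVectorizer L2-normalizes rows by default (norm='l2'), so the product DxT · DxTᵀ is a document-by-document cosine-similarity matrix, which is why a raw dot product works as an edge weight (with 'bow' counts the same product is an unnormalized overlap score). A quick check of that property:

from sklearn.feature_extraction.text import TfidfVectorizer

texts = ['new york city', 'york city centre', 'deep learning models']
DxT = TfidfVectorizer().fit_transform(texts)  # rows have unit L2 norm

# Entry (i, j) is the cosine similarity of documents i and j;
# the diagonal is 1.0 because each row has unit norm.
DxD = (DxT @ DxT.T).toarray()
print(DxD.round(2))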
Example #5
def filterP(links):
    outputs = run_parallel_in_threads(getP, links)
    overall_emotion, overall_sentiment, overall_msg = [], [], []
    overall_subj, overall_pol, overall_irony, overall_bias = [], [], [], []
    overall_local_entities_nouns = []
    persons, orgs, gpes = [], [], []
    df_texts, df_sentiments = [], []
    for i in range(len(outputs)):
        local_entities_nouns, local_persons, local_orgs, local_gpes = [], [], [], []
        for sentence in outputs[i]['p']:
            for token in nlp(sentence):
                if token.ent_type_ == 'PERSON':
                    local_persons.append(str(token))
                if token.ent_type_ == 'ORG':
                    local_orgs.append(str(token))
                if token.ent_type_ == 'GPE':
                    local_gpes.append(str(token))
                if ((token.ent_type_ or token.tag_ in ['NNP', 'NN'])
                        and str(token).lower() not in english_stopwords):
                    local_entities_nouns.append(str(token))
        sentiments = getsentiment(outputs[i]['p-classifier'])
        df_sentiments += np.argmax(sentiments, axis=1).tolist()
        df_texts += outputs[i]['p-classifier']
        emotions = getemotion(outputs[i]['p-classifier'])
        msgs = getmsg(outputs[i]['p-classifier'])
        subjectivities, polarities, ironies, biases = getpolar(
            outputs[i]['p-classifier'])
        overall_local_entities_nouns += local_entities_nouns
        persons += local_persons
        orgs += local_orgs
        gpes += local_gpes
        local_entities_nouns_unique, local_entities_nouns_count = np.unique(
            local_entities_nouns, return_counts=True)
        sorted_val = local_entities_nouns_unique[np.argsort(
            local_entities_nouns_count)[::-1]].tolist()
        outputs[i]['tokens'] = sorted_val[:15]
        outputs[i]['sentiment'] = sentiments.tolist()
        outputs[i]['emotion'] = emotions.tolist()
        outputs[i]['msg'] = msgs.tolist()
        outputs[i]['subjectivity'] = subjectivities.tolist()
        outputs[i]['polarity'] = polarities.tolist()
        outputs[i]['irony'] = ironies.tolist()
        outputs[i]['bias'] = biases.tolist()
        outputs[i]['person'] = list(set(local_persons))
        outputs[i]['org'] = list(set(local_orgs))
        outputs[i]['gpes'] = list(set(local_gpes))
        avg_sentiment = sentiments.mean(axis=0)
        avg_emotion = emotions.mean(axis=0)
        avg_msg = msgs.mean(axis=0)
        avg_subjectivity = subjectivities.mean()
        avg_polarity = polarities.mean()
        avg_irony = ironies.mean()
        avg_bias = biases.mean()
        overall_emotion.append(avg_emotion)
        overall_sentiment.append(avg_sentiment)
        overall_msg.append(avg_msg)
        overall_subj.append(avg_subjectivity)
        overall_pol.append(avg_polarity)
        overall_irony.append(avg_irony)
        overall_bias.append(avg_bias)
        outputs[i]['avg_sentiment'] = avg_sentiment.tolist()
        outputs[i]['avg_emotion'] = avg_emotion.tolist()
        outputs[i]['avg_msg'] = avg_msg.tolist()
        outputs[i]['avg_subjectivity'] = avg_subjectivity.tolist()
        outputs[i]['avg_polarity'] = avg_polarity.tolist()
        outputs[i]['avg_irony'] = avg_irony.tolist()
        outputs[i]['avg_bias'] = avg_bias.tolist()

    # graph pipeline
    df = pd.DataFrame({'text': df_texts, 'sentiment': df_sentiments})
    df['id'] = df.index
    tfidf = TfidfVectorizer(stop_words='english', norm='l2')
    DxT = tfidf.fit_transform(df['text'])
    DxD = np.dot(DxT, DxT.T)
    G = nx.Graph()
    for i in range(df.shape[0]):
        idx = df.at[i, 'id']
        text = df.at[i, 'text']
        sentiment = df.at[i, 'sentiment']
        G.add_node(idx, text=text, sentiment=sentiment)
    dense_DxD = DxD.toarray()
    len_dense = len(dense_DxD)
    cutoff = 0
    for i in range(len_dense):
        for j in range(i + 1, len_dense):
            if dense_DxD[i, j] >= cutoff:
                weight = dense_DxD[i, j]
                G.add_edge(df.at[i, 'id'], df.at[j, 'id'], weight=weight)
    for node, degree in list(dict(G.degree()).items()):
        if degree == 0:
            G.remove_node(node)
    pos = nxlayout.fruchterman_reingold_layout(G,
                                               k=1.5 / np.sqrt(len(G.nodes())))
    edge_data = []
    colors = {0: '1', 1: '2'}
    for u, v, w in G.edges(data=True):
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        w = w['weight']
        edge_data.append(
            go.Scatter(x=[x0, x1, None],
                       y=[y0, y1, None],
                       line=go.Line(width=3.0 * w, color='#888'),
                       hoverinfo='none',
                       mode='lines'))
    node_data = go.Scatter(x=[],
                           y=[],
                           text=[],
                           mode='markers',
                           hoverinfo='text',
                           marker=go.Marker(
                               showscale=True,
                               reversescale=True,
                               color=[],
                               size=5.0,
                               colorbar=dict(thickness=15,
                                             xanchor='left',
                                             tickmode='array',
                                             tickvals=[1, 2],
                                             ticktext=['negative', 'positive'],
                                             ticks='outside'),
                               line=dict(width=0.5)))
    for u, w in G.nodes(data=True):
        x, y = pos[u]
        color = colors[w['sentiment']]
        text = w['text']
        node_data['x'].append(x)
        node_data['y'].append(y)
        node_data['text'].append(text)
        node_data['marker']['color'].append(color)
    # end graph pipeline

    overall_unique, overall_count = np.unique(overall_local_entities_nouns,
                                              return_counts=True)
    overall_unique = overall_unique[np.argsort(overall_count)
                                    [::-1]][:200].tolist()
    overall_count = overall_count[np.argsort(overall_count)
                                  [::-1]][:200].tolist()
    return {
        'overall_sentiment': np.array(overall_sentiment).mean(axis=0).tolist(),
        'overall_emotion': np.array(overall_emotion).mean(axis=0).tolist(),
        'overall_msg': np.array(overall_msg).mean(axis=0).tolist(),
        'overall_subjectivity': np.array(overall_subj).mean().tolist(),
        'overall_polarity': np.array(overall_pol).mean().tolist(),
        'overall_irony': np.array(overall_irony).mean().tolist(),
        'overall_bias': np.array(overall_bias).mean().tolist(),
        'person': list(set(persons)),
        'org': list(set(orgs)),
        'gpe': list(set(gpes)),
        'outputs': outputs,
        'wordcloud': list(zip(overall_unique, overall_count)),
        'sentiment-network': edge_data + [node_data]
    }
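
A caution on the Plotly portion: go.Line and go.Marker were removed in later Plotly releases, and appending to trace fields (node_data['x'].append(...)) fails there because trace attributes are stored as tuples. A sketch of the equivalent node trace on a recent Plotly, collecting plain lists first; a toy graph stands in for the G and pos built above:

import networkx as nx
import plotly.graph_objects as go

# Toy stand-ins for the G/pos built in the listing above.
G = nx.path_graph(3)
for n in G.nodes:
    G.nodes[n]['text'] = 'doc %d' % n
    G.nodes[n]['sentiment'] = n % 2
pos = nx.fruchterman_reingold_layout(G)

# Gather coordinates up front instead of mutating the trace in place.
xs, ys, texts, sentiments = [], [], [], []
for u, attrs in G.nodes(data=True):
    x, y = pos[u]
    xs.append(x)
    ys.append(y)
    texts.append(attrs['text'])
    sentiments.append(attrs['sentiment'])

node_data = go.Scatter(
    x=xs, y=ys, text=texts, mode='markers', hoverinfo='text',
    marker=dict(showscale=True, reversescale=True, color=sentiments,
                size=5.0, line=dict(width=0.5)),
)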
Example #6
def cluster_entity_linking(corpus,
                           entity_model,
                           topic_modeling_model,
                           topic_decomposition=2,
                           topic_length=10,
                           threshold=0.3,
                           fuzzy_ratio=70,
                           accepted_entities=[
                               'law', 'location', 'organization', 'person',
                               'event'
                           ],
                           colors=None,
                           max_df=1.0,
                           min_df=1,
                           ngram=(2, 3),
                           stemming=True,
                           cleaning=simple_textcleaning,
                           vectorizer='bow',
                           stop_words=STOPWORDS,
                           figsize=(17, 9),
                           **kwargs):
    """
    plot undirected graph for Entities and topics relationship.

    Parameters
    ----------
    corpus: list or str
    titles: list
        list of titles, length must same with corpus.
    colors: list
        list of colors, length must same with num_clusters.
    threshold: float, (default=0.3)
        threshold to assume similarity for covariance matrix.
    topic_decomposition: int, (default=2)
        size of decomposition.
    topic_length: int, (default=10)
        size of topic models.
    fuzzy_ratio: int, (default=70)
        size of ratio for fuzzywuzzy.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will apply.
    max_df: float, (default=0.95)
        maximum of a word selected based on document frequency.
    min_df: int, (default=2)
        minimum of a word selected on based on document frequency.
    ngram: tuple, (default=(1,3))
        n-grams size to train a corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.

    Returns
    -------
    dictionary: {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
    """
    if not isinstance(corpus, list) and not isinstance(corpus, str):
        raise ValueError('corpus must be a list')
    if isinstance(corpus, list):
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
    if not isinstance(colors, list) and colors is not None:
        raise ValueError('colors must be a list or None')
    if not isinstance(vectorizer, str):
        raise ValueError('vectorizer must be a string')
    if not isinstance(stemming, bool):
        raise ValueError('stemming must be a boolean')
    vectorizer = vectorizer.lower()
    if vectorizer not in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if not isinstance(ngram, tuple):
        raise ValueError('ngram must be a tuple')
    if len(ngram) != 2:
        raise ValueError('ngram size must equal to 2')
    if not isinstance(min_df, int):
        raise ValueError('min_df must be an integer')
    if not isinstance(topic_decomposition, int):
        raise ValueError('topic_decomposition must be an integer')
    if not isinstance(topic_length, int):
        raise ValueError('topic_length must be an integer')
    if not isinstance(fuzzy_ratio, int):
        raise ValueError('fuzzy_ratio must be an integer')
    if not isinstance(max_df, float):
        raise ValueError('max_df must be a float')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not 0 < max_df <= 1:
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not 0 < fuzzy_ratio <= 100:
        raise ValueError(
            'fuzzy_ratio must be bigger than 0, less than or equal to 100')
    if not isinstance(threshold, float):
        raise ValueError('threshold must be a float')
    if not 0 < threshold <= 1:
        raise ValueError(
            'threshold must be bigger than 0, less than or equal to 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        import networkx as nx
        import networkx.drawing.layout as nxlayout

        sns.set()
    except ImportError:
        raise Exception(
            'matplotlib, seaborn, networkx not installed. Please install them and try again.'
        )

    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    else:
        Vectorizer = SkipGramVectorizer

    if isinstance(corpus, str):
        corpus = corpus.replace('\n', '.')
        corpus = split_by_dot(corpus)
    else:
        corpus = [c + '.' for c in corpus]
        corpus = ' '.join(corpus)
        corpus = re.findall('(?=\S)[^.\n]+(?<=\S)', corpus)
    corpus = [string for string in corpus if len(string) > 5]

    if not colors:
        colors = sns.color_palette(n_colors=len(accepted_entities) + 1)
    else:
        if len(colors) != (len(accepted_entities) + 1):
            raise ValueError('length of colors must be %d' %
                             (len(accepted_entities) + 1))

    topic_model = topic_modeling_model(
        corpus,
        topic_decomposition,
        stemming=stemming,
        vectorizer=vectorizer,
        ngram=ngram,
        max_df=max_df,
        min_df=min_df,
    )
    topics = []
    for no, topic in enumerate(topic_model.comp.components_):
        for i in topic.argsort()[:-topic_length - 1:-1]:
            topics.append(topic_model.features[i])

    entities_cluster = {entity: [] for entity in accepted_entities}
    for string in corpus:
        entities_clustered = cluster_entities(entity_model.predict(string))
        for entity in accepted_entities:
            entities_cluster[entity].extend(entities_clustered[entity])
    for entity in accepted_entities:
        entities_cluster[entity] = cluster_words(
            list(set(entities_cluster[entity])))

    topics = cluster_words(list(set(topics)))
    color_dict = {topic: colors[-1] for topic in topics}
    for no, entity in enumerate(accepted_entities):
        for e in entities_cluster[entity]:
            topics.append(e)
            color_dict[e] = colors[no]

    topics_corpus = []
    for topic in topics:
        nested_corpus = []
        for string in corpus:
            if (topic in string
                    or fuzz.token_set_ratio(topic, string) >= fuzzy_ratio):
                nested_corpus.append(string)
        topics_corpus.append(' '.join(nested_corpus))

    tf_vectorizer = Vectorizer(ngram_range=ngram,
                               min_df=min_df,
                               max_df=max_df,
                               stop_words=stop_words,
                               **kwargs)
    if cleaning is not None:
        for i in range(len(topics_corpus)):
            topics_corpus[i] = cleaning(topics_corpus[i])
    if stemming:
        for i in range(len(topics_corpus)):
            topics_corpus[i] = sastrawi(topics_corpus[i])

    tf_vectorizer.fit(topics_corpus)
    DxT = tf_vectorizer.transform(topics_corpus)
    DxD = np.dot(DxT, DxT.T)

    G = nx.Graph()
    for i in range(DxT.shape[0]):
        G.add_node(i, text=topics[i], label=topics[i])

    dense_DxD = DxD.toarray()
    len_dense = len(dense_DxD)
    for i in range(len_dense):
        for j in range(len_dense):
            if j == i:
                continue
            if dense_DxD[i, j] >= threshold:
                weight = dense_DxD[i, j]
                G.add_edge(i, j, weight=weight)
    node_colors, node_labels = [], {}
    for node in G:
        node_colors.append(color_dict[G.nodes[node]['label']])
        node_labels[node] = G.nodes[node]['text']
    pos = nxlayout.fruchterman_reingold_layout(G,
                                               k=1.5 / np.sqrt(len(G.nodes())))
    f = plt.figure(figsize=figsize)
    ax = f.add_subplot(1, 1, 1)
    for no, entity in enumerate(accepted_entities):
        ax.plot([0], [0], color=colors[no], label=entity)
    ax.plot([0], [0], color=colors[-1], label='topics')
    nx.draw(G, node_color=node_colors, pos=pos, labels=node_labels, ax=ax)
    plt.legend()
    plt.tight_layout()
    plt.show()
    return {
        'G': G,
        'pos': pos,
        'node_colors': node_colors,
        'node_labels': node_labels,
    }
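
The topic-to-sentence assignment in both entity-linking variants leans on fuzz.token_set_ratio, which compares token sets rather than raw strings, so word order and repetition are ignored and a full subset scores 100. A quick illustration, assuming fuzzywuzzy is installed:

from fuzzywuzzy import fuzz

# Order-insensitive: identical token sets score 100.
print(fuzz.token_set_ratio('najib razak', 'razak najib'))

# Subset property: a topic fully contained in a sentence scores 100,
# which is how a short topic can claim a long sentence.
print(fuzz.token_set_ratio('najib', 'kenyataan najib semalam'))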