Example No. 1
def create_co_occurrence_matrix(interesting_words: list,
                                filename: str = None):
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                generate_article_ner_frame)
    data = entities[entities['Text'].isin(interesting_words)].groupby(
        by='ID_Article', as_index=False).agg(lambda x: ' '.join(list(x)))[[
            'ID_Article', 'Text'
        ]]
    interesting_articles = np.array(data['ID_Article'])

    percent_interesting_articles = (interesting_articles.size / np.unique(
        entities['ID_Article']).size) * 100
    print("We look at " + str(len(interesting_words)) +
          " entities and therefore at " +
          str(round(percent_interesting_articles, 2)) +
          "% of all articles for co-occurrence")

    count_model = CountVectorizer(ngram_range=(1, 1))  # default unigram model
    X = count_model.fit_transform(np.array(data['Text']))
    names = count_model.get_feature_names()
    # X[X > 0] = 1  # run this line if you don't want to count repeated co-occurrence within one text
    Xc = (X.T * X)  # co-occurrence matrix in sparse CSR format
    Xc.setdiag(0)  # set same-word co-occurrence to 0
    co_occurrences = pandas.DataFrame(data=Xc.toarray(),
                                      columns=names,
                                      index=names)
    if filename:
        co_occurrences.to_csv(dbase_helper.PKL_CACHE_FOLDER + '/' + filename,
                              sep=',')

    return pandas.DataFrame(data=X.toarray(),
                            columns=names,
                            index=data.ID_Article.values), co_occurrences
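
The heart of this function is the document-term trick: CountVectorizer builds a sparse article-by-entity count matrix X, and X.T * X is then the entity-by-entity co-occurrence matrix. A minimal, self-contained sketch of that step, with toy documents instead of the project's dbase_helper cache (get_feature_names_out assumes scikit-learn >= 1.0):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

docs = ["merkel kurz wien", "kurz wien wien", "merkel berlin"]  # toy "articles"

count_model = CountVectorizer(ngram_range=(1, 1))
X = count_model.fit_transform(docs)      # sparse article-by-term counts
Xc = X.T @ X                             # term-by-term co-occurrence counts
Xc.setdiag(0)                            # zero out same-word co-occurrence
names = count_model.get_feature_names_out()

co_occurrences = pd.DataFrame(Xc.toarray(), columns=names, index=names)
print(co_occurrences)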
Example No. 2
def create_co_occurrence_all():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                generate_article_ner_frame)
    num_top_entities = 50
    pandas.DataFrame(
        entities['Text'].value_counts().head(num_top_entities)).plot.bar()
    plt.title("Distribution of top " + str(num_top_entities) +
              " named entities over all " + str(entities['ID_Article'].size) +
              " Articles")
    plt.show()

    word_occurrences = pandas.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[word_occurrences['Text'] >= 10]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})

    interesting_words = word_occurrences.index.values
    create_co_occurrence_matrix(interesting_words,
                                'article_co_occurrences.csv')

    entities_without_locations = entities[entities.Label != 'LOC']
    word_occurrences = pandas.DataFrame(
        entities_without_locations['Text'].value_counts())
    word_occurrences = word_occurrences[word_occurrences['Text'] >= 10]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values
    create_co_occurrence_matrix(
        interesting_words, 'article_co_occurrences_without_locations.csv')
    print("done")
Example No. 3
def prepare_data():
    posts = load_raw_posts()

    post_embeddings = load_or_create_post_embeddings(posts)

    data = {
        "stylometric":
        dbase_helper.generate_pkl_cached(
            "stylometric_features_with_headlines.pkl",
            compute_stylometric_features,
            posts=posts),
        "embedded_posts":
        load_or_embed_posts(posts, post_embeddings),
        "date_stats":
        compute_date_stats(posts),
        "article_stats":
        compute_article_category_stats(posts),
        "article_entities":
        encode_article_named_entities(posts),
        "post_ratings":
        load_post_ratings(posts),
        "parent_posts":
        load_parent_posts(posts),
        "targets":
        tf.keras.utils.to_categorical(posts["ID_User"].cat.codes)
    }

    return posts, data
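
The "targets" entry turns each post's author into a one-hot vector: ID_User is a pandas categorical, its .cat.codes give dense integer class labels, and tf.keras.utils.to_categorical expands them. A minimal sketch of just that encoding, using toy user IDs and omitting the surrounding feature dict:

import pandas as pd
import tensorflow as tf

posts = pd.DataFrame({"ID_User": pd.Categorical([17, 42, 17, 99])})

codes = posts["ID_User"].cat.codes               # 0, 1, 0, 2
targets = tf.keras.utils.to_categorical(codes)   # shape (4, 3), one column per user
print(targets)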
Example No. 4
def encode_article_named_entities(posts):
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                ner.generate_article_ner_frame)

    # Select named entities with minimal occurrence
    minimal_number_word_occurrences = 20
    word_occurrences = pd.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[
        word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})
    entity_occurrences, co_occurrences = ner.create_co_occurrence_matrix(
        word_occurrences.index.values)
    num_articles = dbase_helper.query_to_data_frame(
        """
        SELECT MAX(Articles.ID_Article) FROM Articles;
        """, "number_articles.pkl")[0][0]
    entity_occurrences = entity_occurrences.reindex(
        index=range(num_articles), fill_value=0).astype('uint8')
    posts = posts[['ID_Post', 'ID_Article']]
    posts_entity_occurrences_in_article = posts.join(entity_occurrences,
                                                     on='ID_Article').drop(
                                                         'ID_Article', axis=1)
    return posts_entity_occurrences_in_article.drop("ID_Post", axis=1)
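
The alignment step above follows a reindex-then-join pattern: per-article entity counts are reindexed onto the full article-ID range (articles without any selected entity become all-zero rows) and then joined onto the posts through their ID_Article column. A small, self-contained sketch of that pattern with toy data:

import pandas as pd

# Toy per-article entity counts, indexed by ID_Article (only articles 0 and 3 present)
entity_occurrences = pd.DataFrame({"merkel": [1, 2], "kurz": [0, 3]}, index=[0, 3])

num_articles = 5
entity_occurrences = entity_occurrences.reindex(index=range(num_articles),
                                                fill_value=0).astype("uint8")

posts = pd.DataFrame({"ID_Post": [10, 11], "ID_Article": [3, 4]})
features = posts.join(entity_occurrences, on="ID_Article").drop(
    ["ID_Article", "ID_Post"], axis=1)
print(features)  # one row of entity counts per post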
Example No. 5
def prepare_data():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                ner.generate_article_ner_frame)

    # Select named entities with minimal occurrence
    minimal_number_word_occurrences = 5
    minimal_number_words_per_article = 5
    word_occurrences = pandas.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[
        word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values
    occurrences, co_occurrences = ner.create_co_occurrence_matrix(
        interesting_words)

    article_ids = occurrences.index.values
    data = data_analysis.generate_joined_rating_articles_frame()
    data = data[data.ID_Article.isin(article_ids)]

    interesting_words_per_article = entities[entities['Text'].isin(
        interesting_words)].groupby(
            by='ID_Article',
            as_index=False).agg(lambda x: len(list(x)))[['ID_Article', 'Text']]

    article_ids = interesting_words_per_article[
        interesting_words_per_article.Text >
        minimal_number_words_per_article].ID_Article
    data = data[data.ID_Article.isin(article_ids)]

    articles = data[[
        'ID_Article', 'Title', 'MainCategory', 'SubCategory', 'RemainingPath'
    ]]
    ratings = data[['ID_Article', 'PositiveVotesCount', 'NegativeVotesCount']]

    # Plot the data we shall predict
    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    ax = plt.gca()
    ax.set_yscale('log')
    plt.legend()
    plt_helper.save_and_show_plot(
        "Logarithmic Vote Distribution over Articles")

    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    plt.legend()
    plt_helper.save_and_show_plot("Vote Distribution over Articles")

    normalize = False
    if normalize:
        pos_mean = data.PositiveVotesCount.mean()
        pos_std = data.PositiveVotesCount.std()
        data.PositiveVotesCount = (data.PositiveVotesCount -
                                   pos_mean) / pos_std

        neg_mean = data.NegativeVotesCount.mean()
        neg_std = data.NegativeVotesCount.std()
        data.NegativeVotesCount = (data.NegativeVotesCount -
                                   neg_mean) / neg_std

        plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
        plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
        ax = plt.gca()
        ax.set_yscale('log')
        plt.title("Normalized Data")
        plt.legend()
        plt.show()

    # Sample article IDs without replacement so the training set is a true 80% subset
    training_article_ids = np.random.choice(article_ids,
                                            round(len(article_ids) * 0.8),
                                            replace=False)
    training_data = {
        "articles": articles[articles.ID_Article.isin(training_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(training_article_ids)],
        "occurrences":
        occurrences[occurrences.index.isin(training_article_ids)],
    }

    test_article_ids = np.setdiff1d(article_ids, training_article_ids)
    test_data = {
        "articles": articles[articles.ID_Article.isin(test_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(test_article_ids)],
        "occurrences": occurrences[occurrences.index.isin(test_article_ids)]
    }

    return training_data, test_data
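
The split at the end draws 80% of the article IDs for training and uses np.setdiff1d for the held-out rest; sampling without replacement keeps the training IDs unique. A small sketch of that split with a fixed seed (the seed and toy IDs are illustrative only):

import numpy as np

article_ids = np.arange(10)
rng = np.random.default_rng(seed=0)

training_ids = rng.choice(article_ids, round(len(article_ids) * 0.8),
                          replace=False)             # unique 80% sample
test_ids = np.setdiff1d(article_ids, training_ids)   # remaining 20%
print(sorted(training_ids), test_ids)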
Example No. 6
def ner_article_plots():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                generate_article_ner_frame)
    pandas.DataFrame(entities['Text'].value_counts().head(30)).plot.bar()
    plt_helper.save_and_show_plot("Entity Distribution")

    entities["Label"].value_counts().plot.bar()
    plt.xlabel("Number of Occurrences")
    plt_helper.save_and_show_plot("Entity Label Distribution")

    joined_article_categories = data_analysis.generate_joined_category_articles_frame(
    )
    articles_time = joined_article_categories[['ID_Article', 'PublishingDate']]

    for label in set(entities["Label"]):
        print("Doing plots for: " + label)
        label_entities = entities[entities['Label'] == label]
        label_series = label_entities["Text"].value_counts().head(20)

        if label == "PER":
            print(
                "For top person entries try to unify first+last name and first-name/last-name only entries"
            )
            persons = label_series.index.values
            for person in persons:
                for compare_person in persons:
                    if compare_person in person and person != compare_person:
                        print(
                            str(person) + " is not unique, contains " +
                            str(compare_person))
                        label_series[compare_person] += label_series[person]
                        label_series = label_series.drop(labels=[person])
                        break

        pandas.DataFrame(label_series.sort_values()).plot.barh()
        ax = plt.gca()
        ax.get_legend().remove()
        plt.xlabel("Number of Occurrences")
        plt_helper.save_and_show_plot("Entities - " + label + " Distribution")
        top_entities = label_series.sort_values(
            ascending=False).head(6).index.values

        years = [2015, 2016]
        top_entity_entries = []
        for entity in top_entities:
            if label == "PER":
                entity_entries = label_entities[
                    label_entities.Text.str.contains(entity)]
                entity_entries = entity_entries.assign(Text=entity)
            else:
                entity_entries = label_entities[label_entities.Text == entity]
            top_entity_entries.append(entity_entries)
        top_entity_entries = pandas.concat(top_entity_entries)
        top_entity_entries = pandas.merge(top_entity_entries, articles_time)

        plt.style.use('seaborn-deep')
        year_entity_entries = top_entity_entries[
            top_entity_entries.PublishingDate.dt.year > 2014][[
                'PublishingDate', 'Text'
            ]]
        year_entity_entries.PublishingDate = year_entity_entries.PublishingDate.dt.date
        plots = year_entity_entries['PublishingDate'].hist(
            by=year_entity_entries['Text'], histtype='bar', alpha=0.8, bins=12)
        fig = plt.gca().figure
        title = "Top Entities from " + label + "  over time"
        fig.suptitle(title, y=0.99)
        plt_helper.save_and_show_plot(title, False)

        values = []
        labels = []
        for entity in top_entities:
            values.append(year_entity_entries[year_entity_entries.Text ==
                                              entity]['PublishingDate'])
            labels.append(entity)
        plt.hist(values, label=labels)
        plt.legend()
        plt_helper.save_and_show_plot("Top Entities from " + label +
                                      " over time")

    print("done")