Example #1
def create_and_train_model(training_data):
    EPOCHS = 500

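    # Targets: per-article positive/negative vote counts; inputs: entity occurrence counts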
    Y = training_data['ratings'][['PositiveVotesCount',
                                  'NegativeVotesCount']].values
    X = training_data['occurrences'].values
    assert X.shape[0] == Y.shape[0]

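    # Small fully connected regression network: two hidden ReLU layers, linear output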
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(4, activation='relu', input_shape=[X.shape[1]]),
        tf.keras.layers.Dense(4, activation='relu'),
        tf.keras.layers.Dense(Y.shape[1])
    ])

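    # Adam with a small learning rate; MSLE loss, with MSLE and MAE tracked as metrics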
    optimizer = tf.keras.optimizers.Adam(0.00001)
    loss = 'mean_squared_logarithmic_error'
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=['mean_squared_logarithmic_error', 'mae'])

    tf.keras.utils.plot_model(
        model,
        show_shapes=True,
        expand_nested=True,
        to_file='plots/user_response_prediction-model.png')
    model.summary()

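    # Stop training early once validation loss has not improved for two epochs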
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=2)
    history = model.fit(X,
                        Y,
                        epochs=EPOCHS,
                        validation_split=0.2,
                        callbacks=[early_stop],
                        batch_size=10)

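    # Plot training vs. validation loss over the epochs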
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt_helper.save_and_show_plot("User Response Prediction Training Loss (" +
                                  loss + ")")

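    # Plot training vs. validation mean absolute error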
    plt.plot(history.history['mae'], label='mean absolute error')
    plt.plot(history.history['val_mae'],
             label='validation mean absolute error')
    plt.xlabel('Epochs')
    plt.ylabel('MAE')
    plt.legend()
    plt_helper.save_and_show_plot(
        "User Response Prediction Training Absolute Error")

    print("done")
    return model
Example #2
def rating_analysis():
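    # Vote totals per main category, then per Newsroom subcategory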
    frame = generate_joined_rating_articles_frame()
    main_category_votes = frame[[
        "PositiveVotesCount", "NegativeVotesCount", "MainCategory"
    ]].groupby(by="MainCategory").sum()
    main_category_votes.plot(kind='bar')
    plt_helper.save_and_show_plot("Votes for Posts per Main Category")

    newsroom_data = frame[frame.MainCategory == "Newsroom"]
    newsroom_votes = newsroom_data[[
        "PositiveVotesCount", "NegativeVotesCount", "SubCategory"
    ]].groupby(by="SubCategory").sum()
    newsroom_votes.plot(kind='bar')
    plt_helper.save_and_show_plot("Votes for Posts per Newsroom Category")
    print("done")
Example #3
def load_parent_posts(posts, plot=False):
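    # Parent-post IDs for all posts, loaded from the database and cached as a pickle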
    parent_posts = dbase_helper.query_to_data_frame(
        """
                SELECT Posts.ID_Post, Posts.ID_Parent_Post FROM Posts;
                """, "post_parents.pkl")
    parent_posts.columns = ["ID_Post", "ID_Parent_Post"]

    # For now just encode if there exists a parent post
    parent_posts["Parent_Post"] = parent_posts.ID_Parent_Post >= 0

    if plot:
        parent_posts["Parent_Post"].value_counts().plot.bar()
        plt.ylabel("Number of Posts")
        plt.xlabel("Has Parent-Post")
        plt_helper.save_and_show_plot("Posts with Parent-Post")

    return parent_posts[["ID_Post", "Parent_Post"]][parent_posts.ID_Post.isin(
        posts.ID_Post)].drop("ID_Post", axis=1)
Example #4
def load_post_ratings(posts, plot=False):
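    # Per-post vote counts, loaded from the database and cached as a pickle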
    post_ratings = dbase_helper.query_to_data_frame(
        """
            SELECT Posts.ID_Post, Posts.PositiveVotes, Posts.NegativeVotes FROM Posts;
            """, "post_votes.pkl")
    post_ratings.columns = ["ID_Post", "PositiveVotes", "NegativeVotes"]
    if plot:
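        # Negative votes are negated so the two histograms extend in opposite directions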
        plt.hist(post_ratings.PositiveVotes, label="PositiveVotes")
        plt.hist(-post_ratings.NegativeVotes, label="NegativeVotes")
        plt.legend()
        plt.ylabel("Number of Occurrences")
        plt.xlabel("Number of Votes")
        ax = plt.gca()
        ax.set_yscale('log')
        plt_helper.save_and_show_plot(
            "Logarithmic Vote Distribution over Posts")

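    # Store vote counts compactly as unsigned 16-bit integers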
    post_ratings[["PositiveVotes", "NegativeVotes"
                  ]] = post_ratings[["PositiveVotes",
                                     "NegativeVotes"]].astype('uint16')
    return post_ratings[post_ratings.ID_Post.isin(posts.ID_Post)].drop(
        "ID_Post", axis=1)
Example #5
def compute_date_stats(posts, plot=False):
    date_stats = pd.DataFrame()
    date_stats["ID_Post"] = posts["ID_Post"]

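    # Numeric time features: minute of the day, raw timestamp, and day of the week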
    date_stats["TimeOfDay"] = posts["CreatedAt"].apply(
        lambda x: int(x.hour * 60 + x.minute))
    date_stats["Timestamp"] = posts["CreatedAt"].apply(lambda x: x.value)
    date_stats["DayOfWeek"] = posts["CreatedAt"].apply(lambda x: x.dayofweek)

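    # The model inputs keep only TimeOfDay and DayOfWeek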
    date_inputs = np.asarray(
        date_stats.drop("ID_Post", axis=1).drop("Timestamp", axis=1))

    if plot:
        date_stats.DayOfWeek.hist(bins=7)  # one bin per weekday (0-6)
        plt.ylabel("Number of Posts")
        plt.xlabel("Day of the Week")
        plt_helper.save_and_show_plot("Number of posts per Weekday")

        posts["CreatedAt"].apply(lambda x: x.hour).hist(bins=24)
        plt.ylabel("Number of Posts")
        plt.xlabel("Time of the Day")
        plt_helper.save_and_show_plot("Number of Posts per Time of Day")
    return date_inputs
Example #6
def category_analysis():
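    # Distribution of main categories, then of subcategories within each main category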
    frame = dbase_helper.get_pandas_from_table("Article_Categories")
    main_categories = np.array(frame.MainCategory)
    plt_helper.plot_histogram_distinct("Main Category Distribution",
                                       main_categories)

    for current_main_category in np.unique(main_categories):
        main_category_data = frame[frame['MainCategory'] ==
                                   current_main_category]
        main_category_sub_categories = main_category_data["SubCategory"]
        plt_helper.plot_histogram_distinct(
            "Category Distribution " + current_main_category,
            main_category_sub_categories)

    article_frame = generate_joined_category_articles_frame()

    # Do stuff with articles by year
    years = np.array(
        article_frame.sort_values(
            by='PublishingDate')['PublishingDate'].dt.year)
    plt_helper.plot_histogram_distinct("Article count over years", years)
    min_year = years[0]
    max_year = years[-1]

    # We actually only have articles for 2015 and 2016; all other years contain just one article
    min_year = 2015
    for year in range(min_year, max_year + 1):
        year_articles = article_frame[article_frame['PublishingDate'].dt.year
                                      == year]
        main_categories = np.array(year_articles.MainCategory)
        plt_helper.plot_histogram_distinct(
            "Main Category Distribution " + str(year), main_categories)
        for current_main_category in np.unique(main_categories):
            main_category_data = year_articles[year_articles['MainCategory'] ==
                                               current_main_category]
            main_category_sub_categories = main_category_data["SubCategory"]
            plt_helper.plot_histogram_distinct(
                "Category Distribution " + current_main_category + str(year),
                main_category_sub_categories)

    # Time & Day analysis
    newsroom_articles = article_frame[article_frame.MainCategory == 'Newsroom']
    pandas.Series(newsroom_articles.PublishingDate.dt.hour).plot.hist(
        alpha=0.8, bins=list(range(0, 24)), rwidth=0.8)
    plt_helper.save_and_show_plot("Time Distribution Newsroom")
    plt_helper.plot_day_histogram("Day Distribution Newsroom",
                                  newsroom_articles.PublishingDate.dt.weekday)

    sub_categories = np.unique(newsroom_articles.SubCategory)
    for category in sub_categories:
        pandas.Series(
            newsroom_articles[newsroom_articles.SubCategory ==
                              category].PublishingDate.dt.hour).plot.hist(
                                  alpha=0.8,
                                  bins=list(range(0, 24)),
                                  rwidth=0.8)
        plt_helper.save_and_show_plot("Time Distribution " + str(category))

        days = newsroom_articles[newsroom_articles.SubCategory ==
                                 category].PublishingDate.dt.weekday
        plt_helper.plot_day_histogram("Day Distribution " + str(category),
                                      days)
    print("done")
Example #7
def prepare_data():
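    # Named entities per article, cached as a pickle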
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                ner.generate_article_ner_frame)

    # Select named entities with minimal occurrence
    minimal_number_word_occurrences = 5
    minimal_number_words_per_article = 5
    word_occurrences = pandas.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[
        word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values
    occurrences, co_occurrences = ner.create_co_occurrence_matrix(
        interesting_words)

    article_ids = occurrences.index.values
    data = data_analysis.generate_joined_rating_articles_frame()
    data = data[data.ID_Article.isin(article_ids)]

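    # Count the selected-entity mentions per article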
    interesting_words_per_article = entities[entities['Text'].isin(
        interesting_words)].groupby(
            by='ID_Article',
            as_index=False).agg(lambda x: len(list(x)))[['ID_Article', 'Text']]

    article_ids = interesting_words_per_article[
        interesting_words_per_article.Text >
        minimal_number_words_per_article].ID_Article
    data = data[data.ID_Article.isin(article_ids)]

    articles = data[[
        'ID_Article', 'Title', 'MainCategory', 'SubCategory', 'RemainingPath'
    ]]
    ratings = data[['ID_Article', 'PositiveVotesCount', 'NegativeVotesCount']]

    # Plot the data we shall predict
    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    ax = plt.gca()
    ax.set_yscale('log')
    plt.legend()
    plt_helper.save_and_show_plot(
        "Logarithmic Vote Distribution over Articles")

    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    plt.legend()
    plt_helper.save_and_show_plot("Vote Distribution over Articles")

    normalize = False
    if normalize:
        pos_mean = data.PositiveVotesCount.mean()
        pos_std = data.PositiveVotesCount.std()
        data.PositiveVotesCount = (data.PositiveVotesCount -
                                   pos_mean) / pos_std

        neg_mean = data.NegativeVotesCount.mean()
        neg_std = data.NegativeVotesCount.std()
        data.NegativeVotesCount = (data.NegativeVotesCount -
                                   neg_mean) / neg_std

        plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
        plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
        ax = plt.gca()
        ax.set_yscale('log')
        plt.title("Normalized Data")
        plt.legend()
        plt.show()

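    # Split the article IDs into training and test sets (roughly 80/20)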
    training_article_ids = np.random.choice(article_ids,
                                            round(len(article_ids) * 0.8),
                                            replace=False)
    training_data = {
        "articles": articles[articles.ID_Article.isin(training_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(training_article_ids)],
        "occurrences":
        occurrences[occurrences.index.isin(training_article_ids)],
    }

    test_article_ids = np.setdiff1d(article_ids, training_article_ids)
    test_data = {
        "articles": articles[articles.ID_Article.isin(test_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(test_article_ids)],
        "occurrences": occurrences[occurrences.index.isin(test_article_ids)]
    }

    return training_data, test_data
Example #8
def ner_article_plots():
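    # Most frequent entities overall and per entity label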
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                generate_article_ner_frame)
    pandas.DataFrame(entities['Text'].value_counts().head(30)).plot.bar()
    plt_helper.save_and_show_plot("Entity Distribution")

    entities["Label"].value_counts().plot.bar()
    plt.xlabel("Number of Occurrences")
    plt_helper.save_and_show_plot("Entity Label Distribution")

    joined_article_categories = data_analysis.generate_joined_category_articles_frame()
    articles_time = joined_article_categories[['ID_Article', 'PublishingDate']]

    for label in set(entities["Label"]):
        print("Doing plots for: " + label)
        label_entities = entities[entities['Label'] == label]
        label_series = label_entities["Text"].value_counts().head(20)

        if label == "PER":
            print(
                "For top person entries, merge full-name entries with "
                "first-name- or last-name-only entries")
            persons = label_series.index.values
            for person in persons:
                for compare_person in persons:
                    if compare_person in person and person != compare_person:
                        print(
                            str(compare_person) + " is a subset of " +
                            str(person) + ", merging their counts")
                        label_series[compare_person] += label_series[person]
                        label_series = label_series.drop(labels=[person])
                        break

        pandas.DataFrame(label_series.sort_values()).plot.barh()
        ax = plt.gca()
        ax.get_legend().remove()
        plt.xlabel("Number of Occurrences")
        plt_helper.save_and_show_plot("Entities - " + label + " Distribution")
        top_entities = label_series.sort_values(
            ascending=False).head(6).index.values

        years = [2015, 2016]
        top_entity_entries = []
        for entity in top_entities:
            if label == "PER":
                entity_entries = label_entities[
                    label_entities.Text.str.contains(entity)]
                entity_entries = entity_entries.assign(Text=entity)
            else:
                entity_entries = label_entities[label_entities.Text == entity]
            top_entity_entries.append(entity_entries)
        top_entity_entries = pandas.concat(top_entity_entries)
        top_entity_entries = pandas.merge(top_entity_entries, articles_time)

        plt.style.use('seaborn-deep')
        year_entity_entries = top_entity_entries[
            top_entity_entries.PublishingDate.dt.year > 2014][[
                'PublishingDate', 'Text'
            ]]
        year_entity_entries.PublishingDate = year_entity_entries.PublishingDate.dt.date
        plots = year_entity_entries['PublishingDate'].hist(
            by=year_entity_entries['Text'], histtype='bar', alpha=0.8, bins=12)
        fig = plt.gca().figure
        title = "Top Entities from " + label + "  over time"
        fig.suptitle(title, y=0.99)
        plt_helper.save_and_show_plot(title, False)

        values = []
        labels = []
        for entity in top_entities:
            values.append(year_entity_entries[year_entity_entries.Text ==
                                              entity]['PublishingDate'])
            labels.append(entity)
        plt.hist(values, label=labels)
        plt.legend()
        plt_helper.save_and_show_plot("Top Entities from " + label +
                                      " over time")

    print("done")