# Imports inferred from the names used in the functions below. The project
# modules (dbase_helper, plt_helper, ner, data_analysis) and the from-imports
# of the generate_*_frame helpers are assumptions about where those helpers
# live in this repository.
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas as pd  # both the `pandas` and `pd` aliases are used below
import tensorflow as tf

import dbase_helper
import plt_helper
import ner
import data_analysis
from data_analysis import (generate_joined_rating_articles_frame,
                           generate_joined_category_articles_frame)
from ner import generate_article_ner_frame


def create_and_train_model(training_data):
    EPOCHS = 500

    # Targets: positive/negative vote counts per article; inputs: entity
    # occurrence counts per article.
    Y = training_data['ratings'][['PositiveVotesCount',
                                  'NegativeVotesCount']].values
    X = training_data['occurrences'].values
    assert X.shape[0] == Y.shape[0]

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(4, activation='relu', input_shape=[X.shape[1]]),
        tf.keras.layers.Dense(4, activation='relu'),
        tf.keras.layers.Dense(Y.shape[1])
    ])
    optimizer = tf.keras.optimizers.Adam(0.00001)
    loss = 'mean_squared_logarithmic_error'
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=['mean_squared_logarithmic_error', 'mae'])
    tf.keras.utils.plot_model(
        model,
        show_shapes=True,
        expand_nested=True,
        to_file='plots/user_response_prediction-model.png')
    model.summary()

    # Stop training once the validation loss stops improving.
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=2)
    history = model.fit(X,
                        Y,
                        epochs=EPOCHS,
                        validation_split=0.2,
                        callbacks=[early_stop],
                        batch_size=10)

    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt_helper.save_and_show_plot("User Response Prediction Training Loss (" +
                                  loss + ")")

    plt.plot(history.history['mae'], label='mean absolute error')
    plt.plot(history.history['val_mae'], label='validation mean absolute error')
    plt.xlabel('Epochs')
    plt.ylabel('MAE')
    plt.legend()
    plt_helper.save_and_show_plot(
        "User Response Prediction Training Absolute Error")
    print("done")
    return model
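# Illustrative only: a small helper sketching how the trained model could be
# used for inference. It assumes a `test_data` dict with the same layout as the
# splits returned by prepare_data() (an 'occurrences' frame indexed by
# ID_Article); `predict_votes` itself is hypothetical and not part of the
# original code.
def predict_votes(model, test_data):
    # One row of entity-occurrence counts per article, matching the training input.
    x = test_data['occurrences'].values
    predictions = model.predict(x)
    # The network outputs two values per article, in the same column order as
    # the training targets: predicted positive and negative vote counts.
    return pd.DataFrame(predictions,
                        index=test_data['occurrences'].index,
                        columns=['PredictedPositiveVotes',
                                 'PredictedNegativeVotes'])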
def rating_analysis():
    frame = generate_joined_rating_articles_frame()

    # Total positive/negative votes aggregated per main category.
    main_category_votes = frame[[
        "PositiveVotesCount", "NegativeVotesCount", "MainCategory"
    ]].groupby(by="MainCategory").sum()
    main_category_votes.plot(kind='bar')
    plt_helper.save_and_show_plot("Votes for Posts per Main Category")

    # Same aggregation restricted to the "Newsroom" main category, per sub-category.
    newsroom_data = frame[frame.MainCategory == "Newsroom"]
    newsroom_votes = newsroom_data[[
        "PositiveVotesCount", "NegativeVotesCount", "SubCategory"
    ]].groupby(by="SubCategory").sum()
    newsroom_votes.plot(kind='bar')
    plt_helper.save_and_show_plot("Votes for Posts per Newsroom Category")
    print("done")
def load_parent_posts(posts, plot=False):
    parent_posts = dbase_helper.query_to_data_frame(
        """
        SELECT Posts.ID_Post, Posts.ID_Parent_Post
        FROM Posts;
        """, "post_parents.pkl")
    parent_posts.columns = ["ID_Post", "ID_Parent_Post"]

    # For now just encode if there exists a parent post
    parent_posts["Parent_Post"] = parent_posts.ID_Parent_Post >= 0

    if plot:
        parent_posts["Parent_Post"].value_counts().plot.bar()
        plt.ylabel("Number of Posts")
        plt.xlabel("Has Parent-Post")
        plt_helper.save_and_show_plot("Posts with Parent-Post")

    return parent_posts[["ID_Post", "Parent_Post"]][parent_posts.ID_Post.isin(
        posts.ID_Post)].drop("ID_Post", axis=1)
def load_post_ratings(posts, plot=False):
    post_ratings = dbase_helper.query_to_data_frame(
        """
        SELECT Posts.ID_Post, Posts.PositiveVotes, Posts.NegativeVotes
        FROM Posts;
        """, "post_votes.pkl")
    post_ratings.columns = ["ID_Post", "PositiveVotes", "NegativeVotes"]

    if plot:
        plt.hist(post_ratings.PositiveVotes, label="PositiveVotes")
        plt.hist(-post_ratings.NegativeVotes, label="NegativeVotes")
        plt.legend()
        plt.ylabel("Number of Occurrences")
        plt.xlabel("Number of Votes")
        ax = plt.gca()
        ax.set_yscale('log')
        plt_helper.save_and_show_plot(
            "Logarithmic Vote Distribution over Posts")

    post_ratings[["PositiveVotes", "NegativeVotes"]] = post_ratings[[
        "PositiveVotes", "NegativeVotes"
    ]].astype('uint16')

    return post_ratings[post_ratings.ID_Post.isin(posts.ID_Post)].drop(
        "ID_Post", axis=1)
def compute_date_stats(posts, plot=False):
    date_stats = pd.DataFrame()
    date_stats["ID_Post"] = posts["ID_Post"]

    # Encode the creation time as minutes since midnight, the raw timestamp
    # and the weekday.
    date_stats["TimeOfDay"] = posts["CreatedAt"].apply(
        lambda x: int(x.hour * 60 + x.minute))
    date_stats["Timestamp"] = posts["CreatedAt"].apply(lambda x: x.value)
    date_stats["DayOfWeek"] = posts["CreatedAt"].apply(lambda x: x.dayofweek)

    # Only TimeOfDay and DayOfWeek are returned as model inputs.
    date_inputs = np.asarray(
        date_stats.drop("ID_Post", axis=1).drop("Timestamp", axis=1))

    if plot:
        date_stats.DayOfWeek.hist(bins=7)  # one bin per weekday
        plt.ylabel("Number of Posts")
        plt.xlabel("Day of the Week")
        plt_helper.save_and_show_plot("Number of posts per Weekday")

        posts["CreatedAt"].apply(lambda x: x.hour).hist(bins=24)
        plt.ylabel("Number of Posts")
        plt.xlabel("Time of the Day")
        plt_helper.save_and_show_plot("Number of Posts per Time of Day")

    return date_inputs
def category_analysis():
    frame = dbase_helper.get_pandas_from_table("Article_Categories")
    main_categories = np.array(frame.MainCategory)
    plt_helper.plot_histogram_distinct("Main Category Distribution",
                                       main_categories)

    # Sub-category distribution within each main category.
    for current_main_category in np.unique(main_categories):
        main_category_data = frame[frame['MainCategory'] ==
                                   current_main_category]
        main_category_sub_categories = main_category_data["SubCategory"]
        plt_helper.plot_histogram_distinct(
            "Category Distribution " + current_main_category,
            main_category_sub_categories)

    article_frame = generate_joined_category_articles_frame()

    # Article and category distributions per year.
    years = np.array(
        article_frame.sort_values(
            by='PublishingDate')['PublishingDate'].dt.year)
    plt_helper.plot_histogram_distinct("Article count over years", years)
    min_year = years[0]
    max_year = years[-1]
    # We actually only have articles for 2015 and 2016; all other years contain
    # only a single article, so restrict the per-year plots accordingly.
    min_year = 2015
    for year in range(min_year, max_year + 1):
        year_articles = article_frame[article_frame['PublishingDate'].dt.year
                                      == year]
        main_categories = np.array(year_articles.MainCategory)
        plt_helper.plot_histogram_distinct(
            "Main Category Distribution " + str(year), main_categories)
        for current_main_category in np.unique(main_categories):
            main_category_data = year_articles[year_articles['MainCategory']
                                               == current_main_category]
            main_category_sub_categories = main_category_data["SubCategory"]
            plt_helper.plot_histogram_distinct(
                "Category Distribution " + current_main_category + str(year),
                main_category_sub_categories)

    # Time & day analysis for the "Newsroom" main category.
    newsroom_articles = article_frame[article_frame.MainCategory == 'Newsroom']
    pandas.Series(newsroom_articles.PublishingDate.dt.hour).plot.hist(
        alpha=0.8, bins=list(range(0, 24)), rwidth=0.8)
    plt_helper.save_and_show_plot("Time Distribution Newsroom")
    plt_helper.plot_day_histogram("Day Distribution Newsroom",
                                  newsroom_articles.PublishingDate.dt.weekday)

    sub_categories = np.unique(newsroom_articles.SubCategory)
    for category in sub_categories:
        pandas.Series(
            newsroom_articles[newsroom_articles.SubCategory ==
                              category].PublishingDate.dt.hour).plot.hist(
                                  alpha=0.8,
                                  bins=list(range(0, 24)),
                                  rwidth=0.8)
        plt_helper.save_and_show_plot("Time Distribution " + str(category))
        days = newsroom_articles[newsroom_articles.SubCategory ==
                                 category].PublishingDate.dt.weekday
        plt_helper.plot_day_histogram("Day Distribution " + str(category),
                                      days)
    print("done")
def prepare_data():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                ner.generate_article_ner_frame)

    # Select named entities with a minimal number of occurrences.
    minimal_number_word_occurrences = 5
    minimal_number_words_per_article = 5
    word_occurrences = pandas.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[
        word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})
    interesting_words = word_occurrences.index.values

    occurrences, co_occurrences = ner.create_co_occurrence_matrix(
        interesting_words)
    article_ids = occurrences.index.values

    data = data_analysis.generate_joined_rating_articles_frame()
    data = data[data.ID_Article.isin(article_ids)]

    # Keep only articles that contain enough of the selected entities.
    interesting_words_per_article = entities[entities['Text'].isin(
        interesting_words)].groupby(
            by='ID_Article',
            as_index=False).agg(lambda x: len(list(x)))[['ID_Article', 'Text']]
    article_ids = interesting_words_per_article[
        interesting_words_per_article.Text >
        minimal_number_words_per_article].ID_Article
    data = data[data.ID_Article.isin(article_ids)]

    articles = data[[
        'ID_Article', 'Title', 'MainCategory', 'SubCategory', 'RemainingPath'
    ]]
    ratings = data[['ID_Article', 'PositiveVotesCount', 'NegativeVotesCount']]

    # Plot the target distribution we want to predict.
    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    ax = plt.gca()
    ax.set_yscale('log')
    plt.legend()
    plt_helper.save_and_show_plot(
        "Logarithmic Vote Distribution over Articles")

    plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
    plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
    plt.legend()
    plt_helper.save_and_show_plot("Vote Distribution over Articles")

    # Optional z-score normalization of the vote counts (disabled by default).
    normalize = False
    if normalize:
        pos_mean = data.PositiveVotesCount.mean()
        pos_std = data.PositiveVotesCount.std()
        data.PositiveVotesCount = (data.PositiveVotesCount -
                                   pos_mean) / pos_std
        neg_mean = data.NegativeVotesCount.mean()
        neg_std = data.NegativeVotesCount.std()
        data.NegativeVotesCount = (data.NegativeVotesCount -
                                   neg_mean) / neg_std

        plt.hist(data.PositiveVotesCount, label="PositiveVotesCount")
        plt.hist(-data.NegativeVotesCount, label="NegativeVotesCount")
        ax = plt.gca()
        ax.set_yscale('log')
        plt.title("Normalized Data")
        plt.legend()
        plt.show()

    # 80/20 train/test split over articles; sample without replacement so the
    # two sets form a true partition of the article IDs.
    training_article_ids = np.random.choice(article_ids,
                                            round(len(article_ids) * 0.8),
                                            replace=False)
    training_data = {
        "articles": articles[articles.ID_Article.isin(training_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(training_article_ids)],
        "occurrences":
        occurrences[occurrences.index.isin(training_article_ids)],
    }
    test_article_ids = np.setdiff1d(article_ids, training_article_ids)
    test_data = {
        "articles": articles[articles.ID_Article.isin(test_article_ids)],
        "ratings": ratings[ratings.ID_Article.isin(test_article_ids)],
        "occurrences": occurrences[occurrences.index.isin(test_article_ids)]
    }
    return training_data, test_data
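# A minimal end-to-end sketch, assuming prepare_data() and
# create_and_train_model() above live in the same module. The wrapper name
# run_user_response_prediction is hypothetical and not part of the original code.
def run_user_response_prediction():
    training_data, test_data = prepare_data()
    model = create_and_train_model(training_data)

    # Score the held-out articles with the same feature/target layout as training.
    x_test = test_data['occurrences'].values
    y_test = test_data['ratings'][[
        'PositiveVotesCount', 'NegativeVotesCount'
    ]].values
    print(model.evaluate(x_test, y_test, verbose=0))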
def ner_article_plots():
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                generate_article_ner_frame)

    # Most frequent entities and entity labels overall.
    pandas.DataFrame(entities['Text'].value_counts().head(30)).plot.bar()
    plt_helper.save_and_show_plot("Entity Distribution")

    entities["Label"].value_counts().plot.bar()
    plt.xlabel("Number of Occurrences")
    plt_helper.save_and_show_plot("Entity Label Distribution")

    joined_article_categories = data_analysis.generate_joined_category_articles_frame()
    articles_time = joined_article_categories[['ID_Article', 'PublishingDate']]

    for label in set(entities["Label"]):
        print("Doing plots for: " + label)
        label_entities = entities[entities['Label'] == label]
        label_series = label_entities["Text"].value_counts().head(20)

        if label == "PER":
            print(
                "For top person entries try to unify first+last name and first-name/last-name only entries"
            )
            persons = label_series.index.values
            for person in persons:
                for compare_person in persons:
                    if compare_person in person and person != compare_person:
                        print(
                            str(person) + " is not unique, subset of " +
                            str(compare_person))
                        label_series[compare_person] += label_series[person]
                        label_series = label_series.drop(labels=[person])
                        break

        pandas.DataFrame(label_series.sort_values()).plot.barh()
        ax = plt.gca()
        ax.get_legend().remove()
        plt.xlabel("Number of Occurrences")
        plt_helper.save_and_show_plot("Entities - " + label + " Distribution")

        # Track the six most frequent entities of this label over time.
        top_entities = label_series.sort_values(
            ascending=False).head(6).index.values
        years = [2015, 2016]
        top_entity_entries = []
        for entity in top_entities:
            if label == "PER":
                entity_entries = label_entities[
                    label_entities.Text.str.contains(entity)]
                entity_entries = entity_entries.assign(Text=entity)
            else:
                entity_entries = label_entities[label_entities.Text == entity]
            top_entity_entries.append(entity_entries)
        top_entity_entries = pandas.concat(top_entity_entries)
        top_entity_entries = pandas.merge(top_entity_entries, articles_time)

        plt.style.use('seaborn-deep')
        year_entity_entries = top_entity_entries[
            top_entity_entries.PublishingDate.dt.year > 2014][[
                'PublishingDate', 'Text'
            ]]
        year_entity_entries.PublishingDate = year_entity_entries.PublishingDate.dt.date

        # One small histogram per top entity.
        plots = year_entity_entries['PublishingDate'].hist(
            by=year_entity_entries['Text'], histtype='bar', alpha=0.8, bins=12)
        fig = plt.gca().figure
        title = "Top Entities from " + label + " over time"
        fig.suptitle(title, y=0.99)
        plt_helper.save_and_show_plot(title, False)

        # Combined histogram with all top entities in one plot.
        values = []
        labels = []
        for entity in top_entities:
            values.append(year_entity_entries[year_entity_entries.Text ==
                                              entity]['PublishingDate'])
            labels.append(entity)
        plt.hist(values, label=labels)
        plt.legend()
        plt_helper.save_and_show_plot("Top Entities from " + label +
                                      " over time")
    print("done")