def load_raw_posts():
    """Load posts of frequent users, joined with their articles.

    Only posts whose author wrote strictly more than ``min_user_posts``
    posts are kept.  Rows with an empty body are dropped, ``CreatedAt``
    is parsed to datetime, and ``ID_User`` is re-encoded as a
    categorical column.

    Returns:
        DataFrame with columns ID_Post, ID_User, CreatedAt, Status,
        Headline, p.Body, ID_Article, Path.
    """
    columns = [
        "ID_Post", "ID_User", "CreatedAt", "Status", "Headline", "p.Body",
        "ID_Article", "Path"
    ]
    # only treat posts from users with more than this many posts
    min_user_posts = 500
    sql = (
        "SELECT " + ", ".join(columns) +
        " FROM Posts p INNER JOIN Articles a USING (ID_Article)" +
        " WHERE p.ID_User IN (" +
        " SELECT ID_User FROM Posts q GROUP BY ID_User" +
        " HAVING COUNT(*) > " + str(min_user_posts) + " )"
    )
    if DEBUG:
        sql += " LIMIT 100"

    frame = dbase_helper.query_to_data_frame(
        sql, "posts-" + str(min_user_posts) + ".pkl")
    frame.columns = columns

    # drop posts with empty main text
    frame["p.Body"] = frame["p.Body"].replace("", np.nan)
    frame = frame.dropna()

    # parse date strings
    frame["CreatedAt"] = pd.to_datetime(frame["CreatedAt"])

    # convert to category dtype to re-encode user IDs
    frame["ID_User"] = frame["ID_User"].astype('category')
    return frame
def generate_joined_category_articles_frame():
    """Load all articles left-joined with their category information.

    Returns:
        DataFrame with columns ID_Article, Title, PublishingDate,
        MainCategory, SubCategory, RemainingPath and Body, where
        PublishingDate is parsed to datetime.
    """
    article_frame = dbase_helper.query_to_data_frame(
        '''
        SELECT Articles.ID_Article, Articles.Title, Articles.publishingDate,
               Article_Categories.MainCategory, Article_Categories.SubCategory,
               Article_Categories.RemainingPath, Articles.Body
        FROM Articles
        LEFT JOIN Article_Categories
            ON Article_Categories.ID_Article = Articles.ID_Article;
        ''', "joined_category_articles.pkl")
    article_frame.columns = [
        'ID_Article', 'Title', 'PublishingDate', 'MainCategory',
        'SubCategory', 'RemainingPath', 'Body'
    ]
    # Use the `pd` alias like the rest of the module: the other loaders
    # all call `pd.to_datetime`, and a bare `pandas` name may not even be
    # bound if the module only does `import pandas as pd`.
    article_frame['PublishingDate'] = pd.to_datetime(
        article_frame['PublishingDate'])
    return article_frame
def load_parent_posts(posts, plot=False):
    """Return a boolean Parent_Post feature for the given posts.

    Args:
        posts: frame with an ID_Post column selecting which posts to keep.
        plot: if True, also plot the parent/no-parent distribution.

    Returns:
        DataFrame with the single boolean column Parent_Post, one row per
        matching post.
    """
    frame = dbase_helper.query_to_data_frame(
        """
        SELECT Posts.ID_Post, Posts.ID_Parent_Post
        FROM Posts;
        """, "post_parents.pkl")
    frame.columns = ["ID_Post", "ID_Parent_Post"]

    # For now just encode whether a parent post exists at all
    frame["Parent_Post"] = frame.ID_Parent_Post >= 0

    if plot:
        frame["Parent_Post"].value_counts().plot.bar()
        plt.ylabel("Number of Posts")
        plt.xlabel("Has Parent-Post")
        plt_helper.save_and_show_plot("Posts with Parent-Post")

    # restrict to the requested posts, keep only the feature column
    selected = frame[frame.ID_Post.isin(posts.ID_Post)]
    return selected[["Parent_Post"]]
def generate_joined_rating_articles_frame():
    """Load one row per article with aggregated post-vote counts.

    PositiveVotesCount / NegativeVotesCount count the posts of each
    article whose PositiveVotes / NegativeVotes field equals 1.

    Returns:
        DataFrame with columns ID_Article, Title, PositiveVotesCount,
        NegativeVotesCount, MainCategory, SubCategory, RemainingPath, Body.
    """
    query = '''
    SELECT Posts.ID_Article,
           Articles.Title,
           SUM(case when Posts.PositiveVotes = 1 then 1 else 0 END) AS PositiveVotesCount,
           SUM(case when Posts.NegativeVotes = 1 then 1 else 0 END) AS NegativeVotesCount,
           Article_Categories.MainCategory,
           Article_Categories.SubCategory,
           Article_Categories.RemainingPath,
           Articles.Body
    FROM Posts
    LEFT JOIN Articles ON Posts.ID_Article = Articles.ID_Article
    LEFT JOIN Article_Categories ON Posts.ID_Article = Article_Categories.ID_Article
    GROUP BY Posts.ID_Article;
    '''
    rating_frame = dbase_helper.query_to_data_frame(
        query, "joined_rating_articles.pkl")
    rating_frame.columns = [
        "ID_Article", "Title", "PositiveVotesCount", "NegativeVotesCount",
        "MainCategory", "SubCategory", "RemainingPath", "Body"
    ]
    return rating_frame
def load_post_ratings(posts, plot=False):
    """Return PositiveVotes/NegativeVotes (as uint16) for the given posts.

    Args:
        posts: frame with an ID_Post column selecting which posts to keep.
        plot: if True, also plot the (log-scaled) vote distribution.

    Returns:
        DataFrame with columns PositiveVotes and NegativeVotes, one row
        per matching post.
    """
    ratings = dbase_helper.query_to_data_frame(
        """
        SELECT Posts.ID_Post, Posts.PositiveVotes, Posts.NegativeVotes
        FROM Posts;
        """, "post_votes.pkl")
    ratings.columns = ["ID_Post", "PositiveVotes", "NegativeVotes"]

    if plot:
        # negate NegativeVotes so both histograms share one axis
        plt.hist(ratings.PositiveVotes, label="PositiveVotes")
        plt.hist(-ratings.NegativeVotes, label="NegativeVotes")
        plt.legend()
        plt.ylabel("Number of Occurrences")
        plt.xlabel("Number of Votes")
        plt.gca().set_yscale('log')
        plt_helper.save_and_show_plot(
            "Logarithmic Vote Distribution over Posts")

    vote_columns = ["PositiveVotes", "NegativeVotes"]
    ratings[vote_columns] = ratings[vote_columns].astype('uint16')

    matching = ratings[ratings.ID_Post.isin(posts.ID_Post)]
    return matching.drop("ID_Post", axis=1)
def encode_article_named_entities(posts):
    """Encode, per post, entity-occurrence counts of its article.

    Named entities appearing fewer than 20 times over all articles are
    discarded; the remaining per-article occurrence counts (uint8) are
    joined onto each post via its ID_Article.

    Args:
        posts: frame with ID_Post and ID_Article columns.

    Returns:
        DataFrame of per-post entity occurrence counts (one column per
        frequent entity), aligned with the input posts.
    """
    entities = dbase_helper.generate_pkl_cached(
        "prepared_ner_articles.pkl", ner.generate_article_ner_frame)

    # Select named entities with a minimal number of occurrences
    min_occurrences = 20
    counts = pd.DataFrame(entities['Text'].value_counts())
    counts = counts[counts['Text'] >= min_occurrences]
    counts = counts.rename(columns={'Text': 'NumOccurrences'})

    # co-occurrence matrix itself is unused here; only per-article counts
    entity_occurrences, _ = ner.create_co_occurrence_matrix(
        counts.index.values)

    num_articles = dbase_helper.query_to_data_frame(
        """
        SELECT MAX(Articles.ID_Article)
        FROM Articles;
        """, "number_articles.pkl")[0][0]
    # NOTE(review): reindex over range(num_articles) yields indices
    # 0..num_articles-1; if article IDs are 1-based the article with the
    # highest ID is dropped here -- confirm against the ID scheme.
    entity_occurrences = entity_occurrences.reindex(
        index=range(num_articles), fill_value=0).astype('uint8')

    article_ids = posts[['ID_Post', 'ID_Article']]
    joined = article_ids.join(entity_occurrences, on='ID_Article')
    return joined.drop(['ID_Article', 'ID_Post'], axis=1)