Beispiel #1
0
def load_raw_posts():
    columns = [
        "ID_Post", "ID_User", "CreatedAt", "Status", "Headline", "p.Body",
        "ID_Article", "Path"
    ]
    sql = "SELECT " + ", ".join(
        columns) + " FROM Posts p INNER JOIN Articles a USING (ID_Article)"

    # only treat posts from users with at least this many post
    min_user_posts = 500
    sql += ''' WHERE p.ID_User IN (
        SELECT ID_User
        FROM Posts q
        GROUP BY ID_User
        HAVING COUNT(*) > ''' + str(min_user_posts) + " )"

    if DEBUG:
        sql += " LIMIT 100"

    posts = dbase_helper.query_to_data_frame(
        sql, "posts-" + str(min_user_posts) + ".pkl")
    posts.columns = columns

    # drop posts with empty main text
    posts["p.Body"] = posts["p.Body"].replace("", np.nan)
    posts = posts.dropna()

    # parse date strings
    posts["CreatedAt"] = pd.to_datetime(posts["CreatedAt"])

    # convert to category data data to reencode IDs
    posts["ID_User"] = posts["ID_User"].astype('category')

    return posts
def generate_joined_category_articles_frame():
    article_frame = dbase_helper.query_to_data_frame(
        '''
        SELECT Articles.ID_Article, Articles.Title, Articles.publishingDate, Article_Categories.MainCategory, 
        Article_Categories.SubCategory, Article_Categories.RemainingPath, Articles.Body
        FROM Articles LEFT JOIN Article_Categories ON Article_Categories.ID_Article = Articles.ID_Article;
        ''', "joined_category_articles.pkl")
    article_frame.columns = [
        'ID_Article', 'Title', 'PublishingDate', 'MainCategory', 'SubCategory',
        'RemainingPath', 'Body'
    ]
    article_frame['PublishingDate'] = pandas.to_datetime(
        article_frame['PublishingDate'])
    return article_frame
Beispiel #3
0
def load_parent_posts(posts, plot=False):
    parent_posts = dbase_helper.query_to_data_frame(
        """
                SELECT Posts.ID_Post, Posts.ID_Parent_Post FROM Posts;
                """, "post_parents.pkl")
    parent_posts.columns = ["ID_Post", "ID_Parent_Post"]

    # For now just encode if there exists a parent post
    parent_posts["Parent_Post"] = parent_posts.ID_Parent_Post >= 0

    if plot:
        parent_posts["Parent_Post"].value_counts().plot.bar()
        plt.ylabel("Number of Posts")
        plt.xlabel("Has Parent-Post")
        plt_helper.save_and_show_plot("Posts with Parent-Post")

    return parent_posts[["ID_Post", "Parent_Post"]][parent_posts.ID_Post.isin(
        posts.ID_Post)].drop("ID_Post", axis=1)
def generate_joined_rating_articles_frame():
    sql = '''
            SELECT Posts.ID_Article, 
            Articles.Title,
            SUM(case when Posts.PositiveVotes =1 then 1 else 0 END) AS PositiveVotesCount, 
            SUM(case when Posts.NegativeVotes =1 then 1 else 0 END) AS NegativeVotesCount,
            Article_Categories.MainCategory,
            Article_Categories.SubCategory,
            Article_Categories.RemainingPath,
            Articles.Body
            FROM Posts 
            LEFT JOIN Articles ON Posts.ID_Article = Articles.ID_Article
            LEFT JOIN Article_Categories ON Posts.ID_Article = Article_Categories.ID_Article
            GROUP BY Posts.ID_Article;
        '''
    frame = dbase_helper.query_to_data_frame(sql, "joined_rating_articles.pkl")
    frame.columns = [
        "ID_Article", "Title", "PositiveVotesCount", "NegativeVotesCount",
        "MainCategory", "SubCategory", "RemainingPath", "Body"
    ]
    return frame
Beispiel #5
0
def load_post_ratings(posts, plot=False):
    post_ratings = dbase_helper.query_to_data_frame(
        """
            SELECT Posts.ID_Post, Posts.PositiveVotes, Posts.NegativeVotes FROM Posts;
            """, "post_votes.pkl")
    post_ratings.columns = ["ID_Post", "PositiveVotes", "NegativeVotes"]
    if plot:
        plt.hist(post_ratings.PositiveVotes, label="PositiveVotes")
        plt.hist(-post_ratings.NegativeVotes, label="NegativeVotes")
        plt.legend()
        plt.ylabel("Number of Occurrences")
        plt.xlabel("Number of Votes")
        ax = plt.gca()
        ax.set_yscale('log')
        plt_helper.save_and_show_plot(
            "Logarithmic Vote Distribution over Posts")

    post_ratings[["PositiveVotes", "NegativeVotes"
                  ]] = post_ratings[["PositiveVotes",
                                     "NegativeVotes"]].astype('uint16')
    return post_ratings[post_ratings.ID_Post.isin(posts.ID_Post)].drop(
        "ID_Post", axis=1)
Beispiel #6
0
def encode_article_named_entities(posts):
    entities = dbase_helper.generate_pkl_cached("prepared_ner_articles.pkl",
                                                ner.generate_article_ner_frame)

    # Select named entities with minimal occurrence
    minimal_number_word_occurrences = 20
    word_occurrences = pd.DataFrame(entities['Text'].value_counts())
    word_occurrences = word_occurrences[
        word_occurrences['Text'] >= minimal_number_word_occurrences]
    word_occurrences = word_occurrences.rename(
        columns={'Text': 'NumOccurrences'})
    entity_occurrences, co_occurrences = ner.create_co_occurrence_matrix(
        word_occurrences.index.values)
    num_articles = dbase_helper.query_to_data_frame(
        """
        SELECT MAX(Articles.ID_Article) FROM Articles;
        """, "number_articles.pkl")[0][0]
    entity_occurrences = entity_occurrences.reindex(
        index=range(num_articles), fill_value=0).astype('uint8')
    posts = posts[['ID_Post', 'ID_Article']]
    posts_entity_occurrences_in_article = posts.join(entity_occurrences,
                                                     on='ID_Article').drop(
                                                         'ID_Article', axis=1)
    return posts_entity_occurrences_in_article.drop("ID_Post", axis=1)