Example #1
0
def make_dataset_json(output_file_path, raw_data_file_path, features_folder,
                      comparison_lifetimes_path, anonymous_coward_name):
    """Write the selected discussions to a JSON array file.

    The posts to export are chosen by ``decide_posts_to_keep``; each kept
    document is combined with its feature frames and the comparison lifetimes
    by ``make_discussion_json`` and dumped as one element of a top-level JSON
    list.

    Args:
        output_file_path: Path of the JSON file to create (overwritten).
        raw_data_file_path: Path of the raw data file; it is read once to pick
            the posts and once more to export them.
        features_folder: Folder passed to ``get_h5_stores_and_keys``.
        comparison_lifetimes_path: Path passed to ``get_comparison_lifetimes``.
        anonymous_coward_name: Name forwarded to the selection and to
            ``make_discussion_json``.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    # Read features.
    h5_stores_and_keys = get_h5_stores_and_keys(features_folder, "reddit")

    # Read comparison lifetimes.
    lifetime_list = get_comparison_lifetimes(comparison_lifetimes_path)

    post_ids_to_keep = decide_posts_to_keep(raw_data_file_path,
                                            anonymous_coward_name)

    with open(output_file_path, "w") as fp:
        fp.write("[\n")
        wrote_any = False
        for document in document_gen:
            if document["post_id"] not in post_ids_to_keep:
                continue

            timestamp_df, handcrafted_df = get_features_df(document,
                                                           h5_stores_and_keys)

            # NOTE(review): a `handcrafted_df is None` skip used to be
            # disabled here; presumably make_discussion_json copes with
            # missing features itself -- TODO confirm.
            discussion_json = make_discussion_json(document, timestamp_df,
                                                   handcrafted_df,
                                                   lifetime_list,
                                                   anonymous_coward_name)

            if discussion_json is None:
                continue

            # Emit the separator *between* elements only: a comma after the
            # last element would make the file invalid JSON.
            if wrote_any:
                fp.write(",\n\n")
            json.dump(discussion_json, fp)
            wrote_any = True

        fp.write("\n]\n" if wrote_any else "]\n")
def make_dataset_json(output_file_path,
                      raw_data_file_path,
                      features_folder,
                      comparison_lifetimes_path,
                      anonymous_coward_name):
    """Export the kept discussions as a single JSON list.

    Posts are filtered with the id set returned by ``decide_posts_to_keep``.
    For every kept document the feature frames are fetched with
    ``get_features_df`` and the record built by ``make_discussion_json`` is
    appended to the output list; documents for which that builder returns
    ``None`` are skipped.

    Args:
        output_file_path: Destination file, opened in write mode.
        raw_data_file_path: Raw data file handed to ``document_generator``
            and to ``decide_posts_to_keep``.
        features_folder: Folder handed to ``get_h5_stores_and_keys``.
        comparison_lifetimes_path: Path handed to
            ``get_comparison_lifetimes``.
        anonymous_coward_name: Name forwarded to the post selection and to
            ``make_discussion_json``.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    # Read features.
    h5_stores_and_keys = get_h5_stores_and_keys(features_folder,
                                                "reddit")

    # Read comparison lifetimes.
    lifetime_list = get_comparison_lifetimes(comparison_lifetimes_path)

    post_ids_to_keep = decide_posts_to_keep(raw_data_file_path, anonymous_coward_name)

    with open(output_file_path, "w") as fp:
        fp.write("[\n")

        # Written in front of every element; empty for the first one so the
        # list never ends with a dangling comma (which is not valid JSON).
        separator = ""

        for document in document_gen:
            if document["post_id"] in post_ids_to_keep:
                timestamp_df, handcrafted_df = get_features_df(document, h5_stores_and_keys)

                discussion_json = make_discussion_json(document,
                                                       timestamp_df,
                                                       handcrafted_df,
                                                       lifetime_list,
                                                       anonymous_coward_name)

                if discussion_json is None:
                    continue

                fp.write(separator)
                json.dump(discussion_json, fp)
                separator = ",\n\n"

        # `separator` is still empty only when nothing was written.
        if separator:
            fp.write("\n")
        fp.write("]\n")
Example #3
0
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Select the ids of the posts that should go into the dataset.

    Every discussion with more than one comment is scored on four targets
    ("comments", "users", "score_wilson", "controversiality_wilson"). Each
    target is ranked so that the largest value gets rank 1, the four ranks are
    summed, and the posts are ordered from the largest to the smallest rank
    sum. That ordering is handed to ``split_list(..., 500)`` and the last post
    of every returned chunk is kept.

    Args:
        raw_data_file_path: Path of the raw data file to scan.
        anonymous_coward_name: Name forwarded to the within-discussion
            anonymization.

    Returns:
        A set with the kept post ids; empty when no discussion qualifies.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()

    for document in document_gen:
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization. The two
        # anonymizer mappings are not needed for post selection.
        comment_name_set, user_name_set, _, _, within_discussion_anonymous_coward = \
            within_discussion_comment_and_user_anonymization(
                comment_gen=comment_gen,
                extract_comment_name=extract_comment_name,
                extract_user_name=extract_user_name,
                anonymous_coward_name=anonymous_coward_name)

        # Calculate prediction targets; discussions for which this raises a
        # KeyError are deliberately skipped (best effort).
        try:
            targets = calculate_targets(document, comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            continue

        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Nothing qualified: np.max below would raise on empty arrays.
    if not post_to_targets:
        return set()

    post_id_list = np.array(list(post_to_targets.keys()))
    kept_targets = list(post_to_targets.values())
    comments_list = np.array([t["comments"] for t in kept_targets])
    users_list = np.array([t["users"] for t in kept_targets])
    score_list = np.array([t["score_wilson"] for t in kept_targets])
    controversiality_list = np.array(
        [t["controversiality_wilson"] for t in kept_targets])

    # Rank every target in descending order (largest value -> rank 1) and
    # report its maximum.
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Rank according to all: order by descending rank sum.
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    post_id_list_new = post_id_list[i][::-1]

    # Select 500 posts (presumably split_list yields 500 chunks -- TODO
    # confirm against its definition); keep the last post of each chunk.
    post_id_chunk_list = [
        chunk[-1] for chunk in split_list(list(post_id_list_new), 500)
    ]

    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Choose which posts of the raw data file are exported.

    Discussions with more than one comment are collected together with their
    prediction targets. For each of the targets "comments", "users",
    "score_wilson" and "controversiality_wilson" the posts are ranked with the
    largest value first; the per-target ranks are added up and the posts are
    sorted by decreasing rank sum. The sorted ids go through
    ``split_list(..., 500)`` and the final id of each chunk is selected.

    Args:
        raw_data_file_path: Path of the raw data file to scan.
        anonymous_coward_name: Name forwarded to the within-discussion
            anonymization.

    Returns:
        The selected post ids as a set; an empty set if no discussion has
        more than one comment.
    """
    # Read raw data.
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()

    for document in document_gen:
        comment_gen = comment_generator(document=document)

        ################################################################################################################
        # Within-discussion comment and user anonymization.
        ################################################################################################################
        anonymization = within_discussion_comment_and_user_anonymization(comment_gen=comment_gen,
                                                                         extract_comment_name=extract_comment_name,
                                                                         extract_user_name=extract_user_name,
                                                                         anonymous_coward_name=anonymous_coward_name)
        # Five values come back; the two anonymizer mappings in the middle
        # are not used for the selection.
        comment_name_set, user_name_set, _, _, within_discussion_anonymous_coward = anonymization

        ################################################################################################################
        # Calculate prediction targets.
        ################################################################################################################
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            # Best effort: a discussion whose targets cannot be computed is
            # left out instead of aborting the whole scan.
            continue

        if targets["comments"] > 1:
            post_id = document["post_id"]
            post_to_targets[post_id] = targets

    if not post_to_targets:
        # Without this guard np.max would raise ValueError on empty arrays.
        return set()

    post_id_list = np.array(list(post_to_targets))

    # Accumulate the descending rank (largest value -> rank 1) of every
    # target, printing each target's maximum along the way.
    all_rank = 0
    for target_name in ("comments",
                        "users",
                        "score_wilson",
                        "controversiality_wilson"):
        values = np.array([targets[target_name]
                           for targets in post_to_targets.values()])
        all_rank = all_rank + rankdata(- values)
        print(np.max(values))

    # Rank according to all: largest rank sum first.
    post_id_list_new = post_id_list[np.argsort(all_rank)][::-1]

    # Select 500 posts (assumes split_list produces 500 chunks -- TODO
    # confirm); the last id of every chunk is kept.
    post_id_chunk_list = [chunk[-1] for chunk in split_list(list(post_id_list_new), 500)]

    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)