def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Select a spread of post ids covering the full popularity range.

    Each discussion with more than one comment is ranked on four prediction
    targets (comment count, user count, Wilson score, Wilson
    controversiality).  The four ranks are summed, posts are ordered by the
    combined rank, and one post id is sampled from the tail of each chunk of
    500, so the kept posts span the whole ranking.

    Parameters
    ----------
    raw_data_file_path : path to the raw data file; fed to document_generator.
    anonymous_coward_name : user name that marks anonymous posters.

    Returns
    -------
    set of post ids to keep.

    Side effects: prints the maximum of each target and the targets of every
    kept post (diagnostic output preserved from the original implementation).
    """
    document_gen = document_generator([raw_data_file_path])

    post_to_targets = dict()
    for document in document_gen:
        comment_gen = comment_generator(document=document)

        # Within-discussion comment and user anonymization.
        comment_name_set, \
            user_name_set, \
            within_discussion_comment_anonymize, \
            within_discussion_user_anonymize, \
            within_discussion_anonymous_coward = \
            within_discussion_comment_and_user_anonymization(
                comment_gen=comment_gen,
                extract_comment_name=extract_comment_name,
                extract_user_name=extract_user_name,
                anonymous_coward_name=anonymous_coward_name)

        # Calculate prediction targets; skip documents with missing fields.
        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            continue

        # Keep only discussions that attracted more than one comment.
        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    post_id_list = list()
    comments_list = list()
    users_list = list()
    score_list = list()
    controversiality_list = list()
    for post_id, targets in post_to_targets.items():
        post_id_list.append(post_id)
        comments_list.append(targets["comments"])
        users_list.append(targets["users"])
        score_list.append(targets["score_wilson"])
        controversiality_list.append(targets["controversiality_wilson"])

    post_id_list = np.array(post_id_list)
    comments_list = np.array(comments_list)
    users_list = np.array(users_list)
    score_list = np.array(score_list)
    controversiality_list = np.array(controversiality_list)

    # Rank each target descending (rank 1 == largest value).  The per-target
    # argsorts of the original were dead code: only the ranks feed the
    # combined ordering, and np.max does not require sorted input.
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))

    users_rank = rankdata(-users_list)
    print(np.max(users_list))

    score_rank = rankdata(-score_list)
    print(np.max(score_list))

    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Combine the four rankings and order post ids best-first (smallest
    # combined rank last after argsort, hence the reversal).
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    i = np.argsort(all_rank)
    post_id_list_new = post_id_list[i][::-1]

    # Sample one post from the tail of each chunk of 500.
    post_id_chunk_list = [chunk[-1]
                          for chunk in split_list(list(post_id_list_new), 500)]
    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
def decide_posts_to_keep(raw_data_file_path, anonymous_coward_name):
    """Return a set of post ids sampled across the popularity spectrum.

    Ranks every multi-comment discussion on four targets (comments, users,
    Wilson score, Wilson controversiality), sums the ranks, sorts posts by
    the combined rank, and keeps the last post of every 500-post chunk.

    NOTE(review): this function is defined twice in this file with identical
    behavior; the later definition shadows the earlier one.  Consider
    deleting one copy.

    Parameters
    ----------
    raw_data_file_path : path to the raw data file read by document_generator.
    anonymous_coward_name : user name identifying anonymous posters.

    Returns
    -------
    set of kept post ids.  Prints per-target maxima and the targets of each
    kept post as diagnostics.
    """
    # Gather prediction targets for every usable discussion.
    post_to_targets = dict()
    for document in document_generator([raw_data_file_path]):
        comment_gen = comment_generator(document=document)

        # Within-discussion anonymization of comment and user names.
        (comment_name_set,
         user_name_set,
         within_discussion_comment_anonymize,
         within_discussion_user_anonymize,
         within_discussion_anonymous_coward) = \
            within_discussion_comment_and_user_anonymization(
                comment_gen=comment_gen,
                extract_comment_name=extract_comment_name,
                extract_user_name=extract_user_name,
                anonymous_coward_name=anonymous_coward_name)

        try:
            targets = calculate_targets(document,
                                        comment_name_set,
                                        user_name_set,
                                        within_discussion_anonymous_coward)
        except KeyError:
            # Document lacks a required field; skip it.
            continue

        if targets["comments"] > 1:
            post_to_targets[document["post_id"]] = targets

    # Columnize the targets.
    post_id_list = np.array(list(post_to_targets.keys()))
    comments_list = np.array(
        [t["comments"] for t in post_to_targets.values()])
    users_list = np.array(
        [t["users"] for t in post_to_targets.values()])
    score_list = np.array(
        [t["score_wilson"] for t in post_to_targets.values()])
    controversiality_list = np.array(
        [t["controversiality_wilson"] for t in post_to_targets.values()])

    # Descending rank per target; print each maximum as a diagnostic.
    comments_rank = rankdata(-comments_list)
    print(np.max(comments_list))
    users_rank = rankdata(-users_list)
    print(np.max(users_list))
    score_rank = rankdata(-score_list)
    print(np.max(score_list))
    controversiality_rank = rankdata(-controversiality_list)
    print(np.max(controversiality_list))

    # Order posts by the summed rank, best first.
    all_rank = comments_rank + users_rank + score_rank + controversiality_rank
    post_id_list_new = post_id_list[np.argsort(all_rank)][::-1]

    # Keep the last post id of each chunk of 500.
    post_id_chunk_list = [chunk[-1]
                          for chunk in split_list(list(post_id_list_new), 500)]
    for post_id in post_id_chunk_list:
        print(post_to_targets[post_id])

    return set(post_id_chunk_list)
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Build a JSON-serializable snapshot history for one discussion.

    Comments are replayed in a safe order; for each lifetime threshold in
    ``lifetime_list`` a snapshot (comment tree, user graph, handcrafted
    feature row) is appended once the discussion's age reaches the threshold.

    Returns the discussion dict, or None whenever the document is malformed
    (missing keys, broken comment stream, bad timestamps, graph update
    failure).

    NOTE(review): ``timestamp_df`` is accepted but never read in this body —
    confirm whether it can be dropped at the call sites.
    """
    discussion_json = dict()
    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()

    comment_gen = comment_generator(document=document)

    # Map raw comment/user names to within-discussion ids; also identify the
    # id assigned to the anonymous-coward user, if present.
    comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = \
        within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

    # Prediction targets; a KeyError signals a document missing required
    # fields, in which case the whole discussion is discarded.
    try:
        discussion_json["prediction_targets"] = calculate_targets(
            document,
            comment_name_set,
            user_name_set,
            within_discussion_anonymous_coward)
    except KeyError as e:
        return None

    # Generator yielding the comments in a safe (parent-before-child) order.
    try:
        safe_comment_gen = safe_comment_generator(
            document=document,
            comment_generator=comment_generator,
            within_discussion_comment_anonymize=within_discussion_comment_anonymize,
            extract_comment_name=extract_comment_name,
            extract_parent_comment_name=extract_parent_comment_name,
            extract_timestamp=extract_timestamp,
            safe=True)
    except TypeError:
        return None

    # The first yielded item is the initial (original) post.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None

    # Is the original poster the anonymous user?
    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False

    comment_counter = 0

    # Timestamp array sized for all comments plus the initial post.
    timestamp_column_names_list,\
        timestamp_array = initialize_timestamp_array(
            discussion_json["prediction_targets"]["comments"] + 1,
            cascade_source_timestamp=timestamp)

    # Intermediate bookkeeping carried across comment updates.
    intermediate_dict = initialize_intermediate(
        comment_name_set,
        user_name_set,
        timestamp,
        within_discussion_anonymous_coward,
        op_is_anonymous=op_is_anonymous)

    # Sparse adjacency structures: comment tree (int8) and user graph (int32).
    comment_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)),
                                   dtype=np.int8)
    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)),
                                 dtype=np.int32)

    current_lifetime = 0.0
    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Consume comments until the discussion age reaches this lifetime
        # threshold (or the stream ends), then emit one snapshot.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # Stream exhausted: snapshot the final state for this (and
                # implicitly any remaining) lifetime threshold.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                return None

            comment_counter += 1

            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            # Grow the comment tree and user graph with this comment; a
            # RuntimeError from the helper aborts the whole discussion.
            try:
                discussion_tree,\
                    user_graph,\
                    comment_id,\
                    parent_comment_id,\
                    commenter_id,\
                    parent_commenter_id,\
                    user_graph_modified,\
                    parent_commenter_is_anonymous,\
                    comment_id_to_user_id = update_discussion_and_user_graphs(
                        comment=comment,
                        extract_comment_name=extract_comment_name,
                        extract_parent_comment_name=extract_parent_comment_name,
                        extract_user_name=extract_user_name,
                        discussion_tree=comment_tree,
                        user_graph=user_graph,
                        within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                        within_discussion_user_anonymize=within_discussion_user_anonymize,
                        within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                        comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])

                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            # Inter-arrival time between this comment and the previous one.
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter - 1, 1]

            try:
                intermediate_dict,\
                    comment_depth = update_intermediate(discussion_tree,
                                                        user_graph,
                                                        intermediate_dict,
                                                        commenter_is_anonymous,
                                                        parent_commenter_is_anonymous,
                                                        comment_id,
                                                        parent_comment_id,
                                                        commenter_id,
                                                        parent_commenter_id,
                                                        user_graph_modified,
                                                        timestamp,
                                                        timestamp_difference)
            except RuntimeError:
                return None

            # Discussion age relative to the initial post's timestamp.
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break

    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]

    return discussion_json
def make_discussion_json(document, timestamp_df, handcrafted_df, lifetime_list, anonymous_coward_name):
    """Construct the per-discussion JSON object with graph snapshots.

    Replays the discussion's comments in safe order and, for every lifetime
    threshold in ``lifetime_list``, records a snapshot of the comment tree,
    user graph and handcrafted feature row once the discussion age passes
    the threshold.  Returns None on any malformed input (missing document
    keys, broken generator, invalid timestamps, failed graph update).

    NOTE(review): duplicate of an earlier definition in this file; this later
    copy shadows the earlier one.  ``timestamp_df`` is never read here —
    verify it is needed before keeping the parameter.
    """
    discussion_json = dict()
    discussion_json["post_url"] = get_post_url(document)
    discussion_json["post_title"] = get_post_title(document)
    # discussion_json["snapshot_timestamps"] = [repr(float(snapshot_timestamp)) for snapshot_timestamp in lifetime_list]
    discussion_json["graph_snapshots"] = list()

    comment_gen = comment_generator(document=document)

    # Within-discussion anonymization: raw names -> integer ids, plus the id
    # of the anonymous-coward user (if any).
    comment_name_set,\
        user_name_set,\
        within_discussion_comment_anonymize,\
        within_discussion_user_anonymize,\
        within_discussion_anonymous_coward = \
        within_discussion_comment_and_user_anonymization(
            comment_gen=comment_gen,
            extract_comment_name=extract_comment_name,
            extract_user_name=extract_user_name,
            anonymous_coward_name=anonymous_coward_name)

    # Targets to predict; KeyError means required fields are missing.
    try:
        discussion_json["prediction_targets"] = calculate_targets(
            document,
            comment_name_set,
            user_name_set,
            within_discussion_anonymous_coward)
    except KeyError as e:
        return None

    # Comments in parent-before-child order.
    try:
        safe_comment_gen = safe_comment_generator(
            document=document,
            comment_generator=comment_generator,
            within_discussion_comment_anonymize=within_discussion_comment_anonymize,
            extract_comment_name=extract_comment_name,
            extract_parent_comment_name=extract_parent_comment_name,
            extract_timestamp=extract_timestamp,
            safe=True)
    except TypeError:
        return None

    # First item is the original post itself.
    try:
        initial_post = next(safe_comment_gen)
    except TypeError:
        return None
    try:
        timestamp = extract_timestamp(initial_post)
    except TypeError:
        return None

    # Flag whether the original poster is the anonymous user.
    op_raw_id = extract_user_name(initial_post)
    op_id = within_discussion_user_anonymize[op_raw_id]
    if op_id == within_discussion_anonymous_coward:
        op_is_anonymous = True
    else:
        op_is_anonymous = False

    comment_counter = 0

    # One timestamp row per comment plus one for the initial post.
    timestamp_column_names_list,\
        timestamp_array = initialize_timestamp_array(
            discussion_json["prediction_targets"]["comments"] + 1,
            cascade_source_timestamp=timestamp)

    # Running state threaded through the per-comment updates below.
    intermediate_dict = initialize_intermediate(
        comment_name_set,
        user_name_set,
        timestamp,
        within_discussion_anonymous_coward,
        op_is_anonymous=op_is_anonymous)

    # Sparse adjacency matrices for the comment tree and the user graph.
    comment_tree = spsp.dok_matrix((len(comment_name_set), len(comment_name_set)),
                                   dtype=np.int8)
    user_graph = spsp.dok_matrix((len(user_name_set), len(user_name_set)),
                                 dtype=np.int32)

    current_lifetime = 0.0
    # lifetime_list.append(np.inf)
    for lifetime_counter, lifetime in enumerate(lifetime_list):
        # Advance through comments until this lifetime threshold is reached
        # or the stream ends; either way append exactly one snapshot.
        while True:
            try:
                comment = next(safe_comment_gen)
            except TypeError:
                return None
            except StopIteration:
                # No more comments: snapshot the final discussion state.
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break
            if comment is None:
                return None

            comment_counter += 1

            commenter_name = extract_user_name(comment)
            if commenter_name is None:
                commenter_is_anonymous = True
            else:
                commenter_is_anonymous = False

            # Insert this comment into both graphs; RuntimeError aborts the
            # discussion entirely.
            try:
                discussion_tree,\
                    user_graph,\
                    comment_id,\
                    parent_comment_id,\
                    commenter_id,\
                    parent_commenter_id,\
                    user_graph_modified,\
                    parent_commenter_is_anonymous,\
                    comment_id_to_user_id = update_discussion_and_user_graphs(
                        comment=comment,
                        extract_comment_name=extract_comment_name,
                        extract_parent_comment_name=extract_parent_comment_name,
                        extract_user_name=extract_user_name,
                        discussion_tree=comment_tree,
                        user_graph=user_graph,
                        within_discussion_comment_anonymize=within_discussion_comment_anonymize,
                        within_discussion_user_anonymize=within_discussion_user_anonymize,
                        within_discussion_anonymous_coward=within_discussion_anonymous_coward,
                        comment_id_to_user_id=intermediate_dict["comment_id_to_user_id"])

                intermediate_dict["comment_id_to_user_id"] = comment_id_to_user_id
            except RuntimeError:
                return None

            try:
                timestamp = extract_timestamp(comment)
            except TypeError:
                return None
            update_timestamp_array(timestamp_column_names_list,
                                   timestamp_array,
                                   timestamp,
                                   comment_counter)
            # Time elapsed since the previous comment.
            timestamp_difference = timestamp_array[comment_counter, 1] - timestamp_array[comment_counter-1, 1]

            try:
                intermediate_dict,\
                    comment_depth = update_intermediate(discussion_tree,
                                                        user_graph,
                                                        intermediate_dict,
                                                        commenter_is_anonymous,
                                                        parent_commenter_is_anonymous,
                                                        comment_id,
                                                        parent_comment_id,
                                                        commenter_id,
                                                        parent_commenter_id,
                                                        user_graph_modified,
                                                        timestamp,
                                                        timestamp_difference)
            except RuntimeError:
                return None

            # Age of the discussion measured from the initial post.
            current_lifetime = timestamp_array[comment_counter, 1] - timestamp_array[0, 1]
            if current_lifetime >= lifetime:
                # Read features.
                # handcrafted_df_row = handcrafted_df[feature_list]
                handcrafted_df_row = handcrafted_df.iloc[comment_counter]

                time_step_json = make_time_step_json(current_lifetime,
                                                     comment_tree,
                                                     user_graph,
                                                     timestamp_array[comment_counter, 1],
                                                     handcrafted_df_row)
                discussion_json["graph_snapshots"].append(time_step_json)
                break

    discussion_json["post_timestamp"] = timestamp_array[0, 1]
    # discussion_json["final_comment_tree_size"] = discussion_json["prediction_targets"]["comments"] + 1
    # discussion_json["final_user_graph_size"] = discussion_json["prediction_targets"]["users"]

    return discussion_json