def get_glove_model(dimension=50):
    #convert Stanford GloVe format to gensim word2vec format, if the converted file doesn't exist
    if file_utils.verify_file(gensim_vector_filepath % dimension) == False:
        glove2word2vec(glove_input_file=(vector_filepath % dimension), word2vec_output_file=(gensim_vector_filepath % dimension))
    #load embeddings and init model (word2vec model, but init with GloVe embeddings)
    glove_model = KeyedVectors.load_word2vec_format(gensim_vector_filepath % dimension, binary=False)
    return glove_model
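#Example usage (a minimal sketch, assuming the gensim imports and filepath templates defined above;
#the probe words are arbitrary and not taken from the original code):
#   glove = get_glove_model(dimension=50)
#   print(glove.most_similar("computer", topn=5))     #5 nearest neighbors in the embedding space
#   print(glove.similarity("computer", "laptop"))     #cosine similarity between two word vectors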
def fit_all_cascades(code, cascades, comments, pickle_save, subreddit=False):
    #if all params already saved, load from that file
    if file_utils.verify_file("data_cache/fitted_params/%s_cascade_params.pkl" % code):
        cascade_params = file_utils.load_pickle("data_cache/fitted_params/%s_cascade_params.pkl" % code)
        fit_fail = [post_id for post_id, post in cascades.items() if post_id not in cascade_params]
        return cascade_params, fit_fail

    #anything to load? if so, load the latest checkpoint
    if pickle_save:
        #build glob filestring - to get all matching checkpoints
        if subreddit == False:
            filename = "data_cache/fitted_params/%s*_cascade_params.pkl" % code
        else:
            filename = "data_cache/fitted_params/%s_%s*_cascade_params.pkl" % (code, subreddit)

        #extract matching filenames and their numeric values, selecting the most complete one to load
        files = glob.glob(filename)
        best_int = -1    #count of records in best file - set to "" if a complete file is found
        for file in files:
            file_int = re.search(r'\d+', file)
            #if no number in filename, have a complete file - use that
            if file_int is None:
                best_int = ""
                break
            else:
                file_int = int(file_int.group())
                if file_int > best_int:
                    best_int = file_int

        #load checkpoint, if we have one
        if best_int != -1:
            if subreddit == False:
                cascade_params = cascade_manip.load_cascade_params(code, str(best_int))
            else:
                cascade_params = cascade_manip.load_cascade_params(code, subreddit + str(best_int))
            print("Loaded", len(cascade_params), "fitted cascade parameters")
        #otherwise, empty dictionary
        else:
            cascade_params = {}
    else:
        cascade_params = {}

    avg_quality = 0

    #fit any cascades that have not been fitted before, add to params dictionary: post_id -> params
    post_count = len(cascade_params)
    fit_fail = []
    print("Fitting all cascade models")
    for post_id, post in cascades.items():
        #if this cascade already fitted, and params are valid, skip
        if post_id in cascade_params and (cascade_params[post_id][0] != 20 and cascade_params[post_id][1] != 500 and cascade_params[post_id][2] != 2.3):
            continue

        #fit the current cascade (filtering comments to just this post is not required)
        #print("Fitting cascade", post_id)
        param_res = fit_cascade.fit_cascade_model(post, comments)
        #if negative comment times, skip this cascade and move to next
        if param_res == False:
            fit_fail.append(post_id)
            continue
        cascade_params[post_id] = param_res
        avg_quality += cascade_params[post_id][6]

        post_count += 1
        if post_count % 1000 == 0:
            print("Fitted", post_count, "cascades")
        if pickle_save and post_count % 10000 == 0:
            if subreddit == False:
                cascade_manip.save_cascade_params(code, cascade_params, str(post_count))
            else:
                cascade_manip.save_cascade_params(code, cascade_params, subreddit + str(post_count))

    avg_quality /= len(cascade_params)

    #dump params to file
    print("Fitted a total of", len(cascade_params), "cascades (average quality", str(avg_quality) + ")")
    if pickle_save:
        cascade_manip.save_cascade_params(code, cascade_params, subreddit)

    #return all params, loaded and newly fitted
    return cascade_params, fit_fail
#end fit_all_cascades
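#Example usage (a minimal sketch; the domain code "crypto", the subreddit name, and the cascades/comments
#dictionaries are illustrative placeholders built elsewhere in the pipeline):
#   cascade_params, fit_fail = fit_all_cascades("crypto", cascades, comments, pickle_save=True, subreddit="Bitcoin")
#   print("fitted", len(cascade_params), "cascades;", len(fit_fail), "failed (e.g., negative comment times)")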
#TESTING ONLY!!!!
if subreddit != sub_filter:
    continue

print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

#what is the graph limit for this subreddit?
if subreddit in sub_limits:
    max_nodes = sub_limits[subreddit]
    print("Max graph size for this subreddit:", max_nodes)
else:
    max_nodes = default_max_nodes
    print("Using default max graph size:", max_nodes)

#load preprocessed posts for this subreddit
if file_utils.verify_file(posts_filepath % subreddit):
    posts = file_utils.load_pickle(posts_filepath % subreddit)
    print("Loaded", len(posts), "processed posts from", posts_filepath % subreddit)
else:
    print("Cannot simulate for subreddit", subreddit, "without processed posts file", posts_filepath % subreddit)
    exit(0)

#find highest assigned post id for this data, so we know where to assign new ids if we need to
next_id = max([value['id'] for key, value in posts.items()]) + 1

#do we need to build a graph and infer at all? loop to find out
infer = False
infer_count = 0
all_events = []      #list for all sim events, across all seed posts
post_counter = 1     #counter of posts to simulate, across all subreddits

#process each subreddit
for subreddit, seeds in post_seeds.items():
    '''
    #TESTING ONLY!!!!
    if subreddit != "Lisk":
        continue
    '''

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #if we have a cached graph, load and use that instead of rebuilding
    if file_utils.verify_file("graph_cache/%s_post_graph.pkl" % subreddit) and file_utils.verify_file("graph_cache/%s_user_ids.pkl" % subreddit):
        print("Loading post graph from graph_cache/%s_post_graph.pkl and user id list from graph_cache/%s_user_ids.pkl" % (subreddit, subreddit))
        sub_graph = file_utils.load_pickle("graph_cache/%s_post_graph.pkl" % subreddit)
        user_ids = file_utils.load_pickle("graph_cache/%s_user_ids.pkl" % subreddit)
        print("Loaded graph has", sub_graph.graph.number_of_nodes(), "nodes and", sub_graph.graph.size(), "edges")

    #no cached graph - build from raw posts and params
    else:
        #load subreddit posts (don't need the comments!)
        raw_sub_posts = cascade_manip.load_filtered_posts(domain, subreddit)
        #load subreddit parameters
        raw_sub_params = cascade_manip.load_cascade_params(domain, subreddit)

        #filter posts - TESTING ONLY!!!!!!!! - if you didn't load all the params
        '''
        raw_sub_posts = {post_id : post for post_id, post in sub_posts.items() if post_id in sub_params}
index = 7 if "baseline" in model else 5

#process each file
for results_file in results_files:
    #what testing size is this file for?
    file_tokens = results_file.split('_')
    file_test_size = int(file_tokens[index][:-4])
    file_run = int(file_tokens[index + 2][-1])

    #if test size larger than target, process this file (and its associated pkl)
    if file_test_size > testing_num:
        #if bookmark and results already exist at target size for this run, skip reduction
        if file_utils.verify_file(bookmark_format % (subreddit, subreddit, model, training_num,
                    testing_num, testing_start_year, testing_start_month, file_run)) and file_utils.verify_file(
                    results_format % (subreddit, subreddit, model, training_num, testing_num,
                    testing_start_year, testing_start_month, file_run)):
            print(" target exists, skipping", results_file)
            continue

        print(" reducing", results_file)

        #get corresponding bookmark filename
        bookmark_file = bookmark_format % (subreddit, subreddit, model, training_num, file_test_size,
                    testing_start_year, testing_start_month, file_run)
#optional command line argument to limit to a single subreddit
if len(sys.argv) == 2:
    subreddit_filter = sys.argv[1]

#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl"      #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"       #processed post data for each post, one file per subreddit
                                                        #each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"    #text file of fitted cascade params, one file per subreddit
                                                        #one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"      #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"       #list of users seen in posts/comments, one file per subreddit

#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
    print("Loading subreddit list from", subreddits_filepath)
    subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json("results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json("results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json("results/cyber_post_subreddit_dist.json")
    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([set(crypto_subreddit_dist.keys()), set(cve_subreddit_dist.keys()),
raw_comments = {}

#build/load cascades (auto-load as a result, either raw data or cached cascades)
cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(code, raw_stories, raw_comments)
#optional: filter out cascades with any missing elements (posts or comments)
cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

print("\nHave total of", len(cascades), "cascades and", len(comments), "comments for hackernews")

#build processed post file
#assign numeric ids to each post for node2vec input files
#get set of tokens
#extract and maintain user
if file_utils.verify_file(posts_filepath % code):
    print("Processed post file already exists.")
    posts = file_utils.load_pickle(posts_filepath % code)
else:
    c = count()
    posts = {key: {'user': value['author_h'], 'tokens': extract_tokens(value), 'id': next(c)} for key, value in cascades.items()}
    #save this to file
    file_utils.save_pickle(posts, posts_filepath % code)
    print("Saved", len(posts), "processed posts to", posts_filepath % code)
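#Illustrative shape of one processed post entry (field names come from the comprehension above;
#the key and values shown here are placeholders, not real data):
#   posts["<original post id>"] = {'user': "<hashed author>", 'tokens': {"token1", "token2"}, 'id': 0}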
def graph_infer(sim_post, sim_post_id, group, max_nodes, min_node_quality, estimate_initial_params):
    print("Inferring post parameters from post graph")

    #load preprocessed posts for this group
    if file_utils.verify_file(posts_filepath % group):
        posts = file_utils.load_pickle(posts_filepath % group)
        print("Loaded", len(posts), "processed posts from", posts_filepath % group)
    else:
        print("Cannot simulate for group", group, "without processed posts file", posts_filepath % group)
        exit(0)

    #if seed post not in posts file - we're gonna have a bad time
    if sim_post['id_h'] not in posts:
        print("Simulation post not in dataset - exiting\n")
        exit(0)

    #grab numeric/graph id of sim post
    numeric_sim_post_id = posts[sim_post_id]['id']

    #load in fitted simulation params - need these for graph build
    fitted_params, fitted_quality = functions_hybrid_model.load_params(params_filepath % group, posts, False, True)

    #remove sim post from graph params - no cheating! (pop based on numeric id)
    res = fitted_params.pop(numeric_sim_post_id)
    res = fitted_quality.pop(numeric_sim_post_id)

    #graph stuff - sample graph if necessary, add new nodes, etc
    graph = {}
    isolated_nodes = []
    added_count = 0

    #do we need to sample/process the graph? sample if the whole graph is too big, we're imposing a min node quality,
    #we need to estimate initial params, or we don't have a precomputed graph file
    if (max_nodes != None and len(posts) > max_nodes) or file_utils.verify_file(graph_filepath % group) == False or min_node_quality != None or estimate_initial_params:
        #only sample down if we actually have to
        if max_nodes != None:
            print("\nSampling graph to", max_nodes, "nodes")
            #sample down posts
            graph_posts = user_sample_graph(posts, [sim_post], max_nodes, group, min_node_quality, fitted_quality)
        #otherwise, use them all
        else:
            graph_posts = posts

        #build graph, getting initial param estimate if required
        if estimate_initial_params:
            estimated_params = functions_hybrid_model.build_graph_estimate_node_params(graph_posts, fitted_params, fitted_quality, numeric_sim_post_id, temp_graph_filepath % group)
        else:
            functions_hybrid_model.build_graph(graph_posts, temp_graph_filepath % group)

    #no graph sampling/processing, use the full set and copy graph file to temp location
    else:
        graph_posts = posts
        copyfile(graph_filepath % group, temp_graph_filepath % group)
        print("Copied complete post-graph to", temp_graph_filepath % group)

    #ALWAYS sample down params to match whatever graph we have - because we can't use the previously fitted params!
    if estimate_initial_params:
        functions_hybrid_model.get_graph_params(graph_posts, numeric_sim_post_id, fitted_params, fitted_quality, temp_params_filepath % group, estimated_params)
    else:
        functions_hybrid_model.get_graph_params(graph_posts, numeric_sim_post_id, fitted_params, fitted_quality, temp_params_filepath % group)

    #graph is built and ready - graph file and input params file

    #run node2vec to get embeddings - if we have to infer parameters
    #offload to C++, because I feel the need... the need for speed!
    if file_utils.verify_file(output_params_filepath % group):
        os.remove(output_params_filepath % group)       #clear output to prevent append

    #run node2vec on graph and params
    subprocess.check_call(["./c_node2vec/examples/node2vec/node2vec",
                           "-i:" + (temp_graph_filepath % group),
                           "-ie:" + (temp_params_filepath % group),
                           "-o:" + (output_params_filepath % group),
                           "-d:6", "-l:3", "-w", "-s", "-otf"])
    print("")

    #load the inferred params (dictionary of numeric id -> params)
    all_inferred_params = functions_hybrid_model.load_params(output_params_filepath % group, posts, inferred=True)
    inferred_params = all_inferred_params[numeric_sim_post_id]

    return inferred_params
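#Example usage (a minimal sketch; the group name and argument values are illustrative, and seed_post is
#assumed to be a post dictionary loaded elsewhere):
#   inferred = graph_infer(seed_post, seed_post['id_h'], "Bitcoin", max_nodes=2000,
#                          min_node_quality=None, estimate_initial_params=False)
#   print("inferred params for seed post:", inferred)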
    post_seeds[post['subreddit']].append(post)

print({key: len(post_seeds[key]) for key in post_seeds})

post_counter = 1     #counter of posts to simulate, across all subreddits

#process each subreddit
for subreddit, seeds in post_seeds.items():
    #TESTING ONLY!!!!
    if subreddit != "Lisk":
        continue

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #load preprocessed posts for this subreddit
    if file_utils.verify_file(posts_filepath % subreddit):
        posts = file_utils.load_pickle(posts_filepath % subreddit)
        print("Loaded", len(posts), "processed posts from", posts_filepath % subreddit)
    else:
        print("Cannot simulate for subreddit", subreddit, "without processed posts file", posts_filepath % subreddit)
        exit(0)

    #correct key format of seed posts, if necessary
    for seed_post in seeds:
        #if post id contains the t3_ prefix, strip it off so we don't have to change everything
        if seed_post['id_h'].startswith(POST_PREFIX):
            seed_post['id_h'] = seed_post['id_h'][3:]

    #load fitted params for this subreddit
#get correct index of test size token, based on model name
index = 7 if "baseline" in model else (4 if model == "comparative" else 5)

#process each file
for results_file in results_files:
    #what testing size is this file for?
    file_tokens = results_file.split('_')
    file_test_size = int(file_tokens[index][:-4])
    file_run = int(file_tokens[index + 2][-1])

    #if test size larger than target, process this file (and its associated pkl)
    if file_test_size > testing_num:
        #if bookmark and results already exist at target size for this run, skip reduction
        if (model == "comparative" and file_utils.verify_file(
                    comparative_bookmark_format % (subreddit, subreddit, model, testing_num,
                    testing_start_year, testing_start_month, file_run))
                and file_utils.verify_file(
                    comparative_results_format % (subreddit, subreddit, model, testing_num,
                    testing_start_year, testing_start_month, file_run))
                ) or (file_utils.verify_file(
                    bookmark_format % (subreddit, subreddit, model, training_num, testing_num,
                    testing_start_year, testing_start_month, file_run))
                and file_utils.verify_file(
                    results_format % (subreddit, subreddit, model, training_num, testing_num,
                    testing_start_year, testing_start_month, file_run))):
            print(" target exists, skipping", results_file)
            continue