def load_subreddit_cascades(subreddit, domain, cascades, comments):
    """Load the filtered cascades/comments for one subreddit, building them if needed.

    Parameters:
        subreddit: subreddit name to filter cascades to
        domain: data domain identifier used for loading, building, and caching
        cascades: previously loaded raw cascades, or None to load on demand
        comments: previously loaded raw comments, or None to load on demand

    Returns:
        (filtered_cascades, filtered_comments) for this subreddit.
    """
    # load filtered cascades for this subreddit, if they were cached earlier
    filtered_cascades, filtered_comments = cascade_manip.load_filtered_cascades(domain, subreddit)

    # don't exist, filter them now
    if filtered_cascades == False:
        # have we loaded the raw cascades/comments yet? if not, do it now
        # (waiting until now in case we have all the filtered versions and don't need these at all)
        if cascades == None or comments == None:
            # build/load cascades (auto-load as a result, either raw data or cached cascades)
            cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(domain)
            # optional: filter out cascades with any missing elements (posts or comments)
            cascades, comments = cascade_manip.remove_missing(domain, cascades, comments)

        # filter cascades by a particular subreddit
        filtered_cascades = cascade_manip.filter_cascades_by_subreddit(cascades, subreddit)
        # and filter comments to match those posts
        filtered_cascades, filtered_comments = cascade_manip.filter_comments_by_posts(filtered_cascades, comments)
        # save these filtered posts/comments for easier loading later
        # BUG FIX: the original passed the global `code` here (undefined in this
        # function's scope); use the `domain` parameter, consistent with the
        # build_cascades/remove_missing calls above
        cascade_manip.save_cascades(domain, filtered_cascades, subreddit)
        cascade_manip.save_comments(domain, filtered_comments, subreddit)

    return filtered_cascades, filtered_comments
# have we loaded the raw cascades/comments yet? if not, do it now
# (waiting until now in case we have all the filtered versions and don't need these at all)
if cascades == None or comments == None:
    # build/load cascades (auto-load as a result, either raw data or cached cascades)
    cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(code)
    # optional: filter out cascades with any missing elements (posts or comments)
    cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

# filter cascades by a particular subreddit
filtered_cascades = cascade_manip.filter_cascades_by_subreddit(cascades, subreddit)
# and filter comments to match those posts
filtered_cascades, filtered_comments = cascade_manip.filter_comments_by_posts(filtered_cascades, comments)
# save these filtered posts/comments for easier loading later
cascade_manip.save_cascades(code, filtered_cascades, subreddit)
cascade_manip.save_comments(code, filtered_comments, subreddit)

# fit params to all of the filtered cascades, loading checkpoints if they exist
all_params = cascade_analysis.fit_all_cascades(code, filtered_cascades, filtered_comments, pickle_save, subreddit)

# if not saving to pickle, save to text file now
if pickle_save == False:
    with open("data_cache/txt_params/%s_params.txt" % subreddit, "w") as f:
        # one line per post: post id followed by its fitted parameter values
        for post_id, params in all_params.items():
            f.write(post_id + " ")
            for i in range(len(params)):
                # NOTE(review): source was truncated mid-loop here; writing the
                # space-separated param values (newline after the last) is the
                # apparent intent — confirm against the original script
                f.write(str(params[i]) + (" " if i < len(params) - 1 else "\n"))
# running totals for simulation bookkeeping (presumably accumulated later in
# this script — the accumulation code is not visible here)
total_update_time = 0
total_insert_count = 0
total_insert_time = 0
total_remove_count = 0
total_remove_time = 0
total_match_count = 0

# process all posts (or just one, if doing that)
print("Processing", len(sim_post_id_list), "post", "s" if len(sim_post_id_list) > 1 else "")
for sim_post_id in sim_post_id_list:
    # pull out just the post (and associated comments) we care about
    sim_post = raw_posts[sim_post_id]
    # and filter to comments
    junk, post_comments = cascade_manip.filter_comments_by_posts({sim_post_id: sim_post}, raw_comments, False)
    if batch == False:
        print("Simulation post has", len(post_comments), "comments\n")

    # GRAPH INFER
    inferred_params = functions_paper_model.graph_infer(sim_post, sim_post_id, group, max_nodes, min_node_quality, estimate_initial_params)
    # inferred_params = [1.73166, 0.651482, 1.08986, 0.762604, 2.49934, 0.19828]  # placeholder if skipping the infer
    if batch == False:
        print("Inferred params:", inferred_params, "\n")

    # REFINE PARAMS - for partial observed trees
    partial_fit_params = fit_partial_cascade.fit_partial_cascade(sim_post, post_comments, time_observed, inferred_params, display=False)
    if batch == False:
        # NOTE(review): source was truncated at this point — the original most
        # likely printed partial_fit_params (mirroring the prints above);
        # placeholder keeps the block syntactically valid. Confirm and restore.
        pass
# load cascades and fit params to all of them, loading checkpoints if they exist
'''
cascades, comments = cascade_manip.load_filtered_cascades(code, subreddit)  #load posts + comments
cascade_analysis.fit_all_cascades(code, cascades, comments, subreddit)      #load saved fits, and fit the rest
'''

# or, load specific saved cascade params from file
cascades, comments = cascade_manip.load_filtered_cascades(code, subreddit)  # load posts + comments
cascade_params = cascade_manip.load_cascade_params(code, subreddit + "100")

# filter cascades/comments to fitted posts (for testing)
cascades = {post_id: post for post_id, post in cascades.items() if post_id in cascade_params}
print("Filtered to", len(cascades), "posts with fitted parameters")
# NOTE(review): return value discarded here — comments are not actually
# reduced unless the helper mutates its arguments; verify that is intended
cascade_manip.filter_comments_by_posts(cascades, comments)

# pull out one random cascade from those loaded for testing, remove from all cascades
test_post_id = "WYNap8ZYQ6kc0lKZRAX3tA"  # "BitkI6YOhOueIKiphn5okA" #"kRl5UtFpGFEaAQ374AREfw" #random.choice(list(cascades.keys()))
test_post = cascades.pop(test_post_id)
test_post_params = cascade_params.pop(test_post_id)
print("Random post:", test_post_id, "\n " + test_post['title_m'], "\n  ", test_post_params)

# build a ParamGraph for set of posts
pgraph = ParamGraph()
pgraph.build_graph(cascades, cascade_params)
# file_utils.save_pickle(pgraph, "class_pickle_test.pkl")  # save class instance for later
# posts whose parameter fit failed — these get dropped below
fit_fail = set(file_utils.load_json("model_files/params/hackernews_failed_param_fit.txt"))

# load hackernews cascades
posts = file_utils.load_pickle("data_cache/hackernews_cascades/hackernews_cascade_posts.pkl")
comments = file_utils.load_pickle("data_cache/hackernews_cascades/hackernews_cascade_comments.pkl")
print("Loaded", len(posts), "posts and", len(comments), "comments")

# remove missing
posts, comments = cascade_manip.remove_missing(code, posts, comments)

# remove posts for which the fit failed, then drop their comments too
posts = {key: value for (key, value) in posts.items() if key not in fit_fail}
posts, comments = cascade_manip.filter_comments_by_posts(posts, comments)
print("Down to", len(posts), "posts and", len(comments), "comments")

# filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"  # domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"  # domain and subreddit comments

# save to same place as other filtered cascades - use hackernews as domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))

# add hackernews to subreddit -> domain mapping
subs = file_utils.load_pickle("model_files/subreddits.pkl")
subs['hackernews_sub'] = 'cve'  # redirect existing cve sub for this file - shouldn't be a problem
subs[code] = code