def load_group_data(group):
    #read group -> domain mapping for later file loads
    domain_mapping = file_utils.load_pickle(domain_mapping_filepath)
    if group not in domain_mapping:
        print(group, "not in domain mapping - exiting.\n")
        exit(0)
    domain = domain_mapping[group]

    #load cascades and comments for this group
    print("")
    raw_posts = file_utils.load_pickle(cascades_filepath % (domain, group))
    raw_comments = file_utils.load_pickle(comments_filepath % (domain, group))
    print("Loaded", len(raw_posts), "posts and", len(raw_comments), "comments\n")

    return raw_posts, raw_comments
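#NOTE (editorial sketch): the snippets in this section all rely on a file_utils
#module that is not shown here. A minimal sketch of the two pickle helpers they
#assume, using the argument order seen in the calls above (data first for
#save_pickle), might look like the following - this is a guess at the
#implementation, not the project's actual code.
import pickle

def load_pickle(filename):
    #read and return whatever object was pickled into filename
    with open(filename, "rb") as f:
        return pickle.load(f)

def save_pickle(data, filename):
    #pickle data to filename, overwriting any existing file
    with open(filename, "wb") as f:
        pickle.dump(data, f)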
def replace_links(self):
    """
    Find small files, recover original link from google redirect notice
    and update link in DB and crawl links again
    """
    files = fx.get_fnames(self.directory)
    if 'errors' in files:  # remove log file
        files.remove('errors')
    ids = []
    ids.extend([
        f for f in files
        if os.path.getsize(self.directory + '\\' + f + '.pkl') < 10000
    ])
    links = []
    for id in tqdm(ids):
        doc = fx.load_pickle(self.directory + '\\' + str(id) + '.pkl')
        links.append(rx.get_url(doc['text']))
    data = list(zip(ids, links))
    df = pd.DataFrame(data, columns=['id', 'real'])
    print(df)
    pdx.df_update_sql_field(self.db, self.table, 'id', 'real', df, 'TEXT')
    df = df[~df['real'].isnull()]
    for ix, row in df.iterrows():
        self.get_article(row['real'], row['id'])
def simulate_comment_tree(sim_post, sim_params, group, sim_comments, time_observed):
    print("\nSimulating comment tree")

    #load active users list to draw from when assigning users to comments
    user_ids = file_utils.load_pickle(users_filepath % group)

    #simulate tree structure + comment times!
    print("Post created at", sim_post['created_utc'] / 60.0)

    #simulate from partially observed tree
    if time_observed != 0:
        #get alternate structure of observed tree
        observed_tree = convert_comment_tree(sim_post, sim_comments, time_observed)
        #simulate from this observed tree
        sim_root, all_times = sim_tree.simulate_comment_tree(sim_params, time_observed * 60, observed_tree)
    #simulate entirely new tree from root only
    else:
        sim_root, all_times = sim_tree.simulate_comment_tree(sim_params)

    #convert that to desired output format
    sim_events = functions_hybrid_model.build_cascade_events(sim_root, sim_post, user_ids, group)
    #sort list of events by time
    sim_events = sorted(sim_events, key=lambda k: k['nodeTime'])

    print("Generated", len(sim_events) - 1, "total comments for post", sim_post['id_h'], "(including observed)")
    print(" ", len(sim_comments), "actual\n")

    return sim_events, sim_root    #return events list, and dictionary format of simulated tree
def clean_file(self, fname, field, remove, **kwargs):
    media = ('Media', 'Video', 'Image', 'Search', 'Sorry')
    attributes = [
        'caption', 'copyright', 'playback', 'episode', 'iPlayer', 'radio', 'BBC2'
    ]
    doc = fx.load_pickle(self.directory + '\\' + fname)
    lines = doc[field]
    # print(lines)
    if 'split' in kwargs:
        # drop lines that look like media/player boilerplate
        lines = [
            line for line in lines.split('\n')
            if not line.startswith(media) or not any(x in line.split() for x in attributes)
        ]
        doc[field] = '\n'.join(lines)
    if 'clean' in kwargs:
        if remove in lines:
            # unwanted marker text found: blank the whole field
            doc[field] = ''
        else:
            doc[field] = lines.replace(remove, '')
    # print(doc['text'])
    fx.save_pickle(self.directory + '\\' + fname, doc)
def clean_directory(self, **kwargs):
    """
    Remove files smaller than ~10 kB and/or from other domains
    """
    files = fx.get_fnames(self.directory)
    if 'errors' in files:  # remove log file
        files.remove('errors')
    files_to_remove = []
    if 'clear_small' in kwargs:
        ids = [
            f for f in files
            if os.path.getsize(self.directory + '\\' + f + '.pkl') < 10000
        ]
        files_to_remove.extend(
            [self.directory + '\\' + f + '.pkl' for f in ids])
        dbx.delete_rows(self.db, self.table, 'id', ids)
    if len(files_to_remove) > 0:
        fx.delete_files(files_to_remove)
    # verify if links correspond to domain
    if 'save' in kwargs:
        data = []
        for id in tqdm(files):
            doc = fx.load_pickle(self.directory + '\\' + id + '.pkl')
            link = doc['canonical_link']
            data.append((id, link))
        df = pd.DataFrame(data, columns=['id', 'real'])
        pdx.save_to_csv(df, os.path.join(self.corpus, self.domain))
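#NOTE (editorial sketch): the '\\' concatenation used above is Windows-specific.
#A portable variant of the small-file scan, built on the same assumed fx helpers,
#could use os.path.join instead - a hypothetical helper, not part of the original code.
import os

def find_small_files(directory, threshold=10000):
    #return ids of pickled docs smaller than threshold bytes, skipping the log file
    ids = []
    for f in fx.get_fnames(directory):
        if f == 'errors':
            continue
        if os.path.getsize(os.path.join(directory, f + '.pkl')) < threshold:
            ids.append(f)
    return ids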
def __init__(self,
             root,
             transform=None,
             target_transform=None,
             loader=default_loader,
             training=None,
             kpt_file=None,
             ps_dir=None,
             re_obj=None,
             ps_w_h=(16, 48),
             ps_fuse_type='None'):
    self.root = root
    self.transform = transform
    self.target_transform = target_transform
    self.loader = loader
    self.imgs = [
        path for path in list_pictures(self.root) if self.id(path) != -1
    ]
    # convert person id to softmax continuous label
    self._id2label = {_id: idx for idx, _id in enumerate(self.unique_ids)}
    self.training = training
    self.im_path_to_kpt = load_pickle(kpt_file) if kpt_file is not None else None
    self.ps_dir = ps_dir
    self.re_obj = re_obj
    self.ps_w_h = ps_w_h
    self.ps_fuse_type = ps_fuse_type
def load_cached_posts(code):
    if os.path.exists("data_cache/%s_posts.pkl" % code):
        #load from pickle
        print("Loading posts from data_cache")
        posts = file_utils.load_pickle("data_cache/%s_posts.pkl" % code)
        print(" Loaded", len(posts), "posts")
        return posts
    else:
        return False
def load_filtered_cascades(code, subreddit):
    #if files don't exist, quit
    if (os.path.exists("data_cache/filtered_cascades/%s_%s_comments.pkl" % (code, subreddit)) == False
            or os.path.exists("data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, subreddit)) == False):
        print("No saved filtered cascades")
        return False, False

    print("Loading", subreddit, "posts and comments from cache...")
    #load from file
    cascades = file_utils.load_pickle(
        "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, subreddit))
    comments = file_utils.load_pickle(
        "data_cache/filtered_cascades/%s_%s_comments.pkl" % (code, subreddit))
    print(" Loaded", len(cascades), "posts and", len(comments), "comments")

    return cascades, comments
def load_cached_comments(code):
    #load comments, either from cached pickle or directly from data
    if os.path.exists("data_cache/%s_comments.pkl" % code):
        #load from pickle
        print("Loading comments from data_cache")
        comments = file_utils.load_pickle("data_cache/%s_comments.pkl" % code)
        print(" Loaded", len(comments))
        return comments
    elif code == "cyber" and os.path.exists("data_cache/cyber_comments"):
        #load from multiple pickles
        print("Loading comments from data_cache")
        comments = []
        files = sorted(glob.glob('data_cache/cyber_comments/*'))
        for file in files:
            print(" Loading", file)
            new_comments = file_utils.load_pickle(file)
            comments.extend(new_comments)
        print(" Loaded", len(comments), "comments")
        return comments
    else:
        return False
def __init__(self,
             transform=None,
             target_transform=None,
             loader=default_loader,
             training=None,
             use_kpt=False,
             use_ps=False,
             split='train'):
    self.root = 'msmt17'
    self.transform = transform
    self.target_transform = target_transform
    self.loader = loader
    self.imgs = self._get_im_paths(split)
    # convert person id to softmax continuous label
    self._id2label = {_id: idx for idx, _id in enumerate(self.unique_ids)}
    self.training = training
    self.im_path_to_kpt = load_pickle(osp.join(self.root, 'im_path_to_kpt.pkl')) if use_kpt else None
    self.ps_dir = osp.join(self.root, 'MSMT17_V1_ps_label') if use_ps else None
def load_cascade_params(code, filtered=False, display=True):
    if filtered == False:
        filename = "data_cache/fitted_params/%s_cascade_params.pkl" % code
    else:
        filename = "data_cache/fitted_params/%s_%s_cascade_params.pkl" % (code, filtered)

    if os.path.exists(filename) == False:
        if display:
            print("No saved cascade parameters - exiting")
        exit(0)
    else:
        if display:
            print("Loading cascade parameters from cache:", filename)
        params = file_utils.load_pickle(filename)
        return params
def __init__(self,
             transform=None,
             target_transform=None,
             loader=default_loader,
             training=None,
             use_kpt=False,
             ps_dir=None,
             split='train',
             re_obj=None,
             ps_w_h=(16, 48),
             ps_fuse_type='None'):
    self.root = 'msmt17'
    self.transform = transform
    self.target_transform = target_transform
    self.loader = loader
    self.imgs = self._get_im_paths(split)
    # convert person id to softmax continuous label
    self._id2label = {_id: idx for idx, _id in enumerate(self.unique_ids)}
    self.training = training
    self.im_path_to_kpt = load_pickle(osp.join(self.root, 'im_path_to_kpt.pkl')) if use_kpt else None
    self.ps_dir = ps_dir
    self.re_obj = re_obj
    self.ps_w_h = ps_w_h
    self.ps_fuse_type = ps_fuse_type
def load_filtered_posts(code, subreddit):
    #if file doesn't exist, quit
    if os.path.exists("data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, subreddit)) == False:
        print("No saved filtered posts")
        return False

    print("Loading", subreddit, "posts from cache...")
    #load from file
    posts = file_utils.load_pickle(
        "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, subreddit))
    print(" Loaded", len(posts), "posts")

    return posts
#end load_filtered_posts
def get_corpus(self):
    files = fx.get_fnames(self.directory)
    # remove log file
    files.remove('errors')
    articles = []
    for fname in files:
        path = os.path.join(
            os.path.abspath(os.curdir) + '\\' + self.directory, fname + '.pkl')
        doc = fx.load_pickle(path)
        article = {'id': fname, 'title': doc['title'], 'text': doc['text']}
        articles.append(article)
    df = pd.DataFrame(articles)
    fx.save_pickle(os.path.join(self.corpus, self.domain + '.pkl'), df)
    pdx.save_to_csv(df, os.path.join(self.corpus, self.domain))
def get_corpus_weight(self, column):
    """
    Get weighted corpus dataframe according to column weight
    (count, favorites, retweets, is_bot)
    """
    df_corpus = fx.load_pickle(self.corpus + '/' + self.domain + '.pkl')
    df_weight = self.df.filter(['id', column], axis=1)
    df_corpus['id'] = df_corpus['id'].astype(int)
    df_weight['id'] = df_weight['id'].astype(int)
    df = pd.merge(df_corpus, df_weight, on='id')
    df = pd.DataFrame(np.repeat(df.values,
                                df[column].replace(0, 1).tolist(),
                                axis=0),
                      columns=df.columns)
    fx.save_pickle(
        os.path.join(self.corpus, self.domain + '_' + str(column) + '.pkl'), df)
#process each model separately
for model, curr_filename in [("model", model_filename), ("comp", comp_filename),
                             ("rand_tree", rand_tree_filename),
                             ("rand_sim", rand_sim_filename),
                             ("avg_sim", avg_filename)]:
    print("Processing", model, "and", subreddit)

    #process each run
    for run in range(5):
        print(" run", run)

        #load timestamps data
        timestamps = file_utils.load_pickle(curr_filename % (subreddit, subreddit, run))

        #loop posts
        for post_id, post_timestamps in timestamps.items():
            #grab true timestamps
            true_timestamps = post_timestamps['true']

            #loop observation times
            for time in post_timestamps.keys():
                #skip true
                if time == "true":
                    continue

                #pull sim comment times
                sim_timestamps = post_timestamps[time]
import file_utils
import cascade_manip
import fit_cascade

code = "crypto"
subreddit = "Bitcoin"

#load cascades
#cascades, comments = cascade_manip.load_filtered_cascades(code, subreddit)

#check the fits - how many bad initialization params are left?

#load the param pickle
cascade_params = file_utils.load_pickle(
    "data_cache/fitted_params/%s_%s_cascade_params.pkl" % (code, subreddit))
print("Loaded", len(cascade_params), "fitted params")

fail_count = 0
for post_id, params in cascade_params.items():
    #print(post_id, params)
    if params[0] == 20 and params[1] == 500 and params[2] == 2.3:
        print("FIT FAIL", post_id, params)
        fail_count += 1
    '''
    #try to fit this cascade again, get a read on what caused the failure
    print("old params", params)
    post = cascades[post_id]
    junk, post_comments = cascade_manip.filter_comments_by_posts({post_id : post}, comments)
    new_params = fit_cascade.fit_cascade_model(post, post_comments)
    print("new params", new_params)
    '''
if subreddit != sub_filter:
    continue

print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

#what is the graph limit for this subreddit?
if subreddit in sub_limits:
    max_nodes = sub_limits[subreddit]
    print("Max graph size for this subreddit:", max_nodes)
else:
    max_nodes = default_max_nodes
    print("Using default max graph size:", max_nodes)

#load preprocessed posts for this subreddit
if file_utils.verify_file(posts_filepath % subreddit):
    posts = file_utils.load_pickle(posts_filepath % subreddit)
    print("Loaded", len(posts), "processed posts from", posts_filepath % subreddit)
else:
    print("Cannot simulate for subreddit", subreddit, "without processed posts file", posts_filepath % subreddit)
    exit(0)

#find highest assigned post id for this data, so we know where to assign new ids if we need to
next_id = max([value['id'] for key, value in posts.items()]) + 1

#do we need to build a graph and infer at all? loop to find out
infer = False
infer_count = 0
#also fetch/assign numeric ids to seed posts
#partition these into ~8 chunks
count = 0
for chunk_cascades in chunks(cascades):
    print("Chunked to", len(chunk_cascades), "posts")
    junk, chunk_comments = cascade_manip.filter_comments_by_posts(chunk_cascades, comments)
    file_utils.save_pickle(chunk_cascades, "pcmasterrace/chunk_cascades_%s.pkl" % count)
    file_utils.save_pickle(chunk_comments, "pcmasterrace/chunk_comments_%s.pkl" % count)
    count += 1
exit(0)

'''
#once those chunks are created - call multiple instances, one per chunk

#load chunked posts/comments
print("Loading chunk")
cascades = file_utils.load_pickle("pcmasterrace/chunk_cascades_%s.pkl" % count)
comments = file_utils.load_pickle("pcmasterrace/chunk_comments_%s.pkl" % count)
print("Loaded", len(cascades), "posts and", len(comments), "comments")

#fit params to all cascades
all_params = cascade_analysis.fit_all_cascades(domain, cascades, comments, False, subreddit)

#load processed posts
posts = file_utils.load_pickle(posts_filepath % subreddit)

#save to text file now
with open(params_filepath % count, "w") as f:
    for post_id, params in all_params.items():
        f.write(str(posts[post_id]['id']) + " ")    #write numeric post id
        for i in range(len(params)):
            file_run)) and file_utils.verify_file(
                results_format % (subreddit, subreddit, model, training_num,
                                  testing_num, testing_start_year,
                                  testing_start_month, file_run)):
        print(" target exists, skipping", results_file)
        continue
    print(" reducing", results_file)

    #get corresponding bookmark filename
    bookmark_file = bookmark_format % (subreddit, subreddit, model, training_num,
                                       file_test_size, testing_start_year,
                                       testing_start_month, file_run)

    #load the pickle bookmark
    bookmark = file_utils.load_pickle(bookmark_file)

    #build a new one, including only the posts in our reduced target set
    finished_set = set([
        post_id for post_id in bookmark['finished_posts'] if post_id in test_ids
    ])
    print(" ", len(finished_set), "posts finished in bookmark")

    #save new bookmark - if doesn't already exist (don't want to overwrite stuff!)
    file_utils.save_pickle(
        {
            "finished_posts": finished_set,
            'complete': True if len(finished_set) == testing_num else False
        }, bookmark_format %
comments["t1_" + comment_id] = comment print("Total of %d comments for %d-%d posts" % (len(comments), post_month, post_year)) return comments #end load_comments #---MAIN BEGINS HERE---# domain = "crypto" subreddit = "Lisk" #load crypto subreddit data - reconstructed cascades posts = file_utils.load_pickle( "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (domain, subreddit)) comments = file_utils.load_pickle( "data_cache/filtered_cascades/%s_%s_comments.pkl" % (domain, subreddit)) print("Read %d posts and %d comments" % (len(posts), len(comments))) #artificial month partitioning - half the posts to 8/16, half to 9/16 (date doesn't matter) august_posts = dict(list(posts.items())[int(len(posts) / 2):]) september_posts = dict(list(posts.items())[:int(len(posts) / 2)]) print("Split to %d posts for 8-16 and %d posts for 9-16" % (len(august_posts), len(september_posts))) #convert both post sets file_utils.verify_dir("reddit_data/%s" % subreddit) august_posts = process(august_posts, subreddit, 2016, 8) september_posts = process(september_posts, subreddit, 2016, 9)
import file_utils
import cascade_analysis
import cascade_manip
import glob

code = "cyber"

#load cascades and comments from pickle
#cascades, comments, missing_posts, missing_comments = build_cascades(code, posts = False, comments = False)
print("Loading cascades from data_cache")
cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))

#comments: across multiple files
print("Loading comments from data_cache")
comments = {}
files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
for file in files:
    print("Loading", file)
    new_comments = file_utils.load_pickle(file)
    comments.update(new_comments)

#missing posts and comments
missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

#yay! loaded
print(" Loaded", len(cascades), "cascades with", len(comments), "comments")
import file_utils
import glob

code = "cve"

#load all cve cascades and comments
files = glob.glob('data_cache/filtered_cascades/cve_*_cascades.pkl')
posts = {}
for file in files:
    posts.update(file_utils.load_pickle(file))
files = glob.glob('data_cache/filtered_cascades/cve_*_comments.pkl')
comments = {}
for file in files:
    comments.update(file_utils.load_pickle(file))

#filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"    #domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"    #domain and subreddit comments

#save to same place as other filtered cascades - use cve as both domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))

#add cve to subreddit -> domain mapping
subs = file_utils.load_pickle("model_files/domain_mapping.pkl")
if code not in subs:
    subs[code] = code
    file_utils.save_pickle(subs, "model_files/domain_mapping.pkl")
def build_cascades(code, posts=False, comments=False):
    #if cascades already exist, read from cache
    if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) \
            and (os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
                 or os.path.exists("data_cache/%s_cascades/%s_cascade_comments_1.pkl" % (code, code))):
        #load from pickle
        print("Loading cascades from data_cache")
        cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))

        #comments: either a single file, or multiple files
        print("Loading comments from data_cache")
        if os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)):
            comments = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
        else:
            comments = {}
            files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
            for file in files:
                print("Loading", file)
                new_comments = file_utils.load_pickle(file)
                comments.update(new_comments)

        missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
        missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

        print(" Loaded", len(cascades), "cascades with", len(comments), "comments")
        print(" ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")

        return cascades, comments, missing_posts, missing_comments

    #if no cached cascades, build them from scratch

    #if no loaded posts/comments, load those up first
    if posts == False or comments == False:
        posts, comments = load_model_data.load_reddit_data(code)

    print("Extracting post/comment structure for", len(posts), "posts and", len(comments), "comments")

    #add replies field to all posts/comments, init to empty list
    data_utils.add_field(posts, "replies", [])
    data_utils.add_field(comments, "replies", [])
    #add placeholder field to all posts/comments, flag indicates if we created a dummy object
    data_utils.add_field(posts, 'placeholder', False)
    data_utils.add_field(comments, 'placeholder', False)
    #add comment_count field to all post objects as well: count total number of comments all the way down the cascade
    data_utils.add_field(posts, "comment_count_total", 0)
    #and direct replies only
    data_utils.add_field(posts, "comment_count_direct", 0)
    #and add a missing_comments field to all post objects: set True if we find any missing comments in this cascade
    data_utils.add_field(posts, "missing_comments", False)

    #grab list of fields for each type of object (used to create placeholders when items are missing)
    post_fields = list(posts[0].keys())
    comment_fields = list(comments[0].keys())

    '''
    id_h = post/comment id
    parent_id_h = direct parent
    link_id_h = post parent
    if a parent_id starts with t1_, you remove t1_ and match the rest against a comment id
    if it starts with t3_, you remove t3_ and match the rest against a submission id.
    link_id always starts with t3_, since it always points to a submission.
    '''

    #create dictionary of post id -> post object to store cascades
    cascades = data_utils.list_to_dict(posts, "id_h")

    #convert list of comments to dictionary, where key is comment id
    comments = data_utils.list_to_dict(comments, "id_h")

    #now that we can find posts and comments at will, let's build the dictionary!
    #loop all comments, assign to immediate parent and increment comment_count of post parent
    comment_count = 0
    missing_comments = set()    #missing comments
    missing_posts = set()       #missing posts
    for comment_id in list(comments.keys()):

        #get immediate parent (post or comment)
        direct_parent = comments[comment_id]['parent_id_h'][3:]
        direct_parent_type = "post" if comments[comment_id]['parent_id_h'][:2] == "t3" else "comment"
        #get post parent
        post_parent = comments[comment_id]['link_id_h'][3:]
        comment_count += 1

        #add this comment to replies list of immediate parent, and update counters on post_parent
        try:
            #if post parent missing, create placeholder
            if post_parent not in cascades:
                cascades[post_parent] = create_object(post_parent, post_fields)
                missing_posts.add(post_parent)

            #update overall post comment count for this new comment
            cascades[post_parent]['comment_count_total'] += 1

            #now handle direct parent, post or comment
            #parent is post
            if direct_parent_type == "post":
                #missing post, create placeholder to hold replies
                if direct_parent not in cascades:
                    cascades[direct_parent] = create_object(direct_parent, post_fields)
                    missing_posts.add(direct_parent)
                #add this comment to replies field of post (no total comment increment, done above)
                cascades[direct_parent]['replies'].append(comment_id)
                #add 1 to direct comment count field
                cascades[direct_parent]['comment_count_direct'] += 1

            #parent is comment
            else:
                #missing comment, create placeholder to contain replies, point to parent post by default
                if direct_parent not in comments:
                    comments[direct_parent] = create_object(direct_parent, comment_fields)
                    #point this placeholder comment to the top-level post
                    comments[direct_parent]['link_id_h'] = post_parent
                    comments[direct_parent]['parent_id_h'] = post_parent
                    #add manufactured comment to counters
                    cascades[post_parent]['comment_count_total'] += 1
                    cascades[post_parent]['comment_count_direct'] += 1
                    #and add to replies
                    cascades[post_parent]['replies'].append(direct_parent)
                    #flag this cascade as containing missing comments
                    cascades[post_parent]['missing_comments'] = True
                    missing_comments.add(direct_parent)    #add comment to list of missing

                #add current comment to replies field of parent comment
                comments[direct_parent]['replies'].append(comment_id)

        except:
            print("FAIL")
            print(len(missing_posts), "posts")
            print(len(missing_comments), "comments")
            for field in comments[comment_id]:
                if field != "replies":
                    print(field, comments[comment_id][field])
            exit(0)

    print("\nProcessed", comment_count, "comments in", len(cascades), "cascades")
    print(" ", len(missing_posts), "missing posts")
    print(" ", len(missing_comments), "missing comments")
    print(" ", len([x for x in cascades if cascades[x]['missing_comments']]), "cascades with missing comments")

    #verify the above process, a couple different ways

    #count comments from parent counters across all cascades
    '''
    total_comments = 0
    for post_id, post in cascades.items():
        total_comments += post['comment_count']
    print(total_comments, "from post counters")
    '''

    #traverse each cascade and count comments, check against stored comment count
    '''
    for post_id, post in cascades.items():
        traverse_comments = traverse_cascade(post, comments)
        if traverse_comments != post['comment_count']:
            print("post counter says", post['comment_count'], "comments, but traversal says", traverse_comments)
    '''

    #save cascades for later loading
    cascade_manip.save_cascades(code, cascades)    #cascades
    cascade_manip.save_comments(code, comments)    #comments
    file_utils.save_json(list(missing_posts),
"data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code)) file_utils.save_json(list(missing_comments), "data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code)) return cascades, comments, missing_posts, missing_comments
def graph_infer(sim_post, sim_post_id, group, max_nodes, min_node_quality, estimate_initial_params):
    print("Inferring post parameters from post graph")

    #load preprocessed posts for this group
    if file_utils.verify_file(posts_filepath % group):
        posts = file_utils.load_pickle(posts_filepath % group)
        print("Loaded", len(posts), "processed posts from", posts_filepath % group)
    else:
        print("Cannot simulate for group", group, "without processed posts file", posts_filepath % group)
        exit(0)

    #if seed post not in posts file - we're gonna have a bad time
    if sim_post['id_h'] not in posts:
        print("Simulation post not in dataset - exiting\n")
        exit(0)

    #grab numeric/graph id of sim post
    numeric_sim_post_id = posts[sim_post_id]['id']

    #load in fitted simulation params - need these for graph build
    fitted_params, fitted_quality = functions_hybrid_model.load_params(
        params_filepath % group, posts, False, True)

    #remove sim post from graph params - no cheating! (pop based on numeric id)
    res = fitted_params.pop(numeric_sim_post_id)
    res = fitted_quality.pop(numeric_sim_post_id)

    #graph stuff - sample graph if necessary, add new nodes, etc
    graph = {}
    isolated_nodes = []
    added_count = 0

    #do we need to sample/process the graph? sample if whole graph too big, imposing a min node quality,
    #need to estimate initial params, or we don't have a precomputed graph file
    if (max_nodes != None and len(posts) > max_nodes) \
            or file_utils.verify_file(graph_filepath % group) == False \
            or min_node_quality != None or estimate_initial_params:

        #only sample down if we actually have to
        if max_nodes != None:
            print("\nSampling graph to", max_nodes, "nodes")
            #sample down posts
            graph_posts = user_sample_graph(posts, [sim_post], max_nodes, group, min_node_quality, fitted_quality)
        #otherwise, use them all
        else:
            graph_posts = posts

        #build graph, getting initial param estimate if required
        if estimate_initial_params:
            estimated_params = functions_hybrid_model.build_graph_estimate_node_params(
                graph_posts, fitted_params, fitted_quality, numeric_sim_post_id, temp_graph_filepath % group)
        else:
            functions_hybrid_model.build_graph(graph_posts, temp_graph_filepath % group)

    #no graph sampling/processing, use the full set and copy graph file to temp location
    else:
        graph_posts = posts
        copyfile(graph_filepath % group, temp_graph_filepath % group)
        print("Copied complete post-graph to", temp_graph_filepath % group)

    #ALWAYS sample down params to match whatever graph we have - because we can't use the previously fitted params!
    if estimate_initial_params:
        functions_hybrid_model.get_graph_params(graph_posts, numeric_sim_post_id, fitted_params,
                                                fitted_quality, temp_params_filepath % group, estimated_params)
    else:
        functions_hybrid_model.get_graph_params(graph_posts, numeric_sim_post_id, fitted_params,
                                                fitted_quality, temp_params_filepath % group)

    #graph is built and ready - graph file and input params file

    #run node2vec to get embeddings - if we have to infer parameters
    #offload to C++, because I feel the need...
    #...the need for speed!
    if file_utils.verify_file(output_params_filepath % group):
        os.remove(output_params_filepath % group)    #clear output to prevent append

    #run node2vec on graph and params
    subprocess.check_call([
        "./c_node2vec/examples/node2vec/node2vec",
        "-i:" + (temp_graph_filepath % group),
        "-ie:" + (temp_params_filepath % group),
        "-o:" + (output_params_filepath % group),
        "-d:6", "-l:3", "-w", "-s", "-otf"
    ])
    print("")

    #load the inferred params (dictionary of numeric id -> params)
    all_inferred_params = functions_hybrid_model.load_params(
        output_params_filepath % group, posts, inferred=True)
    inferred_params = all_inferred_params[numeric_sim_post_id]

    return inferred_params
def test(query_loader, query_flip_loader, test_loader, test_flip_loader,
         trainset_name, testset_name, epoch, verbose=False):
    cache_file = '{}/feat_cache-{}_to_{}.pkl'.format(exp_dir, trainset_name, testset_name)
    if args.use_feat_cache:
        assert os.path.exists(cache_file), \
            "Feature cache file {} does not exist!".format(cache_file)
        query_2, q_vis, query_flip_2, q_vis, test_2, test_vis, test_flip_2, test_vis, \
            q_ids, q_cams, g_ids, g_cams = load_pickle(cache_file)
    else:
        query_2, q_vis = extract_loader_feat(query_loader, verbose=verbose)
        query_flip_2, q_vis = extract_loader_feat(query_flip_loader, verbose=verbose)
        test_2, test_vis = extract_loader_feat(test_loader, verbose=verbose)
        test_flip_2, test_vis = extract_loader_feat(test_flip_loader, verbose=verbose)
        q_ids = query_loader.dataset.ids
        q_cams = query_loader.dataset.cameras
        g_ids = test_loader.dataset.ids
        g_cams = test_loader.dataset.cameras
        save_pickle([
            query_2, q_vis, query_flip_2, q_vis, test_2, test_vis, test_flip_2,
            test_vis, q_ids, q_cams, g_ids, g_cams
        ], cache_file)

    if args.test_which_feat > 0:
        # TODO: implement for pap
        idx = args.test_which_feat
        query_2 = query_2[:, 256 * idx - 256:256 * idx]
        query_flip_2 = query_flip_2[:, 256 * idx - 256:256 * idx]
        test_2 = test_2[:, 256 * idx - 256:256 * idx]
        test_flip_2 = test_flip_2[:, 256 * idx - 256:256 * idx]

    query = normalize(query_2 + query_flip_2)
    test = normalize(test_2 + test_flip_2)

    if verbose:
        print('query.shape:', query.shape)
        print('test.shape:', test.shape)
        if args.pap:
            print('q_vis.shape:', q_vis.shape)
            print('test_vis.shape:', test_vis.shape)

    if args.pap:
        dist_1 = compute_dist_with_visibility(query, test, q_vis, test_vis,
                                              dist_type='euclidean',
                                              avg_by_vis_num=False)
    else:
        dist_1 = cdist(query, test)
    r_1 = cmc(dist_1, q_ids, g_ids, q_cams, g_cams,
              separate_camera_set=False,
              single_gallery_shot=False,
              first_match_break=True)
    m_ap_1 = mean_ap(dist_1, q_ids, g_ids, q_cams, g_cams)
    print('EPOCH [%d] %s -> %s: mAP=%f, r@1=%f, r@3=%f, r@5=%f, r@10=%f' %
          (epoch + 1, trainset_name, testset_name, m_ap_1, r_1[0], r_1[2], r_1[4], r_1[9]))
if len(sys.argv) == 2:
    subreddit_filter = sys.argv[1]

#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl"    #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"     #processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"  #text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"    #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"     #list of users seen in posts/comments, one file per subreddit

#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
    print("Loading subreddit list from", subreddits_filepath)
    subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json("results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json("results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json("results/cyber_post_subreddit_dist.json")
    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([
        set(crypto_subreddit_dist.keys()),
        set(cve_subreddit_dist.keys()),
        set(cyber_subreddit_dist.keys())
    ], ["crypto", "cve", "cyber"])
#build/load cascades (auto-load as a result, either raw data or cached cascades)
cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(code, raw_stories, raw_comments)
#optional: filter out cascades with any missing elements (posts or comments)
cascades, comments = cascade_manip.remove_missing(code, cascades, comments)
print("\nHave total of", len(cascades), "cascades and", len(comments), "comments for hackernews")

#build processed post file
#assign numeric ids to each post for node2vec input files
#get set of tokens
#extract and maintain user
if file_utils.verify_file(posts_filepath % code):
    print("Processed post file already exists.")
    posts = file_utils.load_pickle(posts_filepath % code)
else:
    c = count()
    posts = {
        key: {
            'user': value['author_h'],
            'tokens': extract_tokens(value),
            'id': next(c)
        }
        for key, value in cascades.items()
    }
    #save this to file
    file_utils.save_pickle(posts, posts_filepath % code)
    print("Saved", len(posts), "processed posts to", posts_filepath % code)

#build list of users active in this subreddit - list, not set, so more active users are more likely to get drawn in the simulation
def fit_all_cascades(code, cascades, comments, pickle_save, subreddit=False):
    #if all saved, load from that
    if file_utils.verify_file("data_cache/fitted_params/%s_cascade_params.pkl" % code):
        cascade_params = file_utils.load_pickle("data_cache/fitted_params/%s_cascade_params.pkl" % code)
        fit_fail = [post_id for post_id, post in cascades.items() if post_id not in cascade_params]
        return cascade_params, fit_fail

    #anything to load? if so, load the latest checkpoint
    if pickle_save:
        #build glob filestring - to get all matching checkpoints
        if subreddit == False:
            filename = "data_cache/fitted_params/%s*_cascade_params.pkl" % code
        else:
            filename = "data_cache/fitted_params/%s_%s*_cascade_params.pkl" % (code, subreddit)

        #extract matching filenames and their numeric values, selecting the most complete one to load
        files = glob.glob(filename)
        best_int = -1    #count of records in best file - set to "" if a complete file is found
        for file in files:
            file_int = re.search(r'\d+', file)
            #if no number in filename, have a complete file - use that
            if file_int is None:
                best_int = ""
                break
            else:
                file_int = int(file_int.group())
                if file_int > best_int:
                    best_int = file_int

        #load checkpoint, if we have one
        if best_int != -1:
            if subreddit == False:
                cascade_params = cascade_manip.load_cascade_params(code, str(best_int))
            else:
                cascade_params = cascade_manip.load_cascade_params(code, subreddit + str(best_int))
            print("Loaded", len(cascade_params), "fitted cascade parameters")
        #otherwise, empty dictionary
        else:
            cascade_params = {}
    else:
        cascade_params = {}

    avg_quality = 0

    #fit any cascades that have not been fitted before, add to params dictionary: post_id -> params
    post_count = len(cascade_params)
    fit_fail = []
    print("Fitting all cascade models")
    for post_id, post in cascades.items():
        #if this cascade already fitted, and params are valid, skip
        if post_id in cascade_params and (cascade_params[post_id][0] != 20
                                          and cascade_params[post_id][1] != 500
                                          and cascade_params[post_id][2] != 2.3):
            continue

        #fit the current cascade (filtering comments to just this post is not required)
        #print("Fitting cascade", post_id)
        param_res = fit_cascade.fit_cascade_model(post, comments)
        #if negative comment times, skip this cascade and move to next
        if param_res == False:
            fit_fail.append(post_id)
            continue
        cascade_params[post_id] = param_res
        avg_quality += cascade_params[post_id][6]
        post_count += 1
        if post_count % 1000 == 0:
            print("Fitted", post_count, "cascades")
        if pickle_save and post_count % 10000 == 0:
            if subreddit == False:
                cascade_manip.save_cascade_params(code, cascade_params, str(post_count))
            else:
                cascade_manip.save_cascade_params(code, cascade_params, subreddit + str(post_count))
    avg_quality /= len(cascade_params)

    #dump params to file
    print("Fitted a total of", len(cascade_params), "cascades (average quality", str(avg_quality) + ")")
    if pickle_save:
        cascade_manip.save_cascade_params(code, cascade_params, subreddit)

    #return all params, loaded and newly fitted
    return cascade_params, fit_fail
#end fit_all_cascades
post_counter = 1    #counter of posts to simulate, across all subreddits

#process each subreddit
for subreddit, seeds in post_seeds.items():
    '''
    #TESTING ONLY!!!!
    if subreddit != "Lisk":
        continue
    '''

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #if have a cached graph, load and use that instead of rebuilding
    if file_utils.verify_file("graph_cache/%s_post_graph.pkl" % subreddit) \
            and file_utils.verify_file("graph_cache/%s_user_ids.pkl" % subreddit):
        print("Loading post graph from graph_cache/%s_post_graph.pkl and user id list from graph_cache/%s_user_ids.pkl" % (subreddit, subreddit))
        sub_graph = file_utils.load_pickle("graph_cache/%s_post_graph.pkl" % subreddit)
        user_ids = file_utils.load_pickle("graph_cache/%s_user_ids.pkl" % subreddit)
        print("Loaded graph has", sub_graph.graph.number_of_nodes(), "nodes and", sub_graph.graph.size(), "edges")

    #no cache, build graph from raw posts and params
    else:
        #load subreddit posts (don't need the comments!)
        raw_sub_posts = cascade_manip.load_filtered_posts(domain, subreddit)
        #load subreddit parameters
        raw_sub_params = cascade_manip.load_cascade_params(domain, subreddit)

        #filter posts - TESTING ONLY!!!!!!!! - if you didn't load all the params
        '''
        raw_sub_posts = {post_id : post for post_id, post in sub_posts.items() if post_id in sub_params}
        print("Filtered to", len(sub_posts), "posts with fitted parameters")
        '''