def clean_file(self, fname, field, remove, **kwargs):
    media = ('Media', 'Video', 'Image', 'Search', 'Sorry')
    attributes = ['caption', 'copyright', 'playback', 'episode', 'iPlayer', 'radio', 'BBC2']
    doc = fx.load_pickle(self.directory + '\\' + fname)
    lines = doc[field]
    if 'split' in kwargs:
        #drop boilerplate lines that both start with a media marker and contain an attribute word
        #(e.g. "Image copyright ...", "Media playback ...")
        lines = [
            line for line in lines.split('\n')
            if not line.startswith(media) or not any(x in line.split() for x in attributes)
        ]
        doc[field] = '\n'.join(lines)
    if 'clean' in kwargs:
        #blank the whole field if the removal string appears, otherwise strip it from the text
        if remove in lines:
            doc[field] = ''
        else:
            doc[field] = lines.replace(remove, '')
    fx.save_pickle(self.directory + '\\' + fname, doc)
def process(in_posts, subreddit, year, month):
    posts = {}
    for post_id, raw_post in in_posts.items():
        #check for a good row; skip if any required field is missing (probably a non-quoted body)
        if raw_post['title_m'] is None or raw_post['subreddit'] is None \
                or raw_post['created_utc'] is None or raw_post['author_h'] is None:
            print("Invalid post, skipping")
            continue

        #build new post dict
        post = {}
        post['tokens'] = functions_gen_cascade_model.extract_tokens(raw_post['title_m'])
        if post['tokens'] is False:
            continue
        post['time'] = int(raw_post['created_utc'])
        post['author'] = raw_post['author_h']

        #add to overall post dict, keyed by post id with the t3_ prefix
        post_id = "t3_" + raw_post['id_h']
        posts[post_id] = post

    #save to pickle
    processed_posts_filepath = "reddit_data/%s/%s_processed_posts_%d_%d.pkl"
    file_utils.save_pickle(posts, processed_posts_filepath % (subreddit, subreddit, year, month))
    print("Processed %d posts for %d-%d" % (len(posts), month, year))
    return posts
def save_comments(code, comments):
    #save all comments to pickle
    print("Loaded", len(comments), "comments, saving to data_cache/%s_comments.pkl" % code)
    if not os.path.exists("data_cache"):
        os.makedirs("data_cache")
    #break cyber comments into separate files (chunks of 1M), because of memory errors
    if code == "cyber":
        if not os.path.exists("data_cache/cyber_comments"):
            os.makedirs("data_cache/cyber_comments")
        for i in range(0, len(comments), 1000000):
            file_utils.save_pickle(comments[i:i+1000000],
                "data_cache/cyber_comments/%s_comments_%s.pkl" % (code, i//1000000))
    else:
        file_utils.save_pickle(comments, "data_cache/%s_comments.pkl" % code)
    print("   Comments saved")
def get_article_scrapper(self, driver, url, id):
    soup = scx.get_driver_soup(driver, url)
    text = soup.find("article")
    if text:
        doc = {
            'title': soup.find("h1").get_text(),
            'text': text.get_text(),
            'canonical_link': soup.find("link")['href']
        }
        fx.save_pickle(self.directory + '/' + str(id) + '.pkl', doc)
    else:
        self.log_errors('Page without article: ' + str(url))
def get_article(self, url, id):
    try:
        a = Article(str(url))
        a.download()
        a.parse()
        #keep only plain-data attributes of the parsed article so the doc pickles cleanly
        #(collections must be imported at module level)
        doc = {
            attr: value
            for attr, value in a.__dict__.items()
            if not attr.startswith('__')
            and type(value) in [str, list, set, bool, int, dict, collections.defaultdict]
        }
        fx.save_pickle(self.directory + '/' + str(id) + '.pkl', doc)
    except newspaper.article.ArticleException:
        self.log_errors('Error downloading: ' + str(url))
def get_accounts(users, folder, id=True):
    collected = fx.get_fnames(folder)
    for user in users:
        if user not in collected:
            try:
                result = (user, get_user_account(user))
                fx.save_pickle('{}/{}'.format(folder, user), result)
            except TweepError as e:
                print('Could not retrieve info for user: {}'.format(user))
                print(e)
                # print('exception raised, waiting 15 minutes')
                # time.sleep(15*60)
def save_cascades(code, cascades, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        print("Saving cascades to data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
        file_utils.save_pickle(cascades,
            "data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print("Saving filtered cascades to data_cache/filtered_cascades/%s_%s_cascades.pkl"
              % (code, filtered))
        file_utils.save_pickle(cascades,
            "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, filtered))
def save_cascade_params(code, cascade_params, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/fitted_params/")
        print("Saving cascade params to data_cache/fitted_params/%s_cascade_params.pkl" % code)
        file_utils.save_pickle(cascade_params,
            "data_cache/fitted_params/%s_cascade_params.pkl" % code)
    else:
        file_utils.verify_dir("data_cache/fitted_params")
        print("Saving filtered cascade params to data_cache/fitted_params/%s_%s_cascade_params.pkl"
              % (code, filtered))
        file_utils.save_pickle(cascade_params,
            "data_cache/fitted_params/%s_%s_cascade_params.pkl" % (code, filtered))
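#The snippets here lean on thin pickle/filesystem helpers (fx.save_pickle/load_pickle,
#file_utils.save_pickle/load_pickle/verify_dir) whose implementations are not shown. Below is
#only a minimal sketch of what such helpers could look like; note the assumption that file_utils
#takes (object, path) while fx appears to take (path, object), so argument order is not interchangeable.
import os
import pickle

def save_pickle(obj, filename):
    #serialize obj to filename using the highest available pickle protocol
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(filename):
    #load and return whatever object was pickled to filename
    with open(filename, 'rb') as f:
        return pickle.load(f)

def verify_dir(path):
    #create the directory (and any parents) if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)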
def get_corpus(self):
    files = fx.get_fnames(self.directory)
    #remove the error log file from the article list
    files.remove('errors')
    articles = []
    for fname in files:
        path = os.path.join(os.path.abspath(os.curdir) + '\\' + self.directory, fname + '.pkl')
        doc = fx.load_pickle(path)
        article = {'id': fname, 'title': doc['title'], 'text': doc['text']}
        articles.append(article)
    df = pd.DataFrame(articles)
    fx.save_pickle(os.path.join(self.corpus, self.domain + '.pkl'), df)
    pdx.save_to_csv(df, os.path.join(self.corpus, self.domain))
def get_corpus_weight(self, column):
    """
    Get weighted corpus dataframe according to column weight
    (count, favorites, retweets, is_bot)
    """
    df_corpus = fx.load_pickle(self.corpus + '/' + self.domain + '.pkl')
    df_weight = self.df.filter(['id', column], axis=1)
    df_corpus['id'] = df_corpus['id'].astype(int)
    df_weight['id'] = df_weight['id'].astype(int)
    df = pd.merge(df_corpus, df_weight, on='id')
    #repeat each row according to its weight, bumping zero weights to one so no row is dropped
    df = pd.DataFrame(np.repeat(df.values, df[column].replace(0, 1).tolist(), axis=0),
                      columns=df.columns)
    fx.save_pickle(os.path.join(self.corpus, self.domain + '_' + str(column) + '.pkl'), df)
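#The row-weighting trick above relies on np.repeat with a per-row repeat count. A tiny
#standalone demonstration on toy data (the column name 'retweets' is just an example):
import numpy as np
import pandas as pd

toy = pd.DataFrame({'id': [1, 2, 3], 'retweets': [2, 0, 3]})
#zero weights are bumped to 1 so every row survives at least once
weighted = pd.DataFrame(np.repeat(toy.values, toy['retweets'].replace(0, 1).tolist(), axis=0),
                        columns=toy.columns)
#weighted now has 2 + 1 + 3 = 6 rows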
def save_comments(code, comments, filtered=False):
    if filtered == False:
        print("Saving comments to data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
        #save all comments to pickle
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        #break cyber comments into separate files, because of memory errors
        if code == "cyber":
            temp = {}        #temporary dictionary to hold a chunk of comments
            count = 0
            for comment_id, comment in comments.items():
                temp[comment_id] = comment
                count += 1
                if count % 1000000 == 0:
                    file_utils.save_pickle(temp,
                        "data_cache/%s_cascades/%s_cascade_comments_%s.pkl"
                        % (code, code, count // 1000000))
                    temp = {}
            #last save - leftover comments that didn't fill a complete chunk
            if len(temp) != 0:
                file_utils.save_pickle(temp,
                    "data_cache/%s_cascades/%s_cascade_comments_%s.pkl"
                    % (code, code, count // 1000000 + 1))
        else:
            file_utils.save_pickle(comments,
                "data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print("Saving filtered comments to data_cache/filtered_cascades/%s_%s_comments.pkl"
              % (code, filtered))
        file_utils.save_pickle(comments,
            "data_cache/filtered_cascades/%s_%s_comments.pkl" % (code, filtered))
def save_cascade_comments(code, comments):
    #save all comments to pickle
    if not os.path.exists("data_cache/%s_cascades" % code):
        os.makedirs("data_cache/%s_cascades" % code)
    #break cyber comments into separate files, because of memory errors
    if code == "cyber":
        temp = {}        #temporary dictionary to hold a chunk of comments
        count = 0
        for comment_id, comment in comments.items():
            temp[comment_id] = comment
            count += 1
            if count % 1000000 == 0:
                file_utils.save_pickle(temp,
                    "data_cache/%s_cascades/%s_cascade_comments_%s.pkl"
                    % (code, code, count // 1000000))
                temp = {}
        #last save - leftover comments that didn't fill a complete chunk
        if len(temp) != 0:
            file_utils.save_pickle(temp,
                "data_cache/%s_cascades/%s_cascade_comments_%s.pkl"
                % (code, code, count // 1000000 + 1))
    else:
        file_utils.save_pickle(comments,
            "data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
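#The chunked cyber comment files written above have to be stitched back together at load time.
#A minimal sketch of how that could be done with glob, mirroring the pattern the cve merge
#script below uses for filtered cascades; the helper name and paths here are assumptions.
import glob

def load_cascade_comments_chunked(code):
    #merge all comment chunks for this domain code back into one dictionary
    comments = {}
    for chunk_file in sorted(glob.glob("data_cache/%s_cascades/%s_cascade_comments_*.pkl" % (code, code))):
        comments.update(file_utils.load_pickle(chunk_file))
    return comments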
def save_posts(code, posts):
    print("Loaded", len(posts), "posts, saving to data_cache/%s_posts.pkl" % code)
    if not os.path.exists("data_cache"):
        os.makedirs("data_cache")
    file_utils.save_pickle(posts, "data_cache/%s_posts.pkl" % code)
    print("   Posts saved")
vprint(" finished %d posts" % post_count) #dump results every 10%, to save memory if batch and post_count % dump_count == 0: vprint(" saving results so far (%d posts)" % post_count) #append new results to running csv functions_gen_cascade_model.save_results(outfile, all_metrics, observing_time) all_metrics.clear() #clear out what we already saved #and save pickle bookmark: set of finished posts and current status functions_gen_cascade_model.save_bookmark(finished_posts, outfile) #don't clear that list, want it to contain everything #all done, print final disconnected count vprint("Finished simulating %d posts" % post_count) if post_count == 0: vprint("\nNo posts simulated, no results to save\n") exit(0) #save metrics + settings to output file functions_gen_cascade_model.save_results(outfile, all_metrics, observing_time) #all done, update bookmark to "finished" functions_gen_cascade_model.save_bookmark(finished_posts, outfile, status=(True if len(finished_posts) == len(test_posts) else False)) #if outputting timestamps, dump to pickle (hackery) if output_timestamps: file_utils.save_pickle(timestamps, outfile+"_timestamps.pkl") vprint("All done, all results saved\n")
#load the pickle bookmark
bookmark = file_utils.load_pickle(bookmark_file)
#build a new one, including only the posts in our reduced target set
finished_set = set(post_id for post_id in bookmark['finished_posts'] if post_id in test_ids)
print("   ", len(finished_set), "posts finished in bookmark")

#save new bookmark - if it doesn't already exist (don't want to overwrite stuff!)
file_utils.save_pickle(
    {"finished_posts": finished_set,
     'complete': len(finished_set) == testing_num},
    bookmark_format % (subreddit, subreddit, model, training_num, testing_num,
                       testing_start_year, testing_start_month, file_run))

#edit the results csv to match this post set - if correctly sized results don't already exist
count = 0
first = True
with open(results_file, 'r') as inp, open(
        results_format % (subreddit, subreddit, model, training_num, testing_num,
                          testing_start_year, testing_start_month, file_run), 'w') as out:
    writer = csv.writer(out)
    for row in csv.reader(inp):
        if first:
def test(query_loader, query_flip_loader, test_loader, test_flip_loader,
         trainset_name, testset_name, epoch, verbose=False):
    cache_file = '{}/feat_cache-{}_to_{}.pkl'.format(exp_dir, trainset_name, testset_name)
    if args.use_feat_cache:
        assert os.path.exists(cache_file), \
            "Feature cache file {} does not exist!".format(cache_file)
        query_2, q_vis, query_flip_2, q_vis, test_2, test_vis, test_flip_2, test_vis, \
            q_ids, q_cams, g_ids, g_cams = load_pickle(cache_file)
    else:
        query_2, q_vis = extract_loader_feat(query_loader, verbose=verbose)
        query_flip_2, q_vis = extract_loader_feat(query_flip_loader, verbose=verbose)
        test_2, test_vis = extract_loader_feat(test_loader, verbose=verbose)
        test_flip_2, test_vis = extract_loader_feat(test_flip_loader, verbose=verbose)
        q_ids = query_loader.dataset.ids
        q_cams = query_loader.dataset.cameras
        g_ids = test_loader.dataset.ids
        g_cams = test_loader.dataset.cameras
        save_pickle([query_2, q_vis, query_flip_2, q_vis, test_2, test_vis, test_flip_2, test_vis,
                     q_ids, q_cams, g_ids, g_cams], cache_file)

    if args.test_which_feat > 0:
        # TODO: implement for pap
        idx = args.test_which_feat
        query_2 = query_2[:, 256 * idx - 256:256 * idx]
        query_flip_2 = query_flip_2[:, 256 * idx - 256:256 * idx]
        test_2 = test_2[:, 256 * idx - 256:256 * idx]
        test_flip_2 = test_flip_2[:, 256 * idx - 256:256 * idx]

    query = normalize(query_2 + query_flip_2)
    test = normalize(test_2 + test_flip_2)

    if verbose:
        print('query.shape:', query.shape)
        print('test.shape:', test.shape)
        if args.pap:
            print('q_vis.shape:', q_vis.shape)
            print('test_vis.shape:', test_vis.shape)

    if args.pap:
        dist_1 = compute_dist_with_visibility(query, test, q_vis, test_vis,
                                              dist_type='euclidean', avg_by_vis_num=False)
    else:
        dist_1 = cdist(query, test)
    r_1 = cmc(dist_1, q_ids, g_ids, q_cams, g_cams, separate_camera_set=False,
              single_gallery_shot=False, first_match_break=True)
    m_ap_1 = mean_ap(dist_1, q_ids, g_ids, q_cams, g_cams)
    print('EPOCH [%d] %s -> %s: mAP=%f, r@1=%f, r@3=%f, r@5=%f, r@10=%f'
          % (epoch + 1, trainset_name, testset_name, m_ap_1, r_1[0], r_1[2], r_1[4], r_1[9]))
import file_utils
import glob

code = "cve"

#load all cve cascades and comments
files = glob.glob('data_cache/filtered_cascades/cve_*_cascades.pkl')
posts = {}
for file in files:
    posts.update(file_utils.load_pickle(file))
files = glob.glob('data_cache/filtered_cascades/cve_*_comments.pkl')
comments = {}
for file in files:
    comments.update(file_utils.load_pickle(file))

#filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"    #domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"    #domain and subreddit comments

#save to same place as other filtered cascades - use cve as both domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))

#add cve to the subreddit -> domain mapping
subs = file_utils.load_pickle("model_files/domain_mapping.pkl")
if code not in subs:
    subs[code] = code
    file_utils.save_pickle(subs, "model_files/domain_mapping.pkl")
    set(cyber_subreddit_dist.keys())
], ["crypto", "cve", "cyber"])

#now, kill all the duplicates! crypto and cyber scraped entire subreddits,
#so any overlap is redundant and can be thrown away
#(yes, there are neater ways to do this, but I don't care!)
for item in subreddit_dict.keys():
    if len(subreddit_dict[item]) > 1:
        #crypto and cyber drown out cve, so remove it
        if ("crypto" in subreddit_dict[item] or "cyber" in subreddit_dict[item]) \
                and "cve" in subreddit_dict[item]:
            subreddit_dict[item].remove("cve")
    #collapse the remaining list to a single domain string
    subreddit_dict[item] = subreddit_dict[item][0]

#save as pickle for later
print("Saving subreddit->domain mapping to", subreddits_filepath)
file_utils.save_pickle(subreddit_dict, subreddits_filepath)

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

#loop all subreddits
for subreddit, domain in subreddit_dict.items():
    if subreddit != subreddit_filter:
        continue
    '''
    if domain != "crypto":
        continue
#extract and maintain user, tokens, and id for each processed post
if file_utils.verify_file(posts_filepath % code):
    print("Processed post file already exists.")
    posts = file_utils.load_pickle(posts_filepath % code)
else:
    c = count()
    posts = {key: {'user': value['author_h'],
                   'tokens': extract_tokens(value),
                   'id': next(c)}
             for key, value in cascades.items()}
    #save this to file
    file_utils.save_pickle(posts, posts_filepath % code)
    print("Saved", len(posts), "processed posts to", posts_filepath % code)

#build list of users active in this subreddit - list, not set, so more active users
#are more likely to get drawn in the simulation (see the sketch after this block)
if file_utils.verify_file(users_filepath % code):
    print("Active users exist in", users_filepath % code)
else:
    #build active users list
    active_users = []
    for post_id, post in cascades.items():
        active_users.append(post['author_h'])
    for comment_id, comment in comments.items():
        active_users.append(comment['author_h'])
    file_utils.save_pickle(active_users, users_filepath % code)
    print("Saved", len(active_users), "active users to", users_filepath % code)
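#Why active_users is a list rather than a set: duplicate entries act as frequency weights, so a
#uniform random draw favors more active users. A minimal illustration of such a draw (random.choice
#is an assumption about how the simulation samples users, not code taken from this project):
import random

active_users = ['alice', 'alice', 'alice', 'bob']    #alice appears 3 times, so ~75% of draws pick her
simulated_commenter = random.choice(active_users)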
#if we have cached cascades/comments for this set of posts, load them
if file_utils.verify_file("%s_test_cascades.pkl" % subreddit):
    cascades = file_utils.load_pickle("%s_test_cascades.pkl" % subreddit)
    comments = file_utils.load_pickle("%s_test_comments.pkl" % subreddit)
    print("Loaded", len(cascades), "filtered test cascades")
#otherwise, load filtered cascades for this subreddit and build the filtered list
else:
    all_cascades, all_comments = cascade_manip.load_filtered_cascades(domain, subreddit)
    seed_ids = [post['id_h'] for post in seeds]
    cascades = {post_id: post for post_id, post in all_cascades.items() if post_id in seed_ids}
    file_utils.save_pickle(cascades, "%s_test_cascades.pkl" % subreddit)
    cascades, comments = cascade_manip.filter_comments_by_posts(cascades, all_comments)
    file_utils.save_pickle(comments, "%s_test_comments.pkl" % subreddit)

#node2vec finished, on to the simulation!
#for each post, infer parameters and simulate
print("Simulating comment trees...")
for seed_post in seeds:
    #grab real post for this cascade
    test_post = cascades[seed_post['id_h']]
    '''
    #if we can, use fitted params
    if test_post['id'] in fitted_params:
        post_params = fitted_params[test_post['id']]