Example #1
    def clean_file(self, fname, field, remove, **kwargs):
        media = ('Media', 'Video', 'Image', 'Search', 'Sorry')
        attributes = [
            'caption', 'copyright', 'playback', 'episode', 'iPlayer', 'radio',
            'BBC2'
        ]

        doc = fx.load_pickle(self.directory + '\\' + fname)
        lines = doc[field]
        # print(lines)

        if 'split' in kwargs:
            lines = [
                line for line in lines.split('\n')
                if not line.startswith(media) or not any(x in line.split()
                                                         for x in attributes)
            ]
            doc[field] = '\n'.join(lines)

        if 'clean' in kwargs:
            # blank the whole field if the unwanted marker appears in it
            if remove in lines:
                doc[field] = ''

        # strip the unwanted substring from the (string) field value; operating on
        # doc[field] rather than lines avoids calling .replace() on the list
        # produced by the 'split' branch above
        doc[field] = doc[field].replace(remove, '')
        # print(doc['text'])
        fx.save_pickle(self.directory + '\\' + fname, doc)
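
Every example on this page leans on small pickle helpers (fx.save_pickle / fx.load_pickle here, file_utils.save_pickle / file_utils.load_pickle further down) whose implementations are not shown. Below is a minimal sketch of what such helpers might look like, assuming they are thin wrappers around Python's pickle module; note that the two modules appear to use different argument orders (fx.save_pickle(path, obj) versus file_utils.save_pickle(obj, path)).

import pickle

def save_pickle(obj, path):
    # serialize obj to path (file_utils-style argument order: object first)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(path):
    # read back whatever object was pickled at path
    with open(path, 'rb') as f:
        return pickle.load(f)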
Example #2
def process(in_posts, subreddit, year, month):
    posts = {}
    for post_id, raw_post in in_posts.items():

        #check for a good row; skip with a warning if something is amiss (probably a non-quoted body)
        if (raw_post['title_m'] is None or raw_post['subreddit'] is None
                or raw_post['created_utc'] is None
                or raw_post['author_h'] is None):
            print("Invalid post, skipping")
            continue

        #build new post dict
        post = {}
        post['tokens'] = functions_gen_cascade_model.extract_tokens(
            raw_post['title_m'])
        if post['tokens'] is False:  # token extraction failed, skip this post
            continue
        post['time'] = int(raw_post['created_utc'])
        post['author'] = raw_post['author_h']

        #add to overall post dict
        post_id = "t3_" + raw_post['id_h']  #post id with t3_ prefix
        posts[post_id] = post

    #save to pickle
    processed_posts_filepath = "reddit_data/%s/%s_processed_posts_%d_%d.pkl"
    file_utils.save_pickle(
        posts, processed_posts_filepath % (subreddit, subreddit, year, month))

    print("Processed %d posts for %d-%d" % (len(posts), month, year))

    return posts
Example #3
def save_comments(code, comments):
	#save all comments to pickle
	print ("Loaded", len(comments), "comments, saving to data_cache/%s_comments.pkl" % code)
	if not os.path.exists("data_cache"):
		os.makedirs("data_cache")
	#break cyber comments into 32 separate files, because memory error
	if code == "cyber":
		if not os.path.exists("data_cache/cyber_comments"):
			os.makedirs("data_cache/cyber_comments")
		for i in range(0, len(comments), 1000000):
			file_utils.save_pickle(comments[i:i+1000000], "data_cache/cyber_comments/%s_comments_%s.pkl" % (code, i//1000000))
	else:
		file_utils.save_pickle(comments, "data_cache/%s_comments.pkl" % code)
	print("   Comments saved")
Example #4
    def get_article_scrapper(self, driver, url, id):
        soup = scx.get_driver_soup(driver, url)
        text = soup.find("article")

        if text:
            doc = {
                'title': soup.find("h1").get_text(),
                'text': text.get_text(),
                'canonical_link': soup.find("link")['href']
            }

            fx.save_pickle(self.directory + '/' + str(id) + '.pkl', doc)
        else:
            self.log_errors('Page without article: ' + str(url))
Example #5
    def get_article(self, url, id):
        try:
            a = Article(str(url))
            a.download()
            a.parse()
            # keep only plain-data attributes of the Article object for pickling;
            # isinstance also covers dict subclasses such as collections.defaultdict,
            # which the original type() check could never match via a string literal
            doc = {
                attr: value
                for attr, value in a.__dict__.items()
                if not attr.startswith('__')
                and isinstance(value, (str, list, set, bool, int, dict))
            }
            fx.save_pickle(self.directory + '/' + str(id) + '.pkl', doc)

        except newspaper.article.ArticleException:
            self.log_errors('Error downloading: ' + str(url))
Example #6
def get_accounts(users, folder, id=True):

    collected = fx.get_fnames(folder)

    for user in users:
        if user not in collected:
            try:
                result = (user, get_user_account(user))
                fx.save_pickle('{}/{}'.format(folder, user), result)

            except TweepError as e:
                print('Could not retrieve info for user: {}'.format(user))
                print(e)
                # print('exception raised, waiting 15 minutes')
                # time.sleep(15*60)
                pass
Example #7
def save_cascades(code, cascades, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        print(
            "Saving cascades to data_cache/%s_cascades/%s_cascade_posts.pkl" %
            (code, code))
        file_utils.save_pickle(
            cascades,
            "data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print(
            "Saving filtered cascades to data_cache/filtered_cascades/%s_%s_cascades.pkl"
            % (code, filtered))
        file_utils.save_pickle(
            cascades, "data_cache/filtered_cascades/%s_%s_cascades.pkl" %
            (code, filtered))
Example #8
def save_cascade_params(code, cascade_params, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/fitted_params/")
        print(
            "Saving cascade params to data_cache/fitted_params/%s_cascade_params.pkl"
            % code)
        file_utils.save_pickle(
            cascade_params,
            "data_cache/fitted_params/%s_cascade_params.pkl" % code)
    else:
        file_utils.verify_dir("data_cache/fitted_params")
        print(
            "Saving filtered cascade params to data_cache/fitted_params/%s_%s_cascade_params.pkl"
            % (code, filtered))
        file_utils.save_pickle(
            cascade_params,
            "data_cache/fitted_params/%s_%s_cascade_params.pkl" %
            (code, filtered))
Example #9
    def get_corpus(self):

        files = fx.get_fnames(self.directory)

        # remove log file
        files.remove('errors')

        articles = []

        for fname in files:
            path = os.path.join(
                os.path.abspath(os.curdir) + '\\' + self.directory,
                fname + '.pkl')
            doc = fx.load_pickle(path)
            article = {'id': fname, 'title': doc['title'], 'text': doc['text']}
            articles.append(article)

        df = pd.DataFrame(articles)
        fx.save_pickle(os.path.join(self.corpus, self.domain + '.pkl'), df)
        pdx.save_to_csv(df, os.path.join(self.corpus, self.domain))
Example #10
    def get_corpus_weight(self, column):
        """
        Get weighted corpus dataframe according to column weight
        (count, favorites, retweets, is_bot)
        """
        df_corpus = fx.load_pickle(self.corpus + '/' + self.domain + '.pkl')

        df_weight = self.df.filter(['id', column], axis=1)

        df_corpus['id'] = df_corpus['id'].astype(int)
        df_weight['id'] = df_weight['id'].astype(int)

        df = pd.merge(df_corpus, df_weight, on='id')

        df = pd.DataFrame(np.repeat(df.values,
                                    df[column].replace(0, 1).tolist(),
                                    axis=0),
                          columns=df.columns)

        fx.save_pickle(
            os.path.join(self.corpus,
                         self.domain + '_' + str(column) + '.pkl'), df)
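
The weighting step above works by repeating each document's row as many times as its weight, after bumping zero weights to 1 so that no document disappears from the corpus. A small standalone illustration of that np.repeat pattern (toy data, not from the original project):

import numpy as np
import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'retweets': [0, 2, 3]})
repeats = df['retweets'].replace(0, 1).tolist()   # [1, 2, 3]
weighted = pd.DataFrame(np.repeat(df.values, repeats, axis=0),
                        columns=df.columns)
# the row with id 1 appears once, id 2 twice, id 3 three times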
Example #11
def save_comments(code, comments, filtered=False):
    if filtered == False:
        print(
            "Saving comments to data_cache/%s_cascades/%s_cascade_comments.pkl"
            % (code, code))
        #save all comments to pickle
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        #break cyber comments into separate files, because memory error
        if code == "cyber":
            temp = {}  #temporary dictionary to hold a chunk of comments
            count = 0
            for comment_id, comment in comments.items():
                temp[comment_id] = comment
                count += 1
                if count % 1000000 == 0:
                    file_utils.save_pickle(
                        temp,
                        "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" %
                        (code, code, count // 1000000))
                    temp = {}
            #last save: only if a partial chunk remains, with the index bumped
            #so the previous full chunk is not overwritten
            if temp:
                file_utils.save_pickle(
                    temp, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" %
                    (code, code, count // 1000000 + 1))
        else:
            file_utils.save_pickle(
                comments, "data_cache/%s_cascades/%s_cascade_comments.pkl" %
                (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print(
            "Saving filtered comments to data_cache/filtered_cascades/%s_%s_comments.pkl"
            % (code, filtered))
        file_utils.save_pickle(
            comments, "data_cache/filtered_cascades/%s_%s_comments.pkl" %
            (code, filtered))
Example #12
def save_cascade_comments(code, comments):
	#save all comments to pickle
	if not os.path.exists("data_cache/%s_cascades" % code):
		os.makedirs("data_cache/%s_cascades" % code)
	#break cyber comments into separate files, because memory error
	if code == "cyber":
		temp = {}		#temporary dictionary to hold a chunk of comments
		count = 0
		for comment_id, comment in comments.items():
			temp[comment_id] = comment
			count += 1
			if count % 1000000 == 0:
				file_utils.save_pickle(temp, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" % (code, code, count//1000000))
				temp = {}
		#last save: only if a partial chunk remains, with the index bumped so the previous full chunk is not overwritten
		if temp:
			file_utils.save_pickle(temp, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" % (code, code, count//1000000 + 1))
	else:
		file_utils.save_pickle(comments, "data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
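
An equivalent, slightly more compact way to write the dict chunking above is to slice the items iterator with itertools.islice. This is a sketch that writes the same chunk files, not the code the original author used, and save_in_chunks is a hypothetical name.

from itertools import islice

import file_utils

def save_in_chunks(comments, code, chunk_size=1000000):
    # walk the comment dict in fixed-size slices and pickle each slice separately
    it = iter(comments.items())
    index = 1
    while True:
        chunk = dict(islice(it, chunk_size))
        if not chunk:
            break
        file_utils.save_pickle(
            chunk, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" %
            (code, code, index))
        index += 1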
Example #13
def save_posts(code, posts):
	print("Loaded", len(posts), "posts, saving to data_cache/%s_posts.pkl" % code)
	if not os.path.exists("data_cache"):
		os.makedirs("data_cache")
	file_utils.save_pickle(posts, "data_cache/%s_posts.pkl" % code)
	print("   Posts saved")
Example #14
		vprint("   finished %d posts" % post_count)

	#dump results every 10%, to save memory
	if batch and post_count % dump_count == 0:
		vprint("   saving results so far (%d posts)" % post_count)
		#append new results to running csv
		functions_gen_cascade_model.save_results(outfile, all_metrics, observing_time)
		all_metrics.clear()		#clear out what we already saved
		#and save pickle bookmark: set of finished posts and current status
		functions_gen_cascade_model.save_bookmark(finished_posts, outfile)
		#don't clear that list, want it to contain everything

#all done, print final post count
vprint("Finished simulating %d posts" % post_count)

if post_count == 0:
	vprint("\nNo posts simulated, no results to save\n")
	exit(0)

#save metrics + settings to output file
functions_gen_cascade_model.save_results(outfile, all_metrics, observing_time)

#all done, update bookmark to "finished"
functions_gen_cascade_model.save_bookmark(finished_posts, outfile, status=(len(finished_posts) == len(test_posts)))

#if outputting timestamps, dump to pickle (hackery)
if output_timestamps:
	file_utils.save_pickle(timestamps, outfile+"_timestamps.pkl")

vprint("All done, all results saved\n")
Example #15
            #load the pickle bookmark
            bookmark = file_utils.load_pickle(bookmark_file)

            #build a new one, including only the posts in our reduced target set
            finished_set = {
                post_id for post_id in bookmark['finished_posts']
                if post_id in test_ids
            }
            print("     ", len(finished_set), "posts finished in bookmark")

            #save new bookmark - if doesn't already exist (don't want to overwrite stuff!)
            file_utils.save_pickle(
                {
                    "finished_posts": finished_set,
                    'complete': len(finished_set) == testing_num
                }, bookmark_format %
                (subreddit, subreddit, model, training_num, testing_num,
                 testing_start_year, testing_start_month, file_run))

            #edit the results csv to match this post set - if correctly sized results don't already exist
            count = 0
            first = True
            with open(results_file, 'r') as inp, open(
                    results_format %
                (subreddit, subreddit, model, training_num, testing_num,
                 testing_start_year, testing_start_month, file_run),
                    'w') as out:
                writer = csv.writer(out)
                for row in csv.reader(inp):
                    if first:
Example #16
    def test(query_loader,
             query_flip_loader,
             test_loader,
             test_flip_loader,
             trainset_name,
             testset_name,
             epoch,
             verbose=False):
        cache_file = '{}/feat_cache-{}_to_{}.pkl'.format(
            exp_dir, trainset_name, testset_name)
        if args.use_feat_cache:
            assert os.path.exists(
                cache_file), "Feature cache file {} does not exist!".format(
                    cache_file)
            # the cache stores q_vis and test_vis twice each, matching the
            # save_pickle list in the else branch below
            query_2, q_vis, query_flip_2, q_vis, test_2, test_vis, test_flip_2, test_vis, q_ids, q_cams, g_ids, g_cams = load_pickle(
                cache_file)
        else:
            query_2, q_vis = extract_loader_feat(query_loader, verbose=verbose)
            query_flip_2, q_vis = extract_loader_feat(query_flip_loader,
                                                      verbose=verbose)

            test_2, test_vis = extract_loader_feat(test_loader,
                                                   verbose=verbose)
            test_flip_2, test_vis = extract_loader_feat(test_flip_loader,
                                                        verbose=verbose)

            q_ids = query_loader.dataset.ids
            q_cams = query_loader.dataset.cameras
            g_ids = test_loader.dataset.ids
            g_cams = test_loader.dataset.cameras
            save_pickle([
                query_2, q_vis, query_flip_2, q_vis, test_2, test_vis,
                test_flip_2, test_vis, q_ids, q_cams, g_ids, g_cams
            ], cache_file)

        if args.test_which_feat > 0:
            # TODO: implement for pap
            idx = args.test_which_feat
            query_2 = query_2[:, 256 * idx - 256:256 * idx]
            query_flip_2 = query_flip_2[:, 256 * idx - 256:256 * idx]
            test_2 = test_2[:, 256 * idx - 256:256 * idx]
            test_flip_2 = test_flip_2[:, 256 * idx - 256:256 * idx]

        query = normalize(query_2 + query_flip_2)
        test = normalize(test_2 + test_flip_2)

        if verbose:
            print('query.shape:', query.shape)
            print('test.shape:', test.shape)
            if args.pap:
                print('q_vis.shape:', q_vis.shape)
                print('test_vis.shape:', test_vis.shape)

        if args.pap:
            dist_1 = compute_dist_with_visibility(query,
                                                  test,
                                                  q_vis,
                                                  test_vis,
                                                  dist_type='euclidean',
                                                  avg_by_vis_num=False)
        else:
            dist_1 = cdist(query, test)
        r_1 = cmc(dist_1,
                  q_ids,
                  g_ids,
                  q_cams,
                  g_cams,
                  separate_camera_set=False,
                  single_gallery_shot=False,
                  first_match_break=True)
        m_ap_1 = mean_ap(dist_1, q_ids, g_ids, q_cams, g_cams)
        print('EPOCH [%d] %s -> %s: mAP=%f, r@1=%f, r@3=%f, r@5=%f, r@10=%f' %
              (epoch + 1, trainset_name, testset_name, m_ap_1, r_1[0], r_1[2],
               r_1[4], r_1[9]))
Example #17
import file_utils
import glob

code = "cve"

#load all cve cascades and comments
files = glob.glob('data_cache/filtered_cascades/cve_*_cascades.pkl')
posts = {}
for file in files:
    posts.update(file_utils.load_pickle(file))
files = glob.glob('data_cache/filtered_cascades/cve_*_comments.pkl')
comments = {}
for file in files:
    comments.update(file_utils.load_pickle(file))

#filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"  #domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"  #domain and subreddit comments

#save to the same place as other filtered cascades - use cve as both domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))

#add cve to subreddit -> domain mapping
subs = file_utils.load_pickle("model_files/domain_mapping.pkl")
if code not in subs:
    subs[code] = code
    file_utils.save_pickle(subs, "model_files/domain_mapping.pkl")
Example #18
        set(cyber_subreddit_dist.keys())
    ], ["crypto", "cve", "cyber"])
    #now, kill all the duplicates! crypto and cyber scraped entire subreddits,
    #so any overlap is redundant and can be thrown away
    #(yes, there are neater ways to do this, but I don't care!)
    for item in subreddit_dict.keys():
        if len(subreddit_dict[item]) > 1:
            #crypto and cyber drown out cve, so remove it
            if ("crypto" in subreddit_dict[item] or "cyber"
                    in subreddit_dict[item]) and "cve" in subreddit_dict[item]:
                subreddit_dict[item].remove("cve")
        subreddit_dict[item] = subreddit_dict[item][0]

    #save as pickle for later
    print("Saving subreddit->domain mapping to", subreddits_filepath)
    file_utils.save_pickle(subreddit_dict, subreddits_filepath)

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

#loop all subreddits
for subreddit, domain in subreddit_dict.items():

    if subreddit != subreddit_filter:
        continue
    '''
	if domain != "crypto":
		continue
Example #19
#extract and save processed posts (user, tokens, id) - or load them if already cached
if file_utils.verify_file(posts_filepath % code):
    print("Processed post file already exists.")
    posts = file_utils.load_pickle(posts_filepath % code)
else:
    c = count()
    posts = {
        key: {
            'user': value['author_h'],
            'tokens': extract_tokens(value),
            'id': next(c)
        }
        for key, value in cascades.items()
    }
    #save this to file
    file_utils.save_pickle(posts, posts_filepath % code)
    print("Saved", len(posts), "processed posts to", posts_filepath % code)

#build list of users active in this subreddit - list, not set, so more active users are more likely to get drawn in the simulation
if file_utils.verify_file(users_filepath % code):
    print("Active users exist in", users_filepath % code)
else:
    #build active users list
    active_users = []
    for post_id, post in cascades.items():
        active_users.append(post['author_h'])
    for comment_id, comment in comments.items():
        active_users.append(comment['author_h'])
    file_utils.save_pickle(active_users, users_filepath % code)
    print("Saved", len(active_users), "active users to", users_filepath % code)
Example #20
    #if have cached cascades/comments for this set of posts, load them
    if file_utils.verify_file("%s_test_cascades.pkl" % subreddit):
        cascades = file_utils.load_pickle("%s_test_cascades.pkl" % subreddit)
        comments = file_utils.load_pickle("%s_test_comments.pkl" % subreddit)
        print("Loaded", len(cascades), "filtered test cascades")
    #otherwise, load filtered cascades for this subreddit, and build filtered list
    else:
        all_cascades, all_comments = cascade_manip.load_filtered_cascades(
            domain, subreddit)
        seed_ids = [post['id_h'] for post in seeds]
        cascades = {
            post_id: post
            for post_id, post in all_cascades.items() if post_id in seed_ids
        }
        file_utils.save_pickle(cascades, "%s_test_cascades.pkl" % subreddit)
        cascades, comments = cascade_manip.filter_comments_by_posts(
            cascades, all_comments)
        file_utils.save_pickle(comments, "%s_test_comments.pkl" % subreddit)

    #node2vec finished, on to the simulation!
    #for each post, infer parameters and simulate
    print("Simulating comment trees...")
    for seed_post in seeds:

        #grab the real post for this cascade
        test_post = cascades[seed_post['id_h']]
        '''
		#if we can, use fitted params
		if test_post['id'] in fitted_params:
			post_params = fitted_params[test_post['id']]