Example #1
def load_group_data(group):
    #read group -> domain mapping for later file loads
    domain_mapping = file_utils.load_pickle(domain_mapping_filepath)
    if group not in domain_mapping:
        print(group, "not in domain mapping - exiting.\n")
        exit(0)
    domain = domain_mapping[group]

    #load cascades and comments for this group
    print("")
    raw_posts = file_utils.load_pickle(cascades_filepath % (domain, group))
    raw_comments = file_utils.load_pickle(comments_filepath % (domain, group))
    print("Loaded", len(raw_posts), "posts and", len(raw_comments),
          "comments\n")

    return raw_posts, raw_comments
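
Nearly every example below goes through a file_utils.load_pickle / save_pickle pair. The project's own implementation is not shown here, but a minimal sketch of what such helpers usually look like (an assumption, matching the argument order used in the calls below) is:

import pickle

def load_pickle(filepath):
    # deserialize a single object from a pickle file
    with open(filepath, "rb") as f:
        return pickle.load(f)

def save_pickle(obj, filepath):
    # serialize an object to a pickle file (object first, path second, as in the examples)
    with open(filepath, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)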
Example #2
    def replace_links(self):
        """
        Find small files, recover the original link from the Google redirect
        notice, update the link in the DB, and crawl the links again
        """
        files = fx.get_fnames(self.directory)

        if 'errors' in files:
            # remove log file
            files.remove('errors')

        ids = []

        ids.extend([
            f for f in files
            if os.path.getsize(self.directory + '\\' + f + '.pkl') < 10000
        ])

        links = []
        for id in tqdm(ids):
            doc = fx.load_pickle(self.directory + '\\' + str(id) + '.pkl')
            links.append(rx.get_url(doc['text']))

        data = list(zip(ids, links))

        df = pd.DataFrame(data, columns=['id', 'real'])

        print(df)

        pdx.df_update_sql_field(self.db, self.table, 'id', 'real', df, 'TEXT')

        df = df[~df['real'].isnull()]

        for ix, row in df.iterrows():
            self.get_article(row['real'], row['id'])
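
The '\\' string concatenation above ties the path handling to Windows; a portable sketch of the same small-file filter, assuming the same one-pickle-per-id layout, could be:

import os

def small_file_ids(directory, max_bytes=10000):
    # ids of .pkl files smaller than max_bytes, built with os.path.join for portability
    ids = []
    for name in os.listdir(directory):
        stem, ext = os.path.splitext(name)
        if ext == '.pkl' and os.path.getsize(os.path.join(directory, name)) < max_bytes:
            ids.append(stem)
    return ids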
Example #3
def simulate_comment_tree(sim_post, sim_params, group, sim_comments,
                          time_observed):
    print("\nSimulating comment tree")

    #load active users list to draw from when assigning users to comments
    user_ids = file_utils.load_pickle(users_filepath % group)

    #simulate tree structure + comment times!
    print("Post created at", sim_post['created_utc'] / 60.0)
    #simulate from partially observed tree
    if time_observed != 0:
        #get alternate structure of observed tree
        observed_tree = convert_comment_tree(sim_post, sim_comments,
                                             time_observed)
        #simulate from this observed tree
        sim_root, all_times = sim_tree.simulate_comment_tree(
            sim_params, time_observed * 60, observed_tree)
    #simulate entirely new tree from root only
    else:
        sim_root, all_times = sim_tree.simulate_comment_tree(sim_params)

    #convert that to desired output format
    sim_events = functions_hybrid_model.build_cascade_events(
        sim_root, sim_post, user_ids, group)
    #sort list of events by time
    sim_events = sorted(sim_events, key=lambda k: k['nodeTime'])

    print("Generated",
          len(sim_events) - 1, "total comments for post", sim_post['id_h'],
          "(including observed)")
    print("   ", len(sim_comments), "actual\n")

    return sim_events, sim_root  #return events list, and dictionary format of simulated tree
Example #4
    def clean_file(self, fname, field, remove, **kwargs):
        media = ('Media', 'Video', 'Image', 'Search', 'Sorry')
        attributes = [
            'caption', 'copyright', 'playback', 'episode', 'iPlayer', 'radio',
            'BBC2'
        ]

        doc = fx.load_pickle(self.directory + '\\' + fname)
        lines = doc[field]
        # print(lines)

        if 'split' in kwargs:
            # drop lines that start with a media marker and contain an attribute word,
            # keeping the field a string so the replace below still works
            lines = '\n'.join(
                line for line in lines.split('\n')
                if not line.startswith(media) or not any(x in line.split()
                                                         for x in attributes))
            doc[field] = lines

        if 'clean' in kwargs and remove in lines:
            # drop the whole field when the unwanted marker is present
            doc[field] = ''
        else:
            # otherwise strip just the marker text
            doc[field] = lines.replace(remove, '')
        # print(doc['text'])
        fx.save_pickle(self.directory + '\\' + fname, doc)
Example #5
    def clean_directory(self, **kwargs):
        """
        Remove files smaller than 10 kB and/or files from other domains
        """
        files = fx.get_fnames(self.directory)

        if 'errors' in files:
            # remove log file
            files.remove('errors')

        files_to_remove = []

        if 'clear_small' in kwargs:
            ids = [
                f for f in files
                if os.path.getsize(self.directory + '\\' + f + '.pkl') < 10000
            ]
            files_to_remove.extend(
                [self.directory + '\\' + f + '.pkl' for f in ids])
            dbx.delete_rows(self.db, self.table, 'id', ids)

            if len(files_to_remove) > 0:
                fx.delete_files(files_to_remove)

        # verify if links correspond to domain
        if 'save' in kwargs:
            data = []
            for id in tqdm(files):
                doc = fx.load_pickle(self.directory + '\\' + id + '.pkl')
                link = doc['canonical_link']
                data.append((id, link))

            df = pd.DataFrame(data, columns=['id', 'real'])
            pdx.save_to_csv(df, os.path.join(self.corpus, self.domain))
Example #6
    def __init__(self,
                 root,
                 transform=None,
                 target_transform=None,
                 loader=default_loader,
                 training=None,
                 kpt_file=None,
                 ps_dir=None,
                 re_obj=None,
                 ps_w_h=(16, 48),
                 ps_fuse_type='None'):
        self.root = root
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

        self.imgs = [
            path for path in list_pictures(self.root) if self.id(path) != -1
        ]

        # convert person id to softmax continuous label
        self._id2label = {_id: idx for idx, _id in enumerate(self.unique_ids)}
        self.training = training
        self.im_path_to_kpt = load_pickle(
            kpt_file) if kpt_file is not None else None
        self.ps_dir = ps_dir
        self.re_obj = re_obj
        self.ps_w_h = ps_w_h
        self.ps_fuse_type = ps_fuse_type
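
The _id2label comprehension above maps arbitrary person ids to contiguous indices so they can serve as softmax class labels; in isolation (toy ids, not from any dataset):

unique_ids = [3, 17, 42]
_id2label = {_id: idx for idx, _id in enumerate(unique_ids)}
# _id2label == {3: 0, 17: 1, 42: 2}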
Example #7
def load_cached_posts(code):
	if os.path.exists("data_cache/%s_posts.pkl" % code):
		#load from pickle
		print("Loading posts from data_cache")
		posts = file_utils.load_pickle("data_cache/%s_posts.pkl" % code)
		print("   Loaded", len(posts), "posts")
		return posts
	else:
		return False
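
A caller would typically treat the False return as a cache miss; a usage sketch (load_raw_posts is a hypothetical fallback loader, not a function shown in these examples):

code = "crypto"
posts = load_cached_posts(code)
if posts is False:
    # cache miss: build the posts some other way, then cache them for next time
    posts = load_raw_posts(code)          # hypothetical fallback, not from this project
    file_utils.save_pickle(posts, "data_cache/%s_posts.pkl" % code)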
Example #8
def load_filtered_cascades(code, subreddit):
    #if files don't exist, quit
    if os.path.exists("data_cache/filtered_cascades/%s_%s_comments.pkl" %
                      (code, subreddit)) == False or os.path.exists(
                          "data_cache/filtered_cascades/%s_%s_cascades.pkl" %
                          (code, subreddit)) == False:
        print("No saved filtered cascades")
        return False, False

    print("Loading", subreddit, "posts and comments from cache...")

    #load from file
    cascades = file_utils.load_pickle(
        "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, subreddit))
    comments = file_utils.load_pickle(
        "data_cache/filtered_cascades/%s_%s_comments.pkl" % (code, subreddit))

    print("   Loaded", len(cascades), "posts and", len(comments), "comments")

    return cascades, comments
Example #9
def load_cached_comments(code):
	#load comments, either from cached pickle or directly from data
	if os.path.exists("data_cache/%s_comments.pkl" % code):
		#load from pickle
		print("Loading comments from data_cache")
		comments = file_utils.load_pickle("data_cache/%s_comments.pkl" % code)
		print("   Loaded", len(comments))
		return comments
	elif code == "cyber"  and os.path.exists("data_cache/cyber_comments"):
		#load from multiple pickles
		print("Loading comments from data_cache")
		comments = []
		files = sorted(glob.glob('data_cache/cyber_comments/*'))
		for file in files:
			print("   Loading", file)
			new_comments = file_utils.load_pickle(file)
			comments.extend(new_comments)
		print("   Loaded", len(comments), "comments")
		return comments
	else:
		return False
Example #10
    def __init__(self, transform=None, target_transform=None, loader=default_loader, training=None, use_kpt=False, use_ps=False, split='train'):
        self.root = 'msmt17'
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

        self.imgs = self._get_im_paths(split)

        # convert person id to softmax continuous label
        self._id2label = {_id: idx for idx, _id in enumerate(self.unique_ids)}
        self.training = training
        self.im_path_to_kpt = load_pickle(osp.join(self.root, 'im_path_to_kpt.pkl')) if use_kpt else None
        self.ps_dir = osp.join(self.root, 'MSMT17_V1_ps_label') if use_ps else None
Example #11
def load_cascade_params(code, filtered=False, display=True):
    if filtered == False:
        filename = "data_cache/fitted_params/%s_cascade_params.pkl" % code
    else:
        filename = "data_cache/fitted_params/%s_%s_cascade_params.pkl" % (
            code, filtered)

    if os.path.exists(filename) == False:
        if display:
            print("No saved cascade parameters - exiting")
        exit(0)
    else:
        if display:
            print("Loading cascade parameters from cache:", filename)
        params = file_utils.load_pickle(filename)

    return params
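
For reference, the two call patterns the filtered argument supports (filenames follow the % templates in the function; the values here are illustrative):

params_all = load_cascade_params("crypto")                       # data_cache/fitted_params/crypto_cascade_params.pkl
params_sub = load_cascade_params("crypto", filtered="Bitcoin")   # data_cache/fitted_params/crypto_Bitcoin_cascade_params.pkl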
Example #12
    def __init__(self, transform=None, target_transform=None, loader=default_loader, training=None, use_kpt=False, ps_dir=None, split='train', re_obj=None,
                 ps_w_h=(16, 48), ps_fuse_type='None'):
        self.root = 'msmt17'
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

        self.imgs = self._get_im_paths(split)

        # convert person id to softmax continuous label
        self._id2label = {_id: idx for idx, _id in enumerate(self.unique_ids)}
        self.training = training
        self.im_path_to_kpt = load_pickle(osp.join(self.root, 'im_path_to_kpt.pkl')) if use_kpt else None
        self.ps_dir = ps_dir
        self.re_obj = re_obj
        self.ps_w_h = ps_w_h
        self.ps_fuse_type = ps_fuse_type
Example #13
def load_filtered_posts(code, subreddit):
    #if files don't exist, quit
    if os.path.exists("data_cache/filtered_cascades/%s_%s_cascades.pkl" %
                      (code, subreddit)) == False:
        print("No saved filtered posts")
        return False

    print("Loading", subreddit, "posts from cache...")

    #load from file
    posts = file_utils.load_pickle(
        "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, subreddit))

    print("   Loaded", len(posts), "posts")

    return posts


#end load_filtered_posts
Example #14
    def get_corpus(self):

        files = fx.get_fnames(self.directory)

        # remove log file
        files.remove('errors')

        articles = []

        for fname in files:
            path = os.path.join(
                os.path.abspath(os.curdir) + '\\' + self.directory,
                fname + '.pkl')
            doc = fx.load_pickle(path)
            article = {'id': fname, 'title': doc['title'], 'text': doc['text']}
            articles.append(article)

        df = pd.DataFrame(articles)
        fx.save_pickle(os.path.join(self.corpus, self.domain + '.pkl'), df)
        pdx.save_to_csv(df, os.path.join(self.corpus, self.domain))
Example #15
    def get_corpus_weight(self, column):
        """
        Get weighted corpus dataframe according to column weight
        (count, favorites, retweets, is_bot)
        """
        df_corpus = fx.load_pickle(self.corpus + '/' + self.domain + '.pkl')

        df_weight = self.df.filter(['id', column], axis=1)

        df_corpus['id'] = df_corpus['id'].astype(int)
        df_weight['id'] = df_weight['id'].astype(int)

        df = pd.merge(df_corpus, df_weight, on='id')

        df = pd.DataFrame(np.repeat(df.values,
                                    df[column].replace(0, 1).tolist(),
                                    axis=0),
                          columns=df.columns)

        fx.save_pickle(
            os.path.join(self.corpus,
                         self.domain + '_' + str(column) + '.pkl'), df)
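
The np.repeat call above duplicates each corpus row according to its weight column (with zero weights bumped to one); the trick in isolation, on toy data:

import numpy as np
import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'text': ['a', 'b'], 'count': [2, 0]})
weights = df['count'].replace(0, 1).tolist()   # zero-weight rows still keep one copy
weighted = pd.DataFrame(np.repeat(df.values, weights, axis=0), columns=df.columns)
# the id=1 row now appears twice, the id=2 row once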
Example #16
#process each model separately
for model, curr_filename in [("model", model_filename),
                             ("comp", comp_filename),
                             ("rand_tree", rand_tree_filename),
                             ("rand_sim", rand_sim_filename),
                             ("avg_sim", avg_filename)]:

    print("Processing", model, "and", subreddit)

    #process each subreddit
    for run in range(5):
        print("  run", run)

        #load timestamps data
        timestamps = file_utils.load_pickle(curr_filename %
                                            (subreddit, subreddit, run))

        #loop posts
        for post_id, post_timestamps in timestamps.items():

            #grab true timestamps
            true_timestamps = post_timestamps['true']

            #loop observation times
            for time in post_timestamps.keys():
                #skip true
                if time == "true": continue

                #pull sim comment times
                sim_timestamps = post_timestamps[time]
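
From the loop above, each timestamps pickle is evidently a dict of post id -> {'true' or observation time -> list of comment times}; a toy value with that shape (illustrative only, units not specified here):

timestamps = {
    'post_abc': {
        'true': [1.5, 3.0, 7.25],    # actual comment times for the post
        0.5: [1.5],                  # comment times simulated after observing for 0.5
        1.0: [1.5, 2.8, 6.9],
    },
}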
Example #17
import file_utils
import cascade_manip
import fit_cascade

code = "crypto"
subreddit = "Bitcoin"

#load cascades
#cascades, comments = cascade_manip.load_filtered_cascades(code, subreddit)

#check the fits - how many bad initialization params are left?
#load the param pickle
cascade_params = file_utils.load_pickle(
    "data_cache/fitted_params/%s_%s_cascade_params.pkl" % (code, subreddit))
print("Loaded", len(cascade_params), "fitted params")

fail_count = 0
for post_id, params in cascade_params.items():
    #print(post_id, params)

    if params[0] == 20 and params[1] == 500 and params[2] == 2.3:
        print("FIT FAIL", post_id, params)
        fail_count += 1
        '''
		#try to fit this cascade again, get a read on what caused the failure
		print("old params", params)
		post = cascades[post_id]
		junk, post_comments = cascade_manip.filter_comments_by_posts({post_id : post}, comments)
		new_params = fit_cascade.fit_cascade_model(post, post_comments)
		print("new params", new_params)
		'''
Example #18
    if subreddit != sub_filter:
        continue

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #what is the graph limit for this subreddit?
    if subreddit in sub_limits:
        max_nodes = sub_limits[subreddit]
        print("Max graph size for this subreddit:", max_nodes)
    else:
        max_nodes = default_max_nodes
        print("Using default max graph size:", max_nodes)

    #load preprocessed posts for this subreddit
    if file_utils.verify_file(posts_filepath % subreddit):
        posts = file_utils.load_pickle(posts_filepath % subreddit)
        print("Loaded", len(posts), "processed posts from",
              posts_filepath % subreddit)
    else:
        print("Cannot simulate for subreddit", subreddit,
              "without processed posts file", posts_filepath % subreddit)
        exit(0)

    #find highest assigned post id for this data, so we know where to assign new ids if we need to
    next_id = max([value['id'] for key, value in posts.items()]) + 1

    #do we need to build a graph and infer at all? loop to find out
    infer = False
    infer_count = 0

    #also fetch/assign numeric ids to seed posts
Example #19
#partition these into ~8 chunks
count = 0
for chunk_cascades in chunks(cascades):
	print("Chunked to", len(chunk_cascades), "posts")
	junk, chunk_comments = cascade_manip.filter_comments_by_posts(chunk_cascades, comments)
	file_utils.save_pickle(chunk_cascades, "pcmasterrace/chunk_cascades_%s.pkl" % count)
	file_utils.save_pickle(chunk_comments, "pcmasterrace/chunk_comments_%s.pkl" % count)
	count += 1
exit(0)
'''

#once those chunks are created - call multiple instances, one per chunk

#load chunked posts/comments
print("Loading chunk")
cascades = file_utils.load_pickle("pcmasterrace/chunk_cascades_%s.pkl" % count)
comments = file_utils.load_pickle("pcmasterrace/chunk_comments_%s.pkl" % count)
print("Loaded", len(cascades), "posts and", len(comments), "comments")

#fit params to all cascades
all_params = cascade_analysis.fit_all_cascades(domain, cascades, comments,
                                               False, subreddit)

#load processed posts
posts = file_utils.load_pickle(posts_filepath % subreddit)

#save to text file now
with open(params_filepath % count, "w") as f:
    for post_id, params in all_params.items():
        f.write(str(posts[post_id]['id']) + " ")  #write numeric post id
        for i in range(len(params)):
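
The chunks() helper driving the partitioning loop near the top of this example is not shown; a generic sketch that splits the cascades dict into roughly equal pieces (an assumption, not the project's implementation) might be:

def chunks(cascades, n_chunks=8):
    # yield the cascade dict as n_chunks roughly equal sub-dicts
    items = list(cascades.items())
    size = max(1, -(-len(items) // n_chunks))    # ceiling division
    for start in range(0, len(items), size):
        yield dict(items[start:start + size])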
Example #20
                 file_run)) and file_utils.verify_file(
                     results_format %
                     (subreddit, subreddit, model, training_num, testing_num,
                      testing_start_year, testing_start_month, file_run)):
                print("   target exists, skipping", results_file)
                continue

            print("   reducing", results_file)

            #get corresponding bookmark filename
            bookmark_file = bookmark_format % (
                subreddit, subreddit, model, training_num, file_test_size,
                testing_start_year, testing_start_month, file_run)

            #load the pickle bookmark
            bookmark = file_utils.load_pickle(bookmark_file)

            #build a new one, including only the posts in our reduced target set
            finished_set = set([
                post_id for post_id in bookmark['finished_posts']
                if post_id in test_ids
            ])
            print("     ", len(finished_set), "posts finished in bookmark")

            #save new bookmark - if doesn't already exist (don't want to overwrite stuff!)
            file_utils.save_pickle(
                {
                    "finished_posts": finished_set,
                    'complete':
                    True if len(finished_set) == testing_num else False
                }, bookmark_format %
Example #21
        comments["t1_" + comment_id] = comment

    print("Total of %d comments for %d-%d posts" %
          (len(comments), post_month, post_year))
    return comments


#end load_comments

#---MAIN BEGINS HERE---#

domain = "crypto"
subreddit = "Lisk"

#load crypto subreddit data - reconstructed cascades
posts = file_utils.load_pickle(
    "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (domain, subreddit))
comments = file_utils.load_pickle(
    "data_cache/filtered_cascades/%s_%s_comments.pkl" % (domain, subreddit))
print("Read %d posts and %d comments" % (len(posts), len(comments)))

#artificial month partitioning - half the posts to 8/16, half to 9/16 (date doesn't matter)
august_posts = dict(list(posts.items())[int(len(posts) / 2):])
september_posts = dict(list(posts.items())[:int(len(posts) / 2)])
print("Split to %d posts for 8-16 and %d posts for 9-16" %
      (len(august_posts), len(september_posts)))

#convert both post sets
file_utils.verify_dir("reddit_data/%s" % subreddit)
august_posts = process(august_posts, subreddit, 2016, 8)
september_posts = process(september_posts, subreddit, 2016, 9)
Example #22
import file_utils
import cascade_analysis
import cascade_manip

import glob


code = "cyber"

#load cascades and comments from pickle
#cascades, comments, missing_posts, missing_comments = build_cascades(code, posts = False, comments = False)

print("Loading cascades from data_cache")
cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))

#comments: across multiple files
print("Loading comments from data_cache")		
comments = {}
files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
for file in files:
	print("Loading", file)
	new_comments = file_utils.load_pickle(file)
	comments.update(new_comments)

#missing posts and comments
missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

#yay! loaded
print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
Example #23
import file_utils
import glob

code = "cve"

#load all cve cascades and comments
files = glob.glob('data_cache/filtered_cascades/cve_*_cascades.pkl')
posts = {}
for file in files:
    posts.update(file_utils.load_pickle(file))
files = glob.glob('data_cache/filtered_cascades/cve_*_comments.pkl')
comments = {}
for file in files:
    comments.update(file_utils.load_pickle(file))

#filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"  #domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"  #domain and subreddit comments

#save to same place as other filtered cascades - use hackernews as domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))

#add cve to subreddit -> domain mapping
subs = file_utils.load_pickle("model_files/domain_mapping.pkl")
if code not in subs:
    subs[code] = code
    file_utils.save_pickle(subs, "model_files/domain_mapping.pkl")
Example #24
def build_cascades(code, posts = False, comments = False):
	#if cascades already exist, read from cache
	if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) and (os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)) or os.path.exists("data_cache/%s_cascades/%s_cascade_comments_1.pkl" % (code, code))):
		#load from pickle
		print("Loading cascades from data_cache")
		cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
		#comments: either a single file, or multiple files
		print("Loading comments from data_cache")
		if os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)):
			comments = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
		else:			
			comments = {}
			files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
			for file in files:
				print("Loading", file)
				new_comments = file_utils.load_pickle(file)
				comments.update(new_comments)
		missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
		missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))
		print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
		print("     ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")
		return cascades, comments, missing_posts, missing_comments

	#if no cached cascades, build them from scratch

	#if no loaded posts/comments, load those up first
	if posts == False or comments == False:
		posts, comments = load_model_data.load_reddit_data(code)

	print("Extracting post/comment structure for", len(posts), "posts and", len(comments), "comments")

	#add replies field to all posts/comments, init to empty list
	data_utils.add_field(posts, "replies", [])
	data_utils.add_field(comments, "replies", [])
	#add placeholder field to all posts/comments, flag indicates if we created a dummy object
	data_utils.add_field(posts, 'placeholder', False)
	data_utils.add_field(comments, 'placeholder', False)

	#add comment_count field to all post objects as well: count total number of comments all the way down the cascade
	data_utils.add_field(posts, "comment_count_total", 0)
	#and direct replies only
	data_utils.add_field(posts, "comment_count_direct", 0)
	#and add a missing_comments field to all post objects: set True if we find any missing comments in this cascade
	data_utils.add_field(posts, "missing_comments", False)

	#grab list of fields for each type of object (used to create placeholders when items are missing)
	post_fields = list(posts[0].keys())
	comment_fields = list(comments[0].keys())

	'''
	id_h = post/comment id
	parent_id_h = direct parent
	link_id_h = post parent
	if a parent_id starts with t1_, you remove t1_ and match the rest against a comment id 
	if it starts with t3_, you remove t3_ and match the rest against a submission id.
	linked_id always starts with t3_, since it always points to a submission.
	'''

	#create dictionary of post id -> post object to store cascades
	cascades = data_utils.list_to_dict(posts, "id_h")

	#convert list of comments to dictionary, where key is comment id
	comments = data_utils.list_to_dict(comments, "id_h")

	#now that we can find posts and comments at will, let's build the dictionary!
	
	#loop all comments, assign to immediate parent and increment comment_count of post parent
	comment_count = 0
	missing_comments = set()	#missing comments
	missing_posts = set()		#missing posts
	for comment_id in list(comments.keys()):

		#get immediate parent (post or comment)
		direct_parent = comments[comment_id]['parent_id_h'][3:]
		direct_parent_type = "post" if comments[comment_id]['parent_id_h'][:2] == "t3" else "comment"
		#get post parent
		post_parent = comments[comment_id]['link_id_h'][3:]
		comment_count += 1

		#add this comment to replies list of immediate parent, and update counters on post_parent
		try:
			#if post parent missing, create placeholder
			if post_parent not in cascades:
				cascades[post_parent] = create_object(post_parent, post_fields)
				missing_posts.add(post_parent)

			#update overall post comment count for this new comment
			cascades[post_parent]['comment_count_total'] += 1

			#now handle direct parent, post or comment
			#parent is post
			if direct_parent_type == "post":
				#missing post, create placeholder to hold replies
				if direct_parent not in cascades:
					cascades[direct_parent] = create_object(direct_parent, post_fields)
					missing_posts.add(direct_parent)
				#add this comment to replies field of post (no total comment increment, done above)
				cascades[direct_parent]['replies'].append(comment_id)
				#add 1 to direct comment count field
				cascades[direct_parent]['comment_count_direct'] += 1

			#parent is comment
			else:	
				#missing comment, create placeholder to contain replies, point to parent post by default
				if direct_parent not in comments:
					comments[direct_parent] = create_object(direct_parent, comment_fields)
					#point this placeholder comment to the top-level post
					comments[direct_parent]['link_id_h'] = post_parent
					comments[direct_parent]['parent_id_h'] = post_parent
					#add manufactured comment to counters
					cascades[post_parent]['comment_count_total'] += 1
					cascades[post_parent]['comment_count_direct'] += 1	
					#and add to replies	
					cascades[post_parent]['replies'].append(direct_parent)	
					#flag this cascade as containing missing comments
					cascades[post_parent]['missing_comments'] = True	
					missing_comments.add(direct_parent)		#add comment to list of missing
				#add current comment to replies field of parent comment
				comments[direct_parent]['replies'].append(comment_id)
		except:
			print("FAIL")
			print(len(missing_posts), "posts")
			print(len(missing_comments), "comments")
			for field in comments[comment_id]:
				if field != "replies":
					print(field, comments[comment_id][field])
			exit(0)

	print("\nProcessed", comment_count,  "comments in", len(cascades), "cascades")
	print("   ", len(missing_posts), "missing posts")
	print("   ", len(missing_comments), "missing comments")
	print("   ", len([x for x in cascades if cascades[x]['missing_comments']]), "cascades with missing comments")

	#verify the above process, a couple different ways

	#count comments from parent counters across all cascades
	'''
	total_comments = 0
	for post_id, post in cascades.items():
		total_comments += post['comment_count']
	print(total_comments, "from post counters")
	'''

	#traverse each cascade and count comments, check against stored comment count
	'''
	for post_id, post in cascades.items():
		traverse_comments = traverse_cascade(post, comments)
		if traverse_comments != post['comment_count']:
			print("post counter says", post['comment_count'], "comments, but traversal says", traverse_comments)
	'''

	#save cascades for later loading
	cascade_manip.save_cascades(code, cascades)				#cascades
	cascade_manip.save_comments(code, comments)		#comments
	file_utils.save_json(list(missing_posts), "data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
	file_utils.save_json(list(missing_comments), "data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

	return cascades, comments, missing_posts, missing_comments
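
The t1_/t3_ prefix handling buried in the loop above can be read as a small helper; a sketch based on the comment block inside the function (not code from the project):

def parse_parent(fullname):
    # Reddit-style fullnames: "t1_<id>" points at a comment, "t3_<id>" at a post/submission
    kind = "comment" if fullname.startswith("t1_") else "post"
    return kind, fullname[3:]

# parse_parent("t3_abc123") == ("post", "abc123")
# parse_parent("t1_def456") == ("comment", "def456")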
Example #25
def graph_infer(sim_post, sim_post_id, group, max_nodes, min_node_quality,
                estimate_initial_params):
    print("Inferring post parameters from post graph")

    #load preprocessed posts for this group
    if file_utils.verify_file(posts_filepath % group):
        posts = file_utils.load_pickle(posts_filepath % group)
        print("Loaded", len(posts), "processed posts from",
              posts_filepath % group)
    else:
        print("Cannot simulate for group", group,
              "without processed posts file", posts_filepath % group)
        exit(0)

    #if seed post not in posts file - we're gonna have a bad time
    if sim_post['id_h'] not in posts:
        print("Simulation post not in dataset - exiting\n")
        exit(0)

    #grab numeric/graph id of sim post
    numeric_sim_post_id = posts[sim_post_id]['id']

    #load in fitted simulation params - need these for graph build
    fitted_params, fitted_quality = functions_hybrid_model.load_params(
        params_filepath % group, posts, False, True)

    #remove sim post from graph params - no cheating! (pop based on numeric id)
    res = fitted_params.pop(numeric_sim_post_id)
    res = fitted_quality.pop(numeric_sim_post_id)

    #graph stuff - sample graph if necessary, add new nodes, etc
    graph = {}
    isolated_nodes = []
    added_count = 0

    #do we need to sample/process the graph? sample if whole graph too big, imposing a min node quality, need to estimate initial params, or we don't have a precomputed graph file
    if (max_nodes != None
            and len(posts) > max_nodes) or file_utils.verify_file(
                graph_filepath % group
            ) == False or min_node_quality != None or estimate_initial_params:

        #only sample down if we actually have to
        if max_nodes != None:
            print("\nSampling graph to", max_nodes, "nodes")
            #sample down posts
            graph_posts = user_sample_graph(posts, [sim_post], max_nodes,
                                            group, min_node_quality,
                                            fitted_quality)
        #otherwise, use them all
        else:
            graph_posts = posts

        #build graph, getting initial param estimate if required
        if estimate_initial_params:
            estimated_params = functions_hybrid_model.build_graph_estimate_node_params(
                graph_posts, fitted_params, fitted_quality,
                numeric_sim_post_id, temp_graph_filepath % group)
        else:
            functions_hybrid_model.build_graph(graph_posts,
                                               temp_graph_filepath % group)

    #no graph sampling/processing, use the full set and copy graph file to temp location
    else:
        graph_posts = posts
        copyfile(graph_filepath % group, temp_graph_filepath % group)
        print("Copied complete post-graph to", temp_graph_filepath % group)

    #ALWAYS sample down params to match whatever graph we have - because we can't use the previously fitted params!
    if estimate_initial_params:
        functions_hybrid_model.get_graph_params(graph_posts,
                                                numeric_sim_post_id,
                                                fitted_params, fitted_quality,
                                                temp_params_filepath % group,
                                                estimated_params)
    else:
        functions_hybrid_model.get_graph_params(graph_posts,
                                                numeric_sim_post_id,
                                                fitted_params, fitted_quality,
                                                temp_params_filepath % group)

    #graph is built and ready - graph file and input params file

    #run node2vec to get embeddings - if we have to infer parameters
    #offload to C++, because I feel the need... the need for speed!:

    if file_utils.verify_file(output_params_filepath % group):
        os.remove(output_params_filepath %
                  group)  #clear output to prevent append

    #run node2vec on graph and params
    subprocess.check_call([
        "./c_node2vec/examples/node2vec/node2vec",
        "-i:" + (temp_graph_filepath % group),
        "-ie:" + (temp_params_filepath % group),
        "-o:" + (output_params_filepath % group), "-d:6", "-l:3", "-w", "-s",
        "-otf"
    ])
    print("")

    #load the inferred params (dictionary of numeric id -> params)
    all_inferred_params = functions_hybrid_model.load_params(
        output_params_filepath % group, posts, inferred=True)
    inferred_params = all_inferred_params[numeric_sim_post_id]

    return inferred_params
Example #26
    def test(query_loader,
             query_flip_loader,
             test_loader,
             test_flip_loader,
             trainset_name,
             testset_name,
             epoch,
             verbose=False):
        cache_file = '{}/feat_cache-{}_to_{}.pkl'.format(
            exp_dir, trainset_name, testset_name)
        if args.use_feat_cache:
            assert os.path.exists(
                cache_file), "Feature cache file {} does not exist!".format(
                    cache_file)
            query_2, q_vis, query_flip_2, q_vis, test_2, test_vis, test_flip_2, test_vis, q_ids, q_cams, g_ids, g_cams = load_pickle(
                cache_file)
        else:
            query_2, q_vis = extract_loader_feat(query_loader, verbose=verbose)
            query_flip_2, q_vis = extract_loader_feat(query_flip_loader,
                                                      verbose=verbose)

            test_2, test_vis = extract_loader_feat(test_loader,
                                                   verbose=verbose)
            test_flip_2, test_vis = extract_loader_feat(test_flip_loader,
                                                        verbose=verbose)

            q_ids = query_loader.dataset.ids
            q_cams = query_loader.dataset.cameras
            g_ids = test_loader.dataset.ids
            g_cams = test_loader.dataset.cameras
            save_pickle([
                query_2, q_vis, query_flip_2, q_vis, test_2, test_vis,
                test_flip_2, test_vis, q_ids, q_cams, g_ids, g_cams
            ], cache_file)

        if args.test_which_feat > 0:
            # TODO: implement for pap
            idx = args.test_which_feat
            query_2 = query_2[:, 256 * idx - 256:256 * idx]
            query_flip_2 = query_flip_2[:, 256 * idx - 256:256 * idx]
            test_2 = test_2[:, 256 * idx - 256:256 * idx]
            test_flip_2 = test_flip_2[:, 256 * idx - 256:256 * idx]

        query = normalize(query_2 + query_flip_2)
        test = normalize(test_2 + test_flip_2)

        if verbose:
            print('query.shape:', query.shape)
            print('test.shape:', test.shape)
            if args.pap:
                print('q_vis.shape:', q_vis.shape)
                print('test_vis.shape:', test_vis.shape)

        if args.pap:
            dist_1 = compute_dist_with_visibility(query,
                                                  test,
                                                  q_vis,
                                                  test_vis,
                                                  dist_type='euclidean',
                                                  avg_by_vis_num=False)
        else:
            dist_1 = cdist(query, test)
        r_1 = cmc(dist_1,
                  q_ids,
                  g_ids,
                  q_cams,
                  g_cams,
                  separate_camera_set=False,
                  single_gallery_shot=False,
                  first_match_break=True)
        m_ap_1 = mean_ap(dist_1, q_ids, g_ids, q_cams, g_cams)
        print('EPOCH [%d] %s -> %s: mAP=%f, r@1=%f, r@3=%f, r@5=%f, r@10=%f' %
              (epoch + 1, trainset_name, testset_name, m_ap_1, r_1[0], r_1[2],
               r_1[4], r_1[9]))
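
The feature-cache branch at the top of test() follows a generic load-or-compute pattern; a self-contained sketch of that pattern (plain pickle here, not the project's load_pickle/save_pickle helpers):

import os
import pickle

def load_or_compute(cache_file, compute_fn, use_cache=True):
    # reuse a cached result when allowed and present, otherwise compute and cache it
    if use_cache and os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    result = compute_fn()
    with open(cache_file, 'wb') as f:
        pickle.dump(result, f)
    return result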
Example #27
if len(sys.argv) == 2:
    subreddit_filter = sys.argv[1]

#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl"  #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"  #processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"  #text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"  #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"  #list of users seen in posts/comments, one file per subreddit

#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
    print("Loading subreddit list from", subreddits_filepath)
    subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json(
        "results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json(
        "results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json(
        "results/cyber_post_subreddit_dist.json")
    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([
        set(crypto_subreddit_dist.keys()),
        set(cve_subreddit_dist.keys()),
        set(cyber_subreddit_dist.keys())
    ], ["crypto", "cve", "cyber"])
Example #28
#build/load cascades (auto-load as a result, either raw data or cached cascades)
cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(
    code, raw_stories, raw_comments)
#optional: filter out cascades with any missing elements (posts or comments)
cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

print("\nHave total of", len(cascades), "cascades and", len(comments),
      "comments for hackernews")

#build processed post file
#assign numeric ids to each post for node2vec input files
#get set of tokens
#extract and maintain user
if file_utils.verify_file(posts_filepath % code):
    print("Processed post file already exists.")
    posts = file_utils.load_pickle(posts_filepath % code)
else:
    c = count()
    posts = {
        key: {
            'user': value['author_h'],
            'tokens': extract_tokens(value),
            'id': next(c)
        }
        for key, value in cascades.items()
    }
    #save this to file
    file_utils.save_pickle(posts, posts_filepath % code)
    print("Saved", len(posts), "processed posts to", posts_filepath % code)

#build list of users active in this subreddit - list, not set, so more active users are more likely to get drawn in the simulation
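
The c = count() / next(c) construction above presumably relies on itertools.count; in isolation it hands out consecutive numeric ids:

from itertools import count

c = count()
numeric_ids = {post_id: next(c) for post_id in ("abc", "def", "ghi")}
# numeric_ids == {'abc': 0, 'def': 1, 'ghi': 2}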
Example #29
def fit_all_cascades(code, cascades, comments, pickle_save, subreddit = False):
	#if all saved, load from that
	if file_utils.verify_file("data_cache/fitted_params/%s_cascade_params.pkl" % code):
		cascade_params = file_utils.load_pickle("data_cache/fitted_params/%s_cascade_params.pkl" % code)
		fit_fail = [post_id for post_id, post in cascades.items() if post_id not in cascade_params]
		return cascade_params, fit_fail

	#anything to load? if so, load the latest checkpoint
	if pickle_save:
		#build glob filestring - to get all matching checkpoints
		if subreddit == False:
			filename = "data_cache/fitted_params/%s*_cascade_params.pkl" % code
		else:
			filename = "data_cache/fitted_params/%s_%s*_cascade_params.pkl" % (code, subreddit)

		#extract matching filenames and their numeric values, selecting the most complete one to load
		files = glob.glob(filename)
		best_int = -1		#count of records in best file - set to "" if a complete file is found
		for file in files:
			file_int = re.search(r'\d+', file)
			#if no number in filename, have a complete file - use that
			if file_int is None:
				best_int = ""
				break
			else:
				file_int = int(file_int.group())
				if file_int > best_int:
					best_int = file_int

		#load checkpoint, if we have one
		if best_int != -1:
			if subreddit == False:
				cascade_params = cascade_manip.load_cascade_params(code, str(best_int))
			else:
				cascade_params = cascade_manip.load_cascade_params(code, subreddit + str(best_int))
			print("Loaded", len(cascade_params), "fitted cascade parameters")
		#otherwise, empty dictionary
		else:
			cascade_params = {}
	else:
		cascade_params = {}

	avg_quality = 0

	#fit any cascades that have not been fitted before, add to params dictionary: post_id -> params
	post_count = len(cascade_params)
	fit_fail = []
	print("Fitting all cascade models")
	for post_id, post in cascades.items():
		#if this cascade already fitted, and params are valid, skip
		if post_id in cascade_params and (cascade_params[post_id][0] != 20 and cascade_params[post_id][1] != 500 and cascade_params[post_id][2] != 2.3):
			continue

		#fit the current cascade (filtering comments to just this post is not required)
		#print("Fitting cascade", post_id)
		param_res = fit_cascade.fit_cascade_model(post, comments)
		#if negative comment times, skip this cascade and move to next
		if param_res == False:
			fit_fail.append(post_id)
			continue
		cascade_params[post_id] = param_res
		avg_quality += cascade_params[post_id][6]
		post_count += 1
		if post_count % 1000 == 0:
			print("Fitted", post_count, "cascades")
			if pickle_save and post_count % 10000 == 0:
				if subreddit == False:
					cascade_manip.save_cascade_params(code, cascade_params, str(post_count))
				else:
					cascade_manip.save_cascade_params(code, cascade_params, subreddit + str(post_count))

	avg_quality /= len(cascade_params)

	#dump params to file
	print("Fitted a total of", len(cascade_params), "cascades (average quality", str(avg_quality) + ")")
	if pickle_save:
		cascade_manip.save_cascade_params(code, cascade_params, subreddit)

	#return all params, loaded and newly fitted
	return cascade_params, fit_fail
#end fit_all_cascades
Example #30
post_counter = 1	#counter of posts to simulate, across all subreddits

#process each subreddit
for subreddit, seeds in post_seeds.items():
	'''
	#TESTING ONLY!!!!
	if subreddit != "Lisk":
		continue
	'''

	print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

	#if have a cached graph, load and use that instead of rebuilding
	if file_utils.verify_file("graph_cache/%s_post_graph.pkl" % subreddit) and file_utils.verify_file("graph_cache/%s_user_ids.pkl" % subreddit):
		print("Loading post graph from graph_cache/%s_post_graph.pkl and user id list from graph_cache/%s_user_ids.pkl" % (subreddit, subreddit))
		sub_graph = file_utils.load_pickle("graph_cache/%s_post_graph.pkl" % subreddit)
		user_ids = 	file_utils.load_pickle("graph_cache/%s_user_ids.pkl" % subreddit)
		print("Loaded graph has", sub_graph.graph.number_of_nodes(), "nodes and", sub_graph.graph.size(), "edges")

	#no cached, build graph from raw posts and params
	else:
		#load subreddit posts (don't need the comments!)
		raw_sub_posts = cascade_manip.load_filtered_posts(domain, subreddit)
		#load subreddit parameters
		raw_sub_params = cascade_manip.load_cascade_params(domain, subreddit)

		#filter posts - TESTING ONLY!!!!!!!! - if you didn't load all the params
		'''
		raw_sub_posts = {post_id : post for post_id, post in sub_posts.items() if post_id in sub_params}
		print("Filtered to", len(sub_posts), "posts with fitted parameters")
		'''