Example #1
def get_glove_model(dimension=50):
    #convert Stanford glove format to gensim word2vec format, if file doesn't exist
    if not file_utils.verify_file(gensim_vector_filepath % dimension):
        glove2word2vec(glove_input_file=(vector_filepath % dimension),
                       word2vec_output_file=(gensim_vector_filepath % dimension))

    #load embeddings and init model (word2vec model, but init with glove embeddings)
    glove_model = KeyedVectors.load_word2vec_format(gensim_vector_filepath % dimension, binary=False)

    return glove_model
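
The returned object is a standard gensim KeyedVectors instance, so the usual embedding queries apply once the model is loaded. A minimal usage sketch, assuming the vector_filepath / gensim_vector_filepath templates and the file_utils helper used above are defined at module level:

#minimal usage sketch of get_glove_model() as defined above
glove_model = get_glove_model(dimension=50)

#standard gensim KeyedVectors queries on the loaded GloVe embeddings
vec = glove_model["computer"]                          #50-dimensional vector for one word
print(glove_model.most_similar("computer", topn=5))    #nearest neighbors by cosine similarity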
Example #2
def fit_all_cascades(code, cascades, comments, pickle_save, subreddit = False):
	#if all saved, load from that
	if file_utils.verify_file("data_cache/fitted_params/%s_cascade_params.pkl" % code):
		cascade_params = file_utils.load_pickle("data_cache/fitted_params/%s_cascade_params.pkl" % code)
		fit_fail = [post_id for post_id, post in cascades.items() if post_id not in cascade_params]
		return cascade_params, fit_fail

	#anything to load? if so, load the latest checkpoint
	if pickle_save:
		#build glob filestring - to get all matching checkpoints
		if subreddit is False:
			filename = "data_cache/fitted_params/%s*_cascade_params.pkl" % code
		else:
			filename = "data_cache/fitted_params/%s_%s*_cascade_params.pkl" % (code, subreddit)

		#extract matching filenames and their numeric values, selecting the most complete one to load
		files = glob.glob(filename)
		best_int = -1		#count of records in best file - set to "" if a complete file is found
		for file in files:
			file_int = re.search(r'\d+', file)
			#if no number in filename, have a complete file - use that
			if file_int is None:
				best_int = ""
				break
			else:
				file_int = int(file_int.group())
				if file_int > best_int:
					best_int = file_int

		#load checkpoint, if we have one
		if best_int != -1:
			if subreddit is False:
				cascade_params = cascade_manip.load_cascade_params(code, str(best_int))
			else:
				cascade_params = cascade_manip.load_cascade_params(code, subreddit + str(best_int))
			print("Loaded", len(cascade_params), "fitted cascade parameters")
		#otherwise, empty dictionary
		else:
			cascade_params = {}
	else:
		cascade_params = {}

	avg_quality = 0

	#fit any cascades that have not been fitted before, add to params dictionary: post_id -> params
	post_count = len(cascade_params)
	fit_fail = []
	print("Fitting all cascade models")
	for post_id, post in cascades.items():
		#if this cascade already fitted, and params are valid, skip
		if post_id in cascade_params and (cascade_params[post_id][0] != 20 and cascade_params[post_id][1] != 500 and cascade_params[post_id][2] != 2.3):
			continue

		#fit the current cascade (filtering comments to just this post is not required)
		#print("Fitting cascade", post_id)
		param_res = fit_cascade.fit_cascade_model(post, comments)
		#if negative comment times, skip this cascade and move to next
		if param_res is False:
			fit_fail.append(post_id)
			continue
		cascade_params[post_id] = param_res
		avg_quality += cascade_params[post_id][6]
		post_count += 1
		if post_count % 1000 == 0:
			print("Fitted", post_count, "cascades")
			if pickle_save and post_count % 10000 == 0:
				if subreddit is False:
					cascade_manip.save_cascade_params(code, cascade_params, str(post_count))
				else:
					cascade_manip.save_cascade_params(code, cascade_params, subreddit + str(post_count))

	avg_quality = avg_quality / len(cascade_params) if len(cascade_params) > 0 else 0

	#dump params to file
	print("Fitted a total of", len(cascade_params), "cascades (average quality", str(avg_quality) + ")")
	if pickle_save:
		cascade_manip.save_cascade_params(code, cascade_params, subreddit)

	#return all params, loaded and newly fitted
	return cascade_params, fit_fail
#end fit_all_cascades
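
The checkpoint-selection logic above (glob for matching files, prefer an unnumbered file as complete, otherwise take the highest numeric suffix) is a reusable pattern on its own. A standalone sketch of just that piece, with a hypothetical helper name and call:

import glob
import re

def latest_checkpoint(pattern):
	#return the best matching checkpoint file for a glob pattern:
	#a filename with no number is treated as complete and returned immediately,
	#otherwise the file with the largest embedded number wins; None if no match
	best_file, best_count = None, -1
	for filename in glob.glob(pattern):
		match = re.search(r'\d+', filename)
		if match is None:
			return filename
		if int(match.group()) > best_count:
			best_file, best_count = filename, int(match.group())
	return best_file

#hypothetical usage, mirroring the filestring built in fit_all_cascades above
#latest_checkpoint("data_cache/fitted_params/%s*_cascade_params.pkl" % code)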
Example #3
    #TESTING ONLY!!!!
    if subreddit != sub_filter:
        continue

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #what is the graph limit for this subreddit?
    if subreddit in sub_limits:
        max_nodes = sub_limits[subreddit]
        print("Max graph size for this subreddit:", max_nodes)
    else:
        max_nodes = default_max_nodes
        print("Using default max graph size:", max_nodes)

    #load preprocessed posts for this subreddit
    if file_utils.verify_file(posts_filepath % subreddit):
        posts = file_utils.load_pickle(posts_filepath % subreddit)
        print("Loaded", len(posts), "processed posts from",
              posts_filepath % subreddit)
    else:
        print("Cannot simulate for subreddit", subreddit,
              "without processed posts file", posts_filepath % subreddit)
        exit(0)

    #find highest assigned post id for this data, so we know where to assign new ids if we need to
    next_id = max(value['id'] for value in posts.values()) + 1

    #do we need to build a graph and infer at all? loop to find out
    infer = False
    infer_count = 0
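
The load-or-exit block for the processed-posts pickle recurs in several of these scripts (see also Examples #8 and #9). One way to avoid the repetition is a small helper along these lines; the helper name is hypothetical, and it assumes the posts_filepath template and file_utils functions used above:

#hypothetical helper wrapping the repeated load-or-exit pattern
def load_processed_posts(subreddit):
    path = posts_filepath % subreddit
    if not file_utils.verify_file(path):
        print("Cannot simulate for subreddit", subreddit, "without processed posts file", path)
        exit(0)
    posts = file_utils.load_pickle(path)
    print("Loaded", len(posts), "processed posts from", path)
    return posts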
Example #4
all_events = []		#list for all sim events, across all seed posts
post_counter = 1	#counter of posts to simulate, across all subreddits

#process each subreddit
for subreddit, seeds in post_seeds.items():
	'''
	#TESTING ONLY!!!!
	if subreddit != "Lisk":
		continue
	'''

	print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

	#if have a cached graph, load and use that instead of rebuilding
	if file_utils.verify_file("graph_cache/%s_post_graph.pkl" % subreddit) and file_utils.verify_file("graph_cache/%s_user_ids.pkl" % subreddit):
		print("Loading post graph from graph_cache/%s_post_graph.pkl and user id list from graph_cache/%s_user_ids.pkl" % (subreddit, subreddit))
		sub_graph = file_utils.load_pickle("graph_cache/%s_post_graph.pkl" % subreddit)
		user_ids = file_utils.load_pickle("graph_cache/%s_user_ids.pkl" % subreddit)
		print("Loaded graph has", sub_graph.graph.number_of_nodes(), "nodes and", sub_graph.graph.size(), "edges")

	#no cached, build graph from raw posts and params
	else:
		#load subreddit posts (don't need the comments!)
		raw_sub_posts = cascade_manip.load_filtered_posts(domain, subreddit)
		#load subreddit parameters
		raw_sub_params = cascade_manip.load_cascade_params(domain, subreddit)

		#filter posts - TESTING ONLY!!!!!!!! - if you didn't load all the params
		'''
		raw_sub_posts = {post_id : post for post_id, post in sub_posts.items() if post_id in sub_params}
Example #5
    index = 7 if "baseline" in model else 5

    #process each file
    for results_file in results_files:

        #what testing size is this file for?
        file_tokens = results_file.split('_')
        file_test_size = int(file_tokens[index][:-4])
        file_run = int(file_tokens[index + 2][-1])

        #if test size larger than target, process this file (and its associated pkl)
        if file_test_size > testing_num:
            #if bookmark and results already exist at target size for this run, skip reduction
            target_bookmark = bookmark_format % (subreddit, subreddit, model, training_num, testing_num,
                                                 testing_start_year, testing_start_month, file_run)
            target_results = results_format % (subreddit, subreddit, model, training_num, testing_num,
                                               testing_start_year, testing_start_month, file_run)
            if file_utils.verify_file(target_bookmark) and file_utils.verify_file(target_results):
                print("   target exists, skipping", results_file)
                continue

            print("   reducing", results_file)

            #get corresponding bookmark filename
            bookmark_file = bookmark_format % (
                subreddit, subreddit, model, training_num, file_test_size,
                testing_start_year, testing_start_month, file_run)
Example #6
#optional command line argument to limit to a single subreddit
if len(sys.argv) == 2:
    subreddit_filter = sys.argv[1]

#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl"  #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"  #processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"  #text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"  #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"  #list of users seen in posts/comments, one file per subreddit

#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
    print("Loading subreddit list from", subreddits_filepath)
    subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json(
        "results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json(
        "results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json(
        "results/cyber_post_subreddit_dist.json")
    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([
        set(crypto_subreddit_dist.keys()),
        set(cve_subreddit_dist.keys()),
Example #7
    raw_comments = {}

#build/load cascades (auto-load as a result, either raw data or cached cascades)
cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(
    code, raw_stories, raw_comments)
#optional: filter out cascades with any missing elements (posts or comments)
cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

print("\nHave total of", len(cascades), "cascades and", len(comments),
      "comments for hackernews")

#build processed post file
#assign numeric ids to each post for node2vec input files
#get set of tokens
#extract and maintain user
if file_utils.verify_file(posts_filepath % code):
    print("Processed post file already exists.")
    posts = file_utils.load_pickle(posts_filepath % code)
else:
    c = count()
    posts = {
        key: {
            'user': value['author_h'],
            'tokens': extract_tokens(value),
            'id': next(c)
        }
        for key, value in cascades.items()
    }
    #save this to file
    file_utils.save_pickle(posts, posts_filepath % code)
    print("Saved", len(posts), "processed posts to", posts_filepath % code)
Example #8
def graph_infer(sim_post, sim_post_id, group, max_nodes, min_node_quality,
                estimate_initial_params):
    print("Inferring post parameters from post graph")

    #load preprocessed posts for this group
    if file_utils.verify_file(posts_filepath % group):
        posts = file_utils.load_pickle(posts_filepath % group)
        print("Loaded", len(posts), "processed posts from",
              posts_filepath % group)
    else:
        print("Cannot simulate for group", group,
              "without processed posts file", posts_filepath % group)
        exit(0)

    #if seed post not in posts file - we're gonna have a bad time
    if sim_post['id_h'] not in posts:
        print("Simulation post not in dataset - exiting\n")
        exit(0)

    #grab numeric/graph id of sim post
    numeric_sim_post_id = posts[sim_post_id]['id']

    #load in fitted simulation params - need these for graph build
    fitted_params, fitted_quality = functions_hybrid_model.load_params(
        params_filepath % group, posts, False, True)

    #remove sim post from graph params - no cheating! (pop based on numeric id)
    res = fitted_params.pop(numeric_sim_post_id)
    res = fitted_quality.pop(numeric_sim_post_id)

    #graph stuff - sample graph if necessary, add new nodes, etc
    graph = {}
    isolated_nodes = []
    added_count = 0

    #do we need to sample/process the graph? sample if whole graph too big, imposing a min node quality, need to estimate initial params, or we don't have a precomputed graph file
    if (max_nodes is not None and len(posts) > max_nodes) \
            or not file_utils.verify_file(graph_filepath % group) \
            or min_node_quality is not None or estimate_initial_params:

        #only sample down if we actually have to
        if max_nodes is not None:
            print("\nSampling graph to", max_nodes, "nodes")
            #sample down posts
            graph_posts = user_sample_graph(posts, [sim_post], max_nodes,
                                            group, min_node_quality,
                                            fitted_quality)
        #otherwise, use them all
        else:
            graph_posts = posts

        #build graph, getting initial param estimate if required
        if estimate_initial_params:
            estimated_params = functions_hybrid_model.build_graph_estimate_node_params(
                graph_posts, fitted_params, fitted_quality,
                numeric_sim_post_id, temp_graph_filepath % group)
        else:
            functions_hybrid_model.build_graph(graph_posts,
                                               temp_graph_filepath % group)

    #no graph sampling/processing, use the full set and copy graph file to temp location
    else:
        graph_posts = posts
        copyfile(graph_filepath % group, temp_graph_filepath % group)
        print("Copied complete post-graph to", temp_graph_filepath % group)

    #ALWAYS sample down params to match whatever graph we have - because we can't use the previously fitted params!
    if estimate_initial_params:
        functions_hybrid_model.get_graph_params(graph_posts,
                                                numeric_sim_post_id,
                                                fitted_params, fitted_quality,
                                                temp_params_filepath % group,
                                                estimated_params)
    else:
        functions_hybrid_model.get_graph_params(graph_posts,
                                                numeric_sim_post_id,
                                                fitted_params, fitted_quality,
                                                temp_params_filepath % group)

    #graph is built and ready - graph file and input params file

    #run node2vec to get embeddings - if we have to infer parameters
    #offload to C++, because I feel the need... the need for speed!

    if file_utils.verify_file(output_params_filepath % group):
        os.remove(output_params_filepath % group)    #clear output to prevent append

    #run node2vec on graph and params
    subprocess.check_call([
        "./c_node2vec/examples/node2vec/node2vec",
        "-i:" + (temp_graph_filepath % group),
        "-ie:" + (temp_params_filepath % group),
        "-o:" + (output_params_filepath % group), "-d:6", "-l:3", "-w", "-s",
        "-otf"
    ])
    print("")

    #load the inferred params (dictionary of numeric id -> params)
    all_inferred_params = functions_hybrid_model.load_params(
        output_params_filepath % group, posts, inferred=True)
    inferred_params = all_inferred_params[numeric_sim_post_id]

    return inferred_params
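
A hypothetical call to graph_infer, assuming sim_post is one seed post dictionary with an 'id_h' field as used in the function body; the group, node limit, and other argument values below are placeholders, not taken from the source:

#hypothetical usage of graph_infer() - placeholder argument values
inferred_params = graph_infer(sim_post, sim_post_id=sim_post['id_h'],
                              group="crypto", max_nodes=2000,
                              min_node_quality=None,
                              estimate_initial_params=False)
print("Inferred params:", inferred_params)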
Example #9
    post_seeds[post['subreddit']].append(post)
print({key: len(post_seeds[key]) for key in post_seeds})

post_counter = 1  #counter of posts to simulate, across all subreddits

#process each subreddit
for subreddit, seeds in post_seeds.items():

    #TESTING ONLY!!!!
    if subreddit != "Lisk":
        continue

    print("\nProcessing", subreddit, "with", len(seeds), "posts to simulate")

    #load preprocessed posts for this subreddit
    if file_utils.verify_file(posts_filepath % subreddit):
        posts = file_utils.load_pickle(posts_filepath % subreddit)
        print("Loaded", len(posts), "processed posts from",
              posts_filepath % subreddit)
    else:
        print("Cannot simulate for subreddit", subreddit,
              "without processed posts file", posts_filepath % subreddit)
        exit(0)

    #correct key format of seed posts, if necessary
    for seed_post in seeds:
        #if post id contains the t3_ prefix, strip it off so we don't have to change everything
        if seed_post['id_h'].startswith(POST_PREFIX):
            seed_post['id_h'] = seed_post['id_h'][len(POST_PREFIX):]

    #load fitted params for this subreddit
Example #10
    #get correct index of test size token, based on model name
    index = 7 if "baseline" in model else (4 if model == "comparative" else 5)

    #process each file
    for results_file in results_files:

        #what testing size is this file for?
        file_tokens = results_file.split('_')
        file_test_size = int(file_tokens[index][:-4])
        file_run = int(file_tokens[index + 2][-1])

        #if test size smaller than the target, process this file (and its associated pkl)
        if file_test_size < testing_num:
            #if bookmark and results already exist at target size for this run, skip reduction
            comparative_target_exists = (
                model == "comparative"
                and file_utils.verify_file(comparative_bookmark_format % (subreddit, subreddit, model, testing_num,
                                                                          testing_start_year, testing_start_month, file_run))
                and file_utils.verify_file(comparative_results_format % (subreddit, subreddit, model, testing_num,
                                                                         testing_start_year, testing_start_month, file_run)))
            standard_target_exists = (
                file_utils.verify_file(bookmark_format % (subreddit, subreddit, model, training_num, testing_num,
                                                          testing_start_year, testing_start_month, file_run))
                and file_utils.verify_file(results_format % (subreddit, subreddit, model, training_num, testing_num,
                                                             testing_start_year, testing_start_month, file_run)))
            if comparative_target_exists or standard_target_exists:
                print("   target exists, skipping", results_file)
                continue