def get_first_repos(auth, headers):
    #check if file exists, if yes just read it in
    data = utils.load_json("github_files/github_all_repos.json")
    if data != False:
        return data

    #well, that seems to have worked... time to do them all!
    #except it's not really ALL, just the first 1000 - can't seem to sort them,
    #so we'll go with what Github thinks is most interesting
    url = 'https://api.github.com/search/repositories?q=language:python&per_page=100'

    #first request
    r = requests.get(url, auth=auth, headers=headers)
    all_results = r.json()
    url = r.links['next']['url']
    print r.links['last']['url']

    #loop all requests
    count = 1
    print "finished request", count, "received", len(all_results['items']), "items"
    while url != "":
        #perform request and add results to previous
        r = requests.get(url, auth=auth, headers=headers)
        res = r.json()
        all_results['items'].extend(res['items'])
        count = count + 1
        print "finished request", count, "received", len(res['items']), "items"

        #get url for next request
        if 'next' in r.links:
            url = r.links['next']['url']
        else:
            url = ""
        print url

    #save all results to json file
    utils.save_json(all_results, "github_files/github_all_repos.json")

    return all_results
    print(f"another edge case - null person: {person}, {text}, {split}")
    text_piece = TextPiece(who=person_hold, text=accumulated_string.strip())
    person_hold = person
    accumulated_string = after
    yield text_piece


if __name__ == '__main__':
    # All of it:
    data = load_json("data/transcripts_c2.json")

    # Just episode 1:
    ep1_data = data["0"]

    # Generate the text pieces:
    texts_ep1 = list(generate_text_pieces(episode_data=ep1_data))

    # Let's do all of it:
    texts = []
    for idx, ep_data in data.items():
        texts += list(generate_text_pieces(episode_data=ep_data))

    # filter out empty pieces:
    texts = [t for t in texts if t.who != '' and t.text != '']
code = "crypto" #set use case/domain: must be crypto, cyber, or cve #crypto for dry run #cyber takes forever #cve fastest pickle_save = False #if True, save fitted parameters dictionary to pickle #if False, save to human-readable text file instead print("\nProcessing", code) cascades = None comments = None #load the subreddit distribution for these cascades subreddit_dist = file_utils.load_json("results/%s_post_subreddit_dist.json" % code) #loop all subreddits for this code for subreddit in sorted(subreddit_dist.keys()): if subreddit != 'Lisk': continue print("\nProcessing", subreddit) #load filtered, if they exist filtered_cascades, filtered_comments = cascade_manip.load_filtered_cascades( code, subreddit) #don't exist, filter them now if filtered_cascades == False:
    for token in tokens[1:]:
        if flag:
            flag = False
            continue
        if token == "as":
            flag = True
            continue
        lib.append(token)
    return lib

#--- MAIN EXECUTION BEGINS HERE---#

#read userid mappings from files
email_to_id = utils.load_json("data_files/email_to_userid.json")
name_to_id = utils.load_json("data_files/name_to_userid.json")
if name_to_id == False or email_to_id == False:
    print "Must have name/email to user id mapping files. Exiting"
    exit(0)

file_idx = 0

#create parsed_commit_data directory if it does not exist
if os.path.isdir("parsed_commit_data") == False:
    os.makedirs("parsed_commit_data")

#for each commit log file:
for filename in os.listdir('commit_data'):

    commits_list = []    #overall commit list for file
    def __init__(self):
        self.irregular = load_json(f"{Directories.processed_data}/irregular_verbs.json")
        self.vowel = "[aeiouy]"
        self.consonant = "[b-df-hj-np-tv-z]"
def build_cascades(code, posts = False, comments = False):
    #if cascades already exist, read from cache
    if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) and (os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)) or os.path.exists("data_cache/%s_cascades/%s_cascade_comments_1.pkl" % (code, code))):
        #load from pickle
        print("Loading cascades from data_cache")
        cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
        #comments: either a single file, or multiple files
        print("Loading comments from data_cache")
        if os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)):
            comments = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
        else:
            comments = {}
            files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
            for file in files:
                print("Loading", file)
                new_comments = file_utils.load_pickle(file)
                comments.update(new_comments)
        missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
        missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))
        print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
        print("  ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")
        return cascades, comments, missing_posts, missing_comments

    #if no cached cascades, build them from scratch

    #if no loaded posts/comments, load those up first
    if posts == False or comments == False:
        posts, comments = load_model_data.load_reddit_data(code)

    print("Extracting post/comment structure for", len(posts), "posts and", len(comments), "comments")

    #add replies field to all posts/comments, init to empty list
    data_utils.add_field(posts, "replies", [])
    data_utils.add_field(comments, "replies", [])
    #add placeholder field to all posts/comments, flag indicates if we created a dummy object
    data_utils.add_field(posts, 'placeholder', False)
    data_utils.add_field(comments, 'placeholder', False)

    #add comment_count field to all post objects as well: count total number of comments all the way down the cascade
    data_utils.add_field(posts, "comment_count_total", 0)
    #and direct replies only
    data_utils.add_field(posts, "comment_count_direct", 0)

    #and add a missing_comments field to all post objects: set True if we find any missing comments in this cascade
    data_utils.add_field(posts, "missing_comments", False)

    #grab list of fields for each type of object (used to create placeholders when items are missing)
    post_fields = list(posts[0].keys())
    comment_fields = list(comments[0].keys())

    '''
    id_h = post/comment id
    parent_id_h = direct parent
    link_id_h = post parent
    if a parent_id starts with t1_, you remove t1_ and match the rest against a comment id
    if it starts with t3_, you remove t3_ and match the rest against a submission id
    link_id always starts with t3_, since it always points to a submission
    '''

    #create dictionary of post id -> post object to store cascades
    cascades = data_utils.list_to_dict(posts, "id_h")

    #convert list of comments to dictionary, where key is comment id
    comments = data_utils.list_to_dict(comments, "id_h")

    #now that we can find posts and comments at will, let's build the dictionary!

    #loop all comments, assign to immediate parent and increment comment_count of post parent
    comment_count = 0
    missing_comments = set()    #missing comments
    missing_posts = set()       #missing posts
    for comment_id in list(comments.keys()):

        #get immediate parent (post or comment)
        direct_parent = comments[comment_id]['parent_id_h'][3:]
        direct_parent_type = "post" if comments[comment_id]['parent_id_h'][:2] == "t3" else "comment"
        #get post parent
        post_parent = comments[comment_id]['link_id_h'][3:]
        comment_count += 1

        #add this comment to replies list of immediate parent, and update counters on post_parent
        try:
            #if post parent missing, create placeholder
            if post_parent not in cascades:
                cascades[post_parent] = create_object(post_parent, post_fields)
                missing_posts.add(post_parent)

            #update overall post comment count for this new comment
            cascades[post_parent]['comment_count_total'] += 1

            #now handle direct parent, post or comment
            #parent is post
            if direct_parent_type == "post":
                #missing post, create placeholder to hold replies
                if direct_parent not in cascades:
                    cascades[direct_parent] = create_object(direct_parent, post_fields)
                    missing_posts.add(direct_parent)
                #add this comment to replies field of post (no total comment increment, done above)
                cascades[direct_parent]['replies'].append(comment_id)
                #add 1 to direct comment count field
                cascades[direct_parent]['comment_count_direct'] += 1

            #parent is comment
            else:
                #missing comment, create placeholder to contain replies, point to parent post by default
                if direct_parent not in comments:
                    comments[direct_parent] = create_object(direct_parent, comment_fields)
                    #point this placeholder comment to the top-level post
                    comments[direct_parent]['link_id_h'] = post_parent
                    comments[direct_parent]['parent_id_h'] = post_parent
                    #add manufactured comment to counters
                    cascades[post_parent]['comment_count_total'] += 1
                    cascades[post_parent]['comment_count_direct'] += 1
                    #and add to replies
                    cascades[post_parent]['replies'].append(direct_parent)
                    #flag this cascade as containing missing comments
                    cascades[post_parent]['missing_comments'] = True
                    missing_comments.add(direct_parent)    #add comment to list of missing
                #add current comment to replies field of parent comment
                comments[direct_parent]['replies'].append(comment_id)
        except:
            print("FAIL")
            print(len(missing_posts), "posts")
            print(len(missing_comments), "comments")
            for field in comments[comment_id]:
                if field != "replies":
                    print(field, comments[comment_id][field])
            exit(0)

    print("\nProcessed", comment_count, "comments in", len(cascades), "cascades")
    print("   ", len(missing_posts), "missing posts")
    print("   ", len(missing_comments), "missing comments")
    print("   ", len([x for x in cascades if cascades[x]['missing_comments']]), "cascades with missing comments")

    #verify the above process, a couple different ways

    #count comments from parent counters across all cascades
    '''
    total_comments = 0
    for post_id, post in cascades.items():
        total_comments += post['comment_count']
    print(total_comments, "from post counters")
    '''

    #traverse each cascade and count comments, check against stored comment count
    '''
    for post_id, post in cascades.items():
        traverse_comments = traverse_cascade(post, comments)
        if traverse_comments != post['comment_count']:
            print("post counter says", post['comment_count'], "comments, but traversal says", traverse_comments)
    '''

    #save cascades for later loading
    cascade_manip.save_cascades(code, cascades)    #cascades
    cascade_manip.save_comments(code, comments)    #comments
    file_utils.save_json(list(missing_posts), "data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
    file_utils.save_json(list(missing_comments), "data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

    return cascades, comments, missing_posts, missing_comments
import glob
import file_utils

#glob filestring to get all results files
filestring = "dryrun/submit/sim_res_*.json"

#get list of all matching files
files = glob.glob(filestring)

prefix = "t3_"    #prefix to prepend to all id references

#process each file individually, correcting ids along the way
for file in files:
    print("\nCorrecting", file)

    #load the json
    data = file_utils.load_json(file)
    print("  ", len(data['data']), "events to fix")

    #correct comment/post records, where each is a dictionary of the following form:
    #{"parentID": "A4XW5Jol_qVgUAKDWeOeaw", "communityID": "t5_3i6d8", "rootID": "A4XW5Jol_qVgUAKDWeOeaw", "nodeUserID": "okcc60doiWAfkR89nAAvHQ", "nodeTime": "1501876531", "nodeID": "A4XW5Jol_qVgUAKDWeOeaw", "actionType": "post"}
    for event in data['data']:
        #fix id fields
        event['parentID'] = prefix + event['parentID']
        event['rootID'] = prefix + event['rootID']
        event['nodeID'] = prefix + event['nodeID']

    #save the updated file overtop of the old one
    file_utils.save_json(data, file)
    print("Corrected file saved to", file)
    def __init__(self):
        self.data = load_json(f"{Directories.json_outputs}/{self.input_file_name}.json")
def get_contrib(all_repos, auth, headers, all_contrib=False, user_to_repo=False, repo_to_contrib=False, new=False):
    #load existing data from files if not passed in and files exist
    if all_contrib == False:
        #existing contributors
        all_contrib = utils.load_json("github_files/github_all_contrib.json")
    if user_to_repo == False:
        #user->repos dict
        user_to_repo = dict_key_to_int(utils.load_json("github_files/github_user_to_repo.json"))
    if repo_to_contrib == False:
        #repo->contribs dict
        repo_to_contrib = dict_key_to_int(utils.load_json("github_files/github_repo_to_contrib.json"))
    if new == False:
        return all_contrib, user_to_repo, repo_to_contrib

    #if no contributors list or correlative dictionaries, initialize empty containers
    if all_contrib == False or user_to_repo == False or repo_to_contrib == False:
        user_to_repo = defaultdict(list)       #user id to list of repo ids
        repo_to_contrib = defaultdict(list)    #repo id to list of contrib
        all_contrib = list()

    #keep a bookmark of finished repos
    finished_repos = utils.load_json("github_files/github_finished_repos.json")
    if finished_repos == False:
        finished_repos = list()
    else:
        print "read in", len(finished_repos), "finished repos"

    #check the rate limit before we start by making a dummy request, sleep if we need to
    url = 'https://api.github.com/repos/vinta/awesome-python/contributors'    #any url will do
    r = requests.get(url, auth=auth, headers=headers)
    check_rate_limit(r)

    #loop all repos from list, fetch contributors if don't have them
    repo_count = 0
    for repo in all_repos['items']:
        #check if have contributors for this repo already, skip if yes
        if repo['id'] in finished_repos:
            continue

        #need to fetch contributors for this repo
        #print "Fetching repo", repo['id']
        contrib_count = 0

        #get request url for this repo
        url = repo['contributors_url']
        while url != "":
            #get the json!
            r = requests.get(url, auth=auth, headers=headers)

            #check for 204 response code - seems to indicate empty repo, and fails on json parse
            if r.status_code == 204:
                break

            #parse request response to json
            res = r.json()
            contrib_count = contrib_count + len(res)

            #repo not found (probably made private), skip and move to next
            if type(res) is not list and "message" in res and res['message'] == "Not Found":
                break
            #server error 502 - try the request again
            elif type(res) is not list and "message" in res and res['message'] == "Server Error":
                continue
            #other fail? dump some output and quit so we can fix it
            elif type(res) is not list:
                print r
                print res
                print url
                exit(0)

            #parse out this request result
            for usr in res:
                #new usr, add to list of all
                if usr['id'] not in user_to_repo:
                    all_contrib.append(usr)
                #always add to correlative structures
                if usr['id'] not in user_to_repo:
                    user_to_repo[usr['id']] = list()
                if repo['id'] not in user_to_repo[usr['id']]:
                    user_to_repo[usr['id']].append(repo['id'])
                if repo['id'] not in repo_to_contrib:
                    repo_to_contrib[repo['id']] = list()
                if usr['id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(usr['id'])

            #see if more pages, fetch if yes
            if 'next' in r.links:
                url = r.links['next']['url']
            else:
                #no new pages, done
                url = ""

            #check the rate limit, sleep if we need to
            check_rate_limit(r)
            #sleep for ~0.5 seconds to space out the requests better
            time.sleep(0.5)

        #print "Repo", repo['id'], ":", contrib_count, "contributors"
        repo_count += 1
        finished_repos.append(repo['id'])

        #intermediate saves... just in case
        if repo_count % 100 == 0:
            #save all contrib to json file
            utils.save_json(all_contrib, "github_files/github_all_contrib.json")
            #save correlative lists
            utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
            #save bookmark
            utils.save_json(finished_repos, "github_files/github_finished_repos.json")
            print "saved contributors of", repo_count, "repos"

    #all done - save results
    #save all contrib to json file
    utils.save_json(all_contrib, "github_files/github_all_contrib.json")
    #save correlative dictionaries
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
    #final bookmark
    utils.save_json(finished_repos, "github_files/github_finished_repos.json")

    #return results
    return all_contrib, user_to_repo, repo_to_contrib
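
#hedged sketch (the real check_rate_limit used above is not shown in this section): one
#plausible implementation based on GitHub's documented X-RateLimit-Remaining and
#X-RateLimit-Reset response headers; the threshold of 5 remaining requests is arbitrary
import time

def check_rate_limit_sketch(response, min_remaining=5):
    #read the rate-limit headers off the most recent response
    remaining = int(response.headers.get('X-RateLimit-Remaining', min_remaining + 1))
    reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
    #if we are nearly out of requests, sleep until the limit window resets
    if remaining <= min_remaining:
        wait = max(reset_time - time.time(), 0) + 1
        print "rate limit nearly exhausted, sleeping", int(wait), "seconds"
        time.sleep(wait)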
all_repos = get_first_repos(auth, headers)
print "loaded", len(all_repos['items']), "repos"

#get contributors of these 1000 repos - DONE
print "loading saved contributors and correlations..."
all_contrib, user_to_repo, repo_to_contrib = get_contrib(all_repos, auth, headers)
print "loaded", len(all_contrib), "contributors"

#get all python repos for all users we have so far - roughly 45K - DONE
print "verifying have all repos for those users..."
all_contrib, all_repos, user_to_repo, repo_to_contrib = get_repos(all_contrib, all_repos, user_to_repo, repo_to_contrib, auth, headers)
print "have", len(all_repos['items']), "repos for all users"

'''
print "reading in repos..."
all_repos = utils.load_json("github_files/github_all_repos.json")
print len(all_repos['items']), "repositories"

print "reading in users..."
all_contrib = utils.load_json("github_files/github_all_contrib.json")
print len(all_contrib), "users"

#go another ripple - get all users contributing to any repos we have so far
'''

print "fetching new contributors for all repos (active requests, this could take a while)..."
all_contrib, user_to_repo, repo_to_contrib = get_contrib(all_repos, auth, headers, all_contrib, False, False, True)
print "now have", len(all_contrib), "contributors"

'''
print "reading in correlative lists..."
user_to_repo = utils.load_json("github_files/github_user_to_repo.json")
def get_repos(all_contrib, all_repos, user_to_repo, repo_to_contrib, auth, headers):
    #since this will take more than 5000 requests, keep a bookmark in a file - in case something goes wrong
    #read simple list of users that are done already, will update
    finished_users = utils.load_json("github_files/github_finished_users.json")
    if finished_users == False:
        finished_users = list()

    #if all the users we have so far are already finished, return input as results instead of loop-checking
    if len(finished_users) == len(all_contrib):
        return all_contrib, all_repos, user_to_repo, repo_to_contrib

    #also keep list of users that don't search properly (private repos?)
    bad_users = utils.load_json("github_files/github_bad_users.json")
    if bad_users == False:
        bad_users = list()

    #count users done
    user_count = 0

    #loop all users (should be all contributors of the 1000 initial repos), fetch their python repos
    for user in all_contrib:
        #check if we did this user already, if so skip
        if user['id'] in finished_users:
            continue

        #build request url for this user
        url = "https://api.github.com/search/repositories?q=language:python+user:%s&per_page=100" % (user['login'])

        #do request, including any pages
        while url != "":
            #sleep for ~2 seconds before next request, to prevent getting kicked off
            #(search requests limited to 30 per minute)
            time.sleep(2)

            #get the json!
            r = requests.get(url, auth=auth, headers=headers)
            res = r.json()

            #handle bad results and try to continue
            if 'items' not in res:
                #rate limit? wait for 60 seconds and try the same url again
                if 'documentation_url' in res and res['documentation_url'] == "https://developer.github.com/v3/#rate-limiting":
                    print "rate limit wait"
                    time.sleep(60)
                    continue
                #server error 502 - try the request again
                elif "message" in res and res['message'] == "Server Error":
                    continue
                #bad results for this particular user, they might be private now - skip and move to next
                else:
                    #print res
                    bad_users.append(user)
                    break

            #good results, parse and store
            for repo in res['items']:
                #new repo, add to list of all
                if repo['id'] not in repo_to_contrib:
                    all_repos['items'].append(repo)
                #always add to correlative structures
                if user['id'] not in user_to_repo:
                    user_to_repo[user['id']] = list()
                if repo['id'] not in user_to_repo[user['id']]:
                    user_to_repo[user['id']].append(repo['id'])
                if repo['id'] not in repo_to_contrib:
                    repo_to_contrib[repo['id']] = list()
                if user['id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(user['id'])

            #see if more pages, if so fetch them
            if 'next' in r.links:
                url = r.links['next']['url']
            else:
                #no more pages, quit for this user
                url = ""

        #finished user, add to bookmark
        finished_users.append(user['id'])
        user_count += 1

        #intermediate saves and prints
        if user_count % 100 == 0:
            #save all repos to json file
            utils.save_json(all_repos, "github_files/github_all_repos.json")
            #save correlative lists
            utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
            print "Saved repos of", user_count, "users"
            #save bad users list
            utils.save_json(bad_users, "github_files/github_bad_users.json")
            #save list of finished users
            utils.save_json(finished_users, "github_files/github_finished_users.json")
    #end for users

    #final save before return
    #save all repos to json file
    utils.save_json(all_repos, "github_files/github_all_repos.json")
    #save correlative lists
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
    #save bad users list
    utils.save_json(bad_users, "github_files/github_bad_users.json")
    #save list of finished users
    utils.save_json(finished_users, "github_files/github_finished_users.json")
    print "Saved all data to files"

    return all_contrib, all_repos, user_to_repo, repo_to_contrib    #return all results
    for ValN in CdfV.keys():
        if CdfV[ValN] > EffPairs:
            break
    if ValN >= len(CdfV):
        return next(reversed(CdfV))
    if ValN == 0:
        return 1
    # interpolate
    DeltaNbrs = CdfV[ValN] - CdfV[ValN - 1]
    if DeltaNbrs == 0:
        return ValN
    return ValN - 1 + (EffPairs - CdfV[ValN - 1]) / DeltaNbrs

#--- MAIN EXECUTION BEGINS HERE---#

#read mappings from files
email_to_id = utils.load_json("email_to_userid.json")
name_to_id = utils.load_json("name_to_userid.json")

file_idx = 0

#build bipartite graph, count connected components
print "building graph"
B = nx.Graph()
#add user nodes
for key in email_to_id:
    B.add_node(email_to_id[key], bipartite=0)
for key in name_to_id:
    B.add_node(name_to_id[key], bipartite=0)
#add repo nodes and user-repo edges
users_filepath = "model_files/users/%s_users.txt"    #list of users seen in posts/comments, one file per subreddit

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

code = "hackernews"

#load raw hackernews cascades if no cached reconstructed cascades
if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) == False or os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)) == False:
    raw_stories = file_utils.load_json("../hackernews/data/hackernews_stories.json")
    raw_comments = file_utils.load_json("../hackernews/data/hackernews_comments.json")
else:
    raw_stories = {}
    raw_comments = {}

#build/load cascades (auto-load as a result, either raw data or cached cascades)
cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(code, raw_stories, raw_comments)

#optional: filter out cascades with any missing elements (posts or comments)
cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

print("\nHave total of", len(cascades), "cascades and", len(comments), "comments for hackernews")
import os
import subprocess
import sys
import urllib2
import io
from collections import defaultdict

import matplotlib
matplotlib.use('Agg')
import matplotlib.pylab as plt
import numpy as np

import file_utils as utils
import plot_utils

#--- MAIN EXECUTION BEGINS HERE---#

#read mappings from files
email_to_id = utils.load_json("email_to_userid.json")
name_to_id = utils.load_json("name_to_userid.json")

user_commit_counts = utils.load_json("user_commit_counts.json")
if user_commit_counts == False:
    user_commit_counts = defaultdict(int)
file_commit_counts = utils.load_json("file_commit_counts.json")
if file_commit_counts == False:
    file_commit_counts = defaultdict(int)

file_idx = 0

if len(file_commit_counts) == 0 or len(user_commit_counts) == 0:
    print "computing counts"

    #for each commit log file:
    for filename in os.listdir('commit_data'):
from collections import defaultdict

import matplotlib
matplotlib.use('Agg')
import matplotlib.pylab as plt
import numpy as np
import unicodedata

import file_utils as utils
import plot_utils
import package_type

#--- MAIN EXECUTION BEGINS HERE---#

#how to count: top-level vs submodules
module_type = package_type.get_type()

#read mappings from files
email_to_id = utils.load_json("email_to_userid.json")
name_to_id = utils.load_json("name_to_userid.json")

user_all_commit_counts = utils.load_json("data_files/user_all_commit_counts.json")
if user_all_commit_counts == False:
    user_all_commit_counts = defaultdict(int)
repo_all_commit_counts = utils.load_json("data_files/repo_all_commit_counts.json")
if repo_all_commit_counts == False:
    repo_all_commit_counts = defaultdict(int)

if len(repo_all_commit_counts) == 0 or len(user_all_commit_counts) == 0:
    print "computing commit counts"

    #read in all commits
    commits = utils.load_json("data_files/all_commits_%s_small.json" % module_type)
    if commits == False:
from itertools import count
import sys

#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl"      #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"       #processed post data for each post, one file per subreddit
                                                        #each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"    #text file of fitted cascade params, one file per subreddit
                                                        #one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"      #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"       #list of users seen in posts/comments, one file per subreddit

#load list of cve subreddits
cve_subreddit_dist = file_utils.load_json("results/cve_post_subreddit_dist.json")

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

cascades = {}
comments = {}
domain = "cve"

#loop all subreddits
for subreddit, sub_count in cve_subreddit_dist.items():
filestring = "dryrun/results/*metrics.json"

#get list of all matching files
files = glob.glob(filestring)

data = {}    #nested dictionary of file/run identifier -> metric topic -> metric name -> computed metric

#process each file individually, add data to dictionary
for file in files:
    #get identifier for this file
    ident, rest = file.split('-', 1)
    ident = ident.split('/')[-1]

    #load file data
    data[ident] = file_utils.load_json(file)

ident_column = 'sim version'

#for better csv dump-age, combine metric topic and name into a single key
#list of dictionaries, one per run/ident
dump_data = []

#loop identifiers
for ident in data:
    row = {ident_column: ident}
    #loop metric topics
    for topic in data[ident]:
        #loop metrics
        for metric in data[ident][topic]:
            #add this to updated data
            row[topic + " : " + metric] = data[ident][topic][metric]
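
#hedged sketch (not part of the original script): one way to dump the flattened rows
#built above to a CSV, assuming each row is a dict of "topic : metric" -> value;
#the helper name and the output path "dryrun/results/metrics_summary.csv" are illustrative only
import csv

def write_rows_to_csv(rows, out_path, id_column='sim version'):
    #collect the union of all keys so runs with differing metrics still line up
    fieldnames = [id_column] + sorted({k for row in rows for k in row if k != id_column})
    with open(out_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
        writer.writeheader()
        writer.writerows(rows)

#example usage (hypothetical path):
#write_rows_to_csv(dump_data, "dryrun/results/metrics_summary.csv")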
subreddits_filepath = "model_files/subreddits.pkl"      #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"       #processed post data for each post, one file per subreddit
                                                        #each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"    #text file of fitted cascade params, one file per subreddit
                                                        #one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"      #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"       #list of users seen in posts/comments, one file per subreddit

#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
    print("Loading subreddit list from", subreddits_filepath)
    subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json("results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json("results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json("results/cyber_post_subreddit_dist.json")

    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([set(crypto_subreddit_dist.keys()), set(cve_subreddit_dist.keys()), set(cyber_subreddit_dist.keys())], ["crypto", "cve", "cyber"])

    #now, kill all the duplicates! crypto and cyber scraped entire subreddits,
    #so any overlap is redundant and can be thrown away
    #(yes, there are neater ways to do this, but I don't care!)
    for item in subreddit_dict.keys():
        if len(subreddit_dict[item]) > 1:
# if a verified map already exists, the file will be updated with the new information added

parser = argparse.ArgumentParser()
parser.add_argument("--path", type=str, default="./multitracks/", help="path to downloaded multitracks")
parser.add_argument("--out", type=str, default="./complete_map.json", help="output path of verified map")
parser.add_argument("--map", type=str, default=None, help="path to existing dataset map which, if specified, will be modified in place")
parser.add_argument("--overwrite", type=bool, default=True, help="overwrite existing entries if existing dataset map is loaded")
parser.add_argument("--save_interval", type=int, default=5, help="how often to save the dataset map while processing tracks")
parser.add_argument("-kw", type=str, default="keywords.txt", help="keywords txt file that specifies search terms")
parser.add_argument("--c_thresh", type=int, default=80, help="confidence threshold for fuzzy string matching")
parser.add_argument("--thresh_db", type=int, default=45, help="threshold in db to reject silence")
parser.add_argument("--n_jobs", type=int, default=8, help="num parallel worker threads to load & process audio files")
args = parser.parse_args()

dataset_map = {}    # initialize empty dict
if args.map is not None:
    # if existing map is provided, it will be modified in place (overwriting existing entries)
    dataset_map = file_utils.load_json(args.map)

dataset_map = create_map(args.path, args.kw, args.c_thresh, args.thresh_db, args.n_jobs, dataset_map, args.out, args.overwrite, args.save_interval)

file_utils.save_json(args.out, dataset_map, indent=2)
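
# Hedged aside (not part of the original script): argparse's type=bool does not parse
# "False" as False - any non-empty string is truthy, so "--overwrite False" still yields
# True. A hypothetical str2bool converter is one common workaround:
import argparse

def str2bool(value):
    # map common textual spellings to booleans; reject anything else
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")

# example (hypothetical): parser.add_argument("--overwrite", type=str2bool, default=True)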
#count frequency of each commit author name and email, so we can track down the ones breaking id assignment

import os.path
import subprocess
import sys
import urllib2
import io
import unicodedata
from collections import defaultdict

import file_utils as utils
import data_utils as data
import operator

#--- MAIN EXECUTION BEGINS HERE---#

name_count = utils.load_json("data_files/author_name_freq.json")
email_count = utils.load_json("data_files/author_email_freq.json")

#build list if does not exist
if name_count == False or email_count == False:
    #dictionary for name and email counts
    name_count = defaultdict(int)
    email_count = defaultdict(int)

    file_idx = 0

    #for each commit log file:
    for filename in os.listdir('commit_data'):

        #for line in f:
        for line in io.open("commit_data/%s" % filename,
SIGHT = True

#module-type specifier (at this point, more of a file suffix specifier)
module_type = package_type.get_type()

#adoption condition specifier (another suffix)
if SIGHT:
    print "Adoption requires direct commit view"
    adop_type = "SIGHT"
else:
    print "Adoption from repo history allowed"
    adop_type = "HISTORY"

#load adoption events
print "Loading all adoption events..."
adoption_events = utils.load_json("datafiles/adoption_events_%s.json" % (module_type + "_" + adop_type))

#load library usage counts (overall for now)
print "Loading library usage counts..."
usage_counts = utils.load_json("datafiles/import_counts_overall_%s.json" % module_type)

#don't have an adoption event file, yell at the user
if adoption_events == False or usage_counts == False:
    print "must have compiled adoption event list datafiles/adoption_events_%s.json" % (module_type + "_" + adop_type)
    print "exiting"
    sys.exit(0)

#adoption events look like this:
# dictionary, where library is key
parser.add_argument("-kw", type=str, default="vox", help="keyword filter, only one allowed. See keywords.txt for full list") parser.add_argument("--approve", type=str, nargs="+", default="Speech", help="AudioSet labels to match in source content. Multiple string values allowed (case sensitive)") parser.add_argument("--reject", type=str, default="Silence", help="AudioSet labels to reject. Multiple string values allowed (case sensitive)") parser.add_argument("--map", type=str, default="dataset_map.json", help="path do json dataset map") parser.add_argument("--thresh", type=int, default=45, help="threshold in db to reject silence") args = parser.parse_args() # with open(args.map) as json_file: # dataset_map = json.load(json_file) dataset_map = file_utils.load_json(args.map) yam_approve = list(args.approve) yam_reject = list(args.reject) if os.path.exists(args.out): print(f'{args.out} already exists, modifying...') verified_classmap = file_utils.load_json(args.out) else: verified_classmap = {} verified = verify_classes_yamnet(verified_classmap, dataset_map, args.thresh, args.kw, yam_approve, yam_reject) file_utils.save_json(args.out, verified) print(f'saved verified map to {args.out}')
"--normalize", type=bool, default=False, help= "normalize stems (normalizes audio file from start to end, not individual samples)" ) parser.add_argument("--normalize_transients", type=bool, default=False, help="normalize individual transients") parser.add_argument("--diff_mask", type=bool, default=False, help="y = (x - y), for source separation masking") args = parser.parse_args() x, y = gen_dataset(file_utils.load_json(args.map), args.ws, args.xkey, args.ykey, normalize_stems=args.normalize, normalize_transients=args.normalize_transients, max_examples=args.max_examples, sample_rate=44100, difference_mask=args.diff_mask) np.save(f'{args.out}/x_{args.xkey}_{args.ws}_{len(x)}.npy', x) np.save(f'{args.out}/y_{args.ykey}_{args.ws}_{len(y)}.npy', y) print(f'saved x y pairs in {args.out}')
#cascades, comments, missing_posts, missing_comments = build_cascades(code, posts = False, comments = False)

print("Loading cascades from data_cache")
cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))

#comments: across multiple files
print("Loading comments from data_cache")
comments = {}
files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
for file in files:
    print("Loading", file)
    new_comments = file_utils.load_pickle(file)
    comments.update(new_comments)

#missing posts and comments
missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

#yay! loaded
print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
print("  ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")

cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

#load subreddit distribution (just want list of subreddits)
cyber_subreddit_dist = file_utils.load_json("results/cyber_post_subreddit_dist.json")
print(sorted(list(cyber_subreddit_dist.keys())))

#filter posts/comments for each subreddit
for subreddit, count in cyber_subreddit_dist.items():
    print("Filtering for", subreddit)
from collections import defaultdict

#given sorted data, write the data to a file
def write_sorted_to_file(data, filename):
    #note: writes the Python repr of the data, not strict JSON
    with open(filename, 'w') as f:
        f.write(str(data))
#end write_sorted_to_file

#--- MAIN EXECUTION BEGINS HERE---#

#which files to sort: top-level vs submodules
module_type = package_type.get_type()

#read in the data
print "reading in commit data..."
data = utils.load_json('data_files/all_commits_%s.json' % module_type)
if data == False:
    print "commits file does not exist, exiting"
    exit(0)
print "read in", len(data), "commits"

#sort events by time
print "sorting commits..."
ordered_commits = sorted(data, key=lambda k: k['time'])

#save to a single mega-file
print "saving sorted commits..."
write_sorted_to_file(ordered_commits, "data_files/all_commits_%s_sorted.json" % module_type)
print len(ordered_commits), "sorted commits saved to data_files/all_commits_%s_sorted.json" % module_type

#create directory for commits by year files (if doesn't already exist)
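
#hedged alternative (not part of the original script): write_sorted_to_file above stores the
#Python repr (single quotes, not valid JSON); if downstream tools expect real JSON from the
#.json filename, a json.dump version would look like this - the function name is hypothetical
import json

def write_sorted_to_json(data, filename):
    #serialize the already-sorted commit list as proper JSON
    with open(filename, 'w') as f:
        json.dump(data, f)

#example usage (same call site as write_sorted_to_file above):
#write_sorted_to_json(ordered_commits, "data_files/all_commits_%s_sorted.json" % module_type)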
SIGHT = True

#module-type specifier (at this point, more of a file suffix specifier)
module_type = package_type.get_type()

#adoption condition specifier (another suffix)
if SIGHT:
    print "Adoption requires direct commit view"
    adop_type = "SIGHT"
else:
    print "Adoption from repo history allowed"
    adop_type = "HISTORY"

#load all commits
print "Loading all import commits..."
all_lib_commits = utils.load_json("datafiles/all_add_commits_%s.json" % module_type)

#load first commits for each user
print "Loading all user/repo first commits..."
first_commits = utils.load_json("datafiles/first_commits.json")

#don't have a compiled commit file, yell at the user
if all_lib_commits == False or first_commits == False:
    print "must have compiled commit list datafiles/all_add_commits_%s.json and user first commits datafiles/first_commits.json" % module_type
    print "exiting"
    sys.exit(0)

#compile into a single mega-list, use the lack of a "libs" key to differentiate
#library imports from first commits
print "Combining commit lists..."
all_commits = all_lib_commits + first_commits
        return False
    return True
#end clone_repo

#--- MAIN EXECUTION BEGINS HERE---#

#check for required command line param
if len(sys.argv) != 3:
    print "Must include starting index and # to process for repo cloning process."
    print "Usage: python github_clone_repos.py <starting index> <# to process>"
    sys.exit(0)

#read list of repos to clone
print "Reading repo list..."
repos = utils.load_json("github_files/github_all_repos.json")
print "Read", len(repos['items']), "repos"

#grab and save current working directory
root_dir = os.getcwd()

#create directory for repo clones (if doesn't already exist)
if os.path.isdir("repo_clones") == False:
    os.makedirs("repo_clones")

#create directory for commit data (if doesn't already exist)
if os.path.isdir("commit_data") == False:
    os.makedirs("commit_data")

#grab command line args: starting index and processing limit
idx = int(sys.argv[1])      #starting index from command line parameter
limit = int(sys.argv[2])    #process <command line var> at a time starting from given index
import unicodedata
from collections import OrderedDict
from operator import itemgetter

import file_utils as utils
import plot_utils
import package_type

#--- MAIN EXECUTION BEGINS HERE---#

#file count-type specifier
count_type = package_type.get_type()

self_ref_count = 0

#load counts if have them
import_counts_overall = utils.load_json("import_counts_overall_%s.json" % count_type)
import_repo_counts = utils.load_json("import_repo_counts_%s.json" % count_type)
import_user_counts = utils.load_json("import_user_counts_%s.json" % count_type)

if import_counts_overall == False or import_repo_counts == False or import_user_counts == False:
    import_counts_overall = defaultdict(int)    #number of additions across all repos and users
    import_repos = defaultdict(set)             #list of repos using library
    import_users = defaultdict(set)             #list of users using library

    file_idx = 0

    #for each commit log file:
    for filename in os.listdir('imports_data'):

        #extract repo name
        repo = filename[:-4]
SIGHT = True

#module-type specifier (at this point, more of a file suffix specifier)
module_type = package_type.get_type()

#adoption condition specifier (another suffix)
if SIGHT:
    print "Adoption requires direct commit view"
    adop_type = "SIGHT"
else:
    print "Adoption from repo history allowed"
    adop_type = "HISTORY"

#load adoption events
print "Loading all adoption events..."
adoption_events = utils.load_json("datafiles/adoption_events_%s.json" % (module_type + "_" + adop_type))

#read user->repos mapping (to use as user and repo list)
print "Loading user and repo list..."
user_to_repos = utils.load_json("datafiles/user_to_repo_list.json")

#don't have an adoption event file, yell at the user
if adoption_events == False or user_to_repos == False:
    print "must have compiled adoption event list datafiles/adoption_events_%s.json and user to repo mapping datafiles/user_to_repo_list.json" % (module_type + "_" + adop_type)
    print "exiting"
    sys.exit(0)

#adoption events look like this:
# dictionary, where library is key
# for each library, value is list of adoption events
import file_utils
import cascade_manip

code = "hackernews"

#load list of posts that fit-failed
fit_fail = set(file_utils.load_json("model_files/params/hackernews_failed_param_fit.txt"))

#load hackernews cascades
posts = file_utils.load_pickle("data_cache/hackernews_cascades/hackernews_cascade_posts.pkl")
comments = file_utils.load_pickle("data_cache/hackernews_cascades/hackernews_cascade_comments.pkl")
print("Loaded", len(posts), "posts and", len(comments), "comments")

#remove missing
posts, comments = cascade_manip.remove_missing(code, posts, comments)

#remove posts for which the fit failed
posts = {key: value for (key, value) in posts.items() if key not in fit_fail}
posts, comments = cascade_manip.filter_comments_by_posts(posts, comments)
print("Down to", len(posts), "posts and", len(comments), "comments")

#filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"    #domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"    #domain and subreddit comments

#save to same place as other filtered cascades - use hackernews as domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))