def get_first_repos(auth, headers):
    #check if file exists, if yes just read it in
    data = utils.load_json("github_files/all_repos.json")
    if data != False:
        return data

    #well, that seems to have worked... time to do them all!
    #except it's not really ALL, just the first 1000 - can't seem to sort them,
    #so we'll go with what Github thinks is most interesting
    url = 'https://api.github.com/search/repositories?q=language:python&per_page=100'

    #first request
    r = requests.get(url, auth=auth, headers=headers)
    all_results = r.json()
    url = r.links['next']['url']
    print r.links['last']['url']

    #loop all requests
    count = 1
    print "finished request", count, "received", len(all_results['items']), "items"
    while url != "":
        #perform request and add results to previous
        r = requests.get(url, auth=auth, headers=headers)
        res = r.json()
        all_results['items'].extend(res['items'])
        count = count + 1
        print "finished request", count, "received", len(res['items']), "items"

        #get url for next request
        if 'next' in r.links:
            url = r.links['next']['url']
        else:
            url = ""
        print url

    #save all results to json file
    utils.save_json(all_results, "github_files/github_all_repos.json")

    return all_results
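#The utils JSON helpers assumed by these GitHub-scraping functions are not shown here.
#A minimal sketch consistent with how they are called above (load_json returns False when
#the file does not exist yet; save_json(data, path) writes the object out) - this is an
#assumption for illustration, not the project's actual module:
import json
import os

def load_json(path):
    #return False if the file is missing, matching the "if data != False" checks above
    if not os.path.exists(path):
        return False
    with open(path, "r") as f:
        return json.load(f)

def save_json(data, path):
    #dump the object to the target file as JSON
    with open(path, "w") as f:
        json.dump(data, f)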
def get_search_tweets(self, keywords, max_tweets, days_ago):
    '''
    Continuously search for tweets created over a given number of days.
    The search dates and search phrase can be changed below.

    - runtime limit in hours
    - number of tweets per search (will be iterated over) - maximum is 100
    - search limits e.g., from 7 to 8 gives current weekday from last week;
      min_days_old=0 will search from right now
    - this geocode includes nearly all American states (and a large portion of Canada):
      USA = '39.8,-95.583068847656,2500km'
    '''
    #loop over search items, creating a new file for each
    for query in keywords:
        self.search = True
        print('Search phrase =', query)
        fx.create_folder('data/' + query)

        start = dx.get_date(0, date_object=True)
        end = dx.get_date(days_ago, date_object=True)
        date_range = dx.date_range_day(start, end, 1)   #renamed from "range" to avoid shadowing the builtin

        for day in date_range:
            tweets_tosave = []
            while self.search is True:
                if self.initial is True:
                    start = dx.get_date(1)
                    end = day
                    self.max_id = self.get_tweet_id(query, 1, end)
                    since_id = self.get_tweet_id(query, 1, start)
                    self.initial = False

                if since_id is False:
                    pass

                tweets = self.get_search_page(query, max_tweets, since_id, self.max_id)
                if tweets:
                    tweets_tosave.extend(tweets)
                    if len(tweets_tosave) >= 1000:
                        fname = dx.format_date_str(
                            tweets_tosave[-1]['created_at'],
                            '%a %b %d %H:%M:%S %z %Y',
                            '%d-%m-%y_%H-%M-%S') + '_' + str(tweets_tosave[-1]['id'])
                        fx.save_json('data/' + query + '/' + str(fname), tweets_tosave)
                        tweets_tosave = []
                else:
                    pass
def process_irregular_verbs():
    """
    Read irregular verbs from a downloaded Wikipedia page, parse it and save.
    """
    doc = pq(filename=f'{Directories.raw_data}/irregular_verbs.html')
    table = doc(".wikitable")
    raw_cells = table("tr td:first")
    words = parse_cells(raw_cells)
    save_json(words, f"{Directories.processed_data}/irregular_verbs.json")
def top_level_comment_response_dist(code, cascades = False, comments = False, bin_minutes = 1, remove_first = True):
    #load data if missing
    if cascades == False or comments == False:
        cascades, comments, missing_posts, missing_comments = build_cascades(code)

    print("\nComputing top-level comment response time distribution")

    #response time dictionary: time in minutes -> number of responses with that delay
    response_times = defaultdict(int)

    #for each post, look at all top-level replies
    for post_id, post in cascades.items():
        #if this post is a dummy object, tell the user and quit
        if post['placeholder']:
            print("Data contains placeholder post. Please use remove_missing to filter out incomplete cascades first.")
            exit(0)

        post_time = post['created_utc']   #grab post time to compute reply delay

        for comment_id in post['replies']:   #loop replies
            #get response time in minutes for this comment, binned by bin_minutes
            response_time = int((comments[comment_id]['created_utc'] - post_time) / (bin_minutes * 60.0)) * bin_minutes
            #if response time is somehow negative, print a warning but keep running
            if response_time < 0:
                print("Warning: negative response time!")
            #add one to counter for this response time
            response_times[response_time] += 1

    #throw out first minute (bots)
    if remove_first == True:
        response_times.pop(0, None)

    #convert frequencies to probability distribution function
    total = sum(response_times.values())
    for key in response_times.keys():
        response_times[key] /= total

    #save response time distribution, but only if bin_minutes = 1
    if bin_minutes == 1:
        print("Saving top-level comment response time distribution to results/%s_top_level_comment_response_time_dist_%s.json" % (code, bin_minutes))
        file_utils.verify_dir("results")
        file_utils.save_json(response_times, "results/%s_top_level_comment_response_time_dist_%s.json" % (code, bin_minutes))

    #plot everything
    print("Plotting top-level comment response time distribution to plots/%s_top_level_comment_response_times_%s.png" % (code, bin_minutes))
    file_utils.verify_dir("plots")
    plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies",
        "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes,
        filename = "plots/%s_top_level_comment_response_times_%s_log.png" % (code, bin_minutes),
        x_min = 0, log_scale_x = True, log_scale_y = True)
    plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies",
        "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes,
        filename = "plots/%s_top_level_comment_response_times_%s_zoom_log.png" % (code, bin_minutes),
        x_min = 0, x_max = 60*24, log_scale_x = True, log_scale_y = True)
    plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies",
        "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes,
        filename = "plots/%s_top_level_comment_response_times_%s_zoom.png" % (code, bin_minutes),
        x_min = 0, x_max = 60*24)
def save_sim_json(group, sim_post_id, random_post, time_observed, min_node_quality, max_nodes, estimate_initial_params, sim_events, outfile):
    #save sim results to output file - json with events and run settings
    print("Saving results to", outfile + ".json...")

    #write to json, include some run info
    output = {
        'group': group,
        'post_id': sim_post_id,
        'post_randomly_selected': random_post,
        'time_observed': time_observed,
        'min_node_quality': min_node_quality,
        'max_graph_size': max_nodes,
        'estimate_initial_params': estimate_initial_params,
        'data': sim_events
    }
    file_utils.save_json(output, outfile + ".json")
def get_subreddits(code, cascades = False, display = False):
    #no cascades, load them first
    if cascades == False:
        cascades, comments, missing_posts, missing_comments = build_cascades(code)

    #get distribution
    subreddit_dist = data_utils.dictionary_field_dist(cascades, 'subreddit')

    #print distribution if desired
    if display:
        for key, value in subreddit_dist.items():
            print(key, value)

    #save distribution to json file
    print("Saving subreddit distribution to results/%s_post_subreddit_dist.json" % code)
    file_utils.verify_dir("results")
    file_utils.save_json(subreddit_dist, "results/%s_post_subreddit_dist.json" % code)

    return subreddit_dist
#glob filestring to get all results files
filestring = "dryrun/submit/sim_res_*.json"

#get list of all matching files
files = glob.glob(filestring)

prefix = "t3_"   #prefix to prepend to all id references

#process each file individually, correcting ids along the way
for file in files:
    print("\nCorrecting", file)

    #load the json
    data = file_utils.load_json(file)
    print("  ", len(data['data']), "events to fix")

    #correct comment/post records, where each is a dictionary of the following form:
    #{"parentID": "A4XW5Jol_qVgUAKDWeOeaw", "communityID": "t5_3i6d8", "rootID": "A4XW5Jol_qVgUAKDWeOeaw", "nodeUserID": "okcc60doiWAfkR89nAAvHQ", "nodeTime": "1501876531", "nodeID": "A4XW5Jol_qVgUAKDWeOeaw", "actionType": "post"}
    for event in data['data']:
        #fix id fields
        event['parentID'] = prefix + event['parentID']
        event['rootID'] = prefix + event['rootID']
        event['nodeID'] = prefix + event['nodeID']

    #save the updated file overtop of the old one
    file_utils.save_json(data, file)
    print("Corrected file saved to", file)
def create_map(data_path, kw_path, conf_thresh, silence_thresh, n_jobs=8, dir_map={}, outpath="complete_map.json", overwrite=True, save_interval=5):
    yamnet = Yamnet()
    keywords = file_utils.load_keywords(kw_path)
    kw_filt = extract_labels.FilterStems(keywords, conf_thresh)
    existing_keys = dir_map.keys()
    analyzed = 0

    for root, dirs, files in os.walk(data_path):
        if len(dirs) > 0:
            session_name = get_session_name(dirs)

        write_entry = True
        if session_name is not None and overwrite == False and session_name in existing_keys:
            write_entry = False

        if session_name is not None and len(files) > 0 and write_entry:
            dir_map[session_name] = {}

            #verify the files are valid ones
            valid_files = [os.path.abspath(os.path.join(root, f)) for f in files if is_valid_file(f)]

            print(f"{n_jobs} jobs extracting {len(valid_files)} clips from {session_name}")
            extracted_clips = joblib.Parallel(n_jobs=n_jobs, backend="threading")(
                joblib.delayed(extract_clips)(f, silence_thresh, 2048, 1024, 4096) for f in valid_files)

            print(f"calculating features for {len(valid_files)} tracks in {session_name}")
            for i, (clips, intervals, num_samps) in enumerate(extracted_clips):
                full_path = valid_files[i]

                # REMOVE SILENCE BEFORE YAMNET PROCESSING
                # clips, intervals, num_samps = extract_clips(full_path,
                #                                             silence_thresh,
                #                                             ws=2048,
                #                                             hop=1024,
                #                                             min_len=4096)

                audioset_classes = []
                corrected_intervals = []
                corrected_num_samps = 0

                #consider chopping these up into smaller bits for input to yamnet for
                #more accurate classification
                for j, (clip, interval) in enumerate(zip(clips, intervals)):
                    subframes = get_frames(clip, 16384, 16384)   #split each clip into sub-frames
                    print(f'processing {len(subframes)} sub frames')
                    for k, sf in enumerate(subframes):   #use a separate index so we don't clobber the track index i
                        sf_index_start = k * 16384
                        sf_index_end = (k + 1) * 16384
                        sf_interval = [interval[0] + sf_index_start, interval[1] + sf_index_end]
                        classes = yamnet.predict_classes(waveform=clip[sf_index_start:sf_index_end], sr=16000, num_top=5)
                        #json serializer is very picky, so all these seemingly pointless casts are required...
                        #audioset_classes.append(list(classes.astype(np.int16)))
                        audioset_classes.append(classes.tolist())
                        #because we're making a prediction based on a DOWNSAMPLED version of the
                        #track, we need to convert our sample indices back up to 44.1kHz
                        #for an accurate location
                        sf_interval = (np.array(sf_interval) / 16000) * 44100
                        corrected_intervals.append([int(sf_interval[0]), int(sf_interval[1])])
                        corrected_num_samps += int(sf_interval[1]) - int(sf_interval[0])

                track = os.path.splitext(os.path.basename(full_path))[0]   #remove full path and ext .wav
                print(f'{track} - {corrected_num_samps} samples matching audioset classes')
                dir_map[session_name][track] = {}
                dir_map[session_name][track]["path"] = str(full_path)
                dir_map[session_name][track]["keywords"] = list(kw_filt.filter(track))
                dir_map[session_name][track]["numsamps"] = int(num_samps)
                dir_map[session_name][track]["intervals"] = list(corrected_intervals)
                dir_map[session_name][track]["audioset"] = audioset_classes

                analyzed += 1
                if analyzed % save_interval == save_interval - 1:
                    print("save interval reached, saving map...")
                    file_utils.save_json(outpath, dir_map, indent=2)

    return dir_map
def match_sites_dataframe(dataframe, matches_json="", top_n=5):
    '''
    Generates a dataframe of matched sites.
    matches_json is an optional parameter for saving and loading slow-to-generate
    description-based matches.

    INPUTS:
    - dataframe
    - matches_json -- A string representing the filename of a json file containing old matches to speed up processing
    - top_n (int) -- Maximum number of matches to return for each item

    OUTPUTS:
    - matches_df
    '''
    #Missing values should be represented by empty strings
    dataframe = dataframe.fillna(value="")

    #Ensure we have the correct columns
    dataframe = pandas.DataFrame(dataframe.to_dict("records"), columns=ALL_FIELDNAMES)

    #Fill any columns we just added with "-1" to mark they weren't originally there
    dataframe = dataframe.fillna(value="-1")

    #Make sure everything in the dataframe is a string
    dataframe = dataframe.applymap(lambda x: str(x))

    #Remove extra whitespace
    dataframe = dataframe.applymap(lambda x: x.strip() if type(x) == str else x)

    if "Match Site" in dataframe.columns:
        ndf = dataframe[dataframe["Match Site"] == "-1"]
        if ndf.empty:
            #No new rows.
            return pandas.DataFrame()
        odf = dataframe[dataframe["Match Site"] != "-1"]
        if odf.empty:
            old_rows = []
        else:
            old_rows = odf.to_dict("records")
        new_rows = ndf.to_dict("records")
    else:
        new_rows = dataframe.to_dict("records")
        old_rows = []

    #Add a 'Description' field to new_rows
    site_rows = [{**row, "Description": row["Stock Description"]} for row in new_rows]

    old_site_rows = remove_duplicate_rows(old_rows)
    old_item_ids_to_rows = generate_item_ids_to_rows(old_rows)

    #Generate desc_matches based on matches_json
    desc_matches = {}
    if matches_json:
        if file_utils.file_exists(matches_json):
            desc_matches = file_utils.read_json(matches_json)
        else:
            desc_matches = match_by_description(site_rows, old_site_rows)
            file_utils.save_json(matches_json, desc_matches)

    matches_rows = match_sites(site_rows, old_site_rows, old_item_ids_to_rows, desc_matches, top_n=top_n)
    matches_df = pandas.DataFrame(matches_rows, columns=OUTPUT_FIELDNAMES)
    matches_df = matches_df.fillna(value="")
    matches_df = matches_df[OUTPUT_FIELDNAMES]

    return matches_df
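#A hypothetical call to match_sites_dataframe, assuming a pandas DataFrame loaded from a CSV
#with the expected columns. The file names and the cache path below are made up for
#illustration only; they are not part of the original code.
import pandas

candidates = pandas.read_csv("new_sites.csv")   #hypothetical input file
#cache the slow description-based matches in a side file so reruns are fast
matches = match_sites_dataframe(candidates, matches_json="desc_matches_cache.json", top_n=5)
matches.to_csv("matched_sites.csv", index=False)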
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str, default="./multitracks/", help="path to downloaded multitracks")
    parser.add_argument("-kw", type=str, default="keywords.txt", help="keywords txt file that specifies search terms")
    parser.add_argument("--c_thresh", type=int, default=80, help="confidence threshold for fuzzy string matching")
    parser.add_argument("-o", type=str, default="./dataset_map.json", help="output file for dataset map")
    args = parser.parse_args()

    # kw_filter = FilterStems(file_utils.load_keywords(args.kw), args.c_thresh)
    print("filtering multitrack stems for labels")
    dir_map = create_label_map(args.path, args.kw, args.c_thresh)
    file_utils.save_json(args.o, dir_map)
    print(f"directory map saved to {args.o}")
parser = argparse.ArgumentParser()
parser.add_argument("--path", type=str, default="./multitracks/", help="path to downloaded multitracks")
parser.add_argument("--out", type=str, default="./complete_map.json", help="output path of verified map")
parser.add_argument("--map", type=str, default=None, help="path to existing dataset map which, if specified, will be modified in place")
parser.add_argument("--overwrite", type=bool, default=True, help="overwrite existing entries if existing dataset map is loaded")
parser.add_argument("--save_interval", type=int, default=5, help="how often to save the dataset map while processing tracks")
parser.add_argument("-kw", type=str, default="keywords.txt", help="keywords txt file that specifies search terms")
parser.add_argument("--c_thresh", type=int, default=80, help="confidence threshold for fuzzy string matching")
parser.add_argument("--thresh_db", type=int, default=45, help="threshold in db to reject silence")
parser.add_argument("--n_jobs", type=int, default=8, help="num parallel worker threads to load & process audio files")
args = parser.parse_args()

dataset_map = {}   #initialize empty dict
if args.map is not None:
    #if an existing map is provided, it will be modified in place (overwriting existing entries)
    dataset_map = file_utils.load_json(args.map)

dataset_map = create_map(args.path, args.kw, args.c_thresh, args.thresh_db, args.n_jobs,
                         dataset_map, args.out, args.overwrite, args.save_interval)
file_utils.save_json(args.out, dataset_map, indent=2)
def get_repos(all_contrib, all_repos, user_to_repo, repo_to_contrib, auth, headers):
    #since this will take more than 5000 requests, keep a bookmark in a file - in case something goes wrong

    #read simple list of users that are done already, will update
    finished_users = utils.load_json("github_files/github_finished_users.json")
    if finished_users == False:
        finished_users = list()

    #if all the users we have so far are already finished, return input as results instead of loop-checking
    if len(finished_users) == len(all_contrib):
        return all_contrib, all_repos, user_to_repo, repo_to_contrib

    #also keep list of users that don't search properly (private repos?)
    bad_users = utils.load_json("github_files/bad_users.json")
    if bad_users == False:
        bad_users = list()

    #count users done
    user_count = 0

    #loop all users (should be all contributors of the 1000 initial repos), fetch their python repos
    for user in all_contrib:
        #check if we did this user already, if so skip
        if user['id'] in finished_users:
            continue

        #build request url for this user
        url = "https://api.github.com/search/repositories?q=language:python+user:%s&per_page=100" % (user['login'])

        #do request, including any pages
        while url != "":
            #sleep for ~2 seconds before next request, to prevent getting kicked off
            #(search requests limited to 30 per minute)
            time.sleep(2)

            #get the json!
            r = requests.get(url, auth=auth, headers=headers)
            res = r.json()

            #handle bad results and try to continue
            if 'items' not in res:
                #rate limit? wait for 60 seconds and try the same url again
                if 'documentation_url' in res and res['documentation_url'] == "https://developer.github.com/v3/#rate-limiting":
                    print "rate limit wait"
                    time.sleep(60)
                    continue
                #server error 502 - try the request again
                elif "message" in res and res['message'] == "Server Error":
                    continue
                #bad results for this particular user, they might be private now - skip and move to next
                else:
                    #print res
                    bad_users.append(user)
                    break

            #good results, parse and store
            for repo in res['items']:
                #new repo, add to list of all
                if repo['id'] not in repo_to_contrib:
                    all_repos['items'].append(repo)
                #always add to correlative structures
                if user['id'] not in user_to_repo:
                    user_to_repo[user['id']] = list()
                if repo['id'] not in user_to_repo[user['id']]:
                    user_to_repo[user['id']].append(repo['id'])
                if repo['id'] in repo_to_contrib and user['id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(user['id'])
                elif repo['id'] not in repo_to_contrib:
                    repo_to_contrib[repo['id']] = list()

            #see if more pages, if so fetch them
            if 'next' in r.links:
                url = r.links['next']['url']
            else:
                #no more pages, quit for this user
                url = ""

        #finished user, add to bookmark
        finished_users.append(user['id'])
        user_count += 1

        #intermediate saves and prints
        if user_count % 100 == 0:
            #save all repos to json file
            utils.save_json(all_repos, "github_files/github_all_repos.json")
            #save correlative lists
            utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
            print "Saved repos of", user_count, "users"
            #save bad users list
            utils.save_json(bad_users, "github_files/github_bad_users.json")
            #save list of finished users
            utils.save_json(finished_users, "github_files/github_finished_users.json")
    #end for users

    #final save before return
    #save all repos to json file
    utils.save_json(all_repos, "github_files/github_all_repos.json")
    #save correlative lists
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
    #save bad users list
    utils.save_json(bad_users, "github_files/github_bad_users.json")
    #save list of finished users
    utils.save_json(finished_users, "github_files/github_finished_users.json")

    print "Saved all data to files"

    return all_contrib, all_repos, user_to_repo, repo_to_contrib   #return all results
    #read in all commits
    commits = utils.load_json("data_files/all_commits_%s_small.json" % module_type)
    if commits == False:
        print "need compiled commits file data_files/all_commits_%s.json, exiting" % module_type
        exit(0)

    #process each commit
    for commit in commits:
        user_all_commit_counts[commit[user]] += 1
        repo_all_commit_counts[commit[repo]] += 1
    print "COMPLETE"

    #save results
    utils.save_json(user_all_commit_counts, "data_files/user_all_commit_counts.json")
    utils.save_json(repo_all_commit_counts, "data_files/repo_all_commit_counts.json")
    print "final commit counts saved to data_files/user_all_commit_counts.json and data_files/repo_all_commit_counts.json"

else:
    print "commit counts already computed, plotting frequencies"

#user commit plots
user_all_commit_freq, min_user_commit, max_user_commit = plot_utils.count_freq(user_all_commit_counts)
plot_utils.plot_freq(user_all_commit_freq, "user commit count", "freq", filename = "plots/user_all_commit_freq.jpg")
print "user all commit counts: min =", min_user_commit, ", max =", max_user_commit

#repo commit plots
repo_all_commit_freq, min_repo_commit, max_repo_commit = plot_utils.count_freq(repo_all_commit_counts)
plot_utils.plot_freq(repo_all_commit_freq, "repo commit count", "freq", filename = "plots/repo_all_commit_freq.jpg")
print "repo all commit counts: min =", min_repo_commit, ", max =", max_repo_commit
        #clear list and counts
        imports = defaultdict(list)
        imports_count = 0

        #grab new commit data, replace the old
        user, time = get_user_and_time(line, email_to_id, name_to_id)
        if user == False:
            continue

    #diff line
    else:
        lib = parse_import(line)
        if not lib:   #empty imports, skip
            continue
        imports_count = imports_count + 1
        if line.startswith("+"):
            imports['+'].extend(lib)
        else:
            imports['-'].extend(lib)

#finished file, save any lingering data
commits_list.append([user, time, imports])   #save commit

#save file commit data to json
utils.save_json(commits_list, "parsed_commit_data/%s.log" % filename[:-12])

#period prints
file_idx = file_idx + 1
if file_idx % 500 == 0:
    print "finished", file_idx, "files"

#break
            'children_ids': list(),
            'time': (comment['nodeTime'] - post_time) / 60,
            'parent': comment['parentID']
        }
        for comment in comments
    }

    #add all comments to correct child list
    for comment in comments:
        if comment['parentID'] == post_id:
            output[post_id]['root']['children'].append(comment_tree[comment['nodeID']])
        else:
            comment_tree[comment['parentID']]['children_ids'].append(comment['nodeID'])
            comment_tree[comment['parentID']]['children'].append(comment_tree[comment['nodeID']])

    #remove unnecessary fields: children_ids, parent
    for comment_id, comment_dict in comment_tree.items():
        comment_dict.pop('children_ids', None)
        comment_dict.pop('parent', None)

    post_count += 1
    if post_count % 100 == 0:
        print("Finished", post_count, "posts")

#save results
print("Saving results")
file_utils.save_json(output, outfile)
        name_count[name] += 1
        email_count[email] += 1

        #periodic progress prints
        file_idx = file_idx + 1
        if file_idx % 1000 == 0:
            print "finished", file_idx, "files"
    print "COMPLETE"

#have lists, convert back to dictionary for sorting
else:
    name_count = dict(name_count)
    email_count = dict(email_count)

#remove all names/emails that only occur once
name_count = dict((k, v) for k, v in name_count.iteritems() if v > 1)
email_count = dict((k, v) for k, v in email_count.iteritems() if v > 1)

#sort
name_count = sorted(name_count.items(), key=operator.itemgetter(1), reverse=True)
email_count = sorted(email_count.items(), key=operator.itemgetter(1), reverse=True)

#save results
utils.save_json(name_count, "data_files/author_name_freq.json")
utils.save_json(email_count, "data_files/author_email_freq.json")
                    lib = package
                #not counting with submodules, get parent package only
                else:
                    lib = strip_lib(package)

                #add package to libs list for this commit if not relative path and not already in list
                if lib[0] != '.' and lib not in new_commit["del_libs"]:
                    new_commit["del_libs"].append(lib)

        #add commit to list of all (may have empty import list)
        all_commits.append(new_commit)

        #period prints
        file_idx = file_idx + 1
        if file_idx % 1000 == 0:
            print "finished", file_idx, "repo files"

    #save all commits to json (large file incoming)
    utils.save_json(all_commits, "data_files/all_commits_%s.json" % module_type)
    print "results saved to data_files/all_commits_%s.json" % module_type
    print "   ", missing_time_count, "commits without timestamp (not included in compiled list)"

else:
    print "read in all commits from data_files/all_commits_%s.json" % module_type

#regardless, print number of total commits
print len(all_commits), "commits total"
import file_utils
import glob

#list of files to combine
files = glob.glob('output/*.json')
print(len(files), "files:", files)

count = 0
all_data = None
event_count = 0

for file in files:
    file_data = file_utils.load_json(file)
    event_count += len(file_data['data'])
    if count == 0:
        all_data = file_data
    else:
        all_data['data'].extend(file_data['data'])
    count += 1

print("Sum event count:", event_count)
print("Combined event count:", len(all_data['data']))

#save all data
file_utils.save_json(all_data, "output/all_cyber_sim_res.json")
def get_contrib(all_repos, auth, headers, all_contrib=False, user_to_repo=False, repo_to_contrib=False, new=False):
    #load existing data from files if not passed in and files exist
    if all_contrib == False:
        #existing contributors
        all_contrib = utils.load_json("github_files/github_all_contrib.json")
    if user_to_repo == False:
        #user->repos dict
        user_to_repo = dict_key_to_int(utils.load_json("github_files/github_user_to_repo.json"))
    if repo_to_contrib == False:
        #repo->contribs dict
        repo_to_contrib = dict_key_to_int(utils.load_json("github_files/github_repo_to_contrib.json"))
    if new == False:
        return all_contrib, user_to_repo, repo_to_contrib

    #if no contributors list or correlative dictionaries, initialize empty containers
    if all_contrib == False or user_to_repo == False or repo_to_contrib == False:
        user_to_repo = defaultdict(list)      #user id to list of repo ids
        repo_to_contrib = defaultdict(list)   #repo id to list of contrib
        all_contrib = list()

    #keep a bookmark of finished repos
    finished_repos = utils.load_json("github_files/github_finished_repos.json")
    if finished_repos == False:
        finished_repos = list()
    else:
        print "read in", len(finished_repos), "finished repos"

    #check the rate limit before we start by making a dummy request, sleep if we need to
    url = 'https://api.github.com/repos/vinta/awesome-python/contributors'   #any url will do
    r = requests.get(url, auth=auth, headers=headers)
    check_rate_limit(r)

    #loop all repos from list, fetch contributors if don't have them
    repo_count = 0
    for repo in all_repos['items']:
        #check if have contributors for this repo already, skip if yes
        if repo['id'] in finished_repos:
            continue

        #need to fetch contributors for this repo
        #print "Fetching repo", repo['id']
        contrib_count = 0

        #get request url for this repo
        url = repo['contributors_url']
        while url != "":
            #get the json!
            r = requests.get(url, auth=auth, headers=headers)

            #check for 204 response code - seems to indicate empty repo, and fails on json parse
            if r.status_code == 204:
                break

            #parse request response to json
            res = r.json()
            contrib_count = contrib_count + len(res)

            #repo not found (probably made private), skip and move to next
            if type(res) is not list and "message" in res and res['message'] == "Not Found":
                break
            #server error 502 - try the request again
            elif type(res) is not list and "message" in res and res['message'] == "Server Error":
                continue
            #other fail? dump some output and quit so we can fix it
            elif type(res) is not list:
                print r
                print res
                print url
                exit(0)

            #parse out this request result
            for usr in res:
                #new usr, add to list of all
                if usr['id'] not in user_to_repo:
                    all_contrib.append(usr)
                #always add to correlative structures
                if usr['id'] in user_to_repo and repo['id'] not in user_to_repo[usr['id']]:
                    user_to_repo[usr['id']].append(repo['id'])
                elif usr['id'] not in user_to_repo:
                    user_to_repo[usr['id']] = list()
                if usr['id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(usr['id'])

            #see if more pages, fetch if yes
            if 'next' in r.links:
                url = r.links['next']['url']
            else:
                #no new pages, done
                url = ""

            #check the rate limit, sleep if we need to
            check_rate_limit(r)

            #sleep for ~0.5 seconds to space out the requests better
            time.sleep(0.5)

        #print "Repo", repo['id'], ":", contrib_count, "contributors"
        repo_count += 1
        finished_repos.append(repo['id'])

        #intermediate saves... just in case
        if repo_count % 100 == 0:
            #save all contrib to json file
            utils.save_json(all_contrib, "github_files/github_all_contrib.json")
            #save correlative lists
            utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
            #save bookmark
            utils.save_json(finished_repos, "github_files/github_finished_repos.json")
            print "saved contributors of", repo_count, "repos"

    #all done - save results
    #save all contrib to json file
    utils.save_json(all_contrib, "github_files/github_all_contrib.json")
    #save correlative dictionaries
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib, "github_files/github_repo_to_contrib.json")
    #final bookmark
    utils.save_json(finished_repos, "github_files/github_finished_repos.json")

    #return results
    return all_contrib, user_to_repo, repo_to_contrib
                user_id = name_to_id[name]
            else:
                user_id = email_to_id[email]
        #diff line
        else:
            imports = True

        file_idx = file_idx + 1
        if file_idx % 1000 == 0:
            print "finished", file_idx, "files"
    print "COMPLETE"

    #save results
    utils.save_json(user_commit_counts, "user_commit_counts.json")
    utils.save_json(file_commit_counts, "file_commit_counts.json")
    print "final user list saved to user_commit_counts.json and file_commit_counts.json"

else:
    print "counts already computed, plotting frequencies"

#user commit plots
user_commit_freq, min_user_commit, max_user_commit = plot_utils.count_freq(user_commit_counts)
plot_utils.plot_freq(user_commit_freq, "user commit count", "freq", filename="user_commit_freq.jpg")
print "user commit counts: min =", min_user_commit, ", max =", max_user_commit

#repo commit plots
        lib_idx = lib_idx + 1
        if lib_idx % 1000 == 0:
            print "   finished", lib_idx, "libraries"

#finished, print the counts
print "processed", event_idx, "adoption events across", lib_idx, "libraries"
print "   ", multi_source, "of these events have multiple sources, max number of sources is", max_source
print intra_repo, "adoption source-target pairs within same repo"
print cross_repo, "adoption source-target pairs across different repos"
print zero_count, "adoption source-target pairs with time delay of 0"

#save lib adoption counts sorted most to least
lib_adop_counts_sorted = OrderedDict(sorted(lib_adop_counts.items(), key=itemgetter(1), reverse=True))
utils.save_json(lib_adop_counts_sorted, "datafiles/lib_adop_counts_sorted_%s.json" % (module_type + "_" + adop_type))

#plots use usage counts, adoption counts, and average delta t:
use_counts = []
adop_counts = []
avg_delta = []
for lib in usage_counts:
    if lib in lib_adop_counts:
        use_counts.append(usage_counts[lib])
        adop_counts.append(lib_adop_counts[lib])
        avg_delta.append(lib_delta[lib])

#total # of usages for library on x, total # of adoptions for lib on y
plot_utils.plot_data(use_counts, adop_counts,
        else:
            import_repo_counts[k] = len(import_repos[k])
            import_user_counts[k] = len(import_users[k])
        import_repos_list[k] = list(import_repos[k])
        import_users_list[k] = list(import_users[k])

    #sort the counts (painful conversion, hopefully not too slow)
    import_counts_overall = OrderedDict(sorted(import_counts_overall.items(), key=itemgetter(1), reverse=True))
    import_repo_counts = OrderedDict(sorted(import_repo_counts.items(), key=itemgetter(1), reverse=True))
    import_user_counts = OrderedDict(sorted(import_user_counts.items(), key=itemgetter(1), reverse=True))

    #save counts to json
    utils.save_json(import_counts_overall, "import_counts_overall_%s.json" % count_type)
    utils.save_json(import_repo_counts, "import_repo_counts_%s.json" % count_type)
    utils.save_json(import_user_counts, "import_user_counts_%s.json" % count_type)

    #save the lists too (why not)
    utils.save_json(import_repos_list, "import_repos_lists_%s.json" % count_type)
    utils.save_json(import_users_list, "import_users_lists_%s.json" % count_type)

    print "results saved to import_??_counts.json (3 files) and import_??_lists.json (2 files)"

else:
    print "read in counts, plotting distributions"
parser.add_argument("-kw", type=str, default="vox", help="keyword filter, only one allowed. See keywords.txt for full list") parser.add_argument("--approve", type=str, nargs="+", default="Speech", help="AudioSet labels to match in source content. Multiple string values allowed (case sensitive)") parser.add_argument("--reject", type=str, default="Silence", help="AudioSet labels to reject. Multiple string values allowed (case sensitive)") parser.add_argument("--map", type=str, default="dataset_map.json", help="path do json dataset map") parser.add_argument("--thresh", type=int, default=45, help="threshold in db to reject silence") args = parser.parse_args() # with open(args.map) as json_file: # dataset_map = json.load(json_file) dataset_map = file_utils.load_json(args.map) yam_approve = list(args.approve) yam_reject = list(args.reject) if os.path.exists(args.out): print(f'{args.out} already exists, modifying...') verified_classmap = file_utils.load_json(args.out) else: verified_classmap = {} verified = verify_classes_yamnet(verified_classmap, dataset_map, args.thresh, args.kw, yam_approve, yam_reject) file_utils.save_json(args.out, verified) print(f'saved verified map to {args.out}')
file_utils.save_pickle(active_users, users_filepath % code)
print("Saved", len(active_users), "active users to", users_filepath % code)

#fit params to all of the cascades, if no file
#no need to load if we have them, won't use them again
if file_utils.verify_file(params_filepath % code):
    print("Params exist in", params_filepath % code)
else:
    #fit params to all cascades
    all_params, fit_fail_list = cascade_analysis.fit_all_cascades(code, cascades, comments, True)

    #save list of failed fits (if exists)
    if len(fit_fail_list) != 0 and file_utils.verify_file("model_files/params/%s_failed_param_fit.txt" % code) == False:
        file_utils.save_json(fit_fail_list, "model_files/params/%s_failed_param_fit.txt" % code)
        print("Saved list of fit-fail stories to model_files/params/%s_failed_param_fit.txt" % code)

    #save to text file now
    with open(params_filepath % code, "w") as f:
        for post_id, params in all_params.items():
            f.write(str(posts[post_id]['id']) + " ")   #write numeric post id
            for i in range(len(params)):
                f.write((' ' if i > 0 else '') + str(params[i]))
            f.write("\n")
    print("Saved text-readable params to", params_filepath % code)

#don't build graph, would be way too big
'''
def build_cascades(code, posts = False, comments = False):
    #if cascades already exist, read from cache
    if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) and (os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)) or os.path.exists("data_cache/%s_cascades/%s_cascade_comments_1.pkl" % (code, code))):
        #load from pickle
        print("Loading cascades from data_cache")
        cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))

        #comments: either a single file, or multiple files
        print("Loading comments from data_cache")
        if os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)):
            comments = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
        else:
            comments = {}
            files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
            for file in files:
                print("Loading", file)
                new_comments = file_utils.load_pickle(file)
                comments.update(new_comments)

        missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
        missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

        print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
        print("     ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")
        return cascades, comments, missing_posts, missing_comments

    #if no cached cascades, build them from scratch

    #if no loaded posts/comments, load those up first
    if posts == False or comments == False:
        posts, comments = load_model_data.load_reddit_data(code)

    print("Extracting post/comment structure for", len(posts), "posts and", len(comments), "comments")

    #add replies field to all posts/comments, init to empty list
    data_utils.add_field(posts, "replies", [])
    data_utils.add_field(comments, "replies", [])
    #add placeholder field to all posts/comments, flag indicates if we created a dummy object
    data_utils.add_field(posts, 'placeholder', False)
    data_utils.add_field(comments, 'placeholder', False)

    #add comment_count field to all post objects as well: count total number of comments all the way down the cascade
    data_utils.add_field(posts, "comment_count_total", 0)
    #and direct replies only
    data_utils.add_field(posts, "comment_count_direct", 0)
    #and add a missing_comments field to all post objects: set True if we find any missing comments in this cascade
    data_utils.add_field(posts, "missing_comments", False)

    #grab list of fields for each type of object (used to create placeholders when items are missing)
    post_fields = list(posts[0].keys())
    comment_fields = list(comments[0].keys())

    '''
    id_h = post/comment id
    parent_id_h = direct parent
    link_id_h = post parent
    if a parent_id starts with t1_, you remove t1_ and match the rest against a comment id
    if it starts with t3_, you remove t3_ and match the rest against a submission id
    link_id always starts with t3_, since it always points to a submission
    '''

    #create dictionary of post id -> post object to store cascades
    cascades = data_utils.list_to_dict(posts, "id_h")

    #convert list of comments to dictionary, where key is comment id
    comments = data_utils.list_to_dict(comments, "id_h")

    #now that we can find posts and comments at will, let's build the dictionary!
    #loop all comments, assign to immediate parent and increment comment_count of post parent
    comment_count = 0
    missing_comments = set()   #missing comments
    missing_posts = set()      #missing posts
    for comment_id in list(comments.keys()):
        #get immediate parent (post or comment)
        direct_parent = comments[comment_id]['parent_id_h'][3:]
        direct_parent_type = "post" if comments[comment_id]['parent_id_h'][:2] == "t3" else "comment"
        #get post parent
        post_parent = comments[comment_id]['link_id_h'][3:]
        comment_count += 1

        #add this comment to replies list of immediate parent, and update counters on post_parent
        try:
            #if post parent missing, create placeholder
            if post_parent not in cascades:
                cascades[post_parent] = create_object(post_parent, post_fields)
                missing_posts.add(post_parent)

            #update overall post comment count for this new comment
            cascades[post_parent]['comment_count_total'] += 1

            #now handle direct parent, post or comment
            #parent is post
            if direct_parent_type == "post":
                #missing post, create placeholder to hold replies
                if direct_parent not in cascades:
                    cascades[direct_parent] = create_object(direct_parent, post_fields)
                    missing_posts.add(direct_parent)
                #add this comment to replies field of post (no total comment increment, done above)
                cascades[direct_parent]['replies'].append(comment_id)
                #add 1 to direct comment count field
                cascades[direct_parent]['comment_count_direct'] += 1

            #parent is comment
            else:
                #missing comment, create placeholder to contain replies, point to parent post by default
                if direct_parent not in comments:
                    comments[direct_parent] = create_object(direct_parent, comment_fields)
                    #point this placeholder comment to the top-level post
                    comments[direct_parent]['link_id_h'] = post_parent
                    comments[direct_parent]['parent_id_h'] = post_parent
                    #add manufactured comment to counters
                    cascades[post_parent]['comment_count_total'] += 1
                    cascades[post_parent]['comment_count_direct'] += 1
                    #and add to replies
                    cascades[post_parent]['replies'].append(direct_parent)
                    #flag this cascade as containing missing comments
                    cascades[post_parent]['missing_comments'] = True
                    missing_comments.add(direct_parent)   #add comment to list of missing
                #add current comment to replies field of parent comment
                comments[direct_parent]['replies'].append(comment_id)
        except:
            print("FAIL")
            print(len(missing_posts), "posts")
            print(len(missing_comments), "comments")
            for field in comments[comment_id]:
                if field != "replies":
                    print(field, comments[comment_id][field])
            exit(0)

    print("\nProcessed", comment_count, "comments in", len(cascades), "cascades")
    print("   ", len(missing_posts), "missing posts")
    print("   ", len(missing_comments), "missing comments")
    print("   ", len([x for x in cascades if cascades[x]['missing_comments']]), "cascades with missing comments")

    #verify the above process, a couple different ways
    #count comments from parent counters across all cascades
    '''
    total_comments = 0
    for post_id, post in cascades.items():
        total_comments += post['comment_count']
    print(total_comments, "from post counters")
    '''
    #traverse each cascade and count comments, check against stored comment count
    '''
    for post_id, post in cascades.items():
        traverse_comments = traverse_cascade(post, comments)
        if traverse_comments != post['comment_count']:
            print("post counter says", post['comment_count'], "comments, but traversal says", traverse_comments)
    '''

    #save cascades for later loading
    cascade_manip.save_cascades(code, cascades)   #cascades
    cascade_manip.save_comments(code, comments)   #comments
    file_utils.save_json(list(missing_posts), "data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
    file_utils.save_json(list(missing_comments), "data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

    return cascades, comments, missing_posts, missing_comments
                if user not in repo_users[repo]:
                    repo_users[repo][user] = time
                user_repos[user].add(repo)   #add repo to user's list

            #user can adopt from repo contents without "seeing" a commit
            else:
                print "adoption without visible commit not currently supported"
                print "exiting"
                sys.exit(0)

    #period prints
    commit_idx = commit_idx + 1
    if commit_idx % 2500 == 0:
        print "   finished", commit_idx, "commits, found", adoption_count, "adoption events"

    #short-circuit for proof of concept: stop when have ~5 adoptions so we can
    #verify the output
    #if adoption_count >= 5:
    #    break

#save all adoption events to json (large file incoming, hope it has everything we need)
utils.save_json(adoption_events, "datafiles/adoption_events_%s.json" % (module_type + "_" + adop_type))
print "results saved to datafiles/adoption_events_%s.json" % (module_type + "_" + adop_type)

#regardless, print number of adoptions found
print adoption_count, "adoption events found in", len(all_lib_commits), "import commits"
def save_json_map_data(query, filepath):
    """run overpass api http query and save response to file as json dict"""
    response = fetch_raw_http_response(query)
    json_data = file_utils.bytes_to_json(response)
    file_utils.save_json(json_data, filepath)
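#The file_utils.bytes_to_json helper used above is not shown. A minimal sketch consistent
#with how it is called here (decode the raw Overpass API response bytes and parse them as
#JSON) - an assumption for illustration, not the project's actual implementation:
import json

def bytes_to_json(raw_bytes):
    #decode the HTTP response body and parse it into a dict
    return json.loads(raw_bytes.decode("utf-8"))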
def save(self):
    """
    Write the results into a file. finalize() should be called before saving.
    """
    save_json(self.state, f"{Directories.json_outputs}/{self.name}.json")
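#The Directories settings object referenced by process_irregular_verbs() and save() is not
#shown. A hypothetical sketch - only the attribute names come from the calls above; the
#path values themselves are assumptions:
class Directories:
    raw_data = "data/raw"                 #downloaded HTML pages
    processed_data = "data/processed"     #parsed word lists
    json_outputs = "data/json_outputs"    #per-run result files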