Example no. 1
def get_first_repos(auth, headers):
    #check if file exists, if yes just read it in
    data = utils.load_json("github_files/github_all_repos.json")
    if data != False:
        return data

    #well, that seems to have worked... time to do them all!
    #except it's not really ALL, just the first 1000 - can't seem to sort them,
    #so we'll go with what Github thinks is most interesting
    url = 'https://api.github.com/search/repositories?q=language:python&per_page=100'
    #first request
    r = requests.get(url, auth=auth, headers=headers)
    all_results = r.json()
    url = r.links['next']['url']
    print r.links['last']['url']
    #loop all requests
    count = 1
    print "finished request", count, "received", len(
        all_results['items']), "items"
    while url != "":
        #perform request and add results to previous
        r = requests.get(url, auth=auth, headers=headers)
        res = r.json()
        all_results['items'].extend(res['items'])
        count = count + 1
        print "finished request", count, "received", len(res['items']), "items"
        #get url for next request
        if 'next' in r.links:
            url = r.links['next']['url']
        else:
            url = ""
        print url
    #save all results to json file
    utils.save_json(all_results, "github_files/github_all_repos.json")
    return all_results
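The utils.load_json / utils.save_json helpers called throughout these examples are not shown; below is a minimal sketch of what they presumably look like, assuming load_json returns False when the file is missing (which is what the "if data != False" checks above rely on).

import json
import os

def load_json(filename):
    #return parsed JSON contents, or False if the file does not exist yet (assumed behavior)
    if not os.path.isfile(filename):
        return False
    with open(filename, "r") as f:
        return json.load(f)

def save_json(data, filename):
    #dump any JSON-serializable object to the given path
    with open(filename, "w") as f:
        json.dump(data, f)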
Example no. 2
    def get_search_tweets(self, keywords, max_tweets, days_ago):
        ''' This is a script that continuously searches for tweets
        that were created over a given number of days. The search
        dates and search phrase can be changed below.

        runtime limit in hours
        number of tweets per search (will be iterated over) - maximum is 100
        search limits e.g., from 7 to 8 gives current weekday from last week,
        min_days_old=0 will search from right now

        this geocode includes nearly all American states (and a large portion of Canada)
        USA = '39.8,-95.583068847656,2500km'
       '''

        # loop over search items,
        # creating a new file for each
        for query in keywords:

            self.search = True

            print('Search phrase =', query)
            fx.create_folder('data/' + query)

            start = dx.get_date(0, date_object=True)
            end = dx.get_date(days_ago, date_object=True)

            range = dx.date_range_day(start, end, 1)

            for day in range:

                tweets_tosave = []

                while self.search is True:

                    if self.initial is True:
                        start = dx.get_date(1)
                        end = day
                        self.max_id = self.get_tweet_id(query, 1, end)
                        since_id = self.get_tweet_id(query, 1, start)
                        self.initial = False
                        if since_id is False:
                            pass

                    tweets = self.get_search_page(query, max_tweets, since_id,
                                                  self.max_id)

                    if tweets:
                        tweets_tosave.extend(tweets)

                        if len(tweets_tosave) >= 1000:
                            fname = dx.format_date_str(
                                tweets_tosave[-1]['created_at'],
                                '%a %b %d %H:%M:%S %z %Y',
                                '%d-%m-%y_%H-%M-%S') + '_' + str(
                                    tweets_tosave[-1]['id'])
                            fx.save_json('data/' + query + '/' + str(fname),
                                         tweets_tosave)
                            tweets_tosave = []
                    else:
                        pass
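A hedged usage sketch for the method above; TweetSearcher is a hypothetical name for the enclosing class, which is not part of the excerpt.

#hypothetical usage: search the last 7 days for each phrase, 100 tweets per page
#(the docstring above notes 100 is the per-search maximum)
searcher = TweetSearcher()          #assumed constructor, not shown in the excerpt
searcher.get_search_tweets(keywords=['python', 'data science'],
                           max_tweets=100,
                           days_ago=7)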
Example no. 3
def process_irregular_verbs():
    """
    Read irregular verbs from a downloaded Wikipedia page, parse it and save.
    """
    doc = pq(filename=f'{Directories.raw_data}/irregular_verbs.html')

    table = doc(".wikitable")
    raw_cells = table("tr td:first")

    words = parse_cells(raw_cells)

    save_json(words, f"{Directories.processed_data}/irregular_verbs.json")
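parse_cells is not shown in this excerpt; a minimal sketch of what it might do, assuming it only pulls the visible text out of each selected <td> cell:

from pyquery import PyQuery as pq

def parse_cells(raw_cells):
    #raw_cells is a PyQuery selection; .items() yields one wrapped element per
    #cell so .text() can be called on each
    return [cell.text().strip() for cell in raw_cells.items()]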
Example no. 4
def top_level_comment_response_dist(code, cascades = False, comments = False, bin_minutes = 1, remove_first = True):
	#load data if missing
	if cascades == False or comments == False:
		cascades, comments, missing_posts, missing_comments = build_cascades(code)

	print("\nComputing top-level comment response time distribution")

	#response time dictionary: time in minutes -> number of responses with that delay
	response_times = defaultdict(int)

	#for each post, look at all top-level replies
	for post_id, post in cascades.items():		#loop posts
		#if this post is a dummy object, throw an error to the user and move on
		if post['placeholder']:
			print("Data contains placeholder post. Please use remove_missing to filter out incomplete cascades first.")
			exit(0)

		post_time = post['created_utc']		#grab post time to compute reply delay

		for comment_id in post['replies']:		#loop replies
			#get response time in minutes for this comment
			response_time = int((comments[comment_id]['created_utc'] - post_time) / (bin_minutes * 60.0)) * bin_minutes

			#if response time is somehow negative, throw an error message but keep running
			if response_time < 0:
				print("Warning: negative response time!")
			#add one to counter for this response time (binned by minutes)
			response_times[response_time] += 1

	#throw out first minute (bots)
	if remove_first == True:
		response_times.pop(0, None)

	#convert frequencies to probability distribution function
	total = sum(response_times.values())
	for key in response_times.keys():
		response_times[key] /= total

	#save response time distribution, but only if bin_minutes = 1
	if bin_minutes == 1:
		print("Saving top-level comment response time distribution to results/%s_top_level_comment_response_time_dist_%s_<options>.json" % (code, bin_minutes))
		file_utils.verify_dir("results")
		file_utils.save_json(response_times, "results/%s_top_level_comment_response_time_dist_%s.json" % (code, bin_minutes))

	#plot everything
	print("Plotting top-level comment response time distribution to plots/%s_top_level_comment_response_times_%s.png" % (code, bin_minutes))
	file_utils.verify_dir("plots")
	plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies", "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes, filename = "plots/%s_top_level_comment_response_times_%s_log.png" % (code, bin_minutes), x_min = 0, log_scale_x = True, log_scale_y = True)
	plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies", "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes, filename = "plots/%s_top_level_comment_response_times_%s_zoom_log.png" % (code, bin_minutes), x_min = 0, x_max = 60*24, log_scale_x = True, log_scale_y = True)
	plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies", "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes, filename = "plots/%s_top_level_comment_response_times_%s_zoom.png" % (code, bin_minutes), x_min = 0, x_max = 60*24)
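The binning arithmetic above floors each reply delay to the nearest bin_minutes boundary. For example, with bin_minutes = 5 a reply arriving 730 seconds after the post lands in the 10-minute bin:

bin_minutes = 5
delay_seconds = 730     #reply created 12 minutes 10 seconds after the post
response_time = int(delay_seconds / (bin_minutes * 60.0)) * bin_minutes
print(response_time)    #-> 10, i.e. the 10-14 minute bin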
Example no. 5
def save_sim_json(group, sim_post_id, random_post, time_observed,
                  min_node_quality, max_nodes, estimate_initial_params,
                  sim_events, outfile):
    #save sim results to output file - json with events and run settings
    print("Saving results to", outfile + ".json...")
    #write to json, include some run info
    output = {
        'group': group,
        'post_id': sim_post_id,
        'post_randomly_selected': random_post,
        'time_observed': time_observed,
        'min_node_quality': min_node_quality,
        'max_graph_size': max_nodes,
        'estimate_initial_params': estimate_initial_params,
        'data': sim_events
    }
    file_utils.save_json(output, outfile + ".json")
Example no. 6
def get_subreddits(code, cascades = False, display = False):
	#no cascades, load them first
	if cascades == False:
		cascades, comments, missing_posts, missing_comments = build_cascades(code)

	#get distribution
	subreddit_dist = data_utils.dictionary_field_dist(cascades, 'subreddit')

	#print distribution if desired
	if display:
		for key, value in subreddit_dist.items():
			print(key, value)

	#save distribution to json file
	print("Saving subreddit distribution to results/%s_post_subreddit_dist.json" % code)
	file_utils.verify_dir("results")
	file_utils.save_json(subreddit_dist, "results/%s_post_subreddit_dist.json" % code)

	return subreddit_dist
Example no. 7
#glob filestring to get all results files
filestring = "dryrun/submit/sim_res_*.json"

#get list of all matching files
files = glob.glob(filestring)

prefix = "t3_"		#prefix to prepend to all id references

#process each file individually, correcting ids along the way
for file in files:
	print("\nCorrecting", file)

	#load the json
	data = file_utils.load_json(file)
	print("  ", len(data['data']), "events to fix")

	#correct comment/post records, where each is a dictionary of the following form:
	#{"parentID": "A4XW5Jol_qVgUAKDWeOeaw", "communityID": "t5_3i6d8", "rootID": "A4XW5Jol_qVgUAKDWeOeaw", "nodeUserID": "okcc60doiWAfkR89nAAvHQ", "nodeTime": "1501876531", "nodeID": "A4XW5Jol_qVgUAKDWeOeaw", "actionType": "post"}
	for event in data['data']:

		#fix id fields
		event['parentID'] = prefix + event['parentID']
		event['rootID'] = prefix + event['rootID']
		event['nodeID'] = prefix + event['nodeID']

	#save the updated file overtop of the old one
	file_utils.save_json(data, file)
	print("Corrected file saved to", file)

Example no. 8
def create_map(data_path, kw_path, conf_thresh, silence_thresh, n_jobs=8, dir_map={}, 
               outpath="complete_map.json", overwrite=True, save_interval=5):
    yamnet = Yamnet()
    keywords = file_utils.load_keywords(kw_path)
    kw_filt = extract_labels.FilterStems(keywords, conf_thresh)

    existing_keys = dir_map.keys()
    analyzed = 0

    for root, dirs, files in os.walk(data_path):
        if len(dirs) > 0:
            session_name = get_session_name(dirs)

        write_entry = True

        if session_name is not None and overwrite == False and session_name in existing_keys:
            write_entry = False

        if session_name is not None and len(files) > 0 and write_entry:
            dir_map[session_name] = {}
            # verify the files are valid ones
            valid_files = [os.path.abspath(os.path.join(root, f)) for f in files if is_valid_file(f)]

            print(f"{n_jobs} jobs extracting {len(valid_files)} clips from {session_name}")
            extracted_clips = joblib.Parallel(n_jobs=n_jobs, backend="threading")(joblib.delayed(extract_clips)(f, silence_thresh, 2048, 1024, 4096) for f in valid_files)

            print(f"calculating features for {len(valid_files)} tracks in {session_name}")

            for i, (clips, intervals, num_samps) in enumerate(extracted_clips):
                full_path = valid_files[i]

                # REMOVE SILENCE BEFORE YAMNET PROCESSING
                # clips, intervals, num_samps = extract_clips(full_path,
                #                                             silence_thresh,
                #                                             ws=2048, 
                #                                             hop=1024, 
                #                                             min_len=4096)

                audioset_classes = []
                corrected_intervals = []
                corrected_num_samps = 0

                # consider chopping these up into smaller bits for input to yamnet for 
                # more accurate classification
                for j, (clip, interval) in enumerate(zip(clips, intervals)):

                    subframes = get_frames(clip, 16384, 16384) # split each clip into sub-frames
                    print(f'processing {len(subframes)} sub frames')
                    
                    for i, sf in enumerate(subframes):
                        
                        sf_index_start = i * 16384
                        sf_index_end = (i+1) * 16384

                        sf_interval = [interval[0] + sf_index_start, interval[1] + sf_index_end]

                        classes = yamnet.predict_classes(waveform=clip[sf_index_start:sf_index_end], 
                                                        sr=16000,
                                                        num_top=5)

                        # json serializer is very picky, so all these seemingly pointless
                        # casts are required...
                        # audioset_classes.append(list(classes.astype(np.int16)))
                        audioset_classes.append(classes.tolist())
                        # because we're making a prediction based on a DOWNSAMPLED version of the 
                        # track, we need to convert our sample indices back up to 44.1kHz
                        # for an accurate location
                        sf_interval = (np.array(sf_interval) / 16000) * 44100
                        corrected_intervals.append([int(sf_interval[0]), int(sf_interval[1])])
                        corrected_num_samps += int(sf_interval[1]) - int(sf_interval[0])
                
                track = os.path.splitext(os.path.basename(full_path))[0] # remove full path and ext .wav

                print(f'{track} - {corrected_num_samps} samples matching audioset classes')

                dir_map[session_name][track] = {}
                dir_map[session_name][track]["path"] = str(full_path)
                dir_map[session_name][track]["keywords"] = list(kw_filt.filter(track))
                dir_map[session_name][track]["numsamps"] = int(num_samps)
                dir_map[session_name][track]["intervals"] = list(corrected_intervals)
                dir_map[session_name][track]["audioset"] = audioset_classes

            analyzed += 1

            if analyzed % save_interval == save_interval - 1:
                print("save interval reached, saving map...")
                file_utils.save_json(outpath, dir_map, indent=2)

    return dir_map
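The sample-index rescaling in the inner loop can be checked in isolation. Assuming a sub-frame spanning samples 16384-32768 of the 16 kHz downsampled signal, the corresponding interval at the original 44.1 kHz rate is roughly 45158-90316:

import numpy as np

sf_interval = np.array([16384, 32768])        #indices in the 16 kHz signal
sf_interval = (sf_interval / 16000) * 44100   #rescale to the 44.1 kHz original
print([int(sf_interval[0]), int(sf_interval[1])])   #-> [45158, 90316]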
Example no. 9
def match_sites_dataframe(dataframe, matches_json="", top_n=5):
    '''
    Generates a dataframe of matched sites.
    matches_json is an optional parameter for saving and loading slow to generate
    description based matches.
    INPUTS:
     - dataframe
     - matches_json -- A string representing the filename of a json file containing old matches to speed up processing
     - top_n (int) -- Maximum amount of matches to return for each item
    OUTPUTS:
     - matches_df
    '''

    #Missing values should be represented by empty strings
    dataframe = dataframe.fillna(value="")

    #Ensure we have the correct columns
    dataframe = pandas.DataFrame(dataframe.to_dict("records"),
                                 columns=ALL_FIELDNAMES)

    #Fill any columns we just added with "-1" to mark it wasn't originally there
    dataframe = dataframe.fillna(value="-1")

    #Make sure everything in that dataframe is a string
    dataframe = dataframe.applymap(lambda x: str(x))

    #Remove extra whitespace
    dataframe = dataframe.applymap(lambda x: x.strip()
                                   if type(x) == str else x)

    if "Match Site" in dataframe.columns:
        ndf = dataframe[dataframe["Match Site"] == "-1"]
        if ndf.empty:
            #No new rows.
            return pandas.DataFrame()
        odf = dataframe[dataframe["Match Site"] != "-1"]
        if odf.empty:
            old_rows = []
        else:
            old_rows = odf.to_dict("records")
        new_rows = ndf.to_dict("records")
    else:
        new_rows = dataframe.to_dict("records")
        old_rows = []
    # Add a 'Description' field to new_rows
    site_rows = [{
        **row, "Description": row["Stock Description"]
    } for row in new_rows]
    old_site_rows = remove_duplicate_rows(old_rows)
    old_item_ids_to_rows = generate_item_ids_to_rows(old_rows)

    # Generate desc_matches based on matches_json
    desc_matches = {}
    if matches_json:
        if file_utils.file_exists(matches_json):
            desc_matches = file_utils.read_json(matches_json)
        else:
            desc_matches = match_by_description(site_rows, old_site_rows)
            file_utils.save_json(matches_json, desc_matches)

    matches_rows = match_sites(site_rows,
                               old_site_rows,
                               old_item_ids_to_rows,
                               desc_matches,
                               top_n=top_n)
    matches_df = pandas.DataFrame(matches_rows, columns=OUTPUT_FIELDNAMES)
    matches_df = matches_df.fillna(value="")
    matches_df = matches_df[OUTPUT_FIELDNAMES]
    return matches_df
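A hedged usage sketch for match_sites_dataframe; the two columns below are illustrative only, since the real ALL_FIELDNAMES / OUTPUT_FIELDNAMES constants are not included in the excerpt.

import pandas

df = pandas.DataFrame([
    {"Stock Description": "WIDGET 10MM", "Match Site": "-1"},    #new row to match
    {"Stock Description": "WIDGET 12MM", "Match Site": "siteA"}, #already matched
])
#cache the slow description-based matches in a json file for later runs
matches_df = match_sites_dataframe(df, matches_json="desc_matches.json", top_n=5)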
Example no. 10

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--path",
                        type=str,
                        default="./multitracks/",
                        help="path to downloaded mutlitracks")
    parser.add_argument("-kw",
                        type=str,
                        default="keywords.txt",
                        help="keywords txt file that specifies search terms")
    parser.add_argument("--c_thresh",
                        type=int,
                        default=80,
                        help="confidence threshold for fuzzy string matching")
    parser.add_argument("-o",
                        type=str,
                        default="./dataset_map.json",
                        help="output file for dataset map")
    args = parser.parse_args()

    # kw_filter = FilterStems(file_utils.load_keywords(args.kw), args.c_thresh)
    print("filtering mutlitrack stems for labels")
    dir_map = create_label_map(args.path, args.kw, args.c_thresh)

    file_utils.save_json(args.o, dir_map)

    print(f"directory map saved to {args.o}")
Example no. 11
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str, default="./multitracks/", 
        help="path to downloaded mutlitracks")
    parser.add_argument("--out", type=str, default="./complete_map.json", 
        help="output path of verified map")
    parser.add_argument("--map", type=str, default=None, 
        help="path do existing dataset map which, if specified, will be modified in place")
    parser.add_argument("--overwrite", type=bool, default=True, 
        help="overwrite existing entries if existing dataset map is loaded")
    parser.add_argument("--save_interval", type=int, default=5, 
        help="how often to save the dataset map while processing tracks")
    parser.add_argument("-kw", type=str, default="keywords.txt",
        help="keywords txt file that specifies search terms")
    parser.add_argument("--c_thresh", type=int, default=80,
        help="confidence threshold for fuzzy string matching")
    parser.add_argument("--thresh_db", type=int, default=45, 
        help="threshold in db to reject silence")
    parser.add_argument("--n_jobs", type=int, default=8, 
        help="num parallel worker threads to load & process audio files")

    args = parser.parse_args()

    dataset_map = {} # initialize empty dict
    if args.map is not None: # if existing map is provided, it will be modified in place (overwriting existing entries)
        dataset_map = file_utils.load_json(args.map)

    dataset_map = create_map(args.path, args.kw, args.c_thresh, args.thresh_db, args.n_jobs, dataset_map, 
                            args.out, args.overwrite, args.save_interval)

    file_utils.save_json(args.out, dataset_map, indent=2)
Example no. 12
def get_repos(all_contrib, all_repos, user_to_repo, repo_to_contrib, auth,
              headers):
    #since this will take more than 5000 requests, keep a bookmark in a file - in case something goes wrong

    #read simple list of users that are done already, will update
    finished_users = utils.load_json("github_files/github_finished_users.json")
    if finished_users == False:
        finished_users = list()

    #if all the users we have so far are already finished, return input as results instead of loop-checking
    if len(finished_users) == len(all_contrib):
        return all_contrib, all_repos, user_to_repo, repo_to_contrib

    #also keep list of users that don't search properly (private repos?)
    bad_users = utils.load_json("github_files/bad_users.json")
    if bad_users == False:
        bad_users = list()

    #count users done
    user_count = 0

    #loop all users (should be all contributors of the 1000 initial repos), fetch their python repos
    for user in all_contrib:
        #check if we did this user already, if so skip
        if user['id'] in finished_users:
            continue

        #build request url for this user
        url = "https://api.github.com/search/repositories?q=language:python+user:%s&per_page=100" % (
            user['login'])

        #do request, including any pages
        while url != "":
            #sleep for ~2 seconds before next request, to prevent getting kicked off
            #(search requests limited to 30 per minute)
            time.sleep(2)

            #get the json!
            r = requests.get(url, auth=auth, headers=headers)
            res = r.json()

            #handle bad results and try to continue
            if 'items' not in res:
                #rate limit? wait for 60 seconds and try the same url again
                if 'documentation_url' in res and res[
                        'documentation_url'] == "https://developer.github.com/v3/#rate-limiting":
                    print "rate limit wait"
                    time.sleep(60)
                    continue
                #server error 502 - try the request again
                elif "message" in res and res['message'] == "Server Error":
                    continue
                #bad results for this particular user, they might be private now - skip and move to next
                else:
                    #print res
                    bad_users.append(user)
                    break

            #good results, parse and store
            for repo in res['items']:
                #new repo, add to list of all
                if repo['id'] not in repo_to_contrib:
                    all_repos['items'].append(repo)
                #always add to correlative structures
                if user['id'] not in user_to_repo:
                    user_to_repo[user['id']] = list()
                if repo['id'] not in user_to_repo[user['id']]:
                    user_to_repo[user['id']].append(repo['id'])
                if repo['id'] in repo_to_contrib and user[
                        'id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(user['id'])
                elif repo['id'] not in repo_to_contrib:
                    repo_to_contrib[repo['id']] = list()

            #see if more pages, if so fetch them
            if 'next' in r.links:
                url = r.links['next']['url']
            else:  #no more pages, quit for this user
                url = ""

        #finished user, add to bookmark
        finished_users.append(user['id'])
        user_count += 1

        #intermediate saves and prints
        if user_count % 100 == 0:
            #save all repos to json file
            utils.save_json(all_repos, "github_files/github_all_repos.json")
            #save correlative lists
            utils.save_json(user_to_repo,
                            "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib,
                            "github_files/github_repo_to_contrib.json")
            print "Saved repos of", user_count, "users"
            #save bad users list
            utils.save_json(bad_users, "github_files/github_bad_users.json")
            #save list of finished users
            utils.save_json(finished_users,
                            "github_files/github_finished_users.json")
    #end for users

    #final save before return
    #save all repos to json file
    utils.save_json(all_repos, "github_files/github_all_repos.json")
    #save correlative lists
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib,
                    "github_files/github_repo_to_contrib.json")
    #save bad users list
    utils.save_json(bad_users, "github_files/github_bad_users.json")
    #save list of finished users
    utils.save_json(finished_users, "github_files/github_finished_users.json")
    print "Saved all data to files"

    return all_contrib, all_repos, user_to_repo, repo_to_contrib  #return all results
Example no. 13
	#read in all commits
	commits = utils.load_json("data_files/all_commits_%s_small.json" % module_type)
	if commits == False:
		print "need compiled commits file data_files/all_commits_%s.json, exiting" % module_type
		exit(0)

	#process each commit
	for commit in commits:
		user_all_commit_counts[commit[user]] += 1
		repo_all_commit_counts[commit[repo]] += 1
			
	print "COMPLETE"

	#save results	
	utils.save_json(user_all_commit_counts, "data_files/user_all_commit_counts.json")
	utils.save_json(repo_all_commit_counts, "data_files/repo_all_commit_counts.json")
	print "final commit_counts saved to data_files/user_all_commit_counts.json and data_files/repo_all_commit_counts.json"
else:
	print "commit counts already computed, plotting frequencies"
	
	#user commit plots
	user_all_commit_freq, min_user_commit, max_user_commit = plot_utils.count_freq(user_all_commit_counts)
	plot_utils.plot_freq(user_all_commit_freq, "user commit count", "freq", filename = "plots/user_all_commit_freq.jpg")
	print "user all commit counts: min =", min_user_commit, ", max =", max_user_commit
	
	#repo commit plots
	repo_all_commit_freq, min_repo_commit, max_repo_commit = plot_utils.count_freq(repo_all_commit_counts)
	plot_utils.plot_freq(repo_all_commit_freq, "repo commit count", "freq", filename = "plots/repo_all_commit_freq.jpg")
	print "repo all commit counts: min =", min_repo_commit, ", max =", max_repo_commit
	
Example no. 14
                #clear list and counts
                imports = defaultdict(list)
                imports_count = 0

            #grab new commit data, replace the old
            user, time = get_user_and_time(line, email_to_id, name_to_id)
            if user == False:
                continue
        #diff line
        else:
            lib = parse_import(line)
            if not lib:  #empty imports, skip
                continue
            imports_count = imports_count + 1
            if line.startswith("+"):
                imports['+'].extend(lib)
            else:
                imports['-'].extend(lib)

    #finished file, save any lingering data
    commits_list.append([user, time, imports])  #save commit

    #save file commit data to json
    utils.save_json(commits_list, "parsed_commit_data/%s.log" % filename[:-12])

    #period prints
    file_idx = file_idx + 1
    if file_idx % 500 == 0:
        print "finished", file_idx, "files"
        #break
Example no. 15
            'children_ids': list(),
            'time': (comment['nodeTime'] - post_time) / 60,
            'parent': comment['parentID']
        }
        for comment in comments
    }

    #add all comments to correct child list
    for comment in comments:
        if comment['parentID'] == post_id:
            output[post_id]['root']['children'].append(
                comment_tree[comment['nodeID']])
        else:
            comment_tree[comment['parentID']]['children_ids'].append(
                comment['nodeID'])
            comment_tree[comment['parentID']]['children'].append(
                comment_tree[comment['nodeID']])

    #remove unnecessary fields: children_ids, parent
    for comment_id, comment_dict in comment_tree.items():
        comment_dict.pop('children_ids', None)
        comment_dict.pop('parent', None)

    post_count += 1
    if post_count % 100 == 0:
        print("Finished", post_count, "posts")

#save results
print("Saving results")
file_utils.save_json(output, outfile)
Example no. 16
                name_count[name] += 1
                email_count[email] += 1

        #periodic progress prints
        file_idx = file_idx + 1
        if file_idx % 1000 == 0:
            print "finished", file_idx, "files"

    print "COMPLETE"

#have lists, convert back to dictionary for sorting
else:
    name_count = dict(name_count)
    email_count = dict(email_count)

#remove all names/emails that only occur once
name_count = dict((k, v) for k, v in name_count.iteritems() if v > 1)
email_count = dict((k, v) for k, v in email_count.iteritems() if v > 1)

#sort
name_count = sorted(name_count.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
email_count = sorted(email_count.items(),
                     key=operator.itemgetter(1),
                     reverse=True)

#save results
utils.save_json(name_count, "data_files/author_name_freq.json")
utils.save_json(email_count, "data_files/author_email_freq.json")
Example no. 17
						lib = package
					#not counting with submodules, get parent package only
					else:
						lib = strip_lib(package)
					
					#add package to libs list for this commit if not relative path and not already in list
					if lib[0] != '.' and lib not in new_commit["del_libs"]:
						new_commit["del_libs"].append(lib)

					
			#add commit to list of all (may have empty import list)
			all_commits.append(new_commit)

		#period prints
		file_idx = file_idx + 1
		if file_idx % 1000 == 0:
			print "finished", file_idx, "repo files"
			
	#save all commits to json (large file incoming)
	utils.save_json(all_commits, "data_files/all_commits_%s.json" % module_type)

	print "results saved to data_files/all_commits_%s.json" % module_type
	print "   ", missing_time_count, "commits without timestamp (not included in compiled list)"
	
else:
	"read in all commits from data_files/all_commits_%s.json" % module_type

#regardless, print number of total commits
print len(all_commits), "commits total"

Example no. 18
import file_utils
import glob

#list of files to combine
files = glob.glob('output/*.json')
print(len(files), "files:", files)

count = 0
all_data = None
event_count = 0

for file in files:
    file_data = file_utils.load_json(file)

    event_count += len(file_data['data'])

    if count == 0:
        all_data = file_data
    else:
        all_data['data'].extend(file_data['data'])

    count += 1

print("Sum event count:", event_count)
print("Combined event count:", len(all_data['data']))

#save all data
file_utils.save_json(all_data, "output/all_cyber_sim_res.json")
Example no. 19
def get_contrib(all_repos,
                auth,
                headers,
                all_contrib=False,
                user_to_repo=False,
                repo_to_contrib=False,
                new=False):
    #load existing data from files if not passed in and files exist
    if all_contrib == False:  #existing contributors
        all_contrib = utils.load_json("github_files/github_all_contrib.json")
    if user_to_repo == False:  #user->repos dict
        user_to_repo = dict_key_to_int(
            utils.load_json("github_files/github_user_to_repo.json"))
    if repo_to_contrib == False:  #repo->contribs dict
        repo_to_contrib = dict_key_to_int(
            utils.load_json("github_files/github_repo_to_contrib.json"))

    if new == False:
        return all_contrib, user_to_repo, repo_to_contrib

    #if no contributors list or correlative dictionaries, initialize empty containers
    if all_contrib == False or user_to_repo == False or repo_to_contrib == False:
        user_to_repo = defaultdict(list)  #user id to list of repo ids
        repo_to_contrib = defaultdict(list)  #repo id to list of contrib
        all_contrib = list()

    #keep a bookmark of finished repos
    finished_repos = utils.load_json("github_files/github_finished_repos.json")
    if finished_repos == False:
        finished_repos = list()
    else:
        print "read in", len(finished_repos), "finished repos"

    #check the rate limit before we start by making a dummy request, sleep if we need to
    url = 'https://api.github.com/repos/vinta/awesome-python/contributors'  #any url will do
    r = requests.get(url, auth=auth, headers=headers)
    check_rate_limit(r)

    #loop all repos from list, fetch contributors if don't have them
    repo_count = 0
    for repo in all_repos['items']:
        #check if have contributors for this repo already, skip if yes
        if repo['id'] in finished_repos:
            continue

        #need to fetch contributors for this repo
        #print "Fetching repo", repo['id']
        contrib_count = 0
        #get request url for this repo
        url = repo['contributors_url']
        while url != "":
            #get the json!
            r = requests.get(url, auth=auth, headers=headers)
            #check for 204 response code - seems to indicate empty repo, and fails on json parse
            if r.status_code == 204:
                break
            #parse request response to json
            res = r.json()
            contrib_count = contrib_count + len(res)

            #repo not found (probably made private), skip and move to next
            if type(res) is not list and "message" in res and res[
                    'message'] == "Not Found":
                break
            #server error 502 - try the request again
            elif type(res) is not list and "message" in res and res[
                    'message'] == "Server Error":
                continue
            #other fail? dump some output and quit so we can fix it
            elif type(res) is not list:
                print r
                print res
                print url
                exit(0)

            #parse out this request result
            for usr in res:
                #new usr, add to list of all
                if usr['id'] not in user_to_repo:
                    all_contrib.append(usr)
                #always add to correlative structures
                if usr['id'] in user_to_repo and repo[
                        'id'] not in user_to_repo[usr['id']]:
                    user_to_repo[usr['id']].append(repo['id'])
                elif usr['id'] not in user_to_repo:
                    user_to_repo[usr['id']] = list()
                if usr['id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(usr['id'])
            #see if more pages, fetch if yes
            if 'next' in r.links:
                url = r.links['next']['url']
            else:  #no new pages, done
                url = ""

            #check the rate limit, sleep if we need to
            check_rate_limit(r)

            #sleep for ~0.5 seconds to space out the requests better
            time.sleep(0.5)

        #print "Repo", repo['id'], ":", contrib_count, "contributors"
        repo_count += 1
        finished_repos.append(repo['id'])

        #intermediate saves... just in case
        if repo_count % 100 == 0:
            #save all contrib to json file
            utils.save_json(all_contrib,
                            "github_files/github_all_contrib.json")
            #save correlative lists
            utils.save_json(user_to_repo,
                            "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib,
                            "github_files/github_repo_to_contrib.json")
            #save bookmark
            utils.save_json(finished_repos,
                            "github_files/github_finished_repos.json")
            print "saved contributors of", repo_count, "repos"

    #all done - save results
    #save all contrib to json file
    utils.save_json(all_contrib, "github_files/github_all_contrib.json")
    #save correlative dictionaries
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib,
                    "github_files/github_repo_to_contrib.json")
    #final bookmark
    utils.save_json(finished_repos, "github_files/github_finished_repos.json")

    #return results
    return all_contrib, user_to_repo, repo_to_contrib
Example no. 20
                    user_id = name_to_id[name]
                else:
                    user_id = email_to_id[email]

            #diff line
            else:
                imports = True

        file_idx = file_idx + 1
        if file_idx % 1000 == 0:
            print "finished", file_idx, "files"

    print "COMPLETE"

    #save results
    utils.save_json(user_commit_counts, "user_commit_counts.json")
    utils.save_json(file_commit_counts, "file_commit_counts.json")
    print "final user list saved to user_commit_counts.json and file_commit_counts.json"
else:
    print "counts already computed, plotting frequencies"

    #user commit plots
    user_commit_freq, min_user_commit, max_user_commit = plot_utils.count_freq(
        user_commit_counts)
    plot_utils.plot_freq(user_commit_freq,
                         "user commit count",
                         "freq",
                         filename="user_commit_freq.jpg")
    print "user commit counts: min =", min_user_commit, ", max =", max_user_commit

    #repo commit plots
Example no. 21
    lib_idx = lib_idx + 1
    if lib_idx % 1000 == 0:
        print "   finished", lib_idx, "libraries"

#finished, print the counts
print "processed", event_idx, "adoption events across", lib_idx, "libraries"
print "   ", multi_source, "of these events have multiple sources, max number of sources is", max_source
print intra_repo, "adoption source-target pairs within same repo"
print cross_repo, "adoption source-target pairs across different repos"
print zero_count, "adoption source-target pairs with time delay of 0"

#save lib adoption counts sorted most to least
lib_adop_counts_sorted = OrderedDict(
    sorted(lib_adop_counts.items(), key=itemgetter(1), reverse=True))
utils.save_json(
    lib_adop_counts_sorted, "datafiles/lib_adop_counts_sorted_%s.json" %
    (module_type + "_" + adop_type))

#plots use usage counts, adoption counts, and average delta t:

use_counts = []
adop_counts = []
avg_delta = []
for lib in usage_counts:
    if lib in lib_adop_counts:
        use_counts.append(usage_counts[lib])
        adop_counts.append(lib_adop_counts[lib])
        avg_delta.append(lib_delta[lib])
#total # of usages for library on x, total # of adoptions for lib on y
plot_utils.plot_data(use_counts,
                     adop_counts,
Example no. 22
        else:
            import_repo_counts[k] = len(import_repos[k])
            import_user_counts[k] = len(import_users[k])
            import_repos_list[k] = list(import_repos[k])
            import_users_list[k] = list(import_users[k])

    #sort the counts (painful conversion, hopefully not too slow)
    import_counts_overall = OrderedDict(
        sorted(import_counts_overall.items(), key=itemgetter(1), reverse=True))
    import_repo_counts = OrderedDict(
        sorted(import_repo_counts.items(), key=itemgetter(1), reverse=True))
    import_user_counts = OrderedDict(
        sorted(import_user_counts.items(), key=itemgetter(1), reverse=True))

    #save counts to json
    utils.save_json(import_counts_overall,
                    "import_counts_overall_%s.json" % count_type)
    utils.save_json(import_repo_counts,
                    "import_repo_counts_%s.json" % count_type)
    utils.save_json(import_user_counts,
                    "import_user_counts_%s.json" % count_type)

    #save the lists too (why not)
    utils.save_json(import_repos_list,
                    "import_repos_lists_%s.json" % count_type)
    utils.save_json(import_users_list,
                    "import_users_lists_%s.json" % count_type)

    print "results saved to import_??_counts.json (3 files) and import_??_lists.json (2 files)"
else:
    "read in counts, plotting distributions"
  parser.add_argument("-kw", type=str, default="vox",
      help="keyword filter, only one allowed. See keywords.txt for full list")
  parser.add_argument("--approve", type=str, nargs="+", default="Speech",
      help="AudioSet labels to match in source content. Multiple string values allowed (case sensitive)")
  parser.add_argument("--reject", type=str, default="Silence",
      help="AudioSet labels to reject. Multiple string values allowed (case sensitive)")
  parser.add_argument("--map", type=str, default="dataset_map.json", 
      help="path do json dataset map")
  parser.add_argument("--thresh", type=int, default=45, 
      help="threshold in db to reject silence")

  args = parser.parse_args()

  # with open(args.map) as json_file:
  #     dataset_map = json.load(json_file)
  dataset_map = file_utils.load_json(args.map)

  yam_approve = list(args.approve)
  yam_reject = list(args.reject)

  if os.path.exists(args.out):
    print(f'{args.out} already exists, modifying...')
    verified_classmap = file_utils.load_json(args.out)
  else:
    verified_classmap = {}

  
  verified = verify_classes_yamnet(verified_classmap, dataset_map, args.thresh, args.kw, yam_approve, yam_reject)

  file_utils.save_json(args.out, verified)
  print(f'saved verified map to {args.out}')
Example no. 24
    file_utils.save_pickle(active_users, users_filepath % code)
    print("Saved", len(active_users), "active users to", users_filepath % code)

#fit params to all of the cascades, if no file
#no need to load if we have them, won't use them again
if file_utils.verify_file(params_filepath % code):
    print("Params exist in", params_filepath % code)
else:
    #fit params to all cascades
    all_params, fit_fail_list = cascade_analysis.fit_all_cascades(
        code, cascades, comments, True)

    #save list of failed fits (if exists)
    if len(fit_fail_list) != 0 and file_utils.verify_file(
            "model_files/params/%s_failed_param_fit.txt" % code) == False:
        file_utils.save_json(
            fit_fail_list, "model_files/params/%s_failed_param_fit.txt" % code)
        print(
            "Saved list of fit-fail stories to model_files/params/%s_failed_param_fit.txt"
            % code)

    #save to text file now
    with open(params_filepath % code, "w") as f:
        for post_id, params in all_params.items():
            f.write(str(posts[post_id]['id']) + " ")  #write numeric post id
            for i in range(len(params)):
                f.write((' ' if i > 0 else '') + str(params[i]))
            f.write("\n")
    print("Saved text-readable params to", params_filepath % code)

#don't build graph, would be way too big
'''
Example no. 25
def build_cascades(code, posts = False, comments = False):
	#if cascades already exist, read from cache
	if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) and (os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)) or os.path.exists("data_cache/%s_cascades/%s_cascade_comments_1.pkl" % (code, code))):
		#load from pickle
		print("Loading cascades from data_cache")
		cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
		#comments: either a single file, or multiple files
		print("Loading comments from data_cache")
		if os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)):
			comments = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
		else:			
			comments = {}
			files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
			for file in files:
				print("Loading", file)
				new_comments = file_utils.load_pickle(file)
				comments.update(new_comments)
		missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
		missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))
		print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
		print("     ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")
		return cascades, comments, missing_posts, missing_comments

	#if no cached cascades, build them from scratch

	#if no loaded posts/comments, load those up first
	if posts == False or comments == False:
		posts, comments = load_model_data.load_reddit_data(code)

	print("Extracting post/comment structure for", len(posts), "posts and", len(comments), "comments")

	#add replies field to all posts/comments, init to empty list
	data_utils.add_field(posts, "replies", [])
	data_utils.add_field(comments, "replies", [])
	#add placeholder field to all posts/comments, flag indicates if we created a dummy object
	data_utils.add_field(posts, 'placeholder', False)
	data_utils.add_field(comments, 'placeholder', False)

	#add comment_count field to all post objects as well: count total number of comments all the way down the cascade
	data_utils.add_field(posts, "comment_count_total", 0)
	#and direct replies only
	data_utils.add_field(posts, "comment_count_direct", 0)
	#and add a missing_comments field to all post objects: set True if we find any missing comments in this cascade
	data_utils.add_field(posts, "missing_comments", False)

	#grab list of fields for each type of object (used to create placeholders when items are missing)
	post_fields = list(posts[0].keys())
	comment_fields = list(comments[0].keys())

	'''
	id_h = post/comment id
	parent_id_h = direct parent
	link_id_h = post parent
	if a parent_id starts with t1_, you remove t1_ and match the rest against a comment id 
	if it starts with t3_, you remove t3_ and match the rest against a submission id.
	link_id always starts with t3_, since it always points to a submission.
	'''

	#create dictionary of post id -> post object to store cascades
	cascades = data_utils.list_to_dict(posts, "id_h")

	#convert list of comments to dictionary, where key is comment id
	comments = data_utils.list_to_dict(comments, "id_h")

	#now that we can find posts and comments at will, let's build the dictionary!
	
	#loop all comments, assign to immediate parent and increment comment_count of post parent
	comment_count = 0
	missing_comments = set()	#missing comments
	missing_posts = set()		#missing posts
	for comment_id in list(comments.keys()):

		#get immediate parent (post or comment)
		direct_parent = comments[comment_id]['parent_id_h'][3:]
		direct_parent_type = "post" if comments[comment_id]['parent_id_h'][:2] == "t3" else "comment"
		#get post parent
		post_parent = comments[comment_id]['link_id_h'][3:]
		comment_count += 1

		#add this comment to replies list of immediate parent, and update counters on post_parent
		try:
			#if post parent missing, create placeholder
			if post_parent not in cascades:
				cascades[post_parent] = create_object(post_parent, post_fields)
				missing_posts.add(post_parent)

			#update overall post comment count for this new comment
			cascades[post_parent]['comment_count_total'] += 1

			#now handle direct parent, post or comment
			#parent is post
			if direct_parent_type == "post":
				#missing post, create placeholder to hold replies
				if direct_parent not in cascades:
					cascades[direct_parent] = create_object(direct_parent, post_fields)
					missing_posts.add(direct_parent)
				#add this comment to replies field of post (no total comment increment, done above)
				cascades[direct_parent]['replies'].append(comment_id)
				#add 1 to direct comment count field
				cascades[direct_parent]['comment_count_direct'] += 1

			#parent is comment
			else:	
				#missing comment, create placeholder to contain replies, point to parent post by default
				if direct_parent not in comments:
					comments[direct_parent] = create_object(direct_parent, comment_fields)
					#point this placeholder comment to the top-level post
					comments[direct_parent]['link_id_h'] = post_parent
					comments[direct_parent]['parent_id_h'] = post_parent
					#add manufactured comment to counters
					cascades[post_parent]['comment_count_total'] += 1
					cascades[post_parent]['comment_count_direct'] += 1	
					#and add to replies	
					cascades[post_parent]['replies'].append(direct_parent)	
					#flag this cascade as containing missing comments
					cascades[post_parent]['missing_comments'] = True	
					missing_comments.add(direct_parent)		#add comment to list of missing
				#add current comment to replies field of parent comment
				comments[direct_parent]['replies'].append(comment_id)
		except:
			print("FAIL")
			print(len(missing_posts), "posts")
			print(len(missing_comments), "comments")
			for field in comments[comment_id]:
				if field != "replies":
					print(field, comments[comment_id][field])
			exit(0)

	print("\nProcessed", comment_count,  "comments in", len(cascades), "cascades")
	print("   ", len(missing_posts), "missing posts")
	print("   ", len(missing_comments), "missing comments")
	print("   ", len([x for x in cascades if cascades[x]['missing_comments']]), "cascades with missing comments")

	#verify the above process, a couple different ways

	#count comments from parent counters across all cascades
	'''
	total_comments = 0
	for post_id, post in cascades.items():
		total_comments += post['comment_count']
	print(total_comments, "from post counters")
	'''

	#traverse each cascade and count comments, check against stored comment count
	'''
	for post_id, post in cascades.items():
		traverse_comments = traverse_cascade(post, comments)
		if traverse_comments != post['comment_count']:
			print("post counter says", post['comment_count'], "comments, but traversal says", traverse_comments)
	'''

	#save cascades for later loading
	cascade_manip.save_cascades(code, cascades)				#cascades
	cascade_manip.save_comments(code, comments)		#comments
	file_utils.save_json(list(missing_posts), "data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
	file_utils.save_json(list(missing_comments), "data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

	return cascades, comments, missing_posts, missing_comments
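A small illustration of the id-prefix convention spelled out in the comment block above (the id values are made up); it mirrors the [3:] slicing and "t3" check used when assigning parents:

def classify_parent(parent_id_h):
    #strip the two-character type prefix plus underscore, as done above with [3:]
    kind = "post" if parent_id_h[:2] == "t3" else "comment"
    return kind, parent_id_h[3:]

print(classify_parent("t3_abc123"))   #-> ('post', 'abc123')
print(classify_parent("t1_def456"))   #-> ('comment', 'def456')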
Example no. 26
                if user not in repo_users[repo]:
                    repo_users[repo][user] = time
                user_repos[user].add(repo)  #add repo to user's list

            #user can adopt from repo contents without "seeing" a commit
            else:
                print "adoption without visible commit not currently supported"
                print "exiting"
                sys.exit(0)

    #period prints
    commit_idx = commit_idx + 1
    if commit_idx % 2500 == 0:
        print "   finished", commit_idx, "commits, found", adoption_count, "adoption events"

    #short-circuit for proof of concept: stop when have ~5 adoptions so we can
    #verify the output
    #if adoption_count >= 5:
    #	break

#save all adoption events to json (large file incoming, hope it has everything we need)
utils.save_json(
    adoption_events,
    "datafiles/adoption_events_%s.json" % (module_type + "_" + adop_type))
print "results saved to datafiles/adoption_events_%s.json" % (module_type +
                                                              "_" + adop_type)

#regardless, print number of adoptions found
print adoption_count, "adoption events found in", len(
    all_lib_commits), "import commits"
Example no. 27
def save_json_map_data(query, filepath):
    """run overpass api http query and save response to file as json dict"""
    response = fetch_raw_http_response(query)
    json_data = file_utils.bytes_to_json(response)
    file_utils.save_json(json_data, filepath)
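A hedged usage sketch; the Overpass QL query string is only illustrative, and fetch_raw_http_response / file_utils.bytes_to_json are assumed to do what their names suggest (fetch the raw response bytes, then decode them into a dict).

#fetch all drinking-water nodes in a small bounding box and cache them locally
query = '[out:json];node["amenity"="drinking_water"](50.6,7.0,50.8,7.3);out;'
save_json_map_data(query, "map_data/drinking_water.json")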
Example no. 28
    def save(self):
        """
        Write the results into a file.
        finalize() should be called before saving.
        """
        save_json(self.state, f"{Directories.json_outputs}/{self.name}.json")