Code example #1
def get_first_repos(auth, headers):
    #check if file exists, if yes just read it in
    data = utils.load_json("github_files/github_all_repos.json")
    if data != False:
        return data

    #well, that seems to have worked... time to do them all!
    #except it's not really ALL, just the first 1000 - can't seem to sort them,
    #so we'll go with what Github thinks is most interesting
    url = 'https://api.github.com/search/repositories?q=language:python&per_page=100'
    #first requests
    r = requests.get(url, auth=auth, headers=headers)
    all_results = r.json()
    url = r.links['next']['url']
    print r.links['last']['url']
    #loop all requests
    count = 1
    print "finished request", count, "received", len(
        all_results['items']), "items"
    while url != "":
        #perform request and add results to previous
        r = requests.get(url, auth=auth, headers=headers)
        res = r.json()
        all_results['items'].extend(res['items'])
        count = count + 1
        print "finished request", count, "received", len(res['items']), "items"
        #get url for next request
        if 'next' in r.links:
            url = r.links['next']['url']
        else:
            url = ""
        print url
    #save all results to json file
    utils.save_json(all_results, "github_files/github_all_repos.json")
    return all_results
Code example #2
                print(
                    f"another edge case - null person: {person}, {text}, {split}"
                )

            text_piece = TextPiece(who=person_hold,
                                   text=accumulated_string.strip())

            person_hold = person
            accumulated_string = after

            yield text_piece


if __name__ == '__main__':
    # All of it:
    data = load_json("data/transcripts_c2.json")

    # Just episode 1:
    ep1_data = data["0"]

    # Generate the text pieces:
    texts_ep1 = list(generate_text_pieces(episode_data=ep1_data))

    # Let's do all of it:
    texts = []
    for idx, ep_data in data.items():
        texts += list(generate_text_pieces(episode_data=ep_data))

    # filter out empty pieces:
    texts = [t for t in texts if t.who != '' and t.text != '']
Code example #3
code = "crypto"  #set use case/domain: must be crypto, cyber, or cve
#crypto for dry run
#cyber takes forever
#cve fastest

pickle_save = False  #if True, save fitted parameters dictionary to pickle
#if False, save to human-readable text file instead

print("\nProcessing", code)

cascades = None
comments = None

#load the subreddit distribution for these cascades
subreddit_dist = file_utils.load_json("results/%s_post_subreddit_dist.json" %
                                      code)

#loop all subreddits for this code
for subreddit in sorted(subreddit_dist.keys()):

    if subreddit != 'Lisk':
        continue

    print("\nProcessing", subreddit)

    #load filtered, if they exist
    filtered_cascades, filtered_comments = cascade_manip.load_filtered_cascades(
        code, subreddit)

    #don't exist, filter them now
    if filtered_cascades == False:
Code example #4
        for tok in tokens[1:]:
            if flag:
                flag = False
                continue
            if tok == "as":
                flag = True
                continue
            lib.append(tok)

    return lib


#--- MAIN EXECUTION BEGINS HERE---#

#read userid mappings from files
email_to_id = utils.load_json("data_files/email_to_userid.json")
name_to_id = utils.load_json("data_files/name_to_userid.json")

if name_to_id == False or email_to_id == False:
    print "Must have name/email to user id mapping files. Exiting"
    exit(0)

file_idx = 0

#create parsed_commit_data directory if it does not exist
if os.path.isdir("parsed_commit_data") == False:
    os.makedirs("parsed_commit_data")

#for each commit log file:
for filename in os.listdir('commit_data'):
    commits_list = []  #overall commit list for file
Code example #5
def __init__(self):
    self.irregular = \
        load_json(f"{Directories.processed_data}/irregular_verbs.json")
    self.vowel = "[aeiouy]"
    self.consonant = "[b-df-hj-np-tv-z]"
Code example #6
def build_cascades(code, posts = False, comments = False):
	#if cascades already exist, read from cache
	if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code)) and (os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)) or os.path.exists("data_cache/%s_cascades/%s_cascade_comments_1.pkl" % (code, code))):
		#load from pickle
		print("Loading cascades from data_cache")
		cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
		#comments: either a single file, or multiple files
		print("Loading comments from data_cache")
		if os.path.exists("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code)):
			comments = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
		else:			
			comments = {}
			files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
			for file in files:
				print("Loading", file)
				new_comments = file_utils.load_pickle(file)
				comments.update(new_comments)
		missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
		missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))
		print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
		print("     ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")
		return cascades, comments, missing_posts, missing_comments

	#if no cached cascades, build them from scratch

	#if no loaded posts/comments, load those up first
	if posts == False or comments == False:
		posts, comments = load_model_data.load_reddit_data(code)

	print("Extracting post/comment structure for", len(posts), "posts and", len(comments), "comments")

	#add replies field to all posts/comments, init to empty list
	data_utils.add_field(posts, "replies", [])
	data_utils.add_field(comments, "replies", [])
	#add placeholder field to all posts/comments, flag indicates if we created a dummy object
	data_utils.add_field(posts, 'placeholder', False)
	data_utils.add_field(comments, 'placeholder', False)

	#add comment_count field to all post objects as well: count total number of comments all the way down the cascade
	data_utils.add_field(posts, "comment_count_total", 0)
	#and direct replies only
	data_utils.add_field(posts, "comment_count_direct", 0)
	#and add a missing_comments field to all post objects: set True if we find any missing comments in this cascade
	data_utils.add_field(posts, "missing_comments", False)

	#grab list of fields for each type of object (used to create placeholders when items are missing)
	post_fields = list(posts[0].keys())
	comment_fields = list(comments[0].keys())

	'''
	id_h = post/comment id
	parent_id_h = direct parent
	link_id_h = post parent
	if a parent_id starts with t1_, you remove t1_ and match the rest against a comment id 
	if it starts with t3_, you remove t3_ and match the rest against a submission id.
	linked_id always starts with t3_, since it always points to a submission.
	'''
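	#illustration of the convention above, with hypothetical ids (not taken from the data):
	#  parent_id_h = "t1_abc123"  ->  "t1_abc123"[3:] = "abc123", matched against comment ids
	#  parent_id_h = "t3_xyz789"  ->  "t3_xyz789"[3:] = "xyz789", matched against post ids
	#  link_id_h   = "t3_xyz789"  ->  always names the root post of the cascade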

	#create dictionary of post id -> post object to store cascades
	cascades = data_utils.list_to_dict(posts, "id_h")

	#convert list of comments to dictionary, where key is comment id
	comments = data_utils.list_to_dict(comments, "id_h")

	#now that we can find posts and comments at will, let's build the dictionary!
	
	#loop all comments, assign to immediate parent and increment comment_count of post parent
	comment_count = 0
	missing_comments = set()	#missing comments
	missing_posts = set()		#missing posts
	for comment_id in list(comments.keys()):

		#get immediate parent (post or comment)
		direct_parent = comments[comment_id]['parent_id_h'][3:]
		direct_parent_type = "post" if comments[comment_id]['parent_id_h'][:2] == "t3" else "comment"
		#get post parent
		post_parent = comments[comment_id]['link_id_h'][3:]
		comment_count += 1

		#add this comment to replies list of immediate parent, and update counters on post_parent
		try:
			#if post parent missing, create placeholder
			if post_parent not in cascades:
				cascades[post_parent] = create_object(post_parent, post_fields)
				missing_posts.add(post_parent)

			#update overall post comment count for this new comment
			cascades[post_parent]['comment_count_total'] += 1

			#now handle direct parent, post or comment
			#parent is post
			if direct_parent_type == "post":
				#missing post, create placeholder to hold replies
				if direct_parent not in cascades:
					cascades[direct_parent] = create_object(direct_parent, post_fields)
					missing_posts.add(direct_parent)
				#add this comment to replies field of post (no total comment increment, done above)
				cascades[direct_parent]['replies'].append(comment_id)
				#add 1 to direct comment count field
				cascades[direct_parent]['comment_count_direct'] += 1

			#parent is comment
			else:	
				#missing comment, create placeholder to contain replies, point to parent post by default
				if direct_parent not in comments:
					comments[direct_parent] = create_object(direct_parent, comment_fields)
					#point this placeholder comment to the top-level post
					comments[direct_parent]['link_id_h'] = post_parent
					comments[direct_parent]['parent_id_h'] = post_parent
					#add manufactured comment to counters
					cascades[post_parent]['comment_count_total'] += 1
					cascades[post_parent]['comment_count_direct'] += 1	
					#and add to replies	
					cascades[post_parent]['replies'].append(direct_parent)	
					#flag this cascade as containing missing comments
					cascades[post_parent]['missing_comments'] = True	
					missing_comments.add(direct_parent)		#add comment to list of missing
				#add current comment to replies field of parent comment
				comments[direct_parent]['replies'].append(comment_id)
		except:
			print("FAIL")
			print(len(missing_posts), "posts")
			print(len(missing_comments), "comments")
			for field in comments[comment_id]:
				if field != "replies":
					print(field, comments[comment_id][field])
			exit(0)

	print("\nProcessed", comment_count,  "comments in", len(cascades), "cascades")
	print("   ", len(missing_posts), "missing posts")
	print("   ", len(missing_comments), "missing comments")
	print("   ", len([x for x in cascades if cascades[x]['missing_comments']]), "cascades with missing comments")

	#verify the above process, a couple different ways

	#count comments from parent counters across all cascades
	'''
	total_comments = 0
	for post_id, post in cascades.items():
		total_comments += post['comment_count']
	print(total_comments, "from post counters")
	'''

	#traverse each cascade and count comments, check against stored comment count
	'''
	for post_id, post in cascades.items():
		traverse_comments = traverse_cascade(post, comments)
		if traverse_comments != post['comment_count']:
			print("post counter says", post['comment_count'], "comments, but traversal says", traverse_comments)
	'''

	#save cascades for later loading
	cascade_manip.save_cascades(code, cascades)				#cascades
	cascade_manip.save_comments(code, comments)		#comments
	file_utils.save_json(list(missing_posts), "data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
	file_utils.save_json(list(missing_comments), "data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

	return cascades, comments, missing_posts, missing_comments
Code example #7
import glob
import file_utils

#glob filestring to get all results files
filestring = "dryrun/submit/sim_res_*.json"

#get list of all matching files
files = glob.glob(filestring)

prefix = "t3_"		#prefix to prepend to all id references

#process each file individually, correcting ids along the way
for file in files:
	print("\nCorrecting", file)

	#load the json
	data = file_utils.load_json(file)
	print("  ", len(data['data']), "events to fix")

	#correct comment/post records, where each is a dictionary of the following form:
	#{"parentID": "A4XW5Jol_qVgUAKDWeOeaw", "communityID": "t5_3i6d8", "rootID": "A4XW5Jol_qVgUAKDWeOeaw", "nodeUserID": "okcc60doiWAfkR89nAAvHQ", "nodeTime": "1501876531", "nodeID": "A4XW5Jol_qVgUAKDWeOeaw", "actionType": "post"}
	for event in data['data']:

		#fix id fields
		event['parentID'] = prefix + event['parentID']
		event['rootID'] = prefix + event['rootID']
		event['nodeID'] = prefix + event['nodeID']

	#save the updated file overtop of the old one
	file_utils.save_json(data, file)
	print("Corrected file saved to", file)
Code example #8
def __init__(self):
    self.data = \
        load_json(f"{Directories.json_outputs}/{self.input_file_name}.json")
Code example #9
def get_contrib(all_repos,
                auth,
                headers,
                all_contrib=False,
                user_to_repo=False,
                repo_to_contrib=False,
                new=False):
    #load existing data from files if not passed in and files exist
    if all_contrib == False:  #existing contributors
        all_contrib = utils.load_json("github_files/github_all_contrib.json")
    if user_to_repo == False:  #user->repos dict
        user_to_repo = dict_key_to_int(
            utils.load_json("github_files/github_user_to_repo.json"))
    if repo_to_contrib == False:  #repo->contribs dict
        repo_to_contrib = dict_key_to_int(
            utils.load_json("github_files/github_repo_to_contrib.json"))

    if new == False:
        return all_contrib, user_to_repo, repo_to_contrib

    #if no contributors list or correlative dictionaries, initialize empty containers
    if all_contrib == False or user_to_repo == False or repo_to_contrib == False:
        user_to_repo = defaultdict(list)  #user id to list of repo ids
        repo_to_contrib = defaultdict(list)  #repo id to list of contrib
        all_contrib = list()

    #keep a bookmark of finished repos
    finished_repos = utils.load_json("github_files/github_finished_repos.json")
    if finished_repos == False:
        finished_repos = list()
    else:
        print "read in", len(finished_repos), "finished repos"

    #check the rate limit before we start by making a dummy request, sleep if we need to
    url = 'https://api.github.com/repos/vinta/awesome-python/contributors'  #any url will do
    r = requests.get(url, auth=auth, headers=headers)
    check_rate_limit(r)

    #loop all repos from list, fetch contributors if don't have them
    repo_count = 0
    for repo in all_repos['items']:
        #check if have contributors for this repo already, skip if yes
        if repo['id'] in finished_repos:
            continue

        #need to fetch contributors for this repo
        #print "Fetching repo", repo['id']
        contrib_count = 0
        #get request url for this repo
        url = repo['contributors_url']
        while url != "":
            #get the json!
            r = requests.get(url, auth=auth, headers=headers)
            #check for 204 response code - seems to indicate empty repo, and fails on json parse
            if r.status_code == 204:
                break
            #parse request response to json
            res = r.json()
            contrib_count = contrib_count + len(res)

            #repo not found (probably made private), skip and move to next
            if type(res) is not list and "message" in res and res[
                    'message'] == "Not Found":
                break
            #server error 502 - try the request again
            elif type(res) is not list and "message" in res and res[
                    'message'] == "Server Error":
                continue
            #other fail? dump some output and quit so we can fix it
            elif type(res) is not list:
                print r
                print res
                print url
                exit(0)

            #parse out this request result
            for usr in res:
                #new usr, add to list of all
                if usr['id'] not in user_to_repo:
                    all_contrib.append(usr)
                #always add to correlative structures
                if usr['id'] in user_to_repo and repo[
                        'id'] not in user_to_repo[usr['id']]:
                    user_to_repo[usr['id']].append(repo['id'])
                elif usr['id'] not in user_to_repo:
                    user_to_repo[usr['id']] = list()
                if usr['id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(usr['id'])
            #see if more pages, fetch if yes
            if 'next' in r.links:
                url = r.links['next']['url']
            else:  #no new pages, done
                url = ""

            #check the rate limit, sleep if we need to
            check_rate_limit(r)

            #sleep for ~0.5 seconds to space out the requests better
            time.sleep(0.5)

        #print "Repo", repo['id'], ":", contrib_count, "contributors"
        repo_count += 1
        finished_repos.append(repo['id'])

        #intermediate saves... just in case
        if repo_count % 100 == 0:
            #save all contrib to json file
            utils.save_json(all_contrib,
                            "github_files/github_all_contrib.json")
            #save correlative lists
            utils.save_json(user_to_repo,
                            "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib,
                            "github_files/github_repo_to_contrib.json")
            #save bookmark
            utils.save_json(finished_repos,
                            "github_files/github_finished_repos.json")
            print "saved contributors of", repo_count, "repos"

    #all done - save results
    #save all contrib to json file
    utils.save_json(all_contrib, "github_files/github_all_contrib.json")
    #save correlative dictionaries
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib,
                    "github_files/github_repo_to_contrib.json")
    #final bookmark
    utils.save_json(finished_repos, "github_files/github_finished_repos.json")

    #return results
    return all_contrib, user_to_repo, repo_to_contrib
Code example #10
all_repos = get_first_repos(auth, headers)
print "loaded", len(all_repos['items']), "repos"

#get contributors of these 1000 repos - DONE
print "loading saved contributors and correlations..."
all_contrib, user_to_repo, repo_to_contrib = get_contrib(all_repos, auth, headers)
print "loaded", len(all_contrib), "contributors"

#get all python repos for all users we have so far - roughly 45K - DONE
print "verifying have all repos for those users..."
all_contrib, all_repos, user_to_repo, repo_to_contrib = get_repos(all_contrib, all_repos, user_to_repo, repo_to_contrib, auth, headers)
print "have", len(all_repos['items']), "repos for all users"
'''

print "reading in repos..."
all_repos = utils.load_json("github_files/github_all_repos.json")
print len(all_repos['items']), "repositories"

print "reading in users..."
all_contrib = utils.load_json("github_files/github_all_contrib.json")
print len(all_contrib), "users"

#go another ripple - get all users contributing to any repos we have so far
'''
print "fetching new contributors for all repos (active requests, this could take a while)..."
all_contrib, user_to_repo, repo_to_contrib = get_contrib(all_repos, auth, headers, all_contrib, False, False, True)
print "now have", len(all_contrib), "contributors"
'''

print "reading in correlative lists..."
user_to_repo = utils.load_json("github_files/github_user_to_repo.json")
Code example #11
def get_repos(all_contrib, all_repos, user_to_repo, repo_to_contrib, auth,
              headers):
    #since this will take more than 5000 requests, keep a bookmark in a file - in case something goes wrong

    #read simple list of users that are done already, will update
    finished_users = utils.load_json("github_files/github_finished_users.json")
    if finished_users == False:
        finished_users = list()

    #if all the users we have so far are already finished, return input as results instead of loop-checking
    if len(finished_users) == len(all_contrib):
        return all_contrib, all_repos, user_to_repo, repo_to_contrib

    #also keep list of users that don't search properly (private repos?)
    bad_users = utils.load_json("github_files/bad_users.json")
    if bad_users == False:
        bad_users = list()

    #count users done
    user_count = 0

    #loop all users (should be all contributors of the 1000 initial repos), fetch their python repos
    for user in all_contrib:
        #check if we did this user already, if so skip
        if user['id'] in finished_users:
            continue

        #build request url for this user
        url = "https://api.github.com/search/repositories?q=language:python+user:%s&per_page=100" % (
            user['login'])

        #do request, including any pages
        while url != "":
            #sleep for ~2 seconds before next request, to prevent getting kicked off
            #(search requests limited to 30 per minute)
            time.sleep(2)

            #get the json!
            r = requests.get(url, auth=auth, headers=headers)
            res = r.json()

            #handle bad results and try to continue
            if 'items' not in res:
                #rate limit? wait for 60 seconds and try the same url again
                if 'documentation_url' in res and res[
                        'documentation_url'] == "https://developer.github.com/v3/#rate-limiting":
                    print "rate limit wait"
                    time.sleep(60)
                    continue
                #server error 502 - try the request again
                elif "message" in res and res['message'] == "Server Error":
                    continue
                #bad results for this particular user, they might be private now - skip and move to next
                else:
                    #print res
                    bad_users.append(user)
                    break

            #good results, parse and store
            for repo in res['items']:
                #new repo, add to list of all
                if repo['id'] not in repo_to_contrib:
                    all_repos['items'].append(repo)
                #always add to correlative structures
                if user['id'] not in user_to_repo:
                    user_to_repo[user['id']] = list()
                if repo['id'] not in user_to_repo[user['id']]:
                    user_to_repo[user['id']].append(repo['id'])
                if repo['id'] in repo_to_contrib and user[
                        'id'] not in repo_to_contrib[repo['id']]:
                    repo_to_contrib[repo['id']].append(user['id'])
                elif repo['id'] not in repo_to_contrib:
                    repo_to_contrib[repo['id']] = list()

            #see if more pages, if so fetch them
            if 'next' in r.links:
                url = r.links['next']['url']
            else:  #no more pages, quit for this user
                url = ""

        #finished user, add to bookmark
        finished_users.append(user['id'])
        user_count += 1

        #intermediate saves and prints
        if user_count % 100 == 0:
            #save all repos to json file
            utils.save_json(all_repos, "github_files/github_all_repos.json")
            #save correlative lists
            utils.save_json(user_to_repo,
                            "github_files/github_user_to_repo.json")
            utils.save_json(repo_to_contrib,
                            "github_files/github_repo_to_contrib.json")
            print "Saved repos of", user_count, "users"
            #save bad users list
            utils.save_json(bad_users, "github_files/github_bad_users.json")
            #save list of finished users
            utils.save_json(finished_users,
                            "github_files/github_finished_users.json")
    #end for users

    #final save before return
    #save all repos to json file
    utils.save_json(all_repos, "github_files/github_all_repos.json")
    #save correlative lists
    utils.save_json(user_to_repo, "github_files/github_user_to_repo.json")
    utils.save_json(repo_to_contrib,
                    "github_files/github_repo_to_contrib.json")
    #save bad users list
    utils.save_json(bad_users, "github_files/github_bad_users.json")
    #save list of finished users
    utils.save_json(finished_users, "github_files/github_finished_users.json")
    print "Saved all data to files"

    return all_contrib, all_repos, user_to_repo, repo_to_contrib  #return all results
Code example #12
    for ValN in CdfV.keys():
        if CdfV[ValN] > EffPairs: break

    if ValN >= len(CdfV): return next(reversed(CdfV))
    if ValN == 0: return 1
    # interpolate
    DeltaNbrs = CdfV[ValN] - CdfV[ValN - 1]
    if DeltaNbrs == 0: return ValN
    return ValN - 1 + (EffPairs - CdfV[ValN - 1]) / DeltaNbrs


#--- MAIN EXECUTION BEGINS HERE---#

#read mappings from files
email_to_id = utils.load_json("email_to_userid.json")
name_to_id = utils.load_json("name_to_userid.json")

file_idx = 0

#build bipartite graph, count connected components
print "building graph"
B = nx.Graph()

#add user nodes
for key in email_to_id:
    B.add_node(email_to_id[key], bipartite=0)
for key in name_to_id:
    B.add_node(name_to_id[key], bipartite=0)

#add repo nodes and user-repo edges
Code example #13
users_filepath = "model_files/users/%s_users.txt"  #list of users seen in posts/comments, one file per subreddit

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

code = "hackernews"

#load raw hackernews cascades if no cached reconstructed cascades
if os.path.exists("data_cache/%s_cascades/%s_cascade_posts.pkl" %
                  (code, code)) == False or os.path.exists(
                      "data_cache/%s_cascades/%s_cascade_comments.pkl" %
                      (code, code)) == False:
    raw_stories = file_utils.load_json(
        "../hackernews/data/hackernews_stories.json")
    raw_comments = file_utils.load_json(
        "../hackernews/data/hackernews_comments.json")
else:
    raw_stories = {}
    raw_comments = {}

#build/load cascades (auto-load as a result, either raw data or cached cascades)
cascades, comments, missing_posts, missing_comments = cascade_analysis.build_cascades(
    code, raw_stories, raw_comments)
#optional: filter out cascades with any missing elements (posts or comments)
cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

print("\nHave total of", len(cascades), "cascades and", len(comments),
      "comments for hackernews")
Code example #14
import os
import subprocess
import sys
import urllib2
import io
from collections import defaultdict
import matplotlib
matplotlib.use('Agg')
import matplotlib.pylab as plt
import numpy as np
import file_utils as utils
import plot_utils

#--- MAIN EXECUTION BEGINS HERE---#

#read mappings from files
email_to_id = utils.load_json("email_to_userid.json")
name_to_id = utils.load_json("name_to_userid.json")

user_commit_counts = utils.load_json("user_commit_counts.json")
if user_commit_counts == False:
    user_commit_counts = defaultdict(int)
file_commit_counts = utils.load_json("file_commit_counts.json")
if file_commit_counts == False:
    file_commit_counts = defaultdict(int)

file_idx = 0

if len(file_commit_counts) == 0 or len(user_commit_counts) == 0:
    print "computing counts"
    #for each commit log file:
    for filename in os.listdir('commit_data'):
Code example #15
from collections import defaultdict
import matplotlib
matplotlib.use('Agg')
import matplotlib.pylab as plt
import numpy as np
import unicodedata
import file_utils as utils
import plot_utils
import package_type

#--- MAIN EXECUTION BEGINS HERE---#	

#how to count: top-level vs submodules
module_type = package_type.get_type()

#read mappings from files
email_to_id = utils.load_json("email_to_userid.json")
name_to_id = utils.load_json("name_to_userid.json")

user_all_commit_counts = utils.load_json("data_files/user_all_commit_counts.json")
if user_all_commit_counts == False:
	user_all_commit_counts = defaultdict(int)
repo_all_commit_counts = utils.load_json("data_files/repo_all_commit_counts.json")
if repo_all_commit_counts == False:
	repo_all_commit_counts = defaultdict(int)

if len(repo_all_commit_counts) == 0 or len(user_all_commit_counts) == 0:
	print "computing commit counts"

	#read in all commits
	commits = utils.load_json("data_files/all_commits_%s_small.json" % module_type)
	if commits == False:
Code example #16
from itertools import count
import sys
import file_utils


#filepaths of output files
subreddits_filepath = "model_files/subreddits.pkl"		#dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"			#processed post data for each post, one file per subreddit
														#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"	#text file of fitted cascade params, one file per subreddit
														#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"		#edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"	#list of users seen in posts/comments, one file per subreddit

#load list of cve subreddits
cve_subreddit_dist = file_utils.load_json("results/cve_post_subreddit_dist.json")

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

cascades = {}
comments = {}

domain = "cve"


#loop all subreddits
for subreddit, sub_count in cve_subreddit_dist.items():
Code example #17
filestring = "dryrun/results/*metrics.json"

#get list of all matching files
files = glob.glob(filestring)

data = {
}  #nested dictionary of file/run identifier -> metric topic -> metric name -> computed metric

#process each file individually, add data to dictionary
for file in files:
    #get identifier for this file
    ident, rest = file.split('-')
    ident = ident.split('/')[-1]

    #load file data
    data[ident] = file_utils.load_json(file)

ident_column = 'sim version'

#for better csv dump-age, combine metric topic and name into a single key
#list of dictionaries, one per run/ident
dump_data = []
#loop identifiers
for ident in data:
    row = {ident_column: ident}
    #loop metric topics
    for topic in data[ident]:
        #loop metrics
        for metric in data[ident][topic]:
            #add this to updated data
            row[topic + " : " + metric] = data[ident][topic][metric]
Code example #18
subreddits_filepath = "model_files/subreddits.pkl"  #dictionary of subreddit -> domain code
posts_filepath = "model_files/posts/%s_posts.pkl"  #processed post data for each post, one file per subreddit
#each post maps original post id to numeric id, set of tokens, and user id
params_filepath = "model_files/params/%s_params.txt"  #text file of fitted cascade params, one file per subreddit
#one line per cascade: cascade numeric id, params(x6), sticky factor (1-quality)
graph_filepath = "model_files/graphs/%s_graph.txt"  #edgelist of post graph for this subreddit
users_filepath = "model_files/users/%s_users.txt"  #list of users seen in posts/comments, one file per subreddit

#load the subreddit distribution for all cascades (just need a list of subreddits)
if file_utils.verify_file(subreddits_filepath):
    print("Loading subreddit list from", subreddits_filepath)
    subreddit_dict = file_utils.load_pickle(subreddits_filepath)
#file doesn't exist, build it
else:
    #load all three domain breakdown files
    crypto_subreddit_dist = file_utils.load_json(
        "results/crypto_post_subreddit_dist.json")
    cve_subreddit_dist = file_utils.load_json(
        "results/cve_post_subreddit_dist.json")
    cyber_subreddit_dist = file_utils.load_json(
        "results/cyber_post_subreddit_dist.json")
    #combine into single dictionary of subreddit -> list of corresponding domain codes
    subreddit_dict = build_domain_dict([
        set(crypto_subreddit_dist.keys()),
        set(cve_subreddit_dist.keys()),
        set(cyber_subreddit_dist.keys())
    ], ["crypto", "cve", "cyber"])
    #now, kill all the duplicates! crypto and cyber scraped entire subreddits,
    #so any overlap is redundant and can be thrown away
    #(yes, there are neater ways to do this, but I don't care!)
    for item in subreddit_dict.keys():
        if len(subreddit_dict[item]) > 1:
Code example #19
    # if a verified map already exists, the file will be updated with the new information added
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str, default="./multitracks/", 
        help="path to downloaded mutlitracks")
    parser.add_argument("--out", type=str, default="./complete_map.json", 
        help="output path of verified map")
    parser.add_argument("--map", type=str, default=None, 
        help="path do existing dataset map which, if specified, will be modified in place")
    parser.add_argument("--overwrite", type=bool, default=True, 
        help="overwrite existing entries if existing dataset map is loaded")
    parser.add_argument("--save_interval", type=int, default=5, 
        help="how often to save the dataset map while processing tracks")
    parser.add_argument("-kw", type=str, default="keywords.txt",
        help="keywords txt file that specifies search terms")
    parser.add_argument("--c_thresh", type=int, default=80,
        help="confidence threshold for fuzzy string matching")
    parser.add_argument("--thresh_db", type=int, default=45, 
        help="threshold in db to reject silence")
    parser.add_argument("--n_jobs", type=int, default=8, 
        help="num parallel worker threads to load & process audio files")

    args = parser.parse_args()

    dataset_map = {} # initialize empty dict
    if args.map is not None: # if existing map is provided, it will be modified in place (overwriting existing entries)
        dataset_map = file_utils.load_json(args.map)

    dataset_map = create_map(args.path, args.kw, args.c_thresh, args.thresh_db, args.n_jobs, dataset_map, 
                            args.out, args.overwrite, args.save_interval)

    file_utils.save_json(args.out, dataset_map, indent=2)
Code example #20
#count frequency of each commit author name and email, so we can track down the ones breaking id assignment

import os.path
import subprocess
import sys
import urllib2
import io
import unicodedata
from collections import defaultdict
import file_utils as utils
import data_utils as data
import operator

#--- MAIN EXECUTION BEGINS HERE---#

name_count = utils.load_json("data_files/author_name_freq.json")
email_count = utils.load_json("data_files/author_email_freq.json")

#build list if does not exist
if name_count == False or email_count == False:

    #dictionary for name and email counts
    name_count = defaultdict(int)
    email_count = defaultdict(int)

    file_idx = 0

    #for each commit log file:
    for filename in os.listdir('commit_data'):
        #for line in f:
        for line in io.open("commit_data/%s" % filename,
Code example #21
SIGHT = True

#module-type specifier (at this point, more of a file suffix specifier)
module_type = package_type.get_type()

#adoption condition specifier (another suffix)
if SIGHT:
    print "Adoption requires direct commit view"
    adop_type = "SIGHT"
else:
    print "Adoption from repo history allowed"
    adop_type = "HISTORY"

#load adoption events
print "Loading all adoption events..."
adoption_events = utils.load_json("datafiles/adoption_events_%s.json" %
                                  (module_type + "_" + adop_type))

#load library usage counts (overall for now)
print "Loading library usage counts..."
usage_counts = utils.load_json("datafiles/import_counts_overall_%s.json" %
                               module_type)

#don't have an adoption event file, yell at the user
if adoption_events == False or usage_counts == False:
    print "must have compiled adoption event list datafiles/adoption_events_%s.json" % (
        module_type + "_" + adop_type)
    print "exiting"
    sys.exit(0)

#adoption events look like this:
#	dictionary, where library is key
Code example #22
  parser.add_argument("-kw", type=str, default="vox",
      help="keyword filter, only one allowed. See keywords.txt for full list")
  parser.add_argument("--approve", type=str, nargs="+", default="Speech",
      help="AudioSet labels to match in source content. Multiple string values allowed (case sensitive)")
  parser.add_argument("--reject", type=str, default="Silence",
      help="AudioSet labels to reject. Multiple string values allowed (case sensitive)")
  parser.add_argument("--map", type=str, default="dataset_map.json", 
      help="path do json dataset map")
  parser.add_argument("--thresh", type=int, default=45, 
      help="threshold in db to reject silence")

  args = parser.parse_args()

  # with open(args.map) as json_file:
  #     dataset_map = json.load(json_file)
  dataset_map = file_utils.load_json(args.map)

  yam_approve = list(args.approve)
  yam_reject = list(args.reject)

  if os.path.exists(args.out):
    print(f'{args.out} already exists, modifying...')
    verified_classmap = file_utils.load_json(args.out)
  else:
    verified_classmap = {}

  
  verified = verify_classes_yamnet(verified_classmap, dataset_map, args.thresh, args.kw, yam_approve, yam_reject)

  file_utils.save_json(args.out, verified)
  print(f'saved verified map to {args.out}')
Code example #23
        "--normalize",
        type=bool,
        default=False,
        help=
        "normalize stems (normalizes audio file from start to end, not individual samples)"
    )
    parser.add_argument("--normalize_transients",
                        type=bool,
                        default=False,
                        help="normalize individual transients")
    parser.add_argument("--diff_mask",
                        type=bool,
                        default=False,
                        help="y = (x - y), for source separation masking")

    args = parser.parse_args()

    x, y = gen_dataset(file_utils.load_json(args.map),
                       args.ws,
                       args.xkey,
                       args.ykey,
                       normalize_stems=args.normalize,
                       normalize_transients=args.normalize_transients,
                       max_examples=args.max_examples,
                       sample_rate=44100,
                       difference_mask=args.diff_mask)

    np.save(f'{args.out}/x_{args.xkey}_{args.ws}_{len(x)}.npy', x)
    np.save(f'{args.out}/y_{args.ykey}_{args.ws}_{len(y)}.npy', y)
    print(f'saved x y pairs in {args.out}')
Code example #24
#cascades, comments, missing_posts, missing_comments = build_cascades(code, posts = False, comments = False)

print("Loading cascades from data_cache")
cascades = file_utils.load_pickle("data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))

#comments: across multiple files
print("Loading comments from data_cache")		
comments = {}
files = sorted(glob.glob('data_cache/%s_cascades/%s_cascade_comments*' % (code, code)))
for file in files:
	print("Loading", file)
	new_comments = file_utils.load_pickle(file)
	comments.update(new_comments)

#missing posts and comments
missing_posts = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_posts.json" % (code, code))
missing_comments = file_utils.load_json("data_cache/%s_cascades/%s_cascade_missing_comments.json" % (code, code))

#yay! loaded
print("   Loaded", len(cascades), "cascades with", len(comments), "comments")
print("     ", len(missing_posts), "missing posts", len(missing_comments), "missing comments")

cascades, comments = cascade_manip.remove_missing(code, cascades, comments)

#load subreddit distribution (just want list of subreddits)
cyber_subreddit_dist = file_utils.load_json("results/cyber_post_subreddit_dist.json")
print(sorted(list(cyber_subreddit_dist.keys())))

#filter posts/comments for each subreddit
for subreddit, count in cyber_subreddit_dist.items():
	print("Filtering for", subreddit)
Code example #25
from collections import defaultdict

#given sorted data, write the data to a file
def write_sorted_to_file(data, filename):
	with open(filename, 'w') as f:
		f.write(str(data))
#end write_sorted_to_file

#--- MAIN EXECUTION BEGINS HERE---#	

#which files to sort: top-level vs submodules
module_type = package_type.get_type()

#read in the data
print "reading in commit data..."
data = utils.load_json('data_files/all_commits_%s.json' % module_type)
if data == False:
	print "commits file does not exist, exiting"
	exit(0)
print "read in", len(data), "commits"

#sort events by time
print "sorting commits..."
ordered_commits = sorted(data, key=lambda k: k['time'])

#save to a single mega-file
print "saving sorted commits..."
write_sorted_to_file(ordered_commits, "data_files/all_commits_%s_sorted.json" % module_type)
print len(ordered_commits), "sorted commits saved to data_files/all_commits_%s_sorted.json" % module_type

#create directory for commits by year files (if doesn't already exist)
Code example #26
SIGHT = True

#module-type specifier (at this point, more of a file suffix specifier)
module_type = package_type.get_type()

#adoption condition specifier (another suffix)
if SIGHT:
    print "Adoption requires direct commit view"
    adop_type = "SIGHT"
else:
    print "Adoption from repo history allowed"
    adop_type = "HISTORY"

#load all commits
print "Loading all import commits..."
all_lib_commits = utils.load_json("datafiles/all_add_commits_%s.json" %
                                  module_type)

#load first commits for each user
print "Loading all user/repo first commits..."
first_commits = utils.load_json("datafiles/first_commits.json")

#don't have a compiled commit file, yell at the user
if all_lib_commits == False or first_commits == False:
    print "must have compiled commit list datafiles/all_add_commits_%s.json and user first commits datafiles/first_commits.json" % module_type
    print "exiting"
    sys.exit(0)

#compile into a single mega-list, use the lack of a "libs" key to differentiate
#library imports from first commits
print "Combining commit lists..."
all_commits = all_lib_commits + first_commits
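#(illustration, not in the original code: per the comment above, downstream processing can
#treat a commit as a library-import event when "libs" in commit, and as a first commit otherwise)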
Code example #27
		return False
		
	return True
#end clone_repo

#--- MAIN EXECUTION BEGINS HERE---#	
	
#check for required command line param
if len(sys.argv) != 3:
	print "Must include starting index and # to process for repo cloning process."
	print "Usage: python github_clone_repos.py <starting index> <# to process>"
	sys.exit(0)

#read list of repos to clone
print "Reading repo list..."
repos = utils.load_json("github_files/github_all_repos.json")	
print "Read", len(repos['items']), "repos"

#grab and save current working directory
root_dir = os.getcwd()

#create directory for repo clones (if doesn't already exist)
if os.path.isdir("repo_clones") == False:
	os.makedirs("repo_clones")
#create directory for commit data (if doesn't already exist)
if os.path.isdir("commit_data") == False:
	os.makedirs("commit_data")

#grab command line args: starting index and processing limit
idx = int(sys.argv[1])		#starting index from command line parameter
limit = int(sys.argv[2])			#process <command line var> at a time starting from given index
Code example #28
import os
import unicodedata
from collections import OrderedDict, defaultdict
from operator import itemgetter
import file_utils as utils
import plot_utils
import package_type

#--- MAIN EXECUTION BEGINS HERE---#

#file count-type specifier
count_type = package_type.get_type()

self_ref_count = 0

#load counts if have them
import_counts_overall = utils.load_json("import_counts_overall_%s.json" %
                                        count_type)
import_repo_counts = utils.load_json("import_repo_counts_%s.json" % count_type)
import_user_counts = utils.load_json("import_user_counts_%s.json" % count_type)
if import_counts_overall == False or import_repo_counts == False or import_user_counts == False:
    import_counts_overall = defaultdict(
        int)  #number of additions across all repos and users
    import_repos = defaultdict(set)  #list of repos using library
    import_users = defaultdict(set)  #list of users using library

    file_idx = 0

    #for each commit log file:
    for filename in os.listdir('imports_data'):

        #extract repo name
        repo = filename[:-4]
Code example #29
SIGHT = True

#module-type specifier (at this point, more of a file suffix specifier)
module_type = package_type.get_type()

#adoption condition specifier (another suffix)
if SIGHT:
    print "Adoption requires direct commit view"
    adop_type = "SIGHT"
else:
    print "Adoption from repo history allowed"
    adop_type = "HISTORY"

#load adoption events
print "Loading all adoption events..."
adoption_events = utils.load_json("datafiles/adoption_events_%s.json" %
                                  (module_type + "_" + adop_type))

#read user->repos mapping (to use as user and repo list)
print "Loading user and repo list..."
user_to_repos = utils.load_json("datafiles/user_to_repo_list.json")

#don't have an adoption event file, yell at the user
if adoption_events == False or user_to_repos == False:
    print "must have compiled adoption event list datafiles/adoption_events_%s.json and user to repo mapping datafiles/user_to_repo_list.json" % (
        module_type + "_" + adop_type)
    print "exiting"
    sys.exit(0)

#adoption events look like this:
#	dictionary, where library is key
#	for each library, value is list of adoption events
Code example #30
import file_utils
import cascade_manip

code = "hackernews"

#load list of posts that fit-failed
fit_fail = set(
    file_utils.load_json("model_files/params/hackernews_failed_param_fit.txt"))

#load hackernews cascades
posts = file_utils.load_pickle(
    "data_cache/hackernews_cascades/hackernews_cascade_posts.pkl")
comments = file_utils.load_pickle(
    "data_cache/hackernews_cascades/hackernews_cascade_comments.pkl")
print("Loaded", len(posts), "posts and", len(comments), "comments")

#remove missing
posts, comments = cascade_manip.remove_missing(code, posts, comments)

#remove posts for which the fit failed
posts = {key: value for (key, value) in posts.items() if key not in fit_fail}
posts, comments = cascade_manip.filter_comments_by_posts(posts, comments)
print("Down to", len(posts), "posts and", len(comments), "comments")

#filenames of filtered cascades and comments
cascades_filepath = "data_cache/filtered_cascades/%s_%s_cascades.pkl"  #domain and subreddit cascades
comments_filepath = "data_cache/filtered_cascades/%s_%s_comments.pkl"  #domain and subreddit comments

#save to same place as other filtered cascades - use hackernews as domain and subreddit
file_utils.save_pickle(posts, cascades_filepath % (code, code))
file_utils.save_pickle(comments, comments_filepath % (code, code))