# Imports used by the functions in this section.
import csv
from datetime import datetime

import pandas as pd
import requests


def getcommitinfo(repoid, NEWREPO_xl):
    """Fetch the commit list for a repository and buffer one row per commit."""
    commit_url = "https://api.github.com/repositories/" + str(repoid) + "/commits?per_page=100"
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit, "sha",
                                         "commit*author*name", "commit*author*email", "commit*author*date",
                                         "commit*committer*name", "commit*committer*email", "commit*committer*date",
                                         "commit*comment_count", "commit*message", prespace=1)
                # Append the per-commit detail columns (stats, parents, files).
                c_list = getmorecommitinfo(commit['url'])
                for e in c_list:
                    commit_row.append(e)
                # appendrowincsv(NEWREPO_CSV, commit_row)
                appendrowindf(NEWREPO_xl, commit_row)
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(["Error getting commit", commit_url, "UNKNOWN"])
            return 1
    return 0
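# ghpaginate is used throughout this section but not defined here. A minimal
# sketch, assuming it simply follows GitHub's Link header (exposed by requests
# as response.links) and returns the next page URL, or None when there is no
# further page; the real helper in this repo may differ.
def ghpaginate(response):
    """Sketch: return the URL of the next results page, or None if paging is done."""
    nxt = response.links.get('next')
    return nxt['url'] if nxt else None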
def getcommitinfo(repoid, write_handle):
    """Fetch the commit list for a repository and write one CSV row per commit."""
    commit_url = "https://api.github.com/repositories/" + str(repoid) + "/commits?per_page=100"
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit, "sha",
                                         "commit*author*name", "commit*author*email", "commit*author*date",
                                         "commit*committer*name", "commit*committer*email", "commit*committer*date",
                                         "commit*message", "commit*comment_count", "commit*verification",
                                         "url", "parents", prespace=1)
                write_handle.writerow(commit_row)
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(["Error getting commit", commit_url, "UNKNOWN"])
            return
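# getGitHubapi is also not defined in this section. A hedged sketch of what the
# call sites imply: an authenticated GET (credentials read from PW_CSV), with
# failures logged to LOG_CSV and None returned on any error. The actual helper
# may handle rate limits and token rotation differently; the module-level csv
# and requests imports above are assumed.
def getGitHubapi(url, pw_csv, log_csv):
    """Sketch: GET a GitHub API URL with a token from pw_csv; return the
    Response on HTTP 200, otherwise log the failure to log_csv and return None."""
    try:
        with open(pw_csv, newline='') as pw_file:
            token = next(csv.reader(pw_file))[0]  # assumption: token is the first cell
        res = requests.get(url, headers={"Authorization": "token " + token})
        if res.status_code == 200:
            return res
    except Exception:
        pass
    with open(log_csv, 'at', encoding='utf-8', newline='') as log_file:
        csv.writer(log_file).writerow(["Request failed", url])
    return None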
def getmorecommitinfo(c_url):
    """Get data on individual commit"""
    commit_res = getGitHubapi(c_url, PW_CSV, LOG_CSV)
    if commit_res is None:
        print("No commit information available", c_url)
        return ["", "", "", "", ""]
    commit_json = commit_res.json()
    commit_row = ghparse_row(commit_json, "stats*total", "stats*additions", prespace=0)
    # Number of parent commits.
    parents = commit_json['parents']
    p_no = 0
    for parent in parents:
        p_no = p_no + 1
    commit_row.append(p_no)
    # Per-file name, status and patch, plus the file count.
    files = commit_json['files']
    f_name = []
    f_stat = []
    f_pat = []
    f_no = 0
    for file in files:
        f_no = f_no + 1
        f_name.append(file['filename'])
        f_stat.append(file['status'])
        if "patch" in file:
            f_pat.append(file['patch'])
        else:
            f_pat.append("")
    commit_row.append(f_no)
    commit_row.append(f_name)
    commit_row.append(f_stat)
    commit_row.append(f_pat)
    return commit_row
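# ghparse_row is the row-builder used by every function in this section but is
# defined elsewhere. A minimal sketch inferred from the call sites, assuming
# each field name is a '*'-separated path into the JSON object, missing keys
# become empty strings, and prespace prepends that many blank cells; the real
# implementation may differ.
def ghparse_row(obj, *fields, prespace=0):
    """Sketch: flatten selected (possibly nested) JSON fields into a flat row."""
    row = [""] * prespace
    for field in fields:
        value = obj
        for key in field.split("*"):
            if isinstance(value, dict) and key in value:
                value = value[key]
            else:
                value = ""
                break
        row.append(value)
    return row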
def main():
    global DF_REPO
    global DF_COUNT
    search_key = ['us']  # ,'usa','states','america','canada','california','ca']
    period = ['2018-05-31..2019-01-01', '2018-01-01..2018-06-01',
              '2017-05-31..2018-01-01', '2017-01-01..2017-06-01',
              '2016-05-31..2017-01-01', '2016-01-01..2016-06-01',
              '2015-05-31..2016-01-01', '2015-01-01..2015-06-01',
              '2014-05-31..2015-01-01', '2014-01-01..2014-06-01',
              '2013-05-31..2014-01-01', '2013-01-01..2013-06-01',
              '2012-05-31..2013-01-01', '2012-01-01..2012-06-01',
              '2011-05-31..2012-01-01', '2011-01-01..2011-06-01',
              '2010-05-31..2011-01-01', '2010-01-01..2010-06-01',
              '2009-05-31..2010-01-01', '2009-01-01..2009-06-01',
              '2008-05-31..2009-01-01', '2008-01-01..2008-06-01']
    for loc in search_key:
        for p in period:
            search_url = ("https://api.github.com/search/users?q=repos:%3E5+location%3A" + loc
                          + "+created:" + p + "+type:user&per_page=100")
            while search_url:
                user_res = getGitHubapi(search_url, PW_CSV, LOG_CSV)
                user_json = user_res.json()
                if int(user_json['total_count']) > 1000:
                    # Log searches whose result count exceeds the 1000-result API cap.
                    with open(LOG_CSV, 'at') as logobj:
                        log = csv.writer(logobj)
                        log.writerow(["Search Results Exceeds 1000", search_url, user_json['total_count']])
                    print(search_url, " ", user_json['total_count'])
                for user in user_json["items"]:
                    user_row = ghparse_row(user, "login", "id")
                    appendrowindf(user_xl, user_row)
                search_url = ghpaginate(user_res)
    if DF_COUNT < MAX_ROWS_PERWRITE:
        # Flush any rows still buffered in DF_REPO to the Excel file.
        df = pd.read_excel(user_xl, header=0)
        df = df.append(DF_REPO, ignore_index=True)
        df.to_excel(user_xl, index=False)
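# appendrowindf and the DF_REPO/DF_COUNT globals are defined elsewhere. A
# hedged sketch of the buffering that main() implies: rows accumulate in the
# DF_REPO DataFrame and are flushed to the Excel file every MAX_ROWS_PERWRITE
# rows, which is why main() does a final flush for any leftover rows. DF_REPO
# (an empty DataFrame), DF_COUNT (0) and MAX_ROWS_PERWRITE are assumed to be
# module-level values; column alignment is assumed to be positional. The actual
# helper in this repo may differ.
def appendrowindf(xl_path, row):
    """Sketch: buffer a row in DF_REPO and append the buffer to xl_path when full."""
    global DF_REPO, DF_COUNT
    DF_REPO = pd.concat([DF_REPO, pd.DataFrame([row])], ignore_index=True)
    DF_COUNT += 1
    if DF_COUNT >= MAX_ROWS_PERWRITE:
        existing = pd.read_excel(xl_path, header=0)
        pd.concat([existing, DF_REPO], ignore_index=True).to_excel(xl_path, index=False)
        DF_REPO = DF_REPO.iloc[0:0]  # reset the buffer
        DF_COUNT = 0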
def getcommitinfo(repoid, NEWREPO_xl, owner, name):
    """Fetch commits for a repository, falling back to the owner/name endpoint
    when the numeric repository ID no longer resolves; only commits authored
    before 2016 are expanded and buffered."""
    commit_url = "https://api.github.com/repositories/" + str(repoid) + "/commits?per_page=100"
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if (commit_req is None) and (
                commit_url == "https://api.github.com/repositories/" + str(repoid) + "/commits?per_page=100"):
            print("Repo ID did not work. Trying owner/name")
            commit_url = "https://api.github.com/repos/" + str(owner) + "/" + str(name) + "/commits?per_page=100"
            commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit, "sha",
                                         "commit*author*name", "commit*author*email", "commit*author*date",
                                         "commit*committer*name", "commit*committer*email", "commit*committer*date",
                                         "commit*comment_count", "commit*message", prespace=1)
                commit_datetime = datetime.strptime(commit['commit']['author']['date'], "%Y-%m-%dT%H:%M:%SZ")
                if int(commit_datetime.year) < 2016:
                    c_list = getmorecommitinfo(commit['url'])
                    for e in c_list:
                        commit_row.append(e)
                    appendrowindf(NEWREPO_xl, commit_row)
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(["Error getting commit", commit_url, "UNKNOWN"])
            return 1
    return 0
def getrepoinfo(NEWREPO_CSV):
    """Update the repo information with PUSHED, STARS, SUBSCRIBERS, FORKS, SIZE, LICENCE."""
    with open(NEWREPO_CSV, 'wt', encoding='utf-8', newline='') as writelist:
        write_handle = csv.writer(writelist)
        for org in org_list:
            repo_url = 'https://api.github.com/orgs/' + org + '/repos?per_page=100&page=1'
            while repo_url:
                repoid_req = getGitHubapi(repo_url, PW_CSV, LOG_CSV)
                # print(repoid_req.headers['link'])
                if repoid_req:
                    repo_json = repoid_req.json()
                    for repo in repo_json:
                        repo_row = ghparse_row(repo, "id", "full_name", "description", "fork", "url",
                                               "created_at", "updated_at", "pushed_at", "homepage", "size",
                                               "stargazers_count", "watchers_count", "language", "has_issues",
                                               "has_projects", "has_downloads", "has_wiki", "has_pages",
                                               "forks_count", "mirror_url", "archived", "disabled",
                                               "open_issues_count", "license*name", "forks", "open_issues",
                                               "watchers", "default_branch", "permissions")
                        write_handle.writerow(repo_row)
                    # get commits
                    # getcommitinfo(repo['id'], write_handle)
                    # end get commits
                    repo_url = ghpaginate(repoid_req)
                else:
                    break
                print(repo_url)
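# Hedged usage sketch: getrepoinfo expects module-level org_list, PW_CSV and
# LOG_CSV values to exist before it runs. The organisation name and output file
# below are illustrative placeholders, not taken from the original source.
if __name__ == "__main__":
    org_list = ["github"]        # organisations whose repositories will be listed
    getrepoinfo("newrepo.csv")   # writes one CSV row per repository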
def run_query(loc, period, user_xl):
    """A simple function that uses requests.post to make the GraphQL API call. Note the json= section."""
    q = "location:" + loc + " repos:>5 created:" + period
    # First request: only the starting cursor and total user count.
    query = """
    query{
      search(query: \"""" + q + """\", type: USER, first: 1) {
        userCount
        pageInfo { startCursor hasNextPage }}}"""
    try:
        request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
        req_json = request.json()
        endc = req_json['data']['search']['pageInfo']['startCursor']
    except:
        print("Error getting starting cursor")
        print(req_json)
        return 404
    end = False
    while not end:
        # Paged request: user profile, sponsorship listing and sponsors, 100 at a time.
        query = """
        query($cursor:String! ) {
          rateLimit { cost remaining resetAt }
          search(query: \"""" + q + """\", type: USER, first: 100, after:$cursor) {
            userCount
            pageInfo { endCursor hasNextPage }
            edges {
              node {
                ... on User {
                  login name id email company bio location createdAt isHireable
                  followers { totalCount }
                  following { totalCount }
                  repositories { totalCount }
                  sponsorsListing {
                    createdAt shortDescription name
                    tiers(first: 100) {
                      totalCount
                      edges { node { name description monthlyPriceInDollars updatedAt } }
                    }
                  }
                  sponsorshipsAsMaintainer(first: 100) {
                    totalCount
                    nodes { createdAt sponsor { login } }
                  }
                }
              }
            }
          }
        }
        """
        variables = {"cursor": endc}
        body = {"query": query, "variables": variables}
        try:
            request = requests.post('https://api.github.com/graphql', json=body, headers=headers)
            req_json = request.json()
            print(loc, " ", period, " ", req_json['data']['search']['userCount'])
            if int(req_json['data']['search']['userCount']) > 1000:
                # log if the total user count is greater than 1000
                with open(LOG_CSV, 'at') as logobj:
                    log = csv.writer(logobj)
                    log.writerow(["Search Results Exceeds 1000", loc, period,
                                  req_json['data']['search']['userCount']])
            print(req_json['data']['rateLimit']['remaining'])
        except:
            print("Error running graphql")
            end = True
            print(req_json)
            return 404
        if req_json['data']['search']['pageInfo']['hasNextPage']:
            endc = req_json['data']['search']['pageInfo']['endCursor']
        else:
            end = True
        users = req_json['data']['search']['edges']
        for user in users:
            user_row = list()
            if user['node']:
                user_row = ghparse_row(user, "node*login", "node*name", "node*email", "node*company",
                                       "node*bio", "node*location", "node*createdAt", "node*isHireable",
                                       "node*followers*totalCount", "node*following*totalCount",
                                       "node*repositories*totalCount")
                if user['node']['sponsorsListing']:
                    user_row.append(user['node']['sponsorsListing']['createdAt'])
                    user_row.append(user['node']['sponsorsListing']['shortDescription'])
                    user_row.append(user['node']['sponsorsListing']['name'])
                    user_row.append(user['node']['sponsorsListing']['tiers']['totalCount'])
                    user_row.append(user['node']['sponsorsListing']['tiers']['edges'])
                    user_row.append(user['node']['sponsorshipsAsMaintainer']['totalCount'])
                    user_row.append(user['node']['sponsorshipsAsMaintainer']['nodes'])
                else:
                    user_row.append("")
            appendrowindf(user_xl, user_row)
    return 0
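# run_query relies on a module-level `headers` dict that is not defined in this
# section. A minimal sketch, assuming a classic personal access token; GitHub's
# GraphQL endpoint expects an "Authorization: bearer <token>" header. The token
# value and the example call below are placeholders, not from the original source.
GITHUB_TOKEN = "<personal-access-token>"  # placeholder
headers = {"Authorization": "bearer " + GITHUB_TOKEN}

# Example call (illustrative location, period and output path):
# run_query("us", "2018-01-01..2018-06-01", "users.xlsx")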
sp_url = row[0].split('/')
if len(sp_url) > 5:
    commit_url = ('https://api.github.com/repos/' + sp_url[3] + '/' + sp_url[4]
                  + '/commits/' + sp_url[6])
    commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
    if commit_req:
        commit = commit_req.json()
        commit_row = ghparse_row(commit, "sha", "commit*author*date", "commit*message",
                                 "commit*comment_count", "commit*author*name", "commit*author*email",
                                 "commit*committer*date", "commit*url", "parents",
                                 "commit*verification", "stats", "files", prespace=0)
        # Replace the raw file list with the number of changed files.
        commit_row[-1] = len(commit_row[-1])
        df = df.append(pd.Series(commit_row), sort=False, ignore_index=True)
    else:
        print("Error getting commit info ", commit_url)
        # Placeholder row when the request fails; the last column stays a file count.
        commit_row = ['', '', '']
        commit_row[-1] = len(commit_row[-1])