def getcommitinfo(repoid, NEWREPO_xl):
    """Fetch every commit of a repository (paginated) and append rows to NEWREPO_xl.

    Parameters
    ----------
    repoid : int or str
        GitHub numeric repository id.
    NEWREPO_xl : str
        Path of the Excel workbook that appendrowindf() appends rows to.

    Returns
    -------
    int
        0 on success; 1 if any page request failed (the failure is also
        logged to LOG_CSV).
    """
    commit_url = ("https://api.github.com/repositories/" + str(repoid)
                  + "/commits?per_page=100")
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit, "sha",
                                         "commit*author*name",
                                         "commit*author*email",
                                         "commit*author*date",
                                         "commit*committer*name",
                                         "commit*committer*email",
                                         "commit*committer*date",
                                         "commit*comment_count",
                                         "commit*message", prespace=1)
                # extend() replaces the element-by-element append loop;
                # getmorecommitinfo adds stats / parents / file detail columns.
                commit_row.extend(getmorecommitinfo(commit['url']))
                appendrowindf(NEWREPO_xl, commit_row)
            # Follow the Link header to the next page (None ends the loop).
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(
                    ["Error getting commit", commit_url, "UNKNOWN"])
            return 1
    return 0
def main():
    """Get topics for each repo and write the augmented rows to a new CSV.

    Reads repo rows from REPO_CSV (repo id in column 0), fetches the repo's
    topic list from the GitHub API, and appends two columns — the full topic
    list and the first topic — before writing each row to NEWREPO_CSV.
    """
    # Raw strings produce the exact same path values as the originals,
    # without depending on Python leaving unknown escapes (e.g. "\k") alone.
    REPO_CSV = r'C:\Users\kmpoo\Dropbox\HEC\Project 2 - License\EJIS\Data\ExportedISRDataCollab_LocUpdates 20190425.csv'
    NEWREPO_CSV = r'C:\Users\kmpoo\Dropbox\HEC\Project 2 - License\EJIS\Data\FullData_20190517.csv'
    # Topics endpoint requires the mercy-preview media type.
    d_header = {'Accept': 'application/vnd.github.mercy-preview+json'}
    with open(NEWREPO_CSV, 'wt', encoding='utf-8', newline='') as writeobj:
        repo_writer = csv.writer(writeobj)
        with open(REPO_CSV, 'rt', encoding='utf-8') as fileobj:
            for row in csv.reader(fileobj):
                new_row = row  # appends extend the row read for this iteration
                repo_id = row[0]
                repo_url = ('https://api.github.com/repositories/'
                            + repo_id + '/topics')
                topic_req = getGitHubapi(repo_url, PW_CSV, LOG_CSV,
                                         header=d_header)
                if topic_req:
                    j_topics = topic_req.json()
                    new_row.append(j_topics['names'])
                    # First topic (if any) gets its own column.
                    new_row.append(
                        j_topics['names'][0] if j_topics['names'] else "")
                else:
                    # Request failed: pad with two blanks to keep the layout.
                    new_row.append("")
                    new_row.append("")
                repo_writer.writerow(new_row)
def get_name(repo_id):
    """Look up a repository's name and owner login from its numeric id.

    Returns a (name, owner_login) tuple, or (None, None) when the API
    request fails.
    """
    url = "https://api.github.com/repositories/" + str(repo_id)
    response = getGitHubapi(url, PW_CSV, LOG_CSV)
    if not response:
        return None, None
    payload = response.json()
    return payload['name'], payload['owner']['login']
def getcommitinfo(repoid, NEWREPO_xl, owner, name):
    """Fetch commits for a repo, falling back to the owner/name endpoint.

    If the numeric-id endpoint fails on the first page, retries once using
    the https://api.github.com/repos/{owner}/{name} form. Commits authored
    before 2016 get the per-commit detail lookup and are written to
    NEWREPO_xl (assumes post-2015 commits are intentionally skipped —
    confirm against caller).

    Returns 0 on success; 1 if a page request failed (also logged to LOG_CSV).
    """
    first_url = ("https://api.github.com/repositories/" + str(repoid)
                 + "/commits?per_page=100")
    commit_url = first_url
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        # The id-based endpoint sometimes fails; retry with owner/name, but
        # only when we are still on the very first page.
        if commit_req is None and commit_url == first_url:
            print("Repo ID did not work. Trying owner/name")
            commit_url = ("https://api.github.com/repos/" + str(owner) + "/"
                          + str(name) + "/commits?per_page=100")
            commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            for commit in commit_req.json():
                commit_row = ghparse_row(commit, "sha",
                                         "commit*author*name",
                                         "commit*author*email",
                                         "commit*author*date",
                                         "commit*committer*name",
                                         "commit*committer*email",
                                         "commit*committer*date",
                                         "commit*comment_count",
                                         "commit*message", prespace=1)
                commit_datetime = datetime.strptime(
                    commit['commit']['author']['date'], "%Y-%m-%dT%H:%M:%SZ")
                # .year is already an int; the int() cast was redundant.
                if commit_datetime.year < 2016:
                    commit_row.extend(getmorecommitinfo(commit['url']))
                    appendrowindf(NEWREPO_xl, commit_row)
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(
                    ["Error getting commit", commit_url, "UNKNOWN"])
            return 1
    return 0
def getcommitinfo(repoid, write_handle):
    """Write every commit of a repository to an open CSV writer.

    Walks the paginated commit listing for the given numeric repo id; on a
    failed page request, logs the failure to LOG_CSV and stops.
    """
    next_url = ("https://api.github.com/repositories/" + str(repoid)
                + "/commits?per_page=100")
    while next_url:
        response = getGitHubapi(next_url, PW_CSV, LOG_CSV)
        if not response:
            # Guard clause: log the failed page and bail out.
            print("Error getting commit info ", next_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                csv.writer(loglist).writerow(
                    ["Error getting commit", next_url, "UNKNOWN"])
            return
        for commit in response.json():
            row = ghparse_row(commit, "sha",
                              "commit*author*name", "commit*author*email",
                              "commit*author*date", "commit*committer*name",
                              "commit*committer*email",
                              "commit*committer*date", "commit*message",
                              "commit*comment_count", "commit*verification",
                              "url", "parents", prespace=1)
            write_handle.writerow(row)
        next_url = ghpaginate(response)
def getmorecommitinfo(c_url):
    """Get data on individual commit.

    Returns a 7-item list: [stats total, stats additions, parent count,
    file count, filenames, file statuses, file patches]. On a failed
    request, returns seven empty strings so the caller's row keeps the
    same column layout (the original returned only five blanks, which
    misaligned columns downstream).
    """
    commit_res = getGitHubapi(c_url, PW_CSV, LOG_CSV)
    if commit_res is None:
        print("No commit information available", c_url)
        return ["", "", "", "", "", "", ""]
    commit_json = commit_res.json()
    commit_row = ghparse_row(commit_json, "stats*total", "stats*additions",
                             prespace=0)
    # len() replaces the manual counting loops of the original.
    commit_row.append(len(commit_json['parents']))
    files = commit_json['files']
    f_name = [file['filename'] for file in files]
    f_stat = [file['status'] for file in files]
    # Not every file entry carries a "patch" key (e.g. binary files).
    f_pat = [file.get('patch', "") for file in files]
    commit_row.append(len(files))
    commit_row.append(f_name)
    commit_row.append(f_stat)
    commit_row.append(f_pat)
    return commit_row
def main():
    """Search GitHub for users by location across half-year windows.

    For each (location, period) pair, pages through the user search API,
    buffers each hit via appendrowindf(), and finally flushes any rows
    still buffered in DF_REPO to the user workbook.
    """
    global DF_REPO
    global DF_COUNT
    search_key = ['us']  # ,'usa','states','america','canada','california','ca']
    # Half-year windows keep each search under the API's 1000-result cap.
    period = ['2018-05-31..2019-01-01', '2018-01-01..2018-06-01',
              '2017-05-31..2018-01-01', '2017-01-01..2017-06-01',
              '2016-05-31..2017-01-01', '2016-01-01..2016-06-01',
              '2015-05-31..2016-01-01', '2015-01-01..2015-06-01',
              '2014-05-31..2015-01-01', '2014-01-01..2014-06-01',
              '2013-05-31..2014-01-01', '2013-01-01..2013-06-01',
              '2012-05-31..2013-01-01', '2012-01-01..2012-06-01',
              '2011-05-31..2012-01-01', '2011-01-01..2011-06-01',
              '2010-05-31..2011-01-01', '2010-01-01..2010-06-01',
              '2009-05-31..2010-01-01', '2009-01-01..2009-06-01',
              '2008-05-31..2009-01-01', '2008-01-01..2008-06-01']
    for loc in search_key:
        for p in period:
            search_url = ("https://api.github.com/search/users?q=repos:%3E5"
                          "+location%3A" + loc + "+created:" + p
                          + "+type:user&per_page=100")
            while search_url:
                user_res = getGitHubapi(search_url, PW_CSV, LOG_CSV)
                user_json = user_res.json()
                # The search API only serves the first 1000 results; log
                # overflowing windows so they can be split further.
                if int(user_json['total_count']) > 1000:
                    with open(LOG_CSV, 'at') as logobj:
                        csv.writer(logobj).writerow(
                            ["Search Results Exceeds 1000", search_url,
                             user_json['total_count']])
                print(search_url, " ", user_json['total_count'])
                for user in user_json["items"]:
                    user_row = ghparse_row(user, "login", "id")
                    appendrowindf(user_xl, user_row)
                search_url = ghpaginate(user_res)
    # Flush rows still buffered in DF_REPO that never hit the write threshold.
    if DF_COUNT < MAX_ROWS_PERWRITE:
        # read_excel accepts neither error_bad_lines (a read_csv option)
        # nor index (a to_excel option); passing them raises TypeError on
        # current pandas, so they are dropped here.
        df = pd.read_excel(user_xl, header=0)
        df = df.append(DF_REPO, ignore_index=True)
        df.to_excel(user_xl, index=False)
def getrepoinfo(NEWREPO_CSV):
    """Update the repo information with PUSHED, STARS, SUBSCRIBERS, FORKS,
    SIZE, LICENCE.

    Iterates over the module-level org_list, pages through each org's
    repository listing, and writes one CSV row per repository to
    NEWREPO_CSV. A failed page request abandons the current org.
    """
    with open(NEWREPO_CSV, 'wt', encoding='utf-8', newline='') as writelist:
        write_handle = csv.writer(writelist)
        for org in org_list:
            repo_url = ('https://api.github.com/orgs/' + org
                        + '/repos?per_page=100&page=1')
            while repo_url:
                repoid_req = getGitHubapi(repo_url, PW_CSV, LOG_CSV)
                if not repoid_req:
                    # Request failed: give up on this org, move to the next.
                    break
                for repo in repoid_req.json():
                    repo_row = ghparse_row(
                        repo, "id", "full_name", "description", "fork",
                        "url", "created_at", "updated_at", "pushed_at",
                        "homepage", "size", "stargazers_count",
                        "watchers_count", "language", "has_issues",
                        "has_projects", "has_downloads", "has_wiki",
                        "has_pages", "forks_count", "mirror_url",
                        "archived", "disabled", "open_issues_count",
                        "license*name", "forks", "open_issues", "watchers",
                        "default_branch", "permissions")
                    write_handle.writerow(repo_row)
                repo_url = ghpaginate(repoid_req)
                # Progress indicator: next page URL (None when done).
                print(repo_url)
TRAIN_CSV = 'C:/Users/pmedappa/Dropbox/HEC/Project 5 - Roles and Coordination/Data/ML/Commit Creativity - Train3.csv' PW_CSV = 'C:\\Users\pmedappa\Dropbox\HEC\Python\PW\PW_GitHub3.csv' LOG_CSV = 'C:\\Data\\092019 CommitInfo\RepoCommit_log.csv' NEWREPO_xl = 'C:/Users/pmedappa/Dropbox/HEC/Project 5 - Roles and Coordination/Data/ML/New Commit Creativity - Train3.xlsx' with open(TRAIN_CSV, 'rt', encoding='latin-1') as rdobj: rd_repo = csv.reader(rdobj) df = pd.DataFrame() for row in rd_repo: sp_url = row[0].split('/') if len(sp_url) > 5: commit_url = 'https://api.github.com/repos/' + sp_url[ 3] + '/' + sp_url[4] + '/commits/' + sp_url[6] commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV) if commit_req: commit = commit_req.json() commit_row = ghparse_row(commit, "sha", "commit*author*date", "commit*message", "commit*comment_count", "commit*author*name", "commit*author*email", "commit*committer*date", "commit*url", "parents", "commit*verification", "stats",