def getcommitinfo(repoid, NEWREPO_xl):
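    """Fetch all commits for a repository ID, page by page, and append one parsed row per commit to the NEWREPO_xl dataframe."""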
    commit_url = "https://api.github.com/repositories/" + str(
        repoid) + "/commits?per_page=100"
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit,
                                         "sha",
                                         "commit*author*name",
                                         "commit*author*email",
                                         "commit*author*date",
                                         "commit*committer*name",
                                         "commit*committer*email",
                                         "commit*committer*date",
                                         "commit*comment_count",
                                         "commit*message",
                                         prespace=1)
                c_list = getmorecommitinfo(commit['url'])
                for e in c_list:
                    commit_row.append(e)
                # appendrowincsv(NEWREPO_CSV, commit_row)
                appendrowindf(NEWREPO_xl, commit_row)
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(
                    ["Error getting commit", commit_url, "UNKNOWN"])
            return 1
    return 0
Example #2
def main():
    """Get topics for each repo"""
    REPO_CSV = r'C:\Users\kmpoo\Dropbox\HEC\Project 2 -   License\EJIS\Data\ExportedISRDataCollab_LocUpdates 20190425.csv'
    NEWREPO_CSV = r'C:\Users\kmpoo\Dropbox\HEC\Project 2 -   License\EJIS\Data\FullData_20190517.csv'
    d_header = {'Accept': 'application/vnd.github.mercy-preview+json'}
    new_row = list()
    with open(NEWREPO_CSV, 'wt', encoding='utf-8', newline='') as writeobj:
        repo_writer = csv.writer(writeobj)
        with open(REPO_CSV, 'rt', encoding='utf-8') as fileobj:
            repo_struct = csv.reader(fileobj)
            for row in repo_struct:
                new_row = list(row)
                repo_id = row[0]
                repo_url = 'https://api.github.com/repositories/' + repo_id + '/topics'
                topic_req = getGitHubapi(repo_url,
                                         PW_CSV,
                                         LOG_CSV,
                                         header=d_header)
                if topic_req:
                    j_topics = topic_req.json()
                    new_row.append(j_topics['names'])
                    if j_topics['names']:
                        new_row.append(j_topics['names'][0])
                    else:
                        new_row.append("")
                else:
                    new_row.append("")
                    new_row.append("")
                repo_writer.writerow(new_row)
def get_name(repo_id):
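    """Look up a repository by ID and return (name, owner_login), or (None, None) on failure."""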
    repo_url = "https://api.github.com/repositories/"+str(repo_id)
    repo_req = getGitHubapi(repo_url,PW_CSV,LOG_CSV)
    if repo_req:
        repo_json = repo_req.json()
        return repo_json['name'], repo_json['owner']['login']
    else:
        return None, None
def getcommitinfo(repoid, NEWREPO_xl, owner, name):
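    """Fetch commits for a repository, retrying via the /repos/<owner>/<name> endpoint if the ID-based URL fails on the first page."""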
    commit_url = "https://api.github.com/repositories/" + str(
        repoid) + "/commits?per_page=100"
    while commit_url:
        commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if (commit_req is None) and (
                commit_url == "https://api.github.com/repositories/" +
                str(repoid) + "/commits?per_page=100"):
            print("Repo ID did not work. Trying owner/name")
            commit_url = "https://api.github.com/repos/" + str(
                owner) + "/" + str(name) + "/commits?per_page=100"
            commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit,
                                         "sha",
                                         "commit*author*name",
                                         "commit*author*email",
                                         "commit*author*date",
                                         "commit*committer*name",
                                         "commit*committer*email",
                                         "commit*committer*date",
                                         "commit*comment_count",
                                         "commit*message",
                                         prespace=1)
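                # Fetch per-commit stats and file details only for commits authored before 2016.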
                commit_datetime = datetime.strptime(
                    commit['commit']['author']['date'], "%Y-%m-%dT%H:%M:%SZ")
                if commit_datetime.year < 2016:
                    c_list = getmorecommitinfo(commit['url'])
                    for e in c_list:
                        commit_row.append(e)
                appendrowindf(NEWREPO_xl, commit_row)
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ", commit_url)
            with open(LOG_CSV, 'at', encoding='utf-8', newline="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(
                    ["Error getting commit", commit_url, "UNKNOWN"])
            return 1
    return 0
Example #5
def getcommitinfo(repoid, write_handle):
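    """Fetch all commits for a repository ID and write one parsed row per commit through write_handle."""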
    commit_url = "https://api.github.com/repositories/"+str(repoid)+"/commits?per_page=100"
    while commit_url:
        commit_req = getGitHubapi(commit_url,PW_CSV,LOG_CSV)
        if commit_req:
            commit_json = commit_req.json()
            for commit in commit_json:
                commit_row = ghparse_row(commit,"sha", "commit*author*name","commit*author*email","commit*author*date", "commit*committer*name","commit*committer*email","commit*committer*date","commit*message","commit*comment_count","commit*verification","url","parents", prespace = 1)
                write_handle.writerow(commit_row)  
            commit_url = ghpaginate(commit_req)
        else:
            print("Error getting commit info ",commit_url)
            with open(LOG_CSV, 'at', encoding = 'utf-8', newline ="") as loglist:
                log_handle = csv.writer(loglist)
                log_handle.writerow(["Error getting commit",commit_url,"UNKNOWN"])
            return
def getmorecommitinfo(c_url):
    """Get data on individual commit"""
    commit_row = []
    commit_res = getGitHubapi(c_url, PW_CSV, LOG_CSV)

    if commit_res is None:
        print("No commit information available", c_url)
        # Seven empty placeholders to keep downstream columns aligned
        # (stats total, stats additions, parents, file count, names, statuses, patches).
        return [""] * 7
    commit_json = commit_res.json()
    commit_row = ghparse_row(commit_json,
                             "stats*total",
                             "stats*additions",
                             prespace=0)
    parents = commit_json['parents']
    commit_row.append(len(parents))

    files = commit_json['files']
    f_name = []
    f_stat = []
    f_pat = []
    f_no = 0

    for file in files:
        f_no = f_no + 1
        f_name.append(file['filename'])
        f_stat.append(file['status'])
        if "patch" in file:
            f_pat.append(file['patch'])
        else:
            f_pat.append("")

    commit_row.append(f_no)
    commit_row.append(f_name)
    commit_row.append(f_stat)
    commit_row.append(f_pat)
    return commit_row
def main():
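    """Search GitHub for users with more than five repos, by location and account-creation period, and buffer login/id rows."""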
    global DF_REPO 
    global DF_COUNT
    search_key = ['us']  # ,'usa','states','america','canada','california','ca']
    period = ['2018-05-31..2019-01-01','2018-01-01..2018-06-01',
              '2017-05-31..2018-01-01','2017-01-01..2017-06-01',
              '2016-05-31..2017-01-01','2016-01-01..2016-06-01',
              '2015-05-31..2016-01-01','2015-01-01..2015-06-01',
              '2014-05-31..2015-01-01','2014-01-01..2014-06-01',
              '2013-05-31..2014-01-01','2013-01-01..2013-06-01',
              '2012-05-31..2013-01-01','2012-01-01..2012-06-01',
              '2011-05-31..2012-01-01','2011-01-01..2011-06-01',
              '2010-05-31..2011-01-01','2010-01-01..2010-06-01',
              '2009-05-31..2010-01-01','2009-01-01..2009-06-01',
              '2008-05-31..2009-01-01','2008-01-01..2008-06-01',]

    for loc in search_key:
        for p in period:
            # Query: users with >5 repos at this location, created in this period
            # ('%3E' and '%3A' are URL-encoded '>' and ':').
            search_url = "https://api.github.com/search/users?q=repos:%3E5+location%3A" + loc + "+created:" + p + "+type:user&per_page=100"
            
            while search_url:
                user_res = getGitHubapi(search_url, PW_CSV, LOG_CSV)
                if user_res is None:
                    break
                user_json = user_res.json()

                if int(user_json['total_count']) > 1000:
                    with open(LOG_CSV, 'at', encoding='utf-8', newline="") as logobj:
                        log = csv.writer(logobj)
                        log.writerow(["Search Results Exceeds 1000",
                                      search_url, user_json['total_count']])
                    print(search_url, "  ", user_json['total_count'])
                for user in user_json["items"]:
                    user_row = ghparse_row(user, "login", "id")
                    appendrowindf(user_xl, user_row)
                search_url = ghpaginate(user_res)
                
    if DF_COUNT < MAX_ROWS_PERWRITE:
        # Flush any remaining buffered rows to the Excel file.
        df = pd.read_excel(user_xl, header=0)
        df = pd.concat([df, DF_REPO], ignore_index=True)
        df.to_excel(user_xl, index=False)
Example #8
def getrepoinfo(NEWREPO_CSV):
    """Update the repo inforation with,  PUSHED,STARS, SUBSCRIBERS,  FORKS, SIZE, LICENCE """
    with open(NEWREPO_CSV, 'wt', encoding='utf-8', newline='') as writelist:
        write_handle = csv.writer(writelist)
        for org in org_list:
            repo_url = 'https://api.github.com/orgs/' + org + '/repos?per_page=100&page=1'
            while repo_url:
                repoid_req = getGitHubapi(repo_url, PW_CSV, LOG_CSV)
                # print(repoid_req.headers['link'])
                if repoid_req:
                    repo_json = repoid_req.json()
                    for repo in repo_json:
                        repo_row = ghparse_row(repo,"id", "full_name","description","fork","url","created_at","updated_at","pushed_at","homepage","size","stargazers_count","watchers_count","language","has_issues","has_projects","has_downloads","has_wiki","has_pages","forks_count","mirror_url","archived","disabled","open_issues_count","license*name","forks","open_issues","watchers","default_branch","permissions")
                        write_handle.writerow(repo_row)
                        # get commits
                        #getcommitinfo(repo['id'],write_handle)
                        #end get commits
                    repo_url = ghpaginate(repoid_req)
                else:
                    break
                print(repo_url)
TRAIN_CSV = 'C:/Users/pmedappa/Dropbox/HEC/Project 5 - Roles and Coordination/Data/ML/Commit Creativity - Train3.csv'
PW_CSV = r'C:\Users\pmedappa\Dropbox\HEC\Python\PW\PW_GitHub3.csv'
LOG_CSV = r'C:\Data\092019 CommitInfo\RepoCommit_log.csv'
NEWREPO_xl = 'C:/Users/pmedappa/Dropbox/HEC/Project 5 - Roles and Coordination/Data/ML/New Commit Creativity - Train3.xlsx'
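# Read commit URLs from TRAIN_CSV and fetch each commit's details from the GitHub API.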

with open(TRAIN_CSV, 'rt', encoding='latin-1') as rdobj:
    rd_repo = csv.reader(rdobj)
    df = pd.DataFrame()
    for row in rd_repo:
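        # Expect URLs of the form https://github.com/<owner>/<repo>/commit/<sha>.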
        sp_url = row[0].split('/')
        if len(sp_url) > 6:
            commit_url = 'https://api.github.com/repos/' + sp_url[
                3] + '/' + sp_url[4] + '/commits/' + sp_url[6]

            commit_req = getGitHubapi(commit_url, PW_CSV, LOG_CSV)
            if commit_req:
                commit = commit_req.json()

                commit_row = ghparse_row(commit,
                                         "sha",
                                         "commit*author*date",
                                         "commit*message",
                                         "commit*comment_count",
                                         "commit*author*name",
                                         "commit*author*email",
                                         "commit*committer*date",
                                         "commit*url",
                                         "parents",
                                         "commit*verification",
                                         "stats",