def get_repo_PRlist(repo, type, renew):
    """Fetch the list of pulls/issues/branches/forks for *repo* from GitHub.

    Results are cached on disk under LOCAL_DATA_PATH; the cache is reused
    unless *renew* is true or the cached file cannot be read.

    Args:
        repo: "owner/name" repository slug.
        type: singular entity name — 'pull', 'issue', 'branch' or 'fork'.
        renew: when true, ignore any cached file and re-fetch.

    Returns:
        The decoded JSON list returned by the GitHub API.
    """
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    # todo: could be extended to analyze forks in the future
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
    if (os.path.exists(save_path)) and (not renew):
        print("read from local files and return")
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    print('files does not exist in local disk, start to fetch new list for ', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('repos/%s/%ss' % (repo, type), state='all', paginate=True)
    else:
        if type == 'branch':
            type = 'branche'  # API endpoint is /branches; an 's' is appended below
        ret = api.request('repos/%s/%ss' % (repo, type), True)
    localfile.write_to_file(save_path, ret)
    return ret
def fetch_file_list(pull, renew=False):
    """Return the parsed per-file diffs of *pull*, using an on-disk cache.

    Raises Exception('too big', html_url) when the PR's file listing spills
    onto a third API page, i.e. the PR changes too many files to analyze.

    NOTE(review): a later definition of fetch_file_list(repo, num, ...) in
    this module shadows this one, so this version is effectively dead code —
    confirm which is actually in use.
    """
    repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/' + num + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            # cache unreadable: fall through and re-fetch
            pass
    # t = api.get('repos/%s/pulls/%s/files?page=3' % (repo, num))
    # Probe page 3 of the file listing: non-empty means too many files.
    t = api.request('repos/%s/pulls/%s/files?page=3' % (repo, num))
    file_list = []
    if len(t) > 0:
        raise Exception('too big', pull['html_url'])
    else:
        li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
        # li = api.request( 'repos/%s/pulls/%s/files' % (repo, num), True)
        time.sleep(0.8)  # crude rate limiting between API calls
        for f in li:
            # skip huge files (>5000 changes) and entries lacking a patch
            if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
                file_list.append(parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
def get_repo_info_forPR(repo, type, renew):
    """Incrementally refresh the PR/issue list for *repo*.

    When a cached pull_list.json exists, pages of results are fetched until
    a PR number below the oldest still-open PR (getOldOpenPRs) is seen, and
    the collected newer PRs replace the stale cache entries. Otherwise the
    full list is fetched from scratch via get_repo_PRlist().
    """
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    pullListfile = pathlib.Path(init.local_pr_data_dir + repo + '/pull_list.json')
    if pullListfile.exists():
        # Lowest PR number we still need to re-check; 0 means "check all".
        tocheck_pr = getOldOpenPRs(repo)
        print("tocheck_pr " + str(tocheck_pr))
        if (tocheck_pr is None):
            tocheck_pr = 0
        save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
        if type == 'fork':
            save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
        if (os.path.exists(save_path)) and (not renew):
            try:
                return localfile.get_file(save_path)
            except:
                # cache unreadable: fall through and re-fetch
                pass
        print('start fetch new list for ', repo, type)
        if (type == 'pull') or (type == 'issue'):
            page_index = 1
            while (True):
                ret = api.requestPR('repos/%s/%ss' % (repo, type), state='all', page=page_index)
                # NOTE(review): numPR is reset to one page's worth on every
                # iteration, so the `numPR += init.numPRperPage` below never
                # has a lasting effect; in practice termination relies on an
                # empty page, a short first page, or the tocheck_pr cutoff —
                # confirm this is the intended behavior.
                numPR = init.numPRperPage
                if (len(ret) > 0):
                    for pr in ret:
                        # if (pr['number'] >= tocheck_pr):
                        if (pr['number'] >= tocheck_pr):
                            filtered_result.append(pr)
                        else:
                            # Reached PRs older than the cutoff: done.
                            print('get all ' + str(len(filtered_result)) + ' prs')
                            localfile.replaceWithNewPRs(save_path, filtered_result)
                            return filtered_result
                    if (len(filtered_result) < numPR):
                        # Short page: no further results after this one.
                        print('get all ' + str(len(filtered_result)) + ' prs -- after page ' + str(page_index))
                        localfile.replaceWithNewPRs(save_path, filtered_result)
                        return filtered_result
                    else:
                        page_index += 1
                        numPR += init.numPRperPage
                else:
                    print("get pulls failed")
                    return filtered_result
        else:
            if type == 'branch':
                type = 'branche'  # endpoint is /branches; 's' appended below
            ret = api.request('repos/%s/%ss' % (repo, type), True)
            localfile.write_to_file(save_path, ret)
    else:
        print('pull list does not exist, get from scratch')
        ret = get_repo_PRlist(repo, type, renew)
    return ret
def get_repo_info_forPR_experiment(repo, type, renew):
    """Experiment variant: return the cached pull list for *repo* if present.

    NOTE(review): on a cache miss (or renew=True, or an unreadable cache)
    this function falls off the end and implicitly returns None — the fetch
    path appears unfinished; confirm intended behavior before using.
    """
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/pull_list.json'
    if (os.path.exists(save_path)) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            # cache unreadable: fall through (currently returns None)
            pass
def get_pull(repo, num, renew=False):
    """Return the GitHub API JSON object for a single pull request.

    Uses an on-disk cache under LOCAL_DATA_PATH; re-fetches when *renew*
    is true or the cached file cannot be read.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/api.json' % (repo, num)
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    r = api.request('repos/%s/pulls/%s' % (repo, num))
    time.sleep(3.0)  # crude rate limiting between API calls
    localfile.write_to_file(save_path, r)
    return r
def get_pull_commit(pull, renew=False):
    """Return the commit list of *pull*, cached on disk.

    NOTE(review): shadowed by a later definition of get_pull_commit in this
    module (which uses the keyword-argument api.request form), so this
    version is effectively dead code — confirm before relying on it.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            # cache unreadable: fall through and re-fetch
            pass
    commits = api.request('GET', pull['commits_url'], True)
    time.sleep(0.7)  # crude rate limiting between API calls
    localfile.write_to_file(save_path, commits)
    return commits
def get_pull_commit(pull, renew=False):
    """Return the list of commits belonging to *pull* (cached on disk).

    Re-fetches when *renew* is true or the cached file cannot be read.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    # commits_url is absolute; strip the host so api.request gets a path.
    commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''),
                          paginate=True, state='all')
    time.sleep(0.7)  # crude rate limiting between API calls
    localfile.write_to_file(save_path, commits)
    return commits
def get_pr_commit(repo, pr_id, renew=False):
    """Return the commits of PR *pr_id* in *repo*, cached on disk.

    The cache is only trusted when the file is larger than 2 bytes,
    i.e. contains more than an empty JSON list.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (repo, pr_id)
    commit_url = 'repos/%s/pulls/%s/commits' % (repo, pr_id)
    if os.path.exists(save_path) and (not renew) and (os.stat(save_path).st_size > 2):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    api = GitHubAPI()
    # commit_url is built above as a relative API path, so the old
    # host-stripping .replace('https://api.github.com/', '') copied from
    # get_pull_commit was a no-op and has been removed.
    commits = api.request(commit_url, paginate=True, state='all')
    time.sleep(0.7)  # crude rate limiting between API calls
    localfile.write_to_file(save_path, commits)
    return commits
def fetch_commit(url, renew=False):
    """Fetch a single commit by API *url* and return its parsed file diffs.

    Results are cached on disk keyed by the URL path. Files without a
    'patch' entry (e.g. binary files) are skipped.
    """
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/%s.json' % url.replace('https://api.github.com/repos/', '')
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    c = api.request(url)
    time.sleep(0.7)  # crude rate limiting between API calls
    file_list = [fetch_raw_diff.parse_diff(f['filename'], f['patch'])
                 for f in c['files'] if 'patch' in f]
    localfile.write_to_file(save_path, file_list)
    return file_list
def fetch_file_list(repo, num, renew=False):
    """Return parsed per-file diffs for PR *num* in *repo* (cached on disk).

    Files with more than 5000 changed lines, or without both a filename and
    a patch, are skipped.
    """
    api = GitHubAPI()
    # repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    outfile_prefix = init.local_pr_data_dir + repo + "/" + str(num)
    save_path = outfile_prefix + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
    time.sleep(0.8)  # crude rate limiting between API calls
    file_list = [fetch_raw_diff.parse_diff(f['filename'], f['patch'])
                 for f in li
                 if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f)]
    localfile.write_to_file(save_path, file_list)
    return file_list
def get_another_pull(pull, renew=False):
    """Collect PR/issue numbers cited in *pull*'s comments and body (cached).

    NOTE(review): shadowed by a later definition of get_another_pull in this
    module, so this version is effectively dead code — confirm before
    relying on it.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            # cache unreadable: fall through and re-fetch
            pass
    comments_href = pull["_links"]["comments"]["href"]
    comments = api.request('GET', comments_href, True)
    time.sleep(0.7)  # crude rate limiting between API calls
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))
    result = list(set(candidates))
    localfile.write_to_file(save_path, result)
    return result
def get_another_pull(pull, renew=False):
    """Collect the PR/issue numbers referenced in *pull*'s body and comments.

    Returns a deduplicated list, cached on disk per PR.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    comments_href = pull["_links"]["comments"]["href"]
    # found cites in comments, but checking events is easier.
    comments = api.request(comments_href.replace('https://api.github.com/', ''),
                           paginate=True, state='all')
    time.sleep(0.7)  # crude rate limiting between API calls
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    # A PR body can be None (empty description) — sibling code elsewhere in
    # this module checks `p["body"] is not None` — so guard before parsing.
    if pull["body"] is not None:
        candidates.extend(get_pr_and_issue_numbers(pull["body"]))
    result = list(set(candidates))
    localfile.write_to_file(save_path, result)
    return result
def get_repo_info(repo, type, renew=False):
    """Fetch the full pull/issue/branch/fork list for *repo* (cached).

    For pulls and issues, closed and open items are fetched separately and
    concatenated; other types are fetched in a single listing call.
    """
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'
    if (os.path.exists(save_path)) and (not renew):
        try:
            return localfile.get_file(save_path)
        except Exception:
            # Cache unreadable/corrupt: fall through and re-fetch.
            pass
    print('start fetch new list for ', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('GET', 'repos/%s/%ss?state=closed' % (repo, type), True)
        ret.extend(api.request('GET', 'repos/%s/%ss?state=open' % (repo, type), True))
    else:
        if type == 'branch':
            type = 'branche'  # endpoint is /branches; an 's' is appended below
        ret = api.request('GET', 'repos/%s/%ss' % (repo, type), True)
    localfile.write_to_file(save_path, ret)
    return ret
def get_feature_vector(data, label, renew=False, out=None):
    """Build (X, y) feature vectors for the PR pairs listed in *data*.

    Args:
        data: path to a text file with one "repo num1 num2" pair per line.
        label: class label assigned to every pair in the file.
        renew: when false and cached vectors exist, return them directly.
        out: optional suffix appended to the output file prefix.

    Returns:
        Tuple (X, y); also persisted to <out>_X.json / <out>_y.json and a
        human-readable <out>_X_and_Y.txt log.
    """
    print('Model Data Input=', data)
    default_path = data.replace('.txt', '') + '_feature_vector'
    out = default_path if out is None else default_path + '_' + out
    X_path, y_path = out + '_X.json', out + '_y.json'
    if os.path.exists(X_path) and os.path.exists(y_path) and (not renew):
        print('warning: feature vector already exists!', out)
        X = localfile.get_file(X_path)
        y = localfile.get_file(y_path)
        return X, y

    X, y = [], []

    # Group the pairs by repository: p[repo] = [(num1, num2, label), ...]
    p = {}
    with open(data) as f:
        all_pr = f.readlines()
    pr_len = len(all_pr)
    count = 0
    for l in all_pr:
        print(str(count / pr_len) + ' pr:' + l)
        r, n1, n2 = l.strip().split()
        if 'msr_pairs' not in data:
            print('check if there are too much texts in the PR description.. such as template..')
            if check_large(get_pull(r, n1)) or check_large(get_pull(r, n2)):
                continue
        p.setdefault(r, []).append((n1, n2, label))
        count = count + 1
    print('all=', len(all_pr))

    # Context manager guarantees the log file is closed even if get_sim()
    # raises part-way through (the original leaked the handle on error).
    with open(out + '_X_and_Y.txt', 'w+') as out_file:
        # Pre-warm the NLP model for every repo before scoring any pair.
        for r in p:
            init_model_with_repo(r)
        for r in p:
            print('Start running on', r)
            # init NLP model
            init_model_with_repo(r)
            print('pairs num=', len(p[r]))
            cnt = 0
            for z in p[r]:
                x0, y0 = get_sim(r, z[0], z[1]), z[2]
                X.append(x0)
                y.append(y0)
                print(r, z[0], z[1], x0, y0, file=out_file)
                cnt += 1
                if cnt % 100 == 0:
                    print('current:', r, cnt)

    # save to local
    localfile.write_to_file(X_path, X)
    localfile.write_to_file(y_path, y)
    return (X, y)
def random_pairs():
    """Randomly select a (repo, pr_x, pr_y) pair of distinct PRs.

    Picks a repo from a fixed candidate list, builds (or loads) a filtered
    list of usable PR numbers for it, then draws two distinct PRs by
    different authors that have not been selected before and are not in the
    known-duplicate set msr_d. Returns [repo, x, y].
    """
    global select_set
    # repos = os.listdir('/DATA/luyao/pr_data')
    # choose = ['saltstack/salt'] # training repos
    # choose = ['mozilla-b2g/gaia', 'twbs/bootstrap', 'scikit-learn/scikit-learn', 'rust-lang/rust', 'servo/servo', 'pydata/pandas', 'saltstack/salt', 'nodejs/node', 'symfony/symfony-docs', 'zendframework/zf2', 'symfony/symfony', 'kubernetes/kubernetes'] # testing repos
    print("randomly pick a repo...")
    choose = [
        'cocos2d/cocos2d-x', 'dotnet/corefx', 'django/django', 'angular/angular.js',
        'JuliaLang/julia', 'ceph/ceph', 'joomla/joomla-cms', 'facebook/react',
        'hashicorp/terraform', 'rails/rails', 'docker/docker', 'elastic/elasticsearch',
        'emberjs/ember.js', 'ansible/ansible'
    ]
    find = False
    while not find:
        # random a repo
        while True:
            try:
                '''
                repo = repos[random.randint(0, len(repos) - 1)]
                repo_ = os.listdir('/DATA/luyao/pr_data/' + repo)[0]
                repo = repo + '/' + repo_
                '''
                repo = choose[random.randint(0, len(choose) - 1)]
                print("..." + repo)
                break
            except:
                continue
        ok_file = '/DATA/luyao/pr_data/%s/list_for_random_generate_c1.json' % repo
        if all_pr_flag:
            ok_file = ok_file.replace('_c1', '_all')
        if os.path.exists(ok_file):
            print(ok_file + " exists!")
            nums = localfile.get_file(ok_file)
        else:
            print(ok_file + " file does not exist ...")
            nums = os.listdir('/DATA/luyao/pr_data/%s' % repo)
            print(repo + "has " + str(len(nums)) + " PRs in total on GitHub")

            # filter out config file and readme file
            def like_localize(p):
                if 'confi' in p["title"].lower():
                    return True
                if 'readme' in p["title"].lower():
                    return True
                return False

            # PRs with very short titles/bodies carry too little text to compare
            def too_small(p):
                if len(p["title"]) <= 20:
                    return True
                if (p["body"] is not None) and (len(p["body"]) <= 20):
                    return True
                return False

            new_num = []
            cnt, tot_cnt = 0, len(nums)
            # todo: what is loop about?
            print("start to parse every PR...")
            for x in nums:
                cnt += 1
                # progress bar...
                if cnt % 100 == 0:
                    print(1.0 * cnt / tot_cnt)
                if x.isdigit():
                    p = get_pull(repo, x)
                    # print('check', repo, x)
                    # Keep merged (or all, with all_pr_flag), non-huge,
                    # non-trivial, non-config/readme PRs with 1..10 files.
                    if (all_pr_flag or (p["merged_at"] is not None)) and (not check_large(p)) and \
                            (not too_small(p)) and (not like_localize(p)):
                        len_f = len(fetch_pr_info(p))
                        if (len_f > 0) and (len_f <= 10):
                            new_num.append(x)
            print("length of new_nums " + str(len(new_num)))
            nums = new_num
            print("length of nums: " + str(len(nums)))
            localfile.write_to_file(ok_file, nums)
        l = len(nums)
        # print(repo, l)
        if l <= 100:
            raise Exception('too small', repo)
            # NOTE(review): this `continue` is unreachable after the raise —
            # likely a leftover from when the raise was disabled; confirm.
            continue
        if l <= 1000:
            # small repos are kept only 1 time in 4 to balance sampling
            if random.randint(0, 3) > 0:
                continue
        ti = 0
        while not find:
            ti += 1
            # if ti > 100:
            #     break
            if l > 0:
                x = nums[random.randint(0, l - 1)]
                y = nums[random.randint(0, l - 1)]
                # skip known duplicate pairs and already-selected pairs
                if ((repo, x, y) in msr_d) or ((repo, y, x) in msr_d):
                    continue
                if (repo, x, y) in select_set:
                    continue
                try:
                    if (x != y) and (x.isdigit()) and (y.isdigit()):
                        p1 = get_pull(repo, x)
                        p2 = get_pull(repo, y)
                        # print(repo, x, y)
                        # require different authors for a negative pair
                        if p1["user"]["id"] != p2["user"]["id"]:
                            select_set.add((repo, x, y))
                            select_set.add((repo, y, x))
                            find = True
                            break
                except:
                    print("PR 404")
                    pass
    return [repo, x, y]