def normal_search(language: str):
    out_path = 'out/' + language + '/links/{}.csv'
    utils.create_missing_dirs(out_path)
    pr_cnt = 0
    ulink = f'https://api.github.com/search/issues?q=language:{language}+is:pr+is:open'
    ulink = ulink + '&page={}&per_page=100'
    file_list = []
    for page_num in range(1, 11):
        resp = utils.send(ulink.format(page_num), tokens[0], 3)
        if not resp or resp.status_code != 200:
            break
        jresp = resp.json()
        if 'items' in jresp:
            for item in jresp['items']:
                if 'url' in item:
                    repo_name = get_repo_name(item['url'])
                    if not repo_name:
                        continue  # If everything works fine, this line won't be executed
                    link_files = item['url'].replace('/issues/', '/pulls/') + '/files\n'
                    file_list.append(link_files)
                    pr_cnt += 1
    if file_list:
        with open(out_path.format('link'), 'w') as outfile:
            outfile.writelines(file_list)
            outfile.flush()
    utils.logger.warning(f'pr count {pr_cnt}')
def get_top_100_stars():
    query = 'https://api.github.com/search/repositories?q=language:java&page=1&per_page=100'
    resp = utils.send(query, token, 3)
    if not resp:
        print("No response")
        exit(1)
    jresp = resp.json()
    utils.create_missing_dirs(path_top_100_stars)
    with open(path_top_100_stars, 'w') as out:
        json.dump(jresp, out)
def search_pr(language: str, start_date: str, end_date=''):
    '''
    Query pull requests created from start_date to end_date and save the links to their /files endpoints in a csv table
    :param language: repository language
    :param start_date: string in the format of YYYY-MM-DD
    :param end_date: string in the format of YYYY-MM-DD
    :return: the number of pull requests
    '''
    out_path = 'out/' + language + '/links/{}_{}.csv'
    utils.create_missing_dirs(out_path)
    pr_cnt = 0
    # ulink example: https://api.github.com/search/issues?q=language:Java+is:pr+is:open+created:2020-09-10..2020-09-15
    query = f'https://api.github.com/search/issues?q=language:{language}+is:pr+is:open+created:{start_date}'
    if end_date:
        query += f'..{end_date}'
    ulink = query + '&page={}&per_page=100'
    file_list = []
    for page_cnt in range(1, 11):
        # if pr_cnt >= MAX_EACH_NUM:
        #     break
        resp = utils.send(ulink.format(page_cnt), tokens[0], 3)
        if not resp or resp.status_code != 200:
            utils.logger.warning(
                f'[No response] {ulink.format(page_cnt)}\naccess_token={tokens[0]}'
            )
            break
        jresp = resp.json()
        if 'items' in jresp:
            for item in jresp['items']:
                if 'url' in item:
                    repo_name = get_repo_name(item['url'])
                    if not repo_name:
                        continue  # If everything works fine, this line won't be executed
                    # if get_repo_stars(repo_name) <= MIN_STARS:
                    #     continue
                    link_files = item['url'].replace('/issues/', '/pulls/') + '/files\n'
                    file_list.append(link_files)
                    pr_cnt += 1
    if file_list:
        with open(out_path.format(start_date, end_date), 'w') as outfile:
            outfile.writelines(file_list)
            outfile.flush()
    utils.logger.warning(f'pr count {pr_cnt}')
    return pr_cnt
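# The GitHub Search API caps each query at 1,000 results (at most 10 pages of
# 100 items), which is why the page loops above stop after page 10. Below is a
# minimal sketch of driving search_pr over short consecutive date windows to
# stay under that cap; crawl_by_windows, its window length, and the example
# dates are illustrative assumptions, not part of the original script.
from datetime import date, timedelta


def crawl_by_windows(language='Java',
                     first=date(2020, 9, 1),
                     last=date(2020, 9, 30),
                     window_days=5):
    """Call search_pr once per window of window_days days and sum the PR counts."""
    total = 0
    cur = first
    while cur <= last:
        nxt = min(cur + timedelta(days=window_days - 1), last)
        total += search_pr(language, cur.isoformat(), nxt.isoformat())
        cur = nxt + timedelta(days=1)
    return total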
def run():
    paths = glob.glob(f'{root}/**/files.json', recursive=True)
    context = Context()
    # context.enable_online_search()
    engine = DefaultEngine(context)
    re_repo = re.compile(r'PullRequests/java/files/(.+?)/pulls/(\d+)')
    for p in paths:
        # Skip paths that do not match the expected repo/pulls/<id> layout
        m = re_repo.search(p)
        if m:
            repo_name = m.groups()[0]
            pr_id = m.groups()[1]
            engine.context.update_repo_name(repo_name)
        else:
            continue
        patchset = get_modified_patchset(p)
        if patchset:
            pr_timer = Timer(repo_name + '-' + pr_id, logger=None)
            pr_timer.start()
            engine.visit(*patchset)
            bugs = engine.filter_bugs()
            if bugs:
                save_path = f'{report_path}/{repo_name}/{pr_id}'
                create_missing_dirs(save_path)
                with open(f'{save_path}/report.json', 'w') as out:
                    bugs_json = dict()
                    bugs_json['repo'] = repo_name
                    bugs_json['id'] = pr_id
                    bugs_json['total'] = len(bugs)
                    bugs_json['items'] = [bug.__dict__ for bug in bugs]
                    json.dump(bugs_json, out)
            pr_timer.stop()
    # Split the timings recorded by Timer into per-project and per-detector reports
    project_time_dict = dict()
    detector_time_dict = dict()
    for k, v in Timer.timers.items():
        if k not in DETECTOR_DICT:
            project_time_dict[k] = v
        else:
            detector_time_dict[k] = v
    with open(path.join(report_path, 'time_projects.json'), 'w') as logfile:
        json.dump(project_time_dict, logfile)
    with open(path.join(report_path, 'time_detectors.json'), 'w') as logfile:
        json.dump(detector_time_dict, logfile)
def sort_by_size():
    with open(path_top_100_stars, 'r') as f:
        jlist = json.load(f)
    jlist = jlist['items']
    sorted_list = sorted(jlist, key=lambda k: k.get('size', 0))
    simple_list = [{
        'html_url': repo['html_url'],
        'stargazers_count': repo['stargazers_count'],
        'size': repo['size']
    } for repo in sorted_list]
    utils.create_missing_dirs(path_sort_size)
    with open(path_sort_size, 'w') as out:
        json.dump(simple_list, out)
import glob
import re
from os import path
import json

from patterns.models.context import Context
from patterns.models.engine import DefaultEngine
from rparser import parse
from utils import create_missing_dirs
from timer import Timer
from gen_detectors import DETECTOR_DICT

root = 'PullRequests/java/files'
report_path = 'PullRequests/report'
create_missing_dirs(report_path)

RE_SHA = re.compile(r'https://github\.com/[^/]+/[^/]+/blob/(\w+)/')


def _get_sha(blob_url: str):
    try:
        m = RE_SHA.search(blob_url)
        if m:
            return m.groups()[0]
    except TypeError as e:  # "blob_url": null
        return ''


def report_diversity():
import utils
from config import token

if __name__ == '__main__':
    save_path = 'student_repos/'
    utils.create_missing_dirs(save_path)
    result_file_path = save_path + 'star_rst.txt'
    open(result_file_path, 'w').close()
    max_num = 20
    cur_num = 0
    link1 = 'https://api.github.com/search/repositories?q=course+project+language:java+stars:0..10+size:>=5000&per_page=100&page='
    # link1 = 'https://api.github.com/search/repositories?q=language:java+size:>=750000&sort=stars&order=desc&per_page=100&page='
    page_num = 0
    while page_num <= 10:
        page_num += 1
        tmp_path = save_path + 'star' + str(page_num) + '.json'
        # Reuse the cached page if it has already been downloaded
        if utils.exists_file(tmp_path):
            jresp = utils.load_json_from_file(tmp_path)
        else:
            resp = utils.send(link1 + str(page_num), token, 3)
            if not resp or resp.status_code != 200:
                break
            with open(tmp_path, 'w') as f:
                f.write(resp.text)
            jresp = resp.json()
        for item in jresp['items']: