Example #1
0
def normal_search(language: str):
    out_path = 'out/' + language + '/links/{}.csv'
    utils.create_missing_dirs(out_path)
    pr_cnt = 0

    ulink = f'https://api.github.com/search/issues?q=language:{language}+is:pr+is:open'
    ulink = ulink + '&page={}&per_page=100'

    file_list = []
    for page_num in range(1, 11):
        resp = utils.send(ulink.format(page_num), tokens[0], 3)
        if not resp or resp.status_code != 200:
            break
        jresp = resp.json()

        if 'items' in jresp:
            for item in jresp['items']:
                if 'url' in item:
                    repo_name = get_repo_name(item['url'])
                    if not repo_name:
                        continue  # only reached when the URL cannot be parsed into a repo name

                    link_files = item['url'].replace('/issues/',
                                                     '/pulls/') + '/files\n'
                    file_list.append(link_files)
                    pr_cnt += 1

    if file_list:
        with open(out_path.format('link'), 'w') as outfile:
            outfile.writelines(file_list)
            outfile.flush()

    utils.logger.warning(f'pr count {pr_cnt}')
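Both search functions in this excerpt call a get_repo_name helper that is not shown. The sketch below is only an illustration of what it might do, assuming issue-API URLs of the form https://api.github.com/repos/{owner}/{repo}/issues/{number}; the regex and the empty-string fallback are assumptions, not the project's actual implementation.

import re

# Hypothetical helper: pull "owner/repo" out of an issue API URL.
# Returns '' when the URL does not match, so callers can skip the item.
_RE_REPO = re.compile(r'https://api\.github\.com/repos/([^/]+/[^/]+)/issues/\d+')


def get_repo_name(issue_url: str) -> str:
    m = _RE_REPO.search(issue_url or '')
    return m.group(1) if m else ''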
Example #2
0
def get_top_100_stars():
    query = 'https://api.github.com/search/repositories?q=language:java&page=1&per_page=100'
    resp = utils.send(query, token, 3)
    if not resp:
        print("No response")
        exit(1)

    jresp = resp.json()

    utils.create_missing_dirs(path_top_100_stars)
    with open(path_top_100_stars, 'w') as out:
        json.dump(jresp, out)
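utils.send appears in every request above but is not part of this excerpt. From the call sites (URL, access token, retry count; the result exposes .status_code, .json() and .text), it is presumably a thin wrapper around requests. A minimal sketch under that assumption:

import time

import requests


def send(url: str, access_token: str, retries: int = 3):
    """Hypothetical stand-in for utils.send: GET with token auth and simple retries.

    Returns the requests.Response on success, or None if every attempt raises.
    """
    headers = {'Authorization': f'token {access_token}'} if access_token else {}
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off before retrying
    return None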
Example #3
0
def search_pr(language: str, start_date: str, end_date=''):
    '''
    Query pull requests created from start_date to end_date and save their /files links to a CSV table.
    :param language: repository language
    :param start_date: date string in the format YYYY-MM-DD
    :param end_date: date string in the format YYYY-MM-DD; if empty, the created filter is open-ended
    :return: the number of pull requests collected
    '''
    out_path = 'out/' + language + '/links/{}_{}.csv'
    utils.create_missing_dirs(out_path)
    pr_cnt = 0

    # ulink example: https://api.github.com/search/issues?q=language:Java+is:pr+is:open+created:2020-09-10..2020-09-15
    query = f'https://api.github.com/search/issues?q=language:{language}+is:pr+is:open+created:{start_date}'
    if end_date:
        query += f'..{end_date}'
    ulink = query + '&page={}&per_page=100'

    file_list = []
    for page_cnt in range(1, 11):
        # if pr_cnt >= MAX_EACH_NUM:
        #     break

        resp = utils.send(ulink.format(page_cnt), tokens[0], 3)
        if not resp or resp.status_code != 200:
            utils.logger.warning(
                f'[No response] {ulink.format(page_cnt)}\naccess_token={tokens[0]}'
            )
            break
        jresp = resp.json()

        if 'items' in jresp:
            for item in jresp['items']:
                if 'url' in item:
                    repo_name = get_repo_name(item['url'])
                    if not repo_name:
                        continue  # only reached when the URL cannot be parsed into a repo name
                    # if get_repo_stars(repo_name) <= MIN_STARS:
                    #     continue

                    link_files = item['url'].replace('/issues/',
                                                     '/pulls/') + '/files\n'
                    file_list.append(link_files)
                    pr_cnt += 1

    if file_list:
        with open(out_path.format(start_date, end_date), 'w') as outfile:
            outfile.writelines(file_list)
            outfile.flush()

    utils.logger.warning(f'pr count {pr_cnt}')
    return pr_cnt
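The loop above requests at most 10 pages of 100 items, which matches the 1,000-result cap of the GitHub search API, so larger collections have to be built from several date windows. The driver below is only a usage sketch built on search_pr as defined above; the week-sized window is an arbitrary choice.

from datetime import date, timedelta


def crawl_prs_by_week(language: str, first_day: date, last_day: date) -> int:
    """Illustrative driver: run search_pr over consecutive one-week windows."""
    total = 0
    cur = first_day
    while cur <= last_day:
        window_end = min(cur + timedelta(days=6), last_day)
        total += search_pr(language, cur.isoformat(), window_end.isoformat())
        cur = window_end + timedelta(days=1)
    return total

# e.g. crawl_prs_by_week('Java', date(2020, 9, 1), date(2020, 9, 30))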
def run():
    paths = glob.glob(f'{root}/**/files.json', recursive=True)

    context = Context()
    # context.enable_online_search()
    engine = DefaultEngine(context)

    re_repo = re.compile(r'PullRequests/java/files/(.+?)/pulls/(\d+)')

    for p in paths:
        m = re_repo.search(p)
        if m:
            repo_name = m.groups()[0]
            pr_id = m.groups()[1]
            engine.context.update_repo_name(repo_name)
        else:
            continue

        patchset = get_modified_patchset(p)
        if patchset:
            pr_timer = Timer(repo_name + '-' + pr_id, logger=None)
            pr_timer.start()
            engine.visit(*patchset)

            bugs = engine.filter_bugs()
            if bugs:
                save_path = f'{report_path}/{repo_name}/{pr_id}'
                create_missing_dirs(save_path)
                with open(f'{save_path}/report.json', 'w') as out:
                    bugs_json = dict()
                    bugs_json['repo'] = repo_name
                    bugs_json['id'] = pr_id
                    bugs_json['total'] = len(bugs)
                    bugs_json['items'] = [bug.__dict__ for bug in bugs]
                    json.dump(bugs_json, out)
            pr_timer.stop()

    project_time_dict = dict()
    detector_time_dict = dict()
    for k, v in Timer.timers.items():
        if k not in DETECTOR_DICT:
            project_time_dict[k] = v
        else:
            detector_time_dict[k] = v
    with open(path.join(report_path, 'time_projects.json'), 'w') as logfile:
        json.dump(project_time_dict, logfile)
    with open(path.join(report_path, 'time_detectors.json'), 'w') as logfile:
        json.dump(detector_time_dict, logfile)
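Each report.json written by run() has the shape {'repo', 'id', 'total', 'items'}. As a usage sketch (the aggregation function itself is illustrative, not part of the project), the reports can be rolled up per repository like this:

import glob
import json


def summarize_reports(report_root: str = 'PullRequests/report') -> dict:
    """Illustrative aggregation: count reported bugs per repository."""
    per_repo = {}
    for rp in glob.glob(f'{report_root}/**/report.json', recursive=True):
        with open(rp) as f:
            report = json.load(f)
        per_repo[report['repo']] = per_repo.get(report['repo'], 0) + report['total']
    return per_repo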
Example #5
0
def sort_by_size():
    with open(path_top_100_stars, 'r') as f:
        jlist = json.load(f)

    jlist = jlist['items']

    sorted_list = sorted(jlist, key=lambda k: k.get('size', 0))

    simple_list = [{
        'html_url': repo['html_url'],
        'stargazers_count': repo['stargazers_count'],
        'size': repo['size']
    } for repo in sorted_list]

    utils.create_missing_dirs(path_sort_size)
    with open(path_sort_size, 'w') as out:
        json.dump(simple_list, out)
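sort_by_size() leaves a JSON list of {'html_url', 'stargazers_count', 'size'} records ordered by repository size. A small illustrative consumer (the function name is not from the project; path_sort_size is whatever path the config assigns):

import json


def print_smallest_repos(sorted_json_path: str, n: int = 10):
    """Illustrative: print the n smallest repositories from the sort_by_size() output."""
    with open(sorted_json_path) as f:
        repos = json.load(f)
    for repo in repos[:n]:
        # The GitHub API reports repository size in kilobytes.
        print(f"{repo['size']:>8} KB  {repo['stargazers_count']:>6} stars  {repo['html_url']}")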
import glob
import re
from os import path
import json

from patterns.models.context import Context
from patterns.models.engine import DefaultEngine
from rparser import parse
from utils import create_missing_dirs
from timer import Timer
from gen_detectors import DETECTOR_DICT

root = 'PullRequests/java/files'
report_path = 'PullRequests/report'
create_missing_dirs(report_path)

RE_SHA = re.compile(r'https://github\.com/[^/]+/[^/]+/blob/(\w+)/')


def _get_sha(blob_url: str):
    """Extract the commit SHA from a GitHub blob URL; return '' if it cannot be found."""
    try:
        m = RE_SHA.search(blob_url)
        if m:
            return m.groups()[0]
    except TypeError:
        # Some file entries come with "blob_url": null
        pass
    return ''
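As an illustration of what _get_sha extracts (the URL below is made up):

# _get_sha('https://github.com/apache/commons-lang/blob/0a1b2c3d/src/Main.java')
#     -> '0a1b2c3d'
# _get_sha(None)
#     -> ''   (the TypeError branch handles entries where "blob_url" is null)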


def report_diversity():
import utils
from config import token

if __name__ == '__main__':
    save_path = 'student_repos/'
    utils.create_missing_dirs(save_path)

    result_file_path = save_path + 'star_rst.txt'
    open(result_file_path, 'w').close()

    max_num = 20
    cur_num = 0

    link1 = 'https://api.github.com/search/repositories?q=course+project+language:java+stars:0..10+size:>=5000&per_page=100&page='
    # link1 = 'https://api.github.com/search/repositories?q=language:java+size:>=750000&sort=stars&order=desc&per_page=100&page='
    page_num = 0
    while page_num <= 10:
        page_num += 1

        tmp_path = save_path + 'star' + str(page_num) + '.json'
        if utils.exists_file(tmp_path):
            jresp = utils.load_json_from_file(tmp_path)
        else:
            resp = utils.send(link1 + str(page_num), token, 3)
            if not resp or resp.status_code != 200:
                break
            with open(tmp_path, 'w') as f:
                f.write(resp.text)
            jresp = resp.json()

        for item in jresp['items']: