Example #1
def get_repo_PRlist(repo, type, renew):
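    # Fetch the pull/issue/branch/fork list for repo, caching it on disk;
    # renew=True forces a refetch from the GitHub API.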
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type

    # todo: could be extended to analyze forks in the future
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'

    if (os.path.exists(save_path)) and (not renew):
        print("read from local files and return")
        try:
            return localfile.get_file(save_path)
        except:
            # cached copy unreadable; fall through and refetch
            pass

    print('file does not exist on local disk, start to fetch new list for ',
          repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('repos/%s/%ss' % (repo, type),
                          state='all',
                          paginate=True)
    else:
        if type == 'branch':
            type = 'branche'  # the API endpoint is 'branches'; an 's' is appended below
        ret = api.request('repos/%s/%ss' % (repo, type), True)

    localfile.write_to_file(save_path, ret)
    return ret
Example #2
def fetch_file_list(pull, renew=False):
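    # Return the parsed per-file diffs of a pull request, using a disk cache.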
    repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/' + num + '/raw_diff.json'

    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    # Probe page 3 of the changed-file list: a non-empty result means the PR
    # touches too many files to analyze.
    t = api.request('repos/%s/pulls/%s/files?page=3' % (repo, num))
    file_list = []
    if len(t) > 0:
        raise Exception('too big', pull['html_url'])
    else:
        li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
        time.sleep(0.8)
        for f in li:
            if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
                file_list.append(parse_diff(f['filename'], f['patch']))

    localfile.write_to_file(save_path, file_list)
    return file_list
Example #3
def get_repo_info_forPR(repo, type, renew):
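    # Incrementally refresh the PR list for repo: only PRs numbered at or
    # above the oldest already-known open PR are fetched.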
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    pullListfile = pathlib.Path(init.local_pr_data_dir + repo + '/pull_list.json')
    if pullListfile.exists():
        tocheck_pr = getOldOpenPRs(repo)  # PR-number cutoff: the oldest open PR already on disk
        print("tocheck_pr " + str(tocheck_pr))
        if tocheck_pr is None:
            tocheck_pr = 0

        save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
        if type == 'fork':
            save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'

        if (os.path.exists(save_path)) and (not renew):
            try:
                return localfile.get_file(save_path)
            except:
                pass

        print('start fetch new list for ', repo, type)
        if (type == 'pull') or (type == 'issue'):
            page_index = 1
            numPR = init.numPRperPage  # expected total count after fetching the current page
            while True:
                ret = api.requestPR('repos/%s/%ss' % (repo, type), state='all', page=page_index)
                if len(ret) > 0:
                    for pr in ret:
                        if pr['number'] >= tocheck_pr:
                            filtered_result.append(pr)
                        else:
                            print('get all ' + str(len(filtered_result)) + ' prs')
                            localfile.replaceWithNewPRs(save_path, filtered_result)
                            return filtered_result
                    if len(filtered_result) < numPR:
                        # short page: no more PRs to fetch
                        print('get all ' + str(len(filtered_result)) + ' prs -- after page ' + str(page_index))
                        localfile.replaceWithNewPRs(save_path, filtered_result)
                        return filtered_result
                    else:
                        page_index += 1
                        numPR += init.numPRperPage
                else:
                    print("get pulls failed")
                    return filtered_result
        else:
            if type == 'branch':
                type = 'branche'
            ret = api.request('repos/%s/%ss' % (repo, type), True)

        localfile.write_to_file(save_path, ret)
    else:
        print('pull list does not exist, get from scratch')
        ret = get_repo_PRlist(repo, type, renew)
    return ret
Example #4
def get_repo_info_forPR_experiment(repo, type, renew):
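    # Experiment variant of get_repo_info_forPR: returns the cached pull list when present.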
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/pull_list.json'

    if (os.path.exists(save_path)) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
Example #5
def get_pull(repo, num, renew=False):
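    # Fetch a single pull request's metadata, caching the API response on disk.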
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/api.json' % (repo, num)
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    r = api.request('repos/%s/pulls/%s' % (repo, num))
    time.sleep(3.0)
    localfile.write_to_file(save_path, r)
    return r
Example #6
def get_pull_commit(pull, renew=False):
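    # Fetch the commits of a pull request, caching them on disk.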
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    commits = api.request('GET', pull['commits_url'], True)
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
Example #7
def get_pull_commit(pull, renew=False):
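    # Variant of get_pull_commit that paginates through the PR's commits_url.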
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
Example #8
def get_pr_commit(repo, pr_id, renew=False):
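    # Fetch a PR's commits by repo name and PR number, caching them on disk.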
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (repo, pr_id)
    commit_url = 'repos/%s/pulls/%s/commits' % (repo, pr_id)
    if os.path.exists(save_path) and (not renew) and (os.stat(save_path).st_size > 2):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    api = GitHubAPI()
    commits = api.request(commit_url, paginate=True, state='all')  # commit_url is already a relative path
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
Example #9
def fetch_commit(url, renew=False):
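    # Fetch a single commit and return its parsed per-file diffs, cached on disk.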
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/%s.json' % url.replace('https://api.github.com/repos/', '')
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    c = api.request(url)
    time.sleep(0.7)
    file_list = []
    for f in c['files']:
        if 'patch' in f:
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
Example #10
def fetch_file_list(repo, num, renew=False):
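    # Variant of fetch_file_list keyed by repo and PR number instead of a pull object.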
    api = GitHubAPI()
    outfile_prefix = init.local_pr_data_dir + repo + "/" + str(num)
    save_path = outfile_prefix + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    file_list = []

    li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
    time.sleep(0.8)
    for f in li:
        if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))

    localfile.write_to_file(save_path, file_list)
    return file_list
Example #11
def get_another_pull(pull, renew=False):
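    # Collect the PR/issue numbers cross-referenced in a PR's body and comments.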
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    comments_href = pull["_links"]["comments"]["href"]
    comments = api.request('GET', comments_href, True)
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))

    result = list(set(candidates))

    localfile.write_to_file(save_path, result)
    return result
Example #12
def get_another_pull(pull, renew=False):
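    # Variant of get_another_pull that paginates through the comments endpoint.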
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    comments_href = pull["_links"]["comments"]["href"]  # cross-references show up in comments; checking events would be easier
    comments = api.request(comments_href.replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))

    result = list(set(candidates))

    localfile.write_to_file(save_path, result)
    return result
Example #13
def get_repo_info(repo, type, renew=False):
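    # Fetch the full pull/issue/branch/fork list for repo (open and closed), caching it on disk.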
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'

    if (os.path.exists(save_path)) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    print('start fetch new list for ', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('GET', 'repos/%s/%ss?state=closed' % (repo, type),
                          True)
        ret.extend(
            api.request('GET', 'repos/%s/%ss?state=open' % (repo, type), True))
    else:
        if type == 'branch':
            type = 'branche'
        ret = api.request('GET', 'repos/%s/%ss' % (repo, type), True)

    localfile.write_to_file(save_path, ret)
    return ret
Example #14
def get_feature_vector(data, label, renew=False, out=None):
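    # Build the feature matrix X and label vector y for the PR pairs listed in
    # the input file, caching both to disk.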
    print('Model Data Input=', data)

    default_path = data.replace('.txt', '') + '_feature_vector'
    out = default_path if out is None else default_path + '_' + out
    X_path, y_path = out + '_X.json', out + '_y.json'

    if os.path.exists(X_path) and os.path.exists(y_path) and (not renew):
        print('warning: feature vector already exists!', out)
        X = localfile.get_file(X_path)
        y = localfile.get_file(y_path)
        return X, y

    X, y = [], []

    # run with all PR's info model
    p = {}
    pr_len = 0
    with open(data) as f:
        all_pr = f.readlines()
        pr_len = len(all_pr)
    count = 0

    for l in all_pr:
        print(str(count / pr_len) + ' pr:' + l)
        r, n1, n2 = l.strip().split()

        if 'msr_pairs' not in data:
            print('check if there is too much text in the PR description, e.g. a PR template')
            if check_large(get_pull(r, n1)) or check_large(get_pull(r, n2)):
                continue

        if r not in p:
            p[r] = []
        p[r].append((n1, n2, label))
        count = count + 1

    print('all=', len(all_pr))

    out_file = open(out + '_X_and_Y.txt', 'w+')

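    # warm up the NLP model for every repo before computing similarities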
    for r in p:
        init_model_with_repo(r)

    for r in p:
        print('Start running on', r)

        # init NLP model
        init_model_with_repo(r)

        print('pairs num=', len(p[r]))

        # sequential version (a disabled parallel version follows below)
        cnt = 0
        for z in p[r]:
            # print(r, z[0], z[1])

            x0, y0 = get_sim(r, z[0], z[1]), z[2]
            X.append(x0)
            y.append(y0)
            print(r, z[0], z[1], x0, y0, file=out_file)

            cnt += 1
            if cnt % 100 == 0:
                print('current:', r, cnt)
        '''
        # run parallel
        for label in [0, 1]:
            pairs = []
            for z in p[r]:
                if z[2] == label:
                    pairs.append((r, z[0], z[1]))
            with Pool(processes=10) as pool:
                result = pool.map(get_sim_wrap, pairs)
            X.extend(result)
            y.extend([label for i in range(len(result))])
        '''

    out_file.close()

    # save to local
    localfile.write_to_file(X_path, X)
    localfile.write_to_file(y_path, y)
    return (X, y)
Example #15
def random_pairs():
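    # Randomly sample a pair of distinct PRs (repo, x, y) by different authors
    # from a fixed set of test repos.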
    global select_set

    # repos = os.listdir('/DATA/luyao/pr_data')

    # choose = ['saltstack/salt']

    # training repos
    # choose = ['mozilla-b2g/gaia', 'twbs/bootstrap', 'scikit-learn/scikit-learn', 'rust-lang/rust', 'servo/servo', 'pydata/pandas', 'saltstack/salt', 'nodejs/node', 'symfony/symfony-docs', 'zendframework/zf2', 'symfony/symfony', 'kubernetes/kubernetes']

    # testing repos
    print("randomly pick a repo...")
    choose = [
        'cocos2d/cocos2d-x', 'dotnet/corefx', 'django/django',
        'angular/angular.js', 'JuliaLang/julia', 'ceph/ceph',
        'joomla/joomla-cms', 'facebook/react', 'hashicorp/terraform',
        'rails/rails', 'docker/docker', 'elastic/elasticsearch',
        'emberjs/ember.js', 'ansible/ansible'
    ]

    find = False

    while not find:
        # pick a random repo
        while True:
            try:
                '''
                repo = repos[random.randint(0, len(repos) - 1)]
                repo_ = os.listdir('/DATA/luyao/pr_data/' + repo)[0]
                repo = repo + '/' + repo_
                '''
                repo = choose[random.randint(0, len(choose) - 1)]
                print("..." + repo)
                break
            except:
                continue

        ok_file = '/DATA/luyao/pr_data/%s/list_for_random_generate_c1.json' % repo
        if all_pr_flag:
            ok_file = ok_file.replace('_c1', '_all')

        if os.path.exists(ok_file):
            print(ok_file + " exists!")
            nums = localfile.get_file(ok_file)
        else:
            print(ok_file + " file does not exist ...")

            nums = os.listdir('/DATA/luyao/pr_data/%s' % repo)
            print(repo + "has " + str(len(nums)) + " PRs in total on GitHub")

            # filter out PRs whose titles mention config or readme files
            def like_localize(p):
                if 'confi' in p["title"].lower():
                    return True
                if 'readme' in p["title"].lower():
                    return True
                return False

            def too_small(p):
                if len(p["title"]) <= 20:
                    return True
                if (p["body"] is not None) and (len(p["body"]) <= 20):
                    return True
                return False

            new_num = []
            cnt, tot_cnt = 0, len(nums)

            # filter candidate PRs: keep merged (or all, if all_pr_flag), reasonably
            # sized, non-config/readme PRs with 1-10 changed files
            print("start to parse every PR...")

            for x in nums:
                cnt += 1

                # progress indicator
                if cnt % 100 == 0:
                    print(1.0 * cnt / tot_cnt)

                if x.isdigit():
                    p = get_pull(repo, x)
                    # print('check', repo, x)
                    if (all_pr_flag or (p["merged_at"] is not None)) and (not check_large(p)) and \
                    (not too_small(p)) and (not like_localize(p)):
                        len_f = len(fetch_pr_info(p))
                        if (len_f > 0) and (len_f <= 10):
                            new_num.append(x)
                            print("length of new_nums " + str(len(new_num)))

            nums = new_num
            print("length of nums: " + str(len(nums)))

            localfile.write_to_file(ok_file, nums)

        l = len(nums)

        if l <= 100:
            # too few PRs to sample pairs from
            raise Exception('too small', repo)

        if l <= 1000:
            # downsample mid-sized repos: keep with probability 1/4
            if random.randint(0, 3) > 0:
                continue

        ti = 0
        while not find:
            ti += 1
            # if ti > 100:
            #     break
            if l > 0:
                x = nums[random.randint(0, l - 1)]
                y = nums[random.randint(0, l - 1)]

                if ((repo, x, y) in msr_d) or ((repo, y, x) in msr_d):
                    continue

                if (repo, x, y) in select_set:
                    continue
                try:
                    if (x != y) and (x.isdigit()) and (y.isdigit()):
                        p1 = get_pull(repo, x)
                        p2 = get_pull(repo, y)
                        # print(repo, x, y)

                        if p1["user"]["id"] != p2["user"]["id"]:

                            select_set.add((repo, x, y))
                            select_set.add((repo, y, x))

                            find = True
                            break
                except:
                    print("PR 404")
                    pass
    return [repo, x, y]