def get_repo_PRlist(repo, type, renew):
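    """Return the list of pulls/issues/branches/forks for `repo`, reading the
    cached JSON when present (unless `renew`) and otherwise fetching it from
    the GitHub API and caching it locally."""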
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type

    # todo: could be extended to analyze forks in the future
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'

    if (os.path.exists(save_path)) and (not renew):
        print("read from local files and return")
        try:
            return localfile.get_file(save_path)
        except:
            pass

    print('file does not exist on local disk, start to fetch a new list for',
          repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('repos/%s/%ss' % (repo, type),
                          state='all',
                          paginate=True)
    else:
        if type == 'branch':
            type = 'branche'  # so that appending 's' below yields 'branches'
        ret = api.request('repos/%s/%ss' % (repo, type), True)

    localfile.write_to_file(save_path, ret)
    return ret
Example #2
def fetch_file_list(pull, renew=False):
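    """Return the parsed diffs of the files changed by `pull`, cached as
    raw_diff.json; a PR whose file list spills onto a third API page is
    rejected as too big."""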
    repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/' + num + '/raw_diff.json'

    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    # t = api.get('repos/%s/pulls/%s/files?page=3' % (repo, num))
    # probe the third page of changed files; a non-empty page means the PR
    # changes more files than this pipeline is willing to process
    t = api.request('repos/%s/pulls/%s/files?page=3' % (repo, num))
    file_list = []
    if len(t) > 0:
        raise Exception('too big', pull['html_url'])
    else:
        li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
        # li = api.request( 'repos/%s/pulls/%s/files' % (repo, num), True)
        time.sleep(0.8)  # pause between API calls
        for f in li:
            # skip very large diffs and entries without a filename or patch
            if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
                file_list.append(parse_diff(f['filename'], f['patch']))

    localfile.write_to_file(save_path, file_list)
    return file_list
Example #3
def getFeatureVectorForModeling(data):
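    """Build feature vectors for the labelled PR pairs listed in data[0].

    `data` is a (path, label, group) triple; each line of the pair file names
    a repo and two PR numbers.  Returns (X, y), caching both as JSON.
    """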
    renew = True  # always rebuild; the cached feature-vector files are not reused
    # for data in dataset:
    path = data[0]
    # path = 'data/clf/second_msr_pairs.txt'
    label = data[1]
    group = data[2]

    default_path = init.currentDIR + '/' + path.replace('.txt',
                                                        '') + '_feature_vector'
    X_path, y_path = default_path + '_X.json', default_path + '_y.json'

    if os.path.exists(X_path) and os.path.exists(y_path) and (not renew):
        print('feature vector already exists, read from local file')
        X = localfile.get_file(X_path)
        y = localfile.get_file(y_path)
        return X, y

    X, y = [], []

    # run with all PR's info model
    repo2PRpair_map = {}
    with open(init.currentDIR + '/' + path) as f:
        all_pr = f.readlines()

    for l in tqdm(all_pr):
        # each line has the form "<repo> <pr_number_1> <pr_number_2>"
        repo, n1, n2 = l.strip().split()

        if repo not in repo2PRpair_map:
            repo2PRpair_map[repo] = []
        repo2PRpair_map[repo].append((n1, n2))

    out_file = open(default_path + '_X_and_Y.txt', 'w+')

    for repo in tqdm(repo2PRpair_map):
        # print('Start running on', repo)
        # sequence
        for pr_pair in tqdm(repo2PRpair_map[repo]):
            print(repo, pr_pair[0], pr_pair[1])
            featureVec = get_featureVector_ForPRpair(repo, pr_pair[0],
                                                     pr_pair[1])
            X.append(featureVec)
            y.append(label)
            print(repo,
                  pr_pair[0],
                  pr_pair[1],
                  featureVec,
                  label,
                  file=out_file)

    out_file.close()

    # save to local
    localfile.write_to_file(X_path, X)
    localfile.write_to_file(y_path, y)
    return (X, y)
Example #4
def get_repo_info_forPR(repo, type, renew):
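    """Return the PR/issue/branch/fork list for `repo`.

    When a cached pull_list.json exists, pull/issue requests are paginated and
    only PRs numbered at or above getOldOpenPRs(repo) are kept (the cache is
    updated via replaceWithNewPRs); otherwise the full list is rebuilt with
    get_repo_PRlist.
    """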
    filtered_result = []
    api = GitHubAPI()
    print(init.local_pr_data_dir + repo + '/pull_list.json')
    pullListfile = pathlib.Path(init.local_pr_data_dir + repo + '/pull_list.json')
    if pullListfile.exists():
        tocheck_pr = getOldOpenPRs(repo)
        print("tocheck_pr " + str(tocheck_pr))
        if (tocheck_pr is None):
            tocheck_pr = 0

        save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
        if type == 'fork':
            save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'

        if (os.path.exists(save_path)) and (not renew):
            try:
                return localfile.get_file(save_path)
            except:
                pass

        print('start to fetch a new list for', repo, type)
        if (type == 'pull') or (type == 'issue'):
            page_index = 1
            numPR = init.numPRperPage  # expected cumulative PR count after the current page
            while True:
                ret = api.requestPR('repos/%s/%ss' % (repo, type), state='all', page=page_index)
                if (len(ret) > 0):
                    for pr in ret:
                        if (pr['number'] >= tocheck_pr):
                            filtered_result.append(pr)
                        else:
                            print('get all ' + str(len(filtered_result)) + ' prs')
                            localfile.replaceWithNewPRs(save_path, filtered_result)
                            return filtered_result
                    if (len(filtered_result) < numPR):
                        print('get all ' + str(len(filtered_result)) + ' prs -- after page ' + str(page_index))
                        localfile.replaceWithNewPRs(save_path, filtered_result)
                        return filtered_result
                    else:
                        page_index += 1
                        numPR += init.numPRperPage
                else:
                    print("get pulls failed")
                    return filtered_result
        else:
            if type == 'branch':
                type = 'branche'
            ret = api.request('repos/%s/%ss' % (repo, type), True)

        localfile.write_to_file(save_path, ret)
    else:
        print('pull list does not exist, get from scratch')
        ret = get_repo_PRlist(repo, type, renew)
    return ret
def filterNonCodeFiles(file_list, outfile_prefix):
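    """Keep only the files whose names language_tool.is_text() does not flag as
    text/documentation, giving up (toobig.txt marker, empty result) once more
    than 500 such files have been collected."""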
    newFileList = []
    count = 0
    for f in file_list:
        if count > 500:
            localfile.write_to_file(outfile_prefix + "/toobig.txt", '500file')
            return []
        if not language_tool.is_text(f['name']):
            newFileList.append(f)
            count += 1
    return newFileList
Example #6
def preprocess_documents(repo, pulls, renew):
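    """Extract and cache text features for every PR in `pulls`: tokens from the
    title, body, commit messages and added/deleted code, plus changed-file
    locations and cross-references, written under
    local_pr_data_dir/<repo>/<pr_number>/.  A PR is skipped when its
    updateAt.txt stamp already exists and `renew` is False."""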
    for pull in tqdm(pulls):  # tqdm is used for print progress bar https://github.com/tqdm/tqdm/
        pr_id = pull['number']
        # if pr_id != 14378:
        #     continue
        outfile_prefix = init.local_pr_data_dir + repo + "/" + str(pr_id)
        print(str(pr_id))
        if os.path.exists(outfile_prefix + '/updateAt.txt') and (not renew):
            print('skip')
            continue

        # if the PR is older than 1 year, ignore
        # todo: why do I care about the create date when training the model? comment out for now
        # current_pr_createdAt = pull['created_at']
        # if (util.timeUtil.days_between(now, current_pr_createdAt) > init.comparePRs_timeWindow_inDays):
        #     print(str(pull['number']) + " older than " + str(init.pr_date_difference_inDays) + " days , stop")
        #     break

        # ----------- title and description -----------
        wordext.get_tokens_from_file(pull['title'], outfile_prefix, 'title')
        if pull["body"]:
            if not os.path.exists(outfile_prefix + "/body_tokens_stemmed.tsv") or renew:
                import re
                body_str = re.sub("(<.*?>)", "", pull['body'], flags=re.DOTALL)
                wordext.get_tokens_from_file(body_str, outfile_prefix, 'body')

        # ----------- commit msg -----------
        print('check commit')
        from github.github_api import concat_commits
        from github.github_api import get_pr_commit
        all_commit_msg = concat_commits(get_pr_commit(repo, pr_id))
        wordext.get_tokens_from_file(all_commit_msg, outfile_prefix, 'commit')
        # # ----------- CODE & FILE  -----------
        print('check code ,file ')
        from github.github_api import fetch_pr_code_info
        pr_filelist_json = fetch_pr_code_info(repo, pr_id)
        if (len(pr_filelist_json) == 0):
            localfile.write_to_file(outfile_prefix + "/updateAt.txt",
                                    str(datetime.datetime.now().strftime("%Y-%m-%d")))
            continue
        wordext.get_code_tokens_from_file(pr_filelist_json, outfile_prefix, 'add_code')
        wordext.get_code_tokens_from_file(pr_filelist_json, outfile_prefix, 'del_code')

        # ----------- Location -----------
        # pr_filelist_json was fetched above and is already known to be non-empty here
        getCodeLocation(pr_filelist_json, outfile_prefix)
        # ----------- version number  & crossReference  PR or ISSUE-----------
        print('check reference')
        body_text = '' if pull["body"] is None else pull["body"]
        pull_text = str(pull["title"]) + ' ' + str(body_text) + ' ' + all_commit_msg
        getReference(repo, pull_text, outfile_prefix)

        localfile.write_to_file(outfile_prefix + "/updateAt.txt", str(datetime.datetime.now().strftime("%Y-%m-%d")))
Example #7
def get_pull_commit(pull, renew=False):
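    """Return the commits of `pull`, reading the local commits.json cache when
    present and otherwise paginating the PR's commits_url."""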
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
Example #8
def get_pull(repo, num, renew=False):
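    """Return the pull-request JSON for `repo`#`num`, using the cached api.json
    when available."""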
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/api.json' % (repo, num)
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    r = api.request('repos/%s/pulls/%s' % (repo, num))
    time.sleep(3.0)
    localfile.write_to_file(save_path, r)
    return r
Example #9
def get_pull_commit(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    commits = api.request('GET', pull['commits_url'], True)
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
Example #10
def fetch_pr_info(pull, must_in_local=False):
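    """Return the parsed file diffs for `pull`, checking the in-memory cache,
    then parse_diff.json / raw_diff.json / pull_files.json on disk, and finally
    the GitHub API; PRs flagged as too large raise immediately."""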
    global file_list_cache
    ind = (pull["base"]["repo"]["full_name"], pull["number"])
    if ind in file_list_cache:
        return file_list_cache[ind]

    path = '/DATA/luyao/pr_data/%s/%s' % (pull["base"]["repo"]["full_name"],
                                          pull["number"])
    parse_diff_path = path + '/parse_diff.json'
    raw_diff_path = path + '/raw_diff.json'
    pull_files_path = path + '/pull_files.json'

    flag_path = path + '/too_large_flag.json'
    if os.path.exists(flag_path):
        raise Exception('too big', pull['html_url'])

    if os.path.exists(parse_diff_path):
        try:
            ret = localfile.get_file(parse_diff_path)
            file_list_cache[ind] = ret
            return ret
        except:
            pass

    if os.path.exists(raw_diff_path) or os.path.exists(pull_files_path):
        if os.path.exists(raw_diff_path):
            file_list = localfile.get_file(raw_diff_path)
        elif os.path.exists(pull_files_path):
            pull_files = localfile.get_file(pull_files_path)
            file_list = [
                parse_diff(file["file_full_name"], file["changed_code"])
                for file in pull_files
            ]
        else:
            raise Exception('error on fetch local file %s' % path)
    else:
        if must_in_local:
            raise Exception('not found in local')

        try:
            file_list = fetch_file_list(pull)
        except:
            localfile.write_to_file(flag_path, 'flag')
            raise Exception('too big', pull['html_url'])

    # print(path, [x["name"] for x in file_list])
    localfile.write_to_file(parse_diff_path, file_list)
    file_list_cache[ind] = file_list
    return file_list
Example #11
def get_pr_commit(repo, pr_id, renew=False):
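    """Return the commit list of PR `pr_id` in `repo`, preferring a non-empty
    local commits.json cache over a fresh paginated API request."""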
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/commits.json' % (repo, pr_id)
    commit_url = 'repos/%s/pulls/%s/commits' % (repo, pr_id)
    if os.path.exists(save_path) and (not renew) and (os.stat(save_path).st_size > 2):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    #     commits = api.request(pull['commits_url'].replace('https://api.github.com/', ''), True)
    api = GitHubAPI()
    commits = api.request(commit_url.replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    localfile.write_to_file(save_path, commits)
    return commits
Example #12
def fetch_commit(url, renew=False):
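    """Fetch the commit at the API `url` and return the parsed diffs of its
    patched files, caching them under pr_data/."""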
    api = GitHubAPI()
    save_path = LOCAL_DATA_PATH + '/pr_data/%s.json' % url.replace('https://api.github.com/repos/', '')
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    c = api.request(url)
    time.sleep(0.7)
    file_list = []
    for f in c['files']:
        if 'patch' in f:
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))
    localfile.write_to_file(save_path, file_list)
    return file_list
Example #13
def fetch_file_list(repo, num, renew=False):
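    """Return the parsed diffs of the files changed by PR `num` in `repo`,
    skipping files with more than 5000 changed lines or no patch, and cache
    the result as raw_diff.json."""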
    api = GitHubAPI()
    # repo, num = pull["base"]["repo"]["full_name"], str(pull["number"])
    outfile_prefix = init.local_pr_data_dir + repo + "/" + str(num)
    save_path = outfile_prefix + '/raw_diff.json'
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass
    file_list = []

    li = api.request('repos/%s/pulls/%s/files' % (repo, num), paginate=True)
    time.sleep(0.8)
    for f in li:
        if f.get('changes', 0) <= 5000 and ('filename' in f) and ('patch' in f):
            file_list.append(fetch_raw_diff.parse_diff(f['filename'], f['patch']))

    localfile.write_to_file(save_path, file_list)
    return file_list
Example #14
def get_another_pull(pull, renew=False):
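    """Return the deduplicated PR/issue numbers referenced in `pull`'s body and
    its comments, cached as another_pull.json."""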
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    comments_href = pull["_links"]["comments"]["href"]
    comments = api.request('GET', comments_href, True)
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))

    result = list(set(candidates))

    localfile.write_to_file(save_path, result)
    return result
Example #15
def get_another_pull(pull, renew=False):
    save_path = LOCAL_DATA_PATH + '/pr_data/%s/%s/another_pull.json' % (
        pull["base"]["repo"]["full_name"], pull["number"])
    if os.path.exists(save_path) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    comments_href = pull["_links"]["comments"]["href"]  # found cites in comments, but checking events is easier.
    # comments = api.request(comments_href.replace('https://api.github.com/', ''), True)
    comments = api.request(comments_href.replace('https://api.github.com/', ''), paginate=True, state='all')
    time.sleep(0.7)
    candidates = []
    for comment in comments:
        candidates.extend(get_pr_and_issue_numbers(comment["body"]))
    candidates.extend(get_pr_and_issue_numbers(pull["body"]))

    result = list(set(candidates))

    localfile.write_to_file(save_path, result)
    return result
Example #16
def get_repo_info(repo, type, renew=False):
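    """Return the cached pull/issue/branch/fork list for `repo`, fetching the
    closed and open pulls/issues (or the full branch/fork list) from the GitHub
    API when the cache is missing or `renew` is set."""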
    save_path = LOCAL_DATA_PATH + '/pr_data/' + repo + '/%s_list.json' % type
    if type == 'fork':
        save_path = LOCAL_DATA_PATH + '/result/' + repo + '/forks_list.json'

    if (os.path.exists(save_path)) and (not renew):
        try:
            return localfile.get_file(save_path)
        except:
            pass

    print('start to fetch a new list for', repo, type)
    if (type == 'pull') or (type == 'issue'):
        ret = api.request('GET', 'repos/%s/%ss?state=closed' % (repo, type),
                          True)
        ret.extend(
            api.request('GET', 'repos/%s/%ss?state=open' % (repo, type), True))
    else:
        if type == 'branch':
            type = 'branche'
        ret = api.request('GET', 'repos/%s/%ss' % (repo, type), True)

    localfile.write_to_file(save_path, ret)
    return ret
Example #17
def random_pairs():
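    """Pick a random repo from the hard-coded test list and return [repo, x, y]:
    two distinct PR numbers by different authors that are neither a known
    duplicate pair (msr_d) nor already selected (select_set)."""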
    global select_set

    # repos = os.listdir('/DATA/luyao/pr_data')

    # choose = ['saltstack/salt']

    # training repos
    # choose = ['mozilla-b2g/gaia', 'twbs/bootstrap', 'scikit-learn/scikit-learn', 'rust-lang/rust', 'servo/servo', 'pydata/pandas', 'saltstack/salt', 'nodejs/node', 'symfony/symfony-docs', 'zendframework/zf2', 'symfony/symfony', 'kubernetes/kubernetes']

    # testing repos
    print("randomly pick a repo...")
    choose = [
        'cocos2d/cocos2d-x', 'dotnet/corefx', 'django/django',
        'angular/angular.js', 'JuliaLang/julia', 'ceph/ceph',
        'joomla/joomla-cms', 'facebook/react', 'hashicorp/terraform',
        'rails/rails', 'docker/docker', 'elastic/elasticsearch',
        'emberjs/ember.js', 'ansible/ansible'
    ]

    find = False

    while not find:
        # random a repo
        while True:
            try:
                '''
                repo = repos[random.randint(0, len(repos) - 1)]
                repo_ = os.listdir('/DATA/luyao/pr_data/' + repo)[0]
                repo = repo + '/' + repo_
                '''
                repo = choose[random.randint(0, len(choose) - 1)]
                print("..." + repo)
                break
            except:
                continue

        ok_file = '/DATA/luyao/pr_data/%s/list_for_random_generate_c1.json' % repo
        if all_pr_flag:
            ok_file = ok_file.replace('_c1', '_all')

        if os.path.exists(ok_file):
            print(ok_file + " exists!")
            nums = localfile.get_file(ok_file)
        else:
            print(ok_file + " file does not exist ...")

            nums = os.listdir('/DATA/luyao/pr_data/%s' % repo)
            print(repo + "has " + str(len(nums)) + " PRs in total on GitHub")

            # filter out config file and readme file
            def like_localize(p):
                if 'confi' in p["title"].lower():
                    return True
                if 'readme' in p["title"].lower():
                    return True
                return False

            def too_small(p):
                if len(p["title"]) <= 20:
                    return True
                if (p["body"] is not None) and (len(p["body"]) <= 20):
                    return True
                return False

            new_num = []
            cnt, tot_cnt = 0, len(nums)

            # todo: what is this loop about?
            print("start to parse every PR...")

            for x in nums:
                cnt += 1

                # progress bar...
                if cnt % 100 == 0:
                    print(1.0 * cnt / tot_cnt)

                if x.isdigit():
                    p = get_pull(repo, x)
                    # print('check', repo, x)
                    if (all_pr_flag or (p["merged_at"] is not None)) and (not check_large(p)) and \
                    (not too_small(p)) and (not like_localize(p)):
                        len_f = len(fetch_pr_info(p))
                        if (len_f > 0) and (len_f <= 10):
                            new_num.append(x)
                            print("length of new_nums " + str(len(new_num)))

            nums = new_num
            print("length of nums: " + str(len(nums)))

            localfile.write_to_file(ok_file, nums)

        l = len(nums)
        #         print(repo, l)

        if l <= 100:
            raise Exception('too small', repo)
            continue

        if l <= 1000:
            if random.randint(0, 3) > 0:
                continue

        ti = 0
        while not find:
            ti += 1
            #             if ti > 100:
            #                 break
            if l > 0:
                x = nums[random.randint(0, l - 1)]
                y = nums[random.randint(0, l - 1)]

                if ((repo, x, y) in msr_d) or ((repo, y, x) in msr_d):
                    continue

                if (repo, x, y) in select_set:
                    continue
                try:
                    if (x != y) and (x.isdigit()) and (y.isdigit()):
                        p1 = get_pull(repo, x)
                        p2 = get_pull(repo, y)
                        # print(repo, x, y)

                        if p1["user"]["id"] != p2["user"]["id"]:

                            select_set.add((repo, x, y))
                            select_set.add((repo, y, x))

                            find = True
                            break
                except:
                    print("PR 404")
                    pass
    return [repo, x, y]
Example #18
def get_feature_vector(data, label, renew=False, out=None):
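    """Compute a similarity feature vector for every PR pair listed in the file
    `data` (one "repo n1 n2" per line), assign `label` to each, and cache the
    resulting X and y lists as JSON next to the input file."""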
    print('Model Data Input=', data)

    default_path = data.replace('.txt', '') + '_feature_vector'
    out = default_path if out is None else default_path + '_' + out
    X_path, y_path = out + '_X.json', out + '_y.json'

    if os.path.exists(X_path) and os.path.exists(y_path) and (not renew):
        print('warning: feature vector already exists!', out)
        X = localfile.get_file(X_path)
        y = localfile.get_file(y_path)
        return X, y

    X, y = [], []

    # run with all PR's info model
    p = {}
    pr_len = 0
    with open(data) as f:
        all_pr = f.readlines()
        pr_len = len(all_pr)
    count = 0

    for l in all_pr:
        print(str(count / pr_len) + ' pr:' + l)
        r, n1, n2 = l.strip().split()

        if 'msr_pairs' not in data:
            print(
                'check if there is too much text in the PR description, e.g. a template'
            )
            if check_large(get_pull(r, n1)) or check_large(get_pull(r, n2)):
                continue

        if r not in p:
            p[r] = []
        p[r].append((n1, n2, label))
        count = count + 1

    print('all=', len(all_pr))

    out_file = open(out + '_X_and_Y.txt', 'w+')

    for r in p:
        init_model_with_repo(r)

    for r in p:
        print('Start running on', r)

        # init NLP model
        init_model_with_repo(r)

        print('pairs num=', len(p[r]))

        # sequence
        cnt = 0
        for z in p[r]:
            # print(r, z[0], z[1])

            x0, y0 = get_sim(r, z[0], z[1]), z[2]
            X.append(x0)
            y.append(y0)
            print(r, z[0], z[1], x0, y0, file=out_file)

            cnt += 1
            if cnt % 100 == 0:
                print('current:', r, cnt)
        '''
        # run parallel
        for label in [0, 1]:
            pairs = []
            for z in p[r]:
                if z[2] == label:
                    pairs.append((r, z[0], z[1]))
            with Pool(processes=10) as pool:
                result = pool.map(get_sim_wrap, pairs)
            X.extend(result)
            y.extend([label for i in range(len(result))])
        '''

    out_file.close()

    # save to local
    localfile.write_to_file(X_path, X)
    localfile.write_to_file(y_path, y)
    return (X, y)
Example #19
        pairs = sorted(pairs, key=lambda x: x.split()[0])

        last_repo = None
        for pair in pairs:
            pair_s = pair.split()
            r, n1, n2 = pair_s[0], pair_s[1], pair_s[2]

            if r != last_repo:
                clf.init_model_with_repo(r)
                last_repo = r

            status, history, history_ret, history_last, history_commit = simulate(
                r, n1, n2)

            for i in range(len(history)):
                history[i] = (history[i],
                              max(history_last[i][0], history_last[i][1]))

            if status >= 0:
                with open(out_file, 'a+') as outf:
                    print(r, n1, n2, ':', history, file=outf)

                all_ret.append({
                    'repo': r,
                    'num1': n1,
                    'num2': n2,
                    'history': history_commit
                })

    localfile.write_to_file(out_file + '.all_commit', all_ret)