def test_APIRequest(self):
    r = gf.get_request(self.url)
    forks = 0
    if r.ok:
        contents = json.loads(r.text or r.content)
        forks = contents['forks_count']
    assert forks == 5
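Every example on this page calls gf.get_request, which is not shown here. Based on the two tests (a Response-like object with .ok, .text and .content on success, None on failure or timeout), a minimal sketch of such a wrapper could look like the following; the default timeout and the placement in a gf module are assumptions:

import requests

def get_request(url, timeout=10):
    """Thin wrapper around requests.get: return the Response on success,
    or None if the request fails or times out (as the tests expect)."""
    try:
        return requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return None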
Example #2
def pickle_repo(repo):
    """
    Basic function that scrapes repo data using the Github API and
    saves it locally as a pickled file, to be used by test_classify()
    above.
    """
    GP = gs.Github_Profile()
    url = 'https://api.github.com/repos/%s' % repo
    repo_path = 'GP_%s.pkl' % (repo.replace('/', '_'))
    r = gf.get_request(url)
    if r.ok:
        item = json.loads(r.text or r.content)
        gs.get_features(item, GP)
        joblib.dump(GP, repo_path)
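A brief usage sketch for the function above; the 'user/repo' value is a placeholder, not taken from the source:

# Hypothetical call: writes the scraped profile to GP_octocat_Hello-World.pkl
pickle_repo('octocat/Hello-World')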
Example #3
def scrape_repos_by_specs(name, raw_links, n_pages=10, start_page=0):
    n_scraped = 0
    f = open('repo_data/%s_repos_Python.txt' % name, 'w')
    for raw_link in raw_links:
        for i in range(start_page, start_page + n_pages):
            link = '%s&per_page=100&page=%d' % (raw_link, i)
            r = gf.get_request(link)
            if r.ok:
                repos = json.loads(r.text or r.content)
                for item in repos['items']:
                    f.write('%s\n' % item['url'])
                    n_scraped += 1  # number of repo urls scraped
            print(link)

    f.close()
    print("Scraped %d repos'" % n_scraped)
Example #4
def scrape_single_repo(user_repo):
    """
    Top-level function that scrapes the statistics for a single
    Github repository.
    """
    GP = Github_Profile()
    repo = 'https://api.github.com/repos/%s' % user_repo
    GP.user = repo.split('repos/')[1].split('/')[0]
    r = gf.get_request(repo)
    if r.ok:
        item = json.loads(r.text or r.content)
        signal.alarm(60)
        try:
            if item['fork'] is False:  # for now ignore forks
                GP = get_features(item, GP)
        except Exception:  # broad catch: anything that fails just skips this repo
            print("couldn't scrape %s" % repo)
    return GP
Example #5
def get_batch_repos(repo_list_dir, output_dir):
    """
    Top-level function that batch-extracts the statistics
    from a collection of repositories.
    """
    proc_repos = np.loadtxt(output_dir,
                            delimiter=',',
                            usecols=[0],
                            dtype='str')
    repos = open(repo_list_dir, 'r').read().splitlines()
    # Change the behavior of SIGALRM
    signal.signal(signal.SIGALRM, timeout_handler)
    for repo in repos:
        if repo in proc_repos:
            print('already scanned %s' % repo)
            continue
        GP = Github_Profile()
        GP.user = repo.split('repos/')[1].split('/')[0]
        r = gf.get_request(repo)
        if r.ok:
            item = json.loads(r.text or r.content)
            signal.alarm(60)
            try:
                if item['fork'] is False:  # for now ignore forks
                    GP = get_features(item, GP)

                    # write each repo->GP to file
                    string = '%s, %d, %d, %d, %d, %d, %d, %d, %f, %d, %d'
                    data = open(output_dir, 'a')
                    data.write(string %
                               (repo, GP.n_pyfiles, GP.code_lines,
                                GP.comment_lines, GP.docstring_lines,
                                GP.test_lines, GP.readme_lines, GP.n_commits,
                                GP.commits_per_time, GP.n_stars, GP.n_forks))
                    for key in GP.pep8.keys():
                        data.write(', %d' % GP.pep8[key])
                    data.write('\n')
                    data.close()

            except TimeoutException:
                print('%s timed out, skipping!' % repo)
            except Exception:
                print('skipping repo %s' % repo)
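timeout_handler and TimeoutException are referenced above but never defined in these examples. A minimal sketch of the SIGALRM pattern they imply (an assumption about their implementation, and Unix-only, since SIGALRM is not available on Windows):

import signal

class TimeoutException(Exception):
    """Raised when a scrape exceeds its time budget."""

def timeout_handler(signum, frame):
    # Invoked by the OS when the alarm set via signal.alarm() expires.
    raise TimeoutException()

# Typical use, mirroring get_batch_repos() above:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(60)   # raise TimeoutException after 60 seconds
try:
    pass           # slow scraping work goes here
except TimeoutException:
    print('timed out, skipping!')
finally:
    signal.alarm(0)  # cancel any pending alarm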
Example #6
def digest_repo(repo_url, GProfile):
    """
    Look through each file and directory, extract metrics from
    each python file. Recursive function.
    """
    r = gf.get_request('%s' % repo_url)
    if r.ok:
        repoItems = json.loads(r.text or r.content)

        for item in repoItems:
            try:
                if item['type'] == 'file' and item['name'][-3:] == '.py':
                    GProfile.n_pyfiles += 1
                    print(item['download_url'])
                    gs.get_metrics_per_file(item, GProfile)
                elif item['type'] == 'dir':
                    digest_repo(item['url'], GProfile)
            except Exception:
                print('could not process %s, skipping!' % item['download_url'])
Example #7
def update_output_div(n_clicks, input_value, checklist):
    """
    Main App Callback. Takes a user/repository as input, scrapes the stats
    using the Github API, classifies the repository using the pre-trained
    One-Class SVM model, and sends all the information to output() to be
    output on the screen. Saves the queried Github_Profile so that subsequent
    loads are faster.
    """
    repo_path = 'saved_repo_profiles/GP_%s.pkl' % (input_value.replace(
        '/', '_'))
    # if profile already exists, don't re-scrape
    if os.path.isfile(repo_path) and 'rescrape' not in checklist:
        GP = joblib.load(repo_path)
    else:
        r = gf.get_request('https://api.github.com/repos/%s' % input_value)
        if r.ok:
            item = json.loads(r.text or r.content)
            GP = get_features(item)
            joblib.dump(GP, repo_path)
        else:
            return html.Div([
                html.H2('Could not find "{}" on GitHub'.format(input_value),
                        style={
                            'font-style': 'normal',
                            'font-size': 15
                        })
            ])

    try:
        score, Xr = mod.classify_repo(GP)  # r for repo
        return output(input_value, GP, Xr, score, checklist)
    except Exception:
        string = '"{}" could not be processed. Does it contain few or no .py files?'
        return html.Div([
            html.H2(string.format(input_value),
                    style={
                        'font-style': 'normal',
                        'font-size': 15
                    })
        ])
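The docstring calls this the main app callback, but the Dash wiring is not shown. A sketch of how such a callback is typically registered; the app object and the component ids below are assumptions, not taken from the source:

from dash.dependencies import Input, Output, State

# Hypothetical component ids: a submit button, a text input and a checklist.
@app.callback(
    Output('output-div', 'children'),
    [Input('submit-button', 'n_clicks')],
    [State('repo-input', 'value'),
     State('options-checklist', 'value')])
def update_output_div(n_clicks, input_value, checklist):
    ...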
Example #8
def digest_repo(repo_url, GProfile):
    """
    Look through each file and directory, extract metrics from
    each python file. Recursive function.
    """
    r = gf.get_request('%s' % repo_url)
    if r.ok:
        repoItems = json.loads(r.text or r.content)

        signal.signal(signal.SIGALRM, timeout_handler)
        for item in repoItems:
            signal.alarm(10)  # skip file if takes more than 10 seconds

            try:
                if item['type'] == 'file' and item['name'][-3:] == '.py':
                    GProfile.n_pyfiles += 1
                    print(item['download_url'])
                    get_metrics_per_file(item, GProfile)
                elif item['type'] == 'dir':
                    digest_repo(item['url'], GProfile)
            except TimeoutException:
                print('%s timed out, skipping!' % item['download_url'])
Example #9
def get_metrics_per_file(item, GProfile):
    """
    Extract metrics from each Python file:
        - comment/code ratio
        - pep8 errors
        - number of code lines and test lines
    """
    r = gf.get_request(item['download_url'])
    if r.ok:
        text = r.text

        # metrics
        GProfile.comment_lines += gf.get_comments(text, '#', '\n')
        GProfile.docstring_lines += gf.get_comments(text, '"""', '"""')
        gf.get_pep8_errs(text, GProfile)

        code_len = len(text.split('\n'))
        GProfile.code_lines += code_len

        # tests
        if item['name'].lower().startswith('test_') and 'assert' in text:  # pytest convention
            GProfile.test_lines += code_len
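gf.get_comments is another helper that is not shown on this page. Below is a sketch of a delimiter-based line counter compatible with the two calls above; it is a guess at the helper's contract, not its actual code:

def get_comments(text, start_delim, end_delim):
    # Guess at the helper's contract: count lines of comment/docstring content
    # for the given delimiter pair (hash-to-newline, or a triple-quote pair).
    lines = text.split('\n')
    if end_delim == '\n':
        # single-line comments: lines whose first non-blank character is '#'
        return sum(1 for line in lines if line.lstrip().startswith(start_delim))
    # block delimiters such as triple-quoted docstrings
    count, inside = 0, False
    for line in lines:
        occurrences = line.count(start_delim)
        if inside or occurrences:
            count += 1
        if occurrences % 2 == 1:  # an unmatched delimiter opens or closes a block
            inside = not inside
    return count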
Example #10
def test_RequestFailure(self):
    r = gf.get_request(self.url, timeout=1e-4)
    assert r is None
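Both test methods reference self.url, which has to come from the test fixture. A minimal sketch of the surrounding unittest class; the imported module name and the repository URL are placeholders, not taken from the source:

import json
import unittest

import github_funcs as gf  # hypothetical module providing get_request

class TestGithubRequests(unittest.TestCase):
    # test_APIRequest and test_RequestFailure from the examples above would
    # be methods of this class.
    def setUp(self):
        # Placeholder: a small, stable repository whose fork count matches
        # the assertion in test_APIRequest.
        self.url = 'https://api.github.com/repos/<user>/<repo>'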