Example #1

# Get list of commit records for a repo
def get_records(repo_name):
    data = get_commits(repo_name, gh_username, gh_oauth_key)
    try:
        validate_response_found(data[0])
    except ValueError:
        return None
    except KeyError:
        return None
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [get_record(dct, repo_name, curr_time, curr_commit) for dct in data]
        
print("%s\tGetting commit info from GitHub API and pushing to BigQuery table" % curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    records = get_records(repo_name)
    num_done = num_done + 1
    if records is not None:
        print("%s\tPushing %s commit records for repo %s/%s: %s" 
              % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
        push_bq_records(client, dataset, table, records)
    else:
        print("%s\tPushing 0 commit records for repo %s/%s: %s" 
              % (curr_time_utc(), num_done, num_repos, repo_name))
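
Every example here relies on a shared push_bq_records helper and a curr_time_utc timestamp helper that the excerpts never define. The following is a minimal sketch only, assuming the google-cloud-bigquery client library; the batching behavior, the max_batch default, and the RuntimeError raised on failure are inferred from how later examples call and catch this helper, not confirmed by the source.

from datetime import datetime, timezone

from google.cloud import bigquery


def make_client():
    # The `client` passed around in these examples would be built like this.
    return bigquery.Client()


def curr_time_utc():
    # Current UTC time as a string; the exact format in the originals is unknown.
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")


def push_bq_records(client, dataset, table, records,
                    print_failed_records=True, max_batch=500):
    # Stream row dicts into <dataset>.<table> in batches; insert_rows_json
    # returns a list of per-row errors, empty on success.
    if not records:
        return
    table_id = "%s.%s" % (dataset, table)  # project is resolved from the client
    for start in range(0, len(records), max_batch):
        batch = records[start:start + max_batch]
        errors = client.insert_rows_json(table_id, batch)
        if errors:
            if print_failed_records:
                print(errors)
            raise RuntimeError("Failed to push records to BigQuery")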




Example #2

        recs_to_add_loc = []
        recs_to_add_sc = []
        skipped_sha = []

        for rec in reader:

            if num_done % 1000 == 0:
                print(
                    'Finished %s files. Got results for %s. Skipped %s already done, %s previously skipped, %s with empty content, %s with invalid file extension, and %s with no CLOC result.'
                    % (num_done, num_success, num_skipped_already_done,
                       num_skipped_skipped, num_skipped_empty_content,
                       num_skipped_file_extension, num_skipped_no_result))

            # Push batch of records
            if num_done % 10 == 0:
                if len(recs_to_add_loc) > 0:
                    push_bq_records(bq_client, out_ds, table_loc_ungrouped,
                                    recs_to_add_loc)
                    push_bq_records(bq_client, out_ds, table_sc_ungrouped,
                                    recs_to_add_sc)
                if len(skipped_sha) > 0:
                    push_bq_records(bq_client, out_ds, table_skip, [{
                        'sha': sha
                    } for sha in skipped_sha])
                recs_to_add_loc.clear()
                recs_to_add_sc.clear()
                skipped_sha.clear()

            num_done = num_done + 1

            repo = rec["repo_name"]
            filename = rec["file_name"]
            path = rec["path"]

Example #3

for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"],
            record["sha"]) in existing_contents:
        num_skipped_already_done = num_skipped_already_done + 1
        continue
    recs_to_push.append(get_contents_record(record))
    num_done = num_done + 1
    if num_done % 100 == 0:
        print("%s\tFinished %s/%s records. Pushing %s records to BigQuery." %
              (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
        try:
            # Push the entire batch
            push_bq_records(client,
                            dataset,
                            table_contents,
                            recs_to_push,
                            print_failed_records=False)
        except RuntimeError:
            # Try records individually
            print(
                "Batch push failed. Trying records individually every 2 seconds due to BigQuery rate limit."
            )
            for rec in recs_to_push:
                sleep(2.1)
                try:
                    push_bq_records(client,
                                    dataset,
                                    table_contents, [rec],
                                    print_failed_records=False)
                except RuntimeError:
                    # Give up on any record that still fails on its own
                    print("Skipping record that failed individual push.")
        recs_to_push.clear()
print("%s\tGetting file initial commit times from GitHub API and pushing to table" % curr_time_utc())
num_done = 0
num_skipped_already_done = 0
num_to_do = len(file_info_records) - num_already_done
recs_to_push = []
for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"], record["sha"]) in existing_records:
        num_skipped_already_done = num_skipped_already_done + 1
        continue
    try:
        recs_to_push.append(get_init_commit(record))
    except ValueError as e:
        print("Caught ValueError; skipping repo %s and path %s. Error:\n%s" % (record["repo_name"], record["path"], e))
    except pycurl.error as e:
        print("Caught pycurl.error; skipping repo %s and path %s. Error:\n%s" % (record["repo_name"], record["path"], e))
    num_done = num_done + 1
    if num_done % 100 == 0:
        print("%s\tFinished %s/%s records. Pushing %s records to BigQuery."
              % (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
        push_bq_records(client, dataset, table_init_commit, recs_to_push, print_failed_records=True)
        recs_to_push.clear()
    
# Final batch
print("%s\tFinished %s/%s records. Pushing %s records to BigQuery."
    % (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
push_bq_records(client, dataset, table_init_commit, recs_to_push, print_failed_records=True)
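
The get_init_commit helper is not shown in this excerpt. A rough sketch of one way it could work, using the GitHub commits API filtered by path; it is written with requests for brevity even though the pycurl handler above suggests the original used pycurl, and the returned record layout is an assumption.

import requests


def get_init_commit(record, gh_username, gh_oauth_key):
    # List commits touching this path; GitHub returns newest first, so the
    # last item on the page is the earliest. A real implementation would
    # follow the Link header for paths with more than 100 commits.
    url = "https://api.github.com/repos/%s/commits" % record["repo_name"]
    resp = requests.get(url,
                        params={"path": record["path"], "per_page": 100},
                        auth=(gh_username, gh_oauth_key))
    commits = resp.json()
    if not isinstance(commits, list) or not commits:
        raise ValueError("No commit history for path %s" % record["path"])
    first = commits[-1]
    return {"repo_name": record["repo_name"],
            "path": record["path"],
            "sha": record["sha"],
            "init_commit_timestamp": first["commit"]["author"]["date"]}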



Example #5

create_bq_table(client, bq_ds, bq_tb, schema)

# Iterate through the records and write to BQ table
print('\nExtracting GitHub repo names from articles...')
num_done = 0
num_found = 0
recs_to_push = []
for record in records:
    metadata = parse_record(record)
    repos = gh_repos_from_metadata(metadata)
    if repos is not None:
        num_found += len(repos['repos'])
        for repo in repos['repos']:
            metadata_this_repo = metadata.copy()
            metadata_this_repo['repo_name'] = repo
            metadata_this_repo['repo_source'] = repos['source']
            recs_to_push.append(metadata_this_repo)
    num_done += 1
    if num_done % 100 == 0:
        print("Analyzed %s papers. Found %s valid repo names." %
              (num_done, num_found))
        if recs_to_push:
            push_bq_records(client, bq_ds, bq_tb, recs_to_push)
        recs_to_push.clear()

# Push final batch of records
if recs_to_push:
    push_bq_records(client, bq_ds, bq_tb, recs_to_push)

print("\n\nAll done.")

Example #6

            'api_url': r.get_gh_api_url(),
            'html_url': r.get_html_url(),
            'description': r.get_description(),
            'is_fork': r.is_fork(),
            'stargazers_count': r.get_stargazers_count(),
            'watchers_count': r.get_watchers_count(),
            'forks_count': r.get_forks_count(),
            'open_issues_count': r.get_open_issues_count(),
            'subscribers_count': r.get_subscribers_count(),
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
    
print("Getting repo info from GitHub API")
records = []
num_done = 0
for repo_name in repos:
    try:
        records.append(get_record(repo_name))
    except UnicodeEncodeError:
        print("Skipping repo %s" % repo_name)
    num_done = num_done + 1
    if num_done % 100 == 0:
        print("Finished %s repos. Pushing records." % num_done)
        push_bq_records(client, dataset, table, records)
        records.clear()
push_bq_records(client, dataset, table, records) # Last batch
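
Several of these examples stamp records with curr_commit_master. A minimal sketch of that helper, assuming the GitHub commits API and a default branch literally named master, as the helper's name implies.

import requests


def curr_commit_master(repo_name, gh_username, gh_oauth_key):
    # Fetch the SHA of the head commit of master via the GitHub REST API.
    resp = requests.get(
        "https://api.github.com/repos/%s/commits/master" % repo_name,
        auth=(gh_username, gh_oauth_key))
    resp.raise_for_status()
    return resp.json()["sha"]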




Example #7

            'html_url': pr_data['html_url'],
            'title': pr_data['title'],
            'body': pr_data['body'],
            'user_login': pr_data['user']['login'],
            'user_id': pr_data['user']['id'],
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
    
print("Getting pull request info from GitHub API")
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    num_done = num_done + 1
    try:
        records = [get_record(repo_name, pr) for pr in get_pull_requests(repo_name, gh_username, gh_oauth_key, "all")]
        if records:
            print("%s\tPushing %s pull request records for repo %s/%s: %s"
                  % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
            push_bq_records(client=client, dataset=dataset, table=table, records=records, max_batch=10)
        else:
            print("%s\tPushing 0 pull request records for repo %s/%s: %s"
                  % (curr_time_utc(), num_done, num_repos, repo_name))
    except KeyError as e:
        print("Skipping repo %s: missing key %s" % (repo_name, e))
    except UnicodeEncodeError as e:
        print("Skipping repo %s: %s" % (repo_name, str(e)))




Example #8

# Build one file info record per file in a repo
def get_file_info_records(repo_name):
    data = get_file_info(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [{
        'repo_name': repo_name,
        'file_name': record['name'],
        'path': record['path'],
        'sha': record['sha'],
        'size': record['size'],
        'api_url': record['url'],
        'html_url': record['html_url'],
        'git_url': record['git_url'],
        'download_url': record['download_url'],
        'type': record['type'],
        'curr_commit_master': curr_commit,
        'time_accessed': curr_time
    } for record in data]


print("%s\tGetting file info from GitHub API and pushing to file info table" %
      curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    file_info_records = get_file_info_records(repo_name)
    num_done = num_done + 1
    print("%s\tPushing %s file info records for repo %s/%s: %s" %
          (curr_time_utc(), len(file_info_records), num_done, num_repos,
           repo_name))
    push_bq_records(client, dataset, table, file_info_records)
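
Finally, get_file_info is also undefined here. The record fields above ('name', 'path', 'sha', 'size', 'url', 'html_url', 'git_url', 'download_url', 'type') match what the GitHub contents API returns per entry, so a plausible sketch walks the repo recursively through that API; the recursion and the helper signature are assumptions.

import requests


def get_file_info(repo_name, gh_username, gh_oauth_key, path=""):
    # Recursively walk the repo through the contents API; directory entries
    # are descended into, everything else is kept as a file record.
    resp = requests.get(
        "https://api.github.com/repos/%s/contents/%s" % (repo_name, path),
        auth=(gh_username, gh_oauth_key))
    entries = resp.json()
    files = []
    for entry in entries:
        if entry["type"] == "dir":
            files.extend(get_file_info(repo_name, gh_username, gh_oauth_key,
                                       entry["path"]))
        else:
            files.append(entry)
    return files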