Example #1
0
    {'name': 'author_type', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_login', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_id', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_email', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_commit_date', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_api_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_html_url', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'committer_type', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'curr_commit_master', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'time_accessed', 'type': 'STRING', 'mode': 'NULLABLE'}
]

# Create table if necessary: create_bq_table is called only when
# client.check_table(dataset, table) returns a falsy value.
# NOTE(review): create_bq_table is defined outside this view; presumably it
# creates `table` in `dataset` using `schema` -- confirm.
if not client.check_table(dataset, table):
    create_bq_table(client, dataset, table, schema)

# Get a record from one commit info dict from API response
def get_record(response_dict, repo_name, curr_time, curr_commit):
    commit = response_dict["commit"]
    author = response_dict["author"]
    commit_author = commit["author"]
    committer = response_dict["committer"]
    commit_committer = commit["committer"]
    return {'repo_name': repo_name,
            'commit_sha': response_dict.get("sha"),
            'commit_api_url': response_dict.get("url"),
            'commit_html_url': response_dict.get("html_url"),
            'commit_comments_url': response_dict.get("comments_url"),
            'commit_message': commit.get("message") if commit is not None else None,
            'commit_comment_count': commit.get("comment_count") if commit is not None else None,
    'name': 'git_url',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'contents',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'time_accessed',
    'type': 'STRING',
    'mode': 'NULLABLE'
}]

# Make sure the contents table is available; create_bq_table is invoked only
# when client.check_table returns a falsy value for it
if not client.check_table(dataset, table_contents):
    create_bq_table(client, dataset, table_contents, schema)

# Collect the (repo_name, path, sha) keys already stored in the contents table
# and report how many there are.
# NOTE(review): the trailing 120 is presumably a timeout for run_bq_query --
# confirm against its definition.
print("\nBuilding the set of existing records...")
existing_contents_dicts = run_bq_query(client, """
SELECT repo_name, path, sha FROM [%s:%s.%s]
""" % (proj, dataset, table_contents), 120)
existing_contents = set(
    (row["repo_name"], row["path"], row["sha"])
    for row in existing_contents_dicts
)
num_already_done = len(existing_contents)
if num_already_done:
    print("The table already contains %s file contents records."
          % num_already_done)

# Get list of file info records to download contents for
# Obtain a non-read-only BigQuery client.
# Using BigQuery-Python https://github.com/tylertreat/BigQuery-Python
print('\nGetting BigQuery client\n')
client = get_client(
    json_key_file=json_key,
    readonly=False,
    swallow_results=True)

# Table schema: five columns, each a nullable STRING
schema = [{
    'name': 'repo_name',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'file_name',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'path',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'sha',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'init_commit_timestamp',
    'type': 'STRING',
    'mode': 'NULLABLE'
}]

# Create the table only when the existence check returns a falsy value
if not client.check_table(dataset, table_init_commit):
    create_bq_table(client, dataset, table_init_commit, schema)

# Collect the (repo_name, path, sha) keys already present in the table and
# report how many there are.
# NOTE(review): the trailing 120 is presumably a timeout for run_bq_query --
# confirm against its definition.
print("\nBuilding the set of existing records...")
existing_records_dicts = run_bq_query(
    client, """
SELECT repo_name, path, sha FROM [%s:%s.%s]
""" % (proj, dataset, table_init_commit), 120)
existing_records = set(
    (row["repo_name"], row["path"], row["sha"])
    for row in existing_records_dicts
)
num_already_done = len(existing_records)
if num_already_done:
    print("The table already contains %s records." % num_already_done)

# Get list of file info records to get initial commits for 
print("\nGetting file info records...")
file_info_records = run_bq_query(client, """
SELECT repo_name, file_name, path, sha FROM [%s:%s.%s] 
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'blank',
    'type': 'INTEGER',
    'mode': 'NULLABLE'
}, {
    'name': 'comment',
    'type': 'INTEGER',
    'mode': 'NULLABLE'
}, {
    'name': 'code',
    'type': 'INTEGER',
    'mode': 'NULLABLE'
}]
# create_bq_table is called unconditionally for table_loc, whereas
# table_loc_ungrouped is only created when check_table returns a falsy value.
# NOTE(review): confirm this asymmetry is intentional.
create_bq_table(bq_client, out_ds, table_loc, schema_loc)
if not bq_client.check_table(out_ds, table_loc_ungrouped):
    create_bq_table(bq_client, out_ds, table_loc_ungrouped, schema_loc)

# Create the comment-stripped contents tables.
# The schema has two nullable STRING columns.
schema_sc = [
    {'name': 'sha', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'contents_comments_stripped', 'type': 'STRING', 'mode': 'NULLABLE'},
]
create_bq_table(bq_client, out_ds, table_sc, schema_sc)
if not bq_client.check_table(out_ds, table_sc_ungrouped):
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'edition',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'internal_pdf',
    'type': 'STRING',
    'mode': 'NULLABLE'
}, {
    'name': 'abstract',
    'type': 'STRING',
    'mode': 'NULLABLE'
}]
create_bq_table(client, bq_ds, bq_tb, schema)

# Announce the extraction pass and reset its bookkeeping before the loop over
# `records` that follows
print('\nExtracting GitHub repo names from articles...')
# NOTE(review): presumably accumulates rows to be written to the BQ table --
# confirm against the rest of the loop.
recs_to_push = []
# num_found is increased by the number of repos found for each record in the
# loop below; num_done is not updated in the visible part of that loop.
num_done = num_found = 0
for record in records:
    metadata = parse_record(record)
    repos = gh_repos_from_metadata(metadata)
    if repos is not None:
        num_found += len(repos['repos'])
        for repo in repos['repos']:
            metadata_this_repo = metadata.copy()
            metadata_this_repo['repo_name'] = repo
            metadata_this_repo['repo_source'] = repos['source']