# Tail of the BigQuery schema for the commits table (list opened before this
# chunk): committer_* fields mirror the author_* fields, all STRING/NULLABLE.
# Dates and timestamps are stored as STRING, presumably ISO-8601 text — TODO confirm.
{'name': 'author_type', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_login', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_id', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_name', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_email', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_commit_date', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_api_url', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_html_url', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'committer_type', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'curr_commit_master', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name': 'time_accessed', 'type': 'STRING', 'mode': 'NULLABLE'}
]

# Create table if necessary
if not client.check_table(dataset, table):
    create_bq_table(client, dataset, table, schema)

# Get a record from one commit info dict from API response
def get_record(response_dict, repo_name, curr_time, curr_commit):
    """Flatten one GitHub commit API response dict into a flat BQ row dict.

    Args:
        response_dict: one commit object as returned by the GitHub commits
            API (has "commit", "author", "committer" keys).
        repo_name: repository name to record alongside the commit.
        curr_time: presumably the access timestamp for 'time_accessed' —
            used past the visible chunk; TODO confirm.
        curr_commit: presumably the current master commit SHA for
            'curr_commit_master' — used past the visible chunk; TODO confirm.

    Returns:
        A flat dict whose keys match the table schema above.
        NOTE(review): the return dict literal is truncated at the end of
        this chunk; remaining entries are outside this view.
    """
    commit = response_dict["commit"]
    # author/committer objects may legitimately be null in the API response;
    # NOTE(review): commit["author"] below is NOT guarded against commit
    # being None, while the .get() calls further down are — verify that
    # "commit" is always present before relying on this.
    author = response_dict["author"]
    commit_author = commit["author"]
    committer = response_dict["committer"]
    commit_committer = commit["committer"]
    # author/committer locals above are presumably consumed by the unseen
    # remainder of this return dict.
    return {'repo_name': repo_name,
            'commit_sha': response_dict.get("sha"),
            'commit_api_url': response_dict.get("url"),
            'commit_html_url': response_dict.get("html_url"),
            'commit_comments_url': response_dict.get("comments_url"),
            'commit_message': commit.get("message") if commit is not None else None,
            'commit_comment_count': commit.get("comment_count") if commit is not None else None,
# Tail of the BigQuery schema for the file-contents table (list opened before
# this chunk); all remaining fields are STRING/NULLABLE.
'name': 'git_url', 'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'contents', 'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'time_accessed', 'type': 'STRING', 'mode': 'NULLABLE'
}]

# Create table if necessary
if not client.check_table(dataset, table_contents):
    create_bq_table(client, dataset, table_contents, schema)

# Get set of records already in contents table
print("\nBuilding the set of existing records...")
# Query key columns only; 120 is presumably a timeout in seconds — TODO confirm
# against run_bq_query's signature.
existing_contents_dicts = run_bq_query(client, """ SELECT repo_name, path, sha FROM [%s:%s.%s] """ % (proj, dataset, table_contents), 120)
# Set of (repo_name, path, sha) tuples used to skip already-downloaded files.
existing_contents = {(rec["repo_name"], rec["path"], rec["sha"]) for rec in existing_contents_dicts}
num_already_done = len(existing_contents)
if num_already_done > 0:
    print("The table already contains %s file contents records." % num_already_done)

# Get list of file info records to download contents for
# NOTE(review): chunk ends here; the query that fetches the file info
# records is outside this view.
# Using BigQuery-Python https://github.com/tylertreat/BigQuery-Python print('\nGetting BigQuery client\n') client = get_client(json_key_file=json_key, readonly=False, swallow_results=True) # Table schema schema = [ {'name': 'repo_name', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'file_name', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'path', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'sha', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'init_commit_timestamp', 'type': 'STRING', 'mode': 'NULLABLE'} ] # Create table if necessary if not client.check_table(dataset, table_init_commit): create_bq_table(client, dataset, table_init_commit, schema) # Get set of records already in table print("\nBuilding the set of existing records...") existing_records_dicts = run_bq_query(client, """ SELECT repo_name, path, sha FROM [%s:%s.%s] """ % (proj, dataset, table_init_commit), 120) existing_records = {(rec["repo_name"], rec["path"], rec["sha"]) for rec in existing_records_dicts} num_already_done = len(existing_records) if num_already_done > 0: print("The table already contains %s records." % num_already_done) # Get list of file info records to get initial commits for print("\nGetting file info records...") file_info_records = run_bq_query(client, """ SELECT repo_name, file_name, path, sha FROM [%s:%s.%s]
# Tail of the lines-of-code schema (list opened before this chunk):
# blank/comment/code are INTEGER line counts, presumably cloc-style output —
# TODO confirm against the producer.
'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'blank', 'type': 'INTEGER', 'mode': 'NULLABLE'
}, {
'name': 'comment', 'type': 'INTEGER', 'mode': 'NULLABLE'
}, {
'name': 'code', 'type': 'INTEGER', 'mode': 'NULLABLE'
}]
# NOTE(review): this create call appears unguarded here, unlike the
# check_table-guarded one just below — verify a check_table guard was not
# lost at the chunk boundary.
create_bq_table(bq_client, out_ds, table_loc, schema_loc)
# The ungrouped variant shares the same schema.
if not bq_client.check_table(out_ds, table_loc_ungrouped):
    create_bq_table(bq_client, out_ds, table_loc_ungrouped, schema_loc)

# Create the comment-stripped contents tables
# Maps file sha -> contents with comments removed.
schema_sc = [{
'name': 'sha', 'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'contents_comments_stripped', 'type': 'STRING', 'mode': 'NULLABLE'
}]
create_bq_table(bq_client, out_ds, table_sc, schema_sc)
# NOTE(review): chunk ends with this dangling if-header; its body is outside
# this view (presumably creates table_sc_ungrouped with schema_sc).
if not bq_client.check_table(out_ds, table_sc_ungrouped):
# Tail of the article-metadata table schema (list opened before this chunk);
# all remaining fields are STRING/NULLABLE.
'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'edition', 'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'internal_pdf', 'type': 'STRING', 'mode': 'NULLABLE'
}, {
'name': 'abstract', 'type': 'STRING', 'mode': 'NULLABLE'
}]
# NOTE(review): unguarded create — other chunks in this project wrap
# create_bq_table in a check_table test; verify a guard was not lost at the
# chunk boundary.
create_bq_table(client, bq_ds, bq_tb, schema)

# Iterate through the records and write to BQ table
print('\nExtracting GitHub repo names from articles...')
num_done = 0
num_found = 0
# Rows accumulated here are presumably pushed in batches past this chunk —
# TODO confirm.
recs_to_push = []
for record in records:
    metadata = parse_record(record)
    # gh_repos_from_metadata returns None when no repo is mentioned;
    # otherwise a dict with 'repos' (list of names) and 'source' (where in
    # the article the names were found).
    repos = gh_repos_from_metadata(metadata)
    if repos is not None:
        num_found += len(repos['repos'])
        # Emit one row per repo; copy() so each row gets its own metadata
        # dict rather than mutating a shared one.
        for repo in repos['repos']:
            metadata_this_repo = metadata.copy()
            metadata_this_repo['repo_name'] = repo
            metadata_this_repo['repo_source'] = repos['source']
            # NOTE(review): chunk ends mid-loop; the append to recs_to_push
            # and the push to BigQuery are outside this view.