# Get license record for a repo
def get_record(repo_name):
    lic = get_license(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {'repo_name': repo_name,
            'license': lic,
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
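# A minimal usage sketch, assuming `repos`, `client`, `dataset`, and `table`
# are defined as in the driver loops further down (this loop is not part of
# the original script):
for repo_name in repos:
    push_bq_records(client, dataset, table, [get_record(repo_name)])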
# Get list of commit records for a repo; returns None if no commits are found
def get_records(repo_name):
    data = get_commits(repo_name, gh_username, gh_oauth_key)
    try:
        validate_response_found(data[0])
    except (ValueError, KeyError):
        return None
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [get_record(dct, repo_name, curr_time, curr_commit) for dct in data]
# Get list of language records for a repo: one record per language
def get_records(repo_name):
    data = get_language_bytes(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [{'repo_name': repo_name,
             'language_name': language,
             'language_bytes': num_bytes,
             'curr_commit_master': curr_commit,
             'time_accessed': curr_time} for language, num_bytes in data.items()]
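# Illustrative record shape (values are hypothetical): a repo containing
# Python code would yield a record such as
#   {'repo_name': 'someuser/somerepo', 'language_name': 'Python',
#    'language_bytes': 52310, 'curr_commit_master': '<sha>',
#    'time_accessed': '<utc timestamp>'}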
# Get a pull request record for a repo
def get_record(repo_name, pr_data):
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {'repo_name': repo_name,
            'pr_id': pr_data['id'],
            'state': pr_data['state'],
            'api_url': pr_data['url'],
            'html_url': pr_data['html_url'],
            'title': pr_data['title'],
            'body': pr_data['body'],
            'user_login': pr_data['user']['login'],
            'user_id': pr_data['user']['id'],
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
# Get repo-level info record
def get_record(repo_name):
    r = repo.Repo(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return {'repo_name': r.get_repo_name(),
            'api_url': r.get_gh_api_url(),
            'html_url': r.get_html_url(),
            'description': r.get_description(),
            'is_fork': r.is_fork(),
            'stargazers_count': r.get_stargazers_count(),
            'watchers_count': r.get_watchers_count(),
            'forks_count': r.get_forks_count(),
            'open_issues_count': r.get_open_issues_count(),
            'subscribers_count': r.get_subscribers_count(),
            'curr_commit_master': curr_commit,
            'time_accessed': curr_time}
# Get list of file info records for a repo: one record per file
def get_file_info_records(repo_name):
    data = get_file_info(repo_name, gh_username, gh_oauth_key)
    curr_time = curr_time_utc()
    curr_commit = curr_commit_master(repo_name, gh_username, gh_oauth_key)
    return [{'repo_name': repo_name,
             'file_name': record['name'],
             'path': record['path'],
             'sha': record['sha'],
             'size': record['size'],
             'api_url': record['url'],
             'html_url': record['html_url'],
             'git_url': record['git_url'],
             'download_url': record['download_url'],
             'type': record['type'],
             'curr_commit_master': curr_commit,
             'time_accessed': curr_time} for record in data]
# Get a file contents record; contents are left as None if the file is too
# large or its contents could not be fetched
def get_contents_record(file_info_record):
    repo_name = file_info_record["repo_name"]
    path = file_info_record["path"]
    git_url = file_info_record["git_url"]
    curr_time = curr_time_utc()
    contents = None
    size = file_info_record["size"]
    # Leave headroom under the max record size for the other fields in the record
    if size <= max_record_size - 1000:
        try:
            contents = get_file_contents(git_url, gh_username, gh_oauth_key)
        except Exception:
            pass
    return {'repo_name': repo_name,
            'file_name': file_info_record["file_name"],
            'path': path,
            'sha': file_info_record["sha"],
            'git_url': git_url,
            'contents': contents,
            'time_accessed': curr_time}
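# A minimal usage sketch with a hypothetical input (field names match the
# file info records built above; values are placeholders, not real data):
example_file_info = {'repo_name': 'someuser/somerepo', 'file_name': 'README.md',
                     'path': 'README.md', 'sha': '<sha>', 'size': 1024,
                     'git_url': '<git blob url>'}
contents_record = get_contents_record(example_file_info)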
# Main loop: get commit records for each repo and push to BigQuery
print("%s\tGetting commit info from GitHub API and pushing to BigQuery table" % curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    records = get_records(repo_name)
    num_done = num_done + 1
    if records is not None:
        print("%s\tPushing %s commit records for repo %s/%s: %s"
              % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
        push_bq_records(client, dataset, table, records)
    else:
        print("%s\tPushing 0 commit records for repo %s/%s: %s"
              % (curr_time_utc(), num_done, num_repos, repo_name))
# Main loop: get file contents records, skipping files already in the table
print("%s\tGetting file contents from GitHub API and pushing to file contents table" % curr_time_utc())
num_done = 0
num_skipped_already_done = 0
num_to_do = len(file_info_records) - num_already_done
recs_to_push = []
for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"], record["sha"]) in existing_contents:
        num_skipped_already_done = num_skipped_already_done + 1
        continue
    recs_to_push.append(get_contents_record(record))
    num_done = num_done + 1
    if num_done % 100 == 0:
        print("%s\tFinished %s/%s records. Pushing %s records to BigQuery."
              % (curr_time_utc(), num_done, num_to_do, len(recs_to_push)))
import pycurl

# Get file info records from the BigQuery file info table
file_info_records = run_bq_query(client, """
SELECT repo_name, file_name, path, sha FROM [%s:%s.%s]
""" % (proj, dataset, table_info), 120)

# Get initial commit record for a file
def get_init_commit(file_info_record):
    repo_name = file_info_record["repo_name"]
    path = file_info_record["path"]
    return {'repo_name': repo_name,
            'file_name': file_info_record["file_name"],
            'path': path,
            'sha': file_info_record["sha"],
            'init_commit_timestamp': get_initial_commit(repo_name, path,
                                                        gh_username, gh_oauth_key).isoformat()}

# Main loop: get initial commit times, skipping files already in the table
print("%s\tGetting file initial commit times from GitHub API and pushing to table" % curr_time_utc())
num_done = 0
num_skipped_already_done = 0
num_to_do = len(file_info_records) - num_already_done
recs_to_push = []
for record in file_info_records:
    # Skip if already done
    if (record["repo_name"], record["path"], record["sha"]) in existing_records:
        num_skipped_already_done = num_skipped_already_done + 1
        continue
    try:
        recs_to_push.append(get_init_commit(record))
    except ValueError as e:
        print("Caught ValueError; skipping repo %s and path %s. Error:\n%s"
              % (record["repo_name"], record["path"], e))
    except pycurl.error as e:
        print("Caught pycurl.error; skipping repo %s and path %s. Error:\n%s"
              % (record["repo_name"], record["path"], e))
# Main loop: get pull request records for each repo and push to BigQuery
print("Getting pull request info from GitHub API")
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    num_done = num_done + 1
    try:
        records = [get_record(repo_name, pr)
                   for pr in get_pull_requests(repo_name, gh_username, gh_oauth_key, "all")]
        if records:
            print("%s\tPushing %s pull request records for repo %s/%s: %s"
                  % (curr_time_utc(), len(records), num_done, num_repos, repo_name))
            push_bq_records(client=client, dataset=dataset, table=table,
                            records=records, max_batch=10)
        else:
            print("%s\tPushing 0 pull request records for repo %s/%s: %s"
                  % (curr_time_utc(), num_done, num_repos, repo_name))
    except KeyError as e:
        # Likely an API error payload (e.g. a 'message' dict) rather than PR data
        print("Skipping repo %s: missing key %s" % (repo_name, e))
    except UnicodeEncodeError as e:
        print("Skipping repo %s: %s" % (repo_name, str(e)))
# Main loop: get file info records for each repo and push to BigQuery
print("%s\tGetting file info from GitHub API and pushing to file info table" % curr_time_utc())
num_done = 0
num_repos = len(repos)
for repo_name in repos:
    file_info_records = get_file_info_records(repo_name)
    num_done = num_done + 1
    print("%s\tPushing %s file info records for repo %s/%s: %s"
          % (curr_time_utc(), len(file_info_records), num_done, num_repos, repo_name))
    push_bq_records(client, dataset, table, file_info_records)