def get_commits_for_included_branches(
    client: GithubClient,
    api_repos,
    included_branches,
    strip_text_content,
    server_git_instance_info,
    redact_names_and_urls,
):
    """Yield normalized commits from each repo's default branch plus any
    additional branches matched by the per-repo patterns in config.

    Works on raw GitHub API repo dicts rather than NormalizedRepository
    objects, so branch selection is done inline here instead of through
    get_branches_for_normalized_repo (as the bitbucket_cloud and gitlab
    adapters do).
    """
    for repo_num, repo in enumerate(api_repos, start=1):
        with agent_logging.log_loop_iters(logger, 'repo for branch commits', repo_num, 1):
            pull_since = pull_since_date_for_repo(
                server_git_instance_info, repo['organization']['login'], repo['id'], 'commits'
            )

            # Default branch is always pulled; extras only when the config
            # lists branch patterns for this repo.
            branches_to_pull = [repo['default_branch']]
            branch_patterns = included_branches.get(repo['name'])
            if branch_patterns:
                all_branch_names = [
                    api_branch['name'] for api_branch in client.get_branches(repo['full_name'])
                ]
                branches_to_pull += get_matching_branches(branch_patterns, all_branch_names)

            for branch in branches_to_pull:
                try:
                    commit_progress = tqdm(
                        client.get_commits(repo['full_name'], branch, since=pull_since, until=None),
                        desc=f'downloading commits on branch {branch} for {repo["name"]}',
                        unit='commits',
                    )
                    for commit_num, commit in enumerate(commit_progress, start=1):
                        with agent_logging.log_loop_iters(
                            logger, 'branch commit inside repo', commit_num, 100
                        ):
                            yield _normalize_commit(
                                commit, repo, branch, strip_text_content, redact_names_and_urls
                            )
                except Exception as e:
                    # A failed branch shouldn't stop the remaining branches/repos.
                    print(f':WARN: Got exception for branch {branch}: {e}. Skipping...')
def get_commits_for_included_branches(
    self,
    normalized_repos: List[NormalizedRepository],
    included_branches: dict,
    server_git_instance_info,
) -> List[NormalizedCommit]:
    """Yield normalized GitLab commits from the included branches of each repo.

    A failure while pulling one branch is logged and skipped so the
    remaining branches and repos are still processed.
    """
    print('downloading gitlab commits on included branches... ', end='', flush=True)
    for i, nrm_repo in enumerate(normalized_repos, start=1):
        with agent_logging.log_loop_iters(logger, 'repo for branch commits', i, 1):
            pull_since = pull_since_date_for_repo(
                server_git_instance_info, nrm_repo.project.login, nrm_repo.id, 'commits'
            )
            # BUGFIX: the try used to wrap get_branches_for_normalized_repo
            # itself; if that call raised, the except handler referenced the
            # never-bound `branch` and died with a NameError, masking the
            # real error. Scoping the try per-branch (as the GitHub adapter
            # does) fixes that and lets later branches proceed when an
            # earlier one fails.
            for branch in get_branches_for_normalized_repo(nrm_repo, included_branches):
                try:
                    for j, commit in enumerate(
                        tqdm(
                            self.client.list_project_commits(nrm_repo.id, pull_since, branch),
                            desc=f'downloading commits for branch {branch} in repo {nrm_repo.name} ({nrm_repo.id})',
                            unit='commits',
                        ),
                        start=1,
                    ):
                        with agent_logging.log_loop_iters(
                            logger, 'branch commit inside repo', j, 100
                        ):
                            yield _normalize_commit(
                                commit,
                                nrm_repo,
                                branch,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )
                except Exception as e:
                    print(f':WARN: Got exception for branch {branch}: {e}. Skipping...')
    print('✓')
def get_commits_for_included_branches(
    self,
    normalized_repos: List[NormalizedRepository],
    included_branches: dict,
    server_git_instance_info,
) -> List[NormalizedCommit]:
    """Yield normalized Bitbucket commits from the included branches of each repo.

    Commits come back newest-first; one commit older than the repo's
    pull-since date is yielded on purpose before the branch is abandoned,
    so downstream can detect the cutoff correctly.
    """
    print('downloading bitbucket commits on included branches... ', end='', flush=True)
    for i, repo in enumerate(normalized_repos, start=1):
        with agent_logging.log_loop_iters(logger, 'repo for branch commits', i, 1):
            pull_since = pull_since_date_for_repo(
                server_git_instance_info, repo.project.login, repo.id, 'commits'
            )
            for branch in get_branches_for_normalized_repo(repo, included_branches):
                for j, commit in enumerate(
                    tqdm(
                        self.client.get_commits(repo.project.id, repo.id, branch),
                        desc=f'downloading commits for {repo.name} on branch {branch}',
                        unit='commits',
                    ),
                    start=1,
                ):
                    with agent_logging.log_loop_iters(
                        logger, 'branch commit inside repo', j, 100
                    ):
                        commit = _normalize_commit(
                            commit,
                            repo,
                            branch,
                            self.config.git_strip_text_content,
                            self.config.git_redact_names_and_urls,
                        )
                        yield commit
                        # yield one commit older than we want to see
                        # BUGFIX: guard against pull_since being None (the
                        # PR-pulling siblings use the same `pull_since and`
                        # guard) to avoid a TypeError comparing a datetime
                        # against None.
                        if pull_since and commit.commit_date < pull_since:
                            break
    print('✓')
def get_pull_requests(
    client: GithubClient,
    api_repos,
    strip_text_content,
    server_git_instance_info,
    redact_names_and_urls,
):
    """Yield normalized GitHub PRs for each repo, newest first.

    A repo's download stops as soon as a PR older than the repo's
    pull-since date is seen; a failure on one repo is logged and the
    remaining repos are still processed.
    """
    for repo_num, repo in enumerate(api_repos, start=1):
        with agent_logging.log_loop_iters(logger, 'repo for pull requests', repo_num, 1):
            pull_since = pull_since_date_for_repo(
                server_git_instance_info, repo['organization']['login'], repo['id'], 'prs'
            )
            try:
                pr_progress = tqdm(
                    client.get_pullrequests(repo['full_name']),
                    desc=f'downloading PRs for {repo["name"]}',
                    unit='prs',
                )
                for pr_num, pr in enumerate(pr_progress, start=1):
                    with agent_logging.log_loop_iters(logger, 'pr inside repo', pr_num, 10):
                        # PRs arrive newest-to-oldest; once one falls before
                        # the cutoff, the rest of this repo can be skipped.
                        if pull_since and parser.parse(pr['updated_at']) < pull_since:
                            break
                        yield _normalize_pr(client, pr, strip_text_content, redact_names_and_urls)
            except Exception as e:
                print(f':WARN: Exception getting PRs for repo {repo["name"]}: {e}. Skipping...')
    print()
def get_pull_requests(
    self,
    normalized_repos: List[NormalizedRepository],
    server_git_instance_info,
) -> List[NormalizedPullRequest]:
    """Yield normalized Bitbucket PRs per repo, newest first.

    One PR older than the pull-since cutoff is yielded on purpose so the
    case where the most recent PR is already old is handled correctly.
    Errors on a single PR or a single repo are logged and skipped.
    """

    def _refs_present(pr):
        # A PR can't be normalized without both its source and destination
        # repositories.
        for side in ('source', 'destination'):
            if side not in pr or 'repository' not in pr[side] or not pr[side]['repository']:
                return False
        return True

    print('downloading bitbucket prs... ', end='', flush=True)
    repos_progress = tqdm(normalized_repos, desc='downloading prs for repos', unit='repos')
    for repo_num, repo in enumerate(repos_progress, start=1):
        with agent_logging.log_loop_iters(logger, 'repo for pull requests', repo_num, 1):
            try:
                pull_since = pull_since_date_for_repo(
                    server_git_instance_info, repo.project.login, repo.id, 'prs'
                )
                api_prs = self.client.get_pullrequests(repo.project.id, repo.id)
                if not api_prs:
                    agent_logging.log_and_print(
                        logger, logging.INFO, f'no prs found for repo {repo.id}. Skipping... '
                    )
                    continue
                for api_pr in tqdm(api_prs, desc=f'processing prs for {repo.name}', unit='prs'):
                    try:
                        if not _refs_present(api_pr):
                            agent_logging.log_and_print_error_or_warning(
                                logger, logging.WARN, msg_args=[api_pr['id']], error_code=3030
                            )
                            continue
                        yield _normalize_pr(
                            self.client,
                            repo,
                            api_pr,
                            self.config.git_strip_text_content,
                            self.config.git_redact_names_and_urls,
                        )
                        # PRs are ordered newest to oldest; if this one is
                        # too old we're done with this repo. The old PR was
                        # yielded above on purpose so the cutoff case works
                        # when even the newest PR is old.
                        if pull_since and parser.parse(api_pr['updated_on']) < pull_since:
                            break
                    except Exception:
                        # A bad PR shouldn't stop the rest of this repo.
                        agent_logging.log_and_print_error_or_warning(
                            logger,
                            logging.ERROR,
                            msg_args=[api_pr["id"], repo.id],
                            error_code=3011,
                            exc_info=True,
                        )
            except Exception:
                # A bad repo shouldn't stop the rest of the pull.
                agent_logging.log_and_print_error_or_warning(
                    logger, logging.ERROR, msg_args=[repo.id], error_code=3021, exc_info=True,
                )
    print('✓')
def get_pull_requests(
    self,
    normalized_repos: List[NormalizedRepository],
    server_git_instance_info,
) -> List[NormalizedPullRequest]:
    """Yield normalized GitLab merge requests per repo, newest first.

    A repo's download stops once an MR older than the pull-since cutoff
    is seen. Errors on a single MR or a single repo are logged and
    skipped so the rest of the pull continues.
    """
    print('downloading gitlab prs... ', end='', flush=True)
    for i, nrm_repo in enumerate(normalized_repos, start=1):
        print(f'downloading prs for repo {nrm_repo.name} ({nrm_repo.id})')
        with agent_logging.log_loop_iters(logger, 'repo for pull requests', i, 1):
            try:
                pull_since = pull_since_date_for_repo(
                    server_git_instance_info, nrm_repo.project.login, nrm_repo.id, 'prs'
                )
                api_prs = self.client.list_project_merge_requests(nrm_repo.id)
                if not api_prs or not api_prs.total:
                    agent_logging.log_and_print(
                        logger, logging.WARNING, f"No PRs returned for repo {nrm_repo.id}"
                    )
                    continue
                for api_pr in tqdm(
                    api_prs,
                    desc=f'processing prs for {nrm_repo.name} ({nrm_repo.id})',
                    unit='prs',
                    total=api_prs.total,
                ):
                    try:
                        updated_at = parser.parse(api_pr.updated_at)
                        # PRs are ordered newest to oldest; if this one is
                        # too old, we're done with this repo.
                        if pull_since and updated_at < pull_since:
                            break
                        try:
                            # BUGFIX: expand once and reuse the result.
                            # Previously expand_merge_request_data was called
                            # a second time below ("merge_request = ..."),
                            # doubling the API cost per MR, and the second
                            # call was not guarded against
                            # MissingSourceProjectException.
                            api_pr = self.client.expand_merge_request_data(api_pr)
                        except MissingSourceProjectException as e:
                            log_and_print_request_error(
                                e,
                                f'fetching source project {api_pr.source_project_id} '
                                f'for merge_request {api_pr.id}. Skipping...',
                            )
                            continue
                        nrm_commits: List[NormalizedCommit] = [
                            _normalize_commit(
                                commit,
                                nrm_repo,
                                api_pr.target_branch,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )
                            for commit in api_pr.commit_list
                        ]
                        # NOTE: the old code also required
                        # `nrm_commits is not None`, which was vacuously true
                        # (a list comprehension never yields None).
                        merge_commit = None
                        if api_pr.state == 'merged' and api_pr.merge_commit_sha:
                            merge_commit = _normalize_commit(
                                self.client.get_project_commit(
                                    api_pr.project_id, api_pr.merge_commit_sha
                                ),
                                nrm_repo,
                                api_pr.target_branch,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )
                        yield _normalize_pr(
                            api_pr,
                            nrm_commits,
                            self.config.git_strip_text_content,
                            self.config.git_redact_names_and_urls,
                            merge_commit,
                        )
                    except Exception as e:
                        # If something goes wrong normalizing one of the PRs,
                        # don't stop pulling — try the next one.
                        pr_id = f' {api_pr.id}' if api_pr else ''
                        log_and_print_request_error(
                            e,
                            f'normalizing PR {pr_id} from repo {nrm_repo.name} ({nrm_repo.id}). Skipping...',
                            log_as_exception=True,
                        )
            except Exception as e:
                # If something happens when pulling PRs for a repo, just keep going.
                log_and_print_request_error(
                    e,
                    f'getting PRs for repo {nrm_repo.name} ({nrm_repo.id}). Skipping...',
                    log_as_exception=True,
                )