def detect_issues_needing_re_download(
    downloaded_issue_info, issue_metadata_from_jellyfish, issue_metadata_addl_from_jellyfish
):
    """Return the set of issue IDs that must be re-downloaded because an issue
    they reference (as an epic link or parent) has changed its key.

    downloaded_issue_info: iterable of (issue_id_str, issue_key) pairs just downloaded.
    issue_metadata_from_jellyfish: dict of int issue id -> metadata with a .key attribute.
    issue_metadata_addl_from_jellyfish: dict of issue id -> (epic_link_field_issue_key,
        parent_field_issue_key).
    """
    # Collect the *old* keys of issues whose key no longer matches what
    # Jellyfish last saw.
    changed_keys = []
    for issue_id_str, issue_key in downloaded_issue_info:
        existing_metadata = issue_metadata_from_jellyfish.get(int(issue_id_str))
        if existing_metadata and issue_key != existing_metadata.key:
            agent_logging.log_and_print(
                logger,
                logging.INFO,
                f'Detected a key change for issue {issue_id_str} ({existing_metadata.key} -> {issue_key})',
            )
            changed_keys.append(existing_metadata.key)

    # Index issues by the keys they refer to through epic_link_field_issue_key
    # and parent_field_issue_key.
    issues_by_epic_link, issues_by_parent = defaultdict(list), defaultdict(list)
    for issue_id, (elfik, pfik) in issue_metadata_addl_from_jellyfish.items():
        if elfik:
            issues_by_epic_link[elfik].append(issue_id)
        if pfik:
            issues_by_parent[pfik].append(issue_id)

    # Any issue that refers to a changed key (via either field) needs a re-download.
    needs_re_download = set()
    for old_key in changed_keys:
        needs_re_download.update(issues_by_epic_link.get(old_key, []))
        needs_re_download.update(issues_by_parent.get(old_key, []))
    return needs_re_download
def get_all_pages(self, url, rate_limit_realm=None, ignore404=False):
    """Generator over a paginated API: yields individual values, following
    each page's 'next' link until there are no more pages or no more values.

    When ignore404 is True, a 404 on any page ends the iteration quietly
    instead of raising.
    """
    buffered = deque()
    while True:
        # Refill the buffer from the next page whenever it runs dry.
        while not buffered:
            if not url:
                return  # exhausted the current page and there's no next page
            try:
                page = self.get_json(url, rate_limit_realm)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404 and ignore404:
                    agent_logging.log_and_print(
                        logger, logging.INFO, f'Caught a 404 for {url} - ignoring',
                    )
                    return
                raise
            buffered.extend(page.get('values', []))
            if not buffered:
                return  # no new values returned
            url = page.get('next')
        yield buffered.popleft()
def download_users(jira_connection, gdpr_active, quiet=False):
    """Fetch all visible Jira users.

    Falls back to a letter-by-letter search when the instance appears to cap
    results at roughly one page (~1000 users).  Raises RuntimeError when no
    users at all are visible (usually a permissions problem).
    """
    if not quiet:
        print('downloading jira users... ', end='', flush=True)

    users = _search_all_users(jira_connection, gdpr_active)

    # Some jira instances won't return more than one page of results.  If we
    # have seen approximately 1000 results, try searching a different way.
    if 950 <= len(users) <= 1000:
        agent_logging.log_and_print(
            logger=logger,
            level=logging.INFO,
            msg=f'Page limit reached with {len(users)} users, '
            'falling back to search by letter method.',
        )
        users = _users_by_letter(jira_connection, gdpr_active)

    if not users:
        raise RuntimeError(
            'The agent is unable to see any users. Please verify that this user has the "browse all users" permission.'
        )

    if not quiet:
        print('✓')
    return users
def get_raw_result(self, url, rate_limit_realm=None):
    """GET `url` through the rate limiter, returning the raw response.

    On HTTP 429 (rate limited despite our own throttling) retries every 30
    seconds; after an hour of retrying, logs error 3151 and re-raises.  Any
    other HTTP error propagates immediately.
    """
    started_at = datetime.utcnow()
    while True:
        try:
            with self.rate_limiter.limit(rate_limit_realm):
                response = self.session.get(url)
                response.raise_for_status()
                return response
        except requests.exceptions.HTTPError as e:
            if e.response.status_code != 429:
                raise
            # rate-limited in spite of trying to throttle requests.  We don't
            # know how long we need to wait, so just try in 30 seconds,
            # unless it's already been too long
            if (datetime.utcnow() - started_at) >= timedelta(hours=1):
                agent_logging.log_and_print_error_or_warning(
                    logger, logging.ERROR, error_code=3151
                )
                raise
            agent_logging.log_and_print(
                logger, logging.INFO, 'Retrying in 30 seconds...',
            )
            time.sleep(30)
def limit(self, realm):
    """Generator-based context manager guarding a rate-limited call for `realm`.

    NOTE(review): this is written as a @contextmanager-style generator (single
    yield, callers use `with self.rate_limiter.limit(realm):`) — the decorator
    is presumably applied where this method is defined; confirm in the full file.

    Looks up (max_calls, period_secs) for the realm, and either records the
    call and yields, or sleeps until a call slot frees up.  Raises after
    self.timeout_secs of waiting, and logs error 3010 if the wrapped request
    still gets a 429 despite our throttling.
    """
    # if realm is None, don't rate limit, just execute the thing
    if realm is None:
        yield
        return

    max_calls, period_secs = self.realm_config[realm]
    start = datetime.utcnow()
    while True:
        # decide whether to sleep or call, inside the lock
        with self.lock:
            sleep_until, calls_made = self._call_available(realm, max_calls)
            if not sleep_until:
                # A slot is free: record this call before releasing the lock so
                # concurrent callers see it.
                self._record_call(realm, period_secs)

        if not sleep_until:
            try:
                # stuff within the context manager happens here
                yield
                return
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    # Got rate limited anyway!
                    agent_logging.log_and_print_error_or_warning(
                        logger,
                        logging.ERROR,
                        msg_args=[calls_made, max_calls, realm],
                        error_code=3010,
                    )
                raise

        # No slot free: announce, then either time out or sleep until a slot opens.
        agent_logging.log_and_print(
            logger,
            logging.INFO,
            f'Rate limiter: exceeded {max_calls} calls in {period_secs} seconds for {realm}!',
        )
        if (sleep_until - start) >= timedelta(seconds=self.timeout_secs):
            agent_logging.log_and_print_error_or_warning(
                logger, logging.ERROR, msg_args=[self.timeout_secs], error_code=3020)
            raise Exception('Rate limit timeout')

        sleep_period_secs = (sleep_until - datetime.utcnow()).total_seconds()
        if sleep_period_secs > 0:  # it's possible that sleep_until was a couple ms ago
            agent_logging.log_and_print(
                logger,
                logging.INFO,
                f'Sleeping for {sleep_period_secs:.1f} secs ({sleep_period_secs / 60.0:.1f} mins)',
            )
            time.sleep(sleep_period_secs)
def download_data(config, creds, endpoint_jira_info, endpoint_git_instances_info):
    """Run the Jira download (when configured) and then a download for each
    configured git instance; returns the list of per-download status objects."""
    statuses = []

    if config.jira_url:
        agent_logging.log_and_print(
            logger, logging.INFO, 'Obtained Jira configuration, attempting download...',
        )
        jira_connection = get_basic_jira_connection(config, creds)
        if config.run_mode_is_print_all_jira_fields:
            print_all_jira_fields(config, jira_connection)
        statuses.append(load_and_dump_jira(config, endpoint_jira_info, jira_connection))

    multiple_git_instances = len(config.git_configs) > 1
    for git_config in config.git_configs:
        agent_logging.log_and_print(
            logger,
            logging.INFO,
            f'Obtained {git_config.git_provider} configuration, attempting download...',
        )

        if multiple_git_instances:
            slug = git_config.git_instance_slug
            instance_info = endpoint_git_instances_info.get(slug)
            instance_creds = creds.git_instance_to_creds.get(slug)
        else:
            # support legacy single-git support, which assumes only one available git instance
            instance_info = list(endpoint_git_instances_info.values())[0]
            instance_creds = list(creds.git_instance_to_creds.values())[0]

        git_connection = get_git_client(
            git_config, instance_creds, skip_ssl_verification=config.skip_ssl_verification
        )
        statuses.append(
            load_and_dump_git(
                config=git_config,
                endpoint_git_instance_info=instance_info,
                outdir=config.outdir,
                compress_output_files=config.compress_output_files,
                git_connection=git_connection,
            )
        )

    return statuses
def get_repos(
    self, normalized_projects: List[NormalizedProject],
) -> List[NormalizedRepository]:
    """Download all Bitbucket repos for the given projects, applying the
    configured repo- and project-level allow/deny lists.

    Raises ValueError when no repos survive filtering — usually a token
    permission problem or a misconfigured include/exclude list.
    """
    print('downloading bitbucket repos... ', end='', flush=True)

    # Fix: these lowered filter sets are invariant across repos; build them
    # once instead of rebuilding them inside the per-repo loop.
    git_include_repos_lowered = set(n.lower() for n in (self.config.git_include_repos or []))
    git_exclude_repos_lowered = set(n.lower() for n in (self.config.git_exclude_repos or []))

    repos = []
    for p in normalized_projects:
        for api_repo in tqdm(
            self.client.get_all_repos(p.id),
            desc=f'downloading repos for {p.name}',
            unit='repos',
        ):
            # If we have an explicit repo allow list and this isn't in it, skip
            if self.config.git_include_repos and (
                api_repo['name'].lower() not in git_include_repos_lowered
                and api_repo['uuid'].lower() not in git_include_repos_lowered
            ):
                if self.config.git_verbose:
                    agent_logging.log_and_print(
                        logger,
                        logging.INFO,
                        f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because it's not in git_include_repos''',
                    )
                continue

            # If we have an explicit repo deny list and this is in it, skip
            if self.config.git_exclude_repos and (
                api_repo['name'].lower() in git_exclude_repos_lowered
                or api_repo['uuid'].lower() in git_exclude_repos_lowered
            ):
                if self.config.git_verbose:
                    agent_logging.log_and_print(
                        logger,
                        logging.INFO,
                        f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because it's in git_exclude_repos''',
                    )
                continue

            # If this repo is in a project, apply project filters:
            repo_project = api_repo.get('project')
            if repo_project:
                # If we have a project allow list and this repo is in a project that's not in it, skip
                if (
                    self.config.git_include_bbcloud_projects
                    and repo_project['key'] not in self.config.git_include_bbcloud_projects
                    and repo_project['uuid'] not in self.config.git_include_bbcloud_projects
                ):
                    if self.config.git_verbose:
                        agent_logging.log_and_print(
                            logger,
                            logging.INFO,
                            f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because its project '''
                            f'''("{repo_project['key']}"/{repo_project['uuid']}) is not in git_include_bbcloud_projects''',
                        )
                    continue

                # if we have a project deny list and this repo is in a project that's in it, skip
                if self.config.git_exclude_bbcloud_projects and (
                    repo_project['key'] in self.config.git_exclude_bbcloud_projects
                    or repo_project['uuid'] in self.config.git_exclude_bbcloud_projects
                ):
                    if self.config.git_verbose:
                        agent_logging.log_and_print(
                            logger,
                            logging.INFO,
                            f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because its project '''
                            f'''("{repo_project['key']}"/{repo_project['uuid']}) is in git_exclude_bbcloud_projects''',
                        )
                    continue

            branches = self.get_branches(p, api_repo)
            repos.append(
                _normalize_repo(api_repo, branches, p, self.config.git_redact_names_and_urls)
            )

    print('✓')
    if not repos:
        raise ValueError(
            'No repos found. Make sure your token has appropriate access to Bitbucket and check your configuration of repos to pull.'
        )
    return repos
def get_pull_requests(
    self, normalized_repos: List[NormalizedRepository], server_git_instance_info,
) -> List[NormalizedPullRequest]:
    # NOTE(review): despite the List[...] annotation, this is a generator — it
    # yields NormalizedPullRequest objects one at a time.
    """Yield normalized pull requests for each repo, newest-first per repo.

    Per-repo and per-PR failures are logged (error codes 3021 / 3011) and
    skipped so one bad repo or PR doesn't abort the whole download.  PRs with
    missing source/destination repository data are skipped with warning 3030.
    """
    print('downloading bitbucket prs... ', end='', flush=True)
    for i, repo in enumerate(
        tqdm(normalized_repos, desc='downloading prs for repos', unit='repos'), start=1
    ):
        with agent_logging.log_loop_iters(logger, 'repo for pull requests', i, 1):
            try:
                # Cutoff date: PRs updated before this are too old to pull.
                pull_since = pull_since_date_for_repo(
                    server_git_instance_info, repo.project.login, repo.id, 'prs'
                )
                api_prs = self.client.get_pullrequests(repo.project.id, repo.id)
                if not api_prs:
                    agent_logging.log_and_print(
                        logger, logging.INFO, f'no prs found for repo {repo.id}. Skipping... '
                    )
                    continue
                for api_pr in tqdm(api_prs, desc=f'processing prs for {repo.name}', unit='prs'):
                    try:
                        # Skip PRs with missing data
                        if (
                            'source' not in api_pr
                            or 'repository' not in api_pr['source']
                            or not api_pr['source']['repository']
                            or 'destination' not in api_pr
                            or 'repository' not in api_pr['destination']
                            or not api_pr['destination']['repository']
                        ):
                            agent_logging.log_and_print_error_or_warning(
                                logger, logging.WARN, msg_args=[api_pr['id']], error_code=3030
                            )
                            continue
                        yield _normalize_pr(
                            self.client,
                            repo,
                            api_pr,
                            self.config.git_strip_text_content,
                            self.config.git_redact_names_and_urls,
                        )
                        # PRs are ordered newest to oldest if this
                        # is too old, we're done with this repo. We
                        # yield one old one on purpose so that we
                        # handle the case correctly when the most
                        # recent PR is really old.
                        if pull_since and parser.parse(api_pr['updated_on']) < pull_since:
                            break
                    except Exception:
                        # if something happens when normalizing a PR, just keep going with the rest
                        agent_logging.log_and_print_error_or_warning(
                            logger,
                            logging.ERROR,
                            msg_args=[api_pr["id"], repo.id],
                            error_code=3011,
                            exc_info=True,
                        )
            except Exception:
                # if something happens when pulling PRs for a repo, just keep going.
                agent_logging.log_and_print_error_or_warning(
                    logger, logging.ERROR, msg_args=[repo.id], error_code=3021, exc_info=True,
                )
    print('✓')
def get_repos(
    self, normalized_projects: List[NormalizedProject],
) -> List[NormalizedRepository]:
    """Download repos for each GitLab group, honoring the configured
    include/exclude ID lists and tolerating per-repo permission failures
    (which are collected and logged with warning code 2201).

    Raises ValueError if no repos could be downloaded at all.
    """
    print('downloading gitlab repos... ', end='', flush=True)

    nrm_repos: List[NormalizedRepository] = []
    for nrm_project in normalized_projects:
        repos_that_failed_to_download = []

        # Fix: the original bound an enumerate index it never used; iterate directly.
        for api_repo in tqdm(
            self.client.list_group_projects(nrm_project.id),
            desc=f'downloading repos for {nrm_project.name}',
            unit='repos',
        ):
            # For GitLab, git_include_repos holds IDs instead of names (probably
            # unintentionally), so no need to be case insensitive
            if (
                self.config.git_include_repos
                and api_repo.id not in self.config.git_include_repos
            ):
                if self.config.git_verbose:
                    agent_logging.log_and_print(
                        logger,
                        logging.INFO,
                        f'skipping repo {api_repo.id} because not in include_repos...',
                    )
                continue  # skip this repo

            # For GitLab, git_exclude_repos holds IDs instead of names (probably
            # unintentionally), so no need to be case insensitive
            if (
                self.config.git_exclude_repos
                and api_repo.id in self.config.git_exclude_repos
            ):
                if self.config.git_verbose:
                    agent_logging.log_and_print(
                        logger,
                        logging.INFO,
                        f'skipping repo {api_repo.id} because in exclude_repos...',
                    )
                continue  # skip this repo

            try:
                nrm_branches = self.get_branches(api_repo)
            except gitlab.exceptions.GitlabListError:
                # this is likely due to fine-tuned permissions defined on the repo
                # (gitlab project) that is not allowing us to access its repo
                # details. if this happens, make a note of it and don't blow up
                # the rest of the pull
                repos_that_failed_to_download.append(api_repo)
                continue  # skip this repo

            nrm_repos.append(
                _normalize_repo(
                    api_repo, nrm_branches, nrm_project, self.config.git_redact_names_and_urls
                )
            )

        # if there were any repositories we had issues with... print them out now.
        if repos_that_failed_to_download:

            def _repo_log_string(api_repo):
                # Fix: use str(...) rather than invoking dict.__str__() directly;
                # output is identical.
                name = (
                    api_repo.name
                    if not self.config.git_redact_names_and_urls
                    else _repo_redactor.redact_name(api_repo.name)
                )
                return str({"id": api_repo.id, "name": name})

            repos_failed_string = ", ".join(
                _repo_log_string(api_repo) for api_repo in repos_that_failed_to_download
            )
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[
                    len(repos_that_failed_to_download),
                    nrm_project.id,
                    repos_failed_string,
                ],
                error_code=2201,
            )

    print('✓')
    if not nrm_repos:
        raise ValueError(
            'No repos found. Make sure your token has appropriate access to GitLab and check your configuration of repos to pull.'
        )
    return nrm_repos
def get_pull_requests(
    self, normalized_repos: List[NormalizedRepository], server_git_instance_info,
) -> List[NormalizedPullRequest]:
    # NOTE(review): despite the List[...] annotation, this is a generator — it
    # yields NormalizedPullRequest objects one at a time.
    """Yield normalized merge requests for each repo, newest-first, stopping
    per-repo once MRs are older than the pull-since cutoff.

    Per-repo and per-MR failures are logged and skipped so one bad repo or MR
    doesn't abort the whole download.
    """
    print('downloading gitlab prs... ', end='', flush=True)
    for i, nrm_repo in enumerate(normalized_repos, start=1):
        print(f'downloading prs for repo {nrm_repo.name} ({nrm_repo.id})')
        with agent_logging.log_loop_iters(logger, 'repo for pull requests', i, 1):
            try:
                pull_since = pull_since_date_for_repo(
                    server_git_instance_info, nrm_repo.project.login, nrm_repo.id, 'prs'
                )
                api_prs = self.client.list_project_merge_requests(nrm_repo.id)
                if not api_prs or not api_prs.total:
                    agent_logging.log_and_print(
                        logger, logging.WARNING, f"No PRs returned for repo {nrm_repo.id}"
                    )
                    continue

                for api_pr in tqdm(
                    api_prs,
                    desc=f'processing prs for {nrm_repo.name} ({nrm_repo.id})',
                    unit='prs',
                    total=api_prs.total,
                ):
                    try:
                        updated_at = parser.parse(api_pr.updated_at)
                        # PRs are ordered newest to oldest
                        # if this is too old, we're done with this repo
                        if pull_since and updated_at < pull_since:
                            break

                        try:
                            api_pr = self.client.expand_merge_request_data(api_pr)
                        except MissingSourceProjectException as e:
                            log_and_print_request_error(
                                e,
                                f'fetching source project {api_pr.source_project_id} '
                                f'for merge_request {api_pr.id}. Skipping...',
                            )
                            continue

                        nrm_commits: List[NormalizedCommit] = [
                            _normalize_commit(
                                commit,
                                nrm_repo,
                                api_pr.target_branch,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )
                            for commit in api_pr.commit_list
                        ]

                        # Fix: reuse the already-expanded merge request instead of
                        # calling expand_merge_request_data() a second time — the
                        # duplicate call doubled the API requests per MR.
                        merge_commit = None
                        if (
                            api_pr.state == 'merged'
                            and nrm_commits is not None
                            and api_pr.merge_commit_sha
                        ):
                            merge_commit = _normalize_commit(
                                self.client.get_project_commit(
                                    api_pr.project_id, api_pr.merge_commit_sha
                                ),
                                nrm_repo,
                                api_pr.target_branch,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )

                        yield _normalize_pr(
                            api_pr,
                            nrm_commits,
                            self.config.git_strip_text_content,
                            self.config.git_redact_names_and_urls,
                            merge_commit,
                        )
                    except Exception as e:
                        # if something goes wrong with normalizing one of the prs -
                        # don't stop pulling. try the next one.
                        pr_id = f' {api_pr.id}' if api_pr else ''
                        log_and_print_request_error(
                            e,
                            f'normalizing PR {pr_id} from repo {nrm_repo.name} ({nrm_repo.id}). Skipping...',
                            log_as_exception=True,
                        )
            except Exception as e:
                # if something happens when pulling PRs for a repo, just keep going.
                log_and_print_request_error(
                    e,
                    f'getting PRs for repo {nrm_repo.name} ({nrm_repo.id}). Skipping...',
                    log_as_exception=True,
                )