Example #1
def detect_issues_needing_re_download(
    downloaded_issue_info, issue_metadata_from_jellyfish, issue_metadata_addl_from_jellyfish
):
    issue_keys_changed = []
    for issue_id_str, issue_key in downloaded_issue_info:
        existing_metadata = issue_metadata_from_jellyfish.get(int(issue_id_str))
        if existing_metadata and issue_key != existing_metadata.key:
            agent_logging.log_and_print(
                logger,
                logging.INFO,
                f'Detected a key change for issue {issue_id_str} ({existing_metadata.key} -> {issue_key})',
            )
            issue_keys_changed.append(existing_metadata.key)

    issues_by_elfik, issues_by_pfik = defaultdict(list), defaultdict(list)
    for issue_id, (elfik, pfik) in issue_metadata_addl_from_jellyfish.items():
        if elfik:
            issues_by_elfik[elfik].append(issue_id)
        if pfik:
            issues_by_pfik[pfik].append(issue_id)

    # Find all of the issues that refer to those issues through epic_link_field_issue_key
    # or parent_field_issue_key; these issues need to be re-downloaded
    issue_ids_needing_re_download = set()
    for changed_key in issue_keys_changed:
        issue_ids_needing_re_download.update(issues_by_elfik.get(changed_key, []))
        issue_ids_needing_re_download.update(issues_by_pfik.get(changed_key, []))

    return issue_ids_needing_re_download
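
A minimal usage sketch with hypothetical data, to make the expected argument shapes concrete: downloaded_issue_info is an iterable of (issue_id_str, issue_key) pairs, issue_metadata_from_jellyfish maps an int issue id to a metadata object with a .key attribute, and issue_metadata_addl_from_jellyfish maps an issue id to an (epic_link_field_issue_key, parent_field_issue_key) tuple. The namedtuple and sample values below are illustrative only; the module's own imports (defaultdict, agent_logging, logger) are assumed to be in scope.

from collections import namedtuple

# Stand-in for the real Jellyfish metadata object; only .key is needed here.
IssueMetadata = namedtuple('IssueMetadata', ['key'])

downloaded = [('10001', 'PROJ-2'), ('10002', 'PROJ-3')]            # (id_str, current key)
existing = {10001: IssueMetadata('OLD-1'), 10002: IssueMetadata('PROJ-3')}
addl = {10003: ('OLD-1', None), 10004: (None, 'OLD-1')}            # id -> (elfik, pfik)

# 10003 and 10004 reference the renamed issue via epic link / parent field,
# so both should come back as needing re-download.
assert detect_issues_needing_re_download(downloaded, existing, addl) == {10003, 10004}
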
    def get_all_pages(self, url, rate_limit_realm=None, ignore404=False):
        current_page_values = deque()
        while True:
            if not current_page_values:
                if not url:
                    return  # exhausted the current page and there's no next page

                try:
                    page = self.get_json(url, rate_limit_realm)
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 404 and ignore404:
                        agent_logging.log_and_print(
                            logger, logging.INFO, f'Caught a 404 for {url} - ignoring',
                        )
                        return
                    raise

                if 'values' in page:
                    current_page_values.extend(page['values'])
                    if not current_page_values:
                        return  # no new values returned

                url = page['next'] if 'next' in page else None

            yield current_page_values.popleft()
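
A hedged consumption sketch for the paginator above: `client` stands for an instance of the session class that defines get_all_pages, and the URL and realm values are illustrative only. It simply iterates every value across all pages of a Bitbucket-style endpoint that returns {'values': [...], 'next': <url>} payloads.

# Illustrative only; `client` is assumed to be an instance of the class above.
repos_url = 'https://api.bitbucket.org/2.0/repositories/some-workspace'
for repo in client.get_all_pages(repos_url, rate_limit_realm='bitbucket', ignore404=True):
    print(repo['slug'])
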
Example #3
def download_users(jira_connection, gdpr_active, quiet=False):
    if not quiet:
        print('downloading jira users... ', end='', flush=True)

    jira_users = _search_all_users(jira_connection, gdpr_active)

    # Some jira instances won't return more than one page of
    # results.  If we have seen approximately 1000 results, try
    # searching a different way
    if 950 <= len(jira_users) <= 1000:
        agent_logging.log_and_print(
            logger=logger,
            level=logging.INFO,
            msg=f'Page limit reached with {len(jira_users)} users, '
            'falling back to search by letter method.',
        )
        jira_users = _users_by_letter(jira_connection, gdpr_active)

    if len(jira_users) == 0:
        raise RuntimeError(
            'The agent is unable to see any users. Please verify that this user has the "browse all users" permission.'
        )

    if not quiet:
        print('✓')
    return jira_users
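
The _users_by_letter fallback is not shown in this excerpt; the helper below is only a sketch of the idea (one search per leading character, de-duplicated), assuming a jira-python connection whose search_users accepts a name fragment. It is not the agent's actual implementation.

import string

def _users_by_letter_sketch(jira_connection):
    # Hypothetical: query once per leading character and de-duplicate, since a
    # single search caps out around 1000 results on some Jira instances.
    users_by_id = {}
    for letter in string.ascii_lowercase:
        for user in jira_connection.search_users(letter, maxResults=1000):
            # Jira Cloud users expose accountId; Server users expose key.
            users_by_id[getattr(user, 'accountId', None) or user.key] = user
    return list(users_by_id.values())
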
    def get_raw_result(self, url, rate_limit_realm=None):
        start = datetime.utcnow()
        while True:
            try:
                with self.rate_limiter.limit(rate_limit_realm):
                    result = self.session.get(url)
                    result.raise_for_status()
                    return result
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    # rate-limited in spite of trying to throttle requests.
                    # We don't know how long we need to wait, so just retry
                    # in 30 seconds, unless it's already been too long
                    if (datetime.utcnow() - start) < timedelta(hours=1):
                        agent_logging.log_and_print(
                            logger, logging.INFO, 'Retrying in 30 seconds...',
                        )
                        time.sleep(30)
                        continue
                    else:
                        agent_logging.log_and_print_error_or_warning(
                            logger, logging.ERROR, error_code=3151
                        )
                raise
Example #5
    def limit(self, realm):
        # if realm is None, don't rate limit, just execute the thing
        if realm is None:
            yield
            return

        max_calls, period_secs = self.realm_config[realm]
        start = datetime.utcnow()
        while True:
            # decide whether to sleep or call, inside the lock
            with self.lock:
                sleep_until, calls_made = self._call_available(
                    realm, max_calls)
                if not sleep_until:
                    self._record_call(realm, period_secs)

            if not sleep_until:
                try:
                    # stuff within the context manager happens here
                    yield
                    return
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 429:
                        # Got rate limited anyway!
                        agent_logging.log_and_print_error_or_warning(
                            logger,
                            logging.ERROR,
                            msg_args=[calls_made, max_calls, realm],
                            error_code=3010,
                        )
                    raise

            agent_logging.log_and_print(
                logger,
                logging.INFO,
                f'Rate limiter: exceeded {max_calls} calls in {period_secs} seconds for {realm}!',
            )
            if (sleep_until - start) >= timedelta(seconds=self.timeout_secs):
                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.ERROR,
                    msg_args=[self.timeout_secs],
                    error_code=3020)
                raise Exception('Rate limit timeout')

            sleep_period_secs = (sleep_until -
                                 datetime.utcnow()).total_seconds()
            if sleep_period_secs > 0:  # it's possible that sleep_until was a couple ms ago
                agent_logging.log_and_print(
                    logger,
                    logging.INFO,
                    f'Sleeping for {sleep_period_secs:.1f} secs ({sleep_period_secs / 60.0:.1f} mins)',
                )
                time.sleep(sleep_period_secs)
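
Hypothetical wiring for the limiter above. Because limit() yields, it is presumably decorated with contextlib.contextmanager so it can be used in a with-statement; the class name `RateLimiter`, its constructor arguments, and the realm values below are assumptions for illustration, with realm_config taken to map realm -> (max_calls, period_secs) to match the unpacking in limit().

import requests

# Assumed constructor: realm_config maps realm name -> (max_calls, period_secs).
limiter = RateLimiter(realm_config={'jira': (100, 60)}, timeout_secs=3600)
session = requests.Session()

# limit() throttles the block: it either records the call and yields, or
# sleeps until a call slot frees up (raising after timeout_secs).
with limiter.limit('jira'):
    resp = session.get('https://example.atlassian.net/rest/api/2/myself')
    resp.raise_for_status()
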
Example #6
def download_data(config, creds, endpoint_jira_info, endpoint_git_instances_info):
    download_data_status = []

    if config.jira_url:
        agent_logging.log_and_print(
            logger, logging.INFO, 'Obtained Jira configuration, attempting download...',
        )
        jira_connection = get_basic_jira_connection(config, creds)
        if config.run_mode_is_print_all_jira_fields:
            print_all_jira_fields(config, jira_connection)
        download_data_status.append(load_and_dump_jira(config, endpoint_jira_info, jira_connection))

    is_multi_git_config = len(config.git_configs) > 1
    for git_config in config.git_configs:
        agent_logging.log_and_print(
            logger,
            logging.INFO,
            f'Obtained {git_config.git_provider} configuration, attempting download...',
        )
        if is_multi_git_config:
            instance_slug = git_config.git_instance_slug
            instance_info = endpoint_git_instances_info.get(instance_slug)
            instance_creds = creds.git_instance_to_creds.get(instance_slug)
        else:
            # support legacy single-git support, which assumes only one available git instance
            instance_info = list(endpoint_git_instances_info.values())[0]
            instance_creds = list(creds.git_instance_to_creds.values())[0]

        git_connection = get_git_client(
            git_config, instance_creds, skip_ssl_verification=config.skip_ssl_verification
        )

        download_data_status.append(
            load_and_dump_git(
                config=git_config,
                endpoint_git_instance_info=instance_info,
                outdir=config.outdir,
                compress_output_files=config.compress_output_files,
                git_connection=git_connection,
            )
        )

    return download_data_status
    def get_repos(
        self, normalized_projects: List[NormalizedProject],
    ) -> List[NormalizedRepository]:
        print('downloading bitbucket repos... ', end='', flush=True)

        repos = []
        for p in normalized_projects:
            for i, api_repo in enumerate(
                tqdm(
                    self.client.get_all_repos(p.id),
                    desc=f'downloading repos for {p.name}',
                    unit='repos',
                )
            ):
                # If we have an explicit repo allow list and this isn't in it, skip
                if self.config.git_include_repos:
                    git_include_repos_lowered = set(
                        n.lower() for n in self.config.git_include_repos
                    )
                    if (
                        api_repo['name'].lower() not in git_include_repos_lowered
                        and api_repo['uuid'].lower() not in git_include_repos_lowered
                    ):
                        if self.config.git_verbose:
                            agent_logging.log_and_print(
                                logger,
                                logging.INFO,
                                f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because it's not in git_include_repos''',
                            )
                        continue

                # If we have an explicit repo deny list and this is in it, skip
                if self.config.git_exclude_repos:
                    git_exclude_repos_lowered = set(
                        n.lower() for n in self.config.git_exclude_repos
                    )
                    if (
                        api_repo['name'].lower() in git_exclude_repos_lowered
                        or api_repo['uuid'].lower() in git_exclude_repos_lowered
                    ):
                        if self.config.git_verbose:
                            agent_logging.log_and_print(
                                logger,
                                logging.INFO,
                                f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because it's in git_exclude_repos''',
                            )
                        continue

                # If this repo is in a project, apply project filters:
                repo_project = api_repo.get('project')
                if repo_project:
                    # If we have a project allow list and this repo is in a project that's not in it, skip
                    if (
                        self.config.git_include_bbcloud_projects
                        and repo_project['key'] not in self.config.git_include_bbcloud_projects
                        and repo_project['uuid'] not in self.config.git_include_bbcloud_projects
                    ):
                        if self.config.git_verbose:
                            agent_logging.log_and_print(
                                logger,
                                logging.INFO,
                                f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because its project '''
                                f'''("{repo_project['key']}"/{repo_project['uuid']}) is not in git_include_bbcloud_projects''',
                            )
                        continue

                    # if we have a project deny list and this repo is in a project that's in it, skip
                    if self.config.git_exclude_bbcloud_projects and (
                        repo_project['key'] in self.config.git_exclude_bbcloud_projects
                        or repo_project['uuid'] in self.config.git_exclude_bbcloud_projects
                    ):
                        if self.config.git_verbose:
                            agent_logging.log_and_print(
                                logger,
                                logging.INFO,
                                f'''Skipping repo "{api_repo['name']}" ({api_repo['uuid']}) because its project '''
                                f'''("{repo_project['key']}"/{repo_project['uuid']}) is in git_exclude_bbcloud_projects''',
                            )
                        continue

                branches = self.get_branches(p, api_repo)
                repos.append(
                    _normalize_repo(api_repo, branches, p, self.config.git_redact_names_and_urls)
                )

        print('✓')
        if not repos:
            raise ValueError(
                'No repos found. Make sure your token has appropriate access to Bitbucket and check your configuration of repos to pull.'
            )

        return repos
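
The allow/deny checks above repeat the same case-insensitive name/uuid membership test. A small helper along these lines (hypothetical, not part of the original code) captures that pattern:

def _should_skip_repo_sketch(api_repo, include_repos, exclude_repos):
    # Hypothetical helper: returns True when the repo should be skipped based
    # on case-insensitive name/uuid matching against an allow list (skip if
    # present and no match) and a deny list (skip if present and matched).
    identifiers = {api_repo['name'].lower(), api_repo['uuid'].lower()}
    if include_repos and identifiers.isdisjoint(n.lower() for n in include_repos):
        return True
    if exclude_repos and not identifiers.isdisjoint(n.lower() for n in exclude_repos):
        return True
    return False
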
    def get_pull_requests(
        self, normalized_repos: List[NormalizedRepository], server_git_instance_info,
    ) -> List[NormalizedPullRequest]:
        print('downloading bitbucket prs... ', end='', flush=True)
        for i, repo in enumerate(
            tqdm(normalized_repos, desc='downloading prs for repos', unit='repos'), start=1
        ):
            with agent_logging.log_loop_iters(logger, 'repo for pull requests', i, 1):
                try:
                    pull_since = pull_since_date_for_repo(
                        server_git_instance_info, repo.project.login, repo.id, 'prs'
                    )

                    api_prs = self.client.get_pullrequests(repo.project.id, repo.id)

                    if not api_prs:
                        agent_logging.log_and_print(
                            logger, logging.INFO, f'no prs found for repo {repo.id}. Skipping... '
                        )
                        continue

                    for api_pr in tqdm(api_prs, desc=f'processing prs for {repo.name}', unit='prs'):
                        try:
                            # Skip PRs with missing data
                            if (
                                'source' not in api_pr
                                or 'repository' not in api_pr['source']
                                or not api_pr['source']['repository']
                                or 'destination' not in api_pr
                                or 'repository' not in api_pr['destination']
                                or not api_pr['destination']['repository']
                            ):
                                agent_logging.log_and_print_error_or_warning(
                                    logger, logging.WARN, msg_args=[api_pr['id']], error_code=3030
                                )
                                continue

                            yield _normalize_pr(
                                self.client,
                                repo,
                                api_pr,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )

                            # PRs are ordered newest to oldest; if this one is
                            # too old, we're done with this repo. We yield one
                            # old one on purpose so that we handle the case
                            # correctly when the most recent PR is really old.
                            if pull_since and parser.parse(api_pr['updated_on']) < pull_since:
                                break

                        except Exception:
                            # if something happens when normalizing a PR, just keep going with the rest
                            agent_logging.log_and_print_error_or_warning(
                                logger,
                                logging.ERROR,
                                msg_args=[api_pr["id"], repo.id],
                                error_code=3011,
                                exc_info=True,
                            )

                except Exception:
                    # if something happens when pulling PRs for a repo, just keep going.
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, msg_args=[repo.id], error_code=3021, exc_info=True,
                    )

        print('✓')
Example #9
    def get_repos(
        self,
        normalized_projects: List[NormalizedProject],
    ) -> List[NormalizedRepository]:
        print('downloading gitlab repos... ', end='', flush=True)

        nrm_repos: List[NormalizedRepository] = []
        for nrm_project in normalized_projects:

            repos_that_failed_to_download = []

            for i, api_repo in enumerate(
                    tqdm(
                        self.client.list_group_projects(nrm_project.id),
                        desc=f'downloading repos for {nrm_project.name}',
                        unit='repos',
                    ),
                    start=1,
            ):
                if (self.config.git_include_repos
                        # For GitLab, git_include_repos holds IDs instead of names (probably unintentionally), so
                        # no need to be case insensitive
                        and api_repo.id not in self.config.git_include_repos):
                    if self.config.git_verbose:
                        agent_logging.log_and_print(
                            logger,
                            logging.INFO,
                            f'skipping repo {api_repo.id} because not in include_repos...',
                        )
                    continue  # skip this repo

                if (self.config.git_exclude_repos
                        # For GitLab, git_exclude_repos holds IDs instead of names (probably unintentionally), so
                        # no need to be case insensitive
                        and api_repo.id in self.config.git_exclude_repos):
                    if self.config.git_verbose:
                        agent_logging.log_and_print(
                            logger,
                            logging.INFO,
                            f'skipping repo {api_repo.id} because in exclude_repos...',
                        )
                    continue  # skip this repo

                try:
                    nrm_branches = self.get_branches(api_repo)
                except gitlab.exceptions.GitlabListError:
                    # this is likely due to fine-tuned permissions defined on the repo (gitlab project)
                    # that is not allowing us to access to its repo details. if this happens, make a note of it and
                    # don't blow up the rest of the pull
                    repos_that_failed_to_download.append(api_repo)
                    continue  # skip this repo

                nrm_repos.append(
                    _normalize_repo(api_repo, nrm_branches, nrm_project,
                                    self.config.git_redact_names_and_urls))

            # if there were any repositories we had issues with... print them out now.
            if repos_that_failed_to_download:

                def __repo_log_string(api_repo):
                    # build log string
                    name = (api_repo.name
                            if not self.config.git_redact_names_and_urls else
                            _repo_redactor.redact_name(api_repo.name))
                    return {"id": api_repo.id, "name": name}.__str__()

                repos_failed_string = ", ".join([
                    __repo_log_string(api_repo)
                    for api_repo in repos_that_failed_to_download
                ])
                total_failed = len(repos_that_failed_to_download)

                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.WARNING,
                    msg_args=[
                        total_failed, nrm_project.id, repos_failed_string
                    ],
                    error_code=2201,
                )

        print('✓')
        if not nrm_repos:
            raise ValueError(
                'No repos found. Make sure your token has appropriate access to GitLab and check your configuration of repos to pull.'
            )
        return nrm_repos
Example #10
    def get_pull_requests(
        self,
        normalized_repos: List[NormalizedRepository],
        server_git_instance_info,
    ) -> List[NormalizedPullRequest]:
        print('downloading gitlab prs... ', end='', flush=True)

        for i, nrm_repo in enumerate(normalized_repos, start=1):
            print(f'downloading prs for repo {nrm_repo.name} ({nrm_repo.id})')

            with agent_logging.log_loop_iters(logger, 'repo for pull requests',
                                              i, 1):
                try:
                    pull_since = pull_since_date_for_repo(
                        server_git_instance_info, nrm_repo.project.login,
                        nrm_repo.id, 'prs')

                    api_prs = self.client.list_project_merge_requests(
                        nrm_repo.id)

                    if not api_prs or not api_prs.total:
                        agent_logging.log_and_print(
                            logger, logging.WARNING,
                            f"No PRs returned for repo {nrm_repo.id}")
                        continue

                    for api_pr in tqdm(
                            api_prs,
                            desc=f'processing prs for {nrm_repo.name} ({nrm_repo.id})',
                            unit='prs',
                            total=api_prs.total,
                    ):
                        try:
                            updated_at = parser.parse(api_pr.updated_at)

                            # PRs are ordered newest to oldest
                            # if this is too old, we're done with this repo
                            if pull_since and updated_at < pull_since:
                                break

                            try:
                                api_pr = self.client.expand_merge_request_data(
                                    api_pr)
                            except MissingSourceProjectException as e:
                                log_and_print_request_error(
                                    e,
                                    f'fetching source project {api_pr.source_project_id} '
                                    f'for merge_request {api_pr.id}. Skipping...',
                                )
                                continue

                            nrm_commits: List[NormalizedCommit] = [
                                _normalize_commit(
                                    commit,
                                    nrm_repo,
                                    api_pr.target_branch,
                                    self.config.git_strip_text_content,
                                    self.config.git_redact_names_and_urls,
                                ) for commit in api_pr.commit_list
                            ]
                            # api_pr was already expanded above; reuse it rather
                            # than fetching the merge request data a second time
                            merge_request = api_pr
                            merge_commit = None
                            if (merge_request.state == 'merged'
                                    and nrm_commits is not None
                                    and merge_request.merge_commit_sha):
                                merge_commit = _normalize_commit(
                                    self.client.get_project_commit(
                                        merge_request.project_id,
                                        merge_request.merge_commit_sha),
                                    nrm_repo,
                                    api_pr.target_branch,
                                    self.config.git_strip_text_content,
                                    self.config.git_redact_names_and_urls,
                                )

                            yield _normalize_pr(
                                api_pr,
                                nrm_commits,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                                merge_commit,
                            )
                        except Exception as e:
                            # if something goes wrong with normalizing one of the prs - don't stop pulling. try
                            # the next one.
                            pr_id = api_pr.id if api_pr else 'unknown'
                            log_and_print_request_error(
                                e,
                                f'normalizing PR {pr_id} from repo {nrm_repo.name} ({nrm_repo.id}). Skipping...',
                                log_as_exception=True,
                            )

                except Exception as e:
                    # if something happens when pulling PRs for a repo, just keep going.
                    log_and_print_request_error(
                        e,
                        f'getting PRs for repo {nrm_repo.name} ({nrm_repo.id}). Skipping...',
                        log_as_exception=True,
                    )
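
Although the annotation says List[NormalizedPullRequest], both get_pull_requests methods above are generators. A hedged consumption sketch follows; the adapter and sink names are assumptions, not part of the original code.

# Hypothetical caller: stream PRs and persist them one at a time rather than
# materializing the whole list in memory.
for nrm_pr in adapter.get_pull_requests(nrm_repos, server_git_instance_info):
    write_pr_record(nrm_pr)  # hypothetical sink, e.g. append to a JSON-lines file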