Example #1
def test_github_session_ratelimit_reset_sleep(
    caplog,
    requests_ratelimited,
    monkeypatch_sleep_calls,
    num_before_ratelimit,
    num_ratelimit,
    ratelimit_reset,
    github_credentials,
):
    """GitHubSession should handle rate-limit with authentication tokens."""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    github_session = GitHubSession(
        user_agent="GitHub Session Test", credentials=github_credentials
    )

    for _ in range(num_ratelimit):
        github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10")

    # We sleep 1 second every time we change credentials, then we sleep until
    # ratelimit_reset + 1
    expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1]
    assert monkeypatch_sleep_calls == expected_sleep_calls

    found_exhaustion_message = False
    for record in caplog.records:
        if record.levelname == "INFO":
            if "Rate limits exhausted for all tokens" in record.message:
                found_exhaustion_message = True
                break

    assert found_exhaustion_message
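This test relies on fixtures defined elsewhere in the test suite. As a reference, here is a minimal sketch of what `monkeypatch_sleep_calls` could look like, assuming it simply records the duration passed to every `time.sleep` call instead of actually sleeping (the real fixture lives in the project's conftest, so this is an illustration, not the project's code):

import time
from typing import List

import pytest


@pytest.fixture
def monkeypatch_sleep_calls(monkeypatch) -> List[int]:
    """Intercept time.sleep: record each requested duration instead of
    sleeping, and return the list of recorded durations for assertions."""
    sleeps: List[int] = []

    def sleep(seconds: int) -> None:
        # No real sleeping happens; we only keep track of what was asked.
        sleeps.append(seconds)

    monkeypatch.setattr(time, "sleep", sleep)
    return sleeps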
Example #2
def test_github_session_ratelimit_once_recovery(
    caplog,
    requests_ratelimited,
    num_ratelimit,
    monkeypatch_sleep_calls,
    github_credentials,
):
    """GitHubSession should recover from hitting the rate-limit once"""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    github_session = GitHubSession(
        user_agent="GitHub Session Test", credentials=github_credentials
    )

    res = github_session.request(f"{HTTP_GITHUB_API_URL}?per_page=1000&since=10")
    assert res.status_code == 200

    token_users = []
    for record in caplog.records:
        if "Using authentication token" in record.message:
            token_users.append(record.args[0])

    # check that we used one more token than the number of rate-limited requests
    assert len(token_users) == 1 + num_ratelimit

    # check that we slept for one second between our token uses
    assert monkeypatch_sleep_calls == [1]
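Similarly, `requests_ratelimited` comes from the test suite's conftest. A plausible sketch using the `requests_mock` pytest plugin, assuming the fixture answers the requests following the first `num_before_ratelimit` successes with HTTP 429 and an `X-Ratelimit-Reset` header (the fixture names and response bodies here are illustrative):

import pytest


@pytest.fixture
def requests_ratelimited(
    num_before_ratelimit, num_ratelimit, ratelimit_reset, requests_mock
):
    """Mock the GitHub API: requests numbered num_before_ratelimit + 1 up to
    num_before_ratelimit + num_ratelimit get a 429; all others succeed."""
    count = 0

    def callback(request, context):
        nonlocal count
        count += 1
        if num_before_ratelimit < count <= num_before_ratelimit + num_ratelimit:
            # Rate-limited response, carrying the reset timestamp header.
            context.status_code = 429
            context.headers["X-Ratelimit-Reset"] = str(ratelimit_reset)
            return {"message": "API rate limit exceeded"}
        # Successful response; the body is a stand-in for a repository page.
        return [{"id": 1, "html_url": "https://github.com/octocat/Hello-World"}]

    # HTTP_GITHUB_API_URL is the endpoint the tests above request.
    requests_mock.get(HTTP_GITHUB_API_URL, json=callback)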
Example #3
class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
    """List origins from GitHub.

    By default, the lister runs in incremental mode: it lists all repositories,
    starting with the `last_seen_id` stored in the scheduler backend.

    Providing the `first_id` and `last_id` arguments enables "relisting" mode: the
    lister then finds the origins with ids in the range **excluding** `first_id` and
    **including** `last_id`. The lister can overrun `last_id`, as it always records
    all the origins seen in a given page; since the lister is fully idempotent, this
    is not a practical problem. When relisting completes, the lister state stored in
    the scheduler backend is left untouched.

    When the configuration contains a list of credentials, we shuffle that list at
    the beginning of the listing. To comply with GitHub's `abuse rate limit
    policy`_, we keep using the same token until its rate limit runs out; once that
    happens, we switch to the next token in the shuffled list.

    When a request is rate-limited for all tokens, we pause the listing until the
    largest X-Ratelimit-Reset value seen across all tokens has passed.

    When no credentials are set in the lister config, the lister can also run in
    anonymous mode (e.g. for testing purposes).

    .. _abuse rate limit policy: https://developer.github.com/v3/guides/best-practices-for-integrators/#dealing-with-abuse-rate-limits

    Args:
      first_id: the id of the first repo to list
      last_id: stop listing after seeing a repo with an id higher than this value.

    """  # noqa: B950

    LISTER_NAME = "github"

    API_URL = "https://api.github.com/repositories"
    PAGE_SIZE = 1000

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: CredentialsType = None,
        first_id: Optional[int] = None,
        last_id: Optional[int] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=self.API_URL,
            instance="github",
        )

        self.first_id = first_id
        self.last_id = last_id

        self.relisting = self.first_id is not None or self.last_id is not None

        self.github_session = GitHubSession(
            credentials=self.credentials, user_agent=USER_AGENT
        )

    def state_from_dict(self, d: Dict[str, Any]) -> GitHubListerState:
        return GitHubListerState(**d)

    def state_to_dict(self, state: GitHubListerState) -> Dict[str, Any]:
        return asdict(state)

    def get_pages(self) -> Iterator[List[Dict[str, Any]]]:
        current_id = 0
        if self.first_id is not None:
            current_id = self.first_id
        elif self.state is not None:
            current_id = self.state.last_seen_id

        current_url = f"{self.API_URL}?since={current_id}&per_page={self.PAGE_SIZE}"

        while self.last_id is None or current_id < self.last_id:
            logger.debug("Getting page %s", current_url)

            try:
                response = self.github_session.request(current_url)
            except MissingRateLimitReset:
                # Give up
                break

            # We've successfully retrieved a (non-ratelimited) `response`. We
            # still need to check it for validity.

            if response.status_code != 200:
                logger.warning(
                    "Got unexpected status_code %s: %s",
                    response.status_code,
                    response.content,
                )
                break

            yield response.json()

            if "next" not in response.links:
                # No `next` link, we've reached the end of the world
                logger.debug(
                    "No next link found in the response headers, all caught up"
                )
                break

            # GitHub strongly advises using the next link directly. We still
            # parse it to get the id of the last repository we've seen so far.
            next_url = response.links["next"]["url"]
            parsed_url = urlparse(next_url)
            if not parsed_url.query:
                logger.warning("Failed to parse url %s", next_url)
                break

            parsed_query = parse_qs(parsed_url.query)
            current_id = int(parsed_query["since"][0])
            current_url = next_url

    def get_origins_from_page(
        self, page: List[Dict[str, Any]]
    ) -> Iterator[ListedOrigin]:
        """Convert a page of GitHub repositories into a list of ListedOrigins.

        This records the html_url, as well as the pushed_at value if it exists.
        """
        assert self.lister_obj.id is not None

        seen_in_page: Set[str] = set()

        for repo in page:
            if not repo:
                # null repositories in listings happen sometimes...
                continue

            if repo["html_url"] in seen_in_page:
                continue
            seen_in_page.add(repo["html_url"])

            pushed_at_str = repo.get("pushed_at")
            pushed_at: Optional[datetime.datetime] = None
            if pushed_at_str:
                pushed_at = iso8601.parse_date(pushed_at_str)

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=repo["html_url"],
                visit_type="git",
                last_update=pushed_at,
            )

    def commit_page(self, page: List[Dict[str, Any]]):
        """Update the currently stored state using the latest listed page"""
        if self.relisting:
            # Don't update internal state when relisting
            return

        if not page:
            # Sometimes, when you reach the end of the world, GitHub returns an empty
            # page of repositories
            return

        last_id = page[-1]["id"]

        if last_id > self.state.last_seen_id:
            self.state.last_seen_id = last_id

    def finalize(self):
        if self.relisting:
            return

        # Pull fresh lister state from the scheduler backend
        scheduler_state = self.get_state_from_scheduler()

        # Update the lister state in the backend only if the last seen id of
        # the current run is higher than that stored in the database.
        if self.state.last_seen_id > scheduler_state.last_seen_id:
            self.updated = True
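For completeness, a minimal usage sketch for the lister. The scheduler connection details below are illustrative assumptions; `get_scheduler` is the factory provided by `swh.scheduler`:

from swh.scheduler import get_scheduler

# Connect to a scheduler backend; the URL is an assumption for this example.
scheduler = get_scheduler(cls="remote", url="http://localhost:5008/")

# Incremental mode: resume from the last_seen_id stored in the scheduler backend.
lister = GitHubLister(scheduler=scheduler)
lister.run()

# Relisting mode: list origins with ids in (42, 4242]; the stored lister state
# is left untouched once the run completes.
relister = GitHubLister(scheduler=scheduler, first_id=42, last_id=4242)
relister.run()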