async def _get(
    cls,
    redis: redis_utils.RedisCache,
    api_access_key: str,
    api_secret_key: str,
    account_scope: typing.Optional[github_types.GitHubLogin],
) -> "ApplicationSaas":
    """Return the application matching the credentials, preferring the cache.

    A cached entry that has not expired is returned directly. Otherwise the
    application is fetched again with ``_retrieve_from_db``, saved to the
    cache and returned.

    Raises:
        ApplicationUserNotFound: when the lookup fails with
            ``http.HTTPForbidden`` or ``http.HTTPNotFound``.
    """
    lookup = (redis, api_access_key, api_secret_key, account_scope)

    cached = await cls._retrieve_from_cache(*lookup)
    if cached is not None and not cached._has_expired():
        return cached

    try:
        fresh = await cls._retrieve_from_db(*lookup)
    except (http.HTTPForbidden, http.HTTPNotFound):
        # HTTPForbidden: the api key is valid, but not the scope
        raise ApplicationUserNotFound()
    except Exception as exc:
        can_fall_back = cached is not None and (
            exceptions.should_be_ignored(exc) or exceptions.need_retry(exc)
        )
        if not can_fall_back:
            raise
        # NOTE(sileht): return the cached application, instead of
        # retrying the stream, just because the dashboard has a
        # connectivity issue.
        return cached

    await fresh.save_to_cache()
    return fresh
async def exec_action(
    method_name: typing.Literal["run", "cancel"],
    rule: rules.EvaluatedRule,
    action: str,
    ctxt: context.Context,
) -> check_api.Result:
    """Call the ``run`` or ``cancel`` method of one action of a rule.

    Exceptions that ``exceptions`` classifies as ignorable or retryable are
    re-raised for the caller. Any other failure is logged and turned into a
    FAILURE result.
    """
    try:
        if method_name not in ("run", "cancel"):
            raise RuntimeError("wrong method_name")
        handler = getattr(rule.actions[action], method_name)
        return await handler(ctxt, rule)
    except Exception as e:  # pragma: no cover
        # Forward those to worker
        forward = exceptions.should_be_ignored(e) or exceptions.need_retry(e)
        if forward:
            raise

        ctxt.log.error("action failed", action=action, rule=rule, exc_info=True)
        # TODO(sileht): extract sentry event id and post it, so
        # we can track it easily
        title = f"action '{action}' failed"
        return check_api.Result(check_api.Conclusion.FAILURE, title, "")
async def exec_action(
    method_name: typing.Literal["run", "cancel"],
    rule: rules.EvaluatedRule,
    action: str,
    ctxt: context.Context,
) -> check_api.Result:
    """Call the ``run`` or ``cancel`` method of one action of a rule.

    Ignorable, retryable and ``UnprocessablePullRequest`` exceptions are
    re-raised for the caller. Any other failure is logged, a refresh of the
    pull request is planned five minutes later, and a PENDING result is
    returned.
    """
    try:
        if method_name not in ("run", "cancel"):
            raise RuntimeError("wrong method_name")
        handler = getattr(rule.actions[action], method_name)
        return await handler(ctxt, rule)
    except Exception as e:  # pragma: no cover
        # Forward those to worker
        must_forward = (
            exceptions.should_be_ignored(e)
            or exceptions.need_retry(e)
            or isinstance(e, exceptions.UnprocessablePullRequest)
        )
        if must_forward:
            raise

        # NOTE(sileht): the action fails, this is a bug!!!, so just set the
        # result as pending and retry in 5 minutes...
        ctxt.log.error("action failed", action=action, rule=rule, exc_info=True)
        refresh_at = date.utcnow() + datetime.timedelta(minutes=5)
        await delayed_refresh.plan_refresh_at_least_at(
            ctxt.repository,
            ctxt.pull["number"],
            refresh_at,
        )
        title = (
            f"Action '{action}' has unexpectedly failed, "
            "Mergify team is working on it, "
            "the state will be refreshed automatically."
        )
        return check_api.Result(check_api.Conclusion.PENDING, title, "")
def process(self):
    """Process the pull request at the head of the merge queue.

    Returns immediately when the queue is empty. Otherwise the first queued
    pull request is loaded and, depending on its state, it is moved to the
    queue of its new base branch, handed to ``handle_first_pull_in_queue``,
    or left untouched until the next loop.

    Exceptions raised while loading or handling the pull request are dealt
    with here: depending on how ``exceptions`` classifies them, the pull
    request is removed from the queue, kept for a later retry, or moved to
    the end of the queue.
    """
    pull_numbers = self.get_pulls()

    self.log.info("%d pulls queued", len(pull_numbers), queue=list(pull_numbers))

    if not pull_numbers:
        return

    pull_number = pull_numbers[0]

    with github.get_client(self.owner, self.repo) as client:
        # Stays None until the Context is built, so the error handler can
        # tell how far processing went.
        ctxt = None
        try:
            subscription = asyncio.run(
                sub_utils.get_subscription(client.auth.owner_id)
            )
            data = client.item(f"pulls/{pull_number}")

            ctxt = context.Context(client, data, subscription)
            if ctxt.pull["base"]["ref"] != self.ref:
                ctxt.log.info(
                    "pull request base branch have changed",
                    old_branch=self.ref,
                    new_branch=ctxt.pull["base"]["ref"],
                )
                self._move_pull_to_new_base_branch(ctxt.pull["number"], self.ref)
            elif ctxt.pull["state"] == "closed" or ctxt.is_behind:
                # NOTE(sileht): Pick up this pull request and rebase it again
                # or update its status and remove it from the queue
                ctxt.log.info(
                    "pull request needs to be updated again or has been closed",
                )
                self.handle_first_pull_in_queue(ctxt)
            else:
                # NOTE(sileht): Pull request has not been merged or cancelled
                # yet wait next loop
                ctxt.log.info("pull request checks are still in progress")
        except Exception as exc:  # pragma: no cover
            log = self.log if ctxt is None else ctxt.log

            if exceptions.should_be_ignored(exc):
                log.info(
                    "Fail to process merge queue, remove the pull request from the queue",
                    exc_info=True,
                )
                # ctxt is still None when the failure happened before the
                # Context was built, so rely on the queued pull number
                # rather than on ctxt.pull.
                self.remove_pull(pull_number)

            elif exceptions.need_retry(exc):
                log.info("Fail to process merge queue, need retry", exc_info=True)
                if isinstance(exc, exceptions.MergeableStateUnknown):
                    # NOTE(sileht): We need GitHub to recompute the state here (by
                    # merging something else for example), so move it to the end
                    self._move_pull_at_end(pull_number)

            else:
                log.error("Fail to process merge queue", exc_info=True)
                self._move_pull_at_end(pull_number)
async def job_filter_and_dispatch(redis, event_type, event_id, data):
    """Meter a GitHub event and push it to the worker unless it is ignored.

    ``commands_runner.on_each_event`` is called for every event, ignored or
    not. The outcome is logged and, when ``get_ignore_reason`` returned a
    reason, ``IgnoredEvent`` is raised at the very end.
    """
    # TODO(sileht): is statsd async ?
    meter_event(event_type, data)

    if "repository" not in data:
        owner = repo = "<unknown>"
    else:
        repository = data["repository"]
        owner = repository["owner"]["login"]
        repo = repository["name"]

    reason = get_ignore_reason(event_type, data)
    if not reason:
        msg_action = "pushed to worker"
        source_data = _extract_source_data(event_type, data)

        if "pull_request" in data:
            pull_number = data["pull_request"]["number"]
        elif event_type == "issue_comment":
            pull_number = data["issue"]["number"]
        else:
            pull_number = None

        await worker.push(
            redis,
            owner,
            repo,
            pull_number,
            event_type,
            source_data,
        )
    else:
        msg_action = f"ignored: {reason}"

    # NOTE(sileht): nothing important should happen in this hook as we don't retry it
    try:
        await commands_runner.on_each_event(owner, repo, event_type, data)
    except Exception as e:
        expected = exceptions.should_be_ignored(e) or exceptions.need_retry(e)
        if not expected:
            raise
        LOG.debug("commands_runner.on_each_event failed", exc_info=True)

    LOG.info(
        "GithubApp event %s",
        msg_action,
        event_type=event_type,
        event_id=event_id,
        sender=data["sender"]["login"],
        gh_owner=owner,
        gh_repo=repo,
    )

    if reason:
        raise IgnoredEvent(event_type, event_id, reason)
def retry_task_on_exception(
    sender, task_id, exception, args, kwargs, traceback, einfo, **other
):  # pragma: no cover
    """Schedule a retry of a failed task when its exception allows it.

    Nothing happens when ``exceptions.need_retry`` returns ``None``.
    Otherwise the task is retried after ``3 ** retries * backoff``.
    """
    backoff = exceptions.need_retry(exception)
    if backoff is not None:
        LOG.warning(
            "job %s: failed %d times - retrying", task_id, sender.request.retries
        )
        # Exponential ^3 backoff
        countdown = 3 ** sender.request.retries * backoff
        sender.retry(countdown=countdown)
async def _translate_exception_to_retries(
    self,
    e,
    stream_name,
    attempts_key=None,
):
    """Convert an exception into the stream-level exception to raise.

    This coroutine always raises: ``StreamUnused``, ``IgnoredException`` or
    ``StreamRetry`` (chained from ``e``), or it re-raises with a bare
    ``raise`` when ``exceptions.need_retry`` gives no backoff.

    NOTE(review): the bare ``raise`` presumably relies on this being awaited
    from inside an ``except`` block - confirm with the callers.
    """

    async def forget_attempts():
        # Drop the retry counters recorded for the pull request and the stream.
        if attempts_key:
            await self.redis.hdel("attempts", attempts_key)
        await self.redis.hdel("attempts", stream_name)

    if isinstance(e, exceptions.MergifyNotInstalled):
        await forget_attempts()
        raise StreamUnused(stream_name) from e

    # TODO(sileht): Ideally TooManyPages should be caught earlier to post an
    # appropriate check-runs to inform user the PR is too big to be handled
    # by Mergify, but this need a bit of refactory to do it, so in the
    # meantimes...
    if isinstance(e, github.TooManyPages) or exceptions.should_be_ignored(e):
        await forget_attempts()
        raise IgnoredException() from e

    if isinstance(e, exceptions.RateLimited):
        retry_at = utils.utcnow() + e.countdown
        score = retry_at.timestamp()
        await forget_attempts()
        await self.redis.zaddoption("streams", "XX", **{stream_name: score})
        raise StreamRetry(stream_name, 0, retry_at) from e

    backoff = exceptions.need_retry(e)
    if backoff is None:
        # NOTE(sileht): This is our fault, so retry until we fix the bug but
        # without increasing the attempts
        raise

    attempts = await self.redis.hincrby("attempts", stream_name)
    delay = datetime.timedelta(seconds=3 ** min(attempts, 3) * backoff)
    retry_at = utils.utcnow() + delay
    await self.redis.zaddoption(
        "streams", "XX", **{stream_name: retry_at.timestamp()}
    )
    raise StreamRetry(stream_name, attempts, retry_at) from e
def exec_action(method_name, rule, action, ctxt):
    """Call method ``method_name`` of one action of a rule and return its result.

    Exceptions that ``exceptions`` classifies as ignorable or retryable are
    re-raised for the caller. Any other failure is logged and turned into a
    FAILURE result.
    """
    try:
        handler = getattr(rule.actions[action], method_name)
        return handler(ctxt, rule)
    except Exception as e:  # pragma: no cover
        # Forward those to worker
        forward = exceptions.should_be_ignored(e) or exceptions.need_retry(e)
        if forward:
            raise

        ctxt.log.error("action failed", action=action, rule=rule, exc_info=True)
        # TODO(sileht): extract sentry event id and post it, so
        # we can track it easily
        title = "action '%s' failed" % action
        return check_api.Result(check_api.Conclusion.FAILURE, title, "")
async def _get(cls, redis: utils.RedisCache, owner_id: int) -> "UserTokensSaas":
    """Return the tokens of ``owner_id``, preferring the cache.

    A cached entry that has not expired is returned directly. Otherwise the
    tokens are fetched again with ``_retrieve_from_db``, saved to the cache
    and returned.
    """
    cached = await cls._retrieve_from_cache(redis, owner_id)
    if cached is not None and not await cached._has_expired():
        return cached

    try:
        fresh = await cls._retrieve_from_db(redis, owner_id)
    except Exception as exc:
        can_fall_back = cached is not None and (
            exceptions.should_be_ignored(exc) or exceptions.need_retry(exc)
        )
        if not can_fall_back:
            raise
        # NOTE(sileht): return the cached tokens, instead of retrying the
        # stream, just because the dashboard has a connectivity issue.
        return cached

    await fresh.save_to_cache()
    return fresh
async def get_subscription(
    cls, redis: utils.RedisCache, owner_id: int
) -> "Subscription":
    """Return the subscription of ``owner_id``, preferring the cache.

    A cached subscription that has not expired is returned directly.
    Otherwise it is fetched again with ``_retrieve_subscription_from_db``,
    saved to the cache and returned.
    """
    cached = await cls._retrieve_subscription_from_cache(redis, owner_id)
    if cached is not None and not await cached._has_expired():
        return cached

    try:
        fresh = await cls._retrieve_subscription_from_db(redis, owner_id)
    except Exception as exc:
        can_fall_back = cached is not None and (
            exceptions.should_be_ignored(exc) or exceptions.need_retry(exc)
        )
        if not can_fall_back:
            raise
        # NOTE(sileht): return the cached sub, instead of retrying the stream,
        # just because the dashboard has a connectivity issue.
        return cached

    await fresh.save_subscription_to_cache()
    return fresh
def main():  # pragma: no cover
    """Serve metrics over HTTP on port 8889 and refresh them forever.

    Metrics are collected once per hour. When a collection fails with an
    exception that ``exceptions.need_retry`` accepts, the next attempt
    happens after ten minutes instead.
    """
    utils.setup_logging()

    LOG.info("Starting")
    prometheus_client.start_http_server(8889)
    LOG.info("Started")

    while True:
        # Only generate metrics once per hour
        pause = 60 * 60
        try:
            collect_metrics()
        except Exception as e:  # pragma: no cover
            if exceptions.need_retry(e):
                LOG.warning("fail to gather metrics: %s", str(e))
                pause = 10 * 60
            else:
                LOG.error("Unexpected error during metrics gathering", exc_info=True)
        time.sleep(pause)
async def send_seats(seats: SeatsCountResultT) -> None:
    """POST the seats report to the subscription service.

    Failures that ``exceptions.should_be_ignored`` accepts are dropped
    silently, failures that ``exceptions.need_retry`` accepts are turned into
    ``tenacity.TryAgain``, and anything else is re-raised unchanged.
    """
    async with http.AsyncClient() as client:
        try:
            url = f"{config.SUBSCRIPTION_BASE_URL}/on-premise/report"
            headers = {"Authorization": f"token {config.SUBSCRIPTION_TOKEN}"}
            payload = {
                "write_users": seats.write_users,
                "active_users": seats.active_users,
                "engine_version": config.VERSION,
                # Deprecated version
                "seats": seats.write_users,
            }
            await client.post(url, headers=headers, json=payload)
        except Exception as exc:
            if exceptions.should_be_ignored(exc):
                return
            if exceptions.need_retry(exc):
                raise tenacity.TryAgain
            raise
def _log_on_exception(exc: Exception, msg: str) -> None:
    """Log ``msg`` with ``exc`` attached, at a level depending on the exception.

    Exceptions that are ignorable or retryable according to ``exceptions``
    are logged at debug level, all others at error level.
    """
    expected = exceptions.should_be_ignored(exc) or exceptions.need_retry(exc)
    emit = LOG.debug if expected else LOG.error
    emit(msg, exc_info=exc)
async def run_actions(
    ctxt: context.Context,
    match: rules.RulesEvaluator,
    checks: typing.Dict[str, github_types.GitHubCheckRun],
    previous_conclusions: typing.Dict[str, check_api.Conclusion],
) -> typing.Dict[str, check_api.Conclusion]:
    """Run or cancel every action of the matching rules.

    Returns a mapping of check name to the conclusion recorded for it.

    What action.run() and action.cancel() return should be reworked a bit.
    Currently the meaning is not really clear, it could be:
    - None - (succeed but no dedicated report is posted with check api
    - (None, "<title>", "<summary>") - (action is pending, for merge/backport/...)
    - ("success", "<title>", "<summary>")
    - ("failure", "<title>", "<summary>")
    - ("neutral", "<title>", "<summary>")
    - ("cancelled", "<title>", "<summary>")
    """
    user_refresh = ctxt.user_refresh_requested()
    admin_refresh = ctxt.admin_refresh_requested()
    already_ran = set()
    conclusions = {}

    def has_no_missing_conditions(rule):
        return len(rule.missing_conditions) == 0

    # NOTE(sileht): We put first rules with missing conditions to do cancellation first.
    # In case of a canceled merge action and another that need to be run. We want first
    # to remove the PR from the queue and then add it back with the new config and not the
    # reverse
    ordered_rules = sorted(match.matching_rules, key=has_no_missing_conditions)

    method_name: typing.Literal["run", "cancel"]

    for rule in ordered_rules:
        for action, action_obj in rule.actions.items():
            check_name = f"Rule: {rule.name} ({action})"

            # Evaluated before this action is possibly added to already_ran.
            done_by_another_action = action_obj.only_once and action in already_ran

            action_rule = await action_obj.get_rule(ctxt)

            if rule.missing_conditions or action_rule.missing_conditions:
                method_name = "cancel"
                expected_conclusions = [
                    check_api.Conclusion.NEUTRAL,
                    check_api.Conclusion.CANCELLED,
                ]
            else:
                method_name = "run"
                expected_conclusions = [
                    check_api.Conclusion.SUCCESS,
                    check_api.Conclusion.FAILURE,
                ]
                already_ran.add(action)

            previous_conclusion = get_previous_conclusion(
                previous_conclusions, check_name, checks
            )

            need_to_be_run = (
                action_obj.always_run
                or admin_refresh
                or (
                    user_refresh
                    and previous_conclusion == check_api.Conclusion.FAILURE
                )
                or previous_conclusion not in expected_conclusions
            )

            # TODO(sileht): refactor it to store the whole report in the check summary,
            # not just the conclusions
            if need_to_be_run and not done_by_another_action:
                # NOTE(sileht): check state change so we have to run "run" or "cancel"
                report = await exec_action(method_name, rule, action, ctxt)
                message = "executed"
            elif need_to_be_run:
                # NOTE(sileht) We can't run two action merge for example,
                # This assumes the action produce a report
                report = check_api.Result(
                    check_api.Conclusion.SUCCESS,
                    f"Another {action} action already ran",
                    "",
                )
                message = "ignored, another has already been run"
            else:
                report = check_api.Result(
                    previous_conclusion, "Already in expected state", ""
                )
                message = "ignored, already in expected state"

            if (
                report
                and report.conclusion is not check_api.Conclusion.PENDING
                and method_name == "run"
            ):
                statsd.increment("engine.actions.count", tags=[f"name:{action}"])

            if not report:
                # NOTE(sileht): action doesn't have report (eg:
                # comment/request_reviews/..) So just assume it succeed
                ctxt.log.error("action must return a conclusion", action=action)
                conclusions[check_name] = expected_conclusions[0]
            else:
                if need_to_be_run and (
                    not action_obj.silent_report
                    or report.conclusion
                    not in (
                        check_api.Conclusion.SUCCESS,
                        check_api.Conclusion.CANCELLED,
                        check_api.Conclusion.PENDING,
                    )
                ):
                    if action_obj.allow_retrigger_mergify:
                        external_id = check_api.USER_CREATED_CHECKS
                    else:
                        external_id = None
                    try:
                        await check_api.set_check_run(
                            ctxt,
                            check_name,
                            report,
                            external_id=external_id,
                        )
                    except Exception as e:
                        if exceptions.should_be_ignored(e):
                            log_failure = ctxt.log.info
                        elif exceptions.need_retry(e):
                            raise
                        else:
                            log_failure = ctxt.log.error
                        log_failure(
                            "Fail to post check `%s`", check_name, exc_info=True
                        )
                conclusions[check_name] = report.conclusion

            ctxt.log.info(
                "action evaluation: `%s` %s: %s/%s -> %s",
                action,
                message,
                method_name,
                previous_conclusion.value,
                conclusions[check_name].value,
                report=report,
                previous_conclusion=previous_conclusion.value,
                conclusion=conclusions[check_name].value,
                action=action,
                check_name=check_name,
                event_types=[se["event_type"] for se in ctxt.sources],
            )

    return conclusions
def _wait_time_for_exception(retry_state):
    """Return what ``exceptions.need_retry`` gives for the outcome's exception."""
    failure = retry_state.outcome.exception()
    return exceptions.need_retry(failure)
def _exception_need_retry(retry_state):
    """Tell whether the outcome failed with an exception that needs a retry.

    When the outcome did not fail, its falsy ``failed`` value is returned
    as-is. Otherwise the result is whether ``exceptions.need_retry`` returns
    something other than ``None`` for the outcome's exception.
    """
    failed = retry_state.outcome.failed
    if not failed:
        return failed
    return exceptions.need_retry(retry_state.outcome.exception()) is not None
async def _translate_exception_to_retries(
    self,
    stream_name: StreamNameType,
    attempts_key: typing.Optional[str] = None,
) -> typing.AsyncIterator[None]:
    """Yield once and convert any exception raised meanwhile.

    An exception thrown into the generator at the ``yield`` is replaced by
    ``PullRetry``, ``MaxPullRetry``, ``StreamUnused``, ``IgnoredException``
    or ``StreamRetry``. It is re-raised unchanged when
    ``exceptions.need_retry`` gives no backoff.
    """

    async def forget_attempts():
        # Drop the retry counters recorded for the pull request and the stream.
        if attempts_key:
            await self.redis_stream.hdel("attempts", attempts_key)
        await self.redis_stream.hdel("attempts", stream_name)

    try:
        yield
    except Exception as e:
        if isinstance(e, exceptions.MergeableStateUnknown) and attempts_key:
            attempts = await self.redis_stream.hincrby("attempts", attempts_key)
            if attempts >= MAX_RETRIES:
                await self.redis_stream.hdel("attempts", attempts_key)
                raise MaxPullRetry(attempts) from e
            raise PullRetry(attempts) from e

        if isinstance(e, exceptions.MergifyNotInstalled):
            await forget_attempts()
            raise StreamUnused(stream_name)

        # TODO(sileht): Ideally TooManyPages should be caught earlier to post an
        # appropriate check-runs to inform user the PR is too big to be handled
        # by Mergify, but this need a bit of refactory to do it, so in the
        # meantimes...
        if isinstance(e, github.TooManyPages) or exceptions.should_be_ignored(e):
            await forget_attempts()
            raise IgnoredException()

        if isinstance(e, exceptions.RateLimited):
            retry_at = utils.utcnow() + e.countdown
            score = retry_at.timestamp()
            await forget_attempts()
            await self.redis_stream.zaddoption(
                "streams", "XX", **{stream_name: score}
            )
            raise StreamRetry(stream_name, 0, retry_at)

        backoff = exceptions.need_retry(e)
        if backoff is None:
            # NOTE(sileht): This is our fault, so retry until we fix the bug but
            # without increasing the attempts
            raise

        attempts = await self.redis_stream.hincrby("attempts", stream_name)
        delay = 3 ** min(attempts, 3) * backoff
        retry_at = utils.utcnow() + delay
        await self.redis_stream.zaddoption(
            "streams", "XX", **{stream_name: retry_at.timestamp()}
        )
        raise StreamRetry(stream_name, attempts, retry_at)
async def run_actions(
    ctxt: context.Context,
    match: rules.RulesEvaluator,
    checks: typing.Dict[str, github_types.CachedGitHubCheckRun],
    previous_conclusions: typing.Dict[str, check_api.Conclusion],
) -> typing.Dict[str, check_api.Conclusion]:
    """Run or cancel every action of the matching rules.

    Returns a mapping of check name to the conclusion recorded for it.

    What action.run() and action.cancel() return should be reworked a bit.
    Currently the meaning is not really clear, it could be:
    - None - (succeed but no dedicated report is posted with check api
    - (None, "<title>", "<summary>") - (action is pending, for merge/backport/...)
    - ("success", "<title>", "<summary>")
    - ("failure", "<title>", "<summary>")
    - ("neutral", "<title>", "<summary>")
    - ("cancelled", "<title>", "<summary>")
    """
    user_refresh = ctxt.user_refresh_requested()
    admin_refresh = ctxt.admin_refresh_requested()
    already_ran = set()
    conclusions = {}

    def conditions_match(rule):
        return rule.conditions.match

    # NOTE(sileht): We put first rules with missing conditions to do cancellation first.
    # In case of a canceled merge action and another that need to be run. We want first
    # to remove the PR from the queue and then add it back with the new config and not the
    # reverse
    ordered_rules = sorted(match.matching_rules, key=conditions_match)

    method_name: typing.Literal["run", "cancel"]

    for rule in ordered_rules:
        for action, action_obj in rule.actions.items():
            flags = action_obj.flags
            check_name = rule.get_check_name(action)

            # Evaluated before this action is possibly added to already_ran.
            done_by_another_action = (
                actions.ActionFlag.DISALLOW_RERUN_ON_OTHER_RULES in flags
                and action in already_ran
            )

            must_cancel = (
                not rule.conditions.match
                or rule.disabled is not None
                or (
                    ctxt.configuration_changed
                    and actions.ActionFlag.ALLOW_ON_CONFIGURATION_CHANGED
                    not in flags
                )
            )
            if must_cancel:
                method_name = "cancel"
                expected_conclusions = [
                    check_api.Conclusion.NEUTRAL,
                    check_api.Conclusion.CANCELLED,
                ]
            else:
                method_name = "run"
                expected_conclusions = [
                    check_api.Conclusion.SUCCESS,
                    check_api.Conclusion.FAILURE,
                ]
                already_ran.add(action)

            previous_conclusion = get_previous_conclusion(
                previous_conclusions, check_name, checks
            )

            need_to_be_run = (
                actions.ActionFlag.ALWAYS_RUN in flags
                or (
                    actions.ActionFlag.SUCCESS_IS_FINAL_STATE in flags
                    and previous_conclusion == check_api.Conclusion.SUCCESS
                )
                or admin_refresh
                or (
                    user_refresh
                    and previous_conclusion == check_api.Conclusion.FAILURE
                )
                or previous_conclusion not in expected_conclusions
            )

            # TODO(sileht): refactor it to store the whole report in the check summary,
            # not just the conclusions
            if need_to_be_run and not done_by_another_action:
                with ddtrace.tracer.trace(
                    f"action.{action}",
                    span_type="worker",
                    resource=str(ctxt),
                ) as span:
                    # NOTE(sileht): check state change so we have to run "run" or "cancel"
                    report = await exec_action(method_name, rule, action, ctxt)
                    span.set_tags({"conclusion": str(report.conclusion)})
                message = "executed"
            elif need_to_be_run:
                # NOTE(sileht) We can't run two action merge for example,
                # This assumes the action produce a report
                report = check_api.Result(
                    check_api.Conclusion.SUCCESS,
                    f"Another {action} action already ran",
                    "",
                )
                message = "ignored, another has already been run"
            else:
                report = check_api.Result(
                    previous_conclusion, "Already in expected state", ""
                )
                message = "ignored, already in expected state"

            conclusions[check_name] = report.conclusion

            if (
                report.conclusion is not check_api.Conclusion.PENDING
                and method_name == "run"
            ):
                statsd.increment("engine.actions.count", tags=[f"name:{action}"])

            if need_to_be_run and (
                actions.ActionFlag.ALWAYS_SEND_REPORT in flags
                or report.conclusion
                not in (
                    check_api.Conclusion.SUCCESS,
                    check_api.Conclusion.CANCELLED,
                    check_api.Conclusion.PENDING,
                )
            ):
                if actions.ActionFlag.ALLOW_RETRIGGER_MERGIFY in flags:
                    external_id = check_api.USER_CREATED_CHECKS
                else:
                    external_id = None
                try:
                    await check_api.set_check_run(
                        ctxt,
                        check_name,
                        report,
                        external_id=external_id,
                    )
                except Exception as e:
                    if exceptions.should_be_ignored(e):
                        log_failure = ctxt.log.info
                    elif exceptions.need_retry(e):
                        raise
                    else:
                        log_failure = ctxt.log.error
                    log_failure("Fail to post check `%s`", check_name, exc_info=True)

            ctxt.log.info(
                "action evaluation: `%s` %s: %s/%s -> %s",
                action,
                message,
                method_name,
                previous_conclusion.value,
                conclusions[check_name].value,
                report=report,
                previous_conclusion=previous_conclusion.value,
                conclusion=conclusions[check_name].value,
                action=action,
                check_name=check_name,
                event_types=[se["event_type"] for se in ctxt.sources],
            )

    return conclusions