async def wait_till_service_healthy(service_name: str, endpoint: URL):
    """Poll ``endpoint`` with HTTP GET until it replies with status 200.

    Each attempt opens a fresh ``aiohttp.ClientSession`` (configured with
    ``_ONE_SEC_TIMEOUT``). Failed attempts are retried after a random
    1-2 second pause until ``2 * MINUTE`` has elapsed; after that the last
    error is re-raised (``reraise=True``).

    :param service_name: name of the service, used only in log/error messages
    :param endpoint: URL of the health-check entrypoint
    :raises AssertionError: when the response status is not 200 on the last attempt
    """
    log.info(
        "Connecting to %s",
        f"{service_name=} at {endpoint=}",
    )

    healthcheck_retrying = AsyncRetrying(
        # randomized sampling of the health-check helps parallel execution
        wait=wait_random(1, 2),
        # upper bound for a service to become healthy
        stop=stop_after_delay(2 * MINUTE),
        before_sleep=before_sleep_log(log, logging.WARNING),
        reraise=True,
    )

    async for attempt in healthcheck_retrying:
        with attempt:
            async with aiohttp.ClientSession(timeout=_ONE_SEC_TIMEOUT) as session:
                async with session.get(endpoint) as response:
                    # NOTE: a health-check endpoint is only required to answer
                    # with status code 200
                    # (see e.g. services/web/server/docker/healthcheck.py),
                    # the payload content is irrelevant
                    replied_ok = response.status == 200
                    assert (
                        replied_ok
                    ), f"Connection to {service_name=} at {endpoint=} failed with {response=}"

            retry_stats = attempt.retry_state.retry_object.statistics
            log.info(
                "Connection to %s succeeded [%s]",
                f"{service_name=} at {endpoint=}",
                json.dumps(retry_stats),
            )
def wrapper(*args, **kwargs) -> Any:
    """Invoke ``func`` under a ``Retrying`` controller and return its result.

    The call is retried when either the network-error or the throttling-error
    predicate matches, for at most ``max_retries`` attempts. The pause between
    attempts is the throttling wait plus a random 1-3 seconds.
    Every attempt is logged at DEBUG, every upcoming sleep at WARNING.
    """
    retry_condition = retry_if_network_error() | retry_if_throttling_error()
    stop_rule = stop_after_attempt(max_attempt_number=max_retries)
    pause = wait_spotify_throttling() + wait_random(min=1, max=3)

    retrier = Retrying(
        retry=retry_condition,
        stop=stop_rule,
        wait=pause,
        before=before_log(retry_logger, logging.DEBUG),
        before_sleep=before_sleep_log(retry_logger, logging.WARNING),
    )
    return retrier.call(func, *args, **kwargs)
async def _wait_for_call(mocked_fct):
    """Wait until ``mocked_fct`` has been called exactly once.

    ``mocked_fct.assert_called_once()`` is re-checked after a random pause of
    0-1 seconds for as long as it raises ``AssertionError``; after 10 seconds
    the last ``AssertionError`` is re-raised.
    """
    polling = AsyncRetrying(
        retry=retry_if_exception_type(AssertionError),
        wait=wait_random(0, 1),
        stop=stop_after_delay(10),
        reraise=True,
    )
    async for attempt in polling:
        with attempt:
            attempt_number = attempt.retry_state.attempt_number
            print(
                f"waiting for call in mocked fct {mocked_fct}, "
                f"Attempt={attempt_number}"
            )
            mocked_fct.assert_called_once()
def wemo_on():
    """Locate the WeMo device at ``settings.wemo_address`` and switch it on.

    Discovery plus switch-on is retried on any exception roughly every
    10 seconds (fixed 10s plus a random offset between -2s and +2s) until
    one hour has elapsed. ``reraise`` is not set, so exhausting the retries
    surfaces as tenacity's retry error rather than the original exception.

    :return: the literal string ``"ok"`` once the device has been switched on
    """
    with_retries = tenacity.retry(
        wait=wait_fixed(10) + wait_random(-2, 2),
        stop=stop_after_delay(60 * 60),
        before_sleep=before_sleep_log(_LOGGER, logging.INFO),
    )

    @with_retries
    def discover_and_on():
        address = settings.wemo_address
        port = pywemo.ouimeaux_device.probe_wemo(address)
        # the device description is served from the probed port
        description_url = 'http://%s:%i/setup.xml' % (address, port)
        device = pywemo.discovery.device_from_description(description_url, None)
        device.on()
        _LOGGER.info("Called on on %s", device)

    discover_and_on()
    return "ok"
async def managed_docker_compose(postgres_volume_name: str,
                                 postgres_username: str,
                                 postgres_password: str):
    """Bring up the ``consistency/docker-compose.yml`` stack, yield, tear it down.

    Async generator with a single ``yield`` (presumably wrapped by an
    async-context-manager decorator outside this view -- confirm).

    Steps:
      1. ``docker-compose up --detach`` with ``POSTGRES_DATA_VOLUME`` set to
         ``postgres_volume_name`` on top of the current environment.
      2. Wait until postgres on 127.0.0.1 answers ``SELECT 1`` (up to 10
         attempts, random 1-3 seconds apart).
      3. Yield control to the caller.
      4. Always run ``docker-compose down`` afterwards.

    :raises subprocess.CalledProcessError: when a docker-compose command fails
    """
    typer.echo("starting up database in localhost")

    compose_file = Path.cwd() / "consistency" / "docker-compose.yml"
    compose_cmd = ["docker-compose", "--file", compose_file]
    compose_dir = compose_file.parent

    try:
        # current environment, extended with the volume holding the db data
        compose_env = dict(os.environ)
        compose_env["POSTGRES_DATA_VOLUME"] = postgres_volume_name

        subprocess.run(
            [*compose_cmd, "up", "--detach"],
            shell=False,
            check=True,
            cwd=compose_dir,
            env=compose_env,
        )
        typer.echo(
            f"database started: adminer available on http://127.0.0.1:18080/?pgsql=postgres&username={postgres_username}&db=simcoredb&ns=public"
        )

        @retry(
            stop=stop_after_attempt(10),
            wait=wait_random(1, 3),
            after=after_log(log, logging.WARN),
        )
        async def postgres_responsive():
            # a trivial query proves the server accepts connections
            dsn = f"dbname=simcoredb user={postgres_username} password={postgres_password} host=127.0.0.1"
            async with aiopg.create_pool(dsn) as pool:
                async with pool.acquire() as conn:
                    async with conn.cursor() as cur:
                        await cur.execute("SELECT 1")

        await postgres_responsive()
        yield
    finally:
        # tear the stack down whatever happened above
        subprocess.run(
            [*compose_cmd, "down"],
            shell=False,
            check=True,
            cwd=compose_dir,
        )
class ContainerWrapper:
    """Thin convenience layer over a blob ``ContainerClient``.

    The client is built from a container URL. Single-blob operations
    (upload file, delete, download, list) are retried on any exception,
    up to 10 attempts with a random 1-3 second pause, and the last error is
    re-raised.
    """

    # retry policy shared by the single-blob operations below; applying the
    # decorator creates an independent retry controller per method
    _blob_retry = retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )

    def __init__(self, container_url: str) -> None:
        self.client = ContainerClient.from_container_url(container_url)

    @_blob_retry
    def upload_file(self, file_path: str, blob_name: str) -> None:
        """Upload the local file ``file_path`` as ``blob_name``, overwriting it."""
        with open(file_path, "rb") as stream:
            self.client.upload_blob(
                name=blob_name,
                data=stream,
                overwrite=True,
                max_concurrency=10,
            )

    def upload_file_data(self, data: str, blob_name: str) -> None:
        """Upload the in-memory ``data`` as ``blob_name`` (single attempt)."""
        self.client.upload_blob(
            name=blob_name,
            data=data,
            overwrite=True,
            max_concurrency=10,
        )

    def upload_dir(self, dir_path: str, recursive: bool = True) -> None:
        """Upload every regular file matched by ``dir_path/**``.

        Blob names are the file paths relative to ``dir_path``.
        """
        pattern = os.path.join(dir_path, "**")
        for candidate in glob.glob(pattern, recursive=recursive):
            if not os.path.isfile(candidate):
                continue
            relative_name = os.path.relpath(candidate, start=dir_path)
            self.upload_file(candidate, relative_name)

    @_blob_retry
    def delete_blob(self, blob_name: str) -> None:
        """Delete ``blob_name`` from the container."""
        self.client.delete_blob(blob_name)

    @_blob_retry
    def download_blob(self, blob_name: str) -> bytes:
        """Return the full content of ``blob_name`` as bytes."""
        downloader = self.client.download_blob(blob_name)
        return cast(bytes, downloader.content_as_bytes())

    @_blob_retry
    def list_blobs(self, *, name_starts_with: Optional[str] = None) -> List[str]:
        """Return the names of the blobs, optionally filtered by prefix."""
        listing = self.client.list_blobs(name_starts_with=name_starts_with)
        return cast(List[str], [blob.name for blob in listing])

    # helper only needed while the class body executes
    del _blob_retry
async def assert_service_is_running(
    service_id: str, docker, *, max_running_delay=1 * MINUTE
) -> Tuple[List[TaskDict], TenacityStatsDict]:
    """Wait until the number of running tasks equals the service's replicas.

    The service and its tasks are inspected repeatedly, with a random pause of
    1 to ``MAX_WAIT`` seconds between attempts, until ``max_running_delay``
    has elapsed. On the final failure the ``AssertionError`` is re-raised; its
    message carries a summary of the tasks and the last 50 log lines.

    :param service_id: ID of the service to inspect
    :param docker: client exposing ``services.inspect``, ``services.logs``
        and ``tasks.list`` coroutines
    :param max_running_delay: overall time budget; must exceed ``3 * MAX_WAIT``
    :return: the service's tasks and the retry statistics of the controller
    """
    MAX_WAIT = 5
    assert max_running_delay > 3 * MAX_WAIT

    #
    # The retry policy bounds the time a service takes from being deployed by
    # the swarm until it is running (i.e. started and healthy)
    #
    retrying = AsyncRetrying(
        # random instead of fixed wait helps parallel execution in asyncio.gather
        wait=wait_random(1, MAX_WAIT),
        stop=stop_after_delay(max_running_delay),
        before_sleep=before_sleep_log(log, logging.INFO),
        reraise=True,
    )

    # fields of every task reported in the error message
    task_summary_fields = {
        "ID": ...,
        "CreatedAt": ...,
        "UpdatedAt": ...,
        "Spec": {"ContainerSpec": {"Image"}},
        "Status": {"Timestamp", "State"},
        "DesiredState": ...,
    }

    async for attempt in retrying:
        with attempt:
            # service
            service: ServiceDict = await docker.services.inspect(service_id)
            assert service_id == service["ID"]

            service_name = service["Spec"]["Name"]
            num_replicas = int(
                get_from_dict(service, "Spec.Mode.Replicated.Replicas", default=1)
            )

            # tasks in the service
            tasks: List[TaskDict] = await docker.tasks.list(
                filters={"service": service_name}
            )
            tasks_current_state = [task["Status"]["State"] for task in tasks]
            num_running = tasks_current_state.count("running")

            # condition to assert
            is_running: bool = num_replicas == num_running

            error_msg = ""
            if not is_running:
                # the error message is only composed when needed
                logs_lines = await docker.services.logs(
                    service_id,
                    follow=False,
                    timestamps=True,
                    tail=50,  # SEE *_docker_logs artifacts for details
                )
                log_str = " ".join(logs_lines)
                summaries = [
                    copy_from_dict(task, include=task_summary_fields)
                    for task in tasks
                ]
                tasks_json = json.dumps(summaries, indent=1)
                error_msg = (
                    f"{service_name=} has {tasks_current_state=}, but expected at least {num_replicas=} running. "
                    f"Details:\n"
                    f"tasks={tasks_json}\n"
                    f"logs={log_str}\n"
                )

            assert is_running, error_msg

            retry_stats = attempt.retry_state.retry_object.statistics
            log.info(
                "Connection to %s succeded [%s]",
                service_name,
                json.dumps(retry_stats),
            )
            return tasks, attempt.retry_state.retry_object.statistics

    assert False  # never reached
LoggerRabbitMessage, ProgressRabbitMessage, RabbitMessageTypes, ) from settings_library.rabbit import RabbitSettings from tenacity import retry from tenacity.before_sleep import before_sleep_log from tenacity.wait import wait_random from ..core.errors import ConfigurationError logger = logging.getLogger(__name__) rabbitmq_retry_policy = dict( wait=wait_random(5, 10), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True, ) def setup(app: FastAPI) -> None: @retry(**rabbitmq_retry_policy) async def on_startup() -> None: app.state.rabbitmq_client = await RabbitMQClient.create(app) async def on_shutdown() -> None: if app.state.rabbitmq_client: await app.state.rabbitmq_client.delete() del app.state.rabbitmq_client # type: ignore
class ContainerWrapper:
    """Thin convenience layer over a blob ``ContainerClient``.

    Single-blob operations go through the SDK client and are retried on any
    exception (up to 10 attempts, random 1-3 second pause, last error
    re-raised). Whole-directory transfers are delegated to ``azcopy_sync``.
    """

    client: ContainerClient

    # retry policy shared by the single-blob operations below; applying the
    # decorator creates an independent retry controller per method
    _blob_retry = retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )

    def __init__(self, container_url: str) -> None:
        self.client = ContainerClient.from_container_url(container_url)
        self.container_url = container_url

    @_blob_retry
    def upload_file(self, file_path: str, blob_name: str) -> None:
        """Upload the local file ``file_path`` as ``blob_name``, overwriting it."""
        with open(file_path, "rb") as stream:
            self.client.upload_blob(
                name=blob_name,
                data=stream,
                overwrite=True,
                max_concurrency=10,
            )

    def upload_file_data(self, data: str, blob_name: str) -> None:
        """Upload the in-memory ``data`` as ``blob_name`` (single attempt)."""
        self.client.upload_blob(
            name=blob_name,
            data=data,
            overwrite=True,
            max_concurrency=10,
        )

    def upload_dir(self, dir_path: str) -> None:
        """Sync the local directory ``dir_path`` into the container."""
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(dir_path, self.container_url)

    def download_dir(self, dir_path: str) -> None:
        """Sync the container content into the local directory ``dir_path``."""
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(self.container_url, dir_path)

    @_blob_retry
    def delete_blob(self, blob_name: str) -> None:
        """Delete ``blob_name`` from the container."""
        self.client.delete_blob(blob_name)

    @_blob_retry
    def download_blob(self, blob_name: str) -> bytes:
        """Return the full content of ``blob_name`` as bytes."""
        downloader = self.client.download_blob(blob_name)
        return cast(bytes, downloader.content_as_bytes())

    @_blob_retry
    def list_blobs(self, *, name_starts_with: Optional[str] = None) -> List[str]:
        """Return the names of the blobs, optionally filtered by prefix."""
        listing = self.client.list_blobs(name_starts_with=name_starts_with)
        return cast(List[str], [blob.name for blob in listing])

    # helper only needed while the class body executes
    del _blob_retry
class ContainerWrapper:
    """Thin convenience layer over a blob ``ContainerClient``.

    File uploads try ``azcopy_copy`` first and fall back to the SDK client.
    Whole-directory transfers are delegated to ``azcopy_sync``. The decorated
    operations are retried on any exception (up to 10 attempts, random
    1-3 second pause) and re-raise the last error.
    """

    client: ContainerClient

    def __init__(self, container_url: str) -> None:
        self.client = ContainerClient.from_container_url(container_url)
        self.container_url = container_url

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def upload_file(self, file_path: str, blob_name: str) -> None:
        """Upload the local file ``file_path`` as ``blob_name``.

        azcopy is attempted first; on any failure the upload is repeated with
        the Python SDK (overwriting an existing blob). An SDK failure
        propagates to the retry decorator, which starts over with azcopy.
        """
        try:
            # Split the container URL to insert the blob_name in front of the
            # query string
            url_parts = self.container_url.split("?", 1)

            # Default to azcopy if it is installed
            azcopy_copy(file_path, url_parts[0] + "/" + blob_name + "?" + url_parts[1])
        except Exception as exc:
            # A subprocess exception would typically only contain the exit status.
            LOGGER.warning(
                "Upload using azcopy failed. Check the azcopy logs for more information."
            )
            LOGGER.warning(exc)
            # Indicate the switch in the approach for clarity in debugging
            LOGGER.warning("Now attempting to upload using the Python SDK...")

            # This does not have a try/except since it should be caught by the retry system.
            # The retry system will always attempt azcopy first and this approach second
            with open(file_path, "rb") as handle:
                # Using the Azure SDK default max_concurrency
                self.client.upload_blob(name=blob_name, data=handle, overwrite=True)
        return None

    def upload_file_data(self, data: str, blob_name: str) -> None:
        """Upload the in-memory text ``data`` as ``blob_name``.

        The text is written as UTF-8 to a file inside a temporary directory
        and handed to :meth:`upload_file`. Intermediate directories are
        created first so blob names containing ``/`` work.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            filename = os.path.join(tmpdir, blob_name)
            # blob names may contain path separators ("dir/file.json")
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            # explicit encoding: uploaded bytes must not depend on the locale
            with open(filename, "w", encoding="utf-8") as handle:
                handle.write(data)
            self.upload_file(filename, blob_name)

    def upload_dir(self, dir_path: str) -> None:
        """Sync the local directory ``dir_path`` into the container."""
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(dir_path, self.container_url)

    def download_dir(self, dir_path: str) -> None:
        """Sync the container content into the local directory ``dir_path``."""
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(self.container_url, dir_path)

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def delete_blob(self, blob_name: str) -> None:
        """Delete ``blob_name`` from the container."""
        self.client.delete_blob(blob_name)
        return None

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def download_blob(self, blob_name: str) -> bytes:
        """Return the full content of ``blob_name`` as bytes."""
        return cast(bytes, self.client.download_blob(blob_name).content_as_bytes())

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def list_blobs(self, *, name_starts_with: Optional[str] = None) -> List[str]:
        """Return the names of the blobs, optionally filtered by prefix."""
        result = [
            x.name for x in self.client.list_blobs(name_starts_with=name_starts_with)
        ]
        return cast(List[str], result)
) from .director_v2_models import ClusterCreate, ClusterPatch, ClusterPing from .director_v2_settings import ( DirectorV2Settings, get_client_session, get_plugin_settings, ) log = logging.getLogger(__name__) _APP_DIRECTOR_V2_CLIENT_KEY = f"{__name__}.DirectorV2ApiClient" SERVICE_HEALTH_CHECK_TIMEOUT = ClientTimeout(total=2, connect=1) # type:ignore DEFAULT_RETRY_POLICY = dict( wait=wait_random(0, 1), stop=stop_after_attempt(2), reraise=True, before_sleep=before_sleep_log(log, logging.WARNING), ) DataType = Dict[str, Any] DataBody = Union[DataType, List[DataType], None] class DirectorV2ApiClient: def __init__(self, app: web.Application) -> None: self._app = app self._settings: DirectorV2Settings = get_plugin_settings(app) async def start(self, project_id: ProjectID, user_id: UserID,
from servicelib.json_serialization import json_dumps from starlette import status from tenacity._asyncio import AsyncRetrying from tenacity.before_sleep import before_sleep_log from tenacity.stop import stop_after_delay from tenacity.wait import wait_random logger = logging.getLogger(__name__) MINUTE = 60 director_startup_retry_policy = dict( # Random service startup order in swarm. # wait_random prevents saturating other services while startup # wait=wait_random(2, 5), stop=stop_after_delay(2 * MINUTE), before_sleep=before_sleep_log(logger, logging.WARNING), reraise=True, ) async def setup_director(app: FastAPI) -> None: if settings := app.state.settings.CATALOG_DIRECTOR: # init client-api logger.debug("Setup director at %s...", settings.base_url) director_client = DirectorApi(base_url=settings.base_url, app=app) # check that the director is accessible async for attempt in AsyncRetrying(**director_startup_retry_policy): with attempt:
async def delete_pipeline(
    comp_task_stop: ComputationTaskDelete,
    project_id: ProjectID,
    project_repo: ProjectsRepository = Depends(get_repository(ProjectsRepository)),
    computation_pipelines: CompPipelinesRepository = Depends(
        get_repository(CompPipelinesRepository)
    ),
    computation_tasks: CompTasksRepository = Depends(
        get_repository(CompTasksRepository)
    ),
    scheduler: BaseCompScheduler = Depends(get_scheduler),
) -> None:
    """Delete the computational pipeline and tasks of a project.

    A running pipeline is only deleted when ``comp_task_stop.force`` is set:
    it is first asked to stop, then polled (random 0-2 second pause) for up to
    ``PIPELINE_ABORT_TIMEOUT_S`` seconds. Deletion proceeds even if the
    pipeline did not reach a stopped state; that case is logged as an error.

    :raises HTTPException: 403 when the pipeline is running and ``force`` is
        not set, 404 when the project does not exist
    """
    try:
        # get the project
        project: ProjectAtDB = await project_repo.get_project(project_id)

        # check whether the current state allows stopping the computation
        comp_tasks: List[CompTaskAtDB] = await computation_tasks.get_comp_tasks(
            project_id
        )
        pipeline_state = get_pipeline_state_from_task_states(comp_tasks)

        if is_pipeline_running(pipeline_state):
            if not comp_task_stop.force:
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN,
                    detail=f"Projet {project_id} is currently running and cannot be deleted, current state is {pipeline_state}",
                )

            # abort the pipeline first
            try:
                await scheduler.stop_pipeline(comp_task_stop.user_id, project_id)
            except SchedulerError as e:
                log.warning(
                    "Project %s could not be stopped properly.\n reason: %s",
                    project_id,
                    e,
                )

            def last_attempt_result(retry_state: Any) -> Any:
                """Hand back what the final polling attempt produced."""
                return retry_state.outcome.result()

            @retry(
                retry=retry_if_result(lambda result: result is False),
                wait=wait_random(0, 2),
                stop=stop_after_delay(PIPELINE_ABORT_TIMEOUT_S),
                retry_error_callback=last_attempt_result,
                reraise=False,
                before_sleep=before_sleep_log(log, logging.INFO),
            )
            async def check_pipeline_stopped() -> bool:
                # re-read the tasks: their states change while stopping
                latest_tasks: List[
                    CompTaskAtDB
                ] = await computation_tasks.get_comp_tasks(project_id)
                latest_state = get_pipeline_state_from_task_states(latest_tasks)
                return is_pipeline_stopped(latest_state)

            # wait for the pipeline to be stopped
            stopped = await check_pipeline_stopped()
            if not stopped:
                log.error(
                    "pipeline %s could not be stopped properly after %ss",
                    project_id,
                    PIPELINE_ABORT_TIMEOUT_S,
                )

        # delete the pipeline now
        await computation_tasks.delete_tasks_from_project(project)
        await computation_pipelines.delete_pipeline(project_id)

    except ProjectNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail=f"{e}"
        ) from e