Example 1
async def wait_till_service_healthy(service_name: str, endpoint: URL):

    log.info(
        "Connecting to %s",
        f"{service_name=} at {endpoint=}",
    )
    async for attempt in AsyncRetrying(
            # randomizing the healthcheck sampling helps parallel execution
            wait=wait_random(1, 2),
            # sets the timeout for a service to become healthy
            stop=stop_after_delay(2 * MINUTE),
            before_sleep=before_sleep_log(log, logging.WARNING),
            reraise=True,
    ):
        with attempt:
            async with aiohttp.ClientSession(
                    timeout=_ONE_SEC_TIMEOUT) as session:
                async with session.get(endpoint) as response:
                    # NOTE: the health-check endpoint requires only a status code 200
                    # (see e.g. services/web/server/docker/healthcheck.py)
                    # regardless of the payload content
                    assert (
                        response.status == 200
                    ), f"Connection to {service_name=} at {endpoint=} failed with {response=}"

            log.info(
                "Connection to %s succeeded [%s]",
                f"{service_name=} at {endpoint=}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )
Example 2
    def wrapper(*args, **kwargs) -> Any:
        return Retrying(
            retry=(retry_if_network_error() | retry_if_throttling_error()),
            stop=stop_after_attempt(max_attempt_number=max_retries),
            wait=(wait_spotify_throttling() + wait_random(min=1, max=3)),
            before=before_log(retry_logger, logging.DEBUG),
            before_sleep=before_sleep_log(retry_logger, logging.WARNING),
        ).call(func, *args, **kwargs)
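This is only the inner wrapper of a decorator factory: the enclosing function, the custom retry/wait helpers, max_retries, retry_logger and func are all defined elsewhere. A minimal self-contained sketch of such a factory, with plain tenacity primitives standing in for the custom helpers (retry_api and the concrete values are assumptions, not the original code):

import functools
import logging
from typing import Any, Callable

from tenacity import (Retrying, before_log, before_sleep_log,
                      retry_if_exception_type, stop_after_attempt, wait_random)

retry_logger = logging.getLogger(__name__)


def retry_api(max_retries: int = 5) -> Callable:
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            # ConnectionError stands in for the project's custom network/throttling predicates;
            # newer tenacity versions call the Retrying object directly instead of the older .call()
            return Retrying(
                retry=retry_if_exception_type(ConnectionError),
                stop=stop_after_attempt(max_attempt_number=max_retries),
                wait=wait_random(min=1, max=3),
                before=before_log(retry_logger, logging.DEBUG),
                before_sleep=before_sleep_log(retry_logger, logging.WARNING),
            )(func, *args, **kwargs)

        return wrapper

    return decorator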
Example 3
async def _wait_for_call(mocked_fct):
    async for attempt in AsyncRetrying(
            stop=stop_after_delay(10),
            wait=wait_random(0, 1),
            retry=retry_if_exception_type(AssertionError),
            reraise=True,
    ):
        with attempt:
            print(f"waiting for call in mocked fct {mocked_fct}, "
                  f"Attempt={attempt.retry_state.attempt_number}")
            mocked_fct.assert_called_once()
Example 4
def wemo_on():
    @tenacity.retry(wait=wait_fixed(10) + wait_random(-2, 2),
                    stop=stop_after_delay(60 * 60),
                    before_sleep=before_sleep_log(_LOGGER, logging.INFO))
    def discover_and_on():
        address = settings.wemo_address
        port = pywemo.ouimeaux_device.probe_wemo(address)
        url = 'http://%s:%i/setup.xml' % (address, port)
        device = pywemo.discovery.device_from_description(url, None)
        device.on()
        _LOGGER.info("Called on on %s", device)

    discover_and_on()
    return "ok"
Example 5
async def managed_docker_compose(postgres_volume_name: str,
                                 postgres_username: str,
                                 postgres_password: str):
    typer.echo("starting up database in localhost")
    compose_file = Path.cwd() / "consistency" / "docker-compose.yml"
    try:
        subprocess.run(
            ["docker-compose", "--file", compose_file, "up", "--detach"],
            shell=False,
            check=True,
            cwd=compose_file.parent,
            env={
                **os.environ,
                **{
                    "POSTGRES_DATA_VOLUME": postgres_volume_name
                }
            },
        )
        typer.echo(
            f"database started: adminer available on http://127.0.0.1:18080/?pgsql=postgres&username={postgres_username}&db=simcoredb&ns=public"
        )

        @retry(
            wait=wait_random(1, 3),
            stop=stop_after_attempt(10),
            after=after_log(log, logging.WARN),
        )
        async def postgres_responsive():
            async with aiopg.create_pool(
                    f"dbname=simcoredb user={postgres_username} password={postgres_password} host=127.0.0.1"
            ) as pool:
                async with pool.acquire() as conn:
                    async with conn.cursor() as cur:
                        await cur.execute("SELECT 1")

        await postgres_responsive()
        yield
    finally:
        subprocess.run(
            ["docker-compose", "--file", compose_file, "down"],
            shell=False,
            check=True,
            cwd=compose_file.parent,
        )
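Since managed_docker_compose yields part-way through, it is presumably used as an async context manager in the original module. A minimal usage sketch under that assumption (the asynccontextmanager wrapping, the volume name and the credentials are illustrative, not from the original):

import asyncio
from contextlib import asynccontextmanager

# hypothetical wrapping; the original likely applies the decorator at the definition site
managed_db = asynccontextmanager(managed_docker_compose)


async def run_consistency_check() -> None:
    async with managed_db("postgres_data_volume", "testuser", "testpassword"):
        # the database is up and responsive inside this block;
        # docker-compose down runs on exit, even on error
        ...


if __name__ == "__main__":
    asyncio.run(run_consistency_check())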
Example 6
class ContainerWrapper:
    def __init__(self, container_url: str) -> None:
        self.client = ContainerClient.from_container_url(container_url)

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        # retry_if_exception_type() with no arguments retries on any Exception
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def upload_file(self, file_path: str, blob_name: str) -> None:
        with open(file_path, "rb") as handle:
            self.client.upload_blob(name=blob_name,
                                    data=handle,
                                    overwrite=True,
                                    max_concurrency=10)
        return None

    def upload_file_data(self, data: str, blob_name: str) -> None:
        self.client.upload_blob(name=blob_name,
                                data=data,
                                overwrite=True,
                                max_concurrency=10)

    def upload_dir(self, dir_path: str, recursive: bool = True) -> None:
        for path in glob.glob(os.path.join(dir_path, "**"),
                              recursive=recursive):
            if os.path.isfile(path):
                blob_name = os.path.relpath(path, start=dir_path)
                self.upload_file(path, blob_name)

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def delete_blob(self, blob_name: str) -> None:
        self.client.delete_blob(blob_name)
        return None

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def download_blob(self, blob_name: str) -> bytes:
        return cast(bytes,
                    self.client.download_blob(blob_name).content_as_bytes())

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def list_blobs(self,
                   *,
                   name_starts_with: Optional[str] = None) -> List[str]:
        result = [
            x.name
            for x in self.client.list_blobs(name_starts_with=name_starts_with)
        ]
        return cast(List[str], result)
Example 7
async def assert_service_is_running(
        service_id: str,
        docker,
        *,
        max_running_delay=1 * MINUTE
) -> Tuple[List[TaskDict], TenacityStatsDict]:
    MAX_WAIT = 5
    assert max_running_delay > 3 * MAX_WAIT

    #
    # The retry policy in this test constrains the time a service takes
    # from when it is deployed by the swarm until it is running
    # (i.e. started and healthy)
    #
    retry_policy = dict(
        # wait_random instead of wait_fixed helps parallel execution in asyncio.gather
        wait=wait_random(1, MAX_WAIT),
        stop=stop_after_delay(max_running_delay),
        before_sleep=before_sleep_log(log, logging.INFO),
        reraise=True,
    )

    async for attempt in AsyncRetrying(**retry_policy):
        with attempt:

            # service
            service: ServiceDict = await docker.services.inspect(service_id)

            assert service_id == service["ID"]

            service_name = service["Spec"]["Name"]
            num_replicas = int(
                get_from_dict(service,
                              "Spec.Mode.Replicated.Replicas",
                              default=1))

            # tasks in a service
            tasks: List[TaskDict] = await docker.tasks.list(
                filters={"service": service_name})

            tasks_current_state = [task["Status"]["State"] for task in tasks]
            num_running = sum(current == "running"
                              for current in tasks_current_state)

            # assert condition
            is_running: bool = num_replicas == num_running

            error_msg = ""
            if not is_running:
                # lazy composes error msg
                logs_lines = await docker.services.logs(
                    service_id,
                    follow=False,
                    timestamps=True,
                    tail=50,  # SEE *_docker_logs artifacts for details
                )
                log_str = " ".join(logs_lines)
                tasks_json = json.dumps(
                    [
                        copy_from_dict(
                            task,
                            include={
                                "ID":...,
                                "CreatedAt":...,
                                "UpdatedAt":...,
                                "Spec": {
                                    "ContainerSpec": {"Image"}
                                },
                                "Status": {"Timestamp", "State"},
                                "DesiredState":...,
                            },
                        ) for task in tasks
                    ],
                    indent=1,
                )
                error_msg = (
                    f"{service_name=} has {tasks_current_state=}, but expected at least {num_replicas=} running. "
                    f"Details:\n"
                    f"tasks={tasks_json}\n"
                    f"logs={log_str}\n")

            assert is_running, error_msg

            log.info(
                "Connection to %s succeded [%s]",
                service_name,
                json.dumps(attempt.retry_state.retry_object.statistics),
            )

            return tasks, attempt.retry_state.retry_object.statistics
    assert False  # never reached
Example 8
    LoggerRabbitMessage,
    ProgressRabbitMessage,
    RabbitMessageTypes,
)
from settings_library.rabbit import RabbitSettings
from tenacity import retry
from tenacity.before_sleep import before_sleep_log
from tenacity.wait import wait_random

from ..core.errors import ConfigurationError

logger = logging.getLogger(__name__)


rabbitmq_retry_policy = dict(
    wait=wait_random(5, 10),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True,
)


def setup(app: FastAPI) -> None:
    @retry(**rabbitmq_retry_policy)
    async def on_startup() -> None:
        app.state.rabbitmq_client = await RabbitMQClient.create(app)

    async def on_shutdown() -> None:
        if app.state.rabbitmq_client:
            await app.state.rabbitmq_client.delete()
            del app.state.rabbitmq_client  # type: ignore
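The snippet ends before the handlers are wired up; in a FastAPI setup function of this shape they are typically registered as event handlers, roughly as below (a sketch of the usual pattern, not the original continuation):

    # hypothetical continuation inside setup(): register the handlers with the app
    app.add_event_handler("startup", on_startup)
    app.add_event_handler("shutdown", on_shutdown)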
Example 9
class ContainerWrapper:
    client: ContainerClient

    def __init__(self, container_url: str) -> None:
        self.client = ContainerClient.from_container_url(container_url)
        self.container_url = container_url

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def upload_file(self, file_path: str, blob_name: str) -> None:
        with open(file_path, "rb") as handle:
            self.client.upload_blob(name=blob_name,
                                    data=handle,
                                    overwrite=True,
                                    max_concurrency=10)
        return None

    def upload_file_data(self, data: str, blob_name: str) -> None:
        self.client.upload_blob(name=blob_name,
                                data=data,
                                overwrite=True,
                                max_concurrency=10)

    def upload_dir(self, dir_path: str) -> None:
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(dir_path, self.container_url)

    def download_dir(self, dir_path: str) -> None:
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(self.container_url, dir_path)

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def delete_blob(self, blob_name: str) -> None:
        self.client.delete_blob(blob_name)
        return None

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def download_blob(self, blob_name: str) -> bytes:
        return cast(bytes,
                    self.client.download_blob(blob_name).content_as_bytes())

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def list_blobs(self,
                   *,
                   name_starts_with: Optional[str] = None) -> List[str]:
        result = [
            x.name
            for x in self.client.list_blobs(name_starts_with=name_starts_with)
        ]
        return cast(List[str], result)
Example 10
class ContainerWrapper:
    client: ContainerClient

    def __init__(self, container_url: str) -> None:
        self.client = ContainerClient.from_container_url(container_url)
        self.container_url = container_url

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def upload_file(self, file_path: str, blob_name: str) -> None:
        try:
            # Split the container URL to insert the blob_name
            url_parts = self.container_url.split("?", 1)

            # Default to azcopy if it is installed
            azcopy_copy(file_path,
                        url_parts[0] + "/" + blob_name + "?" + url_parts[1])
        except Exception as exc:
            # A subprocess exception would typically only contain the exit status.
            LOGGER.warning(
                "Upload using azcopy failed. Check the azcopy logs for more information."
            )
            LOGGER.warning(exc)
            # Indicate the switch in the approach for clarity in debugging
            LOGGER.warning("Now attempting to upload using the Python SDK...")

            # This does not have a try/except since it should be caught by the retry system.
            # The retry system will always attempt azcopy first and this approach second
            with open(file_path, "rb") as handle:
                # Using the Azure SDK default max_concurrency
                self.client.upload_blob(name=blob_name,
                                        data=handle,
                                        overwrite=True)
        return None

    def upload_file_data(self, data: str, blob_name: str) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            filename = os.path.join(tmpdir, blob_name)

            with open(filename, "w") as handle:
                handle.write(data)

            self.upload_file(filename, blob_name)

    def upload_dir(self, dir_path: str) -> None:
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(dir_path, self.container_url)

    def download_dir(self, dir_path: str) -> None:
        # security note: the src for azcopy comes from the server which is
        # trusted in this context, while the destination is provided by the
        # user
        azcopy_sync(self.container_url, dir_path)

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def delete_blob(self, blob_name: str) -> None:
        self.client.delete_blob(blob_name)
        return None

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def download_blob(self, blob_name: str) -> bytes:
        return cast(bytes,
                    self.client.download_blob(blob_name).content_as_bytes())

    @retry(
        stop=stop_after_attempt(10),
        wait=wait_random(min=1, max=3),
        retry=retry_if_exception_type(),
        before_sleep=before_sleep,
        reraise=True,
    )
    def list_blobs(self,
                   *,
                   name_starts_with: Optional[str] = None) -> List[str]:
        result = [
            x.name
            for x in self.client.list_blobs(name_starts_with=name_starts_with)
        ]
        return cast(List[str], result)
Example 11
)
from .director_v2_models import ClusterCreate, ClusterPatch, ClusterPing
from .director_v2_settings import (
    DirectorV2Settings,
    get_client_session,
    get_plugin_settings,
)

log = logging.getLogger(__name__)

_APP_DIRECTOR_V2_CLIENT_KEY = f"{__name__}.DirectorV2ApiClient"

SERVICE_HEALTH_CHECK_TIMEOUT = ClientTimeout(total=2, connect=1)  # type:ignore

DEFAULT_RETRY_POLICY = dict(
    wait=wait_random(0, 1),
    stop=stop_after_attempt(2),
    reraise=True,
    before_sleep=before_sleep_log(log, logging.WARNING),
)

DataType = Dict[str, Any]
DataBody = Union[DataType, List[DataType], None]


class DirectorV2ApiClient:
    def __init__(self, app: web.Application) -> None:
        self._app = app
        self._settings: DirectorV2Settings = get_plugin_settings(app)

    async def start(self, project_id: ProjectID, user_id: UserID,
Example 12
from servicelib.json_serialization import json_dumps
from starlette import status
from tenacity._asyncio import AsyncRetrying
from tenacity.before_sleep import before_sleep_log
from tenacity.stop import stop_after_delay
from tenacity.wait import wait_random

logger = logging.getLogger(__name__)

MINUTE = 60

director_startup_retry_policy = dict(
    # Services start in a random order in the swarm;
    # wait_random avoids saturating other services during startup
    #
    wait=wait_random(2, 5),
    stop=stop_after_delay(2 * MINUTE),
    before_sleep=before_sleep_log(logger, logging.WARNING),
    reraise=True,
)


async def setup_director(app: FastAPI) -> None:
    if settings := app.state.settings.CATALOG_DIRECTOR:
        # init client-api
        logger.debug("Setup director at %s...", settings.base_url)
        director_client = DirectorApi(base_url=settings.base_url, app=app)

        # check that the director is accessible
        async for attempt in AsyncRetrying(**director_startup_retry_policy):
            with attempt:
Example 13
async def delete_pipeline(
        comp_task_stop: ComputationTaskDelete,
        project_id: ProjectID,
        project_repo: ProjectsRepository = Depends(
            get_repository(ProjectsRepository)),
        computation_pipelines: CompPipelinesRepository = Depends(
            get_repository(CompPipelinesRepository)),
        computation_tasks: CompTasksRepository = Depends(
            get_repository(CompTasksRepository)),
        scheduler: BaseCompScheduler = Depends(get_scheduler),
) -> None:
    try:
        # get the project
        project: ProjectAtDB = await project_repo.get_project(project_id)
        # check if current state allow to stop the computation
        comp_tasks: List[
            CompTaskAtDB] = await computation_tasks.get_comp_tasks(project_id)
        pipeline_state = get_pipeline_state_from_task_states(comp_tasks)
        if is_pipeline_running(pipeline_state):
            if not comp_task_stop.force:
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN,
                    detail=
                    f"Projet {project_id} is currently running and cannot be deleted, current state is {pipeline_state}",
                )
            # abort the pipeline first
            try:
                await scheduler.stop_pipeline(comp_task_stop.user_id,
                                              project_id)
            except SchedulerError as e:
                log.warning(
                    "Project %s could not be stopped properly.\n reason: %s",
                    project_id,
                    e,
                )

            def return_last_value(retry_state: Any) -> Any:
                """return the result of the last call attempt"""
                return retry_state.outcome.result()

            @retry(
                stop=stop_after_delay(PIPELINE_ABORT_TIMEOUT_S),
                wait=wait_random(0, 2),
                retry_error_callback=return_last_value,
                retry=retry_if_result(lambda result: result is False),
                reraise=False,
                before_sleep=before_sleep_log(log, logging.INFO),
            )
            async def check_pipeline_stopped() -> bool:
                comp_tasks: List[
                    CompTaskAtDB] = await computation_tasks.get_comp_tasks(
                        project_id)
                pipeline_state = get_pipeline_state_from_task_states(comp_tasks)
                return is_pipeline_stopped(pipeline_state)

            # wait for the pipeline to be stopped
            if not await check_pipeline_stopped():
                log.error(
                    "pipeline %s could not be stopped properly after %ss",
                    project_id,
                    PIPELINE_ABORT_TIMEOUT_S,
                )

        # delete the pipeline now
        await computation_tasks.delete_tasks_from_project(project)
        await computation_pipelines.delete_pipeline(project_id)

    except ProjectNotFoundError as e:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND,
                            detail=f"{e}") from e