Example #1
async def wait_till_service_healthy(service_name: str, endpoint: URL):

    log.info(
        "Connecting to %s",
        f"{service_name=} at {endpoint=}",
    )
    async for attempt in AsyncRetrying(
            # randomizing healthchecks sampling helps parallel execution
            wait=wait_random(1, 2),
            # sets the timeout for a service to become healthy
            stop=stop_after_delay(2 * MINUTE),
            before_sleep=before_sleep_log(log, logging.WARNING),
            reraise=True,
    ):
        with attempt:
            async with aiohttp.ClientSession(
                    timeout=_ONE_SEC_TIMEOUT) as session:
                async with session.get(endpoint) as response:
                    # NOTE: The health-check endpoint requires only a status code 200
                    # (see e.g. services/web/server/docker/healthcheck.py)
                    # regardless of the payload content
                    assert (
                        response.status == 200
                    ), f"Connection to {service_name=} at {endpoint=} failed with {response=}"

            log.info(
                "Connection to %s succeeded [%s]",
                f"{service_name=} at {endpoint=}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )
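The snippet above omits its imports and module-level constants; the following is a minimal sketch of the assumed context plus a hypothetical invocation (MINUTE, _ONE_SEC_TIMEOUT and the example endpoint are assumptions, not the original definitions).

# Editorial sketch: assumed imports/constants for Example #1 (not part of the original source)
import asyncio
import json
import logging

import aiohttp
from tenacity import AsyncRetrying, before_sleep_log, stop_after_delay, wait_random
from yarl import URL

log = logging.getLogger(__name__)
MINUTE = 60  # seconds
_ONE_SEC_TIMEOUT = aiohttp.ClientTimeout(total=1)

# hypothetical usage:
# asyncio.run(wait_till_service_healthy("webserver", URL("http://127.0.0.1:9081/v0/")))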
Example #2
async def _is_registry_reachable(registry_settings: RegistrySettings) -> None:
    async for attempt in AsyncRetrying(
            wait=wait_fixed(1),
            stop=stop_after_attempt(1),
            before_sleep=before_sleep_log(logger, logging.INFO),
            reraise=True,
    ):
        with attempt:
            async with httpx.AsyncClient() as client:
                params = {}
                if registry_settings.REGISTRY_AUTH:
                    params["auth"] = (
                        registry_settings.REGISTRY_USER,
                        registry_settings.REGISTRY_PW.get_secret_value(),
                    )

                protocol = "https" if registry_settings.REGISTRY_SSL else "http"
                url = f"{protocol}://{registry_settings.api_url}/"
                logging.info("Registry test url ='%s'", url)
                response = await client.get(url, timeout=1, **params)
                reachable = (response.status_code == status.HTTP_200_OK
                             and response.json() == {})
                if not reachable:
                    logger.error("Response: %s", response)
                    error_message = (
                        f"Could not reach registry {registry_settings.api_url} "
                        f"auth={registry_settings.REGISTRY_AUTH}")
                    raise _RegistryNotReachableException(error_message)
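The helper above relies on names defined elsewhere in its module; a hedged sketch of plausible surrounding imports follows (the exact modules in the original project may differ).

# Editorial sketch: plausible imports for Example #2 (assumptions, not the original header)
import logging

import httpx
from fastapi import status  # provides status.HTTP_200_OK
from tenacity import AsyncRetrying, before_sleep_log, stop_after_attempt, wait_fixed

logger = logging.getLogger(__name__)


class _RegistryNotReachableException(Exception):
    ...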
Example #3
def svn_retry():
    return retry(
        retry=retry_if_exception(is_retryable_svn_exception),
        wait=wait_exponential(exp_base=SVN_RETRY_WAIT_EXP_BASE),
        stop=stop_after_attempt(max_attempt_number=SVN_RETRY_MAX_ATTEMPTS),
        before_sleep=before_sleep_log(logger, logging.DEBUG),
        reraise=True,
    )
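svn_retry() returns a ready-made tenacity decorator; a hedged usage sketch follows (the decorated function is hypothetical; SVN_RETRY_* and is_retryable_svn_exception are assumed to be defined alongside svn_retry).

# Editorial sketch: how such a decorator factory is typically applied (hypothetical function)
@svn_retry()
def checkout(url: str, destination: str) -> None:
    # call into the SVN client here; exceptions classified as retryable
    # by is_retryable_svn_exception trigger an exponential-backoff retry
    ...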
Example #4
    def __init__(self, logger: Optional[logging.Logger] = None):
        logger = logger or log

        self.kwargs = dict(
            wait=wait_fixed(self.WAIT_SECS),
            stop=stop_after_attempt(self.ATTEMPTS_COUNT),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        )
Example #5
    def __init__(self, logger: Optional[logging.Logger] = None):
        logger = logger or log

        self.kwargs = dict(
            wait=wait_fixed(2),
            stop=stop_after_delay(3 * _MINUTE),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        )
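Examples #4 and #5 only build a kwargs dict; a hedged sketch of how such a policy holder might be consumed with tenacity's Retrying (the function and parameter names are assumptions):

# Editorial sketch: consuming a retry-policy kwargs holder (hypothetical names)
from tenacity import Retrying

def wait_for_postgres(policy, check_connection) -> None:
    for attempt in Retrying(**policy.kwargs):
        with attempt:
            check_connection()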
Example #6
def simcore_stack_deployed_services(
    docker_registry: UrlStr,
    core_stack_namespace: str,
    ops_stack_namespace: str,
    core_stack_compose_specs: ComposeSpec,
    docker_client: DockerClient,
) -> List[Service]:

    # NOTE: the goal here is NOT to test time-to-deploy but
    # rather guaranteeing that the framework is fully deployed before starting
    # tests. Obviously, in a critical state in which the framework has a problem,
    # the fixture will fail
    try:
        for attempt in Retrying(
                wait=wait_fixed(5),
                stop=stop_after_delay(4 * _MINUTE),
                before_sleep=before_sleep_log(log, logging.INFO),
                reraise=True,
        ):
            with attempt:
                for service in docker_client.services.list():
                    assert_service_is_running(service)

    finally:
        for stack_namespace in (core_stack_namespace, ops_stack_namespace):
            subprocess.run(f"docker stack ps {stack_namespace}",
                           shell=True,
                           check=False)

        # logs table like
        #  ID                  NAME                  IMAGE                                      NODE                DESIRED STATE       CURRENT STATE                ERROR
        # xbrhmaygtb76        simcore_sidecar.1     itisfoundation/sidecar:latest              crespo-wkstn        Running             Running 53 seconds ago
        # zde7p8qdwk4j        simcore_rabbit.1      itisfoundation/rabbitmq:3.8.0-management   crespo-wkstn        Running             Running 59 seconds ago
        # f2gxmhwq7hhk        simcore_postgres.1    postgres:10.10                             crespo-wkstn        Running             Running about a minute ago
        # 1lh2hulxmc4q        simcore_director.1    itisfoundation/director:latest             crespo-wkstn        Running             Running 34 seconds ago
        # ...

    # TODO: find a more reliable way to list services in a stack
    core_stack_services: List[Service] = [
        service for service in docker_client.services.list(
            filters={
                "label": f"com.docker.stack.namespace={core_stack_namespace}"
            })
    ]  # type: ignore

    assert (core_stack_services
            ), f"Expected some services in core stack '{core_stack_namespace}'"

    assert len(core_stack_compose_specs["services"].keys()) == len(
        core_stack_services)

    return core_stack_services
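assert_service_is_running is not shown in this fixture; below is a minimal sketch of what such a check could look like with docker-py (an assumption, not the original helper). A richer async variant appears near the end of this listing.

# Editorial sketch: hypothetical sync helper for Example #6, using docker-py's Service.tasks()
def assert_service_is_running(service) -> None:
    task_states = [task["Status"]["State"] for task in service.tasks()]
    assert "running" in task_states, (
        f"service {service.name} has no running task: {task_states}")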
Example #7
 async def _check_all_services_are_running():
     async for attempt in AsyncRetrying(
             wait=wait_fixed(5),
             stop=stop_after_delay(8 * MINUTE),
             before_sleep=before_sleep_log(log, logging.INFO),
             reraise=True,
     ):
         with attempt:
             await asyncio.gather(*[
                 asyncio.get_event_loop().run_in_executor(
                     None, assert_service_is_running, service)
                 for service in docker_client.services.list()
             ])
Example #8
def wemo_off():
    @tenacity.retry(wait=wait_fixed(10),
                    before_sleep=before_sleep_log(_LOGGER, logging.INFO))
    def discover_and_off():
        address = settings.wemo_address
        port = pywemo.ouimeaux_device.probe_wemo(address)
        url = 'http://%s:%i/setup.xml' % (address, port)
        device = pywemo.discovery.device_from_description(url, None)
        device.off()
        _LOGGER.info("Called off on %s", device)

    discover_and_off()
    return "ok"
Example #9
async def setup_registry(app: web.Application) -> AsyncIterator[None]:
    logger.debug("pinging registry...")

    @retry(
        wait=wait_fixed(2),
        before_sleep=before_sleep_log(logger, logging.WARNING),
        retry=retry_if_result(lambda result: result == False),
        reraise=True,
    )
    async def wait_until_registry_responsive(app: web.Application) -> bool:
        return await is_registry_responsive(app)

    await wait_until_registry_responsive(app)
    logger.info("Connected to docker registry")
    yield
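setup_registry is written as an aiohttp cleanup-context generator; a hedged sketch of how it would typically be registered on the application:

# Editorial sketch: registering the generator above as an aiohttp cleanup context
def setup(app: web.Application) -> None:
    app.cleanup_ctx.append(setup_registry)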
Example #10
 async def create(
     cls,
     app: FastAPI,
     settings: DaskSchedulerSettings,
     endpoint: AnyUrl,
     authentication: ClusterAuthentication,
 ) -> "DaskClient":
     logger.info(
         "Initiating connection to %s with auth: %s",
         f"dask-scheduler/gateway at {endpoint}",
         authentication,
     )
     async for attempt in AsyncRetrying(
             reraise=True,
             before_sleep=before_sleep_log(logger, logging.WARNING),
             wait=wait_fixed(0.3),
             stop=stop_after_attempt(3),
     ):
         with attempt:
             logger.debug(
                 "Connecting to %s, attempt %s...",
                 endpoint,
                 attempt.retry_state.attempt_number,
             )
             dask_subsystem = await _create_internal_client_based_on_auth(
                 endpoint, authentication)
             check_scheduler_status(dask_subsystem.client)
             instance = cls(
                 app=app,
                 dask_subsystem=dask_subsystem,
                 settings=settings,
                 cancellation_dask_pub=distributed.Pub(
                     TaskCancelEvent.topic_name(),
                     client=dask_subsystem.client),
             )
             logger.info(
                 "Connection to %s succeeded [%s]",
                 f"dask-scheduler/gateway at {endpoint}",
                 json.dumps(attempt.retry_state.retry_object.statistics),
             )
             logger.info(
                 "Scheduler info:\n%s",
                 json.dumps(dask_subsystem.client.scheduler_info(),
                            indent=2),
             )
             return instance
     # this is to satisfy pylance
     raise ValueError("Could not create client")
Example #11
    async def _create_client(address: str) -> aioredis.Redis:
        client: Optional[aioredis.Redis] = None

        async for attempt in AsyncRetrying(
                stop=stop_after_delay(1 * _MINUTE),
                wait=wait_fixed(_WAIT_SECS),
                before_sleep=before_sleep_log(log, logging.WARNING),
                reraise=True,
        ):
            with attempt:
                client = await aioredis.create_redis_pool(address,
                                                          encoding="utf-8")
                log.info(
                    "Connection to %s succeeded with %s [%s]",
                    f"redis at {address=}",
                    f"{client=}",
                    json.dumps(attempt.retry_state.retry_object.statistics),
                )
        assert client  # nosec
        return client
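aioredis.create_redis_pool is the aioredis 1.x API; if this were ported to redis-py 4.2+ (which absorbed aioredis), a hedged equivalent of the connection step might look like this:

# Editorial sketch: redis-py asyncio equivalent of the aioredis 1.x call above (assumption)
import redis.asyncio as aioredis_v2

async def _ping(address: str) -> aioredis_v2.Redis:
    client = aioredis_v2.Redis.from_url(address, encoding="utf-8", decode_responses=True)
    await client.ping()  # raises if the server is not reachable yet
    return client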
Example #12
async def test_listen_comp_tasks_task(
    mock_project_subsystem: Dict,
    comp_task_listening_task: None,
    client,
    update_values: Dict[str, Any],
    expected_calls: List[str],
    task_class: NodeClass,
):
    db_engine: aiopg.sa.Engine = client.app[APP_DB_ENGINE_KEY]
    async with db_engine.acquire() as conn:
        # let's put some stuff in there now
        result = await conn.execute(
            comp_tasks.insert()
            .values(outputs=json.dumps({}), node_class=task_class)
            .returning(literal_column("*"))
        )
        row: RowProxy = await result.fetchone()
        task = dict(row)

        # let's update some values
        await conn.execute(
            comp_tasks.update()
            .values(**update_values)
            .where(comp_tasks.c.task_id == task["task_id"])
        )

        # tests whether listener gets hooked calls executed
        for call_name, mocked_call in mock_project_subsystem.items():
            if call_name in expected_calls:
                async for attempt in AsyncRetrying(
                    wait=wait_fixed(1),
                    stop=stop_after_delay(10),
                    retry=retry_if_exception_type(AssertionError),
                    before_sleep=before_sleep_log(logger, logging.INFO),
                    reraise=True,
                ):
                    with attempt:
                        mocked_call.assert_awaited()

            else:
                mocked_call.assert_not_called()
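The retry-on-AssertionError pattern above is a common way to poll a mock until a background listener fires; it can be factored into a small reusable helper, sketched here with an assumed name:

# Editorial sketch: generic "retry until the assertion passes" helper (hypothetical)
async def wait_for_assertion(check, timeout_s: float = 10) -> None:
    async for attempt in AsyncRetrying(
        wait=wait_fixed(1),
        stop=stop_after_delay(timeout_s),
        retry=retry_if_exception_type(AssertionError),
        reraise=True,
    ):
        with attempt:
            check()

# e.g. await wait_for_assertion(mocked_call.assert_awaited)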
Example #13
async def create_client(url) -> aioredis.Redis:
    # create redis client
    client: Optional[aioredis.Redis] = None
    async for attempt in AsyncRetrying(
            stop=stop_after_delay(1 * _MINUTE),
            wait=wait_fixed(_WAIT_SECS),
            before_sleep=before_sleep_log(log, logging.WARNING),
            reraise=True,
    ):
        with attempt:
            client = await aioredis.create_redis_pool(url, encoding="utf-8")
            if not client:
                raise ValueError(
                    f"Expected aioredis client instance, got {client}")
            log.info(
                "Connection to %s succeeded [%s]",
                f"redis at {url=}",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )
    assert client  # nosec
    return client
Example #14
def simcore_docker_stack_and_registry_ready(
    event_loop: asyncio.AbstractEventLoop,
    docker_registry: UrlStr,
    docker_stack: Dict,
    simcore_services_ready: None,
) -> Dict:
    # At this point `simcore_services_ready` waited until all services
    # are running. Let's make one more check on the web-api
    for attempt in Retrying(
        wait=wait_fixed(1),
        stop=stop_after_delay(0.5 * _MINUTE),
        reraise=True,
        before_sleep=before_sleep_log(log, logging.INFO),
    ):
        with attempt:
            resp = httpx.get("http://127.0.0.1:9081/v0/")
            resp.raise_for_status()
            log.info(
                "Connection to osparc-simcore web API succeeded [%s]",
                json.dumps(attempt.retry_state.retry_object.statistics),
            )

    return docker_stack
Example #15
class DataStorageManager:  # pylint: disable=too-many-public-methods
    """Data storage manager

    The dsm has access to the database for all meta data and to the actual backend. For now this
    is simcore's S3 [minio] and the datcore storage facilities.

    For all data that is in-house (simcore.s3, ...) we keep a synchronized database with meta information
    for the physical files.

    For physical changes on S3, that might be time-consuming, the db keeps a state (delete and upload mostly)

    The dsm provides the following additional functionalities:

    - listing of folders for a given user, optionally filtered using a regular expression and optionally
      sorted by one of the meta data keys

    - upload/download of files

        client -> S3 : presigned upload link
        S3 -> client : presigned download link
        datcore -> client: presigned download link
        S3 -> datcore: local copy and then upload via their api

    minio/S3 and postgres can talk nicely with each other via Notifications using RabbitMQ, which we already have.
    See:

        https://blog.minio.io/part-5-5-publish-minio-events-via-postgresql-50f6cc7a7346
        https://docs.minio.io/docs/minio-bucket-notification-guide.html
    """

    # TODO: perhaps can be used a cache? add a lifetime?

    s3_client: MinioClientWrapper
    engine: Engine
    loop: object
    pool: ThreadPoolExecutor
    simcore_bucket_name: str
    has_project_db: bool
    session: AioSession = field(default_factory=aiobotocore.get_session)
    datcore_tokens: Dict[str, DatCoreApiToken] = field(default_factory=dict)
    app: Optional[web.Application] = None

    def _create_aiobotocore_client_context(self) -> ClientCreatorContext:
        assert hasattr(self.session, "create_client")  # nosec
        # pylint: disable=no-member

        # SEE API in https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html
        # SEE https://aiobotocore.readthedocs.io/en/latest/index.html
        return self.session.create_client(
            "s3",
            endpoint_url=self.s3_client.endpoint_url,
            aws_access_key_id=self.s3_client.access_key,
            aws_secret_access_key=self.s3_client.secret_key,
        )

    def _get_datcore_tokens(
            self, user_id: str) -> Tuple[Optional[str], Optional[str]]:
        # pylint: disable=no-member
        token = self.datcore_tokens.get(user_id, DatCoreApiToken())
        return token.to_tuple()

    async def locations(self, user_id: str):
        locs = []
        simcore_s3 = {"name": SIMCORE_S3_STR, "id": SIMCORE_S3_ID}
        locs.append(simcore_s3)

        api_token, api_secret = self._get_datcore_tokens(user_id)

        if api_token and api_secret and self.app:
            if await datcore_adapter.check_user_can_connect(
                    self.app, api_token, api_secret):
                datcore = {"name": DATCORE_STR, "id": DATCORE_ID}
                locs.append(datcore)

        return locs

    @classmethod
    def location_from_id(cls, location_id: str):
        return get_location_from_id(location_id)

    # LIST/GET ---------------------------

    # pylint: disable=too-many-arguments
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements
    async def list_files(self,
                         user_id: str,
                         location: str,
                         uuid_filter: str = "",
                         regex: str = "") -> List[FileMetaDataEx]:
        """Returns a list of file paths

        - Works for simcore.s3 and datcore
        - Can filter on uuid: useful to filter on project_id/node_id
        - Can filter by regular expression (for now only on key: value pairs of the FileMetaData)
        """
        data = deque()
        if location == SIMCORE_S3_STR:
            accesible_projects_ids = []
            async with self.engine.acquire() as conn, conn.begin():
                accesible_projects_ids = await get_readable_project_ids(
                    conn, int(user_id))
                where_statement = (
                    file_meta_data.c.user_id == user_id
                ) | file_meta_data.c.project_id.in_(accesible_projects_ids)
                if uuid_filter:
                    where_statement &= file_meta_data.c.file_uuid.ilike(
                        f"%{uuid_filter}%")
                query = sa.select([file_meta_data]).where(where_statement)

                async for row in conn.execute(query):
                    dex = to_meta_data_extended(row)
                    if not is_file_entry_valid(dex.fmd):
                        # NOTE: the file is not updated with the information from S3 backend.
                        # 1. Either the file exists, but was never updated in the database
                        # 2. Or the file does not exist or was never completed, and the file_meta_data entry is old and faulty
                        # we need to update from S3 here since the database is not up-to-date
                        dex = await self.try_update_database_from_storage(
                            dex.fmd.file_uuid,
                            dex.fmd.bucket_name,
                            dex.fmd.object_name,
                        )
                    if dex:
                        data.append(dex)

            if self.has_project_db:
                uuid_name_dict = {}
                # now parse the project to search for node/project names
                try:
                    async with self.engine.acquire() as conn, conn.begin():
                        query = sa.select([projects]).where(
                            projects.c.uuid.in_(accesible_projects_ids))

                        async for row in conn.execute(query):
                            proj_data = dict(row.items())

                            uuid_name_dict[
                                proj_data["uuid"]] = proj_data["name"]
                            wb = proj_data["workbench"]
                            for node in wb.keys():
                                uuid_name_dict[node] = wb[node]["label"]
                except DBAPIError as _err:
                    logger.exception(
                        "Error querying database for project names")

                if not uuid_name_dict:
                    # there seems to be no project whatsoever for user_id
                    return []

                # only keep files from non-deleted project
                clean_data = deque()
                for dx in data:
                    d = dx.fmd
                    if d.project_id not in uuid_name_dict:
                        continue
                    #
                    # FIXME: artificially fills ['project_name', 'node_name', 'file_id', 'raw_file_path', 'display_file_path']
                    #        with information from the projects table!

                    d.project_name = uuid_name_dict[d.project_id]
                    if d.node_id in uuid_name_dict:
                        d.node_name = uuid_name_dict[d.node_id]

                    d.raw_file_path = str(
                        Path(d.project_id) / Path(d.node_id) /
                        Path(d.file_name))
                    d.display_file_path = d.raw_file_path
                    d.file_id = d.file_uuid
                    if d.node_name and d.project_name:
                        d.display_file_path = str(
                            Path(d.project_name) / Path(d.node_name) /
                            Path(d.file_name))
                        # the data was synced to the postgres metadata table at this point
                        clean_data.append(dx)

                data = clean_data

        elif location == DATCORE_STR:
            api_token, api_secret = self._get_datcore_tokens(user_id)
            assert self.app  # nosec
            assert api_secret  # nosec
            assert api_token  # nosec
            return await datcore_adapter.list_all_datasets_files_metadatas(
                self.app, api_token, api_secret)

        if uuid_filter:
            # TODO: incorporate this in db query!
            _query = re.compile(uuid_filter, re.IGNORECASE)
            filtered_data = deque()
            for dx in data:
                d = dx.fmd
                if _query.search(d.file_uuid):
                    filtered_data.append(dx)

            return list(filtered_data)

        if regex:
            _query = re.compile(regex, re.IGNORECASE)
            filtered_data = deque()
            for dx in data:
                d = dx.fmd
                _vars = vars(d)
                for v in _vars.keys():
                    if _query.search(v) or _query.search(str(_vars[v])):
                        filtered_data.append(dx)
                        break
            return list(filtered_data)

        return list(data)

    async def list_files_dataset(
            self, user_id: str, location: str, dataset_id: str
    ) -> Union[List[FileMetaData], List[FileMetaDataEx]]:
        # this is a cheap shot, needs fixing once storage/db is in sync
        data = []
        if location == SIMCORE_S3_STR:
            data: List[FileMetaDataEx] = await self.list_files(
                user_id, location, uuid_filter=dataset_id + "/")

        elif location == DATCORE_STR:
            api_token, api_secret = self._get_datcore_tokens(user_id)
            # lists all the files inside the dataset
            assert self.app  # nosec
            assert api_secret  # nosec
            assert api_token  # nosec
            return await datcore_adapter.list_all_files_metadatas_in_dataset(
                self.app, api_token, api_secret, dataset_id)

        return data

    async def list_datasets(self, user_id: str,
                            location: str) -> List[DatasetMetaData]:
        """Returns a list of top level datasets

        Works for simcore.s3 and datcore

        """
        data = []

        if location == SIMCORE_S3_STR:
            if self.has_project_db:
                try:
                    async with self.engine.acquire() as conn, conn.begin():
                        readable_projects_ids = await get_readable_project_ids(
                            conn, int(user_id))
                        has_read_access = projects.c.uuid.in_(
                            readable_projects_ids)

                        # FIXME: this DOES NOT read from file-metadata table!!!
                        query = sa.select([projects.c.uuid, projects.c.name
                                           ]).where(has_read_access)
                        async for row in conn.execute(query):
                            dmd = DatasetMetaData(
                                dataset_id=row.uuid,
                                display_name=row.name,
                            )
                            data.append(dmd)
                except DBAPIError as _err:
                    logger.exception(
                        "Error querying database for project names")

        elif location == DATCORE_STR:
            api_token, api_secret = self._get_datcore_tokens(user_id)
            assert self.app  # nosec
            assert api_secret  # nosec
            assert api_token  # nosec
            return await datcore_adapter.list_datasets(self.app, api_token,
                                                       api_secret)

        return data

    async def list_file(self, user_id: str, location: str,
                        file_uuid: str) -> Optional[FileMetaDataEx]:

        if location == SIMCORE_S3_STR:

            async with self.engine.acquire() as conn, conn.begin():
                can: Optional[AccessRights] = await get_file_access_rights(
                    conn, int(user_id), file_uuid)
                if can.read:
                    query = sa.select([
                        file_meta_data
                    ]).where(file_meta_data.c.file_uuid == file_uuid)
                    result = await conn.execute(query)
                    row = await result.first()
                    if not row:
                        return None
                    file_metadata = to_meta_data_extended(row)
                    if is_file_entry_valid(file_metadata.fmd):
                        return file_metadata
                    # we need to update from S3 here since the database is not up-to-date
                    file_metadata = await self.try_update_database_from_storage(
                        file_metadata.fmd.file_uuid,
                        file_metadata.fmd.bucket_name,
                        file_metadata.fmd.object_name,
                    )
                    return file_metadata
                # FIXME: returns None in both cases: file does not exist or user has no access
                logger.debug("User %s cannot read file %s", user_id, file_uuid)
                return None

        elif location == DATCORE_STR:
            # FIXME: review return inconsistencies
            # api_token, api_secret = self._get_datcore_tokens(user_id)
            import warnings

            warnings.warn("NOT IMPLEMENTED!!!")
            return None

    # UPLOAD/DOWNLOAD LINKS ---------------------------

    async def upload_file_to_datcore(self, _user_id: str,
                                     _local_file_path: str,
                                     _destination_id: str):
        import warnings

        warnings.warn(f"NOT IMPLEMENTED!!! in {self.__class__}")
        # uploads a locally available file to dat core given the storage path, optionally attaching some meta data
        # api_token, api_secret = self._get_datcore_tokens(user_id)
        # await dcw.upload_file_to_id(destination_id, local_file_path)

    async def try_update_database_from_storage(
        self,
        file_uuid: str,
        bucket_name: str,
        object_name: str,
        silence_exception: bool = False,
    ) -> Optional[FileMetaDataEx]:
        try:
            async with self._create_aiobotocore_client_context(
            ) as aioboto_client:
                result = await aioboto_client.head_object(Bucket=bucket_name,
                                                          Key=object_name
                                                          )  # type: ignore

                file_size = result["ContentLength"]  # type: ignore
                last_modified = result["LastModified"]  # type: ignore
                entity_tag = result["ETag"].strip('"')  # type: ignore

                async with self.engine.acquire() as conn:
                    result: ResultProxy = await conn.execute(
                        file_meta_data.update().where(
                            file_meta_data.c.file_uuid == file_uuid).values(
                                file_size=file_size,
                                last_modified=last_modified,
                                entity_tag=entity_tag,
                            ).returning(literal_column("*")))
                    if not result:
                        return None
                    row: Optional[RowProxy] = await result.first()
                    if not row:
                        return None

                    return to_meta_data_extended(row)
        except botocore.exceptions.ClientError:
            if silence_exception:
                logger.debug("Error happened while trying to access %s",
                             file_uuid)
            else:
                logger.warning("Error happened while trying to access %s",
                               file_uuid,
                               exc_info=True)
            # the file is not existing or some error happened
            return None

    @retry(
        stop=stop_after_delay(1 * _HOUR),
        wait=wait_exponential(multiplier=0.1, exp_base=1.2, max=30),
        retry=(retry_if_exception_type()
               | retry_if_result(lambda result: result is None)),
        before_sleep=before_sleep_log(logger, logging.INFO),
    )
    async def auto_update_database_from_storage_task(self, file_uuid: str,
                                                     bucket_name: str,
                                                     object_name: str):
        return await self.try_update_database_from_storage(
            file_uuid, bucket_name, object_name, silence_exception=True)
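    # Editorial note on the retry policy above: retry_if_exception_type() with no
    # arguments retries on any Exception, and retry_if_result(lambda r: r is None)
    # keeps polling while the DB row is still missing, so the task re-checks S3/DB
    # until the metadata row appears or stop_after_delay(1 * _HOUR) gives up.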

    async def upload_link(self, user_id: str, file_uuid: str):
        """
        Creates pre-signed upload link and updates metadata table when
        link is used and upload is successfully completed

        SEE _metadata_file_updater
        """

        async with self.engine.acquire() as conn:
            can: Optional[AccessRights] = await get_file_access_rights(
                conn, int(user_id), file_uuid)
            if not can.write:
                logger.debug("User %s was not allowed to upload file %s",
                             user_id, file_uuid)
                raise web.HTTPForbidden(
                    reason=
                    f"User does not have enough access rights to upload file {file_uuid}"
                )

        @retry(**postgres_service_retry_policy_kwargs)
        async def _init_metadata() -> Tuple[int, str]:
            async with self.engine.acquire() as conn:
                fmd = FileMetaData()
                fmd.simcore_from_uuid(file_uuid, self.simcore_bucket_name)
                fmd.user_id = user_id  # NOTE: takes ownership of uploaded data

                # if file already exists, we might want to update a time-stamp

                # upsert file_meta_data
                insert_stmt = pg_insert(file_meta_data).values(**vars(fmd))
                do_nothing_stmt = insert_stmt.on_conflict_do_nothing(
                    index_elements=["file_uuid"])
                await conn.execute(do_nothing_stmt)

                return fmd.file_size, fmd.last_modified

        await _init_metadata()

        bucket_name = self.simcore_bucket_name
        object_name = file_uuid

        # a parallel task is started which will update the metadata of the uploaded file
        # once the upload has finished.
        fire_and_forget_task(
            self.auto_update_database_from_storage_task(
                file_uuid=file_uuid,
                bucket_name=bucket_name,
                object_name=object_name,
            ))
        return self.s3_client.create_presigned_put_url(bucket_name,
                                                       object_name)

    async def download_link_s3(self, file_uuid: str, user_id: int) -> str:

        # access layer
        async with self.engine.acquire() as conn:
            can: Optional[AccessRights] = await get_file_access_rights(
                conn, int(user_id), file_uuid)
            if not can.read:
                # NOTE: this is tricky. A user with read access can download any data!
                # If write permission would be required, then shared projects as views cannot
                # recover data in nodes (e.g. jupyter cannot pull work data)
                #
                logger.debug("User %s was not allowed to download file %s",
                             user_id, file_uuid)
                raise web.HTTPForbidden(
                    reason=
                    f"User does not have enough rights to download {file_uuid}"
                )

        bucket_name = self.simcore_bucket_name
        async with self.engine.acquire() as conn:
            stmt = sa.select([file_meta_data.c.object_name
                              ]).where(file_meta_data.c.file_uuid == file_uuid)
            object_name: Optional[str] = await conn.scalar(stmt)

            if object_name is None:
                raise web.HTTPNotFound(
                    reason=f"File '{file_uuid}' does not exists in storage.")

        link = self.s3_client.create_presigned_get_url(bucket_name,
                                                       object_name)
        return link

    async def download_link_datcore(self, user_id: str, file_id: str) -> URL:
        api_token, api_secret = self._get_datcore_tokens(user_id)
        assert self.app  # nosec
        assert api_secret  # nosec
        assert api_token  # nosec
        return await datcore_adapter.get_file_download_presigned_link(
            self.app, api_token, api_secret, file_id)

    # COPY -----------------------------

    async def copy_file_s3_s3(self, user_id: str, dest_uuid: str,
                              source_uuid: str):
        # FIXME: operation MUST be atomic

        # source is s3, location is s3
        to_bucket_name = self.simcore_bucket_name
        to_object_name = dest_uuid
        from_bucket = self.simcore_bucket_name
        from_object_name = source_uuid
        # FIXME: This is not async!
        self.s3_client.copy_object(to_bucket_name, to_object_name, from_bucket,
                                   from_object_name)

        # update db
        async with self.engine.acquire() as conn:
            fmd = FileMetaData()
            fmd.simcore_from_uuid(dest_uuid, self.simcore_bucket_name)
            fmd.user_id = user_id
            ins = file_meta_data.insert().values(**vars(fmd))
            await conn.execute(ins)

    async def copy_file_s3_datcore(self, user_id: str, dest_uuid: str,
                                   source_uuid: str):
        session = get_client_session(self.app)

        # source is s3, get link and copy to datcore
        bucket_name = self.simcore_bucket_name
        object_name = source_uuid
        filename = source_uuid.split("/")[-1]

        s3_download_link = self.s3_client.create_presigned_get_url(
            bucket_name, object_name)

        with tempfile.TemporaryDirectory() as tmpdir:
            # FIXME: connect download and upload streams
            local_file_path = os.path.join(tmpdir, filename)

            # Downloads S3 -> local
            await download_to_file_or_raise(session, s3_download_link,
                                            local_file_path)

            # Uploads local -> DATCore
            await self.upload_file_to_datcore(
                _user_id=user_id,
                _local_file_path=local_file_path,
                _destination_id=dest_uuid,
            )

    async def copy_file_datcore_s3(
        self,
        user_id: str,
        dest_uuid: str,
        source_uuid: str,
        filename_missing: bool = False,
    ):
        session = get_client_session(self.app)

        # 2 steps: get the download link for a local copy, then the upload link to s3
        # TODO: This should be a redirect stream!
        dc_link, filename = await self.download_link_datcore(
            user_id=user_id, file_id=source_uuid)
        if filename_missing:
            dest_uuid = str(Path(dest_uuid) / filename)

        s3_upload_link = await self.upload_link(user_id, dest_uuid)

        with tempfile.TemporaryDirectory() as tmpdir:
            # FIXME: connect download and upload streams

            local_file_path = os.path.join(tmpdir, filename)

            # Downloads DATCore -> local
            await download_to_file_or_raise(session, dc_link, local_file_path)

            # Uploads local -> S3
            s3_upload_link = URL(s3_upload_link)
            async with session.put(
                    s3_upload_link,
                    data=Path(local_file_path).open("rb"),
                    raise_for_status=True,
            ) as resp:
                logger.debug(
                    "Uploaded local -> SIMCore %s . Status %s",
                    s3_upload_link,
                    resp.status,
                )

        return dest_uuid

    async def copy_file(
        self,
        user_id: str,
        dest_location: str,
        dest_uuid: str,
        source_location: str,
        source_uuid: str,
    ):
        if source_location == SIMCORE_S3_STR:
            if dest_location == DATCORE_STR:
                await self.copy_file_s3_datcore(user_id, dest_uuid,
                                                source_uuid)
            elif dest_location == SIMCORE_S3_STR:
                await self.copy_file_s3_s3(user_id, dest_uuid, source_uuid)
        elif source_location == DATCORE_STR:
            if dest_location == DATCORE_STR:
                raise NotImplementedError(
                    "copy files from datcore 2 datcore not impl")
            if dest_location == SIMCORE_S3_STR:
                await self.copy_file_datcore_s3(user_id, dest_uuid,
                                                source_uuid)

    async def deep_copy_project_simcore_s3(
        self,
        user_id: str,
        source_project: Dict[str, Any],
        destination_project: Dict[str, Any],
        node_mapping: Dict[str, str],
    ):
        """Parses a given source project and copies all related files to the destination project

        Since all files are organized as

            project_id/node_id/filename or links to datcore

        this function creates a new folder structure

            project_id/node_id/filename

        and copies all files to the corresponding places.

        Additionally, all external files from datcore are being copied and the paths in the destination
        project are adapted accordingly

        Lastly, the meta data db is kept in sync
        """
        source_folder = source_project["uuid"]
        dest_folder = destination_project["uuid"]

        # access layer
        async with self.engine.acquire() as conn, conn.begin():
            source_access_rights = await get_project_access_rights(
                conn, int(user_id), project_id=source_folder)
            dest_access_rights = await get_project_access_rights(
                conn, int(user_id), project_id=dest_folder)
        if not source_access_rights.read:
            logger.debug(
                "User %s was not allowed to read from project %s",
                user_id,
                source_folder,
            )
            raise web.HTTPForbidden(
                reason=
                f"User does not have enough access rights to read from project '{source_folder}'"
            )

        if not dest_access_rights.write:
            logger.debug(
                "User %s was not allowed to write to project %s",
                user_id,
                dest_folder,
            )
            raise web.HTTPForbidden(
                reason=
                f"User does not have enough access rights to write to project '{dest_folder}'"
            )

        # build up naming map based on labels
        uuid_name_dict = {}
        uuid_name_dict[dest_folder] = destination_project["name"]
        for src_node_id, src_node in source_project["workbench"].items():
            new_node_id = node_mapping.get(src_node_id)
            if new_node_id is not None:
                uuid_name_dict[new_node_id] = src_node["label"]

        async with self._create_aiobotocore_client_context() as aioboto_client:

            logger.debug(
                "Listing all items under  %s:%s/",
                self.simcore_bucket_name,
                source_folder,
            )

            # Step 1: list all objects for this project, replace them with the destination object name,
            # and do a copy; at the same time collect some names
            # Note: the / at the end of the Prefix is VERY important, it makes the listing several orders of magnitude faster
            response = await aioboto_client.list_objects_v2(
                Bucket=self.simcore_bucket_name, Prefix=f"{source_folder}/")

            contents: List = response.get("Contents", [])
            logger.debug(
                "Listed  %s items under %s:%s/",
                len(contents),
                self.simcore_bucket_name,
                source_folder,
            )

            for item in contents:
                source_object_name = item["Key"]
                source_object_parts = Path(source_object_name).parts

                if len(source_object_parts) != 3:
                    # This may happen once we have shared/home folders
                    # FIXME: this might cause problems
                    logger.info(
                        "Skipping copy of '%s'. Expected three parts path!",
                        source_object_name,
                    )
                    continue

                old_node_id = source_object_parts[1]
                new_node_id = node_mapping.get(old_node_id)
                if new_node_id is not None:
                    old_filename = source_object_parts[2]
                    dest_object_name = str(
                        Path(dest_folder) / new_node_id / old_filename)

                    copy_kwargs = dict(
                        CopySource={
                            "Bucket": self.simcore_bucket_name,
                            "Key": source_object_name,
                        },
                        Bucket=self.simcore_bucket_name,
                        Key=dest_object_name,
                    )
                    logger.debug("Copying %s ...", copy_kwargs)

                    # FIXME: if 5GB, it must use multipart upload Upload Part - Copy API
                    # SEE https://botocore.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.copy_object
                    await aioboto_client.copy_object(**copy_kwargs)

        # Step 2: List all references in outputs that point to datcore and copy over
        for node_id, node in destination_project["workbench"].items():
            outputs: Dict = node.get("outputs", {})
            for _, output in outputs.items():
                source = output["path"]

                if output.get("store") == DATCORE_ID:
                    destination_folder = str(Path(dest_folder) / node_id)
                    logger.info("Copying %s to %s", source, destination_folder)

                    destination = await self.copy_file_datcore_s3(
                        user_id=user_id,
                        dest_uuid=destination_folder,
                        source_uuid=source,
                        filename_missing=True,
                    )
                    assert destination.startswith(destination_folder)  # nosec

                    output["store"] = SIMCORE_S3_ID
                    output["path"] = destination

                elif output.get("store") == SIMCORE_S3_ID:
                    destination = str(
                        Path(dest_folder) / node_id / Path(source).name)
                    output["store"] = SIMCORE_S3_ID
                    output["path"] = destination

        fmds = []
        async with self._create_aiobotocore_client_context() as aioboto_client:

            # step 3: list files first to create fmds
            # Note: the / at the end of the Prefix is VERY important, it makes the listing several orders of magnitude faster
            response = await aioboto_client.list_objects_v2(
                Bucket=self.simcore_bucket_name, Prefix=f"{dest_folder}/")

            if "Contents" in response:
                for item in response["Contents"]:
                    fmd = FileMetaData()
                    fmd.simcore_from_uuid(item["Key"],
                                          self.simcore_bucket_name)
                    fmd.project_name = uuid_name_dict.get(
                        dest_folder, "Untitled")
                    fmd.node_name = uuid_name_dict.get(fmd.node_id, "Untitled")
                    fmd.raw_file_path = fmd.file_uuid
                    fmd.display_file_path = str(
                        Path(fmd.project_name) / fmd.node_name / fmd.file_name)
                    fmd.user_id = user_id
                    fmd.file_size = item["Size"]
                    fmd.last_modified = str(item["LastModified"])
                    fmds.append(fmd)

        # step 4 sync db
        async with self.engine.acquire() as conn, conn.begin():
            # TODO: upsert ALL in one statement
            for fmd in fmds:
                query = sa.select([
                    file_meta_data
                ]).where(file_meta_data.c.file_uuid == fmd.file_uuid)
                # if file already exists, we might want to update a time-stamp
                rows = await conn.execute(query)
                exists = await rows.scalar()
                if exists:
                    delete_me = file_meta_data.delete().where(
                        file_meta_data.c.file_uuid == fmd.file_uuid)
                    await conn.execute(delete_me)
                ins = file_meta_data.insert().values(**vars(fmd))
                await conn.execute(ins)

    # DELETE -------------------------------------

    async def delete_file(self, user_id: str, location: str, file_uuid: str):
        """Deletes a file given its fmd and location

        Additionally requires a user_id for 3rd party auth

        For internal storage, the db state should be updated upon completion via
        Notification mechanism

        For simcore.s3 we can use the file_name
        For datcore we need the full path
        """
        if location == SIMCORE_S3_STR:
            # FIXME: operation MUST be atomic, transaction??

            to_delete = []
            async with self.engine.acquire() as conn, conn.begin():
                can: Optional[AccessRights] = await get_file_access_rights(
                    conn, int(user_id), file_uuid)
                if not can.delete:
                    logger.debug(
                        "User %s was not allowed to delete file %s",
                        user_id,
                        file_uuid,
                    )
                    raise web.HTTPForbidden(
                        reason=
                        f"User '{user_id}' does not have enough access rights to delete file {file_uuid}"
                    )

                query = sa.select([
                    file_meta_data.c.bucket_name, file_meta_data.c.object_name
                ]).where(file_meta_data.c.file_uuid == file_uuid)

                async for row in conn.execute(query):
                    if self.s3_client.remove_objects(row.bucket_name,
                                                     [row.object_name]):
                        to_delete.append(file_uuid)

                await conn.execute(file_meta_data.delete().where(
                    file_meta_data.c.file_uuid.in_(to_delete)))

        elif location == DATCORE_STR:
            # FIXME: review return inconsistencies
            api_token, api_secret = self._get_datcore_tokens(user_id)
            assert self.app  # nosec
            assert api_secret  # nosec
            assert api_token  # nosec
            await datcore_adapter.delete_file(self.app, api_token, api_secret,
                                              file_uuid)

    async def delete_project_simcore_s3(
            self,
            user_id: str,
            project_id: str,
            node_id: Optional[str] = None) -> Optional[web.Response]:
        """Deletes all files from a given node in a project in simcore.s3 and updated db accordingly.
        If node_id is not given, then all the project files db entries are deleted.
        """

        # FIXME: operation MUST be atomic. Mark for deletion and remove from db when deletion fully confirmed
        async with self.engine.acquire() as conn, conn.begin():
            # access layer
            can: Optional[AccessRights] = await get_project_access_rights(
                conn, int(user_id), project_id)
            if not can.delete:
                logger.debug(
                    "User %s was not allowed to delete project %s",
                    user_id,
                    project_id,
                )
                raise web.HTTPForbidden(
                    reason=f"User does not have delete access for {project_id}"
                )

            delete_me = file_meta_data.delete().where(
                file_meta_data.c.project_id == project_id, )
            if node_id:
                delete_me = delete_me.where(
                    file_meta_data.c.node_id == node_id)
            await conn.execute(delete_me)

        async with self._create_aiobotocore_client_context() as aioboto_client:
            # Note: the / at the end of the Prefix is VERY important, it makes the listing several orders of magnitude faster
            response = await aioboto_client.list_objects_v2(
                Bucket=self.simcore_bucket_name,
                Prefix=f"{project_id}/{node_id}/"
                if node_id else f"{project_id}/",
            )

            objects_to_delete = []
            for f in response.get("Contents", []):
                objects_to_delete.append({"Key": f["Key"]})

            if objects_to_delete:
                response = await aioboto_client.delete_objects(
                    Bucket=self.simcore_bucket_name,
                    Delete={"Objects": objects_to_delete},
                )
                return response

    # SEARCH -------------------------------------

    async def search_files_starting_with(self, user_id: int,
                                         prefix: str) -> List[FileMetaDataEx]:
        # Avoids using list_files since it accounts for projects/nodes
        # Storage should know NOTHING about those concepts
        files_meta = deque()

        async with self.engine.acquire() as conn, conn.begin():
            # access layer
            can_read_projects_ids = await get_readable_project_ids(
                conn, int(user_id))
            has_read_access = (
                file_meta_data.c.user_id == str(user_id)
            ) | file_meta_data.c.project_id.in_(can_read_projects_ids)

            stmt = sa.select([file_meta_data]).where(
                file_meta_data.c.file_uuid.startswith(prefix)
                & has_read_access)

            async for row in conn.execute(stmt):
                meta_extended = to_meta_data_extended(row)
                files_meta.append(meta_extended)

        return list(files_meta)

    async def create_soft_link(self, user_id: int, target_uuid: str,
                               link_uuid: str) -> FileMetaDataEx:

        # validate link_uuid
        async with self.engine.acquire() as conn:
            # TODO: select exists(select 1 from file_metadat where file_uuid=12)
            found = await conn.scalar(
                sa.select([file_meta_data.c.file_uuid
                           ]).where(file_meta_data.c.file_uuid == link_uuid))
            if found:
                raise ValueError(
                    f"Invalid link {link_uuid}. Link already exists")

        # validate target_uuid
        target = await self.list_file(str(user_id), SIMCORE_S3_STR,
                                      target_uuid)
        if not target:
            raise ValueError(
                f"Invalid target '{target_uuid}'. File does not exists for this user"
            )

        # duplicate target and change the following columns:
        target.fmd.file_uuid = link_uuid
        target.fmd.file_id = link_uuid  # NOTE: api-server relies on this id
        target.fmd.is_soft_link = True

        async with self.engine.acquire() as conn:
            stmt = (file_meta_data.insert().values(
                **attr.asdict(target.fmd)).returning(literal_column("*")))

            result = await conn.execute(stmt)
            link = to_meta_data_extended(await result.first())
            return link

    async def synchronise_meta_data_table(self, location: str,
                                          dry_run: bool) -> Dict[str, Any]:

        PRUNE_CHUNK_SIZE = 20

        removed: List[str] = []
        to_remove: List[str] = []

        async def _prune_db_table(conn):
            if not dry_run:
                await conn.execute(file_meta_data.delete().where(
                    file_meta_data.c.object_name.in_(to_remove)))
            logger.info(
                "%s %s orphan items",
                "Would have deleted" if dry_run else "Deleted",
                len(to_remove),
            )
            removed.extend(to_remove)
            to_remove.clear()

        # ----------

        assert (  # nosec
            location == SIMCORE_S3_STR
        ), "Only with s3, no other sync implemented"  # nosec

        if location == SIMCORE_S3_STR:

            # NOTE: only valid for simcore, since datcore data is not in the database table
            # let's get all the files in the table
            logger.warning(
                "synchronisation of database/s3 storage started, this will take some time..."
            )

            async with self.engine.acquire(
            ) as conn, self._create_aiobotocore_client_context(
            ) as aioboto_client:

                number_of_rows_in_db = (await conn.scalar(
                    sa.select([sa.func.count()]).select_from(file_meta_data))
                                        or 0)
                logger.warning(
                    "Total number of entries to check %d",
                    number_of_rows_in_db,
                )

                assert isinstance(aioboto_client, AioBaseClient)  # nosec

                async for row in conn.execute(
                        sa.select([file_meta_data.c.object_name])):
                    s3_key = row.object_name  # type: ignore

                    # now check if the file exists in S3
                    # SEE https://www.peterbe.com/plog/fastest-way-to-find-out-if-a-file-exists-in-s3
                    response = await aioboto_client.list_objects_v2(
                        Bucket=self.simcore_bucket_name, Prefix=s3_key)
                    if response.get("KeyCount", 0) == 0:
                        # this file does not exist in S3
                        to_remove.append(s3_key)

                    if len(to_remove) >= PRUNE_CHUNK_SIZE:
                        await _prune_db_table(conn)

                if to_remove:
                    await _prune_db_table(conn)

                assert len(to_remove) == 0  # nosec
                assert len(removed) <= number_of_rows_in_db  # nosec

                logger.info(
                    "%s %d entries ",
                    "Would delete" if dry_run else "Deleting",
                    len(removed),
                )

        return {"removed": removed}
Example #16
async def assert_service_is_running(
        service_id: str,
        docker,
        *,
        max_running_delay=1 * MINUTE
) -> Tuple[List[TaskDict], TenacityStatsDict]:
    MAX_WAIT = 5
    assert max_running_delay > 3 * MAX_WAIT

    #
    # The retry policy constrains, in this test,
    # the time a service takes from being deployed by the swarm
    # until it is running (i.e. started and healthy)
    #
    retry_policy = dict(
        # instead of wait_fixed, in order to help parallel execution in asyncio.gather
        wait=wait_random(1, MAX_WAIT),
        stop=stop_after_delay(max_running_delay),
        before_sleep=before_sleep_log(log, logging.INFO),
        reraise=True,
    )

    async for attempt in AsyncRetrying(**retry_policy):
        with attempt:

            # service
            service: ServiceDict = await docker.services.inspect(service_id)

            assert service_id == service["ID"]

            service_name = service["Spec"]["Name"]
            num_replicas = int(
                get_from_dict(service,
                              "Spec.Mode.Replicated.Replicas",
                              default=1))

            # tasks in a service
            tasks: List[TaskDict] = await docker.tasks.list(
                filters={"service": service_name})

            tasks_current_state = [task["Status"]["State"] for task in tasks]
            num_running = sum(current == "running"
                              for current in tasks_current_state)

            # assert condition
            is_running: bool = num_replicas == num_running

            error_msg = ""
            if not is_running:
                # lazy composes error msg
                logs_lines = await docker.services.logs(
                    service_id,
                    follow=False,
                    timestamps=True,
                    tail=50,  # SEE *_docker_logs artifacts for details
                )
                log_str = " ".join(logs_lines)
                tasks_json = json.dumps(
                    [
                        copy_from_dict(
                            task,
                            include={
                                "ID":...,
                                "CreatedAt":...,
                                "UpdatedAt":...,
                                "Spec": {
                                    "ContainerSpec": {"Image"}
                                },
                                "Status": {"Timestamp", "State"},
                                "DesiredState":...,
                            },
                        ) for task in tasks
                    ],
                    indent=1,
                )
                error_msg = (
                    f"{service_name=} has {tasks_current_state=}, but expected at least {num_replicas=} running. "
                    f"Details:\n"
                    f"tasks={tasks_json}\n"
                    f"logs={log_str}\n")

            assert is_running, error_msg

            log.info(
                "Connection to %s succeded [%s]",
                service_name,
                json.dumps(attempt.retry_state.retry_object.statistics),
            )

            return tasks, attempt.retry_state.retry_object.statistics
    assert False  # never reached
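
# A minimal sketch of how the helper above can be awaited in parallel (the wait_random
# comment hints at asyncio.gather usage); `docker` and the service ids are placeholders
# coming from the surrounding test fixtures.
import asyncio


async def assert_all_services_running(docker, service_ids: List[str]) -> None:
    results = await asyncio.gather(
        *(assert_service_is_running(service_id, docker) for service_id in service_ids)
    )
    for tasks, retry_stats in results:
        log.info("%d tasks running, retry stats=%s", len(tasks), json.dumps(retry_stats))
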
Example #17
class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
    """List origins from the "SourceForge" forge."""

    # Part of the lister API, that identifies this lister
    LISTER_NAME = "sourceforge"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        incremental: bool = False,
        credentials: Optional[CredentialsType] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            url="https://sourceforge.net",
            instance="main",
            credentials=credentials,
        )

        # Will hold the currently saved "last modified" dates to compare against our
        # requests.
        self._project_last_modified: Optional[ProjectsLastModifiedCache] = None
        self.session = requests.Session()
        # Declaring the USER_AGENT is more sysadmin-friendly for the forge we list
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": USER_AGENT
        })
        self.incremental = incremental

    def state_from_dict(
            self, d: Dict[str, Dict[str, Any]]) -> SourceForgeListerState:
        subsitemaps = {
            k: datetime.date.fromisoformat(v)
            for k, v in d.get("subsitemap_last_modified", {}).items()
        }
        empty_projects = {
            k: datetime.date.fromisoformat(v)
            for k, v in d.get("empty_projects", {}).items()
        }
        return SourceForgeListerState(subsitemap_last_modified=subsitemaps,
                                      empty_projects=empty_projects)

    def state_to_dict(self, state: SourceForgeListerState) -> Dict[str, Any]:
        return {
            "subsitemap_last_modified": {
                k: v.isoformat()
                for k, v in state.subsitemap_last_modified.items()
            },
            "empty_projects":
            {k: v.isoformat()
             for k, v in state.empty_projects.items()},
        }

    def projects_last_modified(self) -> ProjectsLastModifiedCache:
        if not self.incremental:
            # No point in loading the previous results if we're doing a full run
            return {}
        if self._project_last_modified is not None:
            return self._project_last_modified
        # We know there will be at least that many origins
        stream = stream_results(self.scheduler.get_listed_origins,
                                self.lister_obj.id,
                                limit=300_000)
        listed_origins = dict()
        # Projects can have slashes in them if they're subprojects, but the
        # mount point (last component) cannot.
        url_match = re.compile(
            r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*")
        bzr_url_match = re.compile(
            r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzr/([^/]+)")
        cvs_url_match = re.compile(
            r"rsync://a.cvs.sourceforge.net/cvsroot/(?P<project>.+)/([^/]+)")

        for origin in stream:
            url = origin.url
            match = url_match.match(url)
            if match is None:
                # Could be a bzr or cvs special endpoint
                bzr_match = bzr_url_match.match(url)
                cvs_match = cvs_url_match.match(url)
                matches = None
                if bzr_match is not None:
                    matches = bzr_match.groupdict()
                elif cvs_match is not None:
                    matches = cvs_match.groupdict()
                assert matches
                project = matches["project"]
                namespace = "p"  # no special namespacing for bzr and cvs projects
            else:
                matches = match.groupdict()
                namespace = matches["namespace"]
                project = matches["project"]
            # "Last modified" dates are the same across all VCS (tools, even)
            # within a project or subproject. An assertion here would be overkill.
            last_modified = origin.last_update
            assert last_modified is not None
            listed_origins[(namespace, project)] = last_modified.date()

        self._project_last_modified = listed_origins
        return listed_origins

    @throttling_retry(
        retry=retry_policy_generic,
        before_sleep=before_sleep_log(logger, logging.WARNING),
    )
    def page_request(self, url, params) -> requests.Response:
        # Log listed URL to ease debugging
        logger.debug("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)

        if response.status_code != 200:
            # Log response content to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s for URL %s",
                response.status_code,
                response.url,
            )
        # The lister must fail on blocking errors
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[SourceForgeListerPage]:
        """
        SourceForge has a main XML sitemap that lists its sharded sitemaps for all
        projects.
        Each XML sub-sitemap lists project pages, which are not unique per project: a
        project can have a wiki, a home, a git, an svn, etc.
        For each unique project, we query an API endpoint that lists (among
        other things) the tools associated with said project, some of which are
        the VCS used. Subprojects are considered separate projects.
        Lastly, we use the information about which VCSes are used to build the
        predictable clone URL for any given VCS.
        """
        sitemap_contents = self.page_request(MAIN_SITEMAP_URL, {}).text
        tree = ElementTree.fromstring(sitemap_contents)

        for subsitemap in tree.iterfind(f"{SITEMAP_XML_NAMESPACE}sitemap"):
            last_modified_el = subsitemap.find(
                f"{SITEMAP_XML_NAMESPACE}lastmod")
            assert last_modified_el is not None and last_modified_el.text is not None
            last_modified = datetime.date.fromisoformat(last_modified_el.text)
            location = subsitemap.find(f"{SITEMAP_XML_NAMESPACE}loc")
            assert location is not None and location.text is not None
            sub_url = location.text

            if self.incremental:
                recorded_last_mod = self.state.subsitemap_last_modified.get(
                    sub_url)
                if recorded_last_mod == last_modified:
                    # The entire subsitemap hasn't changed, so none of its projects
                    # have either, skip it.
                    continue

            self.state.subsitemap_last_modified[sub_url] = last_modified
            subsitemap_contents = self.page_request(sub_url, {}).text
            subtree = ElementTree.fromstring(subsitemap_contents)

            yield from self._get_pages_from_subsitemap(subtree)

    def get_origins_from_page(
            self, page: SourceForgeListerPage) -> Iterator[ListedOrigin]:
        assert self.lister_obj.id is not None
        for hit in page:
            last_modified: str = str(hit.last_modified)
            last_update: datetime.datetime = iso8601.parse_date(last_modified)
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=hit.vcs.value,
                url=hit.url,
                last_update=last_update,
            )

    def _get_pages_from_subsitemap(
            self,
            subtree: ElementTree.Element) -> Iterator[SourceForgeListerPage]:
        projects: Set[ProjectNameT] = set()
        for project_block in subtree.iterfind(f"{SITEMAP_XML_NAMESPACE}url"):
            last_modified_block = project_block.find(
                f"{SITEMAP_XML_NAMESPACE}lastmod")
            assert last_modified_block is not None
            last_modified = last_modified_block.text
            location = project_block.find(f"{SITEMAP_XML_NAMESPACE}loc")
            assert location is not None
            project_url = location.text
            assert project_url is not None

            match = PROJ_URL_RE.match(project_url)
            if match:
                matches = match.groupdict()
                namespace = matches["namespace"]
                if namespace == "projects":
                    # These have a `p`-namespaced counterpart, use that instead
                    continue

                project = matches["project"]
                rest = matches["rest"]
                if rest.count("/") > 1:
                    # This is a subproject. There are no sub-subprojects.
                    subproject_name = rest.rsplit("/", 2)[0]
                    project = f"{project}/{subproject_name}"

                prev_len = len(projects)
                projects.add(project)

                if prev_len == len(projects):
                    # Already seen
                    continue

                pages = self._get_pages_for_project(namespace, project,
                                                    last_modified)
                if pages:
                    yield pages
                else:
                    logger.debug("Project '%s' does not have any VCS", project)
            else:
                # Should almost always match, let's log it
                # The only ones that don't match are mostly specialized one-off URLs.
                msg = "Project URL '%s' does not match expected pattern"
                logger.warning(msg, project_url)

    def _get_pages_for_project(self, namespace, project,
                               last_modified) -> SourceForgeListerPage:
        endpoint = PROJECT_API_URL_FORMAT.format(namespace=namespace,
                                                 project=project)
        empty_project_last_modified = self.state.empty_projects.get(endpoint)
        if empty_project_last_modified is not None:
            if last_modified == empty_project_last_modified.isoformat():
                # Project has not changed, so is still empty, meaning it has
                # no VCS attached that we can archive.
                logger.debug(f"Project {namespace}/{project} is still empty")
                return []

        if self.incremental:
            expected = self.projects_last_modified().get((namespace, project))

            if expected is not None:
                if expected.isoformat() == last_modified:
                    # Project has not changed
                    logger.debug(
                        f"Project {namespace}/{project} has not changed")
                    return []
                else:
                    logger.debug(f"Project {namespace}/{project} was updated")
            else:
                msg = "New project during an incremental run: %s/%s"
                logger.debug(msg, namespace, project)

        try:
            res = self.page_request(endpoint, {}).json()
        except requests.HTTPError:
            # We've already logged in `page_request`
            return []

        tools = res.get("tools")
        if tools is None:
            # This rarely happens, on very old URLs
            logger.warning("Project '%s' does not have any tools", endpoint)
            return []

        hits = []
        for tool in tools:
            tool_name = tool["name"]
            if tool_name not in VCS_NAMES:
                continue
            if tool_name == VcsNames.CVS.value:
                # CVS projects are different from other VCS ones: they use the rsync
                # protocol, a list of modules needs to be fetched from an info page,
                # and multiple origin URLs can be produced for the same project.
                cvs_info_url = f"http://{project}.cvs.sourceforge.net"
                try:
                    response = self.page_request(cvs_info_url, params={})
                except requests.HTTPError:
                    logger.warning(
                        "CVS info page could not be fetched, skipping project '%s'",
                        project,
                    )
                    continue
                else:
                    bs = BeautifulSoup(response.text, features="html.parser")
                    cvs_base_url = "rsync://a.cvs.sourceforge.net/cvsroot"
                    for text in [b.text for b in bs.find_all("b")]:
                        match = re.search(rf".*/cvsroot/{project} co -P (.+)",
                                          text)
                        if match is not None:
                            module = match.group(1)
                            if module != "Attic":
                                url = f"{cvs_base_url}/{project}/{module}"
                                hits.append(
                                    SourceForgeListerEntry(
                                        vcs=VcsNames(tool_name),
                                        url=url,
                                        last_modified=last_modified,
                                    ))
                    continue
            url = CLONE_URL_FORMAT.format(
                vcs=tool_name,
                namespace=namespace,
                project=project,
                mount_point=tool["mount_point"],
            )
            if tool_name == VcsNames.MERCURIAL.value:
                # SourceForge does not yet support anonymous HTTPS cloning for Mercurial
                # See https://sourceforge.net/p/forge/feature-requests/727/
                url = url.replace("https://", "http://")
            if tool_name == VcsNames.BAZAAR.value:
                # SourceForge has removed support for bzr and only keeps legacy projects
                # around at a separate (also not https) URL. Bzr projects are very rare
                # and a lot of them are 404 now.
                url = f"http://{project}.bzr.sourceforge.net/bzr/{project}"
                try:
                    response = self.page_request(url, params={})
                    if "To get this branch, use:" not in response.text:
                        # If a bzr project has multiple branches, we need to extract their
                        # names from the repository landing page and create one listed origin
                        # per branch
                        parser = lxml.etree.HTMLParser()
                        tree = lxml.etree.fromstring(response.text, parser)

                        # Get all tds with class 'autcell'
                        tds = tree.xpath(".//td[contains(@class, 'autcell')]")
                        for td in tds:
                            branch = td.findtext("a")
                            # If the td's parent contains <img alt="Branch"/> and
                            # it has non-empty text:
                            if td.xpath("..//img[@alt='Branch']") and branch:
                                hits.append(
                                    SourceForgeListerEntry(
                                        vcs=VcsNames(tool_name),
                                        url=f"{url}/{branch}",
                                        last_modified=last_modified,
                                    ))
                        continue
                except requests.HTTPError:
                    logger.warning(
                        "Bazaar repository page could not be fetched, skipping project '%s'",
                        project,
                    )
                    continue
            entry = SourceForgeListerEntry(vcs=VcsNames(tool_name),
                                           url=url,
                                           last_modified=last_modified)
            hits.append(entry)

        if not hits:
            date = datetime.date.fromisoformat(last_modified)
            self.state.empty_projects[endpoint] = date
        else:
            self.state.empty_projects.pop(endpoint, None)

        return hits
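
# A minimal usage sketch, assuming the standard swh.lister pattern where the base
# Lister.run() method drives get_pages()/get_origins_from_page() and records the
# origins through the scheduler; `scheduler` is a placeholder SchedulerInterface.
def run_sourceforge_listing(scheduler: SchedulerInterface) -> None:
    lister = SourceForgeLister(scheduler=scheduler, incremental=True)
    stats = lister.run()  # walks sitemaps and sends ListedOrigin batches to the scheduler
    logger.info("SourceForge listing done: %s", stats)
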
Example #18
class GitLabLister(Lister[GitLabListerState, PageResult]):
    """List origins for a gitlab instance.

    By default, the lister runs in incremental mode: it lists all repositories,
    starting with the `last_seen_next_link` stored in the scheduler backend.

    Args:
        scheduler: a scheduler instance
        url: the api v4 url of the gitlab instance to visit (e.g.
          https://gitlab.com/api/v4/)
        instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...)
        incremental: defines if incremental listing is activated or not

    """

    LISTER_NAME = "gitlab"

    def __init__(
        self,
        scheduler,
        url: str,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        incremental: bool = False,
    ):
        if instance is None:
            instance = parse_url(url).host
        super().__init__(
            scheduler=scheduler,
            url=url.rstrip("/"),
            instance=instance,
            credentials=credentials,
        )
        self.incremental = incremental
        self.last_page: Optional[str] = None

        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": USER_AGENT
        })

        if len(self.credentials) > 0:
            cred = random.choice(self.credentials)
            logger.info("Using %s credentials from user %s", self.instance,
                        cred["username"])
            api_token = cred["password"]
            if api_token:
                self.session.headers["Authorization"] = f"Bearer {api_token}"

    def state_from_dict(self, d: Dict[str, Any]) -> GitLabListerState:
        return GitLabListerState(**d)

    def state_to_dict(self, state: GitLabListerState) -> Dict[str, Any]:
        return asdict(state)

    @throttling_retry(retry=_if_rate_limited,
                      before_sleep=before_sleep_log(logger, logging.WARNING))
    def get_page_result(self, url: str) -> PageResult:
        logger.debug("Fetching URL %s", url)
        response = self.session.get(url)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()
        repositories: Tuple[Repository, ...] = tuple(response.json())
        if hasattr(response, "links") and response.links.get("next"):
            next_page = response.links["next"]["url"]
        else:
            next_page = None

        return PageResult(repositories, next_page)

    def page_url(self, id_after: Optional[int] = None) -> str:
        parameters = {
            "pagination": "keyset",
            "order_by": "id",
            "sort": "asc",
        }
        if id_after is not None:
            parameters["id_after"] = str(id_after)
        return f"{self.url}/projects?{urlencode(parameters)}"

    def get_pages(self) -> Iterator[PageResult]:
        next_page: Optional[str]
        if self.incremental and self.state and self.state.last_seen_next_link:
            next_page = self.state.last_seen_next_link
        else:
            next_page = self.page_url()

        while next_page:
            self.last_page = next_page
            page_result = self.get_page_result(next_page)
            yield page_result
            next_page = page_result.next_page

    def get_origins_from_page(
            self, page_result: PageResult) -> Iterator[ListedOrigin]:
        assert self.lister_obj.id is not None

        repositories = page_result.repositories if page_result.repositories else []
        for repo in repositories:
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=repo["http_url_to_repo"],
                visit_type="git",
                last_update=iso8601.parse_date(repo["last_activity_at"]),
            )

    def commit_page(self, page_result: PageResult) -> None:
        """Update currently stored state using the latest listed "next" page if relevant.

        Relevance is determined by the next_page link, whose 'page' id must be strictly
        greater than the currently stored one.

        Note: this is a noop for full listing mode

        """
        if self.incremental:
            # link: https://${project-api}/?...&page=2x...
            next_page = page_result.next_page
            if not next_page and self.last_page:
                next_page = self.last_page

            if next_page:
                id_after = _parse_id_after(next_page)
                previous_next_page = self.state.last_seen_next_link
                previous_id_after = _parse_id_after(previous_next_page)

                if previous_next_page is None or (
                        previous_id_after and id_after
                        and previous_id_after < id_after):
                    self.state.last_seen_next_link = next_page

    def finalize(self) -> None:
        """finalize the lister state when relevant (see `fn:commit_page` for details)

        Note: this is a noop for full listing mode

        """
        next_page = self.state.last_seen_next_link
        if self.incremental and next_page:
            # link: https://${project-api}/?...&page=2x...
            next_id_after = _parse_id_after(next_page)
            scheduler_state = self.get_state_from_scheduler()
            previous_next_id_after = _parse_id_after(
                scheduler_state.last_seen_next_link)

            if (not previous_next_id_after and next_id_after) or (
                    previous_next_id_after and next_id_after
                    and previous_next_id_after < next_id_after):
                self.updated = True
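
# A plausible sketch of the _parse_id_after helper referenced above (it is defined
# elsewhere in the module and not shown here): it extracts the numeric "id_after"
# keyset-pagination parameter from a "next" link, if present.
from typing import Optional
from urllib.parse import parse_qs, urlparse


def _parse_id_after(url: Optional[str]) -> Optional[int]:
    if not url:
        return None
    values = parse_qs(urlparse(url).query).get("id_after")
    if not values:
        return None
    try:
        return int(values[0])
    except ValueError:
        return None
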
Example #19
    async def action(cls, app: FastAPI, scheduler_data: SchedulerData) -> None:
        logger.debug(
            "Getting docker compose spec for service %s", scheduler_data.service_name
        )

        dynamic_sidecar_client = get_dynamic_sidecar_client(app)
        dynamic_sidecar_endpoint = scheduler_data.dynamic_sidecar.endpoint

        # Starts dynamic SIDECAR -------------------------------------
        # creates a docker compose spec given the service key and tag
        compose_spec = assemble_spec(
            app=app,
            service_key=scheduler_data.key,
            service_tag=scheduler_data.version,
            paths_mapping=scheduler_data.paths_mapping,
            compose_spec=scheduler_data.compose_spec,
            container_http_entry=scheduler_data.container_http_entry,
            dynamic_sidecar_network_name=scheduler_data.dynamic_sidecar_network_name,
        )

        await dynamic_sidecar_client.start_service_creation(
            dynamic_sidecar_endpoint, compose_spec
        )

        # Starts PROXY -----------------------------------------------
        # The entrypoint container name was now computed
        # continue starting the proxy

        # check values have been set by previous step
        if (
            scheduler_data.dynamic_sidecar.dynamic_sidecar_id is None
            or scheduler_data.dynamic_sidecar.dynamic_sidecar_network_id is None
            or scheduler_data.dynamic_sidecar.swarm_network_id is None
            or scheduler_data.dynamic_sidecar.swarm_network_name is None
        ):
            raise ValueError(
                (
                    "Expected a value for all the following values: "
                    f"{scheduler_data.dynamic_sidecar.dynamic_sidecar_id=} "
                    f"{scheduler_data.dynamic_sidecar.dynamic_sidecar_network_id=} "
                    f"{scheduler_data.dynamic_sidecar.swarm_network_id=} "
                    f"{scheduler_data.dynamic_sidecar.swarm_network_name=}"
                )
            )

        dynamic_sidecar_settings: DynamicSidecarSettings = (
            app.state.settings.DYNAMIC_SERVICES.DYNAMIC_SIDECAR
        )

        async for attempt in AsyncRetrying(
            stop=stop_after_delay(
                dynamic_sidecar_settings.DYNAMIC_SIDECAR_WAIT_FOR_CONTAINERS_TO_START
            ),
            wait=wait_fixed(1),
            retry_error_cls=EntrypointContainerNotFoundError,
            before_sleep=before_sleep_log(logger, logging.WARNING),
        ):
            with attempt:
                if scheduler_data.dynamic_sidecar.service_removal_state.was_removed:
                    # the service was removed while waiting for the operation to finish
                    logger.warning(
                        "Stopping `get_entrypoint_container_name` operation. "
                        "Will no try to start the service."
                    )
                    return

                entrypoint_container = await dynamic_sidecar_client.get_entrypoint_container_name(
                    dynamic_sidecar_endpoint=dynamic_sidecar_endpoint,
                    dynamic_sidecar_network_name=scheduler_data.dynamic_sidecar_network_name,
                )
                logger.info(
                    "Fetched container entrypoint name %s", entrypoint_container
                )

        dynamic_sidecar_node_id = await get_node_id_from_task_for_service(
            scheduler_data.dynamic_sidecar.dynamic_sidecar_id, dynamic_sidecar_settings
        )

        dynamic_sidecar_proxy_create_service_params = get_dynamic_proxy_spec(
            scheduler_data=scheduler_data,
            dynamic_sidecar_settings=dynamic_sidecar_settings,
            dynamic_sidecar_network_id=scheduler_data.dynamic_sidecar.dynamic_sidecar_network_id,
            swarm_network_id=scheduler_data.dynamic_sidecar.swarm_network_id,
            swarm_network_name=scheduler_data.dynamic_sidecar.swarm_network_name,
            dynamic_sidecar_node_id=dynamic_sidecar_node_id,
            entrypoint_container_name=entrypoint_container,
            service_port=scheduler_data.service_port,
        )

        logger.debug(
            "dynamic-sidecar-proxy create_service_params %s",
            json_dumps(dynamic_sidecar_proxy_create_service_params),
        )

        # no need for the id any longer
        await create_service_and_get_id(dynamic_sidecar_proxy_create_service_params)
        scheduler_data.dynamic_sidecar.were_services_created = True

        scheduler_data.dynamic_sidecar.was_compose_spec_submitted = True
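
# A hedged side note on the tenacity parameters used above: `retry=` takes a predicate
# deciding whether a failed attempt should be retried, while `retry_error_cls=` only
# customizes the exception raised once retrying gives up. The sentinel exception,
# callable and timings below are placeholders.
from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_delay, wait_fixed


class _NotReadyYet(Exception):
    ...


async def wait_for_entrypoint(fetch_container_name) -> str:
    async for attempt in AsyncRetrying(
        retry=retry_if_exception_type(_NotReadyYet),
        stop=stop_after_delay(60),
        wait=wait_fixed(1),
        reraise=True,
    ):
        with attempt:
            # fetch_container_name() is assumed to raise _NotReadyYet until the
            # entrypoint container shows up
            return await fetch_container_name()
    assert False  # never reached
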
Example #20
class MavenLister(Lister[MavenListerState, RepoPage]):
    """List origins from a Maven repository.

    Maven Central provides artifacts for Java builds.
    It includes POM files and source archives, which we download to get
    the source code of artifacts and links to their scm repository.

    This lister yields origins of types: git/svn/hg or whatever the Artifacts
    use as repository type, plus maven types for the maven loader (tgz, jar)."""

    LISTER_NAME = "maven"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        index_url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
        incremental: bool = True,
    ):
        """Lister class for Maven repositories.

        Args:
            url: main URL of the Maven repository, i.e. url of the base index
                used to fetch maven artifacts. For Maven central use
                https://repo1.maven.org/maven2/
            index_url: the URL to download the exported text indexes from.
                Would typically be a local host running the export docker image.
                See README.md in this directory for more information.
            instance: Name of maven instance. Defaults to url's network location
                if unset.
            incremental: bool, defaults to True. Defines if incremental listing
                is activated or not.

        """
        self.BASE_URL = url
        self.INDEX_URL = index_url
        self.incremental = incremental

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

        self.jar_origins: Dict[str, ListedOrigin] = {}
        self.github_session = GitHubSession(
            credentials=self.credentials, user_agent=USER_AGENT
        )

    def state_from_dict(self, d: Dict[str, Any]) -> MavenListerState:
        return MavenListerState(**d)

    def state_to_dict(self, state: MavenListerState) -> Dict[str, Any]:
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:

        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[RepoPage]:
        """Retrieve and parse exported maven indexes to
        identify all pom files and src archives.
        """

        # Example of returned RepoPage's:
        # [
        #   {
        #     "type": "maven",
        #     "url": "https://maven.xwiki.org/..-5.4.2-sources.jar",
        #     "time": 1626109619335,
        #     "gid": "org.xwiki.platform",
        #     "aid": "xwiki-platform-wikistream-events-xwiki",
        #     "version": "5.4.2"
        #   },
        #   {
        #     "type": "scm",
        #     "url": "scm:git:git://github.com/openengsb/openengsb-framework.git",
        #     "project": "openengsb-framework",
        #   },
        #   ...
        # ]

        # Download the main text index file.
        logger.info("Downloading computed index from %s.", self.INDEX_URL)
        assert self.INDEX_URL is not None
        response = requests.get(self.INDEX_URL, stream=True)
        if response.status_code != 200:
            logger.error("Index %s not found, stopping", self.INDEX_URL)
            response.raise_for_status()

        # Prepare regexes to parse index exports.

        # Parse doc id.
        # Example line: "doc 13"
        re_doc = re.compile(r"^doc (?P<doc>\d+)$")

        # Parse gid, aid, version, classifier, extension.
        # Example line: "    value al.aldi|sprova4j|0.1.0|sources|jar"
        re_val = re.compile(
            r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
            + r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
        )

        # Parse last modification time.
        # Example line: "    value jar|1626109619335|14316|2|2|0|jar"
        re_time = re.compile(
            r"^\s{4}value ([^|]+)\|(?P<mtime>[^|]+)\|([^|]+)\|([^|]+)\|([^|]+)"
            + r"\|([^|]+)\|([^|]+)$"
        )

        # Read file line by line and process it
        out_pom: Dict = {}
        jar_src: Dict = {}
        doc_id: int = 0
        jar_src["doc"] = None
        url_src = None

        iterator = response.iter_lines(chunk_size=1024)
        for line_bytes in iterator:
            # Read the index text export and get URLs and SCMs.
            line = line_bytes.decode(errors="ignore")
            m_doc = re_doc.match(line)
            if m_doc is not None:
                doc_id = int(m_doc.group("doc"))
                # jar_src["doc"] contains the id of the current document, whatever
                # its type (scm or jar).
                jar_src["doc"] = doc_id
            else:
                m_val = re_val.match(line)
                if m_val is not None:
                    (gid, aid, version, classifier, ext) = m_val.groups()
                    ext = ext.strip()
                    path = "/".join(gid.split("."))
                    if classifier == "NA" and ext.lower() == "pom":
                        # If incremental mode, we don't record any line that is
                        # before our last recorded doc id.
                        if (
                            self.incremental
                            and self.state
                            and self.state.last_seen_pom
                            and self.state.last_seen_pom >= doc_id
                        ):
                            continue
                        url_path = f"{path}/{aid}/{version}/{aid}-{version}.{ext}"
                        url_pom = urljoin(
                            self.BASE_URL,
                            url_path,
                        )
                        out_pom[url_pom] = doc_id
                    elif (
                        classifier.lower() == "sources" or ("src" in classifier)
                    ) and ext.lower() in ("zip", "jar"):
                        url_path = (
                            f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}"
                        )
                        url_src = urljoin(self.BASE_URL, url_path)
                        jar_src["gid"] = gid
                        jar_src["aid"] = aid
                        jar_src["version"] = version
                else:
                    m_time = re_time.match(line)
                    if m_time is not None and url_src is not None:
                        time = m_time.group("mtime")
                        jar_src["time"] = int(time)
                        artifact_metadata_d = {
                            "type": "maven",
                            "url": url_src,
                            **jar_src,
                        }
                        logger.debug(
                            "* Yielding jar %s: %s", url_src, artifact_metadata_d
                        )
                        yield artifact_metadata_d
                        url_src = None

        logger.info("Found %s poms.", len(out_pom))

        # Now fetch pom files and scan them for scm info.

        logger.info("Fetching poms..")
        for pom in out_pom:
            try:
                response = self.page_request(pom, {})
                project = xmltodict.parse(response.content)
                project_d = project.get("project", {})
                scm_d = project_d.get("scm")
                if scm_d is not None:
                    connection = scm_d.get("connection")
                    if connection is not None:
                        artifact_metadata_d = {
                            "type": "scm",
                            "doc": out_pom[pom],
                            "url": connection,
                        }
                        logger.debug("* Yielding pom %s: %s", pom, artifact_metadata_d)
                        yield artifact_metadata_d
                    else:
                        logger.debug("No scm.connection in pom %s", pom)
                else:
                    logger.debug("No scm in pom %s", pom)
            except requests.HTTPError:
                logger.warning(
                    "POM info page could not be fetched, skipping project '%s'",
                    pom,
                )
            except xmltodict.expat.ExpatError as error:
                logger.info("Could not parse POM %s XML: %s. Next.", pom, error)

    def get_scm(self, page: RepoPage) -> Optional[ListedOrigin]:
        """Retrieve scm origin out of the page information. Only called when type of the
        page is scm.

        Try to detect an scm/vcs repository. Note that the official format is
        scm:{type}:git://example.org/{user}/{repo}.git, but some projects directly put
        the repo url (without the "scm:type"), so we have to check against the content
        to extract the type and url properly.

        Raises:
            AssertionError when the type of the page is not 'scm'

        Returns:
            ListedOrigin with proper canonical scm url (for github) if any is found,
            None otherwise.

        """

        assert page["type"] == "scm"
        visit_type: Optional[str] = None
        url: Optional[str] = None
        m_scm = re.match(r"^scm:(?P<type>[^:]+):(?P<url>.*)$", page["url"])
        if m_scm is None:
            return None

        scm_type = m_scm.group("type")
        if scm_type and scm_type in SUPPORTED_SCM_TYPES:
            url = m_scm.group("url")
            visit_type = scm_type
        elif page["url"].endswith(".git"):
            url = page["url"].lstrip("scm:")
            visit_type = "git"
        else:
            return None

        if url and visit_type == "git":
            # Non-github urls will be returned as is, github ones will be canonical ones
            url = self.github_session.get_canonical_url(url)

        if not url:
            return None

        assert visit_type is not None
        assert self.lister_obj.id is not None
        return ListedOrigin(
            lister_id=self.lister_obj.id,
            url=url,
            visit_type=visit_type,
        )

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:

        """Convert a page of Maven repositories into a list of ListedOrigins."""
        if page["type"] == "scm":
            listed_origin = self.get_scm(page)
            if listed_origin:
                yield listed_origin
        else:
            # Origin is gathering source archives:
            last_update_dt = None
            last_update_iso = ""
            try:
                last_update_seconds = str(page["time"])[:-3]
                last_update_dt = datetime.fromtimestamp(int(last_update_seconds))
                last_update_dt = last_update_dt.astimezone(timezone.utc)
            except (OverflowError, ValueError):
                logger.warning("- Failed to convert datetime %s.", last_update_seconds)
            if last_update_dt:
                last_update_iso = last_update_dt.isoformat()

            # Origin URL will target page holding sources for all versions of
            # an artifactId (package name) inside a groupId (namespace)
            path = "/".join(page["gid"].split("."))
            origin_url = urljoin(self.BASE_URL, f"{path}/{page['aid']}")

            artifact = {
                **{k: v for k, v in page.items() if k != "doc"},
                "time": last_update_iso,
                "base_url": self.BASE_URL,
            }

            if origin_url not in self.jar_origins:
                # Create ListedOrigin instance if we did not see that origin yet
                assert self.lister_obj.id is not None
                jar_origin = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type=page["type"],
                    last_update=last_update_dt,
                    extra_loader_arguments={"artifacts": [artifact]},
                )
                self.jar_origins[origin_url] = jar_origin
            else:
                # Update list of source artifacts for that origin otherwise
                jar_origin = self.jar_origins[origin_url]
                artifacts = jar_origin.extra_loader_arguments["artifacts"]
                if artifact not in artifacts:
                    artifacts.append(artifact)

            if (
                jar_origin.last_update
                and last_update_dt
                and last_update_dt > jar_origin.last_update
            ):
                jar_origin.last_update = last_update_dt

            if not self.incremental or (
                self.state and page["doc"] > self.state.last_seen_doc
            ):
                # Yield origin with updated source artifacts, multiple instances of
                # ListedOrigin for the same origin URL but with different artifacts
                # list will be sent to the scheduler but it will deduplicate them and
                # take the latest one to upsert in database
                yield jar_origin

    def commit_page(self, page: RepoPage) -> None:
        """Update currently stored state using the latest listed doc.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            # We need to differentiate the two state counters according
            # to the type of origin.
            if page["type"] == "maven" and page["doc"] > self.state.last_seen_doc:
                self.state.last_seen_doc = page["doc"]
            elif page["type"] == "scm" and page["doc"] > self.state.last_seen_pom:
                self.state.last_seen_doc = page["doc"]
                self.state.last_seen_pom = page["doc"]

    def finalize(self) -> None:
        """Finalize the lister state, set update if any progress has been made.

        Note: this is a noop for full listing mode

        """
        if self.incremental and self.state:
            last_seen_doc = self.state.last_seen_doc
            last_seen_pom = self.state.last_seen_pom

            scheduler_state = self.get_state_from_scheduler()
            if last_seen_doc and last_seen_pom:
                if (scheduler_state.last_seen_doc < last_seen_doc) or (
                    scheduler_state.last_seen_pom < last_seen_pom
                ):
                    self.updated = True
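
# A small sketch exercising the index-export regex documented above on the example
# line quoted in the comments; BASE_URL is a placeholder Maven repository root.
import re
from urllib.parse import urljoin

BASE_URL = "https://repo1.maven.org/maven2/"
re_val = re.compile(
    r"^\s{4}value (?P<gid>[^|]+)\|(?P<aid>[^|]+)\|(?P<version>[^|]+)\|"
    r"(?P<classifier>[^|]+)\|(?P<ext>[^|]+)$"
)

m = re_val.match("    value al.aldi|sprova4j|0.1.0|sources|jar")
assert m is not None
gid, aid, version, classifier, ext = m.groups()
path = "/".join(gid.split("."))
# -> https://repo1.maven.org/maven2/al/aldi/sprova4j/0.1.0/sprova4j-0.1.0-sources.jar
url_src = urljoin(BASE_URL, f"{path}/{aid}/{version}/{aid}-{version}-{classifier}.{ext}")
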
Example #21
class Ticker:
    def __init__(self, symbol: str, index: Optional[str] = None):
        self.symbol = symbol
        self.csv_path = f"{config.DATA_BASE_PATH}/{self.symbol}.csv"
        self._index = index or symbols_data.get_ticker_index(self.symbol)
        self._url = tse_settings.TSE_TICKER_ADDRESS.format(self._index)
        self._info_url = tse_settings.TSE_ISNT_INFO_URL.format(self._index)
        self._client_types_url = TSE_CLIENT_TYPE_DATA_URL.format(self._index)
        self._history: pd.DataFrame = pd.DataFrame()

        if os.path.exists(self.csv_path):
            self.from_file()
        else:
            self.from_web()

    def from_web(self):
        self._history = download(self._index)[self._index]

    def from_file(self):
        self._history = pd.read_csv(self.csv_path)
        self._history["date"] = pd.to_datetime(self._history["date"])

    @property
    def history(self):
        return self._history

    @property
    def url(self):
        return self._url

    @property
    def index(self):
        return self._index

    @property
    def instrument_id(self):
        """
        instrument id of a ticker is unique and used for calling
        some apis from tsetmc
        """
        return re.findall(r"InstrumentID='([\w\d]*)|',$",
                          self._ticker_page_response.text)[0]

    @property
    def ci_sin(self):
        """
        CIsin of a ticker is similar to instrument_id and is used for calling
        some apis from tsetmc
        """
        return re.findall(r"CIsin='([\w\d]*)|',$",
                          self._ticker_page_response.text)[0]

    @property
    def title(self) -> str:
        return re.findall(
            r"Title='(.*?)',",
            self._ticker_page_response.text)[0].split("-")[0].strip()

    @property
    def group_name(self) -> str:
        return re.findall(r"LSecVal='([\D]*)',",
                          self._ticker_page_response.text)[0]

    @property
    def p_e_ratio(self) -> Optional[float]:
        """
        Notes on usage: tickers like آسام do not have a P/E ratio
        """
        adj_close = self.get_ticker_real_time_info_response().adj_close
        eps = self.eps
        if adj_close is None or eps is None or eps == 0:
            return None
        return self.get_ticker_real_time_info_response().adj_close / self.eps

    @property
    def group_p_e_ratio(self) -> Optional[float]:
        """
        Notes on usage: tickers like وملت do not have a group P/E (gpe)
        """
        gpe = re.findall(r"SectorPE='([\d.]*)',",
                         self._ticker_page_response.text)
        if not gpe or not gpe[0]:
            return None
        return float(gpe[0])

    @property
    def eps(self) -> Optional[float]:
        """
        Notes on usage: tickers like آسام do not have an EPS
        """
        eps = re.findall(r"EstimatedEPS='([-,\d]*)',",
                         self._ticker_page_response.text)[0]
        if eps == "":
            return None
        return float(eps)

    @property
    def total_shares(self) -> float:
        return float(
            re.findall(r"ZTitad=([-,\d]*),",
                       self._ticker_page_response.text)[0])

    @property
    def base_volume(self) -> float:
        return float(
            re.findall(r"BaseVol=([-,\d]*),",
                       self._ticker_page_response.text)[0])

    @property
    def client_types(self):
        return download_ticker_client_types_record(self._index)

    @property
    def trade_dates(self):
        return self._history["date"].to_list()

    @property
    def shareholders(self) -> pd.DataFrame:
        session = utils.requests_retry_session()
        page = session.get(self._shareholders_url, timeout=5)
        session.close()
        soup = bs4.BeautifulSoup(page.content, 'html.parser')
        table: bs4.PageElement = soup.find_all("table")[0]
        shareholders_df = utils.get_shareholders_html_table_as_csv(table)
        shareholders_df = shareholders_df.rename(
            columns=translations.SHAREHOLDERS_FIELD_MAPPINGS)
        return shareholders_df

    def get_shareholders_history(self,
                                 from_when=datetime.timedelta(days=90),
                                 to_when=datetime.datetime.now(),
                                 only_trade_days=True,
                                 session=None) -> pd.DataFrame:
        """
        A synchronous helper that wraps get_shareholders_history_async.
        """
        return asyncio.run(
            self.get_shareholders_history_async(
                from_when,
                to_when,
                only_trade_days,
                session,
            ), )

    async def get_shareholders_history_async(
        self,
        from_when=datetime.timedelta(days=90),
        to_when=datetime.datetime.now(),
        only_trade_days=True,
        session=None,
    ) -> pd.DataFrame:
        requested_dates = utils.datetime_range(to_when - from_when, to_when)
        session_created = False
        if not session:
            session_created = True
            conn = aiohttp.TCPConnector(limit=3)
            session = aiohttp.ClientSession(connector=conn)
        tasks = []
        for date in requested_dates:
            if only_trade_days and date.date() not in self.trade_dates:
                continue
            tasks.append(
                self._get_ticker_daily_info_page_response(
                    session, date.strftime(tse_settings.DATE_FORMAT)))
        pages = await async_utils.run_tasks_with_wait(tasks, 30, 10)
        if session_created is True:
            await session.close()
        rows = []
        for page in pages:
            page_date = tsetmc_scraper.scrape_daily_info_page_for_date(page)
            shareholders_data = (
                tsetmc_scraper.scrape_daily_info_page_for_shareholder_data(
                    page))
            for shareholder_data in shareholders_data:
                rows.append([
                    datetime.datetime.strptime(
                        page_date,
                        tse_settings.DATE_FORMAT,
                    ),
                    shareholder_data.shares,
                    shareholder_data.percentage,
                    shareholder_data.instrument_id,
                    shareholder_data.name,
                ])

        return pd.DataFrame(data=rows,
                            columns=[
                                'date',
                                'shares',
                                'percentage',
                                'instrument_id',
                                'shareholder',
                            ])

    @property
    def last_price(self):
        return self.get_ticker_real_time_info_response().last_price

    @property
    def adj_close(self):
        return self.get_ticker_real_time_info_response().adj_close

    @property
    def best_demand_vol(self):
        return self.get_ticker_real_time_info_response().best_demand_vol

    @property
    def best_demand_price(self):
        return self.get_ticker_real_time_info_response().best_demand_price

    @property
    def best_supply_vol(self):
        return self.get_ticker_real_time_info_response().best_supply_vol

    @property
    def best_supply_price(self):
        return self.get_ticker_real_time_info_response().best_supply_price

    def get_ticker_real_time_info_response(self) -> RealtimeTickerInfo:
        """
        Notes on usage:
        - Real-time data might not always be available;
          check for None values before usage.
        """
        session = utils.requests_retry_session()
        response = session.get(self._info_url, timeout=5)
        session.close()

        # in some cases last price or adj price is undefined
        try:
            last_price = int(response.text.split()[1].split(",")[1])
        except (ValueError, IndexError):  # when the value is `F` instead of a number
            last_price = None
        try:
            adj_close = int(response.text.split()[1].split(",")[2])
        except (ValueError, IndexError):
            adj_close = None

        orders_data = response.text.split(";")[2]
        buy_orders, sell_orders = get_orders(orders_data)

        best_demand_vol = (buy_orders[0].volume
                           if 0 < len(buy_orders) else None)
        best_demand_price = (buy_orders[0].price
                             if 0 < len(buy_orders) else None)
        best_supply_vol = (sell_orders[0].volume
                           if 0 < len(sell_orders) else None)
        best_supply_price = (sell_orders[0].price
                             if 0 < len(sell_orders) else None)

        return RealtimeTickerInfo(
            last_price,
            adj_close,
            best_demand_vol=best_demand_vol,
            best_demand_price=best_demand_price,
            best_supply_vol=best_supply_vol,
            best_supply_price=best_supply_price,
            buy_orders=buy_orders,
            sell_orders=sell_orders,
        )

    @property
    @functools.lru_cache()
    def _ticker_page_response(self):
        return utils.requests_retry_session().get(self._url, timeout=10)

    @functools.lru_cache()
    @retry(wait=wait_random(min=3, max=5),
           before_sleep=before_sleep_log(logger, logging.ERROR))
    async def _get_ticker_daily_info_page_response(self, session,
                                                   date) -> str:
        async with session.get(
                tse_settings.INSTRUMENT_DAY_INFO_URL.format(
                    index=self.index, date=date), ) as response:
            response.raise_for_status()
            page = await response.text()
            logger.info(f"fetched date {date}")
            return page

    @property
    def _shareholders_url(self) -> str:
        return tse_settings.TSE_SHAREHOLDERS_URL.format(self.ci_sin)
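
# A minimal usage sketch, assuming network access to tsetmc and the package-level
# config/symbols_data modules imported by this class; the symbol is a placeholder.
def print_ticker_snapshot(symbol: str = "فولاد") -> None:
    ticker = Ticker(symbol)  # loads history from the CSV cache or downloads it
    print(ticker.title, ticker.group_name)
    realtime = ticker.get_ticker_real_time_info_response()
    # real-time fields can be None, as the docstrings above point out
    if realtime.last_price is not None:
        print("last price:", realtime.last_price)
    print(ticker.history.tail())
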
Example #22
class TuleapLister(StatelessLister[RepoPage]):
    """List origins from Tuleap.

    Tuleap provides SVN and Git repositories hosting.

    Tuleap API getting started:
    https://tuleap.net/doc/en/user-guide/integration/rest.html
    Tuleap API reference:
    https://tuleap.net/api/explorer/

    Using the API, we first request a list of projects, and from there request their
    associated repositories individually. Everything is paginated; the code throttles
    at the individual GET call level."""

    LISTER_NAME = "tuleap"

    REPO_LIST_PATH = "/api"
    REPO_GIT_PATH = "plugins/git/"
    REPO_SVN_PATH = "plugins/svn/"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: Optional[str] = None,
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.session = requests.Session()
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": USER_AGENT,
        })

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str,
                                                  Any]) -> requests.Response:

        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)
        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    @classmethod
    def results_simplified(cls, url: str, repo_type: str,
                           repo: RepoPage) -> RepoPage:
        if repo_type == "git":
            prefix_url = TuleapLister.REPO_GIT_PATH
        else:
            prefix_url = TuleapLister.REPO_SVN_PATH
        rep = {
            "project": repo["name"],
            "type": repo_type,
            "uri": urljoin(url, f"{prefix_url}{repo['path']}"),
            "last_update_date": repo["last_update_date"],
        }
        return rep

    def _get_repositories(self, url_repo) -> List[Dict[str, Any]]:
        ret = self.page_request(url_repo, {})
        reps_list = ret.json()["repositories"]
        limit = int(ret.headers["X-PAGINATION-LIMIT-MAX"])
        offset = int(ret.headers["X-PAGINATION-LIMIT"])
        size = int(ret.headers["X-PAGINATION-SIZE"])
        while offset < size:
            url_offset = url_repo + "?offset=" + str(offset) + "&limit=" + str(
                limit)
            ret = self.page_request(url_offset, {}).json()
            reps_list = reps_list + ret["repositories"]
            offset += limit
        return reps_list

    def get_pages(self) -> Iterator[RepoPage]:
        # base with trailing slash, path without leading slash for urljoin
        url_api: str = urljoin(self.url, self.REPO_LIST_PATH)
        url_projects = url_api + "/projects/"

        # Get the list of projects.
        response = self.page_request(url_projects, {})
        projects_list = response.json()
        limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
        offset = int(response.headers["X-PAGINATION-LIMIT"])
        size = int(response.headers["X-PAGINATION-SIZE"])
        while offset < size:
            url_offset = (url_projects + "?offset=" + str(offset) + "&limit=" +
                          str(limit))
            ret = self.page_request(url_offset, {}).json()
            projects_list = projects_list + ret
            offset += limit

        # Get list of repositories for each project.
        for p in projects_list:
            p_id = p["id"]

            # Fetch Git repositories for project
            url_git = url_projects + str(p_id) + "/git"
            repos = self._get_repositories(url_git)
            for repo in repos:
                yield self.results_simplified(url_api, "git", repo)

    def get_origins_from_page(self, page: RepoPage) -> Iterator[ListedOrigin]:
        """Convert a page of Tuleap repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            url=page["uri"],
            visit_type=page["type"],
            last_update=iso8601.parse_date(page["last_update_date"]),
        )
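The Tuleap lister above pages through results by reading the X-PAGINATION-LIMIT-MAX, X-PAGINATION-LIMIT and X-PAGINATION-SIZE response headers, as in `_get_repositories`. The following is a minimal sketch of that header-driven pagination in isolation, assuming a plain `requests.Session`; the helper name `iter_paginated` and the `key` parameter are illustrative, not part of the lister.

from typing import Any, Dict, Iterator

import requests


def iter_paginated(session: requests.Session, url: str, key: str) -> Iterator[Dict[str, Any]]:
    """Yield items from a Tuleap-style endpoint by following its pagination headers."""
    response = session.get(url)
    response.raise_for_status()
    yield from response.json()[key]

    # The first response advertises the page size and the total number of items
    limit = int(response.headers["X-PAGINATION-LIMIT-MAX"])
    offset = int(response.headers["X-PAGINATION-LIMIT"])
    size = int(response.headers["X-PAGINATION-SIZE"])
    while offset < size:
        page = session.get(url, params={"offset": offset, "limit": limit})
        page.raise_for_status()
        yield from page.json()[key]
        offset += limit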
Example #23
0
class GiteaLister(StatelessLister[RepoListPage]):
    """List origins from Gitea.

    Gitea API documentation: https://try.gitea.io/api/swagger

    The API does pagination and provides navigation URLs through the 'Link' header.
    The default value for page size is the maximum value observed on the instances
    accessible at https://try.gitea.io/api/v1/ and https://codeberg.org/api/v1/."""

    LISTER_NAME = "gitea"

    REPO_LIST_PATH = "repos/search"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: Optional[str] = None,
        api_token: Optional[str] = None,
        page_size: int = 50,
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.query_params = {
            "sort": "id",
            "order": "asc",
            "limit": page_size,
            "page": 1,
        }

        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/json",
                "User-Agent": USER_AGENT,
            }
        )

        if api_token is None:
            if len(self.credentials) > 0:
                cred = random.choice(self.credentials)
                username = cred.get("username")
                api_token = cred["password"]
                logger.warning(
                    "Using authentication token from user %s", username or "???"
                )
            else:
                logger.warning(
                    "No authentication token set in configuration, using anonymous mode"
                )

        if api_token:
            self.session.headers["Authorization"] = "Token %s" % api_token

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url: str, params: Dict[str, Any]) -> requests.Response:

        logger.info("Fetching URL %s with params %s", url, params)

        response = self.session.get(url, params=params)

        if response.status_code != 200:
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        response.raise_for_status()

        return response

    @classmethod
    def results_simplified(cls, body: Dict[str, RepoListPage]) -> RepoListPage:
        fields_filter = ["id", "clone_url", "updated_at"]
        return [{k: r[k] for k in fields_filter} for r in body["data"]]

    def get_pages(self) -> Iterator[RepoListPage]:
        # base with trailing slash, path without leading slash for urljoin
        url: str = urljoin(self.url, self.REPO_LIST_PATH)

        response = self.page_request(url, self.query_params)

        while True:
            page_results = self.results_simplified(response.json())

            yield page_results

            assert len(response.links) > 0, "API changed: no Link header found"
            if "next" in response.links:
                url = response.links["next"]["url"]
            else:
                # last page
                break

            response = self.page_request(url, {})

    def get_origins_from_page(self, page: RepoListPage) -> Iterator[ListedOrigin]:
        """Convert a page of Gitea repositories into a list of ListedOrigins."""
        assert self.lister_obj.id is not None

        for repo in page:
            last_update = iso8601.parse_date(repo["updated_at"])

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=repo["clone_url"],
                visit_type="git",
                last_update=last_update,
            )
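As the docstring notes, Gitea paginates through the HTTP Link header, which `requests` exposes as `response.links`. Here is a minimal sketch of that navigation pattern on its own, assuming a prepared `requests.Session`; the helper name `iter_link_pages` is illustrative.

from typing import Any, Dict, Iterator, Optional

import requests


def iter_link_pages(
    session: requests.Session, url: str, params: Optional[Dict[str, Any]] = None
) -> Iterator[Any]:
    """Yield decoded JSON pages, following the 'Link: rel="next"' header."""
    response = session.get(url, params=params or {})
    while True:
        response.raise_for_status()
        yield response.json()
        next_link = response.links.get("next")
        if next_link is None:
            break  # last page: no rel="next" entry in the Link header
        response = session.get(next_link["url"])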
Example #24
0
    DirectorV2Settings,
    get_client_session,
    get_plugin_settings,
)

log = logging.getLogger(__name__)

_APP_DIRECTOR_V2_CLIENT_KEY = f"{__name__}.DirectorV2ApiClient"

SERVICE_HEALTH_CHECK_TIMEOUT = ClientTimeout(total=2, connect=1)  # type:ignore

DEFAULT_RETRY_POLICY = dict(
    wait=wait_random(0, 1),
    stop=stop_after_attempt(2),
    reraise=True,
    before_sleep=before_sleep_log(log, logging.WARNING),
)

DataType = Dict[str, Any]
DataBody = Union[DataType, List[DataType], None]


class DirectorV2ApiClient:
    def __init__(self, app: web.Application) -> None:
        self._app = app
        self._settings: DirectorV2Settings = get_plugin_settings(app)

    async def start(self, project_id: ProjectID, user_id: UserID,
                    **options) -> str:
        computation_task_out = await _request_director_v2(
            self._app,
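The body of `_request_director_v2` is truncated here. As a rough sketch, a policy dict like DEFAULT_RETRY_POLICY is typically unpacked into AsyncRetrying as shown below; the names `call_with_policy` and `coro_factory` are assumptions for illustration, and the policy values are repeated from the example above to keep the sketch self-contained.

import logging

from tenacity._asyncio import AsyncRetrying
from tenacity.before_sleep import before_sleep_log
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_random

log = logging.getLogger(__name__)

DEFAULT_RETRY_POLICY = dict(
    wait=wait_random(0, 1),
    stop=stop_after_attempt(2),
    reraise=True,
    before_sleep=before_sleep_log(log, logging.WARNING),
)


async def call_with_policy(coro_factory):
    # Unpack the policy into AsyncRetrying: the awaited call is retried on any
    # exception, and the last exception is re-raised once attempts are exhausted.
    async for attempt in AsyncRetrying(**DEFAULT_RETRY_POLICY):
        with attempt:
            return await coro_factory()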
Example #25
0
def docker_stack(
    docker_swarm: None,
    docker_client: docker.client.DockerClient,
    core_docker_compose_file: Path,
    ops_docker_compose_file: Path,
    keep_docker_up: bool,
    testing_environ_vars: EnvVarsDict,
) -> Iterator[Dict]:
    """deploys core and ops stacks and returns as soon as all are running"""

    # WARNING: keep prefix "pytest-" in stack names
    core_stack_name = testing_environ_vars["SWARM_STACK_NAME"]
    ops_stack_name = "pytest-ops"

    assert core_stack_name
    assert core_stack_name.startswith("pytest-")
    stacks = [
        (
            "core",
            core_stack_name,
            core_docker_compose_file,
        ),
        (
            "ops",
            ops_stack_name,
            ops_docker_compose_file,
        ),
    ]

    # make up-version
    stacks_deployed: Dict[str, Dict] = {}
    for key, stack_name, compose_file in stacks:
        try:
            subprocess.run(
                [
                    "docker",
                    "stack",
                    "deploy",
                    "--with-registry-auth",
                    "--compose-file",
                    f"{compose_file.name}",
                    f"{stack_name}",
                ],
                check=True,
                cwd=compose_file.parent,
            )
        except subprocess.CalledProcessError as err:
            print(
                "docker_stack failed",
                f"{' '.join(err.cmd)}",
                f"returncode={err.returncode}",
                f"stdout={err.stdout}",
                f"stderr={err.stderr}",
            )
            raise

        stacks_deployed[key] = {
            "name": stack_name,
            "compose": yaml.safe_load(compose_file.read_text()),
        }

    # All SELECTED services ready
    # - notice that the timeout is set for all services in both stacks
    # - TODO: the time to deploy will depend on the number of services selected
    try:
        if sys.version_info >= (3, 7):
            from tenacity._asyncio import AsyncRetrying

            async def _check_all_services_are_running():
                async for attempt in AsyncRetrying(
                        wait=wait_fixed(5),
                        stop=stop_after_delay(8 * MINUTE),
                        before_sleep=before_sleep_log(log, logging.INFO),
                        reraise=True,
                ):
                    with attempt:
                        await asyncio.gather(*[
                            asyncio.get_event_loop().run_in_executor(
                                None, assert_service_is_running, service)
                            for service in docker_client.services.list()
                        ])

            asyncio.run(_check_all_services_are_running())

        else:
            for attempt in Retrying(
                    wait=wait_fixed(5),
                    stop=stop_after_delay(8 * MINUTE),
                    before_sleep=before_sleep_log(log, logging.INFO),
                    reraise=True,
            ):
                with attempt:
                    for service in docker_client.services.list():
                        assert_service_is_running(service)

    finally:
        _fetch_and_print_services(docker_client, "[BEFORE TEST]")

    yield {
        "stacks": stacks_deployed,
        "services":
        [service.name
         for service in docker_client.services.list()],  # type: ignore
    }

    ## TEAR DOWN ----------------------

    _fetch_and_print_services(docker_client, "[AFTER TEST]")

    if keep_docker_up:
        # skip bringing the stack down
        return

    # clean up. Guarantees that all services are down before creating a new stack!
    #
    # WORKAROUND https://github.com/moby/moby/issues/30942#issue-207070098
    #
    #   docker stack rm services
    #   until [ -z "$(docker service ls --filter label=com.docker.stack.namespace=services -q)" ] || [ "$limit" -lt 0 ]; do
    #   sleep 1;
    #   done
    #   until [ -z "$(docker network ls --filter label=com.docker.stack.namespace=services -q)" ] || [ "$limit" -lt 0 ]; do
    #   sleep 1;
    #   done

    # make down
    # NOTE: remove them in reverse order since stacks share common networks

    stacks.reverse()
    for _, stack, _ in stacks:

        try:
            subprocess.run(
                f"docker stack remove {stack}".split(" "),
                check=True,
                capture_output=True,
            )
        except subprocess.CalledProcessError as err:
            log.warning(
                "Ignoring failure while executing '%s' (returned code %d):\n%s\n%s\n%s\n%s\n",
                err.cmd,
                err.returncode,
                HEADER_STR.format("stdout"),
                err.stdout.decode("utf8") if err.stdout else "",
                HEADER_STR.format("stderr"),
                err.stderr.decode("utf8") if err.stderr else "",
            )

        # Wait until all resources are removed, force-removing them where needed
        # The check order is intentional because some resources depend on others to be removed
        # e.g. cannot remove networks/volumes used by running containers
        for resource_name in ("services", "containers", "volumes", "networks"):
            resource_client = getattr(docker_client, resource_name)

            for attempt in Retrying(
                    wait=wait_fixed(2),
                    stop=stop_after_delay(3 * MINUTE),
                    before_sleep=before_sleep_log(log, logging.WARNING),
                    reraise=True,
            ):
                with attempt:
                    pending = resource_client.list(
                        filters={
                            "label": f"com.docker.stack.namespace={stack}"
                        })
                    if pending:
                        if resource_name in ("volumes", ):
                            # WARNING: removing volumes on this stack might be a problem when they are shared between different stacks
                            # NOTE: volumes are removed to avoid mixing configs (e.g. postgres db credentials)
                            for resource in pending:
                                resource.remove(force=True)

                        raise _ResourceStillNotRemoved(
                            f"Waiting for {len(pending)} {resource_name} to shutdown: {pending}."
                        )

    _fetch_and_print_services(docker_client, "[AFTER REMOVED]")
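The teardown loop above retries until no Swarm resources labelled with the stack namespace remain. The same idea, distilled into a standalone helper as a rough sketch; the function name and the timeout default are assumptions, built only from the docker SDK and tenacity APIs already used above.

import docker
from tenacity import Retrying
from tenacity.stop import stop_after_delay
from tenacity.wait import wait_fixed


def wait_until_stack_removed(
    client: docker.client.DockerClient, stack: str, timeout_s: int = 180
) -> None:
    """Block until no Swarm resources labelled with the stack namespace remain."""
    label = f"com.docker.stack.namespace={stack}"
    for resource_name in ("services", "containers", "volumes", "networks"):
        resource_client = getattr(client, resource_name)
        for attempt in Retrying(
            wait=wait_fixed(2), stop=stop_after_delay(timeout_s), reraise=True
        ):
            with attempt:
                pending = resource_client.list(filters={"label": label})
                # reraise=True turns the last failed assertion into the final error
                assert not pending, f"{len(pending)} {resource_name} still present"

For example, `wait_until_stack_removed(docker.from_env(), "pytest-ops")` would block until the ops stack is fully gone.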
Example #26
0
async def delete_pipeline(
        comp_task_stop: ComputationTaskDelete,
        project_id: ProjectID,
        project_repo: ProjectsRepository = Depends(
            get_repository(ProjectsRepository)),
        computation_pipelines: CompPipelinesRepository = Depends(
            get_repository(CompPipelinesRepository)),
        computation_tasks: CompTasksRepository = Depends(
            get_repository(CompTasksRepository)),
        scheduler: BaseCompScheduler = Depends(get_scheduler),
) -> None:
    try:
        # get the project
        project: ProjectAtDB = await project_repo.get_project(project_id)
        # check if current state allow to stop the computation
        comp_tasks: List[
            CompTaskAtDB] = await computation_tasks.get_comp_tasks(project_id)
        pipeline_state = get_pipeline_state_from_task_states(comp_tasks)
        if is_pipeline_running(pipeline_state):
            if not comp_task_stop.force:
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN,
                    detail=
                    f"Projet {project_id} is currently running and cannot be deleted, current state is {pipeline_state}",
                )
            # abort the pipeline first
            try:
                await scheduler.stop_pipeline(comp_task_stop.user_id,
                                              project_id)
            except SchedulerError as e:
                log.warning(
                    "Project %s could not be stopped properly.\n reason: %s",
                    project_id,
                    e,
                )

            def return_last_value(retry_state: Any) -> Any:
                """return the result of the last call attempt"""
                return retry_state.outcome.result()

            @retry(
                stop=stop_after_delay(PIPELINE_ABORT_TIMEOUT_S),
                wait=wait_random(0, 2),
                retry_error_callback=return_last_value,
                retry=retry_if_result(lambda result: result is False),
                reraise=False,
                before_sleep=before_sleep_log(log, logging.INFO),
            )
            async def check_pipeline_stopped() -> bool:
                comp_tasks: List[
                    CompTaskAtDB] = await computation_tasks.get_comp_tasks(
                        project_id)
                pipeline_state = get_pipeline_state_from_task_states(
                    comp_tasks, )
                return is_pipeline_stopped(pipeline_state)

            # wait for the pipeline to be stopped
            if not await check_pipeline_stopped():
                log.error(
                    "pipeline %s could not be stopped properly after %ss",
                    project_id,
                    PIPELINE_ABORT_TIMEOUT_S,
                )

        # delete the pipeline now
        await computation_tasks.delete_tasks_from_project(project)
        await computation_pipelines.delete_pipeline(project_id)

    except ProjectNotFoundError as e:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND,
                            detail=f"{e}") from e
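The `check_pipeline_stopped` helper above illustrates a general tenacity pattern: poll a predicate with `retry_if_result`, and let `retry_error_callback` return the last (falsy) result instead of raising once `stop_after_delay` expires. A small generic sketch of that pattern follows; the `wait_for_condition` name and the 10-second default are assumptions.

import logging
from typing import Any, Callable

from tenacity import retry
from tenacity.before_sleep import before_sleep_log
from tenacity.retry import retry_if_result
from tenacity.stop import stop_after_delay
from tenacity.wait import wait_random

log = logging.getLogger(__name__)


def return_last_value(retry_state: Any) -> Any:
    """On timeout, return the result of the last call attempt instead of raising."""
    return retry_state.outcome.result()


def wait_for_condition(predicate: Callable[[], bool], timeout_s: float = 10.0) -> bool:
    """Poll `predicate()` until it returns True or the deadline expires."""

    @retry(
        stop=stop_after_delay(timeout_s),
        wait=wait_random(0, 2),
        retry=retry_if_result(lambda result: result is False),
        retry_error_callback=return_last_value,
        reraise=False,
        before_sleep=before_sleep_log(log, logging.INFO),
    )
    def _poll() -> bool:
        return bool(predicate())

    return _poll()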
Example #27
0
# TESTS ------------------------------------------------------------------------------------
#
#   publisher ---> (rabbitMQ)  ---> webserver --- (socketio) ---> front-end pages
#
# - logs, instrumentation and progress are sent as rabbitMQ messages
# - the webserver consumes these messages and forwards them to the front-end by broadcasting them over socketio
# - all front-end instances connected to these channels get notified when new messages are directed
#   to them
#

POLLING_TIME = 0.2
TIMEOUT_S = 5
RETRY_POLICY = dict(
    wait=wait_fixed(POLLING_TIME),
    stop=stop_after_delay(TIMEOUT_S),
    before_sleep=before_sleep_log(logger, log_level=logging.WARNING),
    reraise=True,
)
NUMBER_OF_MESSAGES = 1
USER_ROLES = [
    UserRole.GUEST,
    UserRole.USER,
    UserRole.TESTER,
]


@pytest.mark.parametrize("user_role", USER_ROLES)
async def test_publish_to_other_user(
    other_user_id: int,
    other_project_id: UUIDStr,
    other_node_uuid: str,
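The test body is truncated above. As a hedged sketch, a RETRY_POLICY of this shape is typically consumed by re-running an assertion until it holds or the timeout expires; the `assert_eventually` helper below is an assumption, not the original test code, and the policy values are repeated for self-containment.

import logging
from typing import Callable

from tenacity._asyncio import AsyncRetrying
from tenacity.before_sleep import before_sleep_log
from tenacity.stop import stop_after_delay
from tenacity.wait import wait_fixed

logger = logging.getLogger(__name__)

POLLING_TIME = 0.2
TIMEOUT_S = 5
RETRY_POLICY = dict(
    wait=wait_fixed(POLLING_TIME),
    stop=stop_after_delay(TIMEOUT_S),
    before_sleep=before_sleep_log(logger, log_level=logging.WARNING),
    reraise=True,
)


async def assert_eventually(check: Callable[[], None]) -> None:
    """Re-run `check()` under RETRY_POLICY until it stops raising AssertionError."""
    async for attempt in AsyncRetrying(**RETRY_POLICY):
        with attempt:
            check()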
Example #28
0
def wait_for_services() -> int:
    expected_services = core_services() + ops_services()
    started_services = []
    client = docker.from_env()
    try:
        for attempt in Retrying(
                stop=stop_after_attempt(MAX_RETRY_COUNT),
                wait=wait_fixed(WAIT_BEFORE_RETRY),
                before_sleep=before_sleep_log(logger, logging.WARNING),
        ):
            with attempt:
                started_services = sorted(
                    [
                        s for s in client.services.list()
                        if s.name.split("_")[-1] in expected_services
                    ],
                    key=by_service_creation,
                )

                assert len(started_services), "no services started!"
                assert len(expected_services) == len(started_services), (
                    "Some services are missing or unexpected:\n"
                    f"expected: {len(expected_services)} {expected_services}\n"
                    f"got: {len(started_services)} {[s.name for s in started_services]}"
                )
    except RetryError:
        print(
            f"found these services: {len(started_services)} {[s.name for s in started_services]}\nexpected services: {len(expected_services)} {expected_services}"
        )
        return os.EX_SOFTWARE

    for service in started_services:

        expected_replicas = (
            service.attrs["Spec"]["Mode"]["Replicated"]["Replicas"]
            if "Replicated" in service.attrs["Spec"]["Mode"] else len(
                client.nodes.list())  # we are in global mode
        )
        print(f"Service: {service.name} expects {expected_replicas} replicas",
              "-" * 10)

        try:
            for attempt in Retrying(
                    stop=stop_after_attempt(MAX_RETRY_COUNT),
                    wait=wait_fixed(WAIT_BEFORE_RETRY),
            ):
                with attempt:
                    service_tasks: List[Dict] = service.tasks()  #  freeze
                    print(get_tasks_summary(service_tasks))

                    #
                    # NOTE: a service could set 'ready' as desired-state instead of 'running' if
                    # it constantly breaks and the swarm decides to "stop trying".
                    #
                    valid_replicas = sum(
                        task["Status"]["State"] == RUNNING_STATE
                        for task in service_tasks)
                    assert valid_replicas == expected_replicas
        except RetryError:
            print(
                f"ERROR: Service {service.name} failed to start {expected_replicas} replica/s"
            )
            print(json.dumps(service.attrs, indent=1))
            return os.EX_SOFTWARE

    return os.EX_OK
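`by_service_creation` is referenced above but not shown. Below is a hypothetical sketch of such a sort key, assuming it orders services by the Swarm-reported CreatedAt timestamp; the actual implementation in the original code may differ.

from datetime import datetime

from docker.models.services import Service


def by_service_creation(service: Service) -> datetime:
    # Hypothetical sort key: Swarm reports e.g. "2023-01-01T12:00:00.123456789Z";
    # keep microsecond precision and drop the trailing nanoseconds and "Z".
    created_at: str = service.attrs["CreatedAt"]
    return datetime.strptime(created_at[:26], "%Y-%m-%dT%H:%M:%S.%f")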
from settings_library.rabbit import RabbitSettings
from tenacity.before_sleep import before_sleep_log
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_fixed

from .helpers.utils_docker import get_localhost_ip, get_service_published_port

# HELPERS ------------------------------------------------------------------------------------

log = logging.getLogger(__name__)


@tenacity.retry(
    wait=wait_fixed(5),
    stop=stop_after_attempt(60),
    before_sleep=before_sleep_log(log, logging.INFO),
    reraise=True,
)
async def wait_till_rabbit_responsive(url: str) -> None:
    connection = await aio_pika.connect(url)
    await connection.close()


# FIXTURES ------------------------------------------------------------------------------------


@pytest.fixture(scope="function")
async def rabbit_settings(
        docker_stack: Dict,
        testing_environ_vars: Dict  # stack is up
) -> RabbitSettings:
class NewForgeLister(Lister[NewForgeListerState, NewForgeListerPage]):
    """List origins from the "NewForge" forge.

    """

    # Part of the lister API, that identifies this lister
    LISTER_NAME = ""
    # (Optional) VCS type of the origins listed by this lister, if constant
    VISIT_TYPE = ""

    # Instance URLs include the hostname and the common path prefix of processed URLs
    EXAMPLE_BASE_URL = "https://netloc/api/v1/"
    # Path of a specific resource to process, to join the base URL with
    EXAMPLE_PATH = "origins/list"

    def __init__(
        self,
        # Required
        scheduler: SchedulerInterface,
        # Instance URL, required for multi-instances listers (e.g gitlab, ...)
        url: str,
        # Instance name (free form) required for multi-instance listers,
        # or computed from `url`
        instance: str,
        # Required whether lister supports authentication or not
        credentials: CredentialsType = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            url=url,
            instance=instance,
        )

        self.session = requests.Session()
        # Declaring the USER_AGENT makes the lister more sysadmin-friendly for the forge we list
        self.session.headers.update({
            "Accept": "application/json",
            "User-Agent": USER_AGENT
        })

    def state_from_dict(self, d: Dict[str, Any]) -> NewForgeListerState:
        return NewForgeListerState(**d)

    def state_to_dict(self, state: NewForgeListerState) -> Dict[str, Any]:
        return asdict(state)

    @throttling_retry(before_sleep=before_sleep_log(logger, logging.WARNING))
    def page_request(self, url, params) -> requests.Response:
        # Do the network resource request under a retrying decorator
        # to handle rate limiting and transient errors up to a limit.
        # `throttling_retry` by default uses the `requests` library to check
        # only for rate limiting, with a base-10 exponential waiting strategy.
        # This can be customized by passing waiting, retrying and logging strategies
        # as functions. See the `tenacity` library documentation.

        # Log listed URL to ease debugging
        logger.debug("Fetching URL %s with params %s", url, params)
        response = self.session.get(url, params=params)

        if response.status_code != 200:
            # Log response content to ease debugging
            logger.warning(
                "Unexpected HTTP status code %s on %s: %s",
                response.status_code,
                response.url,
                response.content,
            )
        # The lister must fail on blocking errors
        response.raise_for_status()

        return response

    def get_pages(self) -> Iterator[NewForgeListerPage]:
        # The algorithm depends on the service, but should request data reliably,
        # following pagination if relevant and yielding pages in a streaming fashion.
        # If incremental listing is supported, initialize from saved lister state.
        # Make use of any next page URL provided.
        # Simplify the results early to ease testing and debugging.

        # Initialize from the lister saved state
        current = ""
        if self.state.current is not None:
            current = self.state.current

        # Construct the URL of a service endpoint, the lister can have others to fetch
        url = urljoin(self.url, self.EXAMPLE_PATH)

        while current is not None:
            # Parametrize the request for incremental listing
            body = self.page_request(url, {"current": current}).json()

            # Simplify the page if possible to only the necessary elements
            # and yield it
            yield body

            # Get the next page parameter or end the loop when there is none
            current = body.get("next")

    def get_origins_from_page(
            self, page: NewForgeListerPage) -> Iterator[ListedOrigin]:
        """Convert a page of NewForgeLister repositories into a list of ListedOrigins"""
        assert self.lister_obj.id is not None

        for element in page:

            yield ListedOrigin(
                # Required. Should use this value.
                lister_id=self.lister_obj.id,
                # Required. Visit type of the currently processed origin
                visit_type=self.VISIT_TYPE,
                # Required. URL corresponding to the origin for loaders to ingest
                url=...,
                # Should get it if the service provides it and if it induces no
                # substantial additional processing cost
                last_update=...,
            )

    def commit_page(self, page: NewForgeListerPage) -> None:
        # Update the lister state to the latest `current`
        current = page[-1]["current"]

        if current > self.state.current:
            self.state.current = current

    def finalize(self) -> None:
        # Pull fresh lister state from the scheduler backend, in case multiple
        # listers run concurrently
        scheduler_state = self.get_state_from_scheduler()

        # Update the lister state in the backend only if `current` is fresher than
        # the one stored in the database.
        if self.state.current > scheduler_state.current:
            self.updated = True
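The template above assumes companion definitions for its page and state types that are not shown here. A plausible minimal sketch, consistent with `state_from_dict(**d)`, `asdict(state)` and `page[-1]["current"]`; these exact definitions are an assumption.

from dataclasses import dataclass
from typing import Any, Dict, List

# A page is a list of JSON objects, each carrying at least a "current" marker
NewForgeListerPage = List[Dict[str, Any]]


@dataclass
class NewForgeListerState:
    # Latest "current" marker seen; round-trips through the scheduler backend
    current: str = ""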