Example 1
def wait_for_role(sts: STSClient, role_arn: str) -> None:
    """
    Verify that it is possible to assume the given role.

    In practice this always seems to take less than 10 seconds, but give it up
    to 90 to reduce any chance of flakiness.
    """
    for i in range(90, 0, -1):
        try:
            sts.assume_role(RoleArn=role_arn,
                            RoleSessionName="mzcomposevalidatecreated")
        except Exception as e:
            if i % 10 == 0:
                print(f"Unable to assume role, {i} seconds remaining: {e}")
            time.sleep(1)
            continue
        print(f"Successfully assumed role {role_arn}")
        break
    else:
        raise UIError("Never able to assume role")
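
A minimal usage sketch for the helper above, assuming boto3 is available (wait_for_role itself also relies on time and a project-specific UIError imported elsewhere in its module); the role ARN below is a placeholder, not a value from the source:

import boto3

# Hypothetical caller: build a real STS client and wait until the
# placeholder role can be assumed.
sts_client = boto3.client("sts")
wait_for_role(sts_client, "arn:aws:iam::123456789012:role/example-role")
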
Example 2
def should_batch_copy_files_to_storage(
    s3_client: S3Client,
    s3_control_client: S3ControlClient,
    sts_client: STSClient,
    subtests: SubTests,
) -> None:
    # pylint: disable=too-many-locals
    # Given two metadata files with an asset each, all within a prefix
    original_prefix = any_safe_filename()

    root_asset_name = any_asset_name()
    root_asset_filename = any_safe_filename()
    root_asset_content = any_file_contents()
    root_asset_multihash = sha256_hex_digest_to_multihash(
        sha256(root_asset_content).hexdigest())
    child_asset_name = any_asset_name()
    child_asset_filename = any_safe_filename()
    child_asset_content = any_file_contents()
    child_asset_multihash = sha256_hex_digest_to_multihash(
        sha256(child_asset_content).hexdigest())

    root_metadata_filename = any_safe_filename()
    child_metadata_filename = any_safe_filename()

    with S3Object(
            BytesIO(initial_bytes=root_asset_content),
            ResourceName.STAGING_BUCKET_NAME.value,
            f"{original_prefix}/{root_asset_filename}",
    ) as root_asset_s3_object, S3Object(
            BytesIO(initial_bytes=child_asset_content),
            ResourceName.STAGING_BUCKET_NAME.value,
            f"{original_prefix}/{child_asset_filename}",
    ) as child_asset_s3_object, S3Object(
            BytesIO(initial_bytes=dumps({
                **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                STAC_ASSETS_KEY: {
                    child_asset_name: {
                        STAC_HREF_KEY: child_asset_s3_object.url,
                        STAC_FILE_CHECKSUM_KEY: child_asset_multihash,
                    }
                },
            }).encode()),
            ResourceName.STAGING_BUCKET_NAME.value,
            f"{original_prefix}/{child_metadata_filename}",
    ) as child_metadata_s3_object, S3Object(
            BytesIO(initial_bytes=dumps({
                **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                STAC_ASSETS_KEY: {
                    root_asset_name: {
                        STAC_HREF_KEY: root_asset_s3_object.url,
                        STAC_FILE_CHECKSUM_KEY: root_asset_multihash,
                    },
                },
                STAC_LINKS_KEY: [{
                    STAC_HREF_KEY: child_metadata_s3_object.url,
                    "rel": "child"
                }],
            }).encode()),
            ResourceName.STAGING_BUCKET_NAME.value,
            f"{original_prefix}/{root_metadata_filename}",
    ) as root_metadata_s3_object, Dataset() as dataset:
        version_id = any_dataset_version_id()
        asset_id = (f"{DATASET_ID_PREFIX}{dataset.dataset_id}"
                    f"{DB_KEY_SEPARATOR}{VERSION_ID_PREFIX}{version_id}")

        with ProcessingAsset(
                asset_id=asset_id,
                url=root_metadata_s3_object.url), ProcessingAsset(
                    asset_id=asset_id,
                    url=child_metadata_s3_object.url), ProcessingAsset(
                        asset_id=asset_id,
                        url=root_asset_s3_object.url,
                        multihash=root_asset_multihash), ProcessingAsset(
                            asset_id=asset_id,
                            url=child_asset_s3_object.url,
                            multihash=child_asset_multihash):
            # When
            try:
                response = lambda_handler(
                    {
                        DATASET_ID_KEY: dataset.dataset_id,
                        DATASET_PREFIX_KEY: dataset.dataset_prefix,
                        VERSION_ID_KEY: version_id,
                        METADATA_URL_KEY: root_metadata_s3_object.url,
                    },
                    any_lambda_context(),
                )

                account_id = sts_client.get_caller_identity()["Account"]

                metadata_copy_job_result, asset_copy_job_result = wait_for_copy_jobs(
                    response,
                    account_id,
                    s3_control_client,
                    subtests,
                )
            finally:
                # Then
                new_prefix = (
                    f"{dataset.title}{DATASET_KEY_SEPARATOR}{dataset.dataset_id}/{version_id}"
                )
                storage_bucket_prefix = f"{S3_URL_PREFIX}{ResourceName.STORAGE_BUCKET_NAME.value}/"

                new_root_metadata_key = f"{new_prefix}/{root_metadata_filename}"
                expected_root_metadata = dumps({
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    STAC_ASSETS_KEY: {
                        root_asset_name: {
                            STAC_HREF_KEY: root_asset_filename,
                            STAC_FILE_CHECKSUM_KEY: root_asset_multihash,
                        },
                    },
                    STAC_LINKS_KEY: [{
                        STAC_HREF_KEY: child_metadata_filename,
                        "rel": "child"
                    }],
                }).encode()
                with subtests.test(msg="Root metadata content"), smart_open(
                        f"{storage_bucket_prefix}{new_root_metadata_key}"
                ) as new_root_metadata_file:
                    assert expected_root_metadata == new_root_metadata_file.read(
                    )

                with subtests.test(msg="Delete root metadata object"):
                    delete_s3_key(ResourceName.STORAGE_BUCKET_NAME.value,
                                  new_root_metadata_key, s3_client)

                new_child_metadata_key = f"{new_prefix}/{child_metadata_filename}"
                expected_child_metadata = dumps({
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    STAC_ASSETS_KEY: {
                        child_asset_name: {
                            STAC_HREF_KEY: child_asset_filename,
                            STAC_FILE_CHECKSUM_KEY: child_asset_multihash,
                        }
                    },
                }).encode()
                with subtests.test(msg="Child metadata content"), smart_open(
                        f"{storage_bucket_prefix}{new_child_metadata_key}"
                ) as new_child_metadata_file:
                    assert expected_child_metadata == new_child_metadata_file.read(
                    )

                with subtests.test(msg="Delete child metadata object"):
                    delete_s3_key(ResourceName.STORAGE_BUCKET_NAME.value,
                                  new_child_metadata_key, s3_client)

                # Then the root asset file is in the root of the new prefix
                with subtests.test(msg="Delete root asset object"):
                    delete_s3_key(
                        ResourceName.STORAGE_BUCKET_NAME.value,
                        f"{new_prefix}/{root_asset_filename}",
                        s3_client,
                    )

                # Then the child asset file is in the root of the new prefix
                with subtests.test(msg="Delete child asset object"):
                    delete_s3_key(
                        ResourceName.STORAGE_BUCKET_NAME.value,
                        f"{new_prefix}/{child_asset_filename}",
                        s3_client,
                    )

                # Cleanup
                delete_copy_job_files(
                    metadata_copy_job_result,
                    asset_copy_job_result,
                    ResourceName.STORAGE_BUCKET_NAME.value,
                    s3_client,
                    subtests,
                )
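
The test above leans on an S3Object context manager that stages a file in S3 on entry and removes it again on exit. A minimal sketch of what such a helper might look like, assuming boto3; the class name StagingS3Object is chosen here for illustration and the project's real S3Object may differ:

from typing import BinaryIO

import boto3


class StagingS3Object:
    """Illustrative stand-in for the S3Object helper: uploads the given
    file object on entry and deletes the key again on exit."""

    def __init__(self, file_object: BinaryIO, bucket_name: str, key: str) -> None:
        self.file_object = file_object
        self.bucket_name = bucket_name
        self.key = key
        self.url = f"s3://{bucket_name}/{key}"
        self._s3_client = boto3.client("s3")

    def __enter__(self) -> "StagingS3Object":
        # Stage the object so the code under test can read it.
        self._s3_client.upload_fileobj(self.file_object, self.bucket_name, self.key)
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Clean up the staged object even if the test body raised.
        self._s3_client.delete_object(Bucket=self.bucket_name, Key=self.key)
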
Example 3
    def should_successfully_run_dataset_version_creation_process_with_single_asset(
        # pylint:disable=too-many-arguments
        self,
        step_functions_client: SFNClient,
        lambda_client: LambdaClient,
        s3_client: S3Client,
        s3_control_client: S3ControlClient,
        sts_client: STSClient,
        subtests: SubTests,
    ) -> None:
        # pylint: disable=too-many-locals
        key_prefix = any_safe_file_path()

        root_metadata_filename = any_safe_filename()
        child_metadata_filename = any_safe_filename()

        asset_contents = any_file_contents()
        asset_filename = any_safe_filename()

        with S3Object(
            file_object=BytesIO(initial_bytes=asset_contents),
            bucket_name=self.staging_bucket_name,
            key=f"{key_prefix}/{asset_filename}",
        ) as asset_s3_object, S3Object(
            file_object=json_dict_to_file_object(
                {
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    STAC_ASSETS_KEY: {
                        any_asset_name(): {
                            STAC_HREF_KEY: asset_s3_object.url,
                            STAC_FILE_CHECKSUM_KEY: sha256_hex_digest_to_multihash(
                                sha256(asset_contents).hexdigest()
                            ),
                        },
                    },
                }
            ),
            bucket_name=self.staging_bucket_name,
            key=f"{key_prefix}/{child_metadata_filename}",
        ) as child_metadata_file, S3Object(
            file_object=json_dict_to_file_object(
                {
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    STAC_LINKS_KEY: [
                        {STAC_HREF_KEY: child_metadata_file.url, STAC_REL_KEY: STAC_REL_CHILD}
                    ],
                }
            ),
            bucket_name=self.staging_bucket_name,
            key=f"{key_prefix}/{root_metadata_filename}",
        ) as root_metadata_file, Dataset() as dataset:

            # When
            try:
                resp = lambda_client.invoke(
                    FunctionName=ResourceName.DATASET_VERSIONS_ENDPOINT_FUNCTION_NAME.value,
                    Payload=json.dumps(
                        {
                            HTTP_METHOD_KEY: "POST",
                            BODY_KEY: {
                                DATASET_ID_SHORT_KEY: dataset.dataset_id,
                                METADATA_URL_KEY: root_metadata_file.url,
                            },
                        }
                    ).encode(),
                )
                json_resp = json.load(resp["Payload"])

                with subtests.test(msg="Dataset Versions endpoint returns success"):
                    assert json_resp.get(STATUS_CODE_KEY) == HTTPStatus.CREATED, json_resp

                with subtests.test(msg="Should complete Step Function successfully"):

                    LOGGER.info("Executed State Machine: %s", json_resp)

                    # Then poll for State Machine State
                    while (
                        execution := step_functions_client.describe_execution(
                            executionArn=json_resp[BODY_KEY][EXECUTION_ARN_KEY]
                        )
                    )["status"] == "RUNNING":
                        LOGGER.info(  # pragma: no cover
                            "Polling for State Machine state %s", "." * 6
                        )
                        time.sleep(5)  # pragma: no cover

                    assert execution["status"] == "SUCCEEDED", execution

                assert (execution_output := execution.get("output")), execution

                account_id = sts_client.get_caller_identity()["Account"]

                import_dataset_response = json.loads(execution_output)[IMPORT_DATASET_KEY]
                metadata_copy_job_result, asset_copy_job_result = wait_for_copy_jobs(
                    import_dataset_response,
                    account_id,
                    s3_control_client,
                    subtests,
                )
            finally:
                pass  # cleanup body truncated in the original excerpt

    def should_successfully_run_dataset_version_creation_process_with_multiple_assets(
        # pylint:disable=too-many-arguments
        self,
        step_functions_client: SFNClient,
        lambda_client: LambdaClient,
        s3_client: S3Client,
        s3_control_client: S3ControlClient,
        sts_client: STSClient,
        subtests: SubTests,
    ) -> None:
        # pylint: disable=too-many-locals
        key_prefix = any_safe_file_path()

        collection_metadata_filename = any_safe_filename()
        catalog_metadata_filename = any_safe_filename()
        item_metadata_filename = any_safe_filename()

        collection_metadata_url = (
            f"s3://{self.staging_bucket_name}/{key_prefix}/{collection_metadata_filename}"
        )
        catalog_metadata_url = (
            f"s3://{self.staging_bucket_name}/{key_prefix}/{catalog_metadata_filename}"
        )
        item_metadata_url = f"s3://{self.staging_bucket_name}/{key_prefix}/{item_metadata_filename}"

        first_asset_contents = any_file_contents()
        first_asset_filename = any_safe_filename()
        second_asset_contents = any_file_contents()
        second_asset_filename = any_safe_filename()

        with S3Object(
                file_object=BytesIO(initial_bytes=first_asset_contents),
                bucket_name=self.staging_bucket_name,
                key=f"{key_prefix}/{first_asset_filename}",
        ) as first_asset_s3_object, S3Object(
                file_object=BytesIO(initial_bytes=second_asset_contents),
                bucket_name=self.staging_bucket_name,
                key=f"{key_prefix}/{second_asset_filename}",
        ) as second_asset_s3_object, S3Object(
                file_object=json_dict_to_file_object({
                    **deepcopy(MINIMAL_VALID_STAC_CATALOG_OBJECT),
                    "links": [
                        {
                            "href": collection_metadata_url,
                            "rel": "child"
                        },
                        {
                            "href": catalog_metadata_url,
                            "rel": "root"
                        },
                        {
                            "href": catalog_metadata_url,
                            "rel": "self"
                        },
                    ],
                }),
                bucket_name=self.staging_bucket_name,
                key=f"{key_prefix}/{catalog_metadata_filename}",
        ) as catalog_metadata_file, S3Object(
                file_object=json_dict_to_file_object({
                    **deepcopy(MINIMAL_VALID_STAC_COLLECTION_OBJECT),
                    "assets": {
                        any_asset_name(): {
                            "href":
                            second_asset_s3_object.url,
                            "file:checksum":
                            sha256_hex_digest_to_multihash(
                                sha256(second_asset_contents).hexdigest()),
                        },
                    },
                    "links": [
                        {
                            "href": item_metadata_url,
                            "rel": "child"
                        },
                        {
                            "href": catalog_metadata_url,
                            "rel": "root"
                        },
                        {
                            "href": collection_metadata_url,
                            "rel": "self"
                        },
                    ],
                }),
                bucket_name=self.staging_bucket_name,
                key=f"{key_prefix}/{collection_metadata_filename}",
        ), S3Object(
                file_object=json_dict_to_file_object({
                    **deepcopy(MINIMAL_VALID_STAC_ITEM_OBJECT),
                    "assets": {
                        any_asset_name(): {
                            "href":
                            first_asset_s3_object.url,
                            "file:checksum":
                            sha256_hex_digest_to_multihash(
                                sha256(first_asset_contents).hexdigest()),
                        },
                    },
                    "links": [
                        {
                            "href": catalog_metadata_url,
                            "rel": "root"
                        },
                        {
                            "href": item_metadata_url,
                            "rel": "self"
                        },
                    ],
                }),
                bucket_name=self.staging_bucket_name,
                key=f"{key_prefix}/{item_metadata_filename}",
        ), Dataset() as dataset:

            # When
            try:
                resp = lambda_client.invoke(
                    FunctionName=ResourceName.DATASET_VERSIONS_ENDPOINT_FUNCTION_NAME.value,
                    Payload=json.dumps({
                        "httpMethod": "POST",
                        "body": {
                            "id": dataset.dataset_id,
                            "metadata-url": catalog_metadata_file.url,
                        },
                    }).encode(),
                )
                json_resp = json.load(resp["Payload"])

                with subtests.test(
                        msg="Dataset Versions endpoint returns success"):
                    assert json_resp.get(
                        "statusCode") == HTTPStatus.CREATED, json_resp

                with subtests.test(
                        msg="Should complete Step Function successfully"):

                    LOGGER.info("Executed State Machine: %s", json_resp)

                    # Then poll for State Machine State
                    while (execution :=
                           step_functions_client.describe_execution(
                               executionArn=json_resp["body"]["execution_arn"])
                           )["status"] == "RUNNING":
                        LOGGER.info("Polling for State Machine state %s",
                                    "." * 6)
                        time.sleep(5)

                    assert execution["status"] == "SUCCEEDED", execution

                assert (execution_output := execution.get("output")), execution

                account_id = sts_client.get_caller_identity()["Account"]

                import_dataset_response = json.loads(
                    execution_output)["import_dataset"]
                metadata_copy_job_result, asset_copy_job_result = wait_for_copy_jobs(
                    import_dataset_response, account_id, s3_control_client,
                    subtests)
            finally:
                pass  # cleanup body truncated in the original excerpt
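
Both test methods above poll the Step Functions execution with the same pattern: call describe_execution until the status leaves RUNNING, then assert on the result. A standalone sketch of that loop, assuming boto3; the function name wait_for_execution is introduced here for illustration:

import time
from typing import Any, Dict

import boto3


def wait_for_execution(execution_arn: str, poll_seconds: int = 5) -> Dict[str, Any]:
    """Block until the given Step Functions execution stops RUNNING, then
    return the final describe_execution response."""
    step_functions_client = boto3.client("stepfunctions")
    while (
        execution := step_functions_client.describe_execution(
            executionArn=execution_arn
        )
    )["status"] == "RUNNING":
        time.sleep(poll_seconds)
    return execution


# Example use, mirroring the assertions above:
#     execution = wait_for_execution(json_resp["body"]["execution_arn"])
#     assert execution["status"] == "SUCCEEDED", execution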