def dataset_file(request):
    bucket_name = "somebucket"
    with mock_s3():
        client = boto3.client("s3", region_name=os.environ.get("AWS_REGION"))
        client.create_bucket(Bucket=bucket_name)
        client.put_object(
            Bucket=bucket_name,
            Key=f"train/{request.param}",
            Body=f"contents={request.param}",
        )

        dsf = DatasetFile(request.param, bucket_name)
        dsf.cli = client
        yield dsf
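# A minimal sketch of how a test might consume the fixture above, assuming it
# is registered as a pytest fixture: request.param is supplied via indirect
# parametrization, and the key name here is illustrative.
@pytest.mark.parametrize("dataset_file", ["RetailDemandTRM.csv"], indirect=True)
def test_dataset_file_fixture(dataset_file):
    assert dataset_file.bucket == "somebucket"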
def createdatasetgroup(event, context) -> Tuple[Status, List[str]]:
    """
    Create/monitor Amazon Forecast dataset group creation

    :param event: lambda event
    :param context: lambda context
    :return: dataset group status and the names of all dataset groups
    """
    config = Config.from_sfn(event)
    dataset_file = DatasetFile(event.get("dataset_file"), event.get("bucket"))
    dataset_groups = config.dataset_groups(dataset_file)
    datasets = config.datasets(dataset_file)

    # dataset group creation returns an ARN immediately; creation of all dependent dataset groups is safe
    # dataset group update returns immediately; update of all dependent dataset groups is safe
    for dataset_group in dataset_groups:
        if dataset_group.status == Status.DOES_NOT_EXIST:
            dataset_group.create()

        if dataset_group.status != Status.ACTIVE:
            raise ValueError(
                f"Dataset group {dataset_group.dataset_group_name} is {dataset_group.status}, expected ACTIVE"
            )

        dataset_group.update(datasets, dataset_file)

    # at this point, we are guaranteed that all dataset groups are active (or an error was thrown)
    return (
        Status.ACTIVE,
        [dataset_group.dataset_group_name for dataset_group in dataset_groups],
    )
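# A hedged sketch of the Step Functions input this handler reads; the two keys
# below come straight from the code above, while the values (and any extra
# configuration Config.from_sfn() may expect in the event) are assumptions.
event = {
    "dataset_file": "train/RetailDemandTRM.csv",
    "bucket": "forecast-data-bucket",
}
status, dataset_group_names = createdatasetgroup(event, context=None)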
def build_message(event):
    """
    Build a message for SNS to publish

    :param event: the lambda event containing the message
    :return: the message to publish
    """
    message = ""
    error = None
    file = DatasetFile(event.get("dataset_file"), event.get("bucket"))

    if "statesError" in event.keys():
        logger.info("State error message encountered")
        message += f"There was an error running the forecast for {file.prefix}\n\n"
        error = event.get("statesError")

    if "serviceError" in event.keys():
        logger.info("Service error message encountered")
        message += (
            f"There was a service error running the forecast for {file.prefix}\n\n"
        )
        error = event.get("serviceError")

    if error:
        error_type = error.get("Error", "Unknown")
        error_cause = json.loads(error.get("Cause", "{}"))
        error_message = error_cause.get("errorMessage")
        stack_trace = error_cause.get("stackTrace")

        message += f"Message: {error_message}\n\n"
        message += f"Details: (caught {error_type})\n\n"
        if stack_trace:
            message += "\n".join(stack_trace)
    else:
        message = f"Forecast for {file.prefix} is ready!"

    return message
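# Illustrative input for build_message(): the top-level keys match the ones the
# function inspects, and the "serviceError" payload mirrors a caught Step
# Functions error (all values here are made up for the example).
event = {
    "dataset_file": "train/RetailDemandTRM.csv",
    "bucket": "forecast-data-bucket",
    "serviceError": {
        "Error": "ResourceNotFoundException",
        "Cause": json.dumps(
            {"errorMessage": "predictor not found", "stackTrace": ["frame 1"]}
        ),
    },
}
message = build_message(event)  # "There was a service error running the forecast ..."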
def test_status_still_good(forecast_stub, configuration_data, expected_dataset_arns):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file)
    predictor.cli = forecast_stub.client

    forecast_stub.add_response(
        "list_predictors",
        {
            "Predictors": [
                {"PredictorArn": "arn:", "CreationTime": datetime.now(timezone.utc)}
            ]
        },
    )
    forecast_stub.add_response(
        "describe_dataset_group", {"DatasetArns": expected_dataset_arns}
    )
    for arn in expected_dataset_arns:
        forecast_stub.add_response(
            "describe_dataset", {"Status": "ACTIVE", "DatasetArn": arn}
        )
    forecast_stub.add_response(
        "describe_predictor",
        {"CreationTime": datetime.now(timezone.utc), "Status": "ACTIVE"},
    )

    assert predictor.status == Status.ACTIVE
def test_config_valid(configuration_data):
    config = Config()
    config.config = configuration_data
    dataset_file = DatasetFile("some_new_key.csv", "some_bucket")

    errors = config.validate()
    assert not errors
def test_predictor_history(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file, "RetailDemandTNPTS")
    predictor.cli = forecast_stub.client

    forecast_stub.add_response(
        "list_predictors",
        {
            "Predictors": [
                {
                    "CreationTime": datetime(2015, 1, 1),
                    "PredictorArn": "arn:2015-1-1",
                    "Status": "ACTIVE",
                },
                {
                    "CreationTime": datetime(2017, 1, 1),
                    "PredictorArn": "arn:2017-1-1",
                    "Status": "CREATE_IN_PROGRESS",
                },
            ]
        },
    )

    history = predictor.history()
    assert history[0].get("CreationTime") == datetime(2017, 1, 1)
    assert history[1].get("CreationTime") == datetime(2015, 1, 1)
def validate(self, event: dict):
    record = next(iter(event.get("Records", [{}])))
    if not record:
        raise RecordNotFound

    # Make sure this event version is supported
    event_version = record.get("eventVersion")
    if version.parse(event_version).major != S3_EVENT_STRUCTURE_MAJOR:
        raise RecordNotSupported(
            f"The event version {event_version} is not supported by this solution."
        )

    # Make sure there's a bucket in the event structure
    bucket = record.get("s3", {}).get("bucket", {}).get("name")
    if not bucket:
        raise BucketNotFound

    # Make sure there's a key in the event structure
    key = record.get("s3", {}).get("object", {}).get("key")
    if not key:
        raise KeyNotFound

    # The name of the event is the stem of the file without extensions
    file = DatasetFile(key=key, bucket=bucket)

    return bucket, key, file
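# A minimal S3 notification event that would pass validate(), assuming
# S3_EVENT_STRUCTURE_MAJOR is 2 (the standard Amazon S3 event structure);
# the bucket and key values are illustrative.
sample_event = {
    "Records": [
        {
            "eventVersion": "2.1",
            "s3": {
                "bucket": {"name": "forecast-data-bucket"},
                "object": {"key": "train/RetailDemandTRM.csv"},
            },
        }
    ]
}
# validate(sample_event) -> ("forecast-data-bucket", "train/RetailDemandTRM.csv", DatasetFile(...))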
def test_config_dependent_dataset_dependencies(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("DatasetsFromRetailDemandTRMProphet", "some_bucket")
    datasets = config.datasets(dataset_file)
def test_dataset_default(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("some_new_key.csv", "some_bucket")
    ds = config.dataset(dataset_file)

    assert ds.data_frequency == "D"
    assert ds.dataset_type == DatasetType.TARGET_TIME_SERIES
    assert ds.dataset_domain == DatasetDomain.RETAIL
    assert ds.dataset_name == "some_new_key"
    assert ds.dataset_schema == {
        "Attributes": [
            {"AttributeName": "item_id", "AttributeType": "string"},
            {"AttributeName": "timestamp", "AttributeType": "timestamp"},
            {"AttributeName": "demand", "AttributeType": "float"},
        ]
    }
def test_config_required_datasets(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("some_new_key.csv", "some_bucket")
    assert config.required_datasets(dataset_file) == ["TARGET_TIME_SERIES"]
def test_forecast_history(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")
    forecast.cli = forecast_stub.client

    forecast_stub.add_response(
        "list_forecasts",
        {
            "Forecasts": [
                {
                    "LastModificationTime": datetime(2015, 1, 1),
                    "ForecastArn": "arn:2015-1-1",
                    "Status": "ACTIVE",
                },
                {
                    "LastModificationTime": datetime(2017, 1, 1),
                    "ForecastArn": "arn:2017-1-1",
                    "Status": "CREATE_IN_PROGRESS",
                },
            ]
        },
    )

    history = forecast.history()
    assert history[0].get("LastModificationTime") == datetime(2017, 1, 1)
    assert history[1].get("LastModificationTime") == datetime(2015, 1, 1)
def test_forecast_arn(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")
    forecast.cli = forecast_stub.client

    forecast_stub.add_response(
        "list_forecasts",
        {
            "Forecasts": [
                {
                    "LastModificationTime": datetime(2015, 1, 1),
                    "ForecastArn": "arn:2015-1-1",
                },
                {
                    "LastModificationTime": datetime(2017, 1, 1),
                    "ForecastArn": "arn:2017-1-1",
                },
            ]
        },
    )

    assert forecast.arn == "arn:2017-1-1"
def test_predictor_arn(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file, "RetailDemandTNPTS")
    predictor.cli = forecast_stub.client

    forecast_stub.add_response(
        "list_predictors",
        {
            "Predictors": [
                {
                    "CreationTime": datetime(2015, 1, 1),
                    "PredictorArn": "arn:2015-1-1",
                },
                {
                    "CreationTime": datetime(2017, 1, 1),
                    "PredictorArn": "arn:2017-1-1",
                },
            ]
        },
    )

    assert predictor.arn == "arn:2017-1-1"
def test_dataset_import_job_arn(configuration_data, forecast_stub, mocker):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRM.csv", "some_bucket")
    dataset_import_job = config.dataset_import_job(dataset_file)

    # create some job history
    forecast_stub.add_response(
        "list_dataset_import_jobs",
        {
            "DatasetImportJobs": [
                {
                    "LastModificationTime": datetime(2015, 1, 1),
                    "DatasetImportJobArn": "arn:2015-1-1",
                },
                {
                    "LastModificationTime": datetime(2017, 1, 1),
                    "DatasetImportJobArn": "arn:aws:forecast:abcdefghijkl:us-east-1:dataset-import-job/RetailDemandTRM/RetailDemandTRM_2017_01_01_00_00_00",
                },
                {
                    "LastModificationTime": datetime(2016, 1, 1),
                    "DatasetImportJobArn": "arn:2016-1-1",
                },
            ]
        },
    )
    dataset_import_job.cli = forecast_stub.client

    assert (
        dataset_import_job.arn
        == "arn:aws:forecast:abcdefghijkl:us-east-1:dataset-import-job/RetailDemandTRM/RetailDemandTRM_2017_01_01_00_00_00"
    )
def test_config_dataset_groups(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRMProphet", "some_bucket")
    dsgs = config.dataset_groups(dataset_file)
    assert len(dsgs) == 2
def test_config_required_datasets_override(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("Override.csv", "some_bucket")
    required_datasets = config.required_datasets(dataset_file)
    assert "TARGET_TIME_SERIES" in required_datasets
    assert "RELATED_TIME_SERIES" in required_datasets
    assert "ITEM_METADATA" in required_datasets
def test_config_dependent_dataset_groups(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRMProphet", "some_bucket")
    dependents = config.dependent_dataset_groups(dataset_file)
    assert len(dependents) == 2
    assert "DatasetsFromRetailDemandTRMProphet" in dependents
def test_dataset_group_mismatch(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("Mismatch.csv", "some_bucket")
    with pytest.raises(ValueError) as excinfo:
        config.dataset_group(dataset_file)

    assert "must match" in str(excinfo.value)
def test_duplicate_timeseries(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandDuplicateDatasets.csv", "some_bucket")
    with pytest.raises(ValueError) as excinfo:
        config.required_datasets(dataset_file)

    assert "duplicate dataset types" in str(excinfo.value)
def test_config_predictor_from_dependent(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRMProphet", "some_bucket")
    predictor = config.predictor(dataset_file, "DatasetsFromRetailDemandTRMProphet")
    assert (
        predictor.validator.expected_params["AlgorithmArn"]
        == "arn:aws:forecast:::algorithm/CNN-QR"
    )
def test_missing_timeseries(configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandForgottenDatasets.csv", "some_bucket")
    with pytest.raises(ValueError) as excinfo:
        config.required_datasets(dataset_file)

    assert "you must configure a TARGET_TIME_SERIES dataset" in str(excinfo.value)
def test_status_not_yet_created(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")
    forecast.cli = forecast_stub.client

    forecast_stub.add_response("list_forecasts", {"Forecasts": []})

    assert forecast.status == Status.DOES_NOT_EXIST
    forecast_stub.assert_no_pending_responses()
def test_init_forecast(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    forecast = config.forecast(dataset_file, "RetailDemandTNPTS")
    dataset_group = config.dataset_group(dataset_file)

    assert (
        forecast._dataset_group.dataset_group_name == dataset_group.dataset_group_name
    )
    assert forecast._forecast_config == config.config_item(dataset_file, "Forecast")
def test_dataset_import_timestamp_format_none(configuration_data, forecast_stub):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRM.csv", "some_bucket")
    dataset = config.dataset(dataset_file)

    forecast_stub.add_response("list_dataset_import_jobs", {"DatasetImportJobs": []})
    dataset.cli = forecast_stub.client

    assert dataset.timestamp_format is None
def _copy_dataset(self, source: DatasetFileDataset) -> DatasetFileDataset:
    """
    Athena works against folders of .csv files, but not single .csv files. This
    copies them to a temporary location under the forecast data bucket (under
    /raw) to consume their data properly

    :param source: DatasetFileDataset of source input
    :return: DatasetFileDataset of destination (under 'raw')
    """
    dest = source.dataset_file.copy(
        "raw", self.unique_id, str(source.dataset_file.data_type)
    )
    copied_dataset_file = DatasetFile(key=dest, bucket=source.dataset_file.bucket)
    return DatasetFileDataset(dataset=source.dataset, dataset_file=copied_dataset_file)
def test_init_predictor(forecast_stub, configuration_data):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTNPTS.csv", "some_bucket")
    predictor = config.predictor(dataset_file, "RetailDemandTNPTS")
    predictor.cli = forecast_stub.client

    assert predictor._dataset_file == dataset_file
    for k, v in config.config_item(dataset_file, "Predictor").items():
        if k != "MaxAge":
            assert predictor._predictor_params.get(k) == v
def test_dataset_status_lifecycle(configuration_data, forecast_stub):
    config = Config()
    config.config = configuration_data

    dataset_file = DatasetFile("RetailDemandTRM.csv", "some_bucket")
    dataset = config.dataset(dataset_file)

    # first describe_dataset call fails (not yet created), second succeeds
    forecast_stub.add_client_error("describe_dataset", "ResourceNotFoundException")
    forecast_stub.add_response("describe_dataset", {"Status": "ACTIVE"})
    dataset.cli = forecast_stub.client

    assert dataset.status == Status.DOES_NOT_EXIST
    assert dataset.status == "ACTIVE"
def datasets(self, dataset_file: DatasetFile) -> List[Dataset]:
    """
    Get all datasets that would be referenced by a dataset group.

    :param dataset_file: The dataset file to use
    :return: A list of all datasets that are codependent with dataset_file
    """
    required = self.required_datasets(dataset_file)

    dataset_templates = []
    for data_type in required:
        dataset_file.data_type = DatasetType[data_type]
        ds = self.dataset(dataset_file)
        dataset_templates.append(ds)

    return dataset_templates
def etl_forecast_trm(sfn_configuration_data, s3_valid_files):
    """This represents a single file upload"""
    config = Config.from_sfn(sfn_configuration_data)

    with mock_sts():
        dataset_file = DatasetFile(key="train/RetailDemandTRM.csv", bucket="testbucket")
        forecast = config.forecast(dataset_file, "RetailDemandTRM")
        yield ForecastETL(
            workgroup="primary",
            schema="default",
            config=config,
            dataset_file=dataset_file,
            forecast=forecast,
        )
def _get_datasets(
    self,
) -> Tuple[
    Optional[DatasetFileDataset],
    Optional[DatasetFileDataset],
    Optional[DatasetFileDataset],
]:
    """
    Gets the datasets and dataset files associated with this forecast

    :return: (ts, rts, md)
    """
    datasets = self.config.datasets(self.dataset_file)
    prefix = f"s3://{self.dataset_file.bucket}/train/{self.dataset_file.prefix}"

    ts, rts, md = None, None, None
    for dataset in datasets:
        if dataset.dataset_type == DatasetType.TARGET_TIME_SERIES:
            ts = DatasetFileDataset(
                dataset,
                DatasetFile.from_s3_path(prefix + dataset.dataset_type.suffix),
            )
        elif dataset.dataset_type == DatasetType.RELATED_TIME_SERIES:
            rts = DatasetFileDataset(
                dataset,
                DatasetFile.from_s3_path(prefix + dataset.dataset_type.suffix),
            )
        elif dataset.dataset_type == DatasetType.ITEM_METADATA:
            md = DatasetFileDataset(
                dataset,
                DatasetFile.from_s3_path(prefix + dataset.dataset_type.suffix),
            )

    return (ts, rts, md)