def test_config_from_lambda_event_value_error():
    value_error_event_1 = deepcopy(event_pipeline_lambda_event)
    value_error_event_1["payload"]["step_data"] = {
        "input_events": [{"foo": "bar"}, {"foo": "car"}],
        "s3_input_prefixes": {"input1": "some-s3-prefix"},
        "status": "PENDING",
        "errors": [],
    }
    with pytest.raises(ValueError) as e1:
        Config.from_lambda_event(value_error_event_1)
    assert (
        str(e1.value)
        == "Can only set values for one of 's3_input_prefixes' or 'input_events'"
    )

    value_error_event_2 = deepcopy(event_pipeline_lambda_event)
    value_error_event_2["payload"]["step_data"] = {
        "status": "PENDING",
        "errors": [],
    }
    with pytest.raises(ValueError) as e2:
        Config.from_lambda_event(value_error_event_2)
    assert (
        str(e2.value)
        == "Either 's3_input_prefixes' or 'input_events' must be assigned a value"
    )
def xls_to_csv(event, context):
    config = Config.from_lambda_event(event)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    input_prefixes = step_data.s3_input_prefixes

    if step_data.input_count < 1:
        raise ValueError("No input dataset prefix defined")
    if step_data.input_count > 1:
        raise ValueError(f"Too many dataset inputs: {input_prefixes}")

    input_dataset = list(input_prefixes)[0]
    input_prefix = input_prefixes[input_dataset]
    output_prefix = (
        output_dataset.s3_prefix.replace("%stage%", "intermediate") + config.task + "/"
    )
    table_config = TableConfig(config.task_config)

    # Convert every .xls/.xlsx object under the input prefix to a .csv object
    # under the intermediate output prefix, keeping the base filename.
    response = s3_client.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)
    for content in response["Contents"]:
        xls_input = content["Key"]
        filename = xls_input[len(input_prefix):]
        filename_prefix = filename[0:filename.lower().rfind(".xls")]
        output = output_prefix + filename_prefix + ".csv"
        convert_to_csv(xls_input, output, table_config)

    config.payload.step_data.s3_input_prefixes = {output_dataset.id: output_prefix}
    config.payload.step_data.status = "OK"

    return asdict(config.payload.step_data)
def test_config_immutable():
    config = Config.from_lambda_event(event_pipeline_lambda_event)

    with pytest.raises(FrozenInstanceError):
        config.execution_name = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.output_dataset.version = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.step_data = StepData("", [], {"foo": "bar"})

    # StepData itself is not frozen, so its fields can still be mutated.
    config.payload.step_data.s3_input_prefixes = {"Mutable": "ok"}
def validate_csv(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_task_config(config.task_config)
    s3_prefix = config.payload.output_dataset.s3_prefix
    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=s3_prefix,
    )

    if not step_config.schema:
        log_add(notice="No Schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation is optional; we return OK when no schema is
        # supplied for the validation step.
        return asdict(config.payload.step_data)

    input_prefix = next(iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)
    objects = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)
    s3_path = next(iter(objects["Contents"]))["Key"]
    log_add(s3_input_path=s3_path)

    response = s3.get_object(Bucket=BUCKET, Key=s3_path)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )

    header = None
    if step_config.header_row:
        header = next(reader)

    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)

    validation_errors = JsonSchemaValidator(step_config.schema).validate(csv_data)
    if validation_errors:
        return _with_error(config, errors=validation_errors)

    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)
def invoke_lambda(event, context):
    config = Config.from_lambda_event(event)
    function_arn = config.payload.pipeline.task_config.get(config.task).get("arn")
    log_add(function_arn=function_arn)
    log_add(event=event)
    response = lambda_client.invoke(
        FunctionName=function_arn,
        Payload=json.dumps(event),
        InvocationType="RequestResponse",
    )
    result = read_result(response)
    return result
def test_config_from_event_pipeline_lambda_event():
    config = Config.from_lambda_event(event_pipeline_lambda_event)
    assert config.execution_name == "test_execution"
    assert config.task == "kinesis_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"kinesis_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(id="some-id", version="1")
    assert config.payload.step_data == StepData(
        input_events=[{"foo": "bar"}, {"foo": "car"}],
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 2
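# A minimal sketch of what an event like `event_pipeline_lambda_event` could
# look like, reconstructed from the assertions in the test above. The exact
# top-level key names ("execution_name", "task", "payload", ...) are
# assumptions about the shape Config.from_lambda_event expects, not the
# actual fixture; the variable name below is hypothetical.
example_event_pipeline_lambda_event = {
    "execution_name": "test_execution",
    "task": "kinesis_writer",
    "payload": {
        "pipeline": {
            "id": "some-id",
            "task_config": {"kinesis_writer": {"some_config": "some_value"}},
        },
        "output_dataset": {"id": "some-id", "version": "1"},
        "step_data": {
            "input_events": [{"foo": "bar"}, {"foo": "car"}],
            "status": "PENDING",
            "errors": [],
        },
    },
}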
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data
    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)

    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            )
        )

    input_data = resolve_input_data(step_data)
    validation_errors = JsonSchemaValidator(step_config.schema).validate_list(
        input_data
    )

    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            )
        )

    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        )
    )
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)
    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)

    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]

    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)

    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)

    return asdict(StepData(input_events=input_events, status="OK", errors=[]))
def test_config_from_s3_pipeline_lambda_event():
    config = Config.from_lambda_event(s3_pipeline_lambda_event)
    assert config.execution_name == "test_execution"
    assert config.task == "s3_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"s3_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(
        id="some-id", version="1", edition="some-edition", s3_prefix="some-s3-prefix"
    )
    assert config.payload.step_data == StepData(
        s3_input_prefixes={
            "input1": "some-s3-prefix",
            "input2": "some-s3-prefix",
            "input3": "some-s3-prefix",
        },
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 3
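# A companion sketch for the S3 variant: step_data carries "s3_input_prefixes"
# instead of "input_events" (the two are mutually exclusive, as asserted in
# test_config_from_lambda_event_value_error). The key layout mirrors the sketch
# above and is an assumption, not the actual fixture.
example_s3_pipeline_lambda_event = {
    "execution_name": "test_execution",
    "task": "s3_writer",
    "payload": {
        "pipeline": {
            "id": "some-id",
            "task_config": {"s3_writer": {"some_config": "some_value"}},
        },
        "output_dataset": {
            "id": "some-id",
            "version": "1",
            "edition": "some-edition",
            "s3_prefix": "some-s3-prefix",
        },
        "step_data": {
            "s3_input_prefixes": {
                "input1": "some-s3-prefix",
                "input2": "some-s3-prefix",
                "input3": "some-s3-prefix",
            },
            "status": "PENDING",
            "errors": [],
        },
    },
}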
def test_handle_null_task_config_child():
    config = Config.from_lambda_event(event_with_null_task_config_child)
    assert config.task_config["some_config"] == "some value"
def test_handle_missing_task_config():
    config = Config.from_lambda_event(event_with_missing_task_config)
    assert config.task_config["some_config"] == "some value"
def test_override_default_task_config():
    config = Config.from_lambda_event(event_overriding_default_task_config)
    assert config.task_config["some_config"] == "some other value"
def test_default_task_config():
    config = Config.from_lambda_event(event_with_default_task_config)
    assert config.task_config["some_config"] == "some value"
def test_config_types():
    config = Config.from_lambda_event(event_pipeline_lambda_event)
    assert isinstance(config.payload, Payload)
    assert isinstance(config.payload.pipeline, Pipeline)
    assert isinstance(config.payload.step_data, StepData)
    assert isinstance(config.payload.output_dataset, OutputDataset)
def __init__(self, event):
    self.s3 = boto3.client("s3")
    self.s3fs_prefix = f"s3://{BUCKET}/"
    self.config = Config.from_lambda_event(event)
    self.task_config = TaskConfig.from_config(self.config)
    log_add(input_config=asdict(self.task_config))
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ", step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage
    )

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files, content_type)
        except Exception as e:
            # Roll back the copied objects if the distribution could not be created.
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
        output_dataset.id, output_dataset.version, output_dataset.edition
    ):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes, status="OK", errors=[])

    # TODO: This is just to verify that we have a correct implementation of the
    # status API. Temporary: if we write to /latest, the run is set to complete.
    # Once this is up and we see what the status API can return to the CLI, we
    # will update it with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)

    return asdict(response)