Code Example #1
def test_config_from_lambda_event_value_error():
    value_error_event_1 = deepcopy(event_pipeline_lambda_event)
    value_error_event_1["payload"]["step_data"] = {
        "input_events": [{"foo": "bar"}, {"foo": "car"}],
        "s3_input_prefixes": {"input1": "some-s3-prefix"},
        "status": "PENDING",
        "errors": [],
    }

    with pytest.raises(ValueError) as e1:
        Config.from_lambda_event(value_error_event_1)

    assert (
        str(e1.value)
        == "Can only set values for one of 's3_input_prefixes' or 'input_events'"
    )

    value_error_event_2 = deepcopy(event_pipeline_lambda_event)
    value_error_event_2["payload"]["step_data"] = {
        "status": "PENDING",
        "errors": [],
    }

    with pytest.raises(ValueError) as e2:
        Config.from_lambda_event(value_error_event_2)

    assert (
        str(e2.value)
        == "Either 's3_input_prefixes' or 'input_events' must be assigned a value"
    )
Code Example #2
def xls_to_csv(event, context):
    config = Config.from_lambda_event(event)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data

    input_prefixes = step_data.s3_input_prefixes
    if step_data.input_count < 1:
        raise ValueError("No input dataset prefix defined")
    if step_data.input_count > 1:
        raise ValueError(f"Too many dataset inputs: {input_prefixes}")

    input_dataset = list(input_prefixes)[0]
    input_prefix = input_prefixes[input_dataset]
    output_prefix = (
        output_dataset.s3_prefix.replace("%stage%", "intermediate") +
        config.task + "/")
    table_config = TableConfig(config.task_config)

    response = s3_client.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)

    for content in response["Contents"]:
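        # Derive the output CSV key for this object by stripping the input
        # prefix and the .xls/.xlsx extension, then convert it.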
        xlsInput = content["Key"]

        filename = xlsInput[len(input_prefix):]
        filename_prefix = filename[0:filename.lower().rfind(".xls")]

        output = output_prefix + filename_prefix + ".csv"

        convert_to_csv(xlsInput, output, table_config)

    config.payload.step_data.s3_input_prefixes = {
        output_dataset.id: output_prefix
    }
    config.payload.step_data.status = "OK"
    return asdict(config.payload.step_data)
Code Example #3
def test_config_immutable():
    config = Config.from_lambda_event(event_pipeline_lambda_event)
    with pytest.raises(FrozenInstanceError):
        config.execution_name = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.output_dataset.version = "bleh"
    with pytest.raises(FrozenInstanceError):
        config.payload.step_data = StepData("", [], {"foo": "bar"})
    config.payload.step_data.s3_input_prefixes = {"Mutable": "ok"}
Code Example #4
def validate_csv(event, context):
    config = Config.from_lambda_event(event)

    step_config = StepConfig.from_task_config(config.task_config)

    s3_prefix = config.payload.output_dataset.s3_prefix

    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=s3_prefix,
    )

    if not step_config.schema:
        log_add(notice="No Schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation done optionally - we now return ok if we don't supply a
        # schema for the validation step
        return asdict(config.payload.step_data)

    input_prefix = next(
        iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)
    objects = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)

    s3_path = next(iter(objects["Contents"]))["Key"]
    log_add(s3_input_path=s3_path)

    response = s3.get_object(Bucket=BUCKET, Key=s3_path)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )
    header = None
    if step_config.header_row:
        header = next(reader)
    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate(csv_data)

    if validation_errors:
        return _with_error(config, errors=validation_errors)

    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)
Code Example #5
def invoke_lambda(event, context):
    config = Config.from_lambda_event(event)
    function_arn = config.payload.pipeline.task_config.get(
        config.task).get("arn")

    log_add(function_arn=function_arn)
    log_add(event=event)

    response = lambda_client.invoke(
        FunctionName=function_arn,
        Payload=json.dumps(event),
        InvocationType="RequestResponse",
    )
    result = read_result(response)
    return result
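
Note: `read_result` is defined elsewhere in the project and is not shown in this excerpt. A minimal sketch of what such a helper might do, assuming it only decodes the payload of boto3's `invoke` response (the real helper may also inspect `FunctionError` and the status code):

import json

def read_result(response):
    # Hypothetical sketch: boto3's lambda invoke() returns the invoked
    # function's result as a StreamingBody under "Payload"; decode it.
    return json.loads(response["Payload"].read())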
Code Example #6
def test_config_from_event_pipeline_lambda_event():

    config = Config.from_lambda_event(event_pipeline_lambda_event)

    assert config.execution_name == "test_execution"
    assert config.task == "kinesis_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"kinesis_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(id="some-id", version="1")
    assert config.payload.step_data == StepData(
        input_events=[{"foo": "bar"}, {"foo": "car"}],
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 2
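
The `event_pipeline_lambda_event` fixture itself is not shown in these excerpts. Below is a minimal sketch of what it plausibly looks like, reconstructed from the assertions in Code Example #6 and the "payload"/"step_data" nesting used in Code Example #1; the remaining key names are assumptions for illustration, not taken from the okdata-pipeline sources.

# Hypothetical reconstruction of the fixture; key names outside
# "payload"/"step_data" are assumptions.
event_pipeline_lambda_event = {
    "execution_name": "test_execution",
    "task": "kinesis_writer",
    "payload": {
        "pipeline": {
            "id": "some-id",
            "task_config": {"kinesis_writer": {"some_config": "some_value"}},
        },
        "output_dataset": {"id": "some-id", "version": "1"},
        "step_data": {
            "input_events": [{"foo": "bar"}, {"foo": "car"}],
            "status": "PENDING",
            "errors": [],
        },
    },
}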
Code Example #7
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data

    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            ))

    input_data = resolve_input_data(step_data)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate_list(input_data)

    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            ))

    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        ))
Code Example #8
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)

    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)

    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]

    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)

    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)

    return asdict(StepData(input_events=input_events, status="OK", errors=[]))
Code Example #9
def test_config_from_s3_pipeline_lambda_event():
    config = Config.from_lambda_event(s3_pipeline_lambda_event)

    assert config.execution_name == "test_execution"
    assert config.task == "s3_writer"
    assert config.payload.pipeline == Pipeline(
        id="some-id",
        task_config={"s3_writer": {"some_config": "some_value"}},
    )
    assert config.payload.output_dataset == OutputDataset(
        id="some-id", version="1", edition="some-edition", s3_prefix="some-s3-prefix"
    )
    assert config.payload.step_data == StepData(
        s3_input_prefixes={
            "input1": "some-s3-prefix",
            "input2": "some-s3-prefix",
            "input3": "some-s3-prefix",
        },
        status="PENDING",
        errors=[],
    )
    assert config.payload.step_data.input_count == 3
Code Example #10
def test_handle_null_task_config_child():
    config = Config.from_lambda_event(event_with_null_task_config_child)

    assert config.task_config["some_config"] == "some value"
Code Example #11
def test_handle_missing_task_config():
    config = Config.from_lambda_event(event_with_missing_task_config)

    assert config.task_config["some_config"] == "some value"
Code Example #12
def test_override_default_task_config():
    config = Config.from_lambda_event(event_overriding_default_task_config)

    assert config.task_config["some_config"] == "some other value"
Code Example #13
def test_default_task_config():
    config = Config.from_lambda_event(event_with_default_task_config)

    assert config.task_config["some_config"] == "some value"
Code Example #14
def test_config_types():
    config = Config.from_lambda_event(event_pipeline_lambda_event)
    assert isinstance(config.payload, Payload)
    assert isinstance(config.payload.pipeline, Pipeline)
    assert isinstance(config.payload.step_data, StepData)
    assert isinstance(config.payload.output_dataset, OutputDataset)
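
Taken together with Code Example #3 (frozen Config, Payload, Pipeline and OutputDataset, but mutable StepData), the model classes could be declared roughly as sketched below. This is an illustrative reconstruction consistent with the tests, not the actual okdata-pipeline definitions: field order and defaults are assumptions, the mutual-exclusion validation tested in Code Example #1 is omitted, and so is Config.from_lambda_event.

from dataclasses import dataclass
from typing import Optional

# Hypothetical sketch of the pipeline model classes, inferred from the tests.

@dataclass
class StepData:
    # Deliberately not frozen: handlers update status and prefixes in place
    # (see Code Example #3).
    status: str
    errors: list
    s3_input_prefixes: Optional[dict] = None
    input_events: Optional[list] = None

    @property
    def input_count(self):
        if self.s3_input_prefixes is not None:
            return len(self.s3_input_prefixes)
        return len(self.input_events or [])

@dataclass(frozen=True)
class Pipeline:
    id: str
    task_config: Optional[dict] = None

@dataclass(frozen=True)
class OutputDataset:
    id: str
    version: str
    edition: Optional[str] = None
    s3_prefix: Optional[str] = None

@dataclass(frozen=True)
class Payload:
    pipeline: Pipeline
    output_dataset: OutputDataset
    step_data: StepData

@dataclass(frozen=True)
class Config:
    execution_name: str
    task: str
    payload: Payload

    @property
    def task_config(self):
        # Per-task section of the pipeline's task_config; the default/override
        # handling exercised in Code Examples #10-#13 is omitted here.
        return (self.payload.pipeline.task_config or {}).get(self.task, {})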
Code Example #15
File: base.py  Project: oslokommune/okdata-pipeline
    def __init__(self, event):
        self.s3 = boto3.client("s3")
        self.s3fs_prefix = f"s3://{BUCKET}/"
        self.config = Config.from_lambda_event(event)
        self.task_config = TaskConfig.from_config(self.config)
        log_add(input_config=asdict(self.task_config))
Code Example #16
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
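    # Substitute the configured output stage (e.g. "intermediate" or
    # "processed") for the "%stage%" placeholder in the dataset's S3 prefix.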
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage)

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files,
                                             content_type)
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
            output_dataset.id, output_dataset.version, output_dataset.edition):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes,
                        status="OK",
                        errors=[])

    # TODO: this is just to verify that we have a correct implementation of the status API
    # temporary - if we are in /latest write -> set run to complete
    # Once we get this up and see what the status-api can return to the CLI we will update with more information
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)