Example #1
    def _get_metadata(self, url):
        response = self._get(url)
        if response.status_code != 200:
            log_add(metadata_error_code=response.status_code, metadata_url=url)
            response.raise_for_status()
        data = response.json()
        return data
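All of the snippets on this page call a log_add helper to attach structured key/value pairs to the log output of the current invocation. The sketch below is a minimal, hypothetical stand-in for such a helper, shown only to illustrate the call pattern; it is not the real helper's implementation or API.

# Hypothetical stand-in: accumulate keyword arguments so they can be emitted
# together with the final log record of the invocation.
_log_context = {}


def log_add(**kwargs):
    _log_context.update(kwargs)


log_add(metadata_url="https://example.com/metadata", metadata_error_code=503)
print(_log_context)
# {'metadata_url': 'https://example.com/metadata', 'metadata_error_code': 503}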
Example #2
def write_to_kinesis(events, stream_name):
    records = [{
        "Data": json.dumps(event) + "\n",
        "PartitionKey": str(uuid.uuid4())
    } for event in events]
    log_add(num_records=len(records))
    kinesis_client.put_records(StreamName=stream_name, Records=records)
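A note on the call above: the Kinesis PutRecords API accepts at most 500 records per request, so for larger event lists a chunked variant along the following lines could be used. The function name and batch_size parameter are illustrative, not part of the original code.

import json
import uuid

import boto3

kinesis_client = boto3.client("kinesis")


def write_to_kinesis_batched(events, stream_name, batch_size=500):
    records = [
        {"Data": json.dumps(event) + "\n", "PartitionKey": str(uuid.uuid4())}
        for event in events
    ]
    # PutRecords accepts at most 500 records per call, so send in chunks.
    for i in range(0, len(records), batch_size):
        kinesis_client.put_records(
            StreamName=stream_name, Records=records[i : i + batch_size]
        )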
Example #3
def say_hello(event, context):
    log_add(relevant_information="Hello from Python blueprint!")

    return {
        "statusCode": 200,
        "headers": {},
        "body": json.dumps({"hello": "world!"}),
    }
Example #4
    def validate(self, data):
        raw_errors = self.validator.iter_errors(data)
        log_add(raw_errors=raw_errors)
        errors = []
        for e in raw_errors:
            error = {"message": e.message, "row": "root"}
            path_len = len(e.path)
            if path_len > 0:
                error["row"] = e.path[0]
                if path_len > 1:
                    error["col"] = e.path[1]
            errors.append(error)

        return errors
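The validate method above iterates over errors produced by a JSON Schema validator; each error carries a message and a path into the validated document, which the method maps to "row" and "col". A small self-contained sketch with the jsonschema package, where the schema and data are illustrative assumptions:

import jsonschema

# Illustrative schema: a list of rows, each row a list of integers.
schema = {
    "type": "array",
    "items": {"type": "array", "items": {"type": "integer"}},
}

validator = jsonschema.Draft7Validator(schema)

for e in validator.iter_errors([[1, 2], [3, "four"]]):
    # e.path is a deque of indices/keys leading to the offending element,
    # corresponding to the "row" and "col" fields built above.
    print(e.message, list(e.path))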
Example #5
    def resolve_s3_sources(self, source_prefix: str):
        source_objects = self.list_objects_contents(source_prefix)
        log_add(num_source_objects=len(source_objects))
        if not source_objects:
            raise Exception(f"No source files found at: {source_prefix}")

        s3_sources = []

        for obj in source_objects:
            source_key = obj["Key"]
            filename = source_key.split("/")[-1]
            s3_sources.append(S3Source(filename=filename, key=source_key))

        return s3_sources
Example #6
def invoke_lambda(event, context):
    config = Config.from_lambda_event(event)
    function_arn = config.payload.pipeline.task_config.get(
        config.task).get("arn")

    log_add(function_arn=function_arn)
    log_add(event=event)

    response = lambda_client.invoke(
        FunctionName=function_arn,
        Payload=json.dumps(event),
        InvocationType="RequestResponse",
    )
    result = read_result(response)
    return result
Example #7
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data

    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            ))

    input_data = resolve_input_data(step_data)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate_list(input_data)

    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            ))

    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        ))
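Several handlers on this page return asdict(StepData(...)). The real StepData definition is not shown here, so the sketch below is only an assumption of its shape, inferred from how it is used in these examples:

from dataclasses import asdict, dataclass
from typing import Dict, List


@dataclass
class StepData:
    # Assumed fields, inferred from the calls in these examples; the actual
    # definition may differ.
    status: str
    errors: List[dict]
    input_events: List[dict] = None
    s3_input_prefixes: Dict[str, str] = None

    @property
    def input_count(self):
        return len(self.input_events or self.s3_input_prefixes or [])


result = asdict(StepData(status="VALIDATION_SUCCESS", errors=[], input_events=[]))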
Example #8
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)

    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)

    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]

    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)

    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)

    return asdict(StepData(input_events=input_events, status="OK", errors=[]))
Example #9
    def delete_from_prefix(self, s3_prefix):
        objects_to_delete = [
            {"Key": obj["Key"]} for obj in self.list_objects_contents(s3_prefix)
        ]

        if not objects_to_delete:
            return

        self.client.delete_objects(
            Bucket=self.bucket,
            Delete={
                "Objects": [
                    {"Key": s3_object["Key"]}
                    for s3_object in objects_to_delete
                ],
                "Quiet": True,
            },
        )
        log_add(deleted_from_s3_path=objects_to_delete)
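One caveat about delete_objects: the S3 DeleteObjects API takes at most 1000 keys per request, so very large prefixes would need chunking. A hedged sketch (delete_keys and chunk_size are illustrative names):

import boto3

s3 = boto3.client("s3")


def delete_keys(bucket, keys, chunk_size=1000):
    # DeleteObjects accepts at most 1000 keys per request; delete in chunks.
    for i in range(0, len(keys), chunk_size):
        s3.delete_objects(
            Bucket=bucket,
            Delete={
                "Objects": [{"Key": key} for key in keys[i : i + chunk_size]],
                "Quiet": True,
            },
        )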
Example #10
    def read_csv(self):
        s3_objects = self._list_s3_objects()
        schema = self.task_config.schema
        log_add(schema=schema)
        log_add(s3_keys=[obj["Key"] for obj in s3_objects])
        files = []
        for s3_object in s3_objects:
            key = self.s3fs_prefix + s3_object["Key"]
            dtype = Exporter.get_dtype(schema, key)
            df = self._read_csv_data(
                key,
                delimiter=self.task_config.delimiter,
                chunksize=self.task_config.chunksize,
                dtype=dtype,
            )
            filename = key.split("/")[-1]
            filename = Exporter.remove_suffix(filename)
            files.append((filename, df))
        return files
Example #11
def validate_csv(event, context):
    config = Config.from_lambda_event(event)

    step_config = StepConfig.from_task_config(config.task_config)

    s3_prefix = config.payload.output_dataset.s3_prefix

    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=s3_prefix,
    )

    if not step_config.schema:
        log_add(notice="No Schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation is now optional - we return OK if no schema is
        # supplied for the validation step
        return asdict(config.payload.step_data)

    input_prefix = next(
        iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)
    objects = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)

    s3_path = next(iter(objects["Contents"]))["Key"]
    log_add(s3_input_path=s3_path)

    response = s3.get_object(Bucket=BUCKET, Key=s3_path)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )
    header = None
    if step_config.header_row:
        header = next(reader)
    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate(csv_data)

    if validation_errors:
        return _with_error(config, errors=validation_errors)

    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)
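The reader setup in validate_csv consumes the header row first when header_row is set and only then parses the data rows. A minimal, self-contained illustration of that pattern with the standard csv module; the sample data and delimiter are made up:

import csv
import io

raw = "id;name\n1;foo\n2;bar\n"
reader = csv.reader(io.StringIO(raw), dialect="unix", delimiter=";", quotechar='"')

header = next(reader)  # ["id", "name"] -- consumed before the data rows
rows = list(reader)    # [["1", "foo"], ["2", "bar"]]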
Example #12
    def export(self):
        inputs = self.read_csv()
        s3_prefix = self.s3_prefix()
        outputs = []
        schema = self.task_config.schema
        errors = []
        try:
            for filename, source in inputs:
                out_prefix = f"s3://{BUCKET}/{s3_prefix}{filename}"
                if self.task_config.chunksize:
                    outputs.extend(
                        self._parallel_export(filename, source, schema,
                                              out_prefix))
                else:
                    outputs.append(self._export(source, schema, out_prefix))
        except OutOfBoundsDatetime as e:
            errors.append({"error": "OutOfBoundsDatetime", "message": str(e)})
        except ValueError as e:
            errors.append({"error": "ValueError", "message": str(e)})

        if len(errors) > 0:
            log_add(errors=errors)
            return asdict(
                StepData(
                    status="CONVERSION_FAILED",
                    errors=errors,
                    s3_input_prefixes={
                        self.config.payload.output_dataset.id: s3_prefix
                    },
                ))

        log_add(parquetfiles=outputs)
        return asdict(
            StepData(
                status="CONVERSION_SUCCESS",
                errors=[],
                s3_input_prefixes={
                    self.config.payload.output_dataset.id: s3_prefix
                },
            ))
Example #13
def generate_signed_url_public(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient()

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(500, "Could not complete request, please try again later")

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    if dataset["accessRights"] != "public":
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))
Example #14
def generate_signed_urls(bucket, dataset, edition):
    access_rights = dataset["accessRights"]
    dataset_id, version, edition_id = edition["Id"].split("/")
    confidentiality = CONFIDENTIALITY_MAP[access_rights]
    common_prefix = f"processed/{confidentiality}/"
    parent_id = dataset.get("parent_id")

    dataset_prefix = f"{dataset_id}/version={version}/edition={edition_id}/"
    if parent_id:
        dataset_prefix = f"{parent_id}/{dataset_prefix}"

    prefix = common_prefix + dataset_prefix

    log_add(
        dataset_access_rights=access_rights,
        s3_bucket=bucket,
        s3_prefix=prefix,
    )

    session = boto3.Session()
    s3 = session.client("s3")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

    signed_urls = [
        {
            "key": obj["Key"],
            "url": s3.generate_presigned_url(
                "get_object",
                Params={"Bucket": bucket, "Key": obj["Key"]},
                ExpiresIn=60 * 5,
            ),
        }
        for obj in resp["Contents"]
    ]
    return signed_urls
Example #15
def generate_signed_url(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient.with_access_token_from_event(event)

    if not client:
        return error_response(403, "Forbidden")

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(500, "Could not complete request, please try again later")

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    # Only users with read access can download non-public datasets.
    if (
        dataset["accessRights"] != "public"
        and ENABLE_AUTH
        and not resource_authorizer.has_access(
            client.access_token,
            scope="okdata:dataset:read",
            resource_name=f"okdata:dataset:{dataset_id}",
        )
    ):
        log_add(has_access=False)
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))
Example #16
    def get_dtype(schema, input):
        """
        Try to resolve the dtypes for the columns when reading a CSV file.

        Resolve the dtype from taskConfig.TASK_NAME.schema; if this is not
        available, we read the first line (column headers) of the file that is
        about to be read by pandas and set each column to type object (the
        default).
        """
        log_add(dtype_source="jsonschema")
        dtype = Exporter.jsonschema_to_dtypes(schema)
        if dtype is None:
            log_add(dtype_source=f"input:{input}")
            dtype = Exporter.get_dtype_from_input(input)
        log_add(dtype=dtype)
        return dtype
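The docstring above describes resolving dtypes from the JSON Schema first and falling back to the file's header row. As an illustration of the first step only, a hypothetical mapping from JSON Schema property types to pandas dtype strings could look like this; the real Exporter.jsonschema_to_dtypes is not shown in these examples and may differ.

# Illustrative only: a possible mapping from JSON Schema types to pandas
# dtype strings.
_JSONSCHEMA_TO_DTYPE = {
    "string": "object",
    "integer": "Int64",
    "number": "float64",
    "boolean": "boolean",
}


def jsonschema_to_dtypes(schema):
    if not schema:
        return None
    properties = schema.get("items", {}).get("properties", {})
    if not properties:
        return None
    return {
        name: _JSONSCHEMA_TO_DTYPE.get(prop.get("type"), "object")
        for name, prop in properties.items()
    }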
Example #17
def _with_error(config: Config, errors):
    log_add(errors=errors)
    log_add(status=Status.VALIDATION_FAILED.value)
    config.payload.step_data.status = Status.VALIDATION_FAILED.value
    config.payload.step_data.errors = errors[:100]
    return asdict(config.payload.step_data)
Example #18
def _dataset_components_from_event(event):
    pp = event["pathParameters"]
    dataset_id, version_id, edition_id = pp["dataset"], pp["version"], pp["edition"]
    log_add(dataset_id=dataset_id, version_id=version_id, edition_id=edition_id)
    return dataset_id, version_id, edition_id
Example #19
    def validate_schema_version(self, schema):
        schema_version = schema["$schema"]
        log_add(schema_version=schema_version)
        if schema_version not in SCHEMA_SUPPORTED_VERSIONS:
            raise ValueError(
                f"Schema version: {schema_version} is not supported")
Example #20
    def __init__(self, event):
        self.s3 = boto3.client("s3")
        self.s3fs_prefix = f"s3://{BUCKET}/"
        self.config = Config.from_lambda_event(event)
        self.task_config = TaskConfig.from_config(self.config)
        log_add(input_config=asdict(self.task_config))
Example #21
def read_root():
    log_add(hello="world")
    return {"hello": "world"}
Example #22
def read_error():
    log_add(hello="error")
    raise Exception("This is wrong!")
Example #23
def is_latest_edition(dataset_id, version, edition):
    latest_edition = dataset_client.get_latest_edition(dataset_id, version)
    is_latest = [dataset_id, version,
                 edition] == latest_edition["Id"].split("/")
    log_add(is_latest_edition=is_latest)
    return is_latest
Example #24
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage)

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files,
                                             content_type)
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
            output_dataset.id, output_dataset.version, output_dataset.edition):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes,
                        status="OK",
                        errors=[])

    # TODO: This is just to verify that we have a correct implementation of the
    # status API. Temporary: if we write to /latest, set the run to complete.
    # Once this is up and we see what the status API can return to the CLI, we
    # will update it with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)
Example #25
    def __init__(self):
        self.client = boto3.client("s3")
        log_add(s3_bucket=self.bucket)