Example #1
    def _get_metadata(self, url):
        response = self._get(url)
        if response.status_code != 200:
            log_add(metadata_error_code=response.status_code, metadata_url=url)
            response.raise_for_status()
        data = response.json()
        return data
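All of the snippets on this page call a log_add helper to attach structured key/value pairs to the log output of the current invocation. The sketch below is a minimal, hypothetical stand-in for such a helper, shown only to illustrate the call pattern; it is not the real helper's implementation or API.

# Hypothetical stand-in: accumulate keyword arguments so they can be emitted
# together with the final log record of the invocation.
_log_context = {}


def log_add(**kwargs):
    _log_context.update(kwargs)


log_add(metadata_url="https://example.com/metadata", metadata_error_code=503)
print(_log_context)
# {'metadata_url': 'https://example.com/metadata', 'metadata_error_code': 503}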
Example #2
def write_to_kinesis(events, stream_name):
    records = [{
        "Data": json.dumps(event) + "\n",
        "PartitionKey": str(uuid.uuid4())
    } for event in events]
    log_add(num_records=len(records))
    kinesis_client.put_records(StreamName=stream_name, Records=records)
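A note on the call above: the Kinesis PutRecords API accepts at most 500 records per request, so for larger event lists a chunked variant along the following lines could be used. The function name and batch_size parameter are illustrative, not part of the original code.

import json
import uuid

import boto3

kinesis_client = boto3.client("kinesis")


def write_to_kinesis_batched(events, stream_name, batch_size=500):
    records = [
        {"Data": json.dumps(event) + "\n", "PartitionKey": str(uuid.uuid4())}
        for event in events
    ]
    # PutRecords accepts at most 500 records per call, so send in chunks.
    for i in range(0, len(records), batch_size):
        kinesis_client.put_records(
            StreamName=stream_name, Records=records[i : i + batch_size]
        )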
Example #3
def say_hello(event, context):
    log_add(relevant_information="Hello from Python blueprint!")

    return {
        "statusCode": 200,
        "headers": {},
        "body": json.dumps({"hello": "world!"}),
    }
Example #4
    def validate(self, data):
        raw_errors = self.validator.iter_errors(data)
        log_add(raw_errors=raw_errors)
        errors = []
        for e in raw_errors:
            error = {"message": e.message, "row": "root"}
            path_len = len(e.path)
            if path_len > 0:
                error["row"] = e.path[0]
                if path_len > 1:
                    error["col"] = e.path[1]
            errors.append(error)

        return errors
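The validate method above iterates over errors produced by a JSON Schema validator; each error carries a message and a path into the validated document, which the method maps to "row" and "col". A small self-contained sketch with the jsonschema package, where the schema and data are illustrative assumptions:

import jsonschema

# Illustrative schema: a list of rows, each row a list of integers.
schema = {
    "type": "array",
    "items": {"type": "array", "items": {"type": "integer"}},
}

validator = jsonschema.Draft7Validator(schema)

for e in validator.iter_errors([[1, 2], [3, "four"]]):
    # e.path is a deque of indices/keys leading to the offending element,
    # corresponding to the "row" and "col" fields built above.
    print(e.message, list(e.path))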
Example #5
    def resolve_s3_sources(self, source_prefix: str):
        source_objects = self.list_objects_contents(source_prefix)
        log_add(num_source_objects=len(source_objects))
        if not source_objects:
            raise Exception(f"No source files found at: {source_prefix}")

        s3_sources = []

        for obj in source_objects:
            source_key = obj["Key"]
            filename = source_key.split("/")[-1]
            s3_sources.append(S3Source(filename=filename, key=source_key))

        return s3_sources
Example #6
def invoke_lambda(event, context):
    config = Config.from_lambda_event(event)
    function_arn = config.payload.pipeline.task_config.get(
        config.task).get("arn")

    log_add(function_arn=function_arn)
    log_add(event=event)

    response = lambda_client.invoke(
        FunctionName=function_arn,
        Payload=json.dumps(event),
        InvocationType="RequestResponse",
    )
    result = read_result(response)
    return result
Example #7
def validate_json(event, context):
    config = Config.from_lambda_event(event)
    step_config = StepConfig.from_dict(config.task_config)
    step_data = config.payload.step_data

    log_add(
        dataset_id=config.payload.output_dataset.id,
        version=config.payload.output_dataset.version,
        edition=config.payload.output_dataset.edition,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    if step_config.schema is None:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_SUCCESS",
                errors=[],
            ))

    input_data = resolve_input_data(step_data)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate_list(input_data)

    if validation_errors:
        return asdict(
            StepData(
                input_events=step_data.input_events,
                s3_input_prefixes=step_data.s3_input_prefixes,
                status="VALIDATION_FAILED",
                errors=validation_errors[:100],
            ))

    return asdict(
        StepData(
            input_events=step_data.input_events,
            s3_input_prefixes=step_data.s3_input_prefixes,
            status="VALIDATION_SUCCESS",
            errors=[],
        ))
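Several handlers on this page return asdict(StepData(...)). The real StepData definition is not shown here, so the sketch below is only an assumption of its shape, inferred from how it is used in these examples:

from dataclasses import asdict, dataclass
from typing import Dict, List


@dataclass
class StepData:
    # Assumed fields, inferred from the calls in these examples; the actual
    # definition may differ.
    status: str
    errors: List[dict]
    input_events: List[dict] = None
    s3_input_prefixes: Dict[str, str] = None

    @property
    def input_count(self):
        return len(self.input_events or self.s3_input_prefixes or [])


result = asdict(StepData(status="VALIDATION_SUCCESS", errors=[], input_events=[]))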
Example #8
def write_kinesis(event, context):
    pipeline_config = Config.from_lambda_event(event)

    dataset_id = pipeline_config.payload.output_dataset.id
    version = pipeline_config.payload.output_dataset.version
    log_add(dataset_id=dataset_id, version=version)

    dataset = dataset_client.get_dataset(dataset_id, retries=3)
    access_rights = dataset["accessRights"]
    confidentiality = CONFIDENTIALITY_MAP[access_rights]

    output_stream_name = f"dp.{confidentiality}.{dataset_id}.processed.{version}.json"
    log_add(output_stream_name=output_stream_name)

    input_events = pipeline_config.payload.step_data.input_events
    write_to_kinesis(events=input_events, stream_name=output_stream_name)

    return asdict(StepData(input_events=input_events, status="OK", errors=[]))
Example #9
    def delete_from_prefix(self, s3_prefix):
        objects_to_delete = [
            {"Key": obj["Key"]} for obj in self.list_objects_contents(s3_prefix)
        ]

        if not objects_to_delete:
            return

        self.client.delete_objects(
            Bucket=self.bucket,
            Delete={
                "Objects": [
                    {"Key": s3_object["Key"]}
                    for s3_object in objects_to_delete
                ],
                "Quiet": True,
            },
        )
        log_add(deleted_from_s3_path=objects_to_delete)
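One caveat about delete_objects: the S3 DeleteObjects API takes at most 1000 keys per request, so very large prefixes would need chunking. A hedged sketch (delete_keys and chunk_size are illustrative names):

import boto3

s3 = boto3.client("s3")


def delete_keys(bucket, keys, chunk_size=1000):
    # DeleteObjects accepts at most 1000 keys per request; delete in chunks.
    for i in range(0, len(keys), chunk_size):
        s3.delete_objects(
            Bucket=bucket,
            Delete={
                "Objects": [{"Key": key} for key in keys[i : i + chunk_size]],
                "Quiet": True,
            },
        )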
Example #10
    def read_csv(self):
        s3_objects = self._list_s3_objects()
        schema = self.task_config.schema
        log_add(schema=schema)
        log_add(s3_keys=[obj["Key"] for obj in s3_objects])
        files = []
        for s3_object in s3_objects:
            key = self.s3fs_prefix + s3_object["Key"]
            dtype = Exporter.get_dtype(schema, key)
            df = self._read_csv_data(
                key,
                delimiter=self.task_config.delimiter,
                chunksize=self.task_config.chunksize,
                dtype=dtype,
            )
            filename = key.split("/")[-1]
            filename = Exporter.remove_suffix(filename)
            files.append((filename, df))
        return files
Example #11
def validate_csv(event, context):
    config = Config.from_lambda_event(event)

    step_config = StepConfig.from_task_config(config.task_config)

    s3_prefix = config.payload.output_dataset.s3_prefix

    log_add(
        header_row=step_config.header_row,
        delimiter=step_config.delimiter,
        quote=step_config.quote,
        schema=step_config.schema,
        output_prefix=s3_prefix,
    )

    if not step_config.schema:
        log_add(notice="No Schema provided for validation")
        config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
        # 2020.06: Validation is now optional - we return OK if no schema is
        # supplied for the validation step
        return asdict(config.payload.step_data)

    input_prefix = next(
        iter(config.payload.step_data.s3_input_prefixes.values()))
    log_add(s3_input_prefix=input_prefix)
    objects = s3.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)

    s3_path = next(iter(objects["Contents"]))["Key"]
    log_add(s3_input_path=s3_path)

    response = s3.get_object(Bucket=BUCKET, Key=s3_path)
    reader = csv.reader(
        string_reader.from_response(response),
        dialect="unix",
        delimiter=step_config.delimiter,
        quotechar=step_config.quote,
    )
    header = None
    if step_config.header_row:
        header = next(reader)
    try:
        csv_data = parse_csv(reader, step_config.schema, header)
    except ParseErrors as p:
        return _with_error(config, p.errors)

    validation_errors = JsonSchemaValidator(
        step_config.schema).validate(csv_data)

    if validation_errors:
        return _with_error(config, errors=validation_errors)

    config.payload.step_data.status = Status.VALIDATION_SUCCESS.value
    return asdict(config.payload.step_data)
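The reader setup in validate_csv consumes the header row first when header_row is set and only then parses the data rows. A minimal, self-contained illustration of that pattern with the standard csv module; the sample data and delimiter are made up:

import csv
import io

raw = "id;name\n1;foo\n2;bar\n"
reader = csv.reader(io.StringIO(raw), dialect="unix", delimiter=";", quotechar='"')

header = next(reader)  # ["id", "name"] -- consumed before the data rows
rows = list(reader)    # [["1", "foo"], ["2", "bar"]]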
Example #12
    def export(self):
        inputs = self.read_csv()
        s3_prefix = self.s3_prefix()
        outputs = []
        schema = self.task_config.schema
        errors = []
        try:
            for filename, source in inputs:
                out_prefix = f"s3://{BUCKET}/{s3_prefix}{filename}"
                if self.task_config.chunksize:
                    outputs.extend(
                        self._parallel_export(filename, source, schema,
                                              out_prefix))
                else:
                    outputs.append(self._export(source, schema, out_prefix))
        except OutOfBoundsDatetime as e:
            errors.append({"error": "OutOfBoundsDatetime", "message": str(e)})
        except ValueError as e:
            errors.append({"error": "ValueError", "message": str(e)})

        if len(errors) > 0:
            log_add(errors=errors)
            return asdict(
                StepData(
                    status="CONVERSION_FAILED",
                    errors=errors,
                    s3_input_prefixes={
                        self.config.payload.output_dataset.id: s3_prefix
                    },
                ))

        log_add(parquetfiles=outputs)
        return asdict(
            StepData(
                status="CONVERSION_SUCCESS",
                errors=[],
                s3_input_prefixes={
                    self.config.payload.output_dataset.id: s3_prefix
                },
            ))
Example #13
def generate_signed_url_public(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient()

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(500, "Could not complete request, please try again later")

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    if dataset["accessRights"] != "public":
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))
Example #14
def generate_signed_urls(bucket, dataset, edition):
    access_rights = dataset["accessRights"]
    dataset_id, version, edition_id = edition["Id"].split("/")
    confidentiality = CONFIDENTIALITY_MAP[access_rights]
    common_prefix = f"processed/{confidentiality}/"
    parent_id = dataset.get("parent_id")

    dataset_prefix = f"{dataset_id}/version={version}/edition={edition_id}/"
    if parent_id:
        dataset_prefix = f"{parent_id}/{dataset_prefix}"

    prefix = common_prefix + dataset_prefix

    log_add(
        dataset_access_rights=access_rights,
        s3_bucket=bucket,
        s3_prefix=prefix,
    )

    session = boto3.Session()
    s3 = session.client("s3")
    resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

    signed_urls = [
        {
            "key": obj["Key"],
            "url": s3.generate_presigned_url(
                "get_object",
                Params={"Bucket": bucket, "Key": obj["Key"]},
                ExpiresIn=60 * 5,
            ),
        }
        for obj in resp["Contents"]
    ]
    return signed_urls
Example #15
def generate_signed_url(event, context):
    dataset_id, version_id, edition_id = _dataset_components_from_event(event)
    client = APIClient.with_access_token_from_event(event)

    if not client:
        return error_response(403, "Forbidden")

    try:
        dataset = client.get_dataset(dataset_id)
        edition = client.get_edition(dataset_id, version_id, edition_id)
        log_add(dataset=dataset)
    except requests.HTTPError as e:
        log_exception(e)
        return error_response(e.response.status_code, e.response.json())
    except Exception as e:
        log_exception(e)
        return error_response(500, "Could not complete request, please try again later")

    if not client.has_distributions(edition):
        return error_response(404, f"Missing data for {edition['Id']}")

    # Only users with read access can download non-public datasets.
    if (
        dataset["accessRights"] != "public"
        and ENABLE_AUTH
        and not resource_authorizer.has_access(
            client.access_token,
            scope="okdata:dataset:read",
            resource_name=f"okdata:dataset:{dataset_id}",
        )
    ):
        log_add(has_access=False)
        return error_response(403, "Forbidden")

    signed_urls = generate_signed_urls(BUCKET, edition=edition, dataset=dataset)
    return response(200, json.dumps(signed_urls))
Example #16
    def get_dtype(schema, input):
        """
        Try to resolve the dtypes for the columns when reading a CSV file.

        Resolve the dtype from taskConfig.TASK_NAME.schema; if this is not
        available, we read the first line (column headers) of the file that is
        about to be read by pandas and set each column to type object (the
        default).
        """
        log_add(dtype_source="jsonschema")
        dtype = Exporter.jsonschema_to_dtypes(schema)
        if dtype is None:
            log_add(dtype_source=f"input:{input}")
            dtype = Exporter.get_dtype_from_input(input)
        log_add(dtype=dtype)
        return dtype
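The docstring above describes resolving dtypes from the JSON Schema first and falling back to the file's header row. As an illustration of the first step only, a hypothetical mapping from JSON Schema property types to pandas dtype strings could look like this; the real Exporter.jsonschema_to_dtypes is not shown in these examples and may differ.

# Illustrative only: a possible mapping from JSON Schema types to pandas
# dtype strings.
_JSONSCHEMA_TO_DTYPE = {
    "string": "object",
    "integer": "Int64",
    "number": "float64",
    "boolean": "boolean",
}


def jsonschema_to_dtypes(schema):
    if not schema:
        return None
    properties = schema.get("items", {}).get("properties", {})
    if not properties:
        return None
    return {
        name: _JSONSCHEMA_TO_DTYPE.get(prop.get("type"), "object")
        for name, prop in properties.items()
    }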
Example #17
def _with_error(config: Config, errors):
    log_add(errors=errors)
    log_add(status=Status.VALIDATION_FAILED.value)
    config.payload.step_data.status = Status.VALIDATION_FAILED.value
    config.payload.step_data.errors = errors[:100]
    return asdict(config.payload.step_data)
Example #18
def _dataset_components_from_event(event):
    pp = event["pathParameters"]
    dataset_id, version_id, edition_id = pp["dataset"], pp["version"], pp["edition"]
    log_add(dataset_id=dataset_id, version_id=version_id, edition_id=edition_id)
    return dataset_id, version_id, edition_id
Example #19
    def validate_schema_version(self, schema):
        schema_version = schema["$schema"]
        log_add(schema_version=schema_version)
        if schema_version not in SCHEMA_SUPPORTED_VERSIONS:
            raise ValueError(
                f"Schema version: {schema_version} is not supported")
Example #20
    def __init__(self, event):
        self.s3 = boto3.client("s3")
        self.s3fs_prefix = f"s3://{BUCKET}/"
        self.config = Config.from_lambda_event(event)
        self.task_config = TaskConfig.from_config(self.config)
        log_add(input_config=asdict(self.task_config))
Example #21
def read_root():
    log_add(hello="world")
    return {"hello": "world"}
Example #22
def read_error():
    log_add(hello="error")
    raise Exception("This is wrong!")
Example #23
def is_latest_edition(dataset_id, version, edition):
    latest_edition = dataset_client.get_latest_edition(dataset_id, version)
    is_latest = [dataset_id, version,
                 edition] == latest_edition["Id"].split("/")
    log_add(is_latest_edition=is_latest)
    return is_latest
Example #24
def write_s3(event, context):
    config = Config.from_lambda_event(event)
    task_config = TaskConfig.from_dict(config.task_config)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    content_type = task_config.content_type

    log_add(
        dataset_id=output_dataset.id,
        version=output_dataset.version,
        edition_id=output_dataset.edition,
        source_prefixes=step_data.s3_input_prefixes,
        write_to_latest=task_config.write_to_latest,
        output_stage=task_config.output_stage,
    )
    if content_type:
        log_add(content_type=content_type)

    status_add(
        domain="dataset",
        domain_id=f"{output_dataset.id}/{output_dataset.version}",
        operation=config.task,
    )

    if step_data.input_count > 1:
        raise IllegalWrite("cannot combine multiple datasets: ",
                           step_data.input_count)

    source_prefix = next(iter(step_data.s3_input_prefixes.values()))
    output_prefix = config.payload.output_dataset.s3_prefix.replace(
        "%stage%", task_config.output_stage)

    s3_sources = s3_service.resolve_s3_sources(source_prefix)
    copied_files = copy_data(s3_sources, output_prefix)

    if task_config.output_stage == "processed":
        try:
            create_distribution_with_retries(output_dataset, copied_files,
                                             content_type)
        except Exception as e:
            s3_service.delete_from_prefix(output_prefix)
            log_exception(e)
            raise DistributionNotCreated

    if task_config.write_to_latest and is_latest_edition(
            output_dataset.id, output_dataset.version, output_dataset.edition):
        write_data_to_latest(s3_sources, output_prefix)

    output_prefixes = {output_dataset.id: output_prefix}
    response = StepData(s3_input_prefixes=output_prefixes,
                        status="OK",
                        errors=[])

    # TODO: This is just to verify that we have a correct implementation of the
    # status API. Temporary: if we write to /latest, set the run to complete.
    # Once this is up and we see what the status API can return to the CLI, we
    # will update it with more information.
    status_body = {
        "files": [s3_source.key for s3_source in s3_sources],
        "latest": task_config.write_to_latest,
    }
    status_add(status_body=status_body)
    return asdict(response)
Example #25
    def __init__(self):
        self.client = boto3.client("s3")
        log_add(s3_bucket=self.bucket)