Example #1
import json
import uuid


def handler(event=None, context=None) -> dict:
    payload = get_payload(event)

    logger = configure_log()
    logger.info(payload)

    if PAYLOAD_CORRELATION_ID in payload and PAYLOAD_S3_PREFIX in payload:
        raise ValueError(
            "Data passed it triggeres old handler which has now been depracated. Please use new handler"
        )

    if (PAYLOAD_EVENT_NOTIFICATION_RECORDS in payload and PAYLOAD_BODY
            in payload[PAYLOAD_EVENT_NOTIFICATION_RECORDS][0]):
        message = payload[PAYLOAD_EVENT_NOTIFICATION_RECORDS][0]
        loaded_payload_body = json.loads(message[PAYLOAD_BODY])
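        # Note: the stray quote fragments in the log calls below look broken
        # but are likely intentional: configure_log appears to emit JSON log
        # lines, so these fragments become extra structured fields (an
        # inference from the pattern, not confirmed in these examples).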
        logger.info(
            f'Processing payload from SQS", "payload": "{loaded_payload_body}')
        if (PAYLOAD_EVENT_NOTIFICATION_RECORDS in loaded_payload_body
                and PAYLOAD_S3
                in loaded_payload_body[PAYLOAD_EVENT_NOTIFICATION_RECORDS][0]):
            logger.info(
                f'Using S3 event notification handler", "payload": "{payload}')
            correlation_id = message.get("messageId", str(uuid.uuid4()))
            logger.info(
                f'Correlation id set", "correlation_id": "{correlation_id}')
            return s3_event_notification_handler(
                correlation_id,
                loaded_payload_body[PAYLOAD_EVENT_NOTIFICATION_RECORDS][0],
            )

    try:
        payload = Payload(**payload)
    except Exception as ex:
        raise TypeError("Invalid request payload") from ex

    cluster_config = build_config(
        payload.s3_overrides,
        payload.overrides,
        payload.extend,
        payload.additional_step_args,
    )

    if payload.copy_secconfig:
        secconfig_orig = cluster_config.get("SecurityConfiguration", "")
        if secconfig_orig != "":
            secconfig = dup_security_configuration(secconfig_orig)
            cluster_config["SecurityConfiguration"] = secconfig

    return emr_launch_cluster(cluster_config)
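
The handler above validates the request by unpacking it into a Payload object, whose definition is not shown in these examples. A minimal sketch of what it might look like, inferred only from the attributes the handler reads, is below; field types and defaults are assumptions.

# Hypothetical sketch of the Payload request model; the field names come
# from the handler above, but the types and defaults are guesses.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Payload:
    s3_overrides: Optional[dict] = None
    overrides: Optional[dict] = None
    extend: Optional[dict] = None
    additional_step_args: Optional[dict] = None
    copy_secconfig: bool = False

With a dataclass like this, unexpected keys in the request make Payload(**payload) raise a TypeError, which is consistent with the error the handler converts the failure into.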
Example #2
def add_command_line_params(
    cluster_config,
    correlation_id,
    s3_prefix,
    snapshot_type,
    export_date,
    skip_pdm_trigger,
):
    """
    Adding command line arguments to ADG and PDM EMR steps scripts. First if block in Try is for PDM and the second one
    is for ADG.
    """
    logger = configure_log()

    try:
        for step_name in [
            SEND_NOTIFICATION_STEP,
            COURTESY_FLUSH_STEP_NAME,
            SUBMIT_JOB,
            CREATE_CLIVE_DATABASES,
            CREATE_UC_FEATURE_DATABASES,
            CREATE_HIVE_DYNAMO_TABLE,
            SOURCE,
        ]:
            add_command_line_args_to_step(
                cluster_config,
                correlation_id,
                s3_prefix,
                snapshot_type,
                export_date,
                step_name,
            )

        add_command_line_args_to_step(
            cluster_config,
            correlation_id,
            s3_prefix,
            snapshot_type,
            export_date,
            CREATE_PDM_TRIGGER_STEP_NAME,
            skip_pdm_trigger,
        )
    except Exception as ex:
        logger.error(ex)
        raise
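
add_command_line_args_to_step is not shown in these examples. A plausible sketch, assuming the same STEPS, NAME_KEY, HADOOP_JAR_STEP, and ARGS constants used in Example #4, and a made-up --skip_pdm_trigger flag name:

# Hypothetical sketch of add_command_line_args_to_step; the real helper is
# not shown here. It finds a step by name and extends its Hadoop JAR args.
def add_command_line_args_to_step(cluster_config, correlation_id, s3_prefix,
                                  snapshot_type, export_date, step_name,
                                  skip_pdm_trigger=None):
    step = next(
        (sub for sub in cluster_config[STEPS] if sub[NAME_KEY] == step_name),
        None,
    )
    if step is None:
        return  # Steps are optional; configs without this one are left alone.
    step[HADOOP_JAR_STEP][ARGS].extend([
        CORRELATION_ID, correlation_id,
        S3_PREFIX, s3_prefix,
        SNAPSHOT_TYPE, snapshot_type,
        EXPORT_DATE_COMMAND, export_date,
    ])
    if skip_pdm_trigger is not None:
        # The flag name below is illustrative, not taken from the source.
        step[HADOOP_JAR_STEP][ARGS].extend(
            ["--skip_pdm_trigger", str(skip_pdm_trigger)])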
Example #3
from emr_launcher.logger import configure_log
from emr_launcher.handler import handler

logger = configure_log()
try:
    handler()
except Exception as e:
    logger.error(e)
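
configure_log comes from emr_launcher.logger and its definition is not shown in these examples. A minimal sketch of the kind of JSON-line setup it appears to perform, purely as an assumption inferred from the quoted fragments embedded in the log messages of Example #1:

# Hypothetical sketch of configure_log; the real implementation lives in
# emr_launcher.logger. Formatting each record as a JSON fragment would
# explain why callers embed extra '", "key": "value' fields in messages.
import logging
import os


def configure_log():
    logging.basicConfig(
        level=os.environ.get("LOG_LEVEL", "INFO").upper(),
        format='{ "message": "%(message)s", "level": "%(levelname)s", '
               '"timestamp": "%(asctime)s" }',
    )
    return logging.getLogger()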
Example #4
def add_command_line_params(cluster_config, correlation_id, s3_prefix,
                            snapshot_type, export_date):
    """
    Adding command line arguments to ADG and PDM EMR steps scripts. First if block in Try is for PDM and the second one
    is for ADG.
    """
    logger = configure_log()
    logger.info(f"correlation_id: {correlation_id}, s3_prefix: {s3_prefix}")
    try:
        # PDM steps: look each step up once, then extend its args in place
        # (list.extend mutates the list already held by the step config).
        source_step = next(
            (sub for sub in cluster_config[STEPS] if sub[NAME_KEY] == SOURCE),
            None,
        )
        if source_step is not None:
            source_step[HADOOP_JAR_STEP][ARGS].extend(
                [CORRELATION_ID, correlation_id, S3_PREFIX, s3_prefix])

        hive_dynamo_step = next(
            (sub for sub in cluster_config[STEPS]
             if sub[NAME_KEY] == CREATE_HIVE_DYNAMO_TABLE),
            None,
        )
        if hive_dynamo_step is not None:
            hive_dynamo_step[HADOOP_JAR_STEP][ARGS].extend([
                CORRELATION_ID, correlation_id,
                S3_PREFIX, s3_prefix,
                SNAPSHOT_TYPE, snapshot_type,
                EXPORT_DATE_COMMAND, export_date,
            ])
    except Exception as e:
        logger.error(e)

    # ADG steps: each of these receives the same four argument pairs, so
    # they are handled in one loop with a try/except per step, as before.
    for step_name in (SUBMIT_JOB, COURTESY_FLUSH_STEP_NAME,
                      CREATE_PDM_TRIGGER_STEP_NAME):
        try:
            adg_step = next(
                (sub for sub in cluster_config[STEPS]
                 if sub[NAME_KEY] == step_name),
                None,
            )
            if adg_step is not None:
                adg_step[HADOOP_JAR_STEP][ARGS].extend([
                    CORRELATION_ID, correlation_id,
                    S3_PREFIX, s3_prefix,
                    SNAPSHOT_TYPE, snapshot_type,
                    EXPORT_DATE_COMMAND, export_date,
                ])
                logger.debug(adg_step[HADOOP_JAR_STEP][ARGS])
        except Exception as e:
            logger.error(e)
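
The step lookup recurs throughout these examples; a small helper along these lines (hypothetical, not part of the module as shown) would remove the duplication:

# Hypothetical helper: find a named EMR step in the cluster config, or
# return None when the step is absent.
def find_step(cluster_config, step_name):
    return next(
        (sub for sub in cluster_config[STEPS] if sub[NAME_KEY] == step_name),
        None,
    )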
Example #5
def old_handler(event=None) -> dict:
    """Launches an EMR cluster with the provided configuration."""
    logger = configure_log()
    correlation_id_necessary = False
    # A correlation id is needed when this lambda is triggered via SNS
    # ("Records" in the event) or via the API (correlation id and S3
    # prefix both present in the event); see the check below.

    correlation_id = get_value(PAYLOAD_CORRELATION_ID, event)
    s3_prefix = get_value(PAYLOAD_S3_PREFIX, event)
    snapshot_type = get_value(PAYLOAD_SNAPSHOT_TYPE, event)
    export_date = get_value(PAYLOAD_EXPORT_DATE, event)

    if "Records" in event or (PAYLOAD_CORRELATION_ID in event
                              and PAYLOAD_S3_PREFIX in event):
        correlation_id_necessary = True

    cluster_config = read_config("cluster")
    cluster_name = cluster_config["Name"]

    # For ADG, use "configurations_incremental" when the snapshot type
    # is incremental.
    config_yml_name = get_config_file_name(cluster_name, snapshot_type)

    cluster_config.update(
        read_config(config_type=config_yml_name,
                    s3_overrides=None,
                    required=False))

    try:
        spark_hive_site = next(
            (sub for sub in cluster_config["Configurations"]
             if sub["Classification"] == "spark-hive-site"),
            None,
        )
        if spark_hive_site is not None:
            secret_name = spark_hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"]
            secret_value = sm_retrieve_secrets(secret_name)
            spark_hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"] = secret_value
    except Exception as e:
        logger.info(e)

    try:
        hive_site = next(
            (sub for sub in cluster_config["Configurations"]
             if sub["Classification"] == "hive-site"),
            None,
        )
        if hive_site is not None:
            secret_name = hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"]
            secret_value = sm_retrieve_secrets(secret_name)
            hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"] = secret_value
    except Exception as e:
        logger.info(e)

    cluster_config.update(read_config("instances"))
    cluster_config.update(
        read_config(config_type="steps", s3_overrides=None, required=False))

    if correlation_id_necessary:
        add_command_line_params(cluster_config, correlation_id, s3_prefix,
                                snapshot_type, export_date)
        adg_trim_steps_for_incremental(cluster_config, snapshot_type)
        adg_trim_steps_for_full(cluster_config, snapshot_type)

    # Rename the ADG cluster based on the snapshot type (full/incremental).
    if cluster_name == ADG_NAME:
        update_adg_cluster_name(cluster_config, snapshot_type)
    logger.debug("Requested cluster parameters", extra=cluster_config)

    resp = emr_launch_cluster(cluster_config)

    job_flow_id = resp["JobFlowId"]

    additional_tags = {
        "Correlation_Id": correlation_id,
        "snapshot_type": snapshot_type,
        "export_date": export_date,
    }

    logger.debug(resp)

    emr_cluster_add_tags(job_flow_id, additional_tags)

    return resp
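
The password-injection pattern above reappears in Example #6 for the same two classifications. A helper along these lines (hypothetical, not in the source) would cover both call sites:

# Hypothetical helper: replace the ConnectionPassword placeholder in one
# configuration classification with the value fetched from Secrets Manager.
CONNECTION_PASSWORD = "javax.jdo.option.ConnectionPassword"


def inject_connection_password(cluster_config, classification, logger):
    try:
        config = next(
            (sub for sub in cluster_config["Configurations"]
             if sub["Classification"] == classification),
            None,
        )
        if config is not None:
            secret_name = config["Properties"][CONNECTION_PASSWORD]
            config["Properties"][CONNECTION_PASSWORD] = sm_retrieve_secrets(
                secret_name)
    except Exception as e:
        logger.info(e)

Usage would then be inject_connection_password(cluster_config, "spark-hive-site", logger) followed by the same call for "hive-site".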
Example #6
def s3_event_notification_handler(correlation_id, record=None) -> dict:
    """Launches an EMR cluster with the provided configuration."""
    logger = configure_log()
    logger.info(record)

    export_date = get_event_time_as_date_string(
        get_value(PAYLOAD_EVENT_TIME, record))
    s3_object = get_value(PAYLOAD_S3, record)
    s3_bucket_object = get_value(PAYLOAD_BUCKET, s3_object)
    s3_object_object = get_value(PAYLOAD_OBJECT, s3_object)
    s3_prefix = get_value(PAYLOAD_KEY, s3_object_object)
    s3_bucket_name = get_value(PAYLOAD_NAME, s3_bucket_object)

    cluster_config = read_config("cluster")
    configurations_config_yml_name = "configurations"

    cluster_config.update(
        read_config(
            config_type=configurations_config_yml_name,
            s3_overrides=None,
            required=False,
        ))

    try:
        spark_hive_site = next(
            (sub for sub in cluster_config["Configurations"]
             if sub["Classification"] == "spark-hive-site"),
            None,
        )
        if spark_hive_site is not None:
            secret_name = spark_hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"]
            secret_value = sm_retrieve_secrets(secret_name)
            spark_hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"] = secret_value
    except Exception as e:
        logger.info(e)

    try:
        hive_site = next(
            (sub for sub in cluster_config["Configurations"]
             if sub["Classification"] == "hive-site"),
            None,
        )
        if hive_site is not None:
            secret_name = hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"]
            secret_value = sm_retrieve_secrets(secret_name)
            hive_site["Properties"][
                "javax.jdo.option.ConnectionPassword"] = secret_value
    except Exception as e:
        logger.info(e)

    cluster_config.update(read_config("instances"))
    cluster_config.update(
        read_config(config_type="steps", s3_overrides=None, required=False))

    HADOOP_JAR_STEP = "HadoopJarStep"
    ARGS = "Args"
    STEPS = "Steps"
    for sub in cluster_config[STEPS]:
        if HADOOP_JAR_STEP in sub:
            sub[HADOOP_JAR_STEP][ARGS].extend([
                "--correlation_id", correlation_id,
                "--s3_bucket_name", s3_bucket_name,
                "--s3_prefix", s3_prefix,
                "--export_date", export_date,
            ])

    resp = emr_launch_cluster(cluster_config)
    job_flow_id = resp["JobFlowId"]
    logger.debug(resp)

    additional_tags = {
        "Correlation_Id": correlation_id,
        "export_date": export_date,
    }

    emr_cluster_add_tags(job_flow_id, additional_tags)
    return resp
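
For reference, a record of the shape this handler consumes, assuming the PAYLOAD_* constants map to the standard S3 event notification field names (eventTime, s3, bucket, object, key, name); all values below are illustrative:

# Illustrative invocation; bucket, key, and timestamp values are made up.
sample_record = {
    "eventTime": "2021-06-01T12:00:00.000Z",
    "s3": {
        "bucket": {"name": "example-published-bucket"},
        "object": {"key": "analytical-dataset/2021-06-01/"},
    },
}
s3_event_notification_handler("example-correlation-id", sample_record)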