Example #1
0
def lambda_handler(event, context):
    """Handle a "clean" trigger parsed from an S3 event.

    The ID returned by `s3.parse_s3_event` selects what happens:

    - "all": delete the literal "clean-all" message, then broadcast
      "clean" over the IDs returned by `comm.ids("all")`.
    - "ingested": delete the literal "clean-ingested" message, then
      broadcast "clean" over the IDs returned by
      `comm.messages.ids("ingested")`.
    - anything else: clean that single ID with `comm.clean`.

    `context` is unused.
    """
    bucket_name, ipst = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)

    # Single-dataset case: nothing to broadcast.
    if ipst not in ("all", "ingested"):
        print("Cleaning", ipst)
        comm.clean(ipst)
        return

    if ipst == "all":
        print("Cleaning all datasets;  removing all job resources (S3 files).")
        # Delete the trigger literally so "all" is not treated as a
        # pattern matching existing ipppssoots.
        comm.messages.delete_literal("clean-all")
        targets = comm.ids("all")
    else:
        # Variation of "all" restricted to datasets with an ingest message.
        print(
            "Cleaning all ingested datasets;  removing all job resources (S3 files)."
        )
        # Delete the trigger literally; don't interpret "ingested".
        comm.messages.delete_literal("clean-ingested")
        targets = comm.messages.ids("ingested")

    comm.messages.broadcast("clean", targets)
Example #2
0
def lambda_handler(event, context):
    """Discard the stored xdata for the triggering ID, then submit it.

    The bucket name and ID come from `s3.parse_s3_event`; submission is
    delegated to `lambda_submit.main`.  `context` is unused.
    """
    bucket_name, ipst = s3.parse_s3_event(event)
    bundle = io.get_io_bundle(bucket_name)

    # Deleting xdata is the biggest difference between "placed" and "rescue".
    bundle.xdata.delete(ipst)

    # NOTE: the f"placed-{ipst}" message is not deleted by this handler.
    lambda_submit.main(bundle, ipst, bucket_name)
Example #3
0
def setup_job(event_basename, **overrides):
    """Prepare state as if the batch job for `event_basename` was submitted.

    Loads the event artifact via `conftest.load_event`, builds metadata with
    `starting_metadata(overrides)`, and stores it as the xdata of the
    ipppssoot named in the event's container command.

    Returns the tuple `(event, comm, ipppssoot, metadata)`.
    """
    event = conftest.load_event(event_basename)
    command = event["detail"]["container"]["command"]
    ipppssoot = command[1]

    comm = io.get_io_bundle()
    # **overrides is always a dict (empty when no keywords are passed).
    metadata = starting_metadata(overrides)
    comm.xdata.put(ipppssoot, metadata)

    return event, comm, ipppssoot, metadata
Example #4
0
def lambda_handler(event, context):
    """Handle a "cancel" trigger parsed from an S3 event.

    The ID returned by `s3.parse_s3_event` selects one of three modes:

    - "all": delete the literal "cancel-all" message and broadcast
      "cancel" over every job ID returned by `batch.get_job_ids()`.
    - a batch job ID (matches `batch.JOB_ID_RE`): mark the job's dataset
      as terminated on a best-effort basis, then terminate the job.
    - an ipppssoot (matches `hst.IPPPSSOOT_RE`): mark the dataset as
      terminated, then terminate the job ID recorded in its xdata.

    `context` is unused.

    Raises:
        ValueError: the ID matches none of the forms above.
    """
    bucket_name, ipst = s3.parse_s3_event(event)

    comm = io.get_io_bundle(bucket_name)

    if ipst == "all":
        # Delete exactly the cancel-all message,  not every ipppssoot
        comm.messages.delete_literal("cancel-all")
        # Cancel all jobs in a killable state broadcasting cancel over job_ids
        job_ids = batch.get_job_ids()
        comm.messages.broadcast("cancel", job_ids)
    elif batch.JOB_ID_RE.match(ipst):
        # Kill one job: the incoming ID is the job_id; the dataset name is
        # looked up below.
        job_id, ipst = ipst, "unknown"
        print("Cancelling job_id", job_id)
        comm.messages.delete_literal(f"cancel-{job_id}")
        # Bookkeeping runs under trap_exception so the job is still
        # terminated afterwards.
        with log.trap_exception("Handling messages + control for", job_id):
            ipst = batch.get_job_name(job_id)
            print("Handling messages and control for", ipst)
            comm.messages.delete(f"all-{ipst}")
            comm.messages.put(f"terminated-{ipst}",
                              "cancel lambda " + bucket_name)
            try:
                metadata = comm.xdata.get(ipst)
            # boto3 S3 clients expose the missing-object error as
            # `NoSuchKey`; they have no `NoSuchKeyError` attribute.
            # NOTE(review): assumes comm.xdata.client is a boto3 S3
            # client -- confirm.
            except comm.xdata.client.exceptions.NoSuchKey:
                # No stored xdata: start from a minimal record.
                metadata = dict(job_id=job_id, cancel_type="job_id")
            metadata["terminated"] = True
            comm.xdata.put(ipst, metadata)
        # Do last so terminate flag is set if possible.
        print("Terminating", job_id)
        batch.terminate_job(job_id, "Operator cancelled")
    elif hst.IPPPSSOOT_RE.match(ipst):  # kill one ipppssoot
        print("Cancelling ipppssoot", ipst)
        comm.messages.delete(f"all-{ipst}")
        comm.messages.put(f"terminated-{ipst}", "cancel lambda " + bucket_name)
        metadata = comm.xdata.get(ipst)
        metadata["terminated"] = True
        metadata["cancel_type"] = "ipppssoot"
        comm.xdata.put(ipst, metadata)
        # The job to terminate is the one recorded in the dataset's xdata.
        job_id = metadata["job_id"]
        with log.trap_exception("Terminating", job_id):
            print("Terminating", job_id)
            batch.terminate_job(job_id, "Operator cancelled")
    else:
        raise ValueError("Bad cancel ID", ipst)
Example #5
0
def lambda_handler(event, context):
    """Handle a "rescue" trigger parsed from an S3 event.

    For the ID "all", delete the literal "rescue-all" message and broadcast
    "rescue" over the IDs returned by `comm.messages.ids(RESCUE_TYPES)`.
    For any other ID, resubmit it through `lambda_submit.main`.

    `context` is unused.
    """
    bucket_name, ipst = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)

    # Single-dataset rescue.
    if ipst != "all":
        print("Rescuing", ipst)
        # NOTE: existing outputs for the ID are not deleted here.
        lambda_submit.main(comm, ipst, bucket_name)
        return

    print("Rescuing all")
    # Delete the trigger literally so "all" is not treated as a pattern
    # matching existing ipppssoots.
    comm.messages.delete_literal("rescue-all")
    rescue_ids = comm.messages.ids(RESCUE_TYPES)
    comm.messages.broadcast("rescue", rescue_ids)
Example #6
0
def lambda_handler(event, context):
    """Deliver one queued broadcast, splitting it first if it is large.

    Pops the `broadcast-{serial}` message named by the S3 event.  A list of
    more than 100 entries is cut in half and re-queued as two new broadcast
    messages; a shorter list is sent entry by entry.  The handler returns
    early whenever `check_for_kill` reports a kill, which is checked on
    entry and before every 10th put.

    `context` is unused.
    """
    bucket_name, serial = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)

    if check_for_kill(comm, "Detected broadcast-kill on entry."):
        return

    pending = comm.messages.pop(f"broadcast-{serial}")

    if len(pending) > 100:
        # Too large for one invocation: re-queue each half under a fresh id.
        first_serial = comm.messages.get_id()
        second_serial = comm.messages.get_id()
        midpoint = len(pending) // 2
        comm.messages.put(f"broadcast-{first_serial}", pending[:midpoint])
        comm.messages.put(f"broadcast-{second_serial}", pending[midpoint:])
        return

    for index, msg in enumerate(pending):
        # Re-check for a kill request before entries 0, 10, 20, ...
        if index % 10 == 0 and check_for_kill(
                comm, "Detected broadcast-kill in put loop"):
            return
        comm.messages.put(msg)
Example #7
0
def lambda_handler(event, context):
    """Deliver one queued broadcast that carries a shared payload.

    Pops the `broadcast-{serial}` message named by the S3 event and reads
    its "messages" list and "payload".  More than 100 messages are handed
    to `comm.messages.bifurcate_broadcast` (per the original comment, this
    splits the broadcast into two new broadcasts); otherwise the payload is
    put to each message in turn.  The handler returns early whenever
    `check_for_kill` reports a kill, which is checked on entry and before
    every 10th put.

    `context` is unused.
    """
    bucket_name, serial = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)

    if check_for_kill(comm, "Detected broadcast-kill on entry."):
        return

    envelope = comm.messages.pop(f"broadcast-{serial}")
    targets = envelope["messages"]
    payload = envelope["payload"]

    if len(targets) > 100:
        comm.messages.bifurcate_broadcast(targets, payload)
        return

    for index, msg in enumerate(targets):
        # Re-check for a kill request before entries 0, 10, 20, ...
        if index % 10 == 0 and check_for_kill(
                comm, "Detected broadcast-kill in put loop"):
            return
        comm.messages.put(msg, payload)
def lambda_handler(event, context):
    """Record a failed batch job and choose its continuation message.

    Reads the job and container details from `event["detail"]`, merges them
    into the dataset's stored xdata, and then emits exactly one message:

    - "rescue-<ipppssoot>" when the failure is a memory error, a
      "CannotInspectContainer" or a "DockerTimeoutError", the dataset is
      not flagged terminated, and the matching retry counter is still below
      its limit (MAX_MEMORY_RETRIES / MAX_DOCKER_RETRIES environment
      variables).  The counter is incremented.
    - "terminated-<ipppssoot>" when the status reason starts with
      "Operator cancelled".
    - "error-<ipppssoot>" otherwise.

    `context` is unused.
    """
    print(event)

    detail = event["detail"]
    job_id = detail["jobId"]
    job_name = detail["jobName"]  # appears to be ipppssoot
    status_reason = detail.get("statusReason", "undefined")

    container = event["detail"]["container"]
    command = container["command"]
    ipppssoot = command[1]
    # The bucket is the third "/"-separated field of the third command word.
    bucket = command[2].split("/")[2]
    container_reason = container.get("reason", "undefined")
    exit_code = container.get("exitCode", "undefined")
    if exit_code == "undefined":
        exit_reason = exit_code
    else:
        exit_reason = exit_codes.explain(exit_code)

    comm = io.get_io_bundle(bucket)

    metadata = comm.xdata.get(ipppssoot)
    for key, value in (
        ("ipppssoot", ipppssoot),
        ("bucket", bucket),
        ("job_id", job_id),
        ("job_name", job_name),
        ("exit_code", exit_code),
        ("exit_reason", exit_reason),
        ("status_reason", status_reason),
        ("container_reason", container_reason),
    ):
        metadata[key] = value

    # Most specific defined reason wins: exit, then container, then status.
    combined_reason = next(
        (reason for reason in (exit_reason, container_reason)
         if reason != "undefined"),
        status_reason,
    )

    def can_retry(counter_key, limit_var):
        """True if the job was not terminated and `counter_key` is under its limit."""
        return not metadata["terminated"] and metadata[counter_key] < int(
            os.environ[limit_var])

    # Default outcome; the retry and cancel branches below override it.
    continuation_msg = "error-" + ipppssoot

    out_of_memory = exit_codes.is_memory_error(
        exit_code) or container_reason.startswith(
            "OutOfMemoryError: Container killed")

    if out_of_memory:
        if can_retry("memory_retries", "MAX_MEMORY_RETRIES"):
            metadata["memory_retries"] += 1
            continuation_msg = "rescue-" + ipppssoot
            print("Automatic OutOfMemory rescue of", ipppssoot,
                  "with memory retry count", metadata["memory_retries"])
        else:
            print("Automatic OutOfMemory retries for", ipppssoot,
                  "exhausted at", metadata["memory_retries"])
    elif container_reason.startswith("CannotInspectContainer"):
        if can_retry("retries", "MAX_DOCKER_RETRIES"):
            metadata["retries"] += 1
            continuation_msg = "rescue-" + ipppssoot
            print("Automatic CannotInspectContainer rescue for", ipppssoot,
                  "with retry count", metadata["retries"])
        else:
            print("Automatic CannotInspectContainer retries for", ipppssoot,
                  "exhausted at", metadata["retries"])
    elif container_reason.startswith("DockerTimeoutError"):
        if can_retry("retries", "MAX_DOCKER_RETRIES"):
            metadata["retries"] += 1
            continuation_msg = "rescue-" + ipppssoot
            print("Automatic DockerTimeoutError rescue for", ipppssoot,
                  "with retry count", metadata["retries"])
        else:
            print("Automatic DockerTimeoutError retries for", ipppssoot,
                  "exhausted at", metadata["retries"])
    elif status_reason.startswith("Operator cancelled"):
        print("Operator cancelled job", job_id, "for", ipppssoot,
              "no automatic retry.")
        continuation_msg = "terminated-" + ipppssoot
    else:
        print("Failure for", ipppssoot, "no automatic retry for",
              combined_reason)

    # The retry count is used in planning, so the xdata (control output)
    # must be written before the continuation message is sent.
    print(metadata)
    comm.xdata.put(ipppssoot, metadata)
    comm.messages.delete("all-" + ipppssoot)
    comm.messages.put(continuation_msg)