def lambda_handler(event, context):
    bucket_name, ipst = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)
    if ipst == "all":
        print("Cleaning all datasets; removing all job resources (S3 files).")
        comm.messages.delete_literal("clean-all")  # don't interpret all as existing ipppssoots
        cleanup_ids = comm.ids("all")
        comm.messages.broadcast("clean", cleanup_ids)
    elif ipst == "ingested":
        # a variation of "all" restricted to datasets with an ingest message
        print("Cleaning all ingested datasets; removing all job resources (S3 files).")
        comm.messages.delete_literal("clean-ingested")  # don't interpret "ingested"
        cleanup_ids = comm.messages.ids("ingested")
        comm.messages.broadcast("clean", cleanup_ids)
    else:
        print("Cleaning", ipst)
        comm.clean(ipst)
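# For context (not part of the handler above): clean requests presumably arrive as
# zero-byte S3 "message" objects whose key suffix becomes `ipst`, given that the
# handler starts from s3.parse_s3_event(). A minimal sketch of requesting a full
# clean, assuming a "messages/" key prefix and a hypothetical bucket name -- both
# are assumptions, not confirmed by this module:
import boto3

s3_client = boto3.client("s3")
s3_client.put_object(Bucket="calcloud-processing", Key="messages/clean-all", Body=b"")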
def lambda_handler(event, context):
    bucket_name, ipst = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)
    comm.xdata.delete(ipst)  # biggest difference between "placed" and "rescue"
    # comm.messages.delete(f"placed-{ipst}")
    lambda_submit.main(comm, ipst, bucket_name)
def setup_job(event_basename, **overrides):
    """Set up the system as if the batch job corresponding to the `event_basename`
    event artifact has been submitted, first applying `overrides` to the nominal xdata.
    """
    event = conftest.load_event(event_basename)
    overrides = overrides or {}
    ipppssoot = event["detail"]["container"]["command"][1]
    comm = io.get_io_bundle()
    metadata = starting_metadata(overrides)
    comm.xdata.put(ipppssoot, metadata)
    return event, comm, ipppssoot, metadata
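# Hypothetical test usage of setup_job; the event artifact basename and override
# keys below are illustrative only, not taken from the actual test suite:
def test_memory_retry_state():
    event, comm, ipppssoot, metadata = setup_job(
        "batch-event-failed.yaml",  # hypothetical event artifact basename
        memory_retries=2,           # override applied to the nominal xdata
        terminated=False,
    )
    # setup_job stores the overridden metadata under the event's ipppssoot
    assert metadata["memory_retries"] == 2
    assert comm.xdata.get(ipppssoot) == metadata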
def lambda_handler(event, context):
    bucket_name, ipst = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)
    if ipst == "all":
        # Delete exactly the cancel-all message, not every ipppssoot
        comm.messages.delete_literal("cancel-all")
        # Cancel all jobs in a killable state broadcasting cancel over job_ids
        job_ids = batch.get_job_ids()
        comm.messages.broadcast("cancel", job_ids)
    elif batch.JOB_ID_RE.match(ipst):  # kill one job, ipst = job_id
        job_id, ipst = ipst, "unknown"
        print("Cancelling job_id", job_id)
        comm.messages.delete_literal(f"cancel-{job_id}")
        with log.trap_exception("Handling messages + control for", job_id):
            ipst = batch.get_job_name(job_id)
            print("Handling messages and control for", ipst)
            comm.messages.delete(f"all-{ipst}")
            comm.messages.put(f"terminated-{ipst}", "cancel lambda " + bucket_name)
            try:
                metadata = comm.xdata.get(ipst)
            except comm.xdata.client.exceptions.NoSuchKeyError:
                metadata = dict(job_id=job_id, cancel_type="job_id")
            metadata["terminated"] = True
            comm.xdata.put(ipst, metadata)
        # Do last so terminate flag is set if possible.
        print("Terminating", job_id)
        batch.terminate_job(job_id, "Operator cancelled")
    elif hst.IPPPSSOOT_RE.match(ipst):  # kill one ipppssoot
        print("Cancelling ipppssoot", ipst)
        comm.messages.delete(f"all-{ipst}")
        comm.messages.put(f"terminated-{ipst}", "cancel lambda " + bucket_name)
        metadata = comm.xdata.get(ipst)
        metadata["terminated"] = True
        metadata["cancel_type"] = "ipppssoot"
        comm.xdata.put(ipst, metadata)
        job_id = metadata["job_id"]
        with log.trap_exception("Terminating", job_id):
            print("Terminating", job_id)
            batch.terminate_job(job_id, "Operator cancelled")
    else:
        raise ValueError("Bad cancel ID", ipst)
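# Sketches of the dispatch patterns assumed by the handler above. The real
# definitions live in the batch and hst modules and may differ; these are
# assumptions for illustration only:
import re

# AWS Batch job ids are UUIDs, so a UUID pattern distinguishes them from dataset names
JOB_ID_RE = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$")
# An HST ipppssoot is a 9-character dataset identifier (simplified pattern)
IPPPSSOOT_RE = re.compile(r"^[a-z0-9]{9}$", re.IGNORECASE)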
def lambda_handler(event, context):
    bucket_name, ipst = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)
    if ipst == "all":
        print("Rescuing all")
        comm.messages.delete_literal("rescue-all")  # don't interpret all as existing ipppssoots
        rescues = comm.messages.ids(RESCUE_TYPES)
        comm.messages.broadcast("rescue", rescues)
    else:
        print("Rescuing", ipst)
        # comm.outputs.delete(ipst)
        lambda_submit.main(comm, ipst, bucket_name)
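# Assumed definition of RESCUE_TYPES, for illustration only; the actual module
# constant may list different message types. The intent is that a blanket
# "rescue-all" re-submits only datasets stuck in rescuable states:
RESCUE_TYPES = ["error", "terminated"]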
def lambda_handler(event, context):
    bucket_name, serial = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)
    if check_for_kill(comm, "Detected broadcast-kill on entry."):
        return
    broadcasted = comm.messages.pop(f"broadcast-{serial}")
    if len(broadcasted) > 100:
        # too many messages for one invocation: split into two new broadcasts
        serial1, serial2 = comm.messages.get_id(), comm.messages.get_id()
        comm.messages.put(f"broadcast-{serial1}", broadcasted[: len(broadcasted) // 2])
        comm.messages.put(f"broadcast-{serial2}", broadcasted[len(broadcasted) // 2 :])
    else:
        # send each message, re-checking for a kill every 10 puts
        for i, msg in enumerate(broadcasted):
            if not i % 10:
                if check_for_kill(comm, "Detected broadcast-kill in put loop"):
                    return
            comm.messages.put(msg)
def lambda_handler(event, context):
    bucket_name, serial = s3.parse_s3_event(event)
    comm = io.get_io_bundle(bucket_name)
    if check_for_kill(comm, "Detected broadcast-kill on entry."):
        return
    bmsg = comm.messages.pop(f"broadcast-{serial}")
    broadcasted, payload = bmsg["messages"], bmsg["payload"]
    if len(broadcasted) > 100:
        # split broadcast into two new broadcasts
        comm.messages.bifurcate_broadcast(broadcasted, payload)
    else:
        # iteratively send payload to each message in broadcasted
        for i, msg in enumerate(broadcasted):
            if not i % 10:
                if check_for_kill(comm, "Detected broadcast-kill in put loop"):
                    return
            comm.messages.put(msg, payload)
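# A plausible implementation of bifurcate_broadcast, modeled directly on the
# inline split in the earlier handler above; the real method may differ. Each
# half becomes a fresh broadcast message carrying the same payload:
def bifurcate_broadcast(self, broadcasted, payload):
    """Split an oversized broadcast into two new broadcast messages."""
    serial1, serial2 = self.get_id(), self.get_id()
    half = len(broadcasted) // 2
    self.put(f"broadcast-{serial1}", dict(messages=broadcasted[:half], payload=payload))
    self.put(f"broadcast-{serial2}", dict(messages=broadcasted[half:], payload=payload))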
def lambda_handler(event, context):
    print(event)
    detail = event["detail"]
    job_id = detail["jobId"]
    job_name = detail["jobName"]  # appears to be ipppssoot
    status_reason = detail.get("statusReason", "undefined")
    container = event["detail"]["container"]
    ipppssoot = container["command"][1]
    bucket = container["command"][2].split("/")[2]
    container_reason = container.get("reason", "undefined")
    exit_code = container.get("exitCode", "undefined")
    exit_reason = exit_codes.explain(exit_code) if exit_code != "undefined" else exit_code
    comm = io.get_io_bundle(bucket)
    metadata = comm.xdata.get(ipppssoot)
    metadata["ipppssoot"] = ipppssoot
    metadata["bucket"] = bucket
    metadata["job_id"] = job_id
    metadata["job_name"] = job_name
    metadata["exit_code"] = exit_code
    metadata["exit_reason"] = exit_reason
    metadata["status_reason"] = status_reason
    metadata["container_reason"] = container_reason
    if exit_reason != "undefined":
        combined_reason = exit_reason
    elif container_reason != "undefined":
        combined_reason = container_reason
    else:
        combined_reason = status_reason
    continuation_msg = "error-" + ipppssoot
    if exit_codes.is_memory_error(exit_code) or container_reason.startswith("OutOfMemoryError: Container killed"):
        if not metadata["terminated"] and metadata["memory_retries"] < int(os.environ["MAX_MEMORY_RETRIES"]):
            metadata["memory_retries"] += 1
            continuation_msg = "rescue-" + ipppssoot
            print("Automatic OutOfMemory rescue of", ipppssoot, "with memory retry count", metadata["memory_retries"])
        else:
            print("Automatic OutOfMemory retries for", ipppssoot, "exhausted at", metadata["memory_retries"])
    elif container_reason.startswith("CannotInspectContainer"):
        if not metadata["terminated"] and metadata["retries"] < int(os.environ["MAX_DOCKER_RETRIES"]):
            metadata["retries"] += 1
            continuation_msg = "rescue-" + ipppssoot
            print("Automatic CannotInspectContainer rescue for", ipppssoot, "with retry count", metadata["retries"])
        else:
            print("Automatic CannotInspectContainer retries for", ipppssoot, "exhausted at", metadata["retries"])
    elif container_reason.startswith("DockerTimeoutError"):
        if not metadata["terminated"] and metadata["retries"] < int(os.environ["MAX_DOCKER_RETRIES"]):
            metadata["retries"] += 1
            continuation_msg = "rescue-" + ipppssoot
            print("Automatic DockerTimeoutError rescue for", ipppssoot, "with retry count", metadata["retries"])
        else:
            print("Automatic DockerTimeoutError retries for", ipppssoot, "exhausted at", metadata["retries"])
    elif status_reason.startswith("Operator cancelled"):
        print("Operator cancelled job", job_id, "for", ipppssoot, "no automatic retry.")
        continuation_msg = "terminated-" + ipppssoot
    else:
        print("Failure for", ipppssoot, "no automatic retry for", combined_reason)
    # XXXX Since retry count used in planning, control output must precede rescue message
    print(metadata)
    comm.xdata.put(ipppssoot, metadata)
    comm.messages.delete("all-" + ipppssoot)
    comm.messages.put(continuation_msg)
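# A hedged refactoring sketch of the three near-identical retry branches above;
# the helper name and signature are hypothetical, not part of this module. It
# bumps the appropriate retry counter and returns a rescue message while retries
# remain, mirroring the inline logic in the handler:
def _maybe_rescue(metadata, ipppssoot, retry_key, max_env_var, label):
    """Return a rescue continuation message if retries remain, else None."""
    if not metadata["terminated"] and metadata[retry_key] < int(os.environ[max_env_var]):
        metadata[retry_key] += 1
        print("Automatic", label, "rescue of", ipppssoot, "with retry count", metadata[retry_key])
        return "rescue-" + ipppssoot
    print("Automatic", label, "retries for", ipppssoot, "exhausted at", metadata[retry_key])
    return None

# e.g. continuation = _maybe_rescue(metadata, ipppssoot, "retries", "MAX_DOCKER_RETRIES", "DockerTimeoutError")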