import datetime
import hashlib
import json
import os
import re
import socket
import sys
import traceback
from argparse import ArgumentParser
from importlib import import_module
from time import sleep

import yaml
from bson import ObjectId
from pymongo import ASCENDING, DESCENDING

# `mongo`, `update_job`, `barrier`, `clone_directory`, `download_code_package`,
# `IntervalTimer`, `MongoLogChannel`, `FileLogChannel`, and `MultiLogChannel`
# are provided elsewhere in this package.

# Set to True once this worker starts shutting down; read by the heartbeat thread.
is_stopping = False


def main():
    parser = ArgumentParser()
    parser.add_argument(
        "job_id", nargs="+", help="List of job ids. Use 'any' to do any work that is left"
    )
    parser.add_argument(
        "--queue-mode",
        "-q",
        default=False,
        action="store_true",
        help='Queue mode: pick a job with status "CREATED" from the job_id list',
    )
    parser.add_argument(
        "--min-worker-count",
        type=int,
        default=None,
        help=(
            "Only consider jobs that request at least this many workers. "
            "Only applicable in queue mode."
        ),
    )
    parser.add_argument(
        "--mpi", default=False, action="store_true", help="Derive rank and world_size from MPI"
    )
    args = parser.parse_args()

    # Retrieve the job description
    if args.queue_mode:
        query = {
            "$expr": {"$lt": ["$registered_workers", "$n_workers"]},
            "status": {"$in": ["SCHEDULED", "CREATED"]},
        }
        if args.min_worker_count is not None:
            query["n_workers"] = {"$gte": args.min_worker_count}
        if args.job_id != ["any"]:
            query["_id"] = {"$in": [ObjectId(id) for id in args.job_id]}
        # Poll until a job becomes available. A loop (rather than recursion)
        # avoids growing the call stack while the queue is empty.
        while True:
            job = mongo.job.find_one_and_update(
                query,
                update={
                    "$set": {"status": "SCHEDULED", "schedule_time": datetime.datetime.utcnow()},
                    "$inc": {"registered_workers": 1},
                },
                sort=[("priority", DESCENDING), ("creation_time", ASCENDING)],
            )
            if job is not None:
                break
            print("Queue is empty. Waiting for a task.")
            sleep(10)
        job_id = str(job["_id"])
    else:
        job = mongo.job.find_one_and_update(
            {
                "_id": ObjectId(args.job_id[0]),
                "$expr": {"$lt": ["$registered_workers", "$n_workers"]},
                "status": {"$in": ["SCHEDULED", "CREATED"]},
            },
            update={
                "$set": {"status": "SCHEDULED", "schedule_time": datetime.datetime.utcnow()},
                "$inc": {"registered_workers": 1},
            },
            sort=[("priority", DESCENDING), ("creation_time", ASCENDING)],
        )
        if job is None:
            print("Job not found / nothing to do.")
            sys.exit(0)
        job_id = str(job["_id"])

    if not args.mpi:
        # find_one_and_update returned the document from *before* the $inc,
        # so `registered_workers` yields a unique rank in [0, n_workers).
        rank = job["registered_workers"]
        n_workers = job["n_workers"]
    else:
        rank = int(os.getenv("OMPI_COMM_WORLD_RANK", os.getenv("PMIX_RANK")))
        n_workers = int(os.getenv("OMPI_COMM_WORLD_SIZE", os.getenv("SLURM_NTASKS")))

    # Create an output directory
    output_dir = os.path.join(
        job["project"], job["experiment"], job["job"] + "_" + str(job["_id"])[-6:]
    )
    output_dir_abs = os.path.join(os.getenv("JOBMONITOR_RESULTS_DIR"), output_dir)
    os.makedirs(output_dir_abs, exist_ok=True)
    code_dir = os.path.join(output_dir_abs, "code")

    # Copy the files to run into the output directory
    if rank == 0:
        clone_info = job["environment"]["clone"]
        if "path" in clone_info:
            # fill in any environment variables used in the path
            clone_from = re.sub(
                r"""\$([\w_]+)""", lambda match: os.getenv(match.group(1)), clone_info["path"]
            )
            clone_directory(clone_from, code_dir)
        elif "code_package" in clone_info:
            download_code_package(clone_info["code_package"], code_dir)
        else:
            raise ValueError('Unsupported clone config. Expected "path" or "code_package".')

    # Store hostname and pid so we can find things later
    update_job(
        job_id, {f"workers.{rank}.host": socket.gethostname(), f"workers.{rank}.pid": os.getpid()}
    )

    # Wait for all the workers to reach this point
    barrier("jobstart", job_id, n_workers, desired_statuses=["SCHEDULED", "RUNNING"])

    # The output directory is not always visible on all workers right away
    # (e.g. on a networked file system), so give it a moment to appear.
    sleep(1)

    # Set job to 'RUNNING' in MongoDB
    if rank == 0:
        update_job(
            job_id,
            {
                "host": socket.gethostname(),
                "status": "RUNNING",
                "start_time": datetime.datetime.utcnow(),
                "output_dir": output_dir,
            },
        )

    def side_thread_fn():
        global is_stopping
        if is_stopping:
            return
        # Check the job's status and self-destruct if it was canceled or deleted
        res = mongo.job.find_one({"_id": ObjectId(job_id)}, {"status": 1})
        if res is None or res["status"] not in ["SCHEDULED", "RUNNING", "FINISHED"]:
            status = res["status"] if res is not None else "DELETED"
            print(
                f"Job status changed to {status}. This worker will self-destruct.",
                file=sys.stderr,
            )
            os._exit(1)
            # os.system("kill %d" % os.getpid())
            is_stopping = True
        # Update the worker's heartbeat
        update_job(
            job_id,
            {
                "last_heartbeat_time": datetime.datetime.utcnow(),
                f"workers.{rank}.last_heartbeat_time": datetime.datetime.utcnow(),
            },
            w=0,
        )

    # Start sending regular heartbeat updates to the db
    # and check that the job isn't getting canceled
    side_thread_stop, side_thread = IntervalTimer.create(side_thread_fn, 10)
    side_thread.start()

    # Keep handles to the original streams (and a logfile placeholder) so the
    # `finally` block below can restore and close them even on early failure.
    orig_stdout = sys.stdout
    orig_stderr = sys.stderr
    logfile = None

    try:
        # Change into the code directory and make it importable
        os.chdir(code_dir)
        sys.path.insert(0, code_dir)

        # Rewire stdout and stderr to write to the output file
        if rank == 0:
            logfile_path = os.path.join(output_dir_abs, "output.txt")
        else:
            logfile_path = os.path.join(output_dir_abs, f"output.worker{rank}.txt")
        logfile = open(logfile_path, "a")
        print("Starting. Output piped to {}".format(logfile_path))
        sys.stdout = MultiLogChannel(
            MongoLogChannel(mongo.job, job_id, tags={"worker": rank, "type": "info"}),
            sys.stdout,
            FileLogChannel(logfile),
        )
        sys.stderr = MultiLogChannel(
            MongoLogChannel(mongo.job, job_id, tags={"worker": rank, "type": "error"}),
            sys.stderr,
            FileLogChannel(logfile),
        )
        print("cwd: {}".format(code_dir))

        # Import the script specified in the job's environment.
        # Note: `.strip(".py")` would strip *characters*, not the suffix
        # ("copy.py" -> "cop"), so remove the extension explicitly.
        script_name = job["environment"]["script"]
        if script_name.endswith(".py"):
            script_name = script_name[: -len(".py")]
        script = import_module(script_name)

        # Override non-default config parameters
        for key, value in job.get("config", {}).items():
            script.config[key] = value
        script.config["rank"] = rank
        script.config["n_workers"] = n_workers
        script.config["distributed_init_file"] = os.path.join(output_dir_abs, "dist_init")

        # Give the script access to all logging facilities
        def log_info(info_dict):
            update_job(job_id, info_dict, w=0)

        # Allows the script to register images
        def log_image(key: str, path: str):
            # Store paths relative to the output directory
            if path.startswith(output_dir_abs):
                path = path[len(output_dir_abs) + 1 :]
            update_job(job_id, {f"images.{key}": path}, w=0)

        def log_runtime(event, mean_time, std, instances):
            event = event.replace(".", "_")  # dots are not allowed in MongoDB keys
            update_job(
                job_id,
                {
                    f"timings.{event}.{rank}": {
                        "mean": mean_time,
                        "std": std,
                        "instances": instances,
                    }
                },
                w=0,
            )

        # keep track of which metrics already got an entry in MongoDB
        metrics_created_so_far = set()

        def log_metric(measurement, value, tags=None):
            # Log the metric to MongoDB
            tags = tags or {}
            if not isinstance(value, dict):
                value = {"value": value}
            values = {"time": datetime.datetime.utcnow(), **value}
            key_dict = {"measurement": measurement, **tags}
            if n_workers > 1:
                key_dict["worker"] = rank
            key_hash = hashlib.md5(
                json.dumps(key_dict, sort_keys=True).encode("utf-8")
            ).hexdigest()
            if key_hash not in metrics_created_so_far:
                metrics_created_so_far.add(key_hash)
                mongo.job.update(
                    {"_id": ObjectId(job_id)},
                    {"$push": {"metrics": {**key_dict, "id": key_hash}}},
                    w=0,
                )
            mongo.job.update(
                {"_id": ObjectId(job_id)}, {"$push": {f"metric_data.{key_hash}": values}}, w=0
            )

        script.log_info = log_info
        script.log_image = log_image
        script.output_dir = output_dir_abs
        script.log_metric = log_metric
        script.log_runtime = log_runtime

        if rank == 0:
            # Store the effective config used in the database
            update_job(job_id, {"config": dict(script.config)})
            # and in the output directory, just to be sure
            with open(os.path.join(output_dir_abs, "config.yml"), "w") as fp:
                yaml.dump(dict(script.config), fp, default_flow_style=False)

        # Run the task
        script.main()

        # Finished successfully
        print("Job finished successfully")
        if rank == 0:
            update_job(job_id, {"status": "FINISHED", "end_time": datetime.datetime.utcnow()})

    # BaseException so that KeyboardInterrupt / SystemExit are caught as well
    # and recorded as CANCELED (a bare `except Exception` would let them
    # bypass this handler entirely).
    except BaseException as e:
        error_message = traceback.format_exc()
        print(error_message, file=sys.stderr)
        if isinstance(e, (KeyboardInterrupt, SystemExit)):
            status = "CANCELED"
        else:
            status = "FAILED"
        update_job(
            job_id,
            {
                "status": status,
                "end_time": datetime.datetime.utcnow(),
                "exception": repr(e),
                "traceback": error_message,
                "exception_worker": rank,
            },
        )
        sys.exit(1)
    finally:
        global is_stopping
        is_stopping = True  # Stop the heartbeat thread
        if logfile is not None:
            logfile.close()
        sys.stdout = orig_stdout
        sys.stderr = orig_stderr
        side_thread_stop.set()
        side_thread.join(timeout=1)
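# ---------------------------------------------------------------------------
# For reference only: hypothetical sketches of two helpers used above,
# inferred from their call sites. They are assumptions, not this package's
# actual implementations; the real helpers may differ.
# ---------------------------------------------------------------------------
import threading

from pymongo import WriteConcern


class IntervalTimerSketch:
    """Matches the call site `stop_event, thread = IntervalTimer.create(fn, seconds)`."""

    @staticmethod
    def create(fn, interval_seconds):
        stop_event = threading.Event()

        def loop():
            # Invoke `fn` every `interval_seconds` until the stop event is set.
            while not stop_event.wait(interval_seconds):
                fn()

        # The thread is returned unstarted; the caller starts it explicitly,
        # as main() does above.
        thread = threading.Thread(target=loop, daemon=True)
        return stop_event, thread


def update_job_sketch(job_id, fields, w=1):
    # Sketch of `update_job(job_id, fields, w=0)`: a dot-notation $set on the
    # job document. `w=0` requests a fire-and-forget (unacknowledged) write.
    collection = mongo.job.with_options(write_concern=WriteConcern(w=w))
    collection.update_one({"_id": ObjectId(job_id)}, {"$set": fields})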
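# Entry point. This guard is assumed wiring; the surrounding package may
# instead expose `main` through a console-script entry point.
if __name__ == "__main__":
    main()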