def notify_to_run_job(jid):
    """Ask Almighty to launch job *jid*, at most once per job.

    A job already recorded in ``to_launch_jobs_already_treated`` is skipped.
    The job is recorded there only when ``tools.notify_almighty`` reports
    that something was sent, so after a failed notification the next call
    for the same job tries again.

    :param jid: identifier of the job to launch.
    """
    if jid in to_launch_jobs_already_treated:
        return

    # TODO: port OAR::IO::is_job_desktop_computing. Until then no job is
    # treated as a desktop computing job, so this branch is never taken.
    is_desktop_computing = False
    if is_desktop_computing:
        logger.debug(str(jid) + ": Desktop computing job, I don't handle it!")
        return

    nb_sent = tools.notify_almighty('OARRUNJOB_' + str(jid) + '\n')
    if nb_sent:
        to_launch_jobs_already_treated[jid] = 1
        logger.debug("Notify almighty to launch the job " + str(jid))
    else:
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning(
            "Not able to notify almighty to launch the job " + str(jid) + " (socket error)")
def set_job_state(jid, state):
    """Move job *jid* to *state* and run the actions tied to that state.

    The state is only written when the job is not already in ``Error``,
    ``Terminated`` or *state*; otherwise a warning is logged and nothing
    else happens.  On a successful update the open ``JobStateLog`` entry of
    the job is closed and a new one is inserted, then, depending on *state*:

    * ``Suspended`` / ``Resuming`` / ``Running``: the user is notified;
    * ``toLaunch``: the scheduler priority is updated ("+2", "START");
    * ``Terminated`` / ``Error``: stop time is fixed up if it precedes the
      start time, resources' last job date is updated, the user is
      notified, the scheduler priority is updated ("-2", "STOP"), the job
      is logged and Almighty receives "ChState".

    :param jid: identifier of the job to update.
    :param state: wanted job state (string).
    """
    # TODO
    # TODO Later: notify_user
    # TODO Later: update_current_scheduler_priority

    result = db.query(Job).filter(Job.id == jid)\
        .filter(Job.state != 'Error')\
        .filter(Job.state != 'Terminated')\
        .filter(Job.state != state)\
        .update({Job.state: state})
    db.commit()

    if result != 1:  # anything but exactly one updated row (OK for sqlite)
        logger.warning("Job is already termindated or in error or wanted state, job_id: " +
                       str(jid) + ", wanted state: " + state)
        return

    logger.debug(
        "Job state updated, job_id: " + str(jid) + ", wanted state: " + state)

    date = tools.get_date()

    # TODO: optimize job log
    # Close the currently open state-log entry, then open one for the new state.
    db.query(JobStateLog).filter(JobStateLog.date_stop == 0)\
        .filter(JobStateLog.job_id == jid)\
        .update({JobStateLog.date_stop: date})
    db.commit()
    req = db.insert(JobStateLog).values(
        {'job_id': jid, 'job_state': state, 'date_start': date})
    db.session.execute(req)

    if state not in ("Terminated", "Error", "toLaunch",
                     "Running", "Suspended", "Resuming"):
        return

    job = db.query(Job).filter(Job.id == jid).one()

    # The original compared against "Suspend", which can never match here
    # (the guard above only lets "Suspended" through); a suspended job was
    # therefore handled as if it had ended in error.
    if state == "Suspended":
        tools.notify_user(job, "SUSPENDED", "Job is suspended.")
    elif state == "Resuming":
        tools.notify_user(job, "RESUMING", "Job is resuming.")
    elif state == "Running":
        tools.notify_user(job, "RUNNING", "Job is running.")
    elif state == "toLaunch":
        update_current_scheduler_priority(job, "+2", "START")
    else:  # state is "Terminated" or "Error"
        if job.stop_time < job.start_time:
            db.query(Job).filter(Job.id == jid)\
                .update({Job.stop_time: job.start_time})
            db.commit()

        if job.assigned_moldable_job != "0":
            # Update last_job_date field for resources used
            update_scheduler_last_job_date(
                date, int(job.assigned_moldable_job))

        if state == "Terminated":
            tools.notify_user(job, "END", "Job stopped normally.")
        else:
            # Verify if the job was suspended and if the resource
            # property suspended is updated
            if job.suspended == "YES":
                r = get_current_resources_with_suspended_job()
                if r != ():
                    db.query(Resource).filter(~Resource.id.in_(r))\
                        .update({Resource.suspended_jobs: 'NO'})
                else:
                    db.query(Resource).update(
                        {Resource.suspended_jobs: 'NO'})
                db.commit()

            tools.notify_user(
                job, "ERROR", "Job stopped abnormally or an OAR error occured.")

        update_current_scheduler_priority(job, "-2", "STOP")

        # Here we must not be asynchronously with the scheduler
        log_job(job)
        # $dbh is valid so these 2 variables must be defined
        nb_sent = tools.notify_almighty("ChState")
        if nb_sent == 0:
            logger.warning("Not able to notify almighty to launch the job " +
                           str(job.id) + " (socket error)")
def meta_schedule(mode='internal', plt=Platform()):
    """Run one meta-scheduling pass and return its exit code.

    Steps, in order: initialise the gantt with running jobs, call the
    scheduler of every active queue (by decreasing priority), handle
    reservations, decide which jobs to launch / which besteffort jobs to
    kill, refresh the gantt visualisation, drive node sleep / wake-up
    (energy saving), handle ``Resuming`` jobs (not entirely implemented),
    notify interactive ``oarsub`` clients of start predictions, and finally
    process ``toError``, ``toAckReservation`` and ``toLaunch`` jobs.

    :param mode: ``'external'`` to call the external scheduler of each
        queue; any other value uses the internal scheduler.
    :param plt: platform object providing ``resource_set()``; it is also
        handed to the gantt initialisation and the schedulers.
    :returns: ``2`` when besteffort jobs must be killed or a reservation
        could not be acknowledged to ``oarsub``; ``1`` when an acknowledged
        reservation has ``start_time - 1 <= current time`` and the code was
        still ``0``; ``0`` otherwise.
    """
    # NOTE(review): the ``plt`` default is evaluated once, when the function
    # is defined, so calls relying on the default share one Platform object.
    exit_code = 0
    job_security_time = int(config['SCHEDULER_JOB_SECURITY_TIME'])

    if ('QUOTAS' in config) and (config['QUOTAS'] == 'yes'):
        if 'QUOTAS_FILE' not in config:
            config['QUOTAS_FILE'] = './quotas_conf.json'
        load_quotas_rules()

    tools.init_judas_notify_user()
    tools.create_almighty_socket()

    logger.debug(
        "Retrieve information for already scheduled reservations from "
        "database before flush (keep assign resources)")

    # reservation ??.

    initial_time_sec = tools.get_date()  # time.time()
    initial_time_sql = local_to_sql(initial_time_sec)

    current_time_sec = initial_time_sec
    current_time_sql = initial_time_sql

    gantt_init_results = gantt_init_with_running_jobs(plt, initial_time_sec,
                                                      job_security_time)
    all_slot_sets, scheduled_jobs, besteffort_rid2jid = gantt_init_results
    resource_set = plt.resource_set()

    # Path for user of external schedulers
    if 'OARDIR' in os.environ:
        binpath = os.environ['OARDIR'] + '/'
    else:
        binpath = '/usr/local/lib/oar/'
        logger.warning(
            "OARDIR env variable must be defined, " + binpath + " is used by default")

    for queue in db.query(Queue).order_by(text('priority DESC')).all():
        if queue.state != 'Active':
            continue

        logger.debug("Queue " + queue.name + ": Launching scheduler " +
                     queue.scheduler_policy + " at time " + initial_time_sql)

        if mode == 'external':  # pragma: no cover
            call_external_scheduler(binpath, scheduled_jobs, all_slot_sets,
                                    resource_set, job_security_time, queue,
                                    initial_time_sec, initial_time_sql)
        else:
            call_internal_scheduler(plt, scheduled_jobs, all_slot_sets,
                                    job_security_time, queue, initial_time_sec)

        handle_waiting_reservation_jobs(queue.name, resource_set,
                                        job_security_time, current_time_sec)
        # handle_new_AR_jobs
        check_reservation_jobs(
            plt, resource_set, queue.name, all_slot_sets, current_time_sec)

    jobs_to_launch, jobs_to_launch_lst, rid2jid_to_launch = get_gantt_jobs_to_launch(
        resource_set, job_security_time, current_time_sec)

    if check_besteffort_jobs_to_kill(jobs_to_launch, rid2jid_to_launch,
                                     current_time_sec, besteffort_rid2jid,
                                     resource_set) == 1:
        # We must kill some besteffort jobs
        tools.notify_almighty('ChState')
        exit_code = 2
    elif handle_jobs_to_launch(jobs_to_launch_lst, current_time_sec,
                               current_time_sql) == 1:
        exit_code = 0

    # Update visu gantt tables
    update_gantt_visualization()

    # Manage dynamic node feature
    flag_hulot = False
    timeout_cmd = int(config['SCHEDULER_TIMEOUT'])

    if ((('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config) or
         ((config['ENERGY_SAVING_INTERNAL'] == 'yes') and
          ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))) and
        (('SCHEDULER_NODE_MANAGER_SLEEP_TIME' in config) and
         ('SCHEDULER_NODE_MANAGER_IDLE_TIME' in config))):

        # Look at nodes that are unused for a duration
        idle_duration = int(config['SCHEDULER_NODE_MANAGER_IDLE_TIME'])
        sleep_duration = int(config['SCHEDULER_NODE_MANAGER_SLEEP_TIME'])

        idle_nodes = search_idle_nodes(current_time_sec)
        tmp_time = current_time_sec - idle_duration

        node_halt = []

        # The per-node value is compared against ``tmp_time``; a distinct
        # name avoids shadowing the configured ``idle_duration``.
        for node, node_idle_value in iteritems(idle_nodes):
            if node_idle_value < tmp_time:
                # Search if the node has enough time to sleep
                tmp = get_next_job_date_on_node(node)
                if (tmp is None) or (tmp - sleep_duration > current_time_sec):
                    # Search if node has not been woken up recently
                    wakeup_date = get_last_wake_up_date_of_node(node)
                    if (wakeup_date is None) or (wakeup_date < tmp_time):
                        node_halt.append(node)

        if node_halt != []:
            logger.debug("Powering off some nodes (energy saving): " + str(node_halt))
            # Using the built-in energy saving module to shut down nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('HALT', ' '.join(node_halt)):
                    logger.error("Communication problem with the energy saving module (Hulot)\n")
                flag_hulot = 1
            else:
                # Not using the built-in energy saving module to shut down nodes
                cmd = config['SCHEDULER_NODE_MANAGER_SLEEP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, node_halt):
                    logger.error("Command " + cmd + "timeouted (" + str(timeout_cmd) +
                                 "s) while trying to poweroff some nodes")

    if (('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config) or
        ((config['ENERGY_SAVING_INTERNAL'] == 'yes') and
         ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))):
        # Get nodes which the scheduler wants to schedule jobs to,
        # but which are in the Absent state, to wake them up
        wakeup_time = int(config['SCHEDULER_NODE_MANAGER_WAKEUP_TIME'])
        nodes = get_gantt_hostname_to_wake_up(current_time_sec, wakeup_time)

        if nodes != []:
            logger.debug("Awaking some nodes: " + str(nodes))
            # Using the built-in energy saving module to wake up nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('WAKEUP', ' '.join(nodes)):
                    logger.error("Communication problem with the energy saving module (Hulot)")
                flag_hulot = 1
            else:
                # Not using the built-in energy saving module to wake up nodes
                cmd = config['SCHEDULER_NODE_MANAGER_WAKE_UP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, nodes):
                    logger.error("Command " + cmd + "timeouted (" + str(timeout_cmd) +
                                 "s) while trying to wake-up some nodes ")

    # Send CHECK signal to Hulot if needed
    if not flag_hulot and (config['ENERGY_SAVING_INTERNAL'] == 'yes'):
        if kao_tools.send_to_hulot('CHECK', []):
            logger.error("Communication problem with the energy saving module (Hulot)")

    # Retrieve jobs according to their state and excluding job in 'Waiting' state.
    jobs_by_state = get_current_not_waiting_jobs()

    #
    # Search jobs to resume
    #

    #
    # TODO: TOFINISH
    #
    if 'Resuming' in jobs_by_state:
        logger.warning("Resuming job is NOT ENTIRELY IMPLEMENTED")
        for job in jobs_by_state['Resuming']:
            other_jobs = get_jobs_on_resuming_job_resources(job.id)
            # TODO : look for timesharing other jobs. What do we do?????
            if other_jobs == []:
                # We can resume the job
                logger.debug("[" + str(job.id) + "] Resuming job")
                if 'noop' in job.types:
                    resume_job_action(job.id)
                    logger.debug("[" + str(job.id) + "] Resume NOOP job OK")
                else:
                    script = config['JUST_BEFORE_RESUME_EXEC_FILE']
                    # int() never returns None, so the "is it configured?"
                    # test has to happen before the conversion for the
                    # default timeout to be reachable.
                    if ('SUSPEND_RESUME_SCRIPT_TIMEOUT' in config) and \
                            (config['SUSPEND_RESUME_SCRIPT_TIMEOUT'] is not None):
                        timeout = int(config['SUSPEND_RESUME_SCRIPT_TIMEOUT'])
                    else:
                        timeout = kao_tools.get_default_suspend_resume_script_timeout()
                    skip = 0
                    logger.debug("[" + str(job.id) + "] Running post suspend script: `" +
                                 script + " " + str(job.id) + "'")
                    # The job id is an argument of the script: keep the
                    # separating space announced in the debug message above.
                    cmd_str = script + " " + str(job.id)
                    return_code = -1
                    try:
                        return_code = call(cmd_str, shell=True, timeout=timeout)
                    except TimeoutExpired as e:
                        logger.error(str(e) + "[" + str(job.id) + "] Suspend script timeouted")
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, "Suspend script timeouted")
                    # return_code is still -1 after a timeout, so the job is
                    # also fragged in that case.
                    if return_code != 0:
                        str_error = "[" + str(job.id) + "] Suspend script error, return code = "\
                            + str(return_code)
                        logger.error(str_error)
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, str_error)
                        frag_job(job.id)
                        tools.notify_almighty('Qdel')
                        skip = 1

                    cpuset_nodes = None
                    if 'JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD' in config:
                        cpuset_field = config['JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD']
                    else:
                        cpuset_field = ""
                    if cpuset_field and (skip == 0):
                        # TODO
                        cpuset_name = job.user + "_" + str(job.id)
                        cpuset_nodes = get_cpuset_values(cpuset_field,
                                                         job.assigned_moldable_id)
                        # TODO
                        suspend_data_hash = {'name': cpuset_name,
                                             'job_id': job.id,
                                             'oarexec_pid_file':
                                             kao_tools.get_oar_pid_file_name(job.id)}
                    if cpuset_nodes:
                        # TODO: the values below are prepared but not used yet.
                        taktuk_cmd = config['TAKTUK_CMD']
                        if 'SUSPEND_RESUME_FILE' in config:
                            suspend_file = config['SUSPEND_RESUME_FILE']
                        else:
                            # TODO
                            suspend_file = kao_tools.get_default_suspend_resume_file()
    #
    # TODO: TOFINISH
    #

    # Notify oarsub -I when they will be launched
    for j_info in get_gantt_waiting_interactive_prediction_date():
        job_id, job_info_type, job_start_time, job_message = j_info
        addr, port = job_info_type.split(':')
        new_start_prediction = local_to_sql(job_start_time)
        logger.debug("[" + str(job_id) + "] Notifying user of the start prediction: " +
                     new_start_prediction + "(" + job_message + ")")
        tools.notify_tcp_socket(addr, port, "[" + initial_time_sql + "] Start prediction: " +
                                new_start_prediction + " (" + job_message + ")")

    # Run the decisions
    # Process "toError" jobs
    if 'toError' in jobs_by_state:
        for job in jobs_by_state['toError']:
            if job.type == 'INTERACTIVE' or\
               (job.type == 'PASSIVE' and job.reservation == 'Scheduled'):
                # Only parse the client address for jobs that are actually
                # notified, so an unusable info_type on any other job cannot
                # prevent it from being set to Error below.
                addr, port = job.info_type.split(':')
                logger.debug("Notify oarsub job (num:" + str(job.id) + ") in error; jobInfo=" +
                             job.info_type)

                nb_sent1 = tools.notify_tcp_socket(addr, port, job.message + '\n')
                nb_sent2 = tools.notify_tcp_socket(addr, port, 'BAD JOB' + '\n')
                if (nb_sent1 == 0) or (nb_sent2 == 0):
                    logger.warning(
                        "Cannot open connection to oarsub client for" + str(job.id))
            logger.debug("Set job " + str(job.id) + " to state Error")
            set_job_state(job.id, 'Error')

    # Process toAckReservation jobs
    if 'toAckReservation' in jobs_by_state:
        for job in jobs_by_state['toAckReservation']:
            addr, port = job.info_type.split(':')
            logger.debug(
                "Treate job" + str(job.id) + " in toAckReservation state")

            nb_sent = tools.notify_tcp_socket(addr, port, 'GOOD RESERVATION' + '\n')

            if nb_sent == 0:
                logger.warning(
                    "Frag job " + str(job.id) + ", I cannot notify oarsub for the reservation")
                add_new_event('CANNOT_NOTIFY_OARSUB', str(
                    job.id), "Can not notify oarsub for the job " + str(job.id))

                # TODO ???
                # OAR::IO::lock_table / OAR::IO::unlock_table($base)
                frag_job(job.id)

                exit_code = 2
            else:
                logger.debug("Notify oarsub for a RESERVATION (idJob=" +
                             str(job.id) + ") --> OK; jobInfo=" + job.info_type)
                set_job_state(job.id, 'Waiting')
                if ((job.start_time - 1) <= current_time_sec) and (exit_code == 0):
                    exit_code = 1

    # Process toLaunch jobs
    if 'toLaunch' in jobs_by_state:
        for job in jobs_by_state['toLaunch']:
            notify_to_run_job(job.id)

    logger.debug("End of Meta Scheduler")

    return exit_code
def cli(
    command,
    interactive,
    queue,
    resource,
    reservation,
    connect,
    type,
    checkpoint,
    property,
    resubmit,
    scanscript,
    project,
    signal,
    directory,
    name,
    after,
    notify,
    array,
    array_param_file,
    use_job_key,
    import_job_key_from_file,
    import_job_key_inline,
    export_job_key_to_file,
    stdout,
    stderr,
    hold,
    version,
):
    """Submit a job to OAR batch scheduler."""
    # The docstring above is left untouched on purpose: command-line
    # frameworks commonly reuse it as help text (the decorators, if any, are
    # not visible from this block).
    #
    # Overview: validate the option combination, build ``job_vars``, register
    # the job(s) through ``add_micheline_jobs``, print the job id(s), notify
    # Almighty, then -- for reservations and interactive jobs -- wait on a
    # local TCP server for the scheduler's answer.  Every error path goes
    # through ``sub_exit(<code>)``.
    config.setdefault_config(DEFAULT_CONFIG)
    # import pdb; pdb.set_trace()
    # print(resource)

    # When the walltime of a job is not defined
    # NOTE: several values read in this prologue (walltime, log buffers,
    # cpuset and ssh settings, binpath) are not used further down yet; they
    # are kept because the corresponding features are still marked TODO.
    default_job_walltime = str(config["DEFAULT_JOB_WALLTIME"])

    log_warning = ""  # TODO
    log_error = ""
    log_info = ""
    log_std = ""

    remote_host = config["SERVER_HOSTNAME"]
    remote_port = int(config["SERVER_PORT"])

    if "OARSUB_DEFAULT_RESOURCES" in config:
        default_resources = config["OARSUB_DEFAULT_RESOURCES"]
    else:
        default_resources = "/resource_id=1"

    if "OARSUB_NODES_RESOURCES" in config:
        nodes_resources = config["OARSUB_NODES_RESOURCES"]
    else:
        nodes_resources = "resource_id"

    # TODO Deploy_hostname / Cosystem_hostname
    # $Deploy_hostname = get_conf("DEPLOY_HOSTNAME");
    # if (!defined($Deploy_hostname)){
    #    $Deploy_hostname = $remote_host;
    # }

    # $Cosystem_hostname = get_conf("COSYSTEM_HOSTNAME");
    # if (!defined($Cosystem_hostname)){
    #    $Cosystem_hostname = $remote_host;
    # }

    cpuset_field = config["JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD"]
    cpuset_path = config["CPUSET_PATH"]

    if "OAR_RUNTIME_DIRECTORY" in config:
        pass
    # if (is_conf("OAR_RUNTIME_DIRECTORY")){
    #  OAR::Sub::set_default_oarexec_directory(get_conf("OAR_RUNTIME_DIRECTORY"));
    # }

    # my $default_oar_dir = OAR::Sub::get_default_oarexec_directory();
    # if (!(((-d $default_oar_dir) and (-O $default_oar_dir)) or (mkdir($default_oar_dir)))){
    #    die("# Error: failed to create the OAR directory $default_oar_dir, or bad permissions.\n");
    # }

    binpath = ""
    if "OARDIR" in os.environ:
        binpath = os.environ["OARDIR"] + "/"
    else:
        print_error("OARDIR environment variable is not defined.")
        sub_exit(1)

    openssh_cmd = config["OPENSSH_CMD"]
    ssh_timeout = int(config["OAR_SSH_CONNECTION_TIMEOUT"])

    # if (is_conf("OAR_SSH_CONNECTION_TIMEOUT")){
    #    OAR::Sub::set_ssh_timeout(get_conf("OAR_SSH_CONNECTION_TIMEOUT"));
    # }

    # OAR version
    # TODO: OAR is now a set of composition...

    #
    types = type

    properties = lstrip_none(property)
    if not directory:
        launching_directory = ""
    else:
        launching_directory = lstrip_none(directory)

    initial_request = " ".join(sys.argv[1:])
    queue_name = lstrip_none(queue)
    reservation_date = lstrip_none(reservation)

    if reservation_date:
        m = re.search(r"^\s*(\d{4}\-\d{1,2}\-\d{1,2})\s+(\d{1,2}:\d{1,2}:\d{1,2})\s*$", reservation)
        if m:
            reservation_date = sql_to_local(m.group(1) + " " + m.group(2))
        else:
            print_error(
                "syntax error for the advance reservation start date "
                'specification. Expected format is:"YYYY-MM-DD hh:mm:ss"'
            )
            sub_exit(7)

    if array:
        array_nb = array
    else:
        array_nb = 1

    # Check the default name of the key if we have to generate it
    # NOTE(review): this overwrites the ``use_job_key`` argument in both
    # branches, so the caller-supplied value is never used -- confirm whether
    # that is intended while job-key support is unfinished.
    if ("OARSUB_FORCE_JOB_KEY" in config) and config["OARSUB_FORCE_JOB_KEY"] == "yes":
        use_job_key = True
    else:
        use_job_key = False

    # TODO ssh_private_key, ssh_public_key,
    # ssh_private_key = ''
    # ssh_public_key = ''

    # TODO import_job_key_file, export_job_key_file
    import_job_key_file = ""
    export_job_key_file = ""

    if resubmit:
        print("# Resubmitting job ", resubmit, "...")
        ret = resubmit_job(resubmit)
        if ret > 0:
            job_id = ret
            print(" done.\n")
            print("OAR_JOB_ID=" + str(job_id))
            if signal_almighty(remote_host, remote_port, "Qsub") > 0:
                print_error(
                    "cannot connect to executor "
                    + str(remote_host)
                    + ":"
                    + str(remote_port)
                    + ". OAR server might be down."
                )
                sub_exit(3)
            else:
                sub_exit(0)
        else:
            print(" error.")
            if ret == -1:
                print_error("interactive jobs and advance reservations cannot be resubmitted.")
            elif ret == -2:
                print_error("only jobs in the Error or Terminated state can be resubmitted.")
            elif ret == -3:
                print_error("resubmitted job user mismatch.")
            elif ret == -4:
                print_error("another active job is using the same job key.")
            else:
                print_error("unknown error.")
            sub_exit(4)

    if not command and not interactive and not reservation and not connect:
        usage()
        sub_exit(5)

    if interactive and reservation:
        print_error("an advance reservation cannot be interactive.")
        usage()
        sub_exit(7)

    if interactive and any(re.match(r"^desktop_computing$", t) for t in type):
        print_error(" a desktop computing job cannot be interactive")
        usage()
        sub_exit(17)

    if any(re.match(r"^noop$", t) for t in type):
        if interactive:
            print_error("a NOOP job cannot be interactive.")
            sub_exit(17)
        elif connect:
            print_error("a NOOP job does not have a shell to connect to.")
            sub_exit(17)

    # notify : check insecure character
    # re.match needs the string to test; the original call omitted it and
    # raised TypeError as soon as ``notify`` was provided.
    if notify and re.match(r"^.*exec\s*:.+$", notify):
        m = re.search(r".*exec\s*:([a-zA-Z0-9_.\/ -]+)$", notify)
        if not m:
            print_error(
                "insecure characters found in the notification method "
                r"(the allowed regexp is: [a-zA-Z0-9_.\/ -]+)."
            )
            sub_exit(16)

    # TODO Connect to a reservation
    # Connect to a reservation
    # if (defined($connect_job)){
    # Do not kill the job if the user close the window
    #  $SIG{HUP} = 'DEFAULT';
    #  OAR::Sub::close_db_connection(); exit(connect_job($connect_job,0,$Openssh_cmd));
    # }

    if not project:
        project = DEFAULT_VALUE["project"]
    if not signal:
        signal = DEFAULT_VALUE["signal"]
    if not directory:
        directory = DEFAULT_VALUE["directory"]

    resource_request = parse_resource_descriptions(resource, default_resources, nodes_resources)

    job_vars = {
        "job_type": None,
        "resource_request": resource_request,
        "command": command,
        "info_type": None,
        "queue_name": queue_name,
        "properties": properties,
        "checkpoint": checkpoint,
        "signal": signal,
        "notify": notify,
        "name": name,
        "types": types,
        "launching_directory": launching_directory,
        "dependencies": after,
        "stdout": stdout,
        "stderr": stderr,
        "hold": hold,
        "project": project,
        "initial_request": initial_request,
        "user": os.environ["OARDO_USER"],
        "array_id": 0,
        "start_time": "0",
        "reservation_field": None,
    }

    # Bound before the passive/interactive split because both branches hand
    # it to add_micheline_jobs.
    array_params = []

    if not interactive and command:
        cmd_executor = "Qsub"

        if scanscript:
            # TODO scanscript
            pass

        if array_param_file:
            pass
        # TODO
        # $array_params_ref = OAR::Sub::read_array_param_file($array_param_file);
        # $array_nb = scalar @{$array_params_ref};

        if array_nb == 0:
            print_error("an array of job must have a number of sub-jobs greater than 0.")
            usage()
            sub_exit(6)

        job_vars["info_type"] = "$Host:$server_port"  # TODO "$Host:$server_port"
        job_vars["job_type"] = "PASSIVE"
        (err, job_id_lst) = add_micheline_jobs(
            job_vars,
            reservation_date,
            use_job_key,
            import_job_key_inline,
            import_job_key_file,
            export_job_key_file,
            initial_request,
            array_nb,
            array_params,
        )
    else:
        # TODO interactive
        if command:
            print_warning("asking for an interactive job (-I), so ignoring arguments: " + command + " .")

        cmd_executor = "Qsub -I"

        if array_param_file:
            print_error("a array job with parameters given in a file cannot be interactive.")
            usage()
            sub_exit(9)

        if array_nb != 1:
            print_error("an array job cannot be interactive.")
            usage()
            sub_exit(8)

        if reservation:
            # Test if this job is a reservation and the syntax is right
            # TODO Pass
            pass

        socket_server = init_tcp_server()
        (server, server_port) = socket_server.getsockname()
        job_vars["info_type"] = server + ":" + str(server_port)
        job_vars["job_type"] = "INTERACTIVE"
        (err, job_id_lst) = add_micheline_jobs(
            job_vars,
            reservation_date,
            use_job_key,
            import_job_key_inline,
            import_job_key_file,
            export_job_key_file,
            initial_request,
            array_nb,
            array_params,
        )

    # pdb.set_trace()
    if err != 0:
        print_error("command failed, please verify your syntax.")
        sub_exit(err, "")

    oar_array_id = 0

    # Print job_id list
    # Same "OAR_JOB_ID=<id>" form (no separating space) as the resubmit path.
    if len(job_id_lst) == 1:
        print("OAR_JOB_ID=" + str(job_id_lst[0]))
    else:
        job = db["Job"].query.filter(Job.id == job_id_lst[0]).one()
        oar_array_id = job.array_id
        for job_id in job_id_lst:
            print("OAR_JOB_ID=" + str(job_id))

    result = (job_id_lst, oar_array_id)

    # Notify Almigthy
    tools.create_almighty_socket()
    tools.notify_almighty(cmd_executor)

    # NOTE(review): ``socket_server`` only exists when the interactive branch
    # above was taken; a reservation submitted together with a command goes
    # through the passive branch and has no server to wait on -- TODO confirm
    # the intended flow for passive advance reservations.
    if reservation:
        # Reservation mode
        print_info("advance reservation request: waiting for approval from the scheduler...")
        (conn, address) = socket_server.accept()
        try:
            # recv() yields bytes; decode so the comparisons below are made
            # between text values.
            answer = conn.recv(1024).decode("utf-8", "replace")
        finally:
            conn.close()
        if answer[:-1] == "GOOD RESERVATION":
            print_info("advance reservation is GRANTED.")
        else:
            print_info("advance reservation is REJECTED ", answer[:-1])
            sub_exit(10)
    elif interactive:
        # Interactive mode
        print_info("interactive mode: waiting...")

        prev_str = ""
        while True:
            (conn, address) = socket_server.accept()
            try:
                answer = conn.recv(1024).decode("utf-8", "replace")
            finally:
                conn.close()
            # Drop the last character of the message, as the original did.
            answer = answer[:-1]

            m = re.search(r"\](.*)$", answer)
            if m and m.group(1) != prev_str:
                # Only print a bracket-prefixed message when its text changed.
                print_info(answer)
                prev_str = m.group(1)
            elif answer != "GOOD JOB":
                print_info(answer)

            if (
                (answer == "GOOD JOB")
                or (answer == "BAD JOB")
                or (answer == "JOB KILLE")
                or re.match(r"^ERROR", answer)
            ):
                break

        if answer == "GOOD JOB":
            # TODO exit(connect_job($Job_id_list_ref->[0],1,$Openssh_cmd));
            pass
        else:
            sub_exit(11)

    sub_exit(0, result)