Example #1
0
def meta_schedule(mode='internal', plt=Platform()):
    """Run one pass of the meta-scheduler and return its exit code.

    The pass performs, in order:

    1. Optional loading of the quotas rules (when ``QUOTAS`` is ``'yes'``).
    2. Initialisation of the gantt from the jobs already running/scheduled.
    3. For every queue in the ``Active`` state, by decreasing priority: call
       of the queue scheduler (external or internal), then handling of the
       waiting and of the new advance-reservation jobs.
    4. Search of best-effort jobs to kill, otherwise handling of the jobs to
       launch.
    5. Update of the gantt visualisation tables.
    6. Energy saving: halting of idle nodes and waking up of needed nodes,
       through Hulot or through the configured commands.
    7. Resuming of jobs (NOT ENTIRELY IMPLEMENTED, see the TODO markers).
    8. Notification of the start prediction to the waiting interactive jobs.
    9. Processing of the jobs in the ``toError``, ``toAckReservation`` and
       ``toLaunch`` states.

    :param mode: ``'external'`` calls ``call_external_scheduler`` for each
        queue; any other value calls ``call_internal_scheduler``.
    :param plt: platform object handed to the gantt/scheduling helpers. The
        default instance is created once, when the function is defined.
    :return: ``2`` when best-effort jobs must be killed or when a job was
        fragged because oarsub could not be notified of its reservation;
        ``1`` when an acknowledged reservation starts no later than one
        second after the current time (and no other code was set); ``0``
        otherwise.
    """

    exit_code = 0

    job_security_time = int(config['SCHEDULER_JOB_SECURITY_TIME'])

    if ('QUOTAS' in config) and (config['QUOTAS'] == 'yes'):
        if 'QUOTAS_FILE' not in config:
            config['QUOTAS_FILE'] = './quotas_conf.json'
        load_quotas_rules()

    tools.init_judas_notify_user()
    tools.create_almighty_socket()

    # Adjacent literals instead of a backslash continuation inside the string,
    # which used to embed the indentation in the logged message.
    logger.debug(
        "Retrieve information for already scheduled reservations from "
        "database before flush (keep assign resources)")

    # reservation ??.

    initial_time_sec = tools.get_date()  # time.time()
    initial_time_sql = local_to_sql(initial_time_sec)

    current_time_sec = initial_time_sec
    current_time_sql = initial_time_sql

    gantt_init_results = gantt_init_with_running_jobs(plt, initial_time_sec,
                                                      job_security_time)
    all_slot_sets, scheduled_jobs, besteffort_rid2jid = gantt_init_results
    resource_set = plt.resource_set()

    # Path for user of external schedulers
    if 'OARDIR' in os.environ:
        binpath = os.environ['OARDIR'] + '/'
    else:
        binpath = '/usr/local/lib/oar/'
        logger.warning(
            "OARDIR env variable must be defined, " + binpath + " is used by default")

    for queue in db.query(Queue).order_by(text('priority DESC')).all():

        if queue.state == 'Active':
            logger.debug("Queue " + queue.name + ": Launching scheduler " +
                         queue.scheduler_policy + " at time " + initial_time_sql)

            if mode == 'external':  # pragma: no cover
                call_external_scheduler(binpath, scheduled_jobs, all_slot_sets,
                                        resource_set, job_security_time, queue,
                                        initial_time_sec, initial_time_sql)
            else:
                call_internal_scheduler(plt, scheduled_jobs, all_slot_sets,
                                        job_security_time, queue, initial_time_sec)

            handle_waiting_reservation_jobs(queue.name, resource_set,
                                            job_security_time, current_time_sec)

            # handle_new_AR_jobs
            check_reservation_jobs(
                plt, resource_set, queue.name, all_slot_sets, current_time_sec)

    jobs_to_launch, jobs_to_launch_lst, rid2jid_to_launch = get_gantt_jobs_to_launch(
        resource_set, job_security_time, current_time_sec)

    if check_besteffort_jobs_to_kill(jobs_to_launch, rid2jid_to_launch,
                                     current_time_sec, besteffort_rid2jid,
                                     resource_set) == 1:
        # We must kill some besteffort jobs
        tools.notify_almighty('ChState')
        exit_code = 2
    elif handle_jobs_to_launch(jobs_to_launch_lst, current_time_sec, current_time_sql) == 1:
        exit_code = 0

    # Update visu gantt tables
    update_gantt_visualization()

    # Manage dynamic node feature
    flag_hulot = False
    timeout_cmd = int(config['SCHEDULER_TIMEOUT'])

    if ((('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config) or
         ((config['ENERGY_SAVING_INTERNAL'] == 'yes') and
          ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))) and
        (('SCHEDULER_NODE_MANAGER_SLEEP_TIME' in config)
         and ('SCHEDULER_NODE_MANAGER_IDLE_TIME' in config))):

        # Look at nodes that are unused for a duration
        idle_duration = int(config['SCHEDULER_NODE_MANAGER_IDLE_TIME'])
        sleep_duration = int(config['SCHEDULER_NODE_MANAGER_SLEEP_TIME'])

        idle_nodes = search_idle_nodes(current_time_sec)
        tmp_time = current_time_sec - idle_duration

        node_halt = []

        # The per-node value is compared with a date (tmp_time); it gets its
        # own name so that it no longer shadows the configured idle_duration.
        for node, idle_date in iteritems(idle_nodes):
            if idle_date < tmp_time:
                # Search if the node has enough time to sleep
                tmp = get_next_job_date_on_node(node)
                if (tmp is None) or (tmp - sleep_duration > current_time_sec):
                    # Search if node has not been woken up recently
                    wakeup_date = get_last_wake_up_date_of_node(node)
                    if (wakeup_date is None) or (wakeup_date < tmp_time):
                        node_halt.append(node)

        if node_halt:
            logger.debug("Powering off some nodes (energy saving): " + str(node_halt))
            # Using the built-in energy saving module to shut down nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('HALT', ' '.join(node_halt)):
                    logger.error("Communication problem with the energy saving module (Hulot)\n")
                flag_hulot = True
            else:
                # Not using the built-in energy saving module to shut down nodes
                cmd = config['SCHEDULER_NODE_MANAGER_SLEEP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, node_halt):
                    logger.error("Command " + cmd + " timeouted (" + str(timeout_cmd)
                                 + "s) while trying to  poweroff some nodes")

    if (('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config) or
        ((config['ENERGY_SAVING_INTERNAL'] == 'yes') and
         ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))):
        # Get nodes which the scheduler wants to schedule jobs to,
        # but which are in the Absent state, to wake them up
        wakeup_time = int(config['SCHEDULER_NODE_MANAGER_WAKEUP_TIME'])
        nodes = get_gantt_hostname_to_wake_up(current_time_sec, wakeup_time)

        if nodes != []:
            logger.debug("Awaking some nodes: " + str(nodes))
            # Using the built-in energy saving module to wake up nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('WAKEUP', ' '.join(nodes)):
                    logger.error("Communication problem with the energy saving module (Hulot)")
                flag_hulot = True
            else:
                # Not using the built-in energy saving module to wake up nodes
                cmd = config['SCHEDULER_NODE_MANAGER_WAKE_UP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, nodes):
                    logger.error("Command " + cmd + " timeouted (" + str(timeout_cmd)
                                 + "s) while trying to wake-up some nodes ")

    # Send CHECK signal to Hulot if needed
    if not flag_hulot and (config['ENERGY_SAVING_INTERNAL'] == 'yes'):
        if kao_tools.send_to_hulot('CHECK', []):
            logger.error("Communication problem with the energy saving module (Hulot)")

    # Retrieve jobs according to their state and excluding job in 'Waiting' state.
    jobs_by_state = get_current_not_waiting_jobs()

    #
    # Search jobs to resume
    #

    #
    # TODO: TOFINISH
    #
    if 'Resuming' in jobs_by_state:
        logger.warning("Resuming job is NOT ENTIRELY IMPLEMENTED")
        for job in jobs_by_state['Resuming']:
            other_jobs = get_jobs_on_resuming_job_resources(job.id)
            # TODO : look for timesharing other jobs. What do we do?????
            if other_jobs == []:
                # We can resume the job
                logger.debug("[" + str(job.id) + "] Resuming job")

                # Reset for every job: the flag is read below for noop jobs
                # too, and must not leak from a previous iteration.
                skip = 0

                if 'noop' in job.types:
                    resume_job_action(job.id)
                    logger.debug("[" + str(job.id) + "] Resume NOOP job OK")
                else:
                    script = config['JUST_BEFORE_RESUME_EXEC_FILE']

                    # Fall back to the default timeout when the option is
                    # absent or unset, instead of failing in int().
                    timeout = None
                    if 'SUSPEND_RESUME_SCRIPT_TIMEOUT' in config:
                        timeout = config['SUSPEND_RESUME_SCRIPT_TIMEOUT']
                    if timeout is None:
                        timeout = kao_tools.get_default_suspend_resume_script_timeout()
                    else:
                        timeout = int(timeout)

                    # The job id is an argument of the script: keep the
                    # command that is run identical to the one that is logged.
                    cmd_str = script + " " + str(job.id)
                    logger.debug("[" + str(job.id) + "] Running post suspend script: `" +
                                 cmd_str + "'")
                    return_code = -1
                    try:
                        return_code = call(cmd_str, shell=True, timeout=timeout)
                    except TimeoutExpired as e:
                        logger.error(str(e) + "[" + str(job.id) + "] Suspend script timeouted")
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, "Suspend script timeouted")
                    if return_code != 0:
                        str_error = "[" + str(job.id) + "] Suspend script error, return code = "\
                                    + str(return_code)
                        logger.error(str_error)
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, str_error)
                        frag_job(job.id)
                        tools.notify_almighty('Qdel')
                    # NOTE(review): set unconditionally, so the cpuset part
                    # below never runs for non-noop jobs; it may have been
                    # meant for the error branch only -- confirm before moving.
                    skip = 1

                cpuset_nodes = None
                if 'JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD' in config:
                    cpuset_field = config['JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD']
                else:
                    cpuset_field = ""
                if cpuset_field and (skip == 0):
                    # TODO
                    cpuset_name = job.user + "_" + str(job.id)
                    cpuset_nodes = get_cpuset_values(cpuset_field,
                                                     job.assigned_moldable_id)
                    # TODO
                    suspend_data_hash = {'name': cpuset_name,
                                         'job_id': job.id,
                                         'oarexec_pid_file':
                                         kao_tools.get_oar_pid_file_name(job.id)}
                if cpuset_nodes:
                    # TODO
                    taktuk_cmd = config['TAKTUK_CMD']
                    if 'SUSPEND_RESUME_FILE' in config:
                        suspend_file = config['SUSPEND_RESUME_FILE']
                    else:
                        # TODO
                        suspend_file = kao_tools.get_default_suspend_resume_file()

    #
    # TODO: TOFINISH
    #

    # Notify oarsub -I when they will be launched
    for j_info in get_gantt_waiting_interactive_prediction_date():
        job_id, job_info_type, job_start_time, job_message = j_info
        addr, port = job_info_type.split(':')
        new_start_prediction = local_to_sql(job_start_time)
        logger.debug("[" + str(job_id) + "] Notifying user of the start prediction: " +
                     new_start_prediction + " (" + job_message + ")")
        tools.notify_tcp_socket(addr, port, "[" + initial_time_sql + "] Start prediction: " +
                                new_start_prediction + " (" + job_message + ")")

    # Run the decisions
    # Process "toError" jobs
    if 'toError' in jobs_by_state:
        for job in jobs_by_state['toError']:
            if job.type == 'INTERACTIVE' or\
               (job.type == 'PASSIVE' and job.reservation == 'Scheduled'):
                logger.debug("Notify oarsub job (num:" + str(job.id) + ") in error; jobInfo=" +
                             job.info_type)

                # The address is only needed for the jobs that are notified;
                # parsing it here keeps a job with an unusable info_type from
                # blocking its own transition to the Error state.
                addr, port = job.info_type.split(':')
                nb_sent1 = tools.notify_tcp_socket(addr, port, job.message + '\n')
                nb_sent2 = tools.notify_tcp_socket(addr, port, 'BAD JOB' + '\n')
                if (nb_sent1 == 0) or (nb_sent2 == 0):
                    logger.warning(
                        "Cannot open connection to oarsub client for " + str(job.id))
            logger.debug("Set job " + str(job.id) + " to state Error")
            set_job_state(job.id, 'Error')

    # Process toAckReservation jobs
    if 'toAckReservation' in jobs_by_state:
        for job in jobs_by_state['toAckReservation']:
            addr, port = job.info_type.split(':')
            logger.debug(
                "Treate job " + str(job.id) + " in toAckReservation state")

            nb_sent = tools.notify_tcp_socket(addr, port, 'GOOD RESERVATION' + '\n')

            if nb_sent == 0:
                logger.warning(
                    "Frag job " + str(job.id) + ", I cannot notify oarsub for the reservation")
                add_new_event('CANNOT_NOTIFY_OARSUB', str(
                    job.id), "Can not notify oarsub for the job " + str(job.id))

                # TODO ???
                # OAR::IO::lock_table / OAR::IO::unlock_table($base)
                frag_job(job.id)

                exit_code = 2
            else:
                logger.debug("Notify oarsub for a RESERVATION (idJob=" +
                             str(job.id) + ") --> OK; jobInfo=" + job.info_type)
                set_job_state(job.id, 'Waiting')
                # A reservation that starts right now asks for another pass,
                # unless a more important exit code has already been set.
                if ((job.start_time - 1) <= current_time_sec) and (exit_code == 0):
                    exit_code = 1

    # Process toLaunch jobs
    if 'toLaunch' in jobs_by_state:
        for job in jobs_by_state['toLaunch']:
            notify_to_run_job(job.id)

    logger.debug("End of Meta Scheduler")

    return exit_code
Example #2
0
def cli(
    command,
    interactive,
    queue,
    resource,
    reservation,
    connect,
    type,
    checkpoint,
    property,
    resubmit,
    scanscript,
    project,
    signal,
    directory,
    name,
    after,
    notify,
    array,
    array_param_file,
    use_job_key,
    import_job_key_from_file,
    import_job_key_inline,
    export_job_key_to_file,
    stdout,
    stderr,
    hold,
    version,
):
    """Submit a job to OAR batch scheduler."""
    # NOTE(review): the docstring above is deliberately kept as a one-liner;
    # the parameter list looks like a command-line entry point whose docstring
    # may be displayed as help text -- confirm before expanding it.
    #
    # Flow of the function:
    #   1. read the configuration and the environment,
    #   2. handle a resubmission request (always ends with sub_exit),
    #   3. validate the combination of options,
    #   4. register the job(s) with add_micheline_jobs, as PASSIVE (a command
    #      without -I) or INTERACTIVE,
    #   5. print the job id(s) and notify Almighty,
    #   6. for a reservation or an interactive job, wait for the answer sent
    #      back on the listening TCP socket.
    #
    # Statuses handed to sub_exit():
    #   0  success                      1  OARDIR is not defined
    #   3  Almighty unreachable after a resubmission
    #   4  resubmission refused         5  nothing to submit
    #   6  array with 0 sub-job         7  bad/forbidden reservation request
    #   8  interactive array job        9  interactive array with a param file
    #   10 reservation rejected         11 interactive job not started
    #   16 insecure notify method       17 job type incompatible with options
    #   or the error code returned by add_micheline_jobs.
    #
    # `scanscript` and `array_param_file` are only placeholders (TODO), and
    # `import_job_key_from_file`, `export_job_key_to_file` and `version` are
    # not read in this function.

    config.setdefault_config(DEFAULT_CONFIG)

    # When the walltime of a job is not defined
    default_job_walltime = str(config["DEFAULT_JOB_WALLTIME"])

    log_warning = ""  # TODO
    log_error = ""
    log_info = ""
    log_std = ""

    remote_host = config["SERVER_HOSTNAME"]
    remote_port = int(config["SERVER_PORT"])

    if "OARSUB_DEFAULT_RESOURCES" in config:
        default_resources = config["OARSUB_DEFAULT_RESOURCES"]
    else:
        default_resources = "/resource_id=1"

    if "OARSUB_NODES_RESOURCES" in config:
        nodes_resources = config["OARSUB_NODES_RESOURCES"]
    else:
        nodes_resources = "resource_id"

    # TODO Deploy_hostname / Cosystem_hostname
    # (DEPLOY_HOSTNAME and COSYSTEM_HOSTNAME, both defaulting to remote_host)

    cpuset_field = config["JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD"]
    cpuset_path = config["CPUSET_PATH"]

    if "OAR_RUNTIME_DIRECTORY" in config:
        # TODO set and create the default oarexec directory
        pass

    binpath = ""
    if "OARDIR" in os.environ:
        binpath = os.environ["OARDIR"] + "/"
    else:
        print_error("OARDIR environment variable is not defined.")
        sub_exit(1)

    openssh_cmd = config["OPENSSH_CMD"]
    ssh_timeout = int(config["OAR_SSH_CONNECTION_TIMEOUT"])

    # OAR version
    # TODO: OAR is now a set of composition...

    types = type

    properties = lstrip_none(property)
    if not directory:
        launching_directory = ""
    else:
        launching_directory = lstrip_none(directory)

    initial_request = " ".join(sys.argv[1:])
    queue_name = lstrip_none(queue)
    reservation_date = lstrip_none(reservation)

    if reservation_date:
        m = re.search(r"^\s*(\d{4}\-\d{1,2}\-\d{1,2})\s+(\d{1,2}:\d{1,2}:\d{1,2})\s*$", reservation)
        if m:
            reservation_date = sql_to_local(m.group(1) + " " + m.group(2))
        else:
            # Adjacent literals: a backslash continuation inside the string
            # used to embed the indentation in the message.
            print_error(
                "syntax error for the advance reservation start date "
                'specification. Expected format is:"YYYY-MM-DD hh:mm:ss"'
            )
            sub_exit(7)

    if array:
        array_nb = array
    else:
        array_nb = 1

    # The configuration can force the use of a job key, but it must not
    # discard a job key explicitly requested by the caller.
    if ("OARSUB_FORCE_JOB_KEY" in config) and config["OARSUB_FORCE_JOB_KEY"] == "yes":
        use_job_key = True

    # TODO ssh_private_key, ssh_public_key,

    # TODO import_job_key_file, export_job_key_file
    import_job_key_file = ""
    export_job_key_file = ""

    if resubmit:
        print("# Resubmitting job ", resubmit, "...")
        ret = resubmit_job(resubmit)
        if ret > 0:
            job_id = ret
            print(" done.\n")
            print("OAR_JOB_ID=" + str(job_id))
            if signal_almighty(remote_host, remote_port, "Qsub") > 0:
                print_error(
                    "cannot connect to executor "
                    + str(remote_host)
                    + ":"
                    + str(remote_port)
                    + ". OAR server might be down."
                )
                sub_exit(3)
            else:
                sub_exit(0)
        else:
            print(" error.")
            if ret == -1:
                print_error("interactive jobs and advance reservations cannot be resubmitted.")
            elif ret == -2:
                print_error("only jobs in the Error or Terminated state can be resubmitted.")
            elif ret == -3:
                print_error("resubmitted job user mismatch.")
            elif ret == -4:
                print_error("another active job is using the same job key.")
            else:
                print_error("unknown error.")
            sub_exit(4)

    if not command and not interactive and not reservation and not connect:
        usage()
        sub_exit(5)

    if interactive and reservation:
        print_error("an advance reservation cannot be interactive.")
        usage()
        sub_exit(7)

    if interactive and any(re.match(r"^desktop_computing$", t) for t in type):
        print_error(" a desktop computing job cannot be interactive")
        usage()
        sub_exit(17)

    if any(re.match(r"^noop$", t) for t in type):
        if interactive:
            print_error("a NOOP job cannot be interactive.")
            sub_exit(17)
        elif connect:
            print_error("a NOOP job does not have a shell to connect to.")
            sub_exit(17)

    # notify : check insecure character
    # (the pattern must be matched against `notify`: calling re.match with the
    # pattern alone raises TypeError)
    if notify and re.match(r"^.*exec\s*:.+$", notify):
        m = re.search(r".*exec\s*:([a-zA-Z0-9_.\/ -]+)$", notify)
        if not m:
            print_error(
                "insecure characters found in the notification method "
                r"(the allowed regexp is: [a-zA-Z0-9_.\/ -]+)."
            )
            sub_exit(16)

    # TODO   Connect to a reservation (connect_job)

    if not project:
        project = DEFAULT_VALUE["project"]
    if not signal:
        signal = DEFAULT_VALUE["signal"]
    if not directory:
        directory = DEFAULT_VALUE["directory"]

    resource_request = parse_resource_descriptions(resource, default_resources, nodes_resources)

    job_vars = {
        "job_type": None,
        "resource_request": resource_request,
        "command": command,
        "info_type": None,
        "queue_name": queue_name,
        "properties": properties,
        "checkpoint": checkpoint,
        "signal": signal,
        "notify": notify,
        "name": name,
        "types": types,
        "launching_directory": launching_directory,
        "dependencies": after,
        "stdout": stdout,
        "stderr": stderr,
        "hold": hold,
        "project": project,
        "initial_request": initial_request,
        "user": os.environ["OARDO_USER"],
        "array_id": 0,
        "start_time": "0",
        "reservation_field": None,
    }

    # Needed by both branches below (it used to be defined in the passive
    # branch only, although the interactive one passes it along as well).
    array_params = []

    if not interactive and command:

        cmd_executor = "Qsub"

        if scanscript:
            # TODO scanscript
            pass

        if array_param_file:
            # TODO read the parameter file and derive array_nb from it
            pass

        if array_nb == 0:
            print_error("an array of job must have a number of sub-jobs greater than 0.")
            usage()
            sub_exit(6)

        if reservation:
            # The approval of a reservation is awaited on the listening socket
            # further down, so the socket has to exist, and its address has to
            # be recorded with the job, for a reservation with a command too.
            socket_server = init_tcp_server()
            (server, server_port) = socket_server.getsockname()
            job_vars["info_type"] = server + ":" + str(server_port)
        else:
            job_vars["info_type"] = "$Host:$server_port"  # TODO  "$Host:$server_port"
        job_vars["job_type"] = "PASSIVE"
        (err, job_id_lst) = add_micheline_jobs(
            job_vars,
            reservation_date,
            use_job_key,
            import_job_key_inline,
            import_job_key_file,
            export_job_key_file,
            initial_request,
            array_nb,
            array_params,
        )
    else:
        # TODO interactive
        if command:
            print_warning("asking for an interactive job (-I), so ignoring arguments: " + command + " .")

        cmd_executor = "Qsub -I"

        if array_param_file:
            print_error("a array job with parameters given in a file cannot be interactive.")
            usage()
            sub_exit(9)

        if array_nb != 1:
            print_error("an array job cannot be interactive.")
            usage()
            sub_exit(8)

        if reservation:
            # Test if this job is a reservation and the syntax is right
            # TODO Pass
            pass
        socket_server = init_tcp_server()
        (server, server_port) = socket_server.getsockname()
        job_vars["info_type"] = server + ":" + str(server_port)
        job_vars["job_type"] = "INTERACTIVE"
        (err, job_id_lst) = add_micheline_jobs(
            job_vars,
            reservation_date,
            use_job_key,
            import_job_key_inline,
            import_job_key_file,
            export_job_key_file,
            initial_request,
            array_nb,
            array_params,
        )

    if err != 0:
        print_error("command failed, please verify your syntax.")
        sub_exit(err, "")

    oar_array_id = 0

    # Print job_id list (same "OAR_JOB_ID=<id>" form as the resubmission path,
    # without the space that print() inserts between two arguments)
    if len(job_id_lst) == 1:
        print("OAR_JOB_ID=" + str(job_id_lst[0]))
    else:
        job = db["Job"].query.filter(Job.id == job_id_lst[0]).one()
        oar_array_id = job.array_id
        for job_id in job_id_lst:
            print("OAR_JOB_ID=" + str(job_id))

    result = (job_id_lst, oar_array_id)

    # Notify Almigthy
    tools.create_almighty_socket()
    tools.notify_almighty(cmd_executor)

    if reservation:
        # Reservation mode
        print_info("advance reservation request: waiting for approval from the scheduler...")
        (conn, address) = socket_server.accept()
        # recv() returns bytes: decode before comparing with text, then drop
        # the last character of the message.
        answer = conn.recv(1024).decode("utf-8", "replace")[:-1]
        if answer == "GOOD RESERVATION":
            print_info("advance reservation is GRANTED.")
        else:
            print_info("advance reservation is REJECTED ", answer)
            sub_exit(10)
    elif interactive:
        # Interactive mode
        print_info("interactive mode: waiting...")

        prev_str = ""
        while True:
            (conn, address) = socket_server.accept()
            # recv() returns bytes: decode so that the comparisons and the
            # regular expressions below work on text.
            answer = conn.recv(1024).decode("utf-8", "replace")[:-1]

            # Messages of the form "[date] text" are only printed when the
            # text differs from the previous one.
            m = re.search(r"\](.*)$", answer)
            if m and m.group(1) != prev_str:
                print_info(answer)
                prev_str = m.group(1)
            elif answer != "GOOD JOB":
                print_info(answer)

            if (
                (answer == "GOOD JOB")
                or (answer == "BAD JOB")
                or (answer == "JOB KILLE")
                or re.match(r"^ERROR", answer)
            ):
                break

        if answer == "GOOD JOB":
            # TODO exit(connect_job($Job_id_list_ref->[0],1,$Openssh_cmd));
            pass
        else:
            sub_exit(11)

    sub_exit(0, result)