Example #1
    def __init__(self,
                 param_file,
                 out_file,
                 results_file,
                 opt_value,
                 tmpdir,
                 name='shadho',
                 port=9123,
                 shutdown=True,
                 logfile='shadho_wq.log',
                 debugfile='shadho_wq.debug'):
        WORKQUEUE.cctools_debug_flags_set("all")
        WORKQUEUE.cctools_debug_config_file(debugfile)
        WORKQUEUE.cctools_debug_config_file_size(0)

        if os.environ['USER'] not in name:
            name += '-{}'.format(os.environ['USER'])

        super(WQManager, self).__init__(name=name,
                                        port=int(port),
                                        shutdown=shutdown)

        self.specify_log(logfile)

        self.param_file = param_file
        self.out_file = out_file
        self.results_file = results_file
        self.opt_value = opt_value
        self.tmpdir = tmpdir
        self.tasks_submitted = self.stats.tasks_submitted
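A minimal usage sketch for a manager like this, assuming the WQManager class above derives from work_queue.WorkQueue as the call to super().__init__ suggests; the argument values, command, and tag below are hypothetical.

import work_queue

# Hypothetical instantiation of the WQManager class from Example #1.
manager = WQManager(param_file='params.json',
                    out_file='out.json',
                    results_file='results.json',
                    opt_value='loss',
                    tmpdir='/tmp/shadho')

task = work_queue.Task('bash run.sh')   # illustrative command
task.specify_tag('trial_0')
manager.submit(task)

# Drain the queue with the same wait() pattern used in the other examples.
while not manager.empty():
    done = manager.wait(10)
    if done is not None:
        print(done.tag, done.return_status)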
Example #2
def main():
    args = parse_args()

    # Create a temporary file store for task results.
    tmpdir = tempfile.mkdtemp()

    # Load the task input data here
    task_inputs = load_inputs()

    print('Creating tasks')
    tasks = generate_tasks(args.command, task_inputs,
                           args.infiles, args.outfile,
                           tmpdir, args.max_retries)

    # Create the Work Queue master that manages task distribution.
    work_queue.cctools_debug_flags_set("all")
    work_queue.cctools_debug_config_file(f'{args.name}.debug')
    work_queue.cctools_debug_config_file_size(0)
    wq = WorkQueue(port=args.port, name=args.name, shutdown=True)
    wq.specify_log(f'{args.name}.log')

    # Submit all tasks to the queue.
    print('Submitting tasks')
    for t in tasks.values():
        wq.submit(t)

    # The main loop waits for a task to finish, then handles success or
    # failure accordingly.
    print('Entering main loop')
    while not all([done_check(t) for t in tasks.values()]):
        t = wq.wait(10)  # This blocks for 10s or until a task is done.

        if t is not None:
            tasks[t.tag] = t  # Update the task map with the correct status

            # On success, post-process the task. If the maximum number of
            # submissions for a task has been reached, make a note. Otherwise,
            # report the failure and resubmit.
            if t.return_status == 0 and t.result == WORK_QUEUE_RESULT_SUCCESS:
                print(f'Task {t.tag} completed successfully.')
                input_idx = int(t.tag.split('_')[1])
                handle_success(t, tmpdir, args.outfile)
            elif t.result == WORK_QUEUE_RESULT_MAX_RETRIES:
                print(f'Task {t.tag} resubmitted too many times.')
            else:
                print(f'Task {t.tag} failed with result {t.result}')
                print(t.output)
                wq.submit(t)

    print('All tasks completed or hit max retries.')
    print('Cleaning up...')
    shutil.rmtree(tmpdir)

    print('Done')
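Example #2 calls a handle_success helper that is not shown; below is a minimal sketch of what it could do, under the assumption that each task writes its result into tmpdir as a file named after its tag (that layout, and appending to outfile, are assumptions).

import os

def handle_success(t, tmpdir, outfile):
    # Hypothetical post-processing: append the task's result file to outfile.
    result_path = os.path.join(tmpdir, '{}.out'.format(t.tag))  # assumed layout
    with open(result_path) as result, open(outfile, 'a') as out:
        out.write(result.read())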
Example #3
    def __init__(self, param_file, out_file, results_file, opt_value, tmpdir,
                 name='shadho', port=9123, exclusive=True, shutdown=True,
                 logfile='shadho_wq.log', debugfile='shadho_wq.debug'):
        work_queue.cctools_debug_flags_set("all")
        work_queue.cctools_debug_config_file(debugfile)
        work_queue.cctools_debug_config_file_size(0)

        super(WQManager, self).__init__(name=name,
                                        port=port,
                                        exclusive=exclusive,
                                        shutdown=shutdown,
                                        catalog=False)

        self.specify_log(logfile)

        self.param_file = param_file
        self.out_file = out_file
        self.results_file = results_file
        self.opt_value = opt_value
        self.tmpdir = tmpdir
        self.tasks_submitted = self.stats.tasks_submitted
Example #4
def WorkQueueSubmitThread(task_queue=multiprocessing.Queue(),
                          queue_lock=threading.Lock(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          see_worker_output=False,
                          data_dir=".",
                          full=False,
                          cancel_value=multiprocessing.Value('i', 1),
                          port=WORK_QUEUE_DEFAULT_PORT,
                          wq_log_dir=None,
                          project_password=None,
                          project_password_file=None,
                          project_name=None):
    """Thread to handle Parsl app submissions to the Work Queue objects.
    Takes in Parsl functions submitted using submit(), and creates a
    Work Queue task with the appropriate specifications, which is then
    submitted to Work Queue. After tasks are completed, processes the
    exit status and exit code of the task, and sends results to the
    Work Queue collector thread.
    """
    logger.debug("Starting WorkQueue Submit/Wait Process")

    # Enable debugging flags and create logging file
    if wq_log_dir is not None:
        logger.debug("Setting debugging flags and creating logging file")
        wq_debug_log = os.path.join(wq_log_dir, "debug_log")
        cctools_debug_flags_set("all")
        cctools_debug_config_file(wq_debug_log)

    # Create WorkQueue queue object
    logger.debug("Creating WorkQueue Object")
    try:
        logger.debug("Listening on port {}".format(port))
        q = WorkQueue(port)
    except Exception as e:
        logger.error("Unable to create WorkQueue object: {}".format(e))
        raise e

    # Specify WorkQueue queue attributes
    if project_name:
        q.specify_name(project_name)
    if project_password:
        q.specify_password(project_password)
    elif project_password_file:
        q.specify_password_file(project_password_file)

    # Only write logs when wq_log_dir is specified (it usually is)
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)
        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    wq_tasks = set()
    orig_ppid = os.getppid()
    continue_running = True
    while (continue_running):
        # Monitor the task queue
        ppid = os.getppid()
        if ppid != orig_ppid:
            logger.debug("new Process")
            continue_running = False
            continue

        # Submit tasks
        while task_queue.qsize() > 0:
            if cancel_value.value == 0:
                logger.debug("cancel value set to cancel")
                continue_running = False
                break

            # Obtain task from task_queue
            try:
                item = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue
            parsl_id = item["task_id"]

            # Extract information about the task
            function_data_loc = item["data_loc"]
            function_data_loc_remote = function_data_loc.split("/")[-1]
            function_result_loc = item["result_loc"]
            function_result_loc_remote = function_result_loc.split("/")[-1]
            input_files = item["input_files"]
            output_files = item["output_files"]
            std_files = item["std_files"]

            full_script_name = workqueue_worker.__file__
            script_name = full_script_name.split("/")[-1]

            remapping_string = ""
            std_string = ""

            # Parse input file information
            logger.debug("Looking at input")
            for item in input_files:
                if item[3] == "std":
                    std_string += "mv " + item[1] + " " + item[0] + "; "
                else:
                    remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            # Parse output file information
            logger.debug("Looking at output")
            for item in output_files:
                remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            if len(input_files) + len(output_files) > 0:
                remapping_string = "-r " + remapping_string
                remapping_string = remapping_string[:-1]

            # Create command string
            logger.debug(launch_cmd)
            command_str = launch_cmd.format(
                input_file=function_data_loc_remote,
                output_file=function_result_loc_remote,
                remapping_string=remapping_string)
            command_str = std_string + command_str
            logger.debug(command_str)

            # Create WorkQueue task for the command
            logger.debug("Sending task {} with command: {}".format(
                parsl_id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                continue

            # Specify environment variables for the task
            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            # Specify script, and data/result files for task
            t.specify_file(full_script_name,
                           script_name,
                           WORK_QUEUE_INPUT,
                           cache=True)
            t.specify_file(function_data_loc,
                           function_data_loc_remote,
                           WORK_QUEUE_INPUT,
                           cache=False)
            t.specify_file(function_result_loc,
                           function_result_loc_remote,
                           WORK_QUEUE_OUTPUT,
                           cache=False)
            t.specify_tag(str(parsl_id))
            logger.debug("Parsl ID: {}".format(t.id))

            # Specify all input/output files for task
            for item in input_files:
                t.specify_file(item[0],
                               item[1],
                               WORK_QUEUE_INPUT,
                               cache=item[2])
            for item in output_files:
                t.specify_file(item[0],
                               item[1],
                               WORK_QUEUE_OUTPUT,
                               cache=item[2])
            for item in std_files:
                t.specify_file(item[0],
                               item[1],
                               WORK_QUEUE_OUTPUT,
                               cache=item[2])

            # Submit the task to the WorkQueue object
            logger.debug("Submitting task {} to WorkQueue".format(parsl_id))
            try:
                wq_id = q.submit(t)
                wq_tasks.add(wq_id)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))

                msg = {
                    "tid": parsl_id,
                    "result_received": False,
                    "reason": "Workqueue Task Start Failure",
                    "status": 1
                }

                collector_queue.put_nowait(msg)
                continue

            logger.debug("Task {} submitted to WorkQueue with id {}".format(
                parsl_id, wq_id))

        if cancel_value.value == 0:
            continue_running = False

        # If the queue is not empty wait on the WorkQueue queue for a task
        task_found = True
        if not q.empty() and continue_running:
            while task_found is True:
                if cancel_value.value == 0:
                    continue_running = False
                    task_found = False
                    continue

                # Obtain the task from the queue
                t = q.wait(1)
                if t is None:
                    task_found = False
                    continue
                else:
                    parsl_tid = t.tag
                    logger.debug(
                        "Completed WorkQueue task {}, parsl task {}".format(
                            t.id, parsl_tid))
                    status = t.return_status
                    task_result = t.result
                    msg = None

                    # Task failure
                    if status != 0 or (task_result != WORK_QUEUE_RESULT_SUCCESS
                                       and task_result !=
                                       WORK_QUEUE_RESULT_OUTPUT_MISSING):
                        logger.debug(
                            "Wrapper Script status: {}\nWorkQueue Status: {}".
                            format(status, task_result))
                        # Wrapper script failure
                        if status != 0:
                            logger.debug(
                                "WorkQueue task {} failed with status {}".
                                format(t.id, status))
                            reason = "Wrapper Script Failure: "
                            if status == 1:
                                reason += "problem parsing command line options"
                            elif status == 2:
                                reason += "problem loading function data"
                            elif status == 3:
                                reason += "problem remapping file names"
                            elif status == 4:
                                reason += "problem writing out function result"
                            else:
                                reason += "unable to process wrapper script failure with status = {}".format(
                                    status)
                            reason += "\nTrace:\n" + str(t.output)
                            logger.debug(
                                "WorkQueue runner script failed for task {} because {}\n"
                                .format(parsl_tid, reason))
                        # WorkQueue system failure
                        else:
                            reason = "WorkQueue System Failure: "
                            if task_result == 1:
                                reason += "missing input file"
                            elif task_result == 2:
                                reason += "unable to generate output file"
                            elif task_result == 4:
                                reason += "stdout has been truncated"
                            elif task_result == 1 << 3:
                                reason += "task terminated with a signal"
                            elif task_result == 2 << 3:
                                reason += "task used more resources than requested"
                            elif task_result == 3 << 3:
                                reason += "task ran past the specified end time"
                            elif task_result == 4 << 3:
                                reason += "result could not be classified"
                            elif task_result == 5 << 3:
                                reason += "task failed, but not a task error"
                            elif task_result == 6 << 3:
                                reason += "unable to complete after specified number of retries"
                            elif task_result == 7 << 3:
                                reason += "task ran for more than the specified time"
                            elif task_result == 8 << 3:
                                reason += "task needed more space to complete task"
                            else:
                                reason += "unable to process Work Queue system failure"

                        msg = {
                            "tid": parsl_tid,
                            "result_received": False,
                            "reason": reason,
                            "status": status
                        }

                        collector_queue.put_nowait(msg)

                    # Task Success
                    else:
                        # Print the output from the task
                        if see_worker_output:
                            print(t.output)

                        # Load result into result file
                        result_loc = os.path.join(
                            data_dir,
                            "task_" + str(parsl_tid) + "_function_result")
                        logger.debug(
                            "Looking for result in {}".format(result_loc))
                        f = open(result_loc, "rb")
                        result = pickle.load(f)
                        f.close()

                        msg = {
                            "tid": parsl_tid,
                            "result_received": True,
                            "result": result
                        }
                        wq_tasks.remove(t.id)

                    collector_queue.put_nowait(msg)

        if continue_running is False:
            logger.debug("Exiting WorkQueue Master Thread event loop")
            break

    # Remove all WorkQueue tasks that remain in the queue object
    for wq_task in wq_tasks:
        logger.debug("Cancelling WorkQueue Task {}".format(wq_task))
        q.cancel_by_taskid(wq_task)

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
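The shapes of launch_cmd and of the items read off task_queue are implied but not shown in Example #4; a hedged sketch follows. Only the placeholder names and dictionary keys come from the code above; the command template, script name, paths, and values are assumptions.

# Hypothetical launch command: it may contain only the placeholders that the
# submit loop fills in via launch_cmd.format(...).
launch_cmd = ("python workqueue_worker.py {remapping_string} "
              "{input_file} {output_file}")

# Hypothetical task_queue item, using the keys the submit loop reads.
item = {
    "task_id": 0,
    "data_loc": "runinfo/task_0_function_data",      # assumed path
    "result_loc": "runinfo/task_0_function_result",  # assumed path
    "input_files": [],    # each entry is (local, remote, cache, kind)
    "output_files": [],   # each entry is (local, remote, cache)
    "std_files": [],      # each entry is (local, remote, cache)
}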
Example #5
def WorkQueueSubmitThread(task_queue=multiprocessing.Queue(),
                          queue_lock=threading.Lock(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          see_worker_output=False,
                          data_dir=".",
                          full=False,
                          cancel_value=multiprocessing.Value('i', 1),
                          port=WORK_QUEUE_DEFAULT_PORT,
                          wq_log_dir=None,
                          project_password=None,
                          project_password_file=None,
                          project_name=None):

    logger.debug("Starting WorkQueue Submit/Wait Process")

    orig_ppid = os.getppid()

    wq_tasks = set()

    continue_running = True

    if wq_log_dir is not None:
        wq_debug_log = os.path.join(wq_log_dir, "debug")
        cctools_debug_flags_set("all")
        cctools_debug_config_file(wq_debug_log)

    logger.debug("Creating Workqueue Object")
    try:
        q = WorkQueue(port)
    except Exception as e:
        logger.error("Unable to create Workqueue object: {}", format(e))
        raise e

    if project_name:
        q.specify_name(project_name)

    if project_password:
        q.specify_password(project_password)
    elif project_password_file:
        q.specify_password_file(project_password_file)

    # Only write logs when wq_log_dir is specified (it usually is)
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)

        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    while (continue_running):
        # Monitor the Task Queue
        ppid = os.getppid()
        if ppid != orig_ppid:
            continue_running = False
            continue

        # Submit Tasks
        while task_queue.qsize() > 0:
            if cancel_value.value == 0:
                continue_running = False
                break

            try:
                # item = task_queue.get_nowait()
                item = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue
            parsl_id = item["task_id"]

            function_data_loc = item["data_loc"]
            function_result_loc = item["result_loc"]
            function_result_loc_remote = function_result_loc.split("/")[-1]
            function_data_loc_remote = function_data_loc.split("/")[-1]

            input_files = item["input_files"]
            output_files = item["output_files"]
            std_files = item["std_files"]

            full_script_name = workqueue_worker.__file__
            script_name = full_script_name.split("/")[-1]

            remapping_string = ""

            std_string = ""
            logger.debug("looking at input")
            for item in input_files:
                if item[3] == "std":
                    std_string += "mv " + item[1] + " " + item[0] + "; "
                else:
                    remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            logger.debug("looking at output")
            for item in output_files:
                remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            if len(input_files) + len(output_files) > 0:
                remapping_string = "-r " + remapping_string
                remapping_string = remapping_string[:-1]

            logger.debug(launch_cmd)
            command_str = launch_cmd.format(
                input_file=function_data_loc_remote,
                output_file=function_result_loc_remote,
                remapping_string=remapping_string)

            logger.debug(command_str)
            command_str = std_string + command_str
            logger.debug(command_str)

            logger.debug("Sending task {} with command: {}".format(
                parsl_id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                continue
            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            t.specify_file(full_script_name,
                           script_name,
                           WORK_QUEUE_INPUT,
                           cache=True)
            t.specify_file(function_result_loc,
                           function_result_loc_remote,
                           WORK_QUEUE_OUTPUT,
                           cache=False)
            t.specify_file(function_data_loc,
                           function_data_loc_remote,
                           WORK_QUEUE_INPUT,
                           cache=False)
            t.specify_tag(str(parsl_id))

            for item in input_files:
                t.specify_file(item[0],
                               item[1],
                               WORK_QUEUE_INPUT,
                               cache=item[2])

            for item in output_files:
                t.specify_file(item[0],
                               item[1],
                               WORK_QUEUE_OUTPUT,
                               cache=item[2])

            for item in std_files:
                t.specify_file(item[0],
                               item[1],
                               WORK_QUEUE_OUTPUT,
                               cache=item[2])

            logger.debug("Submitting task {} to workqueue".format(parsl_id))
            try:
                wq_id = q.submit(t)
                wq_tasks.add(wq_id)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))

                msg = {
                    "tid": parsl_id,
                    "result_received": False,
                    "reason": "Workqueue Task Start Failure",
                    "status": 1
                }

                collector_queue.put_nowait(msg)
                continue

            logger.debug("Task {} submitted workqueue with id {}".format(
                parsl_id, wq_id))

        if cancel_value.value == 0:
            continue_running = False

        # Wait for Tasks
        task_found = True
        # If the queue is not empty wait on the workqueue queue for a task
        if not q.empty() and continue_running:
            while task_found is True:
                if cancel_value.value == 0:
                    continue_running = False
                    task_found = False
                    continue
                t = q.wait(1)
                if t is None:
                    task_found = False
                    continue
                else:
                    parsl_tid = t.tag
                    logger.debug(
                        "Completed workqueue task {}, parsl task {}".format(
                            t.id, parsl_tid))
                    status = t.return_status
                    task_result = t.result
                    msg = None

                    if status != 0 or (task_result != WORK_QUEUE_RESULT_SUCCESS
                                       and task_result !=
                                       WORK_QUEUE_RESULT_OUTPUT_MISSING):
                        if task_result == WORK_QUEUE_RESULT_SUCCESS:
                            logger.debug(
                                "Workqueue task {} failed with status {}".
                                format(t.id, status))

                            reason = "Wrapper Script Failure: "
                            if status == 1:
                                reason += "command line parsing"
                            if status == 2:
                                reason += "problem loading function data"
                            if status == 3:
                                reason += "problem remapping file names"
                            if status == 4:
                                reason += "problem writing out function result"

                            reason += "\nTrace:\n" + t.output

                            logger.debug(
                                "Workqueue runner script failed for task {} because {}\n"
                                .format(parsl_tid, reason))

                        else:
                            reason = "Workqueue system failure\n"

                        msg = {
                            "tid": parsl_tid,
                            "result_received": False,
                            "reason": reason,
                            "status": status
                        }

                        collector_queue.put_nowait(msg)

                    else:

                        if see_worker_output:
                            print(t.output)

                        result_loc = os.path.join(
                            data_dir,
                            "task_" + str(parsl_tid) + "_function_result")
                        logger.debug(
                            "Looking for result in {}".format(result_loc))
                        f = open(result_loc, "rb")
                        result = pickle.load(f)
                        f.close()

                        msg = {
                            "tid": parsl_tid,
                            "result_received": True,
                            "result": result
                        }
                        wq_tasks.remove(t.id)

                    collector_queue.put_nowait(msg)

        if continue_running is False:
            logger.debug("Exiting WorkQueue Master Thread event loop")
            break

    for wq_task in wq_tasks:
        logger.debug("Cancelling Workqueue Task {}".format(wq_task))
        q.cancel_by_taskid(wq_task)

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
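A hedged sketch of how a function with this signature might be driven from a parent component; the process-based wiring and the launch command are assumptions, while the queue, cancel-value, and log-directory arguments mirror the signature above.

import multiprocessing

if __name__ == '__main__':
    task_queue = multiprocessing.Queue()
    collector_queue = multiprocessing.Queue()
    cancel_value = multiprocessing.Value('i', 1)   # set to 0 to request shutdown

    submit_proc = multiprocessing.Process(
        target=WorkQueueSubmitThread,   # the function defined above
        kwargs={"task_queue": task_queue,
                "collector_queue": collector_queue,
                "cancel_value": cancel_value,
                # hypothetical command template; see the sketch after Example #4
                "launch_cmd": "python workqueue_worker.py {remapping_string} "
                              "{input_file} {output_file}",
                "wq_log_dir": "wq_logs"})
    submit_proc.start()

    # ... put task items on task_queue, read result messages off collector_queue ...

    cancel_value.value = 0   # ask the submit loop to stop
    submit_proc.join()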
Example #6
    def sprint(self):
        with util.PartiallyMutable.unlock():
            self.source = TaskProvider(self.config)
        action = actions.Actions(self.config, self.source)

        logger.info("using wq from {0}".format(wq.__file__))
        logger.info("running Lobster version {0}".format(util.get_version()))
        logger.info("current PID is {0}".format(os.getpid()))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(
            os.path.join(self.config.workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        self.queue = wq.WorkQueue(self.config.advanced.wq_port)
        self.queue.specify_min_taskid(self.source.max_taskid() + 1)
        self.queue.specify_log(
            os.path.join(self.config.workdir, "work_queue.log"))
        self.queue.specify_transactions_log(
            os.path.join(self.config.workdir, "transactions.log"))
        self.queue.specify_name("lobster_" + self.config.label)
        self.queue.specify_keepalive_timeout(300)
        # self.queue.tune("short-timeout", 600)
        self.queue.tune("transfer-outlier-factor", 4)
        self.queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)
        if self.config.advanced.full_monitoring:
            self.queue.enable_monitoring_full(None)
        else:
            self.queue.enable_monitoring(None)

        logger.info("starting queue as {0}".format(self.queue.name))

        abort_active = False
        abort_threshold = self.config.advanced.abort_threshold
        abort_multiplier = self.config.advanced.abort_multiplier

        wq_max_retries = self.config.advanced.wq_max_retries

        if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(self.config.workdir, 'KILLED', 'RESTART')

        # time in seconds to wait for WQ to return tasks, with minimum wait
        # time in case no more tasks are waiting
        interval = 120
        interval_minimum = 30

        tasks_left = 0
        units_left = 0
        successful_tasks = 0

        categories = []

        self.setup_logging('all')
        # Workflows can be assigned categories, with each category having
        # different cpu/memory/walltime requirements that WQ will automatically
        # fine-tune
        for category in self.config.categories:
            constraints = category.wq()
            if category.name != 'merge':
                categories.append(category.name)
                self.setup_logging(category.name)
            self.queue.specify_category_mode(category.name, category.mode)
            if category.mode == wq.WORK_QUEUE_ALLOCATION_MODE_FIXED:
                self.queue.specify_category_max_resources(
                    category.name, constraints)
            else:
                self.queue.specify_category_first_allocation_guess(
                    category.name, constraints)
            logger.debug('Category {0}: {1}'.format(category.name,
                                                    constraints))
            if 'wall_time' not in constraints:
                self.queue.activate_fast_abort_category(
                    category.name, abort_multiplier)

        proxy_email_sent = False
        while not self.source.done():
            with self.measure('status'):
                tasks_left = self.source.tasks_left()
                units_left = self.source.work_left()

                logger.debug("expecting {0} tasks, still".format(tasks_left))
                self.queue.specify_num_tasks_left(tasks_left)

                for c in categories + ['all']:
                    self.log(c, units_left)

                if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
                    util.register_checkpoint(self.config.workdir, 'KILLED',
                                             str(datetime.datetime.utcnow()))

                    # let the task source shut down gracefully
                    logger.info("terminating task source")
                    self.source.terminate()
                    logger.info("terminating gracefully")
                    break

            with self.measure('create'):
                have = {}
                for c in categories:
                    cstats = self.queue.stats_category(c)
                    have[c] = {
                        'running': cstats.tasks_running,
                        'queued': cstats.tasks_waiting
                    }

                stats = self.queue.stats_hierarchy
                tasks = self.source.obtain(stats.total_cores, have)

                expiry = None
                if self.config.advanced.proxy:
                    expiry = self.config.advanced.proxy.expires()
                    proxy_time_left = self.config.advanced.proxy.time_left()
                    if proxy_time_left >= 24 * 3600:
                        proxy_email_sent = False
                    if proxy_time_left < 24 * 3600 and not proxy_email_sent:
                        util.sendemail(
                            "Your proxy is about to expire.\n" + "Timeleft: " +
                            str(datetime.timedelta(seconds=proxy_time_left)),
                            self.config)
                        proxy_email_sent = True

                for category, cmd, id, inputs, outputs, env, dir in tasks:
                    task = wq.Task(cmd)
                    task.specify_category(category)
                    task.specify_tag(id)
                    task.specify_max_retries(wq_max_retries)
                    task.specify_monitor_output(
                        os.path.join(dir, 'resource_monitor'))

                    for k, v in env.items():
                        task.specify_environment_variable(k, v)

                    for (local, remote, cache) in inputs:
                        cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                        if os.path.isfile(local) or os.path.isdir(local):
                            task.specify_input_file(str(local), str(remote),
                                                    cache_opt)
                        else:
                            logger.critical(
                                "cannot send file to worker: {0}".format(
                                    local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    if expiry:
                        task.specify_end_time(expiry * 10**6)
                    self.queue.submit(task)

            with self.measure('status'):
                stats = self.queue.stats_hierarchy
                logger.info(
                    "{0} out of {1} workers busy; {2} tasks running, {3} waiting; {4} units left"
                    .format(stats.workers_busy,
                            stats.workers_busy + stats.workers_ready,
                            stats.tasks_running, stats.tasks_waiting,
                            units_left))

            with self.measure('update'):
                self.source.update(self.queue)

            # recurring actions are triggered here; plotting etc should run
            # while we have WQ hand us back tasks w/o any database
            # interaction
            with self.measure('action'):
                if action:
                    action.take()

            with self.measure('fetch'):
                starttime = time.time()
                task = self.queue.wait(interval)
                tasks = []
                while task:
                    if task.return_status == 0:
                        successful_tasks += 1
                    elif task.return_status in self.config.advanced.bad_exit_codes:
                        logger.warning(
                            "blacklisting host {0} due to bad exit code from task {1}"
                            .format(task.hostname, task.tag))
                        self.queue.blacklist(task.hostname)
                    tasks.append(task)

                    remaining = int(starttime + interval - time.time())
                    if (interval - remaining < interval_minimum
                            or self.queue.stats.tasks_waiting > 0
                        ) and remaining > 0:
                        task = self.queue.wait(remaining)
                    else:
                        task = None
                # TODO do we really need this?  We have everything based on
                # categories by now, so this should not be needed.
                if abort_threshold > 0 and successful_tasks >= abort_threshold and not abort_active:
                    logger.info(
                        "activating fast abort with multiplier: {0}".format(
                            abort_multiplier))
                    abort_active = True
                    self.queue.activate_fast_abort(abort_multiplier)
            if len(tasks) > 0:
                try:
                    with self.measure('return'):
                        self.source.release(tasks)
                except Exception:
                    tb = traceback.format_exc()
                    logger.critical(
                        "cannot recover from the following exception:\n" + tb)
                    util.sendemail(
                        "Your Lobster project has crashed from the following exception:\n"
                        + tb, self.config)
                    for task in tasks:
                        logger.critical(
                            "tried to return task {0} from {1}".format(
                                task.tag, task.hostname))
                    raise
        if units_left == 0:
            logger.info("no more work left to do")
            util.sendemail("Your Lobster project is done!", self.config)
            if self.config.elk:
                self.config.elk.end()
            if action:
                action.take(True)
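The constraints dictionary that category.wq() returns in Example #6 is handed to specify_category_max_resources or specify_category_first_allocation_guess; a hedged sketch of such a dictionary follows. Apart from 'wall_time', which the code checks explicitly, the key names and units are assumptions, and queue stands in for the wq.WorkQueue instance.

# Hypothetical per-category resource constraints.
constraints = {
    'cores': 1,
    'memory': 2000,     # assumed to be MB
    'disk': 4000,       # assumed to be MB
    'wall_time': 3600,  # its presence above disables fast abort for the category
}

queue.specify_category_mode('processing', wq.WORK_QUEUE_ALLOCATION_MODE_FIXED)
queue.specify_category_max_resources('processing', constraints)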
Example #7
def run(args):
    dash_checker = cmssw.dash.JobStateChecker(300)
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)
        util.register_checkpoint(workdir, "version", get_distribution('Lobster').version)
    else:
        util.verify(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('advanced', {}).get('renew proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    print("could not renew proxy")
                    sys.exit(1)
            else:
                print("please renew your proxy")
                sys.exit(1)

    print "Saving log to {0}".format(os.path.join(workdir, 'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, 'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'lobster.err'))

    signals = daemon.daemon.make_default_signal_map()
    signals[signal.SIGTERM] = lambda num, frame: kill(args)

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=util.get_lock(workdir),
            signal_map=signals):

        fileh = logging.handlers.RotatingFileHandler(os.path.join(workdir, 'lobster.log'), maxBytes=500e6, backupCount=10)
        fileh.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
        fileh.setLevel(config.get('advanced', {}).get('log level', 2) * 10)

        logger.addHandler(fileh)
        logger.setLevel(config.get('advanced', {}).get('log level', 2) * 10)

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
            console.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
            logger.addHandler(console)

        config['configdir'] = args.configdir
        config['filename'] = args.configfile
        config['startdir'] = args.startdir
        if cmsjob:
            job_src = cmssw.JobProvider(config)
            actions = cmssw.Actions(config)
        else:
            job_src = job.SimpleJobProvider(config)
            actions = None

        logger.info("using wq from {0}".format(wq.__file__))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, "work_queue.log"))
        queue.specify_name("lobster_" + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)
        queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)

        logger.info("starting queue as {0}".format(queue.name))
        logger.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('advanced', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('advanced', {}).get('abort threshold', 400)
        abort_multiplier = config.get('advanced', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        jobits_left = 0
        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                    "#timestamp " +
                    "total_workers_connected total_workers_joined total_workers_removed " +
                    "workers_busy workers_idle " +
                    "tasks_running " +
                    "total_send_time total_receive_time " +
                    "total_create_time total_return_time " +
                    "idle_percentage " +
                    "capacity " +
                    "efficiency " +
                    "total_memory " +
                    "total_cores " +
                    "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        stats.total_memory,
                        stats.total_cores,
                        jobits_left
                    ]
                    )) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                # just in case, check for any remaining not done task that
                # hasn't been reported as aborted
                for task_id in queue._task_table.keys():
                    status = cmssw.dash.status_map[queue.task_state(task_id)]
                    if status not in (cmssw.dash.DONE, cmssw.dash.ABORTED):
                        job_src._JobProvider__dash.update_job(task_id, cmssw.dash.ABORTED)

                logger.info("terminating gracefully")
                break

            logger.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                    stats.workers_busy,
                    stats.workers_busy + stats.workers_ready,
                    jobits_left,
                    stats.tasks_running,
                    stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs is None or len(jobs) == 0:
                    break

                hunger -= len(jobs)
                cores = config.get('cores per job', 1)
                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(cores)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote, cache) in inputs:
                        if os.path.isfile(local):
                            cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                            task.specify_input_file(str(local), str(remote), cache_opt)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                    wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logger.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            # update dashboard status for all not done tasks
            # report Done status only once when releasing the task
            # WAITING_RETRIEVAL is not a valid status in dashboard
            # so, skipping it for now
            monitor = job_src._JobProvider__dash
            exclude_states = (cmssw.dash.DONE, cmssw.dash.WAITING_RETRIEVAL)
            try:
                dash_checker.update_dashboard_states(monitor, queue, exclude_states)
            except Exception as e:
                logger.warning("Could not update job states to dashboard: {0}".format(e))

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None
            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logger.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise
            if successful_jobs >= abort_threshold and not abort_active:
                logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

            # recurring actions are triggered here
            if actions:
                actions.take()
        if jobits_left == 0:
            logger.info("no more work left to do")
Example #8
def run(args):
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
        if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('check proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    logging.critical("could not renew proxy")
                    sys.exit(1)
            else:
                logging.critical("please renew your proxy")
                sys.exit(1)

    mode_label = 'merge_' if args.merge else ''
    print "Saving log to {0}".format(os.path.join(workdir, mode_label+'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, mode_label+'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, mode_label+'lobster.err'))

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=get_lock(workdir)):
        logging.basicConfig(
                datefmt="%Y-%m-%d %H:%M:%S",
                format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s",
                level=config.get('log level', 2) * 10,
                filename=os.path.join(workdir, mode_label+'lobster.log'))

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('log level', 2) * 10)
            console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s"))
            logging.getLogger('').addHandler(console)

        config['configdir'] = args.configdir
        config['filepath'] = args.configfile
        config['startdir'] = args.startdir
        if args.merge:
            if args.server:
                config['stageout server'] = args.server
            config['max megabytes'] = args.max_megabytes
            job_src = cmssw.MergeProvider(config)
        elif cmsjob:
            job_src = cmssw.JobProvider(config)
        else:
            job_src = job.SimpleJobProvider(config)

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, mode_label+"work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, mode_label+"work_queue.log"))
        queue.specify_name("lobster_" + mode_label + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)

        logging.info("starting queue as {0}".format(queue.name))
        logging.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('tune', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('tune', {}).get('abort threshold', 400)
        abort_multiplier = config.get('tune', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                    "#timestamp " +
                    "total_workers_connected total_workers_joined total_workers_removed " +
                    "workers_busy workers_idle " +
                    "tasks_running " +
                    "total_send_time total_receive_time " +
                    "total_create_time total_return_time " +
                    "idle_percentage " +
                    "capacity " +
                    "efficiency " +
                    "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        jobits_left
                    ]
                    )) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                logging.info("terminating gracefully")
                break

            logging.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                    stats.workers_busy,
                    stats.workers_busy + stats.workers_ready,
                    jobits_left,
                    stats.tasks_running,
                    stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs is None or len(jobs) == 0:
                    break

                hunger -= len(jobs)

                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(1)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote) in inputs:
                        if os.path.isfile(local):
                            task.specify_input_file(str(local), str(remote), wq.WORK_QUEUE_CACHE)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                    wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logging.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None
            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logging.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logging.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise
            if successful_jobs >= abort_threshold and not abort_active:
                logging.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)
        if jobits_left == 0:
            logging.info("no more work left to do")