Beispiel #1
0
    def run(self, pdb_on_exception=False):
        """
        Run the rocket (check out a job from the database and execute it)

        With a launchpad, a Firework is checked out through ``checkout_fw``; without
        one (offline mode) it is read from ``FW.json`` in the current directory and
        progress is recorded in ``FW_offline.json``. The Firetasks are run in order,
        their FWActions are merged, and the launch is finally reported as COMPLETED
        or FIZZLED (to the launchpad, or to ``FW_offline.json`` in offline mode).

        Args:
            pdb_on_exception (bool): whether to invoke the debugger on
                a caught exception.  Default False.

        Returns:
            bool: False if no Firework was available to run, True otherwise
                (including the cases where the Firework fizzled).
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())
        logdir = lp.get_logdir() if lp else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                                 stream_level=ROCKET_STREAM_LOGLEVEL)

        def update_offline_record(**fields):
            # Read-modify-write of FW_offline.json, resolved against the *current*
            # working directory at call time (the cwd may change during the run).
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
            d.update(fields)
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            update_offline_record(started_on=datetime.utcnow().isoformat())

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        final_state = None
        ping_stop = None
        btask_stops = []

        try:
            if '_launch_dir' in m_fw.spec and lp:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                # Resolve a relative launch dir *before* chdir'ing into it, so that the
                # samefile() check and the path sent to the launchpad stay valid afterwards.
                if not os.path.isabs(launch_dir):
                    launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)

                if not os.path.samefile(launch_dir, prev_dir):
                    lp.change_launch_dir(launch_id, launch_dir)

                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    # best-effort cleanup of the now-empty original directory
                    try:
                        os.rmdir(prev_dir)
                    except Exception:
                        pass

            recovery = m_fw.spec.get('_recovery', None)
            if recovery:
                # resume from a previous checkpoint: restore the accumulated action data
                recovery_dir = recovery.get('_prev_dir')
                recovery_mode = recovery.get('_mode')
                starting_task = recovery.get('_task_n')
                all_stored_data.update(recovery.get('_all_stored_data'))
                all_update_spec.update(recovery.get('_all_update_spec'))
                all_mod_spec.extend(recovery.get('_all_mod_spec'))
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                              recovery_dir))
                if recovery_mode == 'cp' and launch_dir != recovery_dir:
                    if lp:
                        l_logger.log(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                        launch_dir))
                    distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)

            else:
                starting_task = 0
                files_in = m_fw.spec.get("_files_in", {})
                prev_files = m_fw.spec.get("_files_prev", {})
                for f in set(files_in.keys()).intersection(prev_files.keys()):
                    # We use zopen for the file objects for transparent handling
                    # of zipped files. shutil.copyfileobj does the actual copy
                    # in chunks that avoid memory issues.
                    with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                        shutil.copyfileobj(fin, fout)

            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'. \
                    format(m_fw.fw_id, os.getcwd())
                l_logger.log(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the Firetasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                # record where we are, so a later run can recover from this task
                checkpoint = {'_task_n': t_counter,
                              '_all_stored_data': all_stored_data,
                              '_all_update_spec': all_update_spec,
                              '_all_mod_spec': all_mod_spec}
                Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

                if lp:
                    l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

                if my_spec.get("_add_launchpad_and_fw_id"):
                    t.fw_id = m_fw.fw_id
                    if FWData().MULTIPROCESSING:
                        # hack because AutoProxy manager can't access attributes
                        t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                    else:
                        t.launchpad = self.launchpad

                if my_spec.get("_add_fworker"):
                    t.fworker = self.fworker

                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    # BaseException on purpose: an interrupted task is still reported as FIZZLED
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    if pdb_on_exception:
                        pdb.post_mortem()
                    # If the exception is serializable, save its details
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as serialization_error:
                        if lp:
                            l_logger.log(logging.WARNING,
                                         "Exception couldn't be serialized: %s " % serialization_error)
                        exception_details = None

                    try:
                        m_task = t.to_dict()
                    except Exception:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                     '_task': m_task,
                                                     '_exception': {'_stacktrace': tb,
                                                                    '_details': exception_details}},
                                        exit=True)
                    m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                    if lp:
                        final_state = 'FIZZLED'
                        lp.complete_launch(launch_id, m_action, final_state)
                    else:
                        update_offline_record(fwaction=m_action.to_dict(),
                                              state='FIZZLED',
                                              completed_on=datetime.utcnow().isoformat())

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return
                # it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this
                # particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            # NOTE(review): stop_backgrounds presumably already signals these; the explicit
            # set() is kept from the original - confirm whether it is redundant.
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            # the final action is the last task's action carrying the data merged over all tasks
            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

            if lp:
                final_state = 'COMPLETED'
                lp.complete_launch(launch_id, m_action, final_state)
            else:
                update_offline_record(fwaction=m_action.to_dict(),
                                      state='COMPLETED',
                                      completed_on=datetime.utcnow().isoformat())

            return True

        except LockedWorkflowError as e:
            l_logger.log(logging.DEBUG, traceback.format_exc())
            l_logger.log(logging.WARNING,
                         "Firework {} reached final state {} but couldn't complete the update of "
                         "the database. Reason: {}\nRefresh the WF to recover the result "
                         "(lpad admin refresh -i {}).".format(
                             self.fw_id, final_state, e, self.fw_id))
            return True

        except Exception:
            # problems while processing the results. high probability of malformed data.
            traceback.print_exc()
            stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': traceback.format_exc(),
                                                            '_details': None}},
                                exit=True)

            # my_spec may not exist yet if the failure happened early; in that case the
            # undecorated action is used
            try:
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
            except Exception:
                traceback.print_exc()

            if lp:
                try:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                except LockedWorkflowError as e:
                    l_logger.log(logging.DEBUG, traceback.format_exc())
                    # three placeholders, three arguments: fw_id, reason, fw_id
                    l_logger.log(logging.WARNING,
                                 "Firework {} fizzled but couldn't complete the update of the database."
                                 " Reason: {}\nRefresh the WF to recover the result "
                                 "(lpad admin refresh -i {}).".format(
                                     self.fw_id, e, self.fw_id))
                    return True
            else:
                update_offline_record(fwaction=m_action.to_dict(),
                                      state='FIZZLED',
                                      completed_on=datetime.utcnow().isoformat())

            return True
Beispiel #2
0
    def run(self, pdb_on_exception=False):
        """
        Run the rocket (check out a job from the database and execute it)

        With a launchpad, a Firework is checked out through ``checkout_fw``; without
        one (offline mode) it is read from ``FW.json`` in the current directory and
        progress is recorded in ``FW_offline.json``. The Firetasks are run in order,
        their FWActions are merged, and the launch is finally reported as COMPLETED
        or FIZZLED (to the launchpad, or to ``FW_offline.json`` in offline mode).

        Args:
            pdb_on_exception (bool): whether to invoke the debugger on
                a caught exception.  Default False.

        Returns:
            bool: False if no Firework was available to run, True otherwise
                (including the cases where the Firework fizzled).
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())
        logdir = lp.get_logdir() if lp else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=logdir,
                                 stream_level=ROCKET_STREAM_LOGLEVEL)

        def update_offline_record(**fields):
            # Read-modify-write of FW_offline.json, resolved against the *current*
            # working directory at call time (the cwd may change during the run).
            fpath = zpath("FW_offline.json")
            with zopen(fpath) as f_in:
                d = json.loads(f_in.read())
            d.update(fields)
            with zopen(fpath, "wt") as f_out:
                f_out.write(json.dumps(d, ensure_ascii=False))

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            update_offline_record(started_on=datetime.utcnow().isoformat())

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        final_state = None
        ping_stop = None
        btask_stops = []

        try:
            if '_launch_dir' in m_fw.spec and lp:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                # Resolve a relative launch dir *before* chdir'ing into it, so that the
                # samefile() check and the path sent to the launchpad stay valid afterwards.
                if not os.path.isabs(launch_dir):
                    launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)

                if not os.path.samefile(launch_dir, prev_dir):
                    lp.change_launch_dir(launch_id, launch_dir)

                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    # best-effort cleanup of the now-empty original directory
                    try:
                        os.rmdir(prev_dir)
                    except Exception:
                        pass

            recovery = m_fw.spec.get('_recovery', None)
            if recovery:
                # resume from a previous checkpoint: restore the accumulated action data
                recovery_dir = recovery.get('_prev_dir')
                recovery_mode = recovery.get('_mode')
                starting_task = recovery.get('_task_n')
                all_stored_data.update(recovery.get('_all_stored_data'))
                all_update_spec.update(recovery.get('_all_update_spec'))
                all_mod_spec.extend(recovery.get('_all_mod_spec'))
                if lp:
                    l_logger.log(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task,
                                                                              recovery_dir))
                if recovery_mode == 'cp' and launch_dir != recovery_dir:
                    if lp:
                        l_logger.log(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recovery_dir,
                                                                                        launch_dir))
                    distutils.dir_util.copy_tree(recovery_dir, launch_dir, update=1)

            else:
                starting_task = 0
                files_in = m_fw.spec.get("_files_in", {})
                prev_files = m_fw.spec.get("_files_prev", {})
                for f in set(files_in.keys()).intersection(prev_files.keys()):
                    # We use zopen for the file objects for transparent handling
                    # of zipped files. shutil.copyfileobj does the actual copy
                    # in chunks that avoid memory issues.
                    with zopen(prev_files[f], "rb") as fin, zopen(files_in[f], "wb") as fout:
                        shutil.copyfileobj(fin, fout)

            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'.\
                    format(m_fw.fw_id, os.getcwd())
                l_logger.log(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the Firetasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                # record where we are, so a later run can recover from this task
                checkpoint = {'_task_n': t_counter,
                              '_all_stored_data': all_stored_data,
                              '_all_update_spec': all_update_spec,
                              '_all_mod_spec': all_mod_spec}
                Rocket.update_checkpoint(lp, launch_dir, launch_id, checkpoint)

                if lp:
                    l_logger.log(logging.INFO, "Task started: %s." % t.fw_name)

                if my_spec.get("_add_launchpad_and_fw_id"):
                    t.fw_id = m_fw.fw_id
                    if FWData().MULTIPROCESSING:
                        # hack because AutoProxy manager can't access attributes
                        t.launchpad = LaunchPad.from_dict(self.launchpad.to_dict())
                    else:
                        t.launchpad = self.launchpad

                if my_spec.get("_add_fworker"):
                    t.fworker = self.fworker

                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    # BaseException on purpose: an interrupted task is still reported as FIZZLED
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    if pdb_on_exception:
                        pdb.post_mortem()
                    # If the exception is serializable, save its details
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as serialization_error:
                        if lp:
                            l_logger.log(logging.WARNING,
                                         "Exception couldn't be serialized: %s " % serialization_error)
                        exception_details = None

                    try:
                        m_task = t.to_dict()
                    except Exception:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task',
                                                     '_task': m_task,
                                                     '_exception': {'_stacktrace': tb,
                                                                    '_details': exception_details}},
                                        exit=True)
                    m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                    if lp:
                        final_state = 'FIZZLED'
                        lp.complete_launch(launch_id, m_action, final_state)
                    else:
                        update_offline_record(fwaction=m_action.to_dict(),
                                              state='FIZZLED',
                                              completed_on=datetime.utcnow().isoformat())

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return
                # it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this
                # particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    l_logger.log(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            # NOTE(review): stop_backgrounds presumably already signals these; the explicit
            # set() is kept from the original - confirm whether it is redundant.
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            # the final action is the last task's action carrying the data merged over all tasks
            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

            if lp:
                final_state = 'COMPLETED'
                lp.complete_launch(launch_id, m_action, final_state)
            else:
                update_offline_record(fwaction=m_action.to_dict(),
                                      state='COMPLETED',
                                      completed_on=datetime.utcnow().isoformat())

            return True

        except LockedWorkflowError as e:
            l_logger.log(logging.DEBUG, traceback.format_exc())
            l_logger.log(logging.WARNING,
                         "Firework {} reached final state {} but couldn't complete the update of "
                         "the database. Reason: {}\nRefresh the WF to recover the result "
                         "(lpad admin refresh -i {}).".format(
                             self.fw_id, final_state, e, self.fw_id))
            return True

        except Exception:
            # problems while processing the results. high probability of malformed data.
            # (Exception rather than a bare except, so KeyboardInterrupt/SystemExit raised
            # outside of a task are not swallowed here.)
            traceback.print_exc()
            stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': traceback.format_exc(),
                                                            '_details': None}},
                                exit=True)

            # my_spec may not exist yet if the failure happened early; in that case the
            # undecorated action is used
            try:
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
            except Exception:
                traceback.print_exc()

            if lp:
                try:
                    lp.complete_launch(launch_id, m_action, 'FIZZLED')
                except LockedWorkflowError as e:
                    l_logger.log(logging.DEBUG, traceback.format_exc())
                    # three placeholders, three arguments: fw_id, reason, fw_id
                    l_logger.log(logging.WARNING,
                                 "Firework {} fizzled but couldn't complete the update of the database."
                                 " Reason: {}\nRefresh the WF to recover the result "
                                 "(lpad admin refresh -i {}).".format(
                                     self.fw_id, e, self.fw_id))
                    return True
            else:
                update_offline_record(fwaction=m_action.to_dict(),
                                      state='FIZZLED',
                                      completed_on=datetime.utcnow().isoformat())

            return True
Beispiel #3
0
    def run(self):
        """
        Run the rocket (check out a job from the database and execute it)
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            with open('FW_offline.json', 'r+') as f:
                d = json.loads(f.read())
                d['started_on'] = datetime.utcnow().isoformat()
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        try:
            if '_launch_dir' in m_fw.spec and lp:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                if not os.path.abspath(launch_dir):
                    launch_dir = os.path.normpath(os.path.join(os.getcwd(), launch_dir))
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)

                if not os.path.samefile(launch_dir, prev_dir):
                    lp.change_launch_dir(launch_id, launch_dir)

                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    try:
                        os.rmdir(prev_dir)
                    except:
                        pass

            if m_fw.spec.get('_recover_launch', None):
                launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
                starting_task = launch_to_recover.action.stored_data.get('_exception', {}).get('_failed_task_n', 0)
                recover_launch_dir = launch_to_recover.launch_dir
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task, recover_launch_dir))
                if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                    if lp:
                        lp.log_message(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recover_launch_dir, launch_dir))
                    distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)

            else:
                starting_task = 0

            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'.\
                    format(m_fw.fw_id, os.getcwd())
                lp.log_message(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)

            # start background tasks
            btask_stops = []
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the FireTasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                if lp:
                    lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)

                if my_spec.get("_add_launchpad_and_fw_id"):
                    t.launchpad = self.launchpad
                    t.fw_id = m_fw.fw_id

                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    # If the exception is serializable, save its details
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as e:
                        if lp:
                            lp.log_message(logging.WARNING, "Exception couldn't be serialized: %s " % e)
                        exception_details = None

                    try:
                        m_task = t.to_dict()
                    except:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': m_task,
                                                     '_exception': {'_stacktrace': tb, '_details': exception_details,
                                                                    '_failed_task_n': t_counter}}, exit=True)
                    m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

                    if lp:
                        lp.complete_launch(launch_id, m_action, 'FIZZLED')
                    else:
                        with open('FW_offline.json', 'r+') as f:
                            d = json.loads(f.read())
                            d['fwaction'] = m_action.to_dict()
                            d['state'] = 'FIZZLED'
                            f.seek(0)
                            f.write(json.dumps(d))
                            f.truncate()

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)

            if lp:
                lp.complete_launch(launch_id, m_action, 'COMPLETED')
            else:
                with open('FW_offline.json', 'r+') as f:
                    d = json.loads(f.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'COMPLETED'
                    d['completed_on'] = datetime.utcnow().isoformat()
                    f.seek(0)
                    f.write(json.dumps(d))
                    f.truncate()

            return True

        except:
            # problems while processing the results. high probability of malformed data.
            traceback.print_exc()
            stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': traceback.format_exc(),
                                             '_details': None}}, exit=True)

            try:
                m_action = self.decorate_fwaction(m_action, my_spec, m_fw, launch_dir)
            except:
                traceback.print_exc()

            if lp:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            else:
                with open('FW_offline.json', 'r+') as f:
                    d = json.loads(f.read())
                    d['fwaction'] = m_action.to_dict()
                    d['state'] = 'FIZZLED'
                    f.seek(0)
                    f.write(json.dumps(d))
                    f.truncate()

            return True
Beispiel #4
0
    def run(self):
        """
        Run the rocket (check out a job from the database and execute it).

        A Firework is checked out from the LaunchPad or, when there is no
        LaunchPad (offline mode), read from ``FW.json`` in the current
        directory. Its tasks are run in order, starting at the failed task of a
        previous launch when the spec contains ``_recover_launch``. The combined
        result is reported with state ``COMPLETED`` or ``FIZZLED``: to the
        LaunchPad when there is one, otherwise into ``FW_offline.json``.

        Returns:
            bool: False if no Firework was available to run; True otherwise,
                including the cases where the Firework ended up FIZZLED.
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())
        # Resolved before any chdir() into '_launch_dir', so the offline status
        # file stays reachable for the whole run.
        offline_path = os.path.join(launch_dir, 'FW_offline.json')

        def update_offline_file(**fields):
            """Merge ``fields`` into the JSON document stored in FW_offline.json."""
            with open(offline_path, 'r+') as f:
                d = json.loads(f.read())
                d.update(fields)
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            update_offline_file(started_on=datetime.utcnow().isoformat())

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        # Bound before the try block so the outer error handler can always refer
        # to them, even when the failure happens before the heartbeat is started.
        ping_stop = None
        btask_stops = []
        backgrounds_started = False

        try:
            if '_launch_dir' in m_fw.spec:
                prev_dir = launch_dir
                launch_dir = os.path.expandvars(m_fw.spec['_launch_dir'])
                # thread-safe "mkdir -p"
                try:
                    os.makedirs(launch_dir)
                except OSError as exception:
                    if exception.errno != errno.EEXIST:
                        raise
                os.chdir(launch_dir)
                launch_dir = os.path.abspath(os.getcwd())

                if lp:
                    lp.change_launch_dir(launch_id, launch_dir)

                if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                    # best effort: failing to remove the old directory is harmless
                    try:
                        os.rmdir(prev_dir)
                    except OSError:
                        pass

            if m_fw.spec.get('_recover_launch', None):
                # NOTE(review): this dereferences lp without a guard, so recovery
                # fails in offline mode (lp is None) -- confirm this is intended.
                launch_to_recover = lp.get_launch_by_id(m_fw.spec['_recover_launch']['_launch_id'])
                starting_task = launch_to_recover.action.stored_data.get('_exception', {}).get('_failed_task_n', 0)
                recover_launch_dir = launch_to_recover.launch_dir
                if lp:
                    lp.log_message(
                        logging.INFO,
                        'Recovering from task number {} in folder {}.'.format(starting_task, recover_launch_dir))
                if m_fw.spec['_recover_launch']['_recover_mode'] == 'cp' and launch_dir != recover_launch_dir:
                    if lp:
                        lp.log_message(
                            logging.INFO,
                            'Copying data from recovery folder {} to folder {}.'.format(recover_launch_dir, launch_dir))
                    distutils.dir_util.copy_tree(recover_launch_dir, launch_dir, update=1)

            else:
                starting_task = 0

            # Logged and written only after the switch to '_launch_dir', so the
            # message names the directory the tasks run in and the files land there.
            if lp:
                message = 'RUNNING fw_id: {} in directory: {}'.\
                    format(m_fw.fw_id, os.getcwd())
                lp.log_message(logging.INFO, message)

            # write FW.json and/or FW.yaml to the directory
            if PRINT_FW_JSON:
                m_fw.to_file('FW.json', indent=4)
            if PRINT_FW_YAML:
                m_fw.to_file('FW.yaml')

            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)
            backgrounds_started = True

            # start background tasks
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the FireTasks!
            for t_counter, t in enumerate(m_fw.tasks[starting_task:], start=starting_task):
                if lp:
                    lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)
                try:
                    m_action = t.run_task(my_spec)
                except BaseException as e:
                    traceback.print_exc()
                    tb = traceback.format_exc()
                    stop_backgrounds(ping_stop, btask_stops)
                    do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
                    # If the exception is serializable, save its details
                    try:
                        exception_details = e.to_dict()
                    except AttributeError:
                        exception_details = None
                    except BaseException as ser_exc:
                        # distinct name: do not rebind the task's own exception
                        if lp:
                            lp.log_message(logging.WARNING, "Exception couldn't be serialized: %s " % ser_exc)
                        exception_details = None

                    # best effort: a task that cannot be serialized is stored as None
                    try:
                        m_task = t.to_dict()
                    except BaseException:
                        m_task = None

                    m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': m_task,
                                                     '_exception': {'_stacktrace': tb, '_details': exception_details,
                                                                    '_failed_task_n': t_counter}}, exit=True)
                    if lp:
                        lp.complete_launch(launch_id, m_action, 'FIZZLED')
                    else:
                        update_offline_file(fwaction=m_action.to_dict(), state='FIZZLED')

                    return True

                # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            # the action of the last executed task carries the combined results
            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            if lp:
                lp.complete_launch(launch_id, m_action, 'COMPLETED')
            else:
                update_offline_file(fwaction=m_action.to_dict(), state='COMPLETED',
                                    completed_on=datetime.utcnow().isoformat())

            return True

        except BaseException:
            # problems while processing the results. high probability of malformed data.
            # Deliberately catches everything so the launch is always marked FIZZLED.
            traceback.print_exc()
            tb = traceback.format_exc()
            if backgrounds_started:
                stop_backgrounds(ping_stop, btask_stops)
            # restore initial state to prevent the raise of further exceptions
            if lp:
                lp.restore_backup_data(launch_id, m_fw.fw_id)

            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # the action produced by the task is discarded
            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': None,
                                             '_exception': {'_stacktrace': tb,
                                                            '_details': None}}, exit=True)
            if lp:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            else:
                update_offline_file(fwaction=m_action.to_dict(), state='FIZZLED')

            return True
Beispiel #5
0
    def run(self):
        """
        Run the rocket (check out a job from the database and execute it).

        A Firework is checked out from the LaunchPad or, when there is no
        LaunchPad (offline mode), read from ``FW.json`` in the current
        directory. Its tasks are run in order and the combined result is
        reported with state ``COMPLETED`` or ``FIZZLED``: to the LaunchPad when
        there is one, otherwise into ``FW_offline.json``.

        Returns:
            bool: False if no Firework was available to run; True otherwise,
                including the cases where the Firework ended up FIZZLED.
        """
        all_stored_data = {}  # combined stored data for *all* the Tasks
        all_update_spec = {}  # combined update_spec for *all* the Tasks
        all_mod_spec = []  # combined mod_spec for *all* the Tasks

        lp = self.launchpad
        launch_dir = os.path.abspath(os.getcwd())
        # Resolved before any chdir() into '_launch_dir', so the offline status
        # file stays reachable for the whole run.
        offline_path = os.path.join(launch_dir, 'FW_offline.json')

        def update_offline_file(**fields):
            """Merge ``fields`` into the JSON document stored in FW_offline.json."""
            with open(offline_path, 'r+') as f:
                d = json.loads(f.read())
                d.update(fields)
                f.seek(0)
                f.write(json.dumps(d))
                f.truncate()

        # check a FW job out of the launchpad
        if lp:
            m_fw, launch_id = lp.checkout_fw(self.fworker, launch_dir, self.fw_id)
        else:  # offline mode
            m_fw = Firework.from_file(os.path.join(os.getcwd(), "FW.json"))

            # set the run start time
            update_offline_file(started_on=datetime.utcnow().isoformat())

            launch_id = None  # we don't need this in offline mode...

        if not m_fw:
            print("No FireWorks are ready to run and match query! {}".format(self.fworker.query))
            return False

        if '_launch_dir' in m_fw.spec:
            prev_dir = launch_dir
            os.chdir(m_fw.spec['_launch_dir'])
            launch_dir = os.path.abspath(os.getcwd())

            if lp:
                lp.change_launch_dir(launch_id, launch_dir)

            if not os.listdir(prev_dir) and REMOVE_USELESS_DIRS:
                # best effort: failing to remove the old directory is harmless
                try:
                    os.rmdir(prev_dir)
                except OSError:
                    pass

        if lp:
            message = 'RUNNING fw_id: {} in directory: {}'.\
                format(m_fw.fw_id, os.getcwd())
            lp.log_message(logging.INFO, message)

        # write FW.json and/or FW.yaml to the directory
        if PRINT_FW_JSON:
            m_fw.to_file('FW.json', indent=4)
        if PRINT_FW_YAML:
            m_fw.to_file('FW.yaml')

        # Bound before the try block so the error handler can always refer to
        # them, even when the failure happens before the first task starts.
        ping_stop = None
        btask_stops = []
        backgrounds_started = False
        t = None  # the task currently being executed, if any

        try:
            my_spec = dict(m_fw.spec)  # make a copy of spec, don't override original
            my_spec["_fw_env"] = self.fworker.env

            # set up heartbeat (pinging the server that we're still alive)
            ping_stop = start_ping_launch(lp, launch_id)
            backgrounds_started = True

            # start background tasks
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    btask_stops.append(start_background_task(bt, m_fw.spec))

            # execute the FireTasks!
            for t in m_fw.tasks:
                if lp:  # there is no LaunchPad to log to in offline mode
                    lp.log_message(logging.INFO, "Task started: %s." % t.fw_name)
                m_action = t.run_task(my_spec)

                # read in a FWAction from a file, in case the task is not Python and cannot return it explicitly
                if os.path.exists('FWAction.json'):
                    m_action = FWAction.from_file('FWAction.json')
                elif os.path.exists('FWAction.yaml'):
                    m_action = FWAction.from_file('FWAction.yaml')

                if not m_action:
                    m_action = FWAction()

                # update the global stored data with the data to store and update from this particular Task
                all_stored_data.update(m_action.stored_data)
                all_update_spec.update(m_action.update_spec)
                all_mod_spec.extend(m_action.mod_spec)

                # update spec for next task as well
                my_spec.update(m_action.update_spec)
                for mod in m_action.mod_spec:
                    apply_mod(mod, my_spec)
                if lp:
                    lp.log_message(logging.INFO, "Task completed: %s " % t.fw_name)
                if m_action.skip_remaining_tasks:
                    break

            # add job packing info if this is needed
            if FWData().MULTIPROCESSING and STORE_PACKING_INFO:
                all_stored_data['multiprocess_name'] = multiprocessing.current_process().name

            # perform finishing operation
            stop_backgrounds(ping_stop, btask_stops)
            for b in btask_stops:
                b.set()
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor
            # last background monitors
            if '_background_tasks' in my_spec:
                for bt in my_spec['_background_tasks']:
                    if bt.run_on_finish:
                        for task in bt.tasks:
                            task.run_task(m_fw.spec)

            # the action of the last executed task carries the combined results
            m_action.stored_data = all_stored_data
            m_action.mod_spec = all_mod_spec
            m_action.update_spec = all_update_spec

            if lp:
                lp.complete_launch(launch_id, m_action, 'COMPLETED')
            else:
                update_offline_file(fwaction=m_action.to_dict(), state='COMPLETED',
                                    completed_on=datetime.utcnow().isoformat())

            return True

        except BaseException:
            # Deliberately catches everything so the launch is always marked FIZZLED.
            # Capture the trace first: formatting it inside a nested handler would
            # record the secondary error instead of the one that stopped the run.
            traceback.print_exc()
            tb = traceback.format_exc()
            if backgrounds_started:
                stop_backgrounds(ping_stop, btask_stops)
            do_ping(lp, launch_id)  # one last ping, esp if there is a monitor

            # best effort: no task started yet, or a task that cannot be
            # serialized, is stored as None
            try:
                m_task = t.to_dict()
            except BaseException:
                m_task = None

            m_action = FWAction(stored_data={'_message': 'runtime error during task', '_task': m_task,
                                             '_exception': tb}, exit=True)
            if lp:
                lp.complete_launch(launch_id, m_action, 'FIZZLED')
            else:
                update_offline_file(fwaction=m_action.to_dict(), state='FIZZLED')

            return True