Example #1
    def setup(self, config):
        super(Dashboard, self).setup(config)
        if util.checkpoint(config.workdir, "sandbox cmssw version"):
            self.__cmssw_version = str(
                util.checkpoint(config.workdir, "sandbox cmssw version"))
        if util.checkpoint(config.workdir, "executable"):
            self.__executable = str(
                util.checkpoint(config.workdir, "executable"))
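Every example on this page goes through the same two helpers: util.checkpoint(workdir, key) reads a value back (returning a falsy result when the key was never set, which is why callers test it directly), and util.register_checkpoint(workdir, key, value) records one. Lobster's real implementation is not shown here; the sketch below is only a plausible reading of the API, assuming checkpoints live as a flat mapping in a YAML status file inside the working directory (Example #16 writes such a status.yaml by hand).

import os
import yaml

def register_checkpoint(workdir, key, value):
    # Assumed layout: one flat key/value mapping in <workdir>/status.yaml.
    statusfile = os.path.join(workdir, 'status.yaml')
    data = {}
    if os.path.isfile(statusfile):
        with open(statusfile) as f:
            data = yaml.safe_load(f) or {}
    data[key] = value
    with open(statusfile, 'w') as f:
        yaml.safe_dump(data, f, default_flow_style=False)

def checkpoint(workdir, key):
    # Returns the stored value, or None when the key was never registered.
    statusfile = os.path.join(workdir, 'status.yaml')
    if not os.path.isfile(statusfile):
        return None
    with open(statusfile) as f:
        return (yaml.safe_load(f) or {}).get(key)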
Example #2
def kill(args):
    logger.info("setting flag to quit at the next checkpoint")
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    util.register_checkpoint(workdir, 'KILLED', 'PENDING')

    logger.info("reporting unfinished tasks as Aborted to the dashboard")
    # report unfinished tasks as aborted to dashboard
    # note: even if lobster didn't terminate gracefully for some reason,
    # "lobster terminate" can still be run afterwards to properly update
    # the tasks as aborted to the dashboard.
    db_path = os.path.join(workdir, "lobster.db")
    db = sqlite3.connect(db_path)
    ids = db.execute("select id from jobs where status=1").fetchall()
    task_id = util.checkpoint(workdir, 'id')
    if task_id:
        for (id,) in ids:
            if config.get('use dashboard', True):
                dash = cmssw.dash.Monitor(task_id)
            else:
                dash = cmssw.dash.DummyMonitor(task_id)
            dash.update_job(id, cmssw.dash.ABORTED)
    else:
        logger.warning("""taskid not found: could not report aborted jobs
                       to the dashboard""")
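Note that registering 'KILLED' as 'PENDING' is only a request: the running process polls that checkpoint in its main loop and performs the actual shutdown, as Example #6 shows. Condensed to the handshake itself (names taken verbatim from the examples above and below):

# producer side (this kill command): ask the running process to stop
util.register_checkpoint(workdir, 'KILLED', 'PENDING')

# consumer side (main processing loop, cf. Example #6): honor the request
if util.checkpoint(workdir, 'KILLED') == 'PENDING':
    util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
    logger.info("terminating gracefully")
    # ... shut down the task source and break out of the loop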
Example #3
    def __init__(self, config, source):
        self.config = config
        self.source = source

        if config.plotdir:
            logger.info('plots in {0} will be updated automatically'.format(config.plotdir))
            if config.foremen_logs:
                logger.info('foremen logs will be included from: {0}'.format(', '.join(config.foremen_logs)))
            self.plotter = Plotter(config)

        self.__last = datetime.datetime.now()
        self.__last_config_update = util.checkpoint(config.workdir, 'configuration_check')
        if not self.__last_config_update:
            self.__last_config_update = time.time()
            util.register_checkpoint(config.workdir, 'configuration_check', self.__last_config_update)
Example #4
    def __init__(self, config, source):
        self.config = config
        self.source = source

        if config.plotdir:
            logger.info('plots in {0} will be updated automatically'.format(
                config.plotdir))
            if config.foremen_logs:
                logger.info('foremen logs will be included from: {0}'.format(
                    ', '.join(config.foremen_logs)))
            self.plotter = Plotter(config)

        self.__last = datetime.datetime.now()
        self.__last_config_update = util.checkpoint(config.workdir,
                                                    'configuration_check')
        if not self.__last_config_update:
            self.__last_config_update = time.time()
            util.register_checkpoint(config.workdir, 'configuration_check',
                                     self.__last_config_update)
Example #5
    def __init__(self, config):
        super(MergeProvider, self).__init__(config)

        self.__chirp = self.config.get('stageout server', None)
        self.__sandbox = os.path.join(self.workdir, 'sandbox')
        self.__dash = dash.DummyMonitor(self.taskid)
        self.__mergehandlers = {}

        self.__store = jobit.JobitStore(self.config)
        self.__store.reset_merging()
        logging.info("registering unmerged jobs")
        self.__store.register_unmerged()

        self.__grid_files = [(os.path.join('/cvmfs/grid.cern.ch', x), os.path.join('grid', x)) for x in
                                 ['3.2.11-1/external/etc/profile.d/clean-grid-env-funcs.sh',
                                  '3.2.11-1/external/etc/profile.d/grid-env-funcs.sh',
                                  '3.2.11-1/external/etc/profile.d/grid-env.sh',
                                  '3.2.11-1/etc/profile.d/grid-env.sh',
                                  '3.2.11-1/glite/bin/voms-proxy-info',
                                  '3.2.11-1/glite/lib64/libvomsapi_nog.so.0.0.0',
                                  '3.2.11-1/glite/lib64/libvomsapi_nog.so.0',
                                  'etc/grid-security/certificates'
                                  ]
                             ]

        self.__common_inputs = [(self.__sandbox + ".tar.bz2", "sandbox.tar.bz2"),
                                (os.path.join(os.path.dirname(__file__), 'data', 'mtab'), 'mtab'),
                                (os.path.join(os.path.dirname(__file__), 'data', 'siteconfig'), 'siteconfig'),
                                (os.path.join(os.path.dirname(__file__), 'data', 'wrapper.sh'), 'wrapper.sh'),
                                (self.parrot_bin, 'bin'),
                                (self.parrot_lib, 'lib'),
                                (os.path.join(os.path.dirname(__file__), 'data', 'job.py'), 'job.py'),
                                (os.path.join(os.path.dirname(__file__), 'data', 'merge_cfg.py'), 'merge_cfg.py')
                                ] + self.__grid_files

        if not util.checkpoint(self.workdir, 'sandbox'):
            raise NotImplementedError
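MergeProvider refuses to run before a sandbox has been registered, which is the purpose of the final check above. The producer side of that handshake appears in Example #8 below, which marks the sandbox as created and, when all workflows share one CMSSW version, records that too; quoted here (lightly condensed) for context:

# producer side (cf. Example #8): register the sandbox checkpoints
if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
    util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
    versions = set(w.version for w in self.config.workflows)
    if len(versions) == 1:
        util.register_checkpoint(self.workdir, 'sandbox cmssw version', versions.pop())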
Example #6
    def sprint(self):
        with util.PartiallyMutable.unlock():
            self.source = TaskProvider(self.config)
        action = actions.Actions(self.config, self.source)

        logger.info("using wq from {0}".format(wq.__file__))
        logger.info("running Lobster version {0}".format(util.get_version()))
        logger.info("current PID is {0}".format(os.getpid()))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(
            os.path.join(self.config.workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        self.queue = wq.WorkQueue(self.config.advanced.wq_port)
        self.queue.specify_min_taskid(self.source.max_taskid() + 1)
        self.queue.specify_log(
            os.path.join(self.config.workdir, "work_queue.log"))
        self.queue.specify_transactions_log(
            os.path.join(self.config.workdir, "transactions.log"))
        self.queue.specify_name("lobster_" + self.config.label)
        self.queue.specify_keepalive_timeout(300)
        # self.queue.tune("short-timeout", 600)
        self.queue.tune("transfer-outlier-factor", 4)
        self.queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)
        if self.config.advanced.full_monitoring:
            self.queue.enable_monitoring_full(None)
        else:
            self.queue.enable_monitoring(None)

        logger.info("starting queue as {0}".format(self.queue.name))

        abort_active = False
        abort_threshold = self.config.advanced.abort_threshold
        abort_multiplier = self.config.advanced.abort_multiplier

        wq_max_retries = self.config.advanced.wq_max_retries

        if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(self.config.workdir, 'KILLED', 'RESTART')

        # time in seconds to wait for WQ to return tasks, with minimum wait
        # time in case no more tasks are waiting
        interval = 120
        interval_minimum = 30

        tasks_left = 0
        units_left = 0
        successful_tasks = 0

        categories = []

        self.setup_logging('all')
        # Workflows can be assigned categories, with each category having
        # different cpu/memory/walltime requirements that WQ will automatically
        # fine-tune
        for category in self.config.categories:
            constraints = category.wq()
            if category.name != 'merge':
                categories.append(category.name)
                self.setup_logging(category.name)
            self.queue.specify_category_mode(category.name, category.mode)
            if category.mode == wq.WORK_QUEUE_ALLOCATION_MODE_FIXED:
                self.queue.specify_category_max_resources(
                    category.name, constraints)
            else:
                self.queue.specify_category_first_allocation_guess(
                    category.name, constraints)
            logger.debug('Category {0}: {1}'.format(category.name,
                                                    constraints))
            if 'wall_time' not in constraints:
                self.queue.activate_fast_abort_category(
                    category.name, abort_multiplier)

        proxy_email_sent = False
        while not self.source.done():
            with self.measure('status'):
                tasks_left = self.source.tasks_left()
                units_left = self.source.work_left()

                logger.debug("expecting {0} tasks, still".format(tasks_left))
                self.queue.specify_num_tasks_left(tasks_left)

                for c in categories + ['all']:
                    self.log(c, units_left)

                if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
                    util.register_checkpoint(self.config.workdir, 'KILLED',
                                             str(datetime.datetime.utcnow()))

                    # let the task source shut down gracefully
                    logger.info("terminating task source")
                    self.source.terminate()
                    logger.info("terminating gracefully")
                    break

            with self.measure('create'):
                have = {}
                for c in categories:
                    cstats = self.queue.stats_category(c)
                    have[c] = {
                        'running': cstats.tasks_running,
                        'queued': cstats.tasks_waiting
                    }

                stats = self.queue.stats_hierarchy
                tasks = self.source.obtain(stats.total_cores, have)

                expiry = None
                if self.config.advanced.proxy:
                    expiry = self.config.advanced.proxy.expires()
                    proxy_time_left = self.config.advanced.proxy.time_left()
                    if proxy_time_left >= 24 * 3600:
                        proxy_email_sent = False
                    if proxy_time_left < 24 * 3600 and not proxy_email_sent:
                        util.sendemail(
                            "Your proxy is about to expire.\n" + "Timeleft: " +
                            str(datetime.timedelta(seconds=proxy_time_left)),
                            self.config)
                        proxy_email_sent = True

                for category, cmd, id, inputs, outputs, env, dir in tasks:
                    task = wq.Task(cmd)
                    task.specify_category(category)
                    task.specify_tag(id)
                    task.specify_max_retries(wq_max_retries)
                    task.specify_monitor_output(
                        os.path.join(dir, 'resource_monitor'))

                    for k, v in env.items():
                        task.specify_environment_variable(k, v)

                    for (local, remote, cache) in inputs:
                        cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                        if os.path.isfile(local) or os.path.isdir(local):
                            task.specify_input_file(str(local), str(remote),
                                                    cache_opt)
                        else:
                            logger.critical(
                                "cannot send file to worker: {0}".format(
                                    local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    if expiry:
                        task.specify_end_time(expiry * 10**6)
                    self.queue.submit(task)

            with self.measure('status'):
                stats = self.queue.stats_hierarchy
                logger.info(
                    "{0} out of {1} workers busy; {2} tasks running, {3} waiting; {4} units left"
                    .format(stats.workers_busy,
                            stats.workers_busy + stats.workers_ready,
                            stats.tasks_running, stats.tasks_waiting,
                            units_left))

            with self.measure('update'):
                self.source.update(self.queue)

            # recurring actions are triggered here; plotting etc should run
            # while we have WQ hand us back tasks w/o any database
            # interaction
            with self.measure('action'):
                if action:
                    action.take()

            with self.measure('fetch'):
                starttime = time.time()
                task = self.queue.wait(interval)
                tasks = []
                while task:
                    if task.return_status == 0:
                        successful_tasks += 1
                    elif task.return_status in self.config.advanced.bad_exit_codes:
                        logger.warning(
                            "blacklisting host {0} due to bad exit code from task {1}"
                            .format(task.hostname, task.tag))
                        self.queue.blacklist(task.hostname)
                    tasks.append(task)

                    remaining = int(starttime + interval - time.time())
                    if (interval - remaining < interval_minimum
                            or self.queue.stats.tasks_waiting > 0
                        ) and remaining > 0:
                        task = self.queue.wait(remaining)
                    else:
                        task = None
                # TODO do we really need this?  We have everything based on
                # categories by now, so this should not be needed.
                if abort_threshold > 0 and successful_tasks >= abort_threshold and not abort_active:
                    logger.info(
                        "activating fast abort with multiplier: {0}".format(
                            abort_multiplier))
                    abort_active = True
                    self.queue.activate_fast_abort(abort_multiplier)
            if len(tasks) > 0:
                try:
                    with self.measure('return'):
                        self.source.release(tasks)
                except Exception:
                    tb = traceback.format_exc()
                    logger.critical(
                        "cannot recover from the following exception:\n" + tb)
                    util.sendemail(
                        "Your Lobster project has crashed from the following exception:\n"
                        + tb, self.config)
                    for task in tasks:
                        logger.critical(
                            "tried to return task {0} from {1}".format(
                                task.tag, task.hostname))
                    raise
        if units_left == 0:
            logger.info("no more work left to do")
            util.sendemail("Your Lobster project is done!", self.config)
            if self.config.elk:
                self.config.elk.end()
            if action:
                action.take(True)
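The sprint() loop above wraps each phase in self.measure(...). Example #8 shows a util.Timing mixin being initialized with a list of phase names, so self.measure here plausibly accumulates wall-clock time per named phase. The helper itself is not on this page; the following is only a guess consistent with that usage:

import contextlib
import time

class Timing(object):
    def __init__(self, *phases):
        # Accumulated wall-clock seconds per named phase.
        self.times = dict((p, 0.0) for p in phases)

    @contextlib.contextmanager
    def measure(self, phase):
        # Used as `with self.measure('status'): ...` in the loop above.
        start = time.time()
        try:
            yield
        finally:
            self.times[phase] = self.times.get(phase, 0.0) + time.time() - start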
Example #7
def boil():
    parser = ArgumentParser(description='A task submission tool for CMS')
    parser.add_argument('--verbose',
                        '-v',
                        action='count',
                        default=0,
                        help='increase verbosity')
    parser.add_argument('--quiet',
                        '-q',
                        action='count',
                        default=0,
                        help='decrease verbosity')

    command.Command.register([
        os.path.join(os.path.dirname(__file__), d, 'commands')
        for d in ['.', 'cmssw']
    ], parser)

    parser.add_argument(
        metavar='{configfile,workdir}',
        dest='checkpoint',
        help='configuration file to use or working directory to resume.')

    args = parser.parse_args()

    if os.path.isfile(args.checkpoint):
        try:
            import imp
            cfg = imp.load_source('userconfig', args.checkpoint).config
        except Exception as e:
            parser.error("the configuration '{0}' is not valid: {1}".format(
                args.checkpoint, e))

        if util.checkpoint(cfg.workdir, 'version'):
            cfg = config.Config.load(cfg.workdir)
        elif args.plugin.__class__.__name__.lower() == 'process':
            # This is the original configuration file!
            with util.PartiallyMutable.unlock():
                cfg.base_directory = os.path.abspath(
                    os.path.dirname(args.checkpoint))
                cfg.base_configuration = os.path.abspath(args.checkpoint)
                cfg.startup_directory = os.path.abspath(os.getcwd())
                for w in cfg.workflows:
                    try:
                        w.validate()
                    except Exception as e:
                        parser.error(
                            "configuration '{0}' failed validation: {1}".
                            format(args.checkpoint, e))
        else:
            parser.error("""
                Cannot find working directory at '{0}'.
                Have you run 'lobster process {1}'?
                If so, check if you have specified the working directory to change
                programmatically (for example, with a timestamp appended). In that
                case, you will need to pass the desired working directory instead
                of the configuration file.
                """.format(cfg.workdir, args.checkpoint))
    elif os.path.isdir(args.checkpoint):
        # Load configuration from working directory passed to us
        workdir = args.checkpoint
        try:
            cfg = config.Config.load(workdir)
        except Exception as e:
            parser.error(
                "the working directory '{0}' does not contain a valid configuration: {1}"
                .format(workdir, e))
        with util.PartiallyMutable.unlock():
            cfg.workdir = workdir
    else:
        parser.error(
            "the working directory or configuration '{0}' does not exist".
            format(args.checkpoint))

    args.config = cfg
    args.preserve = []

    # Handle logging for everything in only one place!
    level = max(
        1, args.config.advanced.log_level + args.quiet - args.verbose) * 10
    logger.setLevel(level)

    formatter = logging.Formatter(
        fmt='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    if args.plugin.daemonizable:
        fn = args.plugin.__class__.__name__.lower() + '.log'
        logger.info("saving log to {0}".format(os.path.join(cfg.workdir, fn)))
        if not os.path.isdir(cfg.workdir):
            os.makedirs(cfg.workdir)
        fileh = logging.handlers.RotatingFileHandler(os.path.join(
            cfg.workdir, fn),
                                                     maxBytes=100e6,
                                                     backupCount=10)
        fileh.setFormatter(formatter)
        fileh.setLevel(logging.INFO)
        for p in args.plugin.blacklisted_logs():
            fileh.addFilter(util.InvertedFilter('lobster.' + p))
        args.preserve.append(fileh.stream)
        logger.addHandler(fileh)

        if level < logging.INFO:
            fn = args.plugin.__class__.__name__.lower() + '_debug.log'
            logger.info("saving debug log to {0}".format(
                os.path.join(cfg.workdir, fn)))
            debugh = logging.handlers.RotatingFileHandler(os.path.join(
                cfg.workdir, fn),
                                                          maxBytes=100e6,
                                                          backupCount=10)
            debugh.setFormatter(formatter)
            args.preserve.append(debugh.stream)
            logger.addHandler(debugh)

        if not getattr(args, "foreground", False):
            logger.removeHandler(console)

    for p in args.plugin.additional_logs():
        fn = p + '.log'
        lg = logging.getLogger('lobster.' + p)
        logger.info("saving additional log for {1} to {0}".format(
            os.path.join(cfg.workdir, fn), p))
        if not os.path.isdir(cfg.workdir):
            os.makedirs(cfg.workdir)
        fileh = logging.handlers.RotatingFileHandler(os.path.join(
            cfg.workdir, fn),
                                                     maxBytes=100e6,
                                                     backupCount=10)
        fileh.setFormatter(formatter)
        args.preserve.append(fileh.stream)
        lg.addHandler(fileh)

    args.plugin.run(args)
Example #8
    def __init__(self, config):
        util.Timing.__init__(self, 'dash', 'handler', 'updates', 'elk',
                             'transfers', 'cleanup', 'propagate', 'sqlite')

        self.config = config
        self.basedirs = [config.base_directory, config.startup_directory]
        self.workdir = config.workdir
        self._storage = config.storage
        self.statusfile = os.path.join(self.workdir, 'status.json')
        self.siteconf = os.path.join(self.workdir, 'siteconf')

        self.parrot_path = os.path.dirname(util.which('parrot_run'))
        self.parrot_bin = os.path.join(self.workdir, 'bin')
        self.parrot_lib = os.path.join(self.workdir, 'lib')

        self.__algo = Algo(config)
        self.__host = socket.getfqdn()
        try:
            siteconf = loadSiteLocalConfig()
            self.__ce = siteconf.siteName
            self.__se = siteconf.localStageOutPNN()
            self.__frontier_proxy = siteconf.frontierProxies[0]
        except (SiteConfigError, IndexError):
            logger.error("can't load siteconfig, defaulting to hostname")
            self.__ce = socket.getfqdn()
            self.__se = socket.getfqdn()
            try:
                self.__frontier_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error(
                    "can't determine proxy for Frontier via $HTTP_PROXY")
                sys.exit(1)

        try:
            with open('/etc/cvmfs/default.local') as f:
                lines = f.readlines()
        except IOError:
            lines = []
        for l in lines:
            m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', l)
            if m:
                self.__cvmfs_proxy = m.group(1).strip("\"'")
                break
        else:
            try:
                self.__cvmfs_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error("can't determine proxy for CVMFS via $HTTP_PROXY")
                sys.exit(1)

        logger.debug("using {} as proxy for CVMFS".format(self.__cvmfs_proxy))
        logger.debug("using {} as proxy for Frontier".format(
            self.__frontier_proxy))
        logger.debug("using {} as osg_version".format(
            self.config.advanced.osg_version))
        util.sendemail("Your Lobster project has started!", self.config)

        self.__taskhandlers = {}
        self.__store = unit.UnitStore(self.config)

        self.__setup_inputs()
        self.copy_siteconf()

        create = not util.checkpoint(self.workdir, 'id')
        if create:
            self.taskid = 'lobster_{0}_{1}'.format(
                self.config.label,
                sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
            util.register_checkpoint(self.workdir, 'id', self.taskid)
            shutil.copy(self.config.base_configuration,
                        os.path.join(self.workdir, 'config.py'))
        else:
            self.taskid = util.checkpoint(self.workdir, 'id')
            util.register_checkpoint(self.workdir, 'RESTARTED',
                                     str(datetime.datetime.utcnow()))

        if not util.checkpoint(self.workdir, 'executable'):
            # We can actually have more than one exe name (one per task label).
            # Set 'cmsRun' if any of the tasks are of that type,
            # use the command itself if all tasks execute the same one,
            # or use 'noncmsRun' if the task commands differ.
            # This is used for dashboard executable-name reporting.
            cmsconfigs = [wflow.pset for wflow in self.config.workflows]
            cmds = [wflow.command for wflow in self.config.workflows]
            if any(cmsconfigs):
                exename = 'cmsRun'
            elif all(x == cmds[0] and x is not None for x in cmds):
                exename = cmds[0]
            else:
                exename = 'noncmsRun'

            util.register_checkpoint(self.workdir, 'executable', exename)

        for wflow in self.config.workflows:
            if create and not util.checkpoint(self.workdir, wflow.label):
                wflow.setup(self.workdir, self.basedirs)
                logger.info("querying backend for {0}".format(wflow.label))
                with fs.alternative():
                    dataset_info = wflow.dataset.get_info()

                logger.info("registering {0} in database".format(wflow.label))
                self.__store.register_dataset(wflow, dataset_info,
                                              wflow.category.runtime)
                util.register_checkpoint(self.workdir, wflow.label,
                                         'REGISTERED')
            elif os.path.exists(os.path.join(wflow.workdir, 'running')):
                for id in self.get_taskids(wflow.label):
                    util.move(wflow.workdir, id, 'failed')

        for wflow in self.config.workflows:
            if wflow.parent:
                getattr(self.config.workflows,
                        wflow.parent.label).register(wflow)
                if create:
                    total_units = wflow.dataset.total_units * len(
                        wflow.unique_arguments)
                    self.__store.register_dependency(wflow.label,
                                                     wflow.parent.label,
                                                     total_units)

        if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
            util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
            versions = set([w.version for w in self.config.workflows])
            if len(versions) == 1:
                util.register_checkpoint(self.workdir, 'sandbox cmssw version',
                                         list(versions)[0])

        if self.config.elk:
            if create:
                categories = {
                    wflow.category.name: []
                    for wflow in self.config.workflows
                }
                for category in categories:
                    for workflow in self.config.workflows:
                        if workflow.category.name == category:
                            categories[category].append(workflow.label)
                self.config.elk.create(categories)
            else:
                self.config.elk.resume()

        self.config.advanced.dashboard.setup(self.config)
        if create:
            self.config.save()
            self.config.advanced.dashboard.register_run()
        else:
            self.config.advanced.dashboard.update_task_status(
                (id_, dash.ABORTED) for id_ in self.__store.reset_units())

        for p in (self.parrot_bin, self.parrot_lib):
            if not os.path.exists(p):
                os.makedirs(p)

        for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
            shutil.copy(util.which(exe), self.parrot_bin)
            subprocess.check_call(
                ["strip", os.path.join(self.parrot_bin, exe)])

        p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib',
                                'lib64', 'libparrot_helper.so')
        shutil.copy(p_helper, self.parrot_lib)
Example #9
def boil():
    parser = ArgumentParser(description='A task submission tool for CMS')
    parser.add_argument('--verbose', '-v', action='count', default=0, help='increase verbosity')
    parser.add_argument('--quiet', '-q', action='count', default=0, help='decrease verbosity')

    command.Command.register([os.path.join(os.path.dirname(__file__), d, 'commands') for d in ['.', 'cmssw']], parser)

    parser.add_argument(metavar='{configfile,workdir}', dest='checkpoint',
                        help='configuration file to use or working directory to resume.')

    args = parser.parse_args()

    if os.path.isfile(args.checkpoint):
        try:
            import imp
            cfg = imp.load_source('userconfig', args.checkpoint).config
        except Exception as e:
            parser.error("the configuration '{0}' is not valid: {1}".format(args.checkpoint, e))

        if util.checkpoint(cfg.workdir, 'version'):
            cfg = config.Config.load(cfg.workdir)
        elif args.plugin.__class__.__name__.lower() == 'process':
            # This is the original configuration file!
            with util.PartiallyMutable.unlock():
                cfg.base_directory = os.path.abspath(os.path.dirname(args.checkpoint))
                cfg.base_configuration = os.path.abspath(args.checkpoint)
                cfg.startup_directory = os.path.abspath(os.getcwd())
                for w in cfg.workflows:
                    try:
                        w.validate()
                    except Exception as e:
                        parser.error("configuration '{0}' failed validation: {1}".format(args.checkpoint, e))
        else:
            parser.error("""
                Cannot find working directory at '{0}'.
                Have you run 'lobster process {1}'?
                If so, check if you have specified the working directory to change
                programmatically (for example, with a timestamp appended). In that
                case, you will need to pass the desired working directory instead
                of the configuration file.
                """.format(cfg.workdir, args.checkpoint))
    elif os.path.isdir(args.checkpoint):
        # Load configuration from working directory passed to us
        workdir = args.checkpoint
        try:
            cfg = config.Config.load(workdir)
        except Exception as e:
            parser.error("the working directory '{0}' does not contain a valid configuration: {1}".format(workdir, e))
        with util.PartiallyMutable.unlock():
            cfg.workdir = workdir
    else:
        parser.error("the working directory or configuration '{0}' does not exist".format(args.checkpoint))

    args.config = cfg
    args.preserve = []

    # Handle logging for everything in only one place!
    level = max(1, args.config.advanced.log_level + args.quiet - args.verbose) * 10
    logger.setLevel(level)

    formatter = logging.Formatter(fmt='%(asctime)s [%(levelname)s] %(name)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    if args.plugin.daemonizable:
        fn = args.plugin.__class__.__name__.lower() + '.log'
        logger.info("saving log to {0}".format(os.path.join(cfg.workdir, fn)))
        if not os.path.isdir(cfg.workdir):
            os.makedirs(cfg.workdir)
        fileh = logging.handlers.RotatingFileHandler(os.path.join(cfg.workdir, fn), maxBytes=100e6, backupCount=10)
        fileh.setFormatter(formatter)
        fileh.setLevel(logging.INFO)
        for p in args.plugin.blacklisted_logs():
            fileh.addFilter(util.InvertedFilter('lobster.' + p))
        args.preserve.append(fileh.stream)
        logger.addHandler(fileh)

        if level < logging.INFO:
            fn = args.plugin.__class__.__name__.lower() + '_debug.log'
            logger.info("saving debug log to {0}".format(os.path.join(cfg.workdir, fn)))
            debugh = logging.handlers.RotatingFileHandler(os.path.join(cfg.workdir, fn), maxBytes=100e6, backupCount=10)
            debugh.setFormatter(formatter)
            args.preserve.append(debugh.stream)
            logger.addHandler(debugh)

        if not getattr(args, "foreground", False):
            logger.removeHandler(console)

    for p in args.plugin.additional_logs():
        fn = p + '.log'
        lg = logging.getLogger('lobster.' + p)
        logger.info("saving additional log for {1} to {0}".format(os.path.join(cfg.workdir, fn), p))
        if not os.path.isdir(cfg.workdir):
            os.makedirs(cfg.workdir)
        fileh = logging.handlers.RotatingFileHandler(os.path.join(cfg.workdir, fn), maxBytes=100e6, backupCount=10)
        fileh.setFormatter(formatter)
        args.preserve.append(fileh.stream)
        lg.addHandler(fileh)

    args.plugin.run(args)
Example #10
    def __init__(self, config):
        self.config = config
        self.basedirs = [config['configdir'], config['startdir']]
        self.workdir = config.get('workdir', os.getcwd())
        self.stageout = config.get('stageout location', os.getcwd())
        self.statusfile = os.path.join(self.workdir, 'status.yaml')

        self.parrot_path = os.path.dirname(util.which('parrot_run'))
        self.parrot_bin = os.path.join(self.workdir, 'bin')
        self.parrot_lib = os.path.join(self.workdir, 'lib')

        self.extra_inputs = {}
        self.args = {}
        self.outputs = {}
        self.outputformats = {}
        self.cmds = {}

        chirp_server = config.get('chirp server')
        chirp_root = config.get('chirp root')

        create = not util.checkpoint(self.workdir, 'id') and not self.config.get('merge', False)
        if create:
            self.taskid = 'lobster_{0}_{1}'.format(
                self.config['id'],
                sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
            util.register_checkpoint(self.workdir, 'id', self.taskid)
        else:
            self.taskid = util.checkpoint(self.workdir, 'id')
            util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

        self.config = apply_matching(self.config)
        for cfg in self.config['tasks']:
            label = cfg['label']
            self.extra_inputs[label] = map(
                    partial(util.findpath, self.basedirs),
                    cfg.get('extra inputs', []))
            self.outputs[label] = cfg.get('outputs', [])
            self.args[label] = cfg.get('parameters', [])
            self.outputformats[label] = cfg.get("output format", "{base}_{id}.{ext}")
            self.cmds[label] = cfg.get('cmd')

            taskdir = os.path.join(self.workdir, label)
            stageoutdir = os.path.join(self.stageout, label)
            if create:
                if not os.path.exists(taskdir):
                    os.makedirs(taskdir)
                if chirp_root and stageoutdir.startswith(chirp_root):
                    target = stageoutdir.replace(chirp_root, '', 1)
                    if not chirp.exists(chirp_server, chirp_root, target):
                        chirp.makedirs(chirp_server, chirp_root, target)
                else:
                    if not os.path.exists(stageoutdir):
                        os.makedirs(stageoutdir)

                shutil.copy(self.config['filename'], os.path.join(self.workdir, 'lobster_config.yaml'))

        for p in (self.parrot_bin, self.parrot_lib):
            if not os.path.exists(p):
                os.makedirs(p)

        for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
            shutil.copy(util.which(exe), self.parrot_bin)
            subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])

        p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
        shutil.copy(p_helper, self.parrot_lib)
Example #11
def run(args):
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('check proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    logging.critical("could not renew proxy")
                    sys.exit(1)
            else:
                logging.critical("please renew your proxy")
                sys.exit(1)

    mode_label = 'merge_' if args.merge else ''
    print "Saving log to {0}".format(os.path.join(workdir, mode_label+'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, mode_label+'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, mode_label+'lobster.err'))

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=get_lock(workdir)):
        logging.basicConfig(
                datefmt="%Y-%m-%d %H:%M:%S",
                format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s",
                level=config.get('log level', 2) * 10,
                filename=os.path.join(workdir, mode_label+'lobster.log'))

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('log level', 2) * 10)
            console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s"))
            logging.getLogger('').addHandler(console)

        config['configdir'] = args.configdir
        config['filepath'] = args.configfile
        config['startdir'] = args.startdir
        if args.merge:
            if args.server:
                config['stageout server'] = args.server
            config['max megabytes'] = args.max_megabytes
            job_src = cmssw.MergeProvider(config)
        elif cmsjob:
            job_src = cmssw.JobProvider(config)
        else:
            job_src = job.SimpleJobProvider(config)

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, mode_label+"work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, mode_label+"work_queue.log"))
        queue.specify_name("lobster_" + mode_label + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)

        logging.info("starting queue as {0}".format(queue.name))
        logging.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('tune', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('tune', {}).get('abort threshold', 400)
        abort_multiplier = config.get('tune', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                    "#timestamp " +
                    "total_workers_connected total_workers_joined total_workers_removed " +
                    "workers_busy workers_idle " +
                    "tasks_running " +
                    "total_send_time total_receive_time " +
                    "total_create_time total_return_time " +
                    "idle_percentage " +
                    "capacity " +
                    "efficiency " +
                    "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        jobits_left
                    ]
                    )) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                logging.info("terminating gracefully")
                break

            logging.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                    stats.workers_busy,
                    stats.workers_busy + stats.workers_ready,
                    jobits_left,
                    stats.tasks_running,
                    stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs is None or len(jobs) == 0:
                    break

                hunger -= len(jobs)

                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(1)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote) in inputs:
                        if os.path.isfile(local):
                            task.specify_input_file(str(local), str(remote), wq.WORK_QUEUE_CACHE)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                    wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logging.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None
            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except Exception:
                    tb = traceback.format_exc()
                    logging.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logging.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise
            if successful_jobs >= abort_threshold and not abort_active:
                logging.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)
        if jobits_left == 0:
            logging.info("no more work left to do")
Example #12
    def setup(self, config):
        self._workflowid = util.checkpoint(config.workdir, 'id')
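The 'id' checkpoint read here is produced once, when a project is first created: Examples #8 and #10 build it from the configured label and a truncated SHA-1 of the current UTC timestamp. As a self-contained sketch (with .encode() added so it also runs under Python 3):

from hashlib import sha1
import datetime

def make_taskid(label):
    # e.g. 'lobster_myproject_1a2b3c4d5e6f7a8b'
    stamp = str(datetime.datetime.utcnow()).encode()
    return 'lobster_{0}_{1}'.format(label, sha1(stamp).hexdigest()[-16:])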
Example #13
    def __init__(self, config):
        util.Timing.__init__(self, 'dash', 'handler', 'updates', 'elk', 'transfers', 'cleanup', 'propagate', 'sqlite')

        self.config = config
        self.basedirs = [config.base_directory, config.startup_directory]
        self.workdir = config.workdir
        self._storage = config.storage
        self.statusfile = os.path.join(self.workdir, 'status.json')
        self.siteconf = os.path.join(self.workdir, 'siteconf')

        self.parrot_path = os.path.dirname(util.which('parrot_run'))
        self.parrot_bin = os.path.join(self.workdir, 'bin')
        self.parrot_lib = os.path.join(self.workdir, 'lib')

        self.__algo = Algo(config)
        self.__host = socket.getfqdn()
        try:
            siteconf = loadSiteLocalConfig()
            self.__ce = siteconf.siteName
            self.__se = siteconf.localStageOutPNN()
            self.__frontier_proxy = siteconf.frontierProxies[0]
        except (SiteConfigError, IndexError):
            logger.error("can't load siteconfig, defaulting to hostname")
            self.__ce = socket.getfqdn()
            self.__se = socket.getfqdn()
            try:
                self.__frontier_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error("can't determine proxy for Frontier via $HTTP_PROXY")
                sys.exit(1)

        try:
            with open('/etc/cvmfs/default.local') as f:
                lines = f.readlines()
        except IOError:
            lines = []
        for l in lines:
            m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', l)
            if m:
                self.__cvmfs_proxy = m.group(1).strip("\"'")
                break
        else:
            try:
                self.__cvmfs_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error("can't determine proxy for CVMFS via $HTTP_PROXY")
                sys.exit(1)

        logger.debug("using {} as proxy for CVMFS".format(self.__cvmfs_proxy))
        logger.debug("using {} as proxy for Frontier".format(self.__frontier_proxy))
        logger.debug("using {} as osg_version".format(self.config.advanced.osg_version))
        util.sendemail("Your Lobster project has started!", self.config)

        self.__taskhandlers = {}
        self.__store = unit.UnitStore(self.config)

        self.__setup_inputs()
        self.copy_siteconf()

        create = not util.checkpoint(self.workdir, 'id')
        if create:
            self.taskid = 'lobster_{0}_{1}'.format(
                self.config.label,
                sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
            util.register_checkpoint(self.workdir, 'id', self.taskid)
            shutil.copy(self.config.base_configuration, os.path.join(self.workdir, 'config.py'))
        else:
            self.taskid = util.checkpoint(self.workdir, 'id')
            util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

        if not util.checkpoint(self.workdir, 'executable'):
            # We can actually have more than one exe name (one per task label).
            # Set 'cmsRun' if any of the tasks are of that type,
            # use the command itself if all tasks execute the same one,
            # or use 'noncmsRun' if the task commands differ.
            # This is used for dashboard executable-name reporting.
            cmsconfigs = [wflow.pset for wflow in self.config.workflows]
            cmds = [wflow.command for wflow in self.config.workflows]
            if any(cmsconfigs):
                exename = 'cmsRun'
            elif all(x == cmds[0] and x is not None for x in cmds):
                exename = cmds[0]
            else:
                exename = 'noncmsRun'

            util.register_checkpoint(self.workdir, 'executable', exename)

        for wflow in self.config.workflows:
            if create and not util.checkpoint(self.workdir, wflow.label):
                wflow.setup(self.workdir, self.basedirs)
                logger.info("querying backend for {0}".format(wflow.label))
                with fs.alternative():
                    dataset_info = wflow.dataset.get_info()

                logger.info("registering {0} in database".format(wflow.label))
                self.__store.register_dataset(wflow, dataset_info, wflow.category.runtime)
                util.register_checkpoint(self.workdir, wflow.label, 'REGISTERED')
            elif os.path.exists(os.path.join(wflow.workdir, 'running')):
                for id in self.get_taskids(wflow.label):
                    util.move(wflow.workdir, id, 'failed')

        for wflow in self.config.workflows:
            if wflow.parent:
                getattr(self.config.workflows, wflow.parent.label).register(wflow)
                if create:
                    total_units = wflow.dataset.total_units * len(wflow.unique_arguments)
                    self.__store.register_dependency(wflow.label, wflow.parent.label, total_units)

        if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
            util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
            versions = set([w.version for w in self.config.workflows])
            if len(versions) == 1:
                util.register_checkpoint(self.workdir, 'sandbox cmssw version', list(versions)[0])

        if self.config.elk:
            if create:
                categories = {wflow.category.name: [] for wflow in self.config.workflows}
                for category in categories:
                    for workflow in self.config.workflows:
                        if workflow.category.name == category:
                            categories[category].append(workflow.label)
                self.config.elk.create(categories)
            else:
                self.config.elk.resume()

        self.config.advanced.dashboard.setup(self.config)
        if create:
            self.config.save()
            self.config.advanced.dashboard.register_run()
        else:
            self.config.advanced.dashboard.update_task_status(
                (id_, dash.ABORTED) for id_ in self.__store.reset_units()
            )

        for p in (self.parrot_bin, self.parrot_lib):
            if not os.path.exists(p):
                os.makedirs(p)

        for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
            shutil.copy(util.which(exe), self.parrot_bin)
            subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])

        p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
        shutil.copy(p_helper, self.parrot_lib)
Example #14
    def setup(self, config):
        super(Dashboard, self).setup(config)
        if util.checkpoint(config.workdir, "sandbox cmssw version"):
            self.__cmssw_version = str(util.checkpoint(config.workdir, "sandbox cmssw version"))
        if util.checkpoint(config.workdir, "executable"):
            self.__executable = str(util.checkpoint(config.workdir, "executable"))
Example #15
    def run(self, args):
        self.config = args.config

        if args.finalize:
            args.config.advanced.threshold_for_failure = 0
            args.config.advanced.threshold_for_skipping = 0

        if not os.path.exists(self.config.workdir):
            os.makedirs(self.config.workdir)

        if not util.checkpoint(self.config.workdir, "version"):
            util.register_checkpoint(self.config.workdir, "version",
                                     util.get_version())
        else:
            util.verify(self.config.workdir)

        if not args.foreground:
            ttyfile = open(os.path.join(self.config.workdir, 'process.err'),
                           'a')
            logger.info("saving stderr and stdout to {0}".format(
                os.path.join(self.config.workdir, 'process.err')))
            args.preserve.append(ttyfile)

        if self.config.advanced.dump_core:
            logger.info("setting core dump size to unlimited")
            resource.setrlimit(
                resource.RLIMIT_CORE,
                (resource.RLIM_INFINITY, resource.RLIM_INFINITY))

        def localkill(num, frame):
            Terminate().run(args)

        signals = daemon.daemon.make_default_signal_map()
        signals[signal.SIGINT] = localkill
        signals[signal.SIGTERM] = localkill

        process = psutil.Process()
        preserved = [f.name for f in args.preserve]
        preserved += [os.path.realpath(os.path.abspath(f)) for f in preserved]
        openfiles = [
            f for f in process.open_files() if f.path not in preserved
        ]
        openconns = process.connections()

        for c in openconns:
            logger.debug("open connection: {}".format(c))
            args.preserve.append(c.fd)

        if len(openfiles) > 0:
            logger.error("cannot daemonize due to open files")
            for f in openfiles:
                logger.error("open file: {}".format(f.path))
            raise RuntimeError("open files or connections")

        with daemon.DaemonContext(
                detach_process=not args.foreground,
                stdout=sys.stdout if args.foreground else ttyfile,
                stderr=sys.stderr if args.foreground else ttyfile,
                files_preserve=args.preserve,
                working_directory=self.config.workdir,
                pidfile=util.get_lock(self.config.workdir, args.force),
                prevent_core=False,
                initgroups=False,
                signal_map=signals):
            self.sprint()

            logger.info("lobster terminated")
            if not args.foreground:
                logger.info("stderr and stdout saved in {0}".format(
                    os.path.join(self.config.workdir, 'process.err')))

            try:
                # Fails if something with working directory creation went wrong
                Status().run(args)
            except Exception:
                pass
Example #16
    def __init__(self, config):
        self.config = config
        self.basedirs = [config['configdir'], config['startdir']]
        self.workdir = config.get('workdir', os.getcwd())
        self.stageout = config.get('stageout location', os.getcwd())
        self.statusfile = os.path.join(self.workdir, 'status.yaml')
        self.parrot_path = os.path.dirname(util.which('parrot_run'))
        self.parrot_bin = os.path.join(self.workdir, 'bin')
        self.parrot_lib = os.path.join(self.workdir, 'lib')

        self.extra_inputs = {}
        self.args = {}
        self.outputs = {}
        self.outputformats = {}
        self.cmds = {}

        create = not util.checkpoint(self.workdir, 'id') and not self.config.get('merge', False)
        if create:
            self.taskid = 'lobster_{0}_{1}'.format(
                self.config['id'],
                sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
            with open(self.statusfile, 'wb') as f:
                yaml.dump({'id': self.taskid}, f, default_flow_style=False)
        else:
            self.taskid = util.checkpoint(self.workdir, 'id')
            util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

        self.config = apply_matching(self.config)
        for cfg in self.config['tasks']:
            label = cfg['label']
            self.extra_inputs[label] = map(
                    partial(util.findpath, self.basedirs),
                    cfg.get('extra inputs', []))
            self.outputs[label] = cfg.get('outputs', [])
            self.args[label] = cfg.get('parameters', [])
            self.outputformats[label] = cfg.get("output format", "{base}_{id}.{ext}")
            self.cmds[label] = cfg.get('cmd')

            taskdir = os.path.join(self.workdir, label)
            stageoutdir = os.path.join(self.stageout, label)
            if create:
                for dir in [taskdir, stageoutdir]:
                    if not os.path.exists(dir):
                        os.makedirs(dir)
                    else:
                        # TODO warn about non-empty stageout directories
                        pass

                shutil.copy(self.config['filepath'], os.path.join(self.workdir, 'lobster_config.yaml'))

        for p in (self.parrot_bin, self.parrot_lib):
            if not os.path.exists(p):
                os.makedirs(p)

        for exe in ('parrot_run', 'chirp_put', 'chirp_get'):
            shutil.copy(util.which(exe), self.parrot_bin)
            subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])
            for lib in util.ldd(exe):
                shutil.copy(lib, self.parrot_lib)

        p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
        shutil.copy(p_helper, self.parrot_lib)
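
Example #16 uses the checkpoint helpers to decide between a fresh start (generate a task id and write it to status.yaml) and a restart (read the id back and record a RESTARTED marker). A plausible sketch of those helpers, assuming checkpoints are kept as a flat key/value mapping in <workdir>/status.yaml; the real lobster.util implementation may differ:

import os

import yaml

def checkpoint(workdir, key):
    # read a single value back from the status file, or None
    statusfile = os.path.join(workdir, 'status.yaml')
    if not os.path.exists(statusfile):
        return None
    with open(statusfile) as f:
        return (yaml.safe_load(f) or {}).get(key)

def register_checkpoint(workdir, key, value):
    # merge one key/value pair into the status file
    statusfile = os.path.join(workdir, 'status.yaml')
    data = {}
    if os.path.exists(statusfile):
        with open(statusfile) as f:
            data = yaml.safe_load(f) or {}
    data[key] = value
    with open(statusfile, 'w') as f:
        yaml.dump(data, f, default_flow_style=False)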
Example #17
def run(args):
    dash_checker = cmssw.dash.JobStateChecker(300)
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)
        util.register_checkpoint(workdir, "version", get_distribution('Lobster').version)
    else:
        util.verify(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if 'X509_USER_PROXY' not in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('advanced', {}).get('renew proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    print("could not renew proxy: {0}".format(e))
                    sys.exit(1)
            else:
                print("please renew your proxy")
                sys.exit(1)

    print "Saving log to {0}".format(os.path.join(workdir, 'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, 'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'lobster.err'))

    signals = daemon.daemon.make_default_signal_map()
    signals[signal.SIGTERM] = lambda num, frame: kill(args)

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=util.get_lock(workdir),
            signal_map=signals):

        loglevel = config.get('advanced', {}).get('log level', 2) * 10
        logformat = ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s")

        fileh = logging.handlers.RotatingFileHandler(os.path.join(workdir, 'lobster.log'), maxBytes=500e6, backupCount=10)
        fileh.setFormatter(logformat)
        fileh.setLevel(loglevel)
        logger.addHandler(fileh)
        logger.setLevel(loglevel)

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(loglevel)
            console.setFormatter(logformat)
            logger.addHandler(console)

        config['configdir'] = args.configdir
        config['filename'] = args.configfile
        config['startdir'] = args.startdir
        if cmsjob:
            job_src = cmssw.JobProvider(config)
            actions = cmssw.Actions(config)
        else:
            job_src = job.SimpleJobProvider(config)
            actions = None

        logger.info("using wq from {0}".format(wq.__file__))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, "work_queue.log"))
        queue.specify_name("lobster_" + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)
        queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)

        logger.info("starting queue as {0}".format(queue.name))
        logger.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('advanced', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('advanced', {}).get('abort threshold', 400)
        abort_multiplier = config.get('advanced', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        jobits_left = 0
        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                    "#timestamp " +
                    "total_workers_connected total_workers_joined total_workers_removed " +
                    "workers_busy workers_idle " +
                    "tasks_running " +
                    "total_send_time total_receive_time " +
                    "total_create_time total_return_time " +
                    "idle_percentage " +
                    "capacity " +
                    "efficiency " +
                    "total_memory " +
                    "total_cores " +
                    "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        stats.total_memory,
                        stats.total_cores,
                        jobits_left
                    ]
                    )) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                # just in case, check for any remaining unfinished tasks
                # that have not been reported as aborted
                for task_id in queue._task_table.keys():
                    status = cmssw.dash.status_map[queue.task_state(task_id)]
                    if status not in (cmssw.dash.DONE, cmssw.dash.ABORTED):
                        job_src._JobProvider__dash.update_job(task_id, cmssw.dash.ABORTED)

                logger.info("terminating gracefully")
                break

            logger.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                    stats.workers_busy,
                    stats.workers_busy + stats.workers_ready,
                    jobits_left,
                    stats.tasks_running,
                    stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if not jobs:
                    break

                hunger -= len(jobs)
                cores = config.get('cores per job', 1)
                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(cores)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote, cache) in inputs:
                        if os.path.isfile(local):
                            cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                            task.specify_input_file(str(local), str(remote), cache_opt)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                    wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logger.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            # update dashboard status for all unfinished tasks; report
            # Done only once, when the task is released. WAITING_RETRIEVAL
            # is not a valid dashboard status, so it is skipped for now.
            monitor = job_src._JobProvider__dash
            exclude_states = (cmssw.dash.DONE, cmssw.dash.WAITING_RETRIEVAL)
            try:
                dash_checker.update_dashboard_states(monitor, queue, exclude_states)
            except Exception as e:
                logger.warning("could not update job states to dashboard: {0}".format(e))

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None
            if tasks:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logger.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise
            if successful_jobs >= abort_threshold and not abort_active:
                logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

            # recurring actions are triggered here
            if actions:
                actions.take()
        if jobits_left == 0:
            logger.info("no more work left to do")
Example #18
 def setup(self, config):
     self._workflowid = util.checkpoint(config.workdir, 'id')
Example #19
    def __init__(self, config):
        super(JobProvider, self).__init__(config)

        self.__chirp = self.config.get('stageout server', None)
        self.__sandbox = os.path.join(self.workdir, 'sandbox')

        self.__datasets = {}
        self.__configs = {}
        self.__jobhandlers = {}
        self.__interface = MetaInterface()
        self.__store = jobit.JobitStore(self.config)

        self.__grid_files = [
            (os.path.join('/cvmfs/grid.cern.ch', x), os.path.join('grid', x))
            for x in [
                '3.2.11-1/external/etc/profile.d/clean-grid-env-funcs.sh',
                '3.2.11-1/external/etc/profile.d/grid-env-funcs.sh',
                '3.2.11-1/external/etc/profile.d/grid-env.sh',
                '3.2.11-1/etc/profile.d/grid-env.sh',
                '3.2.11-1/glite/bin/voms-proxy-info',
                '3.2.11-1/glite/lib64/libvomsapi_nog.so.0.0.0',
                '3.2.11-1/glite/lib64/libvomsapi_nog.so.0',
                'etc/grid-security/certificates',
            ]
        ]

        if self.config.get('use dashboard', False):
            logging.info("using dashboard with task id {0}".format(self.taskid))
            self.__dash = dash.Monitor(self.taskid)
        else:
            self.__dash = dash.DummyMonitor(self.taskid)

        if not util.checkpoint(self.workdir, 'sandbox'):
            blacklist = self.config.get('sandbox blacklist', [])
            sandbox.package(os.environ['LOCALRT'], self.__sandbox, blacklist, self.config.get('recycle sandbox'))
            util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
            self.__dash.register_run()
        else:
            for id in self.__store.reset_jobits():
                self.__dash.update_job(id, dash.ABORTED)

        for cfg in self.config['tasks']:
            label = cfg['label']
            cfg['basedirs'] = self.basedirs

            cms_config = cfg.get('cmssw config')
            if cms_config:
                self.__configs[label] = os.path.basename(cms_config)

            self.__datasets[label] = cfg.get('dataset', cfg.get('files', ''))

            if cms_config and 'outputs' not in cfg:
                # avoid problems loading configs that use the VarParsing module
                sys.argv = [sys.argv[0]]
                with open(cms_config, 'r') as f:
                    source = imp.load_source('cms_config_source', cms_config, f)
                    cfg_interface = CfgInterface(source.process)
                    # possibility: make the global tag mandatory?
                    if hasattr(cfg_interface.data.GlobalTag.globaltag, 'value'):
                        cfg['global tag'] = cfg_interface.data.GlobalTag.globaltag.value()
                    for m in cfg_interface.data.outputModules:
                        self.outputs[label].append(getattr(cfg_interface.data, m).fileName._value)

            taskdir = os.path.join(self.workdir, label)
            if not util.checkpoint(self.workdir, label):
                if cms_config:
                    shutil.copy(util.findpath(self.basedirs, cms_config), os.path.join(taskdir, os.path.basename(cms_config)))

                logging.info("querying backend for {0}".format(label))
                dataset_info = self.__interface.get_info(cfg)

                logging.info("registering {0} in database".format(label))
                self.__store.register(cfg, dataset_info)
                util.register_checkpoint(self.workdir, label, 'REGISTERED')

            elif os.path.exists(os.path.join(taskdir, 'running')):
                for d in os.listdir(os.path.join(taskdir, 'running')):
                    shutil.move(os.path.join(taskdir, 'running', d), os.path.join(taskdir, 'failed'))
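
Both job providers in this listing choose between dash.Monitor and dash.DummyMonitor at construction time, a null-object pattern that spares every caller an "is the dashboard enabled?" branch. A minimal sketch of that idea; the method names mirror the usage above, but the classes here are hypothetical, not lobster's own:

import logging

class Monitor(object):
    # reports task state to the dashboard
    def __init__(self, taskid):
        self._taskid = taskid

    def register_run(self):
        logging.info("announcing run %s to the dashboard", self._taskid)

    def update_job(self, jobid, status):
        logging.info("job %s -> %s", jobid, status)

class DummyMonitor(Monitor):
    # null object: accepts the same calls and silently drops them
    def register_run(self):
        pass

    def update_job(self, jobid, status):
        pass

def monitor_for(config, taskid):
    # hypothetical factory mirroring the config check used above
    if config.get('use dashboard', False):
        return Monitor(taskid)
    return DummyMonitor(taskid)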