#!/usr/bin/env python from setuptools import setup from lobster.util import get_version setup( name='Lobster', version=get_version(), description='Opportunistic HEP computing tool', author='Anna Woodard, Matthias Wolf', url='https://github.com/matz-e/lobster', packages=[ 'lobster', 'lobster.cmssw', 'lobster.cmssw.commands', 'lobster.core', 'lobster.commands', 'lobster.monitor', 'lobster.monitor.elk' ], package_data={ 'lobster': [ 'core/data/task.py', 'core/data/wrapper.sh', 'core/data/mtab', 'core/data/siteconf/JobConfig/site-local-config.xml', 'core/data/siteconf/PhEDEx/storage.xml', 'core/data/merge_cfg.py', 'core/data/merge_reports.py', 'core/data/report.json.in', 'commands/data/index.html', 'commands/data/gh.png', 'commands/data/styles.css', 'commands/data/category.html', 'monitor/elk/data/index/*.json', 'monitor/elk/data/dash/*.json', 'monitor/elk/data/vis/*.json', 'monitor/elk/data/*.json' ] }, install_requires=[ 'argparse', 'elasticsearch>=5.0.0,<6.0.0',
def run(self, args): self.config = args.config if args.finalize: args.config.advanced.threshold_for_failure = 0 args.config.advanced.threshold_for_skipping = 0 if not os.path.exists(self.config.workdir): os.makedirs(self.config.workdir) if not util.checkpoint(self.config.workdir, "version"): util.register_checkpoint(self.config.workdir, "version", util.get_version()) else: util.verify(self.config.workdir) if not args.foreground: ttyfile = open(os.path.join(self.config.workdir, 'process.err'), 'a') logger.info("saving stderr and stdout to {0}".format( os.path.join(self.config.workdir, 'process.err'))) args.preserve.append(ttyfile) if self.config.advanced.dump_core: logger.info("setting core dump size to unlimited") resource.setrlimit( resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY)) def localkill(num, frame): Terminate().run(args) signals = daemon.daemon.make_default_signal_map() signals[signal.SIGINT] = localkill signals[signal.SIGTERM] = localkill process = psutil.Process() preserved = [f.name for f in args.preserve] preserved += [os.path.realpath(os.path.abspath(f)) for f in preserved] openfiles = [ f for f in process.open_files() if f.path not in preserved ] openconns = process.connections() for c in openconns: logger.debug("open connection: {}".format(c)) args.preserve.append(c.fd) if len(openfiles) > 0: logger.error("cannot daemonize due to open files") for f in openfiles: logger.error("open file: {}".format(f.path)) raise RuntimeError("open files or connections") with daemon.DaemonContext( detach_process=not args.foreground, stdout=sys.stdout if args.foreground else ttyfile, stderr=sys.stderr if args.foreground else ttyfile, files_preserve=args.preserve, working_directory=self.config.workdir, pidfile=util.get_lock(self.config.workdir, args.force), prevent_core=False, initgroups=False, signal_map=signals): self.sprint() logger.info("lobster terminated") if not args.foreground: logger.info("stderr and stdout saved in {0}".format( os.path.join(self.config.workdir, 'process.err'))) try: # Fails if something with working directory creation went wrong Status().run(args) except Exception: pass
def sprint(self): with util.PartiallyMutable.unlock(): self.source = TaskProvider(self.config) action = actions.Actions(self.config, self.source) logger.info("using wq from {0}".format(wq.__file__)) logger.info("running Lobster version {0}".format(util.get_version())) logger.info("current PID is {0}".format(os.getpid())) wq.cctools_debug_flags_set("all") wq.cctools_debug_config_file( os.path.join(self.config.workdir, "work_queue_debug.log")) wq.cctools_debug_config_file_size(1 << 29) self.queue = wq.WorkQueue(self.config.advanced.wq_port) self.queue.specify_min_taskid(self.source.max_taskid() + 1) self.queue.specify_log( os.path.join(self.config.workdir, "work_queue.log")) self.queue.specify_transactions_log( os.path.join(self.config.workdir, "transactions.log")) self.queue.specify_name("lobster_" + self.config.label) self.queue.specify_keepalive_timeout(300) # self.queue.tune("short-timeout", 600) self.queue.tune("transfer-outlier-factor", 4) self.queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND) if self.config.advanced.full_monitoring: self.queue.enable_monitoring_full(None) else: self.queue.enable_monitoring(None) logger.info("starting queue as {0}".format(self.queue.name)) abort_active = False abort_threshold = self.config.advanced.abort_threshold abort_multiplier = self.config.advanced.abort_multiplier wq_max_retries = self.config.advanced.wq_max_retries if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING': util.register_checkpoint(self.config.workdir, 'KILLED', 'RESTART') # time in seconds to wait for WQ to return tasks, with minimum wait # time in case no more tasks are waiting interval = 120 interval_minimum = 30 tasks_left = 0 units_left = 0 successful_tasks = 0 categories = [] self.setup_logging('all') # Workflows can be assigned categories, with each category having # different cpu/memory/walltime requirements that WQ will automatically # fine-tune for category in self.config.categories: constraints = category.wq() if category.name != 'merge': categories.append(category.name) self.setup_logging(category.name) self.queue.specify_category_mode(category.name, category.mode) if category.mode == wq.WORK_QUEUE_ALLOCATION_MODE_FIXED: self.queue.specify_category_max_resources( category.name, constraints) else: self.queue.specify_category_first_allocation_guess( category.name, constraints) logger.debug('Category {0}: {1}'.format(category.name, constraints)) if 'wall_time' not in constraints: self.queue.activate_fast_abort_category( category.name, abort_multiplier) proxy_email_sent = False while not self.source.done(): with self.measure('status'): tasks_left = self.source.tasks_left() units_left = self.source.work_left() logger.debug("expecting {0} tasks, still".format(tasks_left)) self.queue.specify_num_tasks_left(tasks_left) for c in categories + ['all']: self.log(c, units_left) if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING': util.register_checkpoint(self.config.workdir, 'KILLED', str(datetime.datetime.utcnow())) # let the task source shut down gracefully logger.info("terminating task source") self.source.terminate() logger.info("terminating gracefully") break with self.measure('create'): have = {} for c in categories: cstats = self.queue.stats_category(c) have[c] = { 'running': cstats.tasks_running, 'queued': cstats.tasks_waiting } stats = self.queue.stats_hierarchy tasks = self.source.obtain(stats.total_cores, have) expiry = None if self.config.advanced.proxy: expiry = self.config.advanced.proxy.expires() proxy_time_left = self.config.advanced.proxy.time_left() if proxy_time_left >= 24 * 3600: proxy_email_sent = False if proxy_time_left < 24 * 3600 and not proxy_email_sent: util.sendemail( "Your proxy is about to expire.\n" + "Timeleft: " + str(datetime.timedelta(seconds=proxy_time_left)), self.config) proxy_email_sent = True for category, cmd, id, inputs, outputs, env, dir in tasks: task = wq.Task(cmd) task.specify_category(category) task.specify_tag(id) task.specify_max_retries(wq_max_retries) task.specify_monitor_output( os.path.join(dir, 'resource_monitor')) for k, v in env.items(): task.specify_environment_variable(k, v) for (local, remote, cache) in inputs: cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE if os.path.isfile(local) or os.path.isdir(local): task.specify_input_file(str(local), str(remote), cache_opt) else: logger.critical( "cannot send file to worker: {0}".format( local)) raise NotImplementedError for (local, remote) in outputs: task.specify_output_file(str(local), str(remote)) if expiry: task.specify_end_time(expiry * 10**6) self.queue.submit(task) with self.measure('status'): stats = self.queue.stats_hierarchy logger.info( "{0} out of {1} workers busy; {2} tasks running, {3} waiting; {4} units left" .format(stats.workers_busy, stats.workers_busy + stats.workers_ready, stats.tasks_running, stats.tasks_waiting, units_left)) with self.measure('update'): self.source.update(self.queue) # recurring actions are triggered here; plotting etc should run # while we have WQ hand us back tasks w/o any database # interaction with self.measure('action'): if action: action.take() with self.measure('fetch'): starttime = time.time() task = self.queue.wait(interval) tasks = [] while task: if task.return_status == 0: successful_tasks += 1 elif task.return_status in self.config.advanced.bad_exit_codes: logger.warning( "blacklisting host {0} due to bad exit code from task {1}" .format(task.hostname, task.tag)) self.queue.blacklist(task.hostname) tasks.append(task) remaining = int(starttime + interval - time.time()) if (interval - remaining < interval_minimum or self.queue.stats.tasks_waiting > 0 ) and remaining > 0: task = self.queue.wait(remaining) else: task = None # TODO do we really need this? We have everything based on # categories by now, so this should not be needed. if abort_threshold > 0 and successful_tasks >= abort_threshold and not abort_active: logger.info( "activating fast abort with multiplier: {0}".format( abort_multiplier)) abort_active = True self.queue.activate_fast_abort(abort_multiplier) if len(tasks) > 0: try: with self.measure('return'): self.source.release(tasks) except Exception: tb = traceback.format_exc() logger.critical( "cannot recover from the following exception:\n" + tb) util.sendemail( "Your Lobster project has crashed from the following exception:\n" + tb, self.config) for task in tasks: logger.critical( "tried to return task {0} from {1}".format( task.tag, task.hostname)) raise if units_left == 0: logger.info("no more work left to do") util.sendemail("Your Lobster project is done!", self.config) if self.config.elk: self.config.elk.end() if action: action.take(True)
#!/usr/bin/env python from setuptools import setup from lobster.util import get_version setup( name='Lobster', version=get_version(), description='Opportunistic HEP computing tool', author='Anna Woodard, Matthias Wolf', url='https://github.com/matz-e/lobster', packages=[ 'lobster', 'lobster.cmssw', 'lobster.cmssw.commands', 'lobster.core', 'lobster.commands', 'lobster.monitor', 'lobster.monitor.elk' ], package_data={'lobster': [ 'core/data/task.py', 'core/data/wrapper.sh', 'core/data/mtab', 'core/data/siteconf/JobConfig/site-local-config.xml', 'core/data/siteconf/PhEDEx/storage.xml', 'core/data/merge_cfg.py', 'core/data/merge_reports.py', 'core/data/report.json.in', 'commands/data/index.html',