def __init__(self, config):
    util.Timing.__init__(self, 'dash', 'handler', 'updates', 'elk',
                         'transfers', 'cleanup', 'propagate', 'sqlite')

    self.config = config
    self.basedirs = [config.base_directory, config.startup_directory]
    self.workdir = config.workdir
    self._storage = config.storage
    self.statusfile = os.path.join(self.workdir, 'status.json')
    self.siteconf = os.path.join(self.workdir, 'siteconf')

    self.parrot_path = os.path.dirname(util.which('parrot_run'))
    self.parrot_bin = os.path.join(self.workdir, 'bin')
    self.parrot_lib = os.path.join(self.workdir, 'lib')

    self.__algo = Algo(config)
    self.__host = socket.getfqdn()

    # Pick up CE/SE names and the Frontier proxy from the site-local
    # configuration, falling back to the hostname and $HTTP_PROXY.
    try:
        siteconf = loadSiteLocalConfig()
        self.__ce = siteconf.siteName
        self.__se = siteconf.localStageOutPNN()
        self.__frontier_proxy = siteconf.frontierProxies[0]
    except (SiteConfigError, IndexError):
        logger.error("can't load siteconfig, defaulting to hostname")
        self.__ce = socket.getfqdn()
        self.__se = socket.getfqdn()
        try:
            self.__frontier_proxy = os.environ['HTTP_PROXY']
        except KeyError:
            logger.error("can't determine proxy for Frontier via $HTTP_PROXY")
            sys.exit(1)

    # Extract the CVMFS proxy from the local CVMFS configuration,
    # falling back to $HTTP_PROXY.
    try:
        with open('/etc/cvmfs/default.local') as f:
            lines = f.readlines()
    except IOError:
        lines = []
    for line in lines:
        m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', line)
        if m:
            self.__cvmfs_proxy = m.group(1).strip("\"'")
            break
    else:
        try:
            self.__cvmfs_proxy = os.environ['HTTP_PROXY']
        except KeyError:
            logger.error("can't determine proxy for CVMFS via $HTTP_PROXY")
            sys.exit(1)

    logger.debug("using {} as proxy for CVMFS".format(self.__cvmfs_proxy))
    logger.debug("using {} as proxy for Frontier".format(self.__frontier_proxy))
    logger.debug("using {} as osg_version".format(self.config.advanced.osg_version))

    util.sendemail("Your Lobster project has started!", self.config)

    self.__taskhandlers = {}
    self.__store = unit.UnitStore(self.config)

    self.__setup_inputs()
    self.copy_siteconf()

    # An existing 'id' checkpoint means we are resuming a project.
    create = not util.checkpoint(self.workdir, 'id')
    if create:
        self.taskid = 'lobster_{0}_{1}'.format(
            self.config.label,
            sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
        util.register_checkpoint(self.workdir, 'id', self.taskid)
        shutil.copy(self.config.base_configuration,
                    os.path.join(self.workdir, 'config.py'))
    else:
        self.taskid = util.checkpoint(self.workdir, 'id')
        util.register_checkpoint(self.workdir, 'RESTARTED',
                                 str(datetime.datetime.utcnow()))

    if not util.checkpoint(self.workdir, 'executable'):
        # There can be more than one executable name (one per task label).
        # Report 'cmsRun' if any of the tasks are of that type, the shared
        # command if all tasks execute the same one, and 'noncmsRun' if the
        # task commands differ.  The dashboard uses this for executable
        # name reporting.
        cmsconfigs = [wflow.pset for wflow in self.config.workflows]
        cmds = [wflow.command for wflow in self.config.workflows]
        if any(cmsconfigs):
            exename = 'cmsRun'
        elif all(x == cmds[0] and x is not None for x in cmds):
            exename = cmds[0]
        else:
            exename = 'noncmsRun'
        util.register_checkpoint(self.workdir, 'executable', exename)

    for wflow in self.config.workflows:
        if create and not util.checkpoint(self.workdir, wflow.label):
            wflow.setup(self.workdir, self.basedirs)
            logger.info("querying backend for {0}".format(wflow.label))
            with fs.alternative():
                dataset_info = wflow.dataset.get_info()

            logger.info("registering {0} in database".format(wflow.label))
            self.__store.register_dataset(wflow, dataset_info, wflow.category.runtime)
            util.register_checkpoint(self.workdir, wflow.label, 'REGISTERED')
        elif os.path.exists(os.path.join(wflow.workdir, 'running')):
            for id_ in self.get_taskids(wflow.label):
                util.move(wflow.workdir, id_, 'failed')

    for wflow in self.config.workflows:
        if wflow.parent:
            getattr(self.config.workflows, wflow.parent.label).register(wflow)
            if create:
                total_units = wflow.dataset.total_units * len(wflow.unique_arguments)
                self.__store.register_dependency(wflow.label, wflow.parent.label, total_units)

    if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
        util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
        versions = set(w.version for w in self.config.workflows)
        if len(versions) == 1:
            util.register_checkpoint(self.workdir, 'sandbox cmssw version', list(versions)[0])

    if self.config.elk:
        if create:
            categories = {wflow.category.name: [] for wflow in self.config.workflows}
            for category in categories:
                for workflow in self.config.workflows:
                    if workflow.category.name == category:
                        categories[category].append(workflow.label)
            self.config.elk.create(categories)
        else:
            self.config.elk.resume()

    self.config.advanced.dashboard.setup(self.config)
    if create:
        self.config.save()
        self.config.advanced.dashboard.register_run()
    else:
        self.config.advanced.dashboard.update_task_status(
            (id_, dash.ABORTED) for id_ in self.__store.reset_units())

    # Stage stripped parrot and chirp binaries plus the parrot helper
    # library into the working directory.
    for p in (self.parrot_bin, self.parrot_lib):
        if not os.path.exists(p):
            os.makedirs(p)

    for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
        shutil.copy(util.which(exe), self.parrot_bin)
        subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])

    p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
    shutil.copy(p_helper, self.parrot_lib)
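# A minimal sketch of the CVMFS proxy parsing in __init__ above, assuming a
# typical /etc/cvmfs/default.local entry (the host name is illustrative, not
# taken from any real site configuration):
#
#     >>> import re
#     >>> line = 'CVMFS_HTTP_PROXY="http://squid.example.org:3128"'
#     >>> m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', line)
#     >>> m.group(1).strip("\"'")
#     'http://squid.example.org:3128'
#
# Note that the greedy `(.*)` also captures any trailing quote, which is why
# the code strips quote characters from the captured group afterwards.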
def release(self, tasks):
    # Accumulators for file cleanup, database updates, and output
    # propagation to dependent workflows.
    fail_cleanup = []
    merge_cleanup = []
    input_cleanup = []
    update = defaultdict(list)
    propagate = defaultdict(dict)
    input_files = defaultdict(set)
    summary = ReleaseSummary()
    transfers = defaultdict(lambda: defaultdict(Counter))

    with self.measure('dash'):
        self.config.advanced.dashboard.update_task_status(
            (task.tag, dash.DONE) for task in tasks)

    for task in tasks:
        with self.measure('updates'):
            handler = self.__taskhandlers[task.tag]
            failed, task_update, file_update, unit_update = \
                handler.process(task, summary, transfers)
            wflow = getattr(self.config.workflows, handler.dataset)

        with self.measure('elk'):
            if self.config.elk:
                self.config.elk.index_task(task)
                self.config.elk.index_task_update(task_update)

        with self.measure('handler'):
            if failed:
                faildir = util.move(wflow.workdir, handler.id, 'failed')
                summary.dir(str(handler.id), faildir)
                fail_cleanup.extend(lf for rf, lf in handler.outputs)
            else:
                util.move(wflow.workdir, handler.id, 'successful')

                # Propagate outputs to dependent workflows once they are
                # final, i.e., merged or not subject to merging.
                merge = isinstance(handler, MergeTaskHandler)
                if (wflow.merge_size <= 0 or merge) and len(handler.outputs) > 0:
                    outfn = handler.outputs[0][1]
                    outinfo = handler.output_info
                    for dep in wflow.dependents:
                        propagate[dep.label][outfn] = outinfo

                if merge:
                    merge_cleanup.extend(handler.input_files)

                if wflow.cleanup_input:
                    input_files[handler.dataset].update(f for _, _, f in file_update)

            update[(handler.dataset, handler.unit_source)].append(
                (task_update, file_update, unit_update))

        del self.__taskhandlers[task.tag]

    with self.measure('dash'):
        self.config.advanced.dashboard.update_task_status(
            (task.tag, dash.RETRIEVED) for task in tasks)

    if len(update) > 0:
        with self.measure('sqlite'):
            logger.info(summary)
            self.__store.update_units(update)

    with self.measure('cleanup'):
        if len(input_files) > 0:
            input_cleanup.extend(self.__store.finished_files(input_files))

        for cleanup in [fail_cleanup, merge_cleanup + input_cleanup]:
            if len(cleanup) > 0:
                try:
                    fs.remove(*cleanup)
                except (IOError, OSError):
                    pass
                except ValueError as e:
                    logger.error("error removing {0}:\n{1}".format(cleanup, e))

    with self.measure('propagate'):
        for label, infos in propagate.items():
            unique_args = getattr(self.config.workflows, label).unique_arguments
            self.__store.register_files(infos, label, unique_args)

    if len(transfers) > 0:
        with self.measure('transfers'):
            self.__store.update_transfers(transfers)

    if self.config.elk:
        with self.measure('elk'):
            try:
                self.config.elk.index_summary(self.__store.workflow_status())
            except Exception as e:
                logger.error('ELK failed to index summary:\n{}'.format(e))
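# A minimal sketch of the nested accumulator built at the top of release():
# defaultdict(lambda: defaultdict(Counter)) lets handler.process() tally
# counts two keys deep without any explicit initialization.  The keys shown
# here are illustrative, not the ones the handlers actually use:
#
#     >>> from collections import Counter, defaultdict
#     >>> transfers = defaultdict(lambda: defaultdict(Counter))
#     >>> transfers['some_storage_element']['stage-out']['success'] += 1
#     >>> transfers['some_storage_element']['stage-out']
#     Counter({'success': 1})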