Example #1
    def __init__(self, config):
        util.Timing.__init__(self, 'dash', 'handler', 'updates', 'elk',
                             'transfers', 'cleanup', 'propagate', 'sqlite')

        self.config = config
        self.basedirs = [config.base_directory, config.startup_directory]
        self.workdir = config.workdir
        self._storage = config.storage
        self.statusfile = os.path.join(self.workdir, 'status.json')
        self.siteconf = os.path.join(self.workdir, 'siteconf')

        self.parrot_path = os.path.dirname(util.which('parrot_run'))
        self.parrot_bin = os.path.join(self.workdir, 'bin')
        self.parrot_lib = os.path.join(self.workdir, 'lib')

        self.__algo = Algo(config)
        self.__host = socket.getfqdn()
        try:
            siteconf = loadSiteLocalConfig()
            self.__ce = siteconf.siteName
            self.__se = siteconf.localStageOutPNN()
            self.__frontier_proxy = siteconf.frontierProxies[0]
        except (SiteConfigError, IndexError):
            logger.error("can't load siteconfig, defaulting to hostname")
            self.__ce = socket.getfqdn()
            self.__se = socket.getfqdn()
            try:
                self.__frontier_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error(
                    "can't determine proxy for Frontier via $HTTP_PROXY")
                sys.exit(1)

        try:
            with open('/etc/cvmfs/default.local') as f:
                lines = f.readlines()
        except IOError:
            lines = []
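        # a line like:  CVMFS_HTTP_PROXY="http://squid.example.com:3128"
        # yields 'http://squid.example.com:3128' once the quotes are stripped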
        for line in lines:
            m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', line)
            if m:
                self.__cvmfs_proxy = m.group(1).strip("\"'")
                break
        else:
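            # for/else: this branch runs only when no CVMFS_HTTP_PROXY line
            # was found above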
            try:
                self.__cvmfs_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error("can't determine proxy for CVMFS via $HTTP_PROXY")
                sys.exit(1)

        logger.debug("using {} as proxy for CVMFS".format(self.__cvmfs_proxy))
        logger.debug("using {} as proxy for Frontier".format(
            self.__frontier_proxy))
        logger.debug("using {} as osg_version".format(
            self.config.advanced.osg_version))
        util.sendemail("Your Lobster project has started!", self.config)

        self.__taskhandlers = {}
        self.__store = unit.UnitStore(self.config)

        self.__setup_inputs()
        self.copy_siteconf()

        create = not util.checkpoint(self.workdir, 'id')
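        # a fresh project has no 'id' checkpoint yet; anything else is a
        # restart of an existing one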
        if create:
            self.taskid = 'lobster_{0}_{1}'.format(
                self.config.label,
                sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
            util.register_checkpoint(self.workdir, 'id', self.taskid)
            shutil.copy(self.config.base_configuration,
                        os.path.join(self.workdir, 'config.py'))
        else:
            self.taskid = util.checkpoint(self.workdir, 'id')
            util.register_checkpoint(self.workdir, 'RESTARTED',
                                     str(datetime.datetime.utcnow()))

        if not util.checkpoint(self.workdir, 'executable'):
            # We can actually have more than one exe name (one per task
            # label). Report 'cmsRun' if any task uses a CMSSW pset, the
            # shared command if all tasks run the same one, and 'noncmsRun'
            # if the task commands differ. This name is used for dashboard
            # executable reporting.
            cmsconfigs = [wflow.pset for wflow in self.config.workflows]
            cmds = [wflow.command for wflow in self.config.workflows]
            if any(cmsconfigs):
                exename = 'cmsRun'
            elif all(x == cmds[0] and x is not None for x in cmds):
                exename = cmds[0]
            else:
                exename = 'noncmsRun'
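            # e.g. commands ['makeflow', 'makeflow'] with no psets give
            # exename 'makeflow'; any pset present forces 'cmsRun'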

            util.register_checkpoint(self.workdir, 'executable', exename)

        for wflow in self.config.workflows:
            if create and not util.checkpoint(self.workdir, wflow.label):
                wflow.setup(self.workdir, self.basedirs)
                logger.info("querying backend for {0}".format(wflow.label))
                with fs.alternative():
                    dataset_info = wflow.dataset.get_info()

                logger.info("registering {0} in database".format(wflow.label))
                self.__store.register_dataset(wflow, dataset_info,
                                              wflow.category.runtime)
                util.register_checkpoint(self.workdir, wflow.label,
                                         'REGISTERED')
            elif os.path.exists(os.path.join(wflow.workdir, 'running')):
                for task_id in self.get_taskids(wflow.label):
                    util.move(wflow.workdir, task_id, 'failed')

        for wflow in self.config.workflows:
            if wflow.parent:
                getattr(self.config.workflows,
                        wflow.parent.label).register(wflow)
                if create:
                    total_units = wflow.dataset.total_units * len(
                        wflow.unique_arguments)
                    self.__store.register_dependency(wflow.label,
                                                     wflow.parent.label,
                                                     total_units)

        if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
            util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
            versions = {w.version for w in self.config.workflows}
            if len(versions) == 1:
                util.register_checkpoint(self.workdir, 'sandbox cmssw version',
                                         list(versions)[0])

        if self.config.elk:
            if create:
                categories = {}
                for workflow in self.config.workflows:
                    categories.setdefault(workflow.category.name,
                                          []).append(workflow.label)
                self.config.elk.create(categories)
            else:
                self.config.elk.resume()

        self.config.advanced.dashboard.setup(self.config)
        if create:
            self.config.save()
            self.config.advanced.dashboard.register_run()
        else:
            self.config.advanced.dashboard.update_task_status(
                (id_, dash.ABORTED) for id_ in self.__store.reset_units())

        for p in (self.parrot_bin, self.parrot_lib):
            if not os.path.exists(p):
                os.makedirs(p)

        for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
            shutil.copy(util.which(exe), self.parrot_bin)
            subprocess.check_call(
                ["strip", os.path.join(self.parrot_bin, exe)])

        p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib',
                                'lib64', 'libparrot_helper.so')
        shutil.copy(p_helper, self.parrot_lib)
Example #2
    def __init__(self, config):
        util.Timing.__init__(self, 'dash', 'handler', 'updates', 'elk', 'transfers', 'cleanup', 'propagate', 'sqlite')

        self.config = config
        self.basedirs = [config.base_directory, config.startup_directory]
        self.workdir = config.workdir
        self._storage = config.storage
        self.statusfile = os.path.join(self.workdir, 'status.json')
        self.siteconf = os.path.join(self.workdir, 'siteconf')

        self.parrot_path = os.path.dirname(util.which('parrot_run'))
        self.parrot_bin = os.path.join(self.workdir, 'bin')
        self.parrot_lib = os.path.join(self.workdir, 'lib')

        self.__algo = Algo(config)
        self.__host = socket.getfqdn()
        try:
            siteconf = loadSiteLocalConfig()
            self.__ce = siteconf.siteName
            self.__se = siteconf.localStageOutPNN()
            self.__frontier_proxy = siteconf.frontierProxies[0]
        except (SiteConfigError, IndexError):
            logger.error("can't load siteconfig, defaulting to hostname")
            self.__ce = socket.getfqdn()
            self.__se = socket.getfqdn()
            try:
                self.__frontier_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error("can't determine proxy for Frontier via $HTTP_PROXY")
                sys.exit(1)

        try:
            with open('/etc/cvmfs/default.local') as f:
                lines = f.readlines()
        except IOError:
            lines = []
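        # a line like:  CVMFS_HTTP_PROXY="http://squid.example.com:3128"
        # yields 'http://squid.example.com:3128' once the quotes are stripped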
        for line in lines:
            m = re.match(r'\s*CVMFS_HTTP_PROXY\s*=\s*[\'"]?(.*)[\'"]?', line)
            if m:
                self.__cvmfs_proxy = m.group(1).strip("\"'")
                break
        else:
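            # for/else: this branch runs only when no CVMFS_HTTP_PROXY line was found above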
            try:
                self.__cvmfs_proxy = os.environ['HTTP_PROXY']
            except KeyError:
                logger.error("can't determine proxy for CVMFS via $HTTP_PROXY")
                sys.exit(1)

        logger.debug("using {} as proxy for CVMFS".format(self.__cvmfs_proxy))
        logger.debug("using {} as proxy for Frontier".format(self.__frontier_proxy))
        logger.debug("using {} as osg_version".format(self.config.advanced.osg_version))
        util.sendemail("Your Lobster project has started!", self.config)

        self.__taskhandlers = {}
        self.__store = unit.UnitStore(self.config)

        self.__setup_inputs()
        self.copy_siteconf()

        create = not util.checkpoint(self.workdir, 'id')
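        # a fresh project has no 'id' checkpoint yet; anything else is a restart of an existing one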
        if create:
            self.taskid = 'lobster_{0}_{1}'.format(
                self.config.label,
                sha1(str(datetime.datetime.utcnow())).hexdigest()[-16:])
            util.register_checkpoint(self.workdir, 'id', self.taskid)
            shutil.copy(self.config.base_configuration, os.path.join(self.workdir, 'config.py'))
        else:
            self.taskid = util.checkpoint(self.workdir, 'id')
            util.register_checkpoint(self.workdir, 'RESTARTED', str(datetime.datetime.utcnow()))

        if not util.checkpoint(self.workdir, 'executable'):
            # We can actually have more than one exe name (one per task label).
            # Report 'cmsRun' if any task uses a CMSSW pset, the shared command if all tasks run the same one,
            # and 'noncmsRun' if the task commands differ.
            # This name is used for dashboard executable reporting.
            cmsconfigs = [wflow.pset for wflow in self.config.workflows]
            cmds = [wflow.command for wflow in self.config.workflows]
            if any(cmsconfigs):
                exename = 'cmsRun'
            elif all(x == cmds[0] and x is not None for x in cmds):
                exename = cmds[0]
            else:
                exename = 'noncmsRun'
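            # e.g. commands ['makeflow', 'makeflow'] with no psets give exename 'makeflow'; any pset present forces 'cmsRun'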

            util.register_checkpoint(self.workdir, 'executable', exename)

        for wflow in self.config.workflows:
            if create and not util.checkpoint(self.workdir, wflow.label):
                wflow.setup(self.workdir, self.basedirs)
                logger.info("querying backend for {0}".format(wflow.label))
                with fs.alternative():
                    dataset_info = wflow.dataset.get_info()

                logger.info("registering {0} in database".format(wflow.label))
                self.__store.register_dataset(wflow, dataset_info, wflow.category.runtime)
                util.register_checkpoint(self.workdir, wflow.label, 'REGISTERED')
            elif os.path.exists(os.path.join(wflow.workdir, 'running')):
                for task_id in self.get_taskids(wflow.label):
                    util.move(wflow.workdir, task_id, 'failed')

        for wflow in self.config.workflows:
            if wflow.parent:
                getattr(self.config.workflows, wflow.parent.label).register(wflow)
                if create:
                    total_units = wflow.dataset.total_units * len(wflow.unique_arguments)
                    self.__store.register_dependency(wflow.label, wflow.parent.label, total_units)

        if not util.checkpoint(self.workdir, 'sandbox cmssw version'):
            util.register_checkpoint(self.workdir, 'sandbox', 'CREATED')
            versions = {w.version for w in self.config.workflows}
            if len(versions) == 1:
                util.register_checkpoint(self.workdir, 'sandbox cmssw version', list(versions)[0])

        if self.config.elk:
            if create:
                categories = {}
                for workflow in self.config.workflows:
                    categories.setdefault(workflow.category.name, []).append(workflow.label)
                self.config.elk.create(categories)
            else:
                self.config.elk.resume()

        self.config.advanced.dashboard.setup(self.config)
        if create:
            self.config.save()
            self.config.advanced.dashboard.register_run()
        else:
            self.config.advanced.dashboard.update_task_status(
                (id_, dash.ABORTED) for id_ in self.__store.reset_units()
            )

        for p in (self.parrot_bin, self.parrot_lib):
            if not os.path.exists(p):
                os.makedirs(p)

        for exe in ('parrot_run', 'chirp', 'chirp_put', 'chirp_get'):
            shutil.copy(util.which(exe), self.parrot_bin)
            subprocess.check_call(["strip", os.path.join(self.parrot_bin, exe)])

        p_helper = os.path.join(os.path.dirname(self.parrot_path), 'lib', 'lib64', 'libparrot_helper.so')
        shutil.copy(p_helper, self.parrot_lib)
Example #3
    def release(self, tasks):
        fail_cleanup = []
        merge_cleanup = []
        input_cleanup = []
        update = defaultdict(list)
        propagate = defaultdict(dict)
        input_files = defaultdict(set)
        summary = ReleaseSummary()
        transfers = defaultdict(lambda: defaultdict(Counter))
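        # nested mapping whose leaves are Counters: transfers[a][b] counts
        # occurrences per key without explicit initialization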

        with self.measure('dash'):
            self.config.advanced.dashboard.update_task_status(
                (task.tag, dash.DONE) for task in tasks)

        for task in tasks:
            with self.measure('updates'):
                handler = self.__taskhandlers[task.tag]
                failed, task_update, file_update, unit_update = handler.process(
                    task, summary, transfers)

                wflow = getattr(self.config.workflows, handler.dataset)

            with self.measure('elk'):
                if self.config.elk:
                    self.config.elk.index_task(task)
                    self.config.elk.index_task_update(task_update)

            with self.measure('handler'):
                if failed:
                    faildir = util.move(wflow.workdir, handler.id, 'failed')
                    summary.dir(str(handler.id), faildir)
                    fail_cleanup.extend([lf for rf, lf in handler.outputs])
                else:
                    util.move(wflow.workdir, handler.id, 'successful')

                    merge = isinstance(handler, MergeTaskHandler)

                    if (wflow.merge_size <= 0
                            or merge) and len(handler.outputs) > 0:
                        outfn = handler.outputs[0][1]
                        outinfo = handler.output_info
                        for dep in wflow.dependents:
                            propagate[dep.label][outfn] = outinfo

                    if merge:
                        merge_cleanup.extend(handler.input_files)

                    if wflow.cleanup_input:
                        input_files[handler.dataset].update(
                            f for (_, _, f) in file_update)

            update[(handler.dataset, handler.unit_source)].append(
                (task_update, file_update, unit_update))

            del self.__taskhandlers[task.tag]

        with self.measure('dash'):
            self.config.advanced.dashboard.update_task_status(
                (task.tag, dash.RETRIEVED) for task in tasks)

        if len(update) > 0:
            with self.measure('sqlite'):
                logger.info(summary)
                self.__store.update_units(update)

        with self.measure('cleanup'):
            if len(input_files) > 0:
                input_cleanup.extend(self.__store.finished_files(input_files))

            for cleanup in [fail_cleanup, merge_cleanup + input_cleanup]:
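                # failed-task leftovers and merge/input leftovers are removed
                # as two separate batches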
                if len(cleanup) > 0:
                    try:
                        fs.remove(*cleanup)
                    except (IOError, OSError):
                        pass
                    except ValueError as e:
                        logger.error("error removing {0}:\n{1}".format(
                            cleanup, e))

        with self.measure('propagate'):
            for label, infos in propagate.items():
                unique_args = getattr(self.config.workflows,
                                      label).unique_arguments
                self.__store.register_files(infos, label, unique_args)

        if len(transfers) > 0:
            with self.measure('transfers'):
                self.__store.update_transfers(transfers)

        if self.config.elk:
            with self.measure('elk'):
                try:
                    self.config.elk.index_summary(
                        self.__store.workflow_status())
                except Exception as e:
                    logger.error('ELK failed to index summary:\n{}'.format(e))
Example #4
    def release(self, tasks):
        fail_cleanup = []
        merge_cleanup = []
        input_cleanup = []
        update = defaultdict(list)
        propagate = defaultdict(dict)
        input_files = defaultdict(set)
        summary = ReleaseSummary()
        transfers = defaultdict(lambda: defaultdict(Counter))
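        # nested mapping whose leaves are Counters: transfers[a][b] counts occurrences per key without explicit initialization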

        with self.measure('dash'):
            self.config.advanced.dashboard.update_task_status(
                (task.tag, dash.DONE) for task in tasks
            )

        for task in tasks:
            with self.measure('updates'):
                handler = self.__taskhandlers[task.tag]
                failed, task_update, file_update, unit_update = handler.process(task, summary, transfers)

                wflow = getattr(self.config.workflows, handler.dataset)

            with self.measure('elk'):
                if self.config.elk:
                    self.config.elk.index_task(task)
                    self.config.elk.index_task_update(task_update)

            with self.measure('handler'):
                if failed:
                    faildir = util.move(wflow.workdir, handler.id, 'failed')
                    summary.dir(str(handler.id), faildir)
                    fail_cleanup.extend([lf for rf, lf in handler.outputs])
                else:
                    util.move(wflow.workdir, handler.id, 'successful')

                    merge = isinstance(handler, MergeTaskHandler)

                    if (wflow.merge_size <= 0 or merge) and len(handler.outputs) > 0:
                        outfn = handler.outputs[0][1]
                        outinfo = handler.output_info
                        for dep in wflow.dependents:
                            propagate[dep.label][outfn] = outinfo

                    if merge:
                        merge_cleanup.extend(handler.input_files)

                    if wflow.cleanup_input:
                        input_files[handler.dataset].update(f for (_, _, f) in file_update)

            update[(handler.dataset, handler.unit_source)].append((task_update, file_update, unit_update))

            del self.__taskhandlers[task.tag]

        with self.measure('dash'):
            self.config.advanced.dashboard.update_task_status(
                (task.tag, dash.RETRIEVED) for task in tasks
            )

        if len(update) > 0:
            with self.measure('sqlite'):
                logger.info(summary)
                self.__store.update_units(update)

        with self.measure('cleanup'):
            if len(input_files) > 0:
                input_cleanup.extend(self.__store.finished_files(input_files))

            for cleanup in [fail_cleanup, merge_cleanup + input_cleanup]:
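                # failed-task leftovers and merge/input leftovers are removed as two separate batches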
                if len(cleanup) > 0:
                    try:
                        fs.remove(*cleanup)
                    except (IOError, OSError):
                        pass
                    except ValueError as e:
                        logger.error("error removing {0}:\n{1}".format(cleanup, e))

        with self.measure('propagate'):
            for label, infos in propagate.items():
                unique_args = getattr(self.config.workflows, label).unique_arguments
                self.__store.register_files(infos, label, unique_args)

        if len(transfers) > 0:
            with self.measure('transfers'):
                self.__store.update_transfers(transfers)

        if self.config.elk:
            with self.measure('elk'):
                try:
                    self.config.elk.index_summary(self.__store.workflow_status())
                except Exception as e:
                    logger.error('ELK failed to index summary:\n{}'.format(e))