def clean(argv):
    """Command-line handler that removes images/containers created by stimela.

    Parameters
    ----------
    argv : list of str
        Raw command-line arguments. Arguments that look like negative
        numbers (e.g. ``-2``) are padded with a leading space so that
        argparse does not mistake them for option flags.

    Side effects: runs ``docker rmi`` / ``docker kill`` via ``utils.xrun``
    and rewrites the stimela JSON log files to drop the removed entries.
    """
    # Protect negative-number-like arguments from being parsed as options.
    for i, arg in enumerate(argv):
        if (arg[0] == '-') and arg[1].isdigit():
            argv[i] = ' ' + arg

    parser = ArgumentParser(
        description='Convience tools for cleaning up after stimela')
    add = parser.add_argument
    add("-ai", "--all-images", action="store_true",
        help="Remove all images pulled/built by stimela. This include CAB images")
    add("-ab", "--all-base", action="store_true",
        help="Remove all base images")
    add("-ac", "--all-cabs", action="store_true",
        help="Remove all CAB images")
    add("-aC", "--all-containers", action="store_true",
        help="Stop and/or Remove all stimela containers")
    add("-bl", "--build-label", default=USER.lower(),
        help="Label for cab images. All cab images will be named <CAB_LABEL>_<cab name>. The default is $USER")
    args = parser.parse_args(argv)

    log = logger.StimelaLogger(LOG_FILE)
    log_cabs = logger.StimelaLogger(
        '{0:s}/{1:s}_stimela_logfile.json'.format(LOG_HOME, args.build_label))

    if args.all_images:
        # NOTE: snapshot the keys with list() — log.remove() mutates the
        # underlying dict, and iterating a live keys() view while mutating
        # raises RuntimeError in Python 3.
        # (Fixed: the base-log key list was previously overwritten by the
        # cab-log key list before the first loop ran.)
        images = list(log.info['images'].keys())
        for image in images:
            utils.xrun('docker', ['rmi', image])
            log.remove('images', image)
        log.write()

        images = list(log_cabs.info['images'].keys())
        for image in images:
            if log_cabs.info['images'][image]['CAB']:
                utils.xrun('docker', ['rmi', image])
                log_cabs.remove('images', image)
        log_cabs.write()

    if args.all_base:
        # Base images are the non-CAB entries in the main log.
        images = list(log.info['images'].keys())
        for image in images:
            if log.info['images'][image]['CAB'] is False:
                utils.xrun('docker', ['rmi', image])
                log.remove('images', image)
        log.write()

    if args.all_cabs:
        images = list(log_cabs.info['images'].keys())
        for image in images:
            if log_cabs.info['images'][image]['CAB']:
                utils.xrun('docker', ['rmi', image])
                log_cabs.remove('images', image)
        log_cabs.write()

    if args.all_containers:
        containers = list(log.info['containers'].keys())
        for container in containers:
            cont = docker.Container(
                log.info['containers'][container]['IMAGE'], container)
            try:
                status = cont.info()['State']['Status'].lower()
            except Exception:
                # Container cannot be inspected (most likely already gone);
                # fall through with a sentinel status so it is only purged
                # from the log below.
                print('Could not inspect container {}. It probably '
                      'doesn\'t exist, will remove it from log'.format(container))
                status = "not there"
            if status == 'running':
                # Kill the container instead of stopping it, so that effect
                # can be felt by the parent process.
                utils.xrun('docker', ['kill', container])
                cont.remove()
            elif status in ['exited', 'dead']:
                cont.remove()
            log.remove('containers', container)
        log.write()
def docker_job(self, image, config=None, input=None, output=None, msdir=None,
               shared_memory='1gb', build_label=None, **kw):
    """ Add a docker task to a stimela recipe.

    image   :   stimela cab name, e.g. 'cab/simms'
    config  :   Dictionary of options to parse to the task. This will modify
                the parameters in the default parameter file which can be
                viewed by running 'stimela cabs -i <cab name>',
                e.g. 'stimela cabs -i simms'
    input   :   input directory for cab
    output  :   output directory for cab (created if it does not exist)
    msdir   :   MS directory for cab. Only specify if different from recipe ms_dir
    shared_memory : shared-memory size passed to the container (default '1gb')
    build_label   : label prefix of the built cab image; falls back to the
                    recipe's script context when not given
    **kw    :   accepted but unused here

    Side effects: sets self.job, self.logfile and self.log_dir; creates the
    output directory and an empty logfile on disk if missing.
    Returns 0 on success; raises StimelaCabParameterError on a bad cab
    name or an unknown/unbuilt cab.
    """
    # The cab name becomes part of container/file names, so reject anything
    # outside [a-zA-Z0-9_].
    offenders = re.findall('\W', self.name)
    if offenders:
        raise StimelaCabParameterError('The cab name \'{:s}\' has some non-alphanumeric characters.'
                                       ' Charecters making up this name must be in [a-z,A-Z,0-9,_]'.format(self.name))

    # Update I/O with values specified on command line
    # TODO (sphe) I think this feature should be removed
    script_context = self.recipe.stimela_context
    input = script_context.get('_STIMELA_INPUT', None) or input
    output = script_context.get('_STIMELA_OUTPUT', None) or output
    output = os.path.abspath(output)
    msdir = script_context.get('_STIMELA_MSDIR', None) or msdir
    build_label = script_context.get(
        '_STIMELA_BUILD_LABEL', None) or build_label

    # Get location of template parameters file from the per-user cab log.
    cabs_logger = get_cabs(
        '{0:s}/{1:s}_stimela_logfile.json'.format(stimela.LOG_HOME, build_label))
    try:
        cabpath = cabs_logger['{0:s}_{1:s}'.format(
            build_label, image)]['DIR']
    except KeyError:
        raise StimelaCabParameterError(
            'Cab {} has is uknown to stimela. Was it built?'.format(image))
    parameter_file = cabpath+'/parameters.json'

    # Unique container name: <job name>-<id(image)><timestamp digits>.
    name = '{0}-{1}{2}'.format(self.name, id(image),
                               str(time.time()).replace('.', ''))
    _cab = cab.CabDefinition(indir=input, outdir=output,
                             msdir=msdir, parameter_file=parameter_file)
    cont = docker.Container(image, name,
                            label=self.label, logger=self.log,
                            shared_memory=shared_memory,
                            log_container=stimela.LOG_FILE,
                            time_out=self.time_out)

    # Container parameter file will be updated and validated before the container is executed
    cont._cab = _cab
    cont.parameter_file_name = '{0}/{1}.json'.format(
        self.recipe.parameter_file_dir, name)

    # Remove dismissable kw arguments: a dismissable wrapper is called; if it
    # yields None the option is dropped, otherwise replaced by its value.
    ops_to_pop = []
    for op in config:
        if isinstance(config[op], dismissable):
            ops_to_pop.append(op)
    for op in ops_to_pop:
        arg = config.pop(op)()
        if arg is not None:
            config[op] = arg
    cont.config = config

    # Mount the runner script and make it the container entry command.
    cont.add_volume(
        "{0:s}/cargo/cab/docker_run".format(self.recipe.stimela_path),
        "/docker_run", perm="ro")
    cont.COMMAND = "/bin/sh -c /docker_run"

    # These are standard volumes and environmental variables. These will
    # always exist in a cab container.
    cont.add_volume(self.recipe.stimela_path, '/scratch/stimela', perm='ro')
    cont.add_volume(self.recipe.parameter_file_dir, '/configs', perm='ro')
    cont.add_environ('CONFIG', '/configs/{}.json'.format(name))

    # NOTE(review): this rebinds module-level cab.IODEST for every job —
    # assumed intentional (docker in-container I/O paths); confirm.
    cab.IODEST = CONT_IO["docker"]

    if msdir:
        md = cab.IODEST["msfile"]
        cont.add_volume(msdir, md)
        cont.add_environ('MSDIR', md)
        # Keep a record of the content of the volume
        dirname, dirs, files = [a for a in next(os.walk(msdir))]
        cont.msdir_content = {
            "volume": dirname,
            "dirs": dirs,
            "files": files,
        }
        self.log.debug(
            'Mounting volume \'{0}\' from local file system to \'{1}\' in the container'.format(msdir, md))

    if input:
        cont.add_volume(input, cab.IODEST["input"], perm='ro')
        cont.add_environ('INPUT', cab.IODEST["input"])
        # Keep a record of the content of the volume
        dirname, dirs, files = [a for a in next(os.walk(input))]
        cont.input_content = {
            "volume": dirname,
            "dirs": dirs,
            "files": files,
        }
        self.log.debug('Mounting volume \'{0}\' from local file system to \'{1}\' in the container'.format(
            input, cab.IODEST["input"]))

    if not os.path.exists(output):
        os.mkdir(output)
    od = cab.IODEST["output"]
    cont.add_environ('HOME', od)
    cont.add_environ('OUTPUT', od)

    # Resolve the log directory (falls back to the output directory) and
    # derive a per-job logfile path; touch the file so it can be mounted.
    self.log_dir = os.path.abspath(self.log_dir or output)
    log_dir_name = os.path.basename(self.log_dir or output)
    logfile_name = 'log-{0:s}.txt'.format(name.split('-')[0])
    self.logfile = cont.logfile = '{0:s}/{1:s}'.format(
        self.log_dir, logfile_name)
    cont.add_volume(output, od, "rw")
    if not os.path.exists(self.logfile):
        with open(self.logfile, "w") as std:
            pass
    cont.add_volume(
        self.logfile, "{0:s}/logfile".format(self.log_dir), "rw")
    cont.add_environ('LOGFILE', "{0:}/logfile".format(self.log_dir))

    self.log.debug(
        'Mounting volume \'{0}\' from local file system to \'{1}\' in the container'.format(output, od))
    cont.image = '{0}_{1}'.format(build_label, image)

    # Added and ready for execution
    self.job = cont
    return 0
def run(self, steps=None, resume=False, redo=None):
    """ Run a Stimela recipe.

    steps  : recipe steps to run (iterable of 1-based step numbers or
             label strings); defaults to all jobs
    resume : resume recipe from last run (reads self.resume_file)
    redo   : Re-run an old recipe from a .last file (path to saved JSON)

    Writes recipe progress to self.resume_file as JSON; returns 0 on
    success, raises PipelineException when a job fails.
    """
    recipe = {
        "name": self.name,
        "steps": []
    }
    start_at = 0

    if redo:
        # Rebuild the job list from a previously saved recipe JSON.
        recipe = utils.readJson(redo)
        self.log.info('Rerunning recipe {0} from {1}'.format(
            recipe['name'], redo))
        self.log.info('Recreating recipe instance..')
        self.jobs = []
        for step in recipe['steps']:
            # add I/O folders to the json file
            # add a string describing the contents of these folders
            # The user has to ensure that these folders exist, and have the required content
            if step['jtype'] == 'docker':
                self.log.info('Adding job \'{0}\' to recipe. The container will be named \'{1}\''.format(
                    step['cab'], step['name']))
                cont = docker.Container(step['cab'], step['name'],
                                        label=step['label'], logger=self.log,
                                        shared_memory=step['shared_memory'])
                self.log.debug('Adding volumes {0} and environmental variables {1}'.format(
                    step['volumes'], step['environs']))
                cont.volumes = step['volumes']
                cont.environs = step['environs']
                cont.shared_memory = step['shared_memory']
                cont.input_content = step['input_content']
                cont.msdir_content = step['msdir_content']
                cont.logfile = step['logfile']
                job = StimelaJob(
                    step['name'], recipe=self, label=step['label'])
                job.job = cont
                job.jtype = 'docker'
            elif step['jtype'] == 'function':
                # Function steps are resolved by name from the caller's
                # local scope via the frame inspection below.
                name = step['name']
                func = inspect.currentframe(
                ).f_back.f_locals[step['function']]
                job = StimelaJob(name, recipe=self, label=step['label'])
                job.python_job(func, step['parameters'])
                job.jtype = 'function'
            self.jobs.append(job)
    elif resume:
        self.log.info("Resuming recipe from last run.")
        try:
            recipe = utils.readJson(self.resume_file)
        except IOError:
            raise StimelaRecipeExecutionError(
                "Cannot resume pipeline, resume file '{}' not found".format(self.resume_file))

        # Keep completed steps in the saved recipe; collect the step
        # numbers of everything else for re-execution.
        steps_ = recipe.pop('steps')
        recipe['steps'] = []
        _steps = []
        for step in steps_:
            if step['status'] == 'completed':
                recipe['steps'].append(step)
                continue

            label = step['label']
            number = step['number']
            # Check if the recipe flow has changed
            if label == self.jobs[number-1].label:
                self.log.info(
                    'recipe step \'{0}\' is fit for re-execution. Label = {1}'.format(number, label))
                _steps.append(number)
            else:
                raise StimelaRecipeExecutionError(
                    'Recipe flow, or task scheduling has changed. Cannot resume recipe. Label = {0}'.format(label))

        # Check whether there are steps to resume
        if len(_steps) == 0:
            self.log.info(
                'All the steps were completed. No steps to resume')
            sys.exit(0)
        steps = _steps

    # Normalise `steps` to a list of 1-based job numbers; label strings
    # are translated via the job labels (text before '::').
    if getattr(steps, '__iter__', False):
        _steps = []
        if isinstance(steps[0], str):
            labels = [job.label.split('::')[0] for job in self.jobs]

            for step in steps:
                try:
                    _steps.append(labels.index(step)+1)
                except ValueError:
                    raise StimelaCabParameterError(
                        'Recipe label ID [{0}] doesn\'t exist'.format(step))
            steps = _steps
    else:
        steps = range(1, len(self.jobs)+1)
    jobs = [(step, self.jobs[step-1]) for step in steps]

    for i, (step, job) in enumerate(jobs):
        self.log.info('Running job {}'.format(job.name))
        self.log.info('STEP {0} :: {1}'.format(i+1, job.label))
        self.active = job
        try:
            if job.jtype == 'function':
                job.run_python_job()
            elif job.jtype in ['docker', 'singularity', 'udocker', 'podman']:
                # Stamp a header into the per-job logfile before running.
                with open(job.job.logfile, 'a') as astd:
                    astd.write('\n-----------------------------------\n')
                    astd.write(
                        'Stimela version : {}\n'.format(version))
                    astd.write(
                        'Cab name : {}\n'.format(job.job.image))
                    astd.write('-------------------------------------\n')
                run_job = getattr(job, "run_{0:s}_job".format(job.jtype))
                run_job()
            self.log2recipe(job, recipe, step, 'completed')
        except (utils.StimelaCabRuntimeError,
                StimelaRecipeExecutionError,
                StimelaCabParameterError) as e:
            # Record completed/remaining jobs, persist the recipe state,
            # then re-raise as a PipelineException with the original
            # traceback (raise_ keeps Py2/Py3 compatibility).
            self.completed = [jb[1] for jb in jobs[:i]]
            self.remaining = [jb[1] for jb in jobs[i+1:]]
            self.failed = job

            self.log.info(
                'Recipe execution failed while running job {}'.format(job.name))
            self.log.info('Completed jobs : {}'.format(
                [c.name for c in self.completed]))
            self.log.info('Remaining jobs : {}'.format(
                [c.name for c in self.remaining]))

            self.log2recipe(job, recipe, step, 'failed')
            for step, jb in jobs[i+1:]:
                self.log.info(
                    'Logging remaining task: {}'.format(jb.label))
                self.log2recipe(jb, recipe, step, 'remaining')
            self.log.info(
                'Saving pipeline information in {}'.format(self.resume_file))
            utils.writeJson(self.resume_file, recipe)

            pe = PipelineException(e, self.completed, job, self.remaining)
            raise_(pe, None, sys.exc_info()[2])
        except:
            # Anything else is a stimela bug: dump the traceback and abort.
            import traceback
            traceback.print_exc()
            raise RuntimeError(
                "An unhandled exception has occured. This is a bug, please report")
        finally:
            # Singularity instances must be stopped explicitly.
            if job.jtype == 'singularity' and job.created:
                job.job.stop()

    self.log.info(
        'Saving pipeline information in {}'.format(self.resume_file))
    utils.writeJson(self.resume_file, recipe)
    self.log.info('Recipe executed successfully')

    return 0
def run(self, steps=None, resume=False, redo=None):
    """ Run a Stimela recipe.

    steps  : recipe steps to run (iterable of 1-based step numbers or
             label strings); defaults to all jobs
    resume : resume recipe from last run (reads self.resume_file)
    redo   : Re-run an old recipe from a .last file (path to saved JSON)

    Writes recipe progress to self.resume_file as JSON; returns 0 on
    success, raises PipelineException when a job fails.
    """
    recipe = {
        "name": self.name,
        "steps": []
    }
    start_at = 0

    if redo:
        # Rebuild the job list from a previously saved recipe JSON.
        recipe = utils.readJson(redo)
        self.log.info('Rerunning recipe {0} from {1}'.format(
            recipe['name'], redo))
        self.log.info('Recreating recipe instance..')
        self.jobs = []
        for step in recipe['steps']:
            # add I/O folders to the json file
            # add a string describing the contents of these folders
            # The user has to ensure that these folders exist, and have the required content
            if step['jtype'] == 'docker':
                self.log.info('Adding job \'{0}\' to recipe. The container will be named \'{1}\''.format(
                    step['cab'], step['name']))
                cont = docker.Container(step['cab'], step['name'],
                                        label=step['label'], logger=self.log,
                                        shared_memory=step['shared_memory'],
                                        workdir=WORKDIR)
                self.log.debug('Adding volumes {0} and environmental variables {1}'.format(
                    step['volumes'], step['environs']))
                cont.volumes = step['volumes']
                cont.environs = step['environs']
                cont.shared_memory = step['shared_memory']
                cont.input_content = step['input_content']
                cont.msdir_content = step['msdir_content']
                cont.logfile = step['logfile']
                job = StimelaJob(
                    step['name'], recipe=self, label=step['label'],
                    cabpath=self.cabpath)
                job.job = cont
                job.jtype = 'docker'
            elif step['jtype'] == 'function':
                # Function steps are resolved by name from the caller's
                # local scope via the frame inspection below.
                name = step['name']
                func = inspect.currentframe(
                ).f_back.f_locals[step['function']]
                job = StimelaJob(name, recipe=self, label=step['label'])
                job.python_job(func, step['parameters'])
                job.jtype = 'function'
            self.jobs.append(job)
    elif resume:
        self.log.info("Resuming recipe from last run.")
        try:
            recipe = utils.readJson(self.resume_file)
        except IOError:
            raise StimelaRecipeExecutionError(
                "Cannot resume pipeline, resume file '{}' not found".format(self.resume_file))

        # Keep completed steps in the saved recipe; collect the step
        # numbers of everything else for re-execution.
        steps_ = recipe.pop('steps')
        recipe['steps'] = []
        _steps = []
        for step in steps_:
            if step['status'] == 'completed':
                recipe['steps'].append(step)
                continue

            label = step['label']
            number = step['number']
            # Check if the recipe flow has changed
            if label == self.jobs[number-1].label:
                self.log.info(
                    'recipe step \'{0}\' is fit for re-execution. Label = {1}'.format(number, label))
                _steps.append(number)
            else:
                raise StimelaRecipeExecutionError(
                    'Recipe flow, or task scheduling has changed. Cannot resume recipe. Label = {0}'.format(label))

        # Check whether there are steps to resume
        if len(_steps) == 0:
            self.log.info(
                'All the steps were completed. No steps to resume')
            sys.exit(0)
        steps = _steps

    # Normalise `steps` to a list of 1-based job numbers; label strings
    # are translated via the job labels (text before '::').
    if getattr(steps, '__iter__', False):
        _steps = []
        if isinstance(steps[0], str):
            labels = [job.label.split('::')[0] for job in self.jobs]

            for step in steps:
                try:
                    _steps.append(labels.index(step)+1)
                except ValueError:
                    raise StimelaCabParameterError(
                        'Recipe label ID [{0}] doesn\'t exist'.format(step))
            steps = _steps
    else:
        steps = range(1, len(self.jobs)+1)
    jobs = [(step, self.jobs[step-1]) for step in steps]

    # TIMESTR = "%Y-%m-%d %H:%M:%S"
    # TIMESTR = "%H:%M:%S"

    for i, (step, job) in enumerate(jobs):
        start_time = datetime.now()
        job.log.info('job started at {}'.format(start_time),
                     # the extra attributes are filtered by e.g. the CARACal logger
                     extra=dict(stimela_job_state=(job.name, "running")))
        self.log.info('STEP {0} :: {1}'.format(i+1, job.label))
        self.active = job
        try:
            # Stamp a header into the per-job logfile before running.
            with open(job.logfile, 'a') as astd:
                astd.write('\n-----------------------------------\n')
                astd.write(
                    'Stimela version : {}\n'.format(version))
                astd.write(
                    'Cab name : {}\n'.format(job.image))
                astd.write('-------------------------------------\n')
            job.run_job()
            self.log2recipe(job, recipe, step, 'completed')
            self.completed.append(job)

            finished_time = datetime.now()
            job.log.info('job complete at {} after {}'.format(finished_time, finished_time-start_time),
                         # the extra attributes are filtered by e.g. the CARACal logger
                         extra=dict(stimela_job_state=(job.name, "complete")))
        except (utils.StimelaCabRuntimeError,
                StimelaRecipeExecutionError,
                StimelaCabParameterError) as e:
            # Record the failure (with traceback lines routed through the
            # job logger), persist the recipe state, then raise a
            # PipelineException for the caller.
            self.remaining = [jb[1] for jb in jobs[i+1:]]
            self.failed = job

            finished_time = datetime.now()
            job.log.error(str(e), extra=dict(stimela_job_state=(job.name, "failed"), boldface=True))
            job.log.error('job failed at {} after {}'.format(finished_time, finished_time-start_time),
                          extra=dict(stimela_job_state=(job.name, "failed"), color=None))
            for line in traceback.format_exc().splitlines():
                job.log.error(line, extra=dict(traceback_report=True))

            self.log.info('Completed jobs : {}'.format(
                [c.name for c in self.completed]))
            self.log.info('Remaining jobs : {}'.format(
                [c.name for c in self.remaining]))

            self.log2recipe(job, recipe, step, 'failed')
            for step, jb in jobs[i+1:]:
                self.log.info(
                    'Logging remaining task: {}'.format(jb.label))
                self.log2recipe(jb, recipe, step, 'remaining')
            self.log.info(
                'Saving pipeline information in {}'.format(self.resume_file))
            utils.writeJson(self.resume_file, recipe)

            # raise pipeline exception. Original exception context is discarded by "from None" (since we've already
            # logged it above, we don't need to include it with the new exception)
            raise PipelineException(e, self.completed, job, self.remaining) from None

    self.log.info(
        'Saving pipeline information in {}'.format(self.resume_file))
    utils.writeJson(self.resume_file, recipe)
    self.log.info('Recipe executed successfully')

    return 0