def test_pipeline(self):
    """Test that a four-step pipeline runs end to end and chains files.

    Each program's output token (``@{file}``) becomes the next program's
    input (``${file}``), so the run should produce four ProgramRun
    records wired A -> B -> C -> D.
    """
    self.setup_pipeline(
        ('A', 'ls -l ${file}.txt > @{file}.ls'),
        ('B', 'wc ${file}.ls > @{file}.c'),
        ('C', 'ls -l ${file}.txt ${file}.ls ${file}.c > @{file}.out'),
        ('D', 'wc ${file}.out > @{file}.out'),
    )
    result = self.pipeline.run("pipe", output_dir=self.dir, file=self.filename)
    # Poll until every program reports complete.  The limit must bound the
    # loop itself: without it a wedged job manager would spin forever and
    # the timeout assertion below could never be reached.
    limit = 40
    while not result.update_all() and limit > 0:
        if limit == 40:
            get_job_manager().run_all()
        elif limit == 35:
            get_job_manager().finish_all()
        time.sleep(0.2)
        limit -= 1
    self.assertTrue(limit > 0, "Pipeline test timed out.")
    results = list(result.programs.all())
    self.assertEqual(len(results), 4)
    filename = self.filename[:-4] + ".%s"
    self.assertProgram(results[0], self.filename, filename % "ls")
    self.assertProgram(results[1], filename % "ls", filename % "c")
    # NOTE(review): step C's assertion was disabled before this review;
    # re-enable once the multi-input file ordering is confirmed stable.
    #self.assertProgram(results[2], [
    #    filename % "ls", self.filename, filename % "c"], filename % "out")
    self.assertProgram(results[3], filename % "out", filename % "out")
def test_duration(self):
    """Check that a running program's duration ticks up second by second."""
    # This timing test only makes sense against the fake in-memory manager.
    self.assertEqual(type(get_job_manager()).__name__, 'FakeJobManager')
    self.setup_pipeline(('DUR', 'sleep 5'))
    pipe_run = self.pipeline.run("pipe", output_dir=self.dir)
    program = pipe_run.programs.get()
    get_job_manager().run_all()
    # After each real second, the recorded duration should have advanced
    # by exactly one (whole) second.
    for second in range(1, 6):
        program.update_status()
        self.assertEqual(int(program.duration), second)
        time.sleep(1)
def setUp(self):
    """Create a temporary workspace and a job manager for each test."""
    super(ManagerTestBase, self).setUp()
    self.tempdir = tempfile.mkdtemp(suffix='chore-tests')
    self.manager = get_job_manager(self.manager_cls, self.tempdir, self.batched)
    if not self.manager.is_enabled():
        reason = "Manager {} is not enabled".format(self.manager_cls)
        self.skipTest(reason)
    # NOTE(review): mktemp only reserves a name (no file is created);
    # fine for a test path template, but it is race-prone by design.
    self.filename = tempfile.mktemp(prefix='test-job-')
def update_status(self, commit=True):
    """Take data from the job manager and populate the database.

    Polls the job manager for this run's job and copies status, timing,
    size and error information onto this model instance.

    :param commit: when True, persist the updated fields with save().
    :returns: the (possibly updated) value of ``self.is_complete``.
    """
    job_manager = get_job_manager()
    if self.is_submitted and not self.is_complete:
        dur = None
        data = job_manager.status(self.job_id, clean=False)
        age = now() - self.submitted
        if not data:
            if age > timedelta(hours=1):
                # This usually means the job is so old that it's gone from
                # the job manager queue and we have no further information
                # about it.
                self.is_complete = True
                self.is_error = True
                self.error_text = "Job Disappeared from Job Queue"
                if commit:
                    # Persist the terminal state (the newer update_status
                    # variant saves here too; previously it was lost).
                    self.save()
                return
            # Young job with no status yet: continue with an empty dict so
            # the .get() lookups below cannot blow up on None.
            data = {}
        if data.get('status', 'notfound') in ('finished', ):
            if data['finished'] and data['started']:
                dur = data['finished'] - data['started']
                # Round up any microseconds so very short jobs report > 0s.
                self.duration = dur.total_seconds() + int(dur.microseconds > 0)
            self.completed = data['finished']
            self.is_complete = True
            self.is_error = data['return'] != 0
            if data['error']:
                self.error_text = data['error'][:10240]  # Limit errors to 10k
            self.input_size = self.update_size(*self.input_fn) / 1024.0
            self.output_size = self.update_size(*self.output_fn) / 1024.0
        if data.get('started', None) is not None:
            if not self.is_started:
                self.is_started = True
                self.started = data['started']
            # Save the duration so far; round up any microseconds, which is
            # useful for testing non-zero time.
            dur = now() - data['started']
            self.duration = dur.total_seconds() + int(dur.microseconds > 0)
        if data and self.previous_id:
            # If any upstream program errored, stop this job as well.
            for prev in ProgramRun.objects.filter(job_id=self.previous_id):
                if prev.is_error:
                    job_manager.stop(self.job_id)
                    self.is_error = True
        if commit:
            self.save()
        # We're going to force an error out of hiding.
        if self.is_complete and self.error_text == 'None':
            (_, error) = job_manager.job_read(self.job_id, 'err')
            if error is not None:
                self.error_text = "Broken JobID error: " + error
            else:
                self.error_text = "Lost error for {}".format(self.job_id)
                self.is_error = True
            self.save()
    return self.is_complete
def stop(self, msg='Stopped'):
    """Ask the job manager to stop this program and record it as failed."""
    # Nothing to do unless the job is actually in flight.
    if not self.is_submitted or self.is_complete:
        return True
    stopped = get_job_manager().stop(self.job_id)
    self.is_complete = True
    self.is_error = True
    self.error_text = msg
    self.save()
    return stopped
def get_context_data(self, **kw):
    """Build the template context for the plain job-status listing."""
    context = super(JobViewer, self).get_context_data(**kw)
    manager = get_job_manager()
    context['pipeline'] = manager
    # Optional filters pulled straight from the query string.
    query = {}
    if 'user' in self.request.GET:
        query['user'] = self.request.GET['user']
    # Columns arrive as a single comma-separated "cols" parameter.
    cols = [col for col in self.request.GET.get('cols', '').split(',') if col]
    jobs = manager.jobs_status(*cols, **query)
    context['object_list'] = [self.get_item(job, cols) for job in jobs]
    context['pipeline_name'] = type(manager).__name__
    context['cols'] = cols
    return context
def _raw_status(self):
    """Query the job manager for this job's raw status.

    This is fixed to batch mode FALSE; change to `status` if you need
    batch mode.
    """
    if self.submitted:
        # Bracket the accounting search window around the submission time.
        window = {
            'start': self.submitted - timedelta(days=1),
            'end': self.submitted + timedelta(days=7),
        }
    else:
        # No submission timestamp: fall back to a huge fixed window.
        # (This is really weird.)
        window = {'start': '2016-01-01', 'end': '2040-01-01'}
    return get_job_manager().job_status(self.job_id, **window)
def get_context_data(self, **kw):
    """Build the template context for the detailed accounting job list."""
    context = super(JobViewer, self).get_context_data(**kw)
    manager = get_job_manager()
    context['pipeline'] = manager

    # Assemble the jobs_status() keyword filters from the query string.
    query = {}
    if 'user' in self.request.GET:
        query['user'] = self.request.GET['user']
    if 'wckeys' in self.request.GET:
        raw_keys = self.request.GET.get('wckeys', '')
        query['wckeys'] = [key for key in raw_keys.split(',') if key]
    # Default time window: one week back through tomorrow.
    if 'start' in self.request.GET:
        query['start'] = self.request.GET['start']
    else:
        query['start'] = (date.today() - timedelta(days=7)).isoformat()
    if 'end' in self.request.GET:
        query['end'] = self.request.GET['end']
    else:
        query['end'] = (date.today() + timedelta(days=1)).isoformat()

    # Columns: either repeated ?col=... params or one comma-separated ?cols=.
    if 'col' in self.request.GET:
        cols = list(self.request.GET.getlist('col'))
    else:
        cols = [col for col in self.request.GET.get('cols', '').split(',') if col]

    context['object_list'] = [
        self.get_item(job, cols)
        for job in manager.jobs_status(*cols, **query)
        if self.filter_item(job, str(job['pid']))
    ]
    context['pipeline_name'] = type(manager).__name__
    context['cols'] = cols
    context['kw'] = query
    context['extra_cols'] = [
        ('User', 'Job Running User'),
        ('Account', 'O2 Account Name'),
        ('Partition', 'Server Partition'),
        ('AveRSS', 'Average resident set size'),
        ('TotalCPU', 'Total CPU'),
        ('ReqMem', 'Requested Memory'),
        ('MaxVMSize', 'Maximum Used Memory'),
        ('AveDiskRead', 'Average Disk Read'),
        ('AveDiskWrite', 'Average Disk Write'),
    ]
    return context
def run(self, commit=True, **kwargs):
    """Run this pipeline run (creates ProgramRun objects).

    Creates (or, with commit, get_or_creates) one ProgramRun per program
    in the pipeline, then submits each not-yet-submitted run in order,
    passing the previous/following runs so the job manager can chain them.
    Filenames produced by each program are folded back into ``kwargs`` so
    later programs in the chain can consume them.

    :param commit: when True, persist PipelineRun/ProgramRun rows; when
        False, keep the unsaved runs on ``self.test_programs`` instead.
    :param kwargs: submission options plus named file inputs; mutated in
        place as each program's prepared files are appended.
    :returns: True when every run submitted cleanly, False on the first
        submission failure.
    :raises JobSubmissionError: if an already-submitted run has finished
        with a failing return value.
    """
    runs = []
    if not commit:
        # Dry-run mode: collect unsaved ProgramRun objects for inspection.
        self.test_programs = []
    if 'clean_files' in kwargs:
        self.clean_files = '\n'.join(kwargs['clean_files'])
    if commit:
        self.save()
    for pipe in self.pipeline.programs.all():
        if commit:
            # Idempotent: re-running picks up the existing ProgramRun row.
            run, _ = ProgramRun.objects.get_or_create(piperun=self,
                                                      **pipe.prepare(self.pk))
        else:
            run = ProgramRun(piperun=self, **pipe.prepare(self.pk))
            self.test_programs.append(run)
        runs.append(run)
    # Walk runs with their neighbours so each submission knows its
    # predecessor and follower in the chain.
    for prev, run, foll in tripplet(runs):
        if not run.is_submitted:
            if not run.submit(commit=commit,
                              previous=prev,
                              follower=foll,
                              **kwargs):
                return False
        else:
            data = get_job_manager().status(run.job_id, clean=False)
            # NOTE(review): comparing the return value against 1 (not 0)
            # looks suspicious for a "failed" check — confirm the job
            # manager's return-code convention before changing it.
            if data.get('finished', None) and data.get('return', 1) != 1:
                raise JobSubmissionError("Existing job already failed.")
        # Sort out the filenames for the next call in the chain
        for package, filename in run.program.prepare_files(**kwargs):
            name = package[1]
            if name in kwargs:
                if isinstance(kwargs[name], list):
                    kwargs[name].append(filename)
                else:
                    kwargs[name] = [kwargs[name], filename]
            else:
                kwargs[name] = [filename]
    return True
def job_submit(self, cmd, **kwargs):
    """Hand the prepared command to the job manager for execution."""
    get_job_manager().submit(self.job_id, cmd, **kwargs)
def job_clean(self):
    """Remove old command files (stdout and stderr) for this job."""
    manager = get_job_manager()
    for stream in ('out', 'err'):
        manager.job_clean_fn(self.job_id, stream)
def update_status(self, commit=True, force=False):
    """Take data from the job manager and populate the database.

    Polls ``_raw_status()`` and copies state, timing, size and error
    information onto this model instance.

    :param commit: when True, persist the updated fields with save().
    :param force: when True, refresh even if already marked complete.
    :returns: the (possibly updated) value of ``self.is_complete``.
    """
    job_manager = get_job_manager()
    if self.is_submitted and (not self.is_complete or force):
        dur = None
        data = self._raw_status()
        if not data or 'status' not in data:
            # No usable status.  Give young jobs the benefit of the doubt;
            # this usually means an old job is gone from the job manager
            # queue and we have no further information about it.
            if now() - self.submitted < timedelta(days=1):
                return
            self.is_complete = True
            self.is_error = True
            self.error_text = "Job Stopped"
            self.save()
            return
        self.job_state = data.get('state', '')
        if not self.submitted and 'submit' in data:
            self.submitted = data['submit']
        # Attempt to reclassify failures of quality control steps
        if self.job_state == 'FAILED' and self.program.quality_control:
            self.job_state = 'INVALID'
        if data.get('status', 'notfound') in ('finished', ):
            if data['finished'] and data['started']:
                dur = data['finished'] - data['started']
                # Round up any microseconds so short jobs report > 0s.
                self.duration = dur.total_seconds() + int(
                    dur.microseconds > 0)
            self.completed = data['finished']
            self.is_complete = True
            self.is_error = data['return'] != 0
            self.error_text = ""
            if data['error']:
                self.error_text += data[
                    'error'][:10240]  # Limit errors to 10k
            self.update_sizes()
        if data.get('started', None) is not None:
            if not self.is_started:
                self.is_started = True
                self.started = data['started']
            # Save the duration so far
            dur = now() - data['started']
            # Round up any microseconds, useful for testing non-zero time
            self.duration = dur.total_seconds() + int(dur.microseconds > 0)
        if data and self.previous_id:
            # If any upstream program errored, stop this job as well.
            for prev in ProgramRun.objects.filter(job_id=self.previous_id):
                if prev.is_error:
                    job_manager.stop(self.job_id)
                    self.is_error = True
        if commit:
            self.save()
        # We're going to force an error out of hiding.
        if self.is_complete and self.error_text == 'None':
            (_, error) = job_manager.job_read(self.job_id, 'err')
            if error is not None:
                self.error_text = "Broken JobID error: " + error
            else:
                self.error_text = "Lost error for {}".format(self.job_id)
                self.is_error = True
            self.save()
    return self.is_complete