# Standard-library imports needed by this section; project-level names
# (Configurator, StorageError, Process, ProcessTarget, DAG, Schedule,
# BatchError, loads, dumps, transaction, log) are assumed to be provided by
# the surrounding mprocessor module.
import datetime
import os
import re


class Storage:
    def __init__(self):
        self.c = Configurator()
        self._root = os.path.normpath(self.c.get('STORAGE', 'cache_dir'))

    def exists(self, filename):
        if filename is not None:
            return os.path.exists(self.abspath(filename))
        else:
            log.debug('inside MProcessor storage, method exists, filename is %s' % filename)
            return False

    def abspath(self, path, in_cache=False):
        """Absolute path of `path`, relative to the cache_dir defined in mprocessor.cfg.

        If `path` is already absolute it is returned unchanged, unless
        in_cache=True, in which case a StorageError is raised if the path
        does not point inside cache_dir.
        """
        if os.path.isabs(path):
            abs_path = path
        else:
            abs_path = os.path.abspath(os.path.join(self._root, path))
        abs_path = os.path.normpath(abs_path)
        if in_cache and not abs_path.startswith(self._root):
            raise StorageError('%s is not under %s' % (abs_path, self._root))
        return abs_path

    def relpath(self, path, in_cache=False):
        """The inverse of abspath: path of `path` relative to cache_dir."""
        path = os.path.normpath(os.path.abspath(path))
        rel = os.path.relpath(path, self._root)
        if in_cache and rel.startswith('..'):
            raise StorageError('%s is not under %s' % (path, self._root))
        return rel
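# Usage sketch: the paths below are hypothetical examples, assuming cache_dir
# is set to /var/cache/mprocessor in mprocessor.cfg:
#
#   storage = Storage()
#   storage.abspath('images/foo.jpg')                 # '/var/cache/mprocessor/images/foo.jpg'
#   storage.relpath('/var/cache/mprocessor/a/b.txt')  # 'a/b.txt'
#   storage.abspath('/etc/passwd', in_cache=True)     # raises StorageError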
class Batch:
    def __init__(self, process_pk):
        process = Process.objects.get(pk=process_pk)
        self.cfg = Configurator()
        self.max_outstanding = self.cfg.getint('MPROCESSOR', 'max_outstanding')
        self.batch_size = self.cfg.getint('MPROCESSOR', 'batch_size')  # how many items to load per batch
        self.pipeline = loads(process.pipeline.params)
        self.dag = DAG(self.pipeline)
        self.schedule_length = len(self.pipeline)
        self.process = process
        self.scripts = self._get_scripts(self.pipeline)
        self.all_targets_read = False   # True when all targets have been read
        self.gameover = False           # True when all targets are done
        self.outstanding = 0            # number of not yet answered requests
        self.cur_batch = 0              # offset of the next batch of targets
        self.cur_task = 0               # index of the next task to query in self.tasks
        self.totals = {'update': 0, 'passed': 0, 'failed': 0, 'targets': 0, None: 0}
        self.results = {}

    def run(self):
        "Initialize state and start the iteration"
        log.debug('### Running batch for process %s' % (str(self.process.pk),))
        self.process.targets = ProcessTarget.objects.filter(process=self.process).count()
        self.tasks = []
        self._iterate()

    def stop(self, seconds_offset=0):
        log.info('stopping process %s' % self.process.pk)
        with transaction.commit_on_success():
            when = datetime.datetime.now() + datetime.timedelta(seconds=seconds_offset)
            self.process.end_date = when
            self.process.save()
        self.gameover = True

    def _update_item_stats(self, item, action, result, success, failure, cancelled):
        #log.debug('_update_item_stats: item=%s action=%s success=%s, failure=%s, cancelled=%s' % (item.target_id, action, success, failure, cancelled))  #d
        item.actions_passed += success
        item.actions_failed += failure
        item.actions_cancelled += cancelled
        item.actions_todo -= (success + failure + cancelled)
        if item.pk not in self.results:
            self.results[item.pk] = {}
        self.results[item.pk][action] = (success, result)
        if item.actions_todo <= 0 or failure > 0:
            item.result = dumps(self.results[item.pk])
        if item.actions_todo <= 0:
            #log.debug('_update_item_stats: finalizing item %s' % item.target_id)  #d
            del self.results[item.pk]

    def _get_scripts(self, pipeline):
        """Load scripts from the plugin directory.

        Returns the dictionary {script_key: (callable, params)}.
        Raises an exception if not all scripts can be loaded.
        """
        plugins_module = self.cfg.get("MPROCESSOR", "plugins")
        scripts = {}
        for script_key, script_dict in pipeline.items():
            script_name = script_dict['script_name']
            full_name = plugins_module + '.' + script_name + '.run'
            p = full_name.split('.')
            log.info('<$> loading script: %s' % '.'.join(p[:-1]))
            m = __import__('.'.join(p[:-1]), fromlist=p[:-1])
            f = getattr(m, p[-1], None)
            if not f or not callable(f):
                raise BatchError('Plugin %s has no callable run method' % script_name)
            else:
                scripts[script_key] = (f, script_dict.get('params', {}))
        return scripts

    def _new_batch(self):
        "Load from the db the next batch of items and associate a schedule to each item"
        if self.all_targets_read:
            return []
        targetset = ProcessTarget.objects.filter(process=self.process.pk)[self.cur_batch:self.cur_batch + self.batch_size]
        if targetset:
            self.cur_batch += self.batch_size
            # each task pairs an item with its schedule
            ret = [{'item': x, 'schedule': Schedule(self.dag, x.target_id)} for x in targetset]
        else:
            self.all_targets_read = True
            ret = []
        return ret
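    # Note on the scheduling protocol (inferred from how the schedule is used
    # below, not from the Schedule source): Schedule.action_to_run() appears
    # to return the name of the next runnable action, an empty string when
    # nothing is ready yet (dependencies still outstanding), and None once the
    # target has no actions left. _get_action relies on this to decide whether
    # to run a task, skip it for now, or drop it from self.tasks.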
    def _get_action(self):
        """Return the first runnable action found, or None.

        Deletes tasks with no actions left.
        """
        #log.debug("_get_action on num_tasks=%s" % len(self.tasks))  #d
        to_delete = []
        action = ''
        for n in xrange(len(self.tasks)):
            idx = (self.cur_task + n) % len(self.tasks)
            task = self.tasks[idx]
            action = task['schedule'].action_to_run()
            if action is None:
                to_delete.append(task)
            elif action:
                break
        #log.debug('to_delete %s' % to_delete)  #d
        for t in to_delete:
            #log.debug('deleting done target %s' % t['item'].target_id)  #d
            self.tasks.remove(t)
        # update cur_task so that we do not always start querying the same task for new actions
        if action:
            idx = self.tasks.index(task)
            self.cur_task = (idx + 1) % len(self.tasks)
        else:
            self.cur_task = 0
        # if action is None or empty there is no action ready to run;
        # if new targets are available, try to read some and find a new action
        if action:
            return action, task
        else:
            if not self.all_targets_read and self.outstanding < self.max_outstanding:
                new_tasks = self._new_batch()
                if new_tasks:
                    self.cur_task = len(self.tasks)
                    self.tasks.extend(new_tasks)
            if self.all_targets_read and not self.tasks:
                log.debug("_get_action: gameover")
                self.stop()
            return None, None

    def _iterate(self):
        """Run the actions listed in the schedule on the items returned by _new_batch"""
        #log.debug('_iterate: outstanding=%s' % self.outstanding)  #d
        while True:
            if self.gameover:
                log.debug('_iterate: gameover')
                return
            action, task = self._get_action()
            if action:
                log.debug('processing action: "%s"' % (action,))
                item, schedule = task['item'], task['schedule']
                method, params = self.scripts[action]
                try:
                    item_params = loads(item.params)
                    # tmp bug fixing starts here
                    for k in params.keys():
                        if params[k] == '' and (k in item_params[action]):
                            params[k] = item_params[action][k]
                    # tmp bug fixing ends here
                    params.update(item_params.get('*', {}))
                    x = re.compile('^[a-z_]+')  # cut the trailing digits off the action name
                    params.update(item_params.get(x.match(action).group(), {}))
                    self.outstanding += 1
                    #params = {u'source_variant_name': u'original'}
                    res = method(self.process.workspace, item.target_id, **params)
                    self._handle_ok(res, item, schedule, action, params)
                except Exception, e:
                    log.error('ERROR in %s: %s %s' % (str(method), type(e), str(e)))
                    self._handle_err(str(e), item, schedule, action, params)
            # If _get_action did not find anything and there are no more targets, no action
            # will be available until an action completes and allows more actions to go ready.
            if not (self.outstanding < self.max_outstanding and (action or not self.all_targets_read)):
                break
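# Usage sketch (hedged: the actual driver is outside this section and the call
# below is only illustrative):
#
#   batch = Batch(process_pk)
#   batch.run()   # counts the targets, then _iterate() keeps dispatching plugin
#                 # actions until max_outstanding is hit or the process is stopped
#
# _handle_ok()/_handle_err() are not shown in this section; they are assumed to
# decrement `outstanding`, record results via _update_item_stats() and mark
# follow-up actions as runnable on the item's Schedule so that a later
# _iterate() call can make progress.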