def jobzip(self, job, **jobargs):
    from disco.util import iskv
    from disco.worker.classic.modutil import find_modules
    jobzip = super(Worker, self).jobzip(job, **jobargs)
    def get(key):
        return self.getitem(key, job, jobargs)
    if isinstance(get('required_files'), dict):
        for path, bytes in get('required_files').iteritems():
            jobzip.writestr(path, bytes)
    else:
        for path in get('required_files'):
            jobzip.write(path, os.path.join('lib', os.path.basename(path)))
    if get('required_modules') is None:
        self['required_modules'] = find_modules([obj
                                                 for key in self
                                                 for obj in util.iterify(get(key))
                                                 if callable(obj)],
                                                exclude=['Task'])
    for mod in get('required_modules'):
        if iskv(mod):
            jobzip.writepath(mod[1])
    for func in ('map', 'reduce'):
        if isinstance(get(func), dict):
            for path, bytes in get(func).iteritems():
                jobzip.writestr(os.path.join('ext.%s' % func, path), bytes)
    return jobzip
def run(self, task, job, **jobargs):
    # Entry point into the executing pipeline worker task.  This
    # initializes the task environment, sets up the current stage,
    # and then executes it.
    worker.active_task = task
    for key in self:
        self[key] = self.getitem(key, job, jobargs)
    sys_version = '{0[0]}.{0[1]}'.format(sys.version_info[:2])
    assert self['version'] == sys_version, "Python version mismatch"

    # Set up the task environment.
    globals_ = globals().copy()
    for module in self['required_modules']:
        name = module[0] if util.iskv(module) else module
        globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
    for obj in util.flatten(self.values()):
        util.globalize(obj, globals_)

    # Set up the stage.
    params = self.getitem('params', job, jobargs, worker.Params())
    pipeline = dict([(s.name, (idx, s))
                     for idx, (g, s) in enumerate(self['pipeline'])])
    pipe_idx, stage = pipeline[task.stage]
    stage.taskinfo = TaskInfo(jobname=task.jobname,
                              host=task.host,
                              stage=task.stage,
                              group=task.group,
                              label=task.group_label)
    if not stage.input_chain:
        stage.input_chain = Stage.default_input_chain(pipe_idx)
    if not stage.output_chain:
        stage.output_chain = Stage.default_output_chain

    # And now run it.
    self.run_stage(task, stage, params)
def jobzip(self, job, **jobargs): """ A hook provided by the :class:`Worker` for creating the :term:`job home` zip. The base implementation creates a minimal zip file containing the Disco standard library, and any user-specified required files and modules. :return: a :class:`disco.fileutils.DiscoZipFile`. """ # First, add the disco standard library. from clx import __file__ as clxpath from disco import __file__ as discopath from disco.fileutils import DiscoZipFile jobzip = DiscoZipFile() jobzip.writepath(os.path.dirname(clxpath), exclude=('.pyc', '__pycache__')) jobzip.writepath(os.path.dirname(discopath), exclude=('.pyc', '__pycache__')) jobzip.writesource(job) jobzip.writesource(self) # Then, add any user-specified required files. from disco.util import iskv def get(key): return self.getitem(key, job, jobargs) if isinstance(get('required_files'), dict): for path, bytes in get('required_files').items(): jobzip.writestr(path, bytes) else: for path in get('required_files'): jobzip.write(path, os.path.join('lib', os.path.basename(path))) if get('required_modules') is None: self['required_modules'] = self.get_modules(job, **jobargs) for mod in get('required_modules'): if iskv(mod): jobzip.writepath(mod[1]) # Done with basic minimal zip. return jobzip
def pack(self): """Pack up the :class:`JobDict` for sending over the wire.""" jobpack = {} if self['required_files']: if not isinstance(self['required_files'], dict): self['required_files'] = util.pack_files(self['required_files']) else: self['required_files'] = {} self['required_files'].update(util.pack_files( o[1] for o in self['required_modules'] if util.iskv(o))) for key in self.defaults: if key == 'input': jobpack['input'] = ' '.join( '\n'.join(reversed(list(util.iterify(url)))) for url in self['input']) elif key in ('nr_reduces', 'prefix'): jobpack[key] = str(self[key]) elif key == 'scheduler': scheduler = self['scheduler'] for key in scheduler: jobpack['sched_%s' % key] = str(scheduler[key]) elif self[key] is None: pass elif key in self.stacks: jobpack[key] = util.pack_stack(self[key]) else: jobpack[key] = util.pack(self[key]) return encode_netstring_fd(jobpack)
def run(self, task, job, **jobargs):
    # Entry point into the executing pipeline worker task.  This
    # initializes the task environment, sets up the current stage,
    # and then executes it.
    for key in self:
        self[key] = self.getitem(key, job, jobargs)
    sys_version = '{0[0]}.{0[1]}'.format(sys.version_info[:2])
    assert self['version'] == sys_version, "Python version mismatch"

    # Set up the task environment.
    globals_ = globals().copy()
    for module in self['required_modules']:
        name = module[0] if util.iskv(module) else module
        globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
    for obj in util.flatten(self.values()):
        util.globalize(obj, globals_)

    # Set up the stage.
    params = self.getitem('params', job, jobargs, worker.Params())
    pipeline = dict([(s.name, (idx, s))
                     for idx, (g, s) in enumerate(self['pipeline'])])
    pipe_idx, stage = pipeline[task.stage]
    stage.taskinfo = TaskInfo(jobname=task.jobname,
                              host=task.host,
                              stage=task.stage,
                              group=task.group,
                              label=task.group_label)
    if not stage.input_chain:
        stage.input_chain = Stage.default_input_chain(pipe_idx)
    if not stage.output_chain:
        stage.output_chain = Stage.default_output_chain

    # And now run it.
    self.run_stage(task, stage, params)
def jobzip(self, job, **jobargs):
    from disco.util import iskv
    from disco.worker.classic.modutil import find_modules
    jobzip = super(Worker, self).jobzip(job, **jobargs)
    def get(key):
        return self.getitem(key, job, jobargs)
    if isinstance(get('required_files'), dict):
        for path, bytes in get('required_files').items():
            jobzip.writestr(path, bytes)
    else:
        for path in get('required_files'):
            jobzip.write(path, os.path.join('lib', os.path.basename(path)))
    if get('required_modules') is None:
        self['required_modules'] = find_modules([obj
                                                 for key in self
                                                 for obj in util.iterify(get(key))
                                                 if callable(obj)],
                                                exclude=['Task'])
    for mod in get('required_modules'):
        if iskv(mod):
            jobzip.writepath(mod[1])
    for func in ('map', 'reduce'):
        if isinstance(get(func), dict):
            for path, bytes in get(func).items():
                jobzip.writestr(os.path.join('ext.{0}'.format(func), path), bytes)
    return jobzip
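A hedged illustration of the two entry forms 'required_modules' may contain, which the code above distinguishes with iskv(); the module name and path below are hypothetical.

required_modules = [
    'math',                                         # bare module name: imported by name at run time
    ('myhelpers', '/home/user/lib/myhelpers.py'),   # (name, path) pair: the path is also zipped via writepath()
]

def module_name(entry):  # mirrors: name = module[0] if util.iskv(module) else module
    return entry[0] if isinstance(entry, tuple) and len(entry) == 2 else entry

assert [module_name(m) for m in required_modules] == ['math', 'myhelpers']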
def insert_globals(self, functions):
    for fn in functions:
        if isinstance(fn, functools.partial):
            fn = fn.func
        if isinstance(fn, FunctionType):
            fn.func_globals.setdefault('Task', self)
            for module in self.required_modules:
                mod_name = module[0] if util.iskv(module) else module
                mod = __import__(mod_name, fromlist=[mod_name])
                fn.func_globals.setdefault(mod_name.split('.')[-1], mod)
def insert_globals(self, functions):
    write_files(self.required_files, self.lib)
    sys.path.insert(0, self.lib)
    for fn in functions:
        if isinstance(fn, partial):
            fn = fn.func
        if isinstance(fn, FunctionType):
            fn.func_globals.setdefault('Task', self)
            for module in self.required_modules:
                mod_name = module[0] if util.iskv(module) else module
                mod = __import__(mod_name, fromlist=[mod_name])
                fn.func_globals.setdefault(mod_name.split('.')[-1], mod)
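A small self-contained sketch of the globals-injection idea used by insert_globals() above, with hypothetical names; note that on Python 3 the attribute is fn.__globals__ rather than fn.func_globals. setdefault() makes a name visible inside the function without overriding anything already defined there.

import math

def user_map(line):
    return mymath.sqrt(float(line))  # 'mymath' is not imported in this module

user_map.__globals__.setdefault('mymath', math)  # inject it, as insert_globals does for required modules
assert user_map('9') == 3.0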
def run(self, task, job, **jobargs):
    global Task
    Task = task
    for key in self:
        self[key] = self.getitem(key, job, jobargs)
    assert self['version'] == '%s.%s' % sys.version_info[:2], "Python version mismatch"
    params = self['params']
    if isinstance(self[task.mode], dict):
        params = self['ext_params']
        self[task.mode] = external.prepare(params, task.mode)
    globals_ = globals().copy()
    for module in self['required_modules']:
        name = module[0] if util.iskv(module) else module
        globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
    for obj in util.flatten(self.values()):
        util.globalize(obj, globals_)
    getattr(self, task.mode)(task, params)
    external.close()
def run(self, task, job, **jobargs):
    global Task
    Task = task
    for key in self:
        self[key] = self.getitem(key, job, jobargs)
    assert self['version'] == '{0[0]}.{0[1]}'.format(sys.version_info[:2]), "Python version mismatch"
    params = self['params']
    if isinstance(self[task.stage], dict):
        params = self['ext_params']
        self[task.stage] = external.prepare(params, task.stage)
    globals_ = globals().copy()
    for module in self['required_modules']:
        name = module[0] if util.iskv(module) else module
        globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
    for obj in util.flatten(self.values()):
        util.globalize(obj, globals_)
    getattr(self, task.stage)(task, params)
    external.close()
def pack(self): """Pack up the :class:`JobDict` for sending over the wire.""" jobpack = {} if self['required_files']: if not isinstance(self['required_files'], dict): self['required_files'] = util.pack_files( self['required_files']) else: self['required_files'] = {} self['required_files'].update( util.pack_files(o[1] for o in self['required_modules'] if util.iskv(o))) for key in self.defaults: if key in ('map', 'reduce'): if self[key] is None: continue if key == 'input': jobpack['input'] = ' '.join( '\n'.join(reversed(list(util.iterify(url)))) for url in self['input']) elif key == 'username': jobpack['username'] = str(self['username']) elif key in ('nr_reduces', 'prefix'): jobpack[key] = str(self[key]) elif key == 'scheduler': scheduler = self['scheduler'] for key in scheduler: jobpack['sched_%s' % key] = str(scheduler[key]) elif key in self.stacks: jobpack[key] = util.pack_stack(self[key]) else: jobpack[key] = util.pack(self[key]) return encode_netstring_fd(jobpack)
def kvify(entry):
    yield entry if iskv(entry) else ('', entry)
def evaluate(expression, entry):
    if iskv(entry):
        k, v = entry  # bind k and v so the eval'd expression can reference them
        yield eval(expression)
def where(predicate, entry):
    if iskv(entry):
        k, v = entry  # bind k and v so the eval'd predicate can reference them
        if eval(predicate):
            yield entry
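A self-contained usage sketch for the three generators above, assuming a minimal iskv() in the same scope (the real one comes from disco.util); the sample entries and expressions are hypothetical.

def iskv(entry):  # minimal stand-in for disco.util.iskv
    return isinstance(entry, tuple) and len(entry) == 2

entries = [('apple', 3), 'loose value', ('pear', 7)]

assert list(kvify('orphan')) == [('', 'orphan')]                         # non-kv entries get an empty key
assert [r for e in entries for r in evaluate('v * 2', e)] == [6, 14]     # the expression sees k and v
assert [r for e in entries for r in where('v > 5', e)] == [('pear', 7)]  # the predicate filters kv entries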