def __init__(self, *args, **kwargs):
    super(JobDict, self).__init__(*args, **kwargs)

    # -- backwards compatibility --
    if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
        warn("Writers are deprecated - use output_stream.add() instead",
             DeprecationWarning)

    # -- required modules and files --
    if self['required_modules'] is None:
        functions = util.flatten(util.iterify(self[f])
                                 for f in chain(self.functions, self.stacks))
        self['required_modules'] = find_modules([f for f in functions
                                                 if callable(f)])

    # -- external flags --
    if isinstance(self['map'], dict):
        self['ext_map'] = True
    if isinstance(self['reduce'], dict):
        self['ext_reduce'] = True

    # -- input --
    ddfs = self.pop('ddfs', None)
    self['input'] = [list(util.iterify(url))
                     for i in self['input']
                     for url in util.urllist(i, listdirs=bool(self['map']),
                                             ddfs=ddfs)]

    # partitions must be an integer internally
    self['partitions'] = self['partitions'] or 0

    # set nr_reduces: ignored if there is not actually a reduce specified
    if self['map']:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        self['nr_reduces'] = self['partitions'] or 1
    elif self.input_is_partitioned:
        # Only reduce, with partitions: len(dir://) specifies nr_reduces
        self['nr_reduces'] = 1 + max(id for dir in self['input']
                                     for id, url in util.read_index(dir[0]))
    else:
        # Only reduce, without partitions can only have 1 reduce
        self['nr_reduces'] = 1

    # merge_partitions iff the inputs to reduce are partitioned
    if self['merge_partitions']:
        if self['partitions'] or self.input_is_partitioned:
            self['nr_reduces'] = 1
        else:
            raise DiscoError("Can't merge partitions without partitions")

    # -- scheduler --
    scheduler = self.__class__.defaults['scheduler'].copy()
    scheduler.update(self['scheduler'])
    if int(scheduler['max_cores']) < 1:
        raise DiscoError("max_cores must be >= 1")
    self['scheduler'] = scheduler

    # -- sanity checks --
    for key in self:
        if key not in self.defaults:
            raise DiscoError("Unknown job argument: %s" % key)
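# Illustrative sketch (not part of JobDict): the nr_reduces decision above,
# pulled out into a standalone function so the three cases are easy to see.
# The name and the `max_partition_id` parameter are hypothetical; the latter
# stands in for the highest partition id found via util.read_index().
def _nr_reduces_sketch(has_map, partitions, input_is_partitioned, max_partition_id=0):
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        return partitions or 1
    if input_is_partitioned:
        # reduce-only over dir:// inputs: highest partition id + 1
        return 1 + max_partition_id
    # reduce-only without partitions can only have 1 reduce
    return 1

# e.g. a map job with partitions=8 -> 8 reduces; a reduce-only job over a
# partitioned dir:// index whose highest partition id is 3 -> 4 reduces.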
def inquire(program, indexspec, inquiry, query=None):
    for result in program.client.inquire(indexspec, inquiry, query=query,
                                         streams=program.options.streams,
                                         reduce=program.options.reduce,
                                         params=dict(program.options.params)):
        print '\t'.join('%s' % (e,) for e in iterify(result)).rstrip()
def pack(self):
    """Pack up the :class:`JobDict` for sending over the wire."""
    jobpack = {}

    if self['required_files']:
        if not isinstance(self['required_files'], dict):
            self['required_files'] = util.pack_files(self['required_files'])
    else:
        self['required_files'] = {}

    self['required_files'].update(util.pack_files(
        o[1] for o in self['required_modules'] if util.iskv(o)))

    for key in self.defaults:
        if key == 'input':
            jobpack['input'] = ' '.join(
                '\n'.join(reversed(list(util.iterify(url))))
                for url in self['input'])
        elif key in ('nr_reduces', 'prefix'):
            jobpack[key] = str(self[key])
        elif key == 'scheduler':
            scheduler = self['scheduler']
            for key in scheduler:
                jobpack['sched_%s' % key] = str(scheduler[key])
        elif self[key] is None:
            pass
        elif key in self.stacks:
            jobpack[key] = util.pack_stack(self[key])
        else:
            jobpack[key] = util.pack(self[key])
    return encode_netstring_fd(jobpack)
def range_header(offset):
    def httprange(start='', end=''):
        return '%s-%s' % (start, end)
    if offset:
        return {'Range': 'bytes=%s' % httprange(*tuple(iterify(offset)))}
    return {}
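# Quick sanity check of range_header above (a sketch; assumes disco.util.iterify
# is importable and wraps scalars, as pinned by test_iterify further down):
from disco.util import iterify

assert range_header(None) == {}                              # falsy offset -> no header
assert range_header((0, 1023)) == {'Range': 'bytes=0-1023'}  # closed byte range
assert range_header(512) == {'Range': 'bytes=512-'}          # open-ended range from 512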
def _upload(self, urls, source, token=None, to_master=True, **kwargs):
    urls = [self._resolve(proxy_url(url,
                                    proxy=self.proxy,
                                    meth='PUT',
                                    to_master=to_master))
            for url in iterify(urls)]
    return upload(urls, source, token=self._token(url, token, 'PUT'), **kwargs)
def range_header(offset):
    def httprange(start="", end=""):
        return "%s-%s" % (start, end)
    if offset:
        return {"Range": "bytes=%s" % httprange(*tuple(iterify(offset)))}
    return {}
def jobzip(self, job, **jobargs):
    from disco.util import iskv
    from disco.worker.classic.modutil import find_modules
    jobzip = super(Worker, self).jobzip(job, **jobargs)
    def get(key):
        return self.getitem(key, job, jobargs)
    if isinstance(get('required_files'), dict):
        for path, bytes in get('required_files').items():
            jobzip.writestr(path, bytes)
    else:
        for path in get('required_files'):
            jobzip.write(path, os.path.join('lib', os.path.basename(path)))
    if get('required_modules') is None:
        self['required_modules'] = find_modules([obj
                                                 for key in self
                                                 for obj in util.iterify(get(key))
                                                 if callable(obj)],
                                                exclude=['Task'])
    for mod in get('required_modules'):
        if iskv(mod):
            jobzip.writepath(mod[1])
    for func in ('map', 'reduce'):
        if isinstance(get(func), dict):
            for path, bytes in get(func).items():
                jobzip.writestr(os.path.join('ext.{0}'.format(func), path), bytes)
    return jobzip
def jobzip(self, job, **jobargs):
    from disco.util import iskv
    from disco.worker.classic.modutil import find_modules
    jobzip = super(Worker, self).jobzip(job, **jobargs)
    def get(key):
        return self.getitem(key, job, jobargs)
    if isinstance(get('required_files'), dict):
        for path, bytes in get('required_files').iteritems():
            jobzip.writestr(path, bytes)
    else:
        for path in get('required_files'):
            jobzip.write(path, os.path.join('lib', os.path.basename(path)))
    if get('required_modules') is None:
        self['required_modules'] = find_modules([obj
                                                 for key in self
                                                 for obj in util.iterify(get(key))
                                                 if callable(obj)],
                                                exclude=['Task'])
    for mod in get('required_modules'):
        if iskv(mod):
            jobzip.writepath(mod[1])
    for func in ('map', 'reduce'):
        if isinstance(get(func), dict):
            for path, bytes in get(func).iteritems():
                jobzip.writestr(os.path.join('ext.%s' % func, path), bytes)
    return jobzip
def wait(program, jobname):
    """Usage: jobname

    Wait for the named job to complete and print the list of results.
    """
    from disco.util import iterify
    for result in program.disco.wait(jobname):
        print('\t'.join('{0}'.format(e) for e in iterify(result)).rstrip())
def _upload(self, urls, source, token=None, **kwargs):
    urls = [self._resolve(self._maybe_proxy(url, method='PUT'))
            for url in iterify(urls)]
    return upload(urls, source, token=self._token(url, token, 'PUT'), **kwargs)
def range_header(offset):
    def httprange(start='', end=''):
        return '{0}-{1}'.format(start, end)
    if offset:
        return {'Range': 'bytes={0}'.format(httprange(*tuple(iterify(offset))))}
    return {}
def results(program, jobname):
    """Usage: jobname

    Print the list of results for a completed job.
    """
    from disco.util import iterify
    status, results = program.disco.results(jobname)
    for result in results:
        # print each element of the result, not a one-element tuple wrapping it
        print('\t'.join('{0}'.format(e) for e in iterify(result)).rstrip())
def mapresults(program, jobname):
    """Usage: jobname

    Print the list of results from the map phase of a job.
    This is useful for resuming a job which has failed during reduce.
    """
    from disco.util import iterify
    for result in program.disco.mapresults(jobname):
        print '\t'.join('%s' % (e,) for e in iterify(result)).rstrip()
def get_modules(self, job, **jobargs):
    from disco.worker.modutil import find_modules
    from disco.util import iterify
    def get(key):
        return self.getitem(key, job, jobargs)
    return find_modules([obj
                         for key in self
                         for obj in iterify(get(key))
                         if callable(obj)],
                        exclude=['Task'])
def stageresults(program, jobname):
    """Usage: jobname -S stage

    Print the list of results from a stage of a job.
    This is useful for resuming a job which has failed during a later stage.
    """
    from disco.util import iterify
    stagename = program.options.stage
    for result in program.disco.stageresults(jobname, stagename):
        # print each element of the result, not a one-element tuple wrapping it
        print('\t'.join('{0}'.format(e) for e in iterify(result)).rstrip())
def inputs(self, job):
    for input in util.iterify(self[job]):
        if isinstance(input, Job):
            status, results = input.results()
            if status in ('unknown job', 'active'):
                yield [None]
            elif status == 'ready':
                yield results
            else:
                raise JobError(input, "Status %s" % status)
        else:
            yield [input]
def get_modules(self, job, **jobargs):
    from disco.worker.modutil import find_modules
    from disco.util import iterify
    def get(key):
        return self.getitem(key, job, jobargs)
    from inspect import getsourcefile, getmodule
    job_path = getsourcefile(getmodule(job))
    return find_modules([obj
                         for key in self
                         for obj in iterify(get(key))
                         if callable(obj)],
                        job_path=job_path,
                        exclude=['Task'])
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from disco.core import result_iterator
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*urls)
    reader = reify(program.options.reader or 'disco.func.chain_reader')
    for result in result_iterator(chain(urls, program.blobs(*tags)),
                                  reader=reader):
        print '\t'.join(map(str, iterify(result)))
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    for record in classic_iterator(chain(urls, program.blobs(*tags)),
                                   input_stream=stream,
                                   reader=reader):
        print('\t'.join('{0}'.format(e) for e in iterify(record)).rstrip())
def xcat(program, *urls):
    """Usage: [-i] [-p] [-R reader] [-t token] [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from disco.core import RecordIter
    from disco.util import iterify, reify
    tags, urls = program.separate_tags(*urls)
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    for record in RecordIter(chain(urls, program.blobs(*tags)),
                             input_stream=stream,
                             reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()
def xcat(program, *urls):
    """Usage: [urls ...]

    Concatenate the extracted results stored in url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.core import classic_iterator
    from disco.util import iterify, reify, urlresolve, proxy_url
    tags, urls = program.separate_tags(*program.input(*urls))
    stream = reify(program.options.stream)
    reader = program.options.reader
    reader = reify('disco.func.chain_reader' if reader is None else reader)
    bloburls = [[proxy_url(urlresolve(u), to_master=False) for u in repset]
                for repset in chain(urls, program.blobs(*tags))]
    for record in classic_iterator(bloburls,
                                   input_stream=stream,
                                   reader=reader):
        print '\t'.join('%s' % (e,) for e in iterify(record)).rstrip()
def pack(self):
    """Pack up the :class:`JobDict` for sending over the wire."""
    jobpack = {}

    if self['required_files']:
        if not isinstance(self['required_files'], dict):
            self['required_files'] = util.pack_files(self['required_files'])
    else:
        self['required_files'] = {}

    self['required_files'].update(util.pack_files(
        o[1] for o in self['required_modules'] if util.iskv(o)))

    for key in self.defaults:
        if key in ('map', 'reduce'):
            if self[key] is None:
                continue
        if key == 'input':
            jobpack['input'] = ' '.join(
                '\n'.join(reversed(list(util.iterify(url))))
                for url in self['input'])
        elif key == 'username':
            jobpack['username'] = str(self['username'])
        elif key in ('nr_reduces', 'prefix'):
            jobpack[key] = str(self[key])
        elif key == 'scheduler':
            scheduler = self['scheduler']
            for key in scheduler:
                jobpack['sched_%s' % key] = str(scheduler[key])
        elif key in self.stacks:
            jobpack[key] = util.pack_stack(self[key])
        else:
            jobpack[key] = util.pack(self[key])
    return encode_netstring_fd(jobpack)
def urls(self, inputs):
    def serverify(input):
        return '%s/%s' % (self.address, input)
    return [[serverify(url) for url in iterify(input)] for input in inputs]
def urls(self, inputs):
    def serverify(input):
        return '{0}/{1}'.format(self.address, input)
    return [[serverify(url) for url in iterify(input)] for input in inputs]
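# Illustrative call of urls() above (a sketch; `_server` is a hypothetical
# stand-in exposing only the `address` attribute the helper needs, and
# disco.util.iterify is assumed importable):
from types import SimpleNamespace
from disco.util import iterify

_server = SimpleNamespace(address='http://node-1:8989')
print(urls(_server, ['data/part-0', ('data/part-1', 'data/part-1b')]))
# [['http://node-1:8989/data/part-0'],
#  ['http://node-1:8989/data/part-1', 'http://node-1:8989/data/part-1b']]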
def canonizetags(tags):
    return [tagname(tag) for tag in iterify(tags)]
def __iter__(self):
    for urls in self.urls:
        for replicas in util.urllist(urls, ddfs=self.ddfs):
            self.notifier(replicas)
            for entry in self.try_replicas(list(util.iterify(replicas))):
                yield entry
def results(self, jobspec, timeout=2000):
    """
    Returns a list of results for a single job or for many
    concurrently running jobs, depending on the type of *jobspec*.

    :type jobspec: :class:`disco.job.Job`, string, or list
    :param jobspec: If a job or job name is provided, return a tuple
                    which looks like::

                        status, results

                    If a list is provided, return two lists:
                    inactive jobs and active jobs.
                    Both the lists contain elements of the following type::

                        jobname, (status, results)

                    where status is one of:
                    ``'unknown job'``, ``'dead'``, ``'active'``, or ``'ready'``.

    :type timeout: int
    :param timeout: wait at most this many milliseconds,
                    for at least one of the jobs to finish.

    Using a list of jobs is a more efficient way to wait
    for multiple jobs to finish.
    Consider the following example that prints out results
    as soon as the jobs (initially ``active``) finish::

        while active:
            inactive, active = disco.results(jobs)
            for jobname, (status, results) in inactive:
                if status == 'ready':
                    for k, v in result_iterator(results):
                        print(k, v)
                    disco.purge(jobname)

    Note how the list of active jobs, ``active``,
    returned by :meth:`Disco.results`,
    can be used as the input to this function as well.
    """
    def jobname(job):
        if isinstance(job, Job):
            return job.name
        elif isinstance(job, basestring):
            return job
        return job[0]
    jobnames = [jobname(job) for job in util.iterify(jobspec)]
    results = json.loads(self.request('/disco/ctrl/get_results',
                                      json.dumps([timeout, jobnames])))
    others, active = [], []
    for jobname, (status, result) in results:
        if isinstance(jobspec, (Job, basestring)):
            return status, result
        elif status == 'active':
            active.append((jobname, (status, result)))
        else:
            others.append((jobname, (status, result)))
    return others, active
def __iter__(self):
    for result in self.results:
        for urls in util.urllist(result, ddfs=self.ddfs):
            self.notifier(urls)
            for entry in self.try_replicas(list(util.iterify(urls))):
                yield entry
def canonizetags(tags):
    return [canonizetag(tag) for tag in iterify(tags)]
def test_iterify(self):
    self.assertEquals([5], list(iterify(5)))
    self.assertEquals([5], list(iterify([5])))
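# A minimal sketch of an iterify-style helper consistent with the test above
# (illustrative only, not necessarily identical to disco.util.iterify):
# scalars are wrapped so callers can always iterate, real iterables pass through.
def _iterify_sketch(obj):
    if hasattr(obj, '__iter__'):
        return iter(obj)
    return iter([obj])

assert list(_iterify_sketch(5)) == [5]
assert list(_iterify_sketch([5])) == [5]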
def chunk_name(replicas, n):
    url = list(iterify(replicas))[0]
    return self.safe_name('%s-%s' % (os.path.basename(url), n))
def relativizetags(tags, parent):
    return [relativizetag(tag, parent) for tag in iterify(tags)]
def results(self, jobspec, timeout=2000):
    """
    Returns a list of results for a single job or for many
    concurrently running jobs, depending on the type of *jobspec*.

    :type jobspec: :class:`disco.job.Job`, string, or list
    :param jobspec: If a job or job name is provided, return a tuple
                    which looks like::

                        status, results

                    If a list is provided, return two lists:
                    inactive jobs and active jobs.
                    Both the lists contain elements of the following type::

                        jobname, (status, results)

                    where status is one of:
                    ``'unknown job'``, ``'dead'``, ``'active'``, or ``'ready'``.

    :type timeout: int
    :param timeout: wait at most this many milliseconds,
                    for at least one of the jobs to finish.

    Using a list of jobs is a more efficient way to wait
    for multiple jobs to finish.
    Consider the following example that prints out results
    as soon as the jobs (initially ``active``) finish::

        while active:
            inactive, active = disco.results(jobs)
            for jobname, (status, results) in inactive:
                if status == 'ready':
                    for k, v in result_iterator(results):
                        print k, v
                    disco.purge(jobname)

    Note how the list of active jobs, ``active``,
    returned by :meth:`Disco.results`,
    can be used as the input to this function as well.
    """
    def jobname(job):
        if isinstance(job, Job):
            return job.name
        elif isinstance(job, basestring):
            return job
        return job[0]
    jobnames = [jobname(job) for job in util.iterify(jobspec)]
    results = json.loads(
        self.request('/disco/ctrl/get_results',
                     json.dumps([timeout, jobnames])))
    others, active = [], []
    for jobname, (status, result) in results:
        if isinstance(jobspec, (Job, basestring)):
            return status, result
        elif status == 'active':
            active.append((jobname, (status, result)))
        else:
            others.append((jobname, (status, result)))
    return others, active
def input(self):
    from disco.util import iterify
    return [[str(url) for url in iterify(input)]
            for input in self['input']]
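# Shape illustration for input() above (a sketch with made-up URLs; assumes
# disco.util.iterify is importable): each entry of self['input'] may be a
# single URL or an iterable of replica URLs, and the result is always a
# list of replica lists of plain strings.
from disco.util import iterify

_raw = ['http://node-1/data/a', ('http://node-1/data/b', 'http://node-2/data/b')]
print([[str(url) for url in iterify(entry)] for entry in _raw])
# [['http://node-1/data/a'], ['http://node-1/data/b', 'http://node-2/data/b']]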
def _upload(self, urls, source, **kwargs):
    urls = [self._maybe_proxy(url, method='PUT') for url in iterify(urls)]
    return upload(urls, source, **kwargs)