# Imports assumed by these snippets (disco 0.5-era module layout; the original
# excerpts omit their module headers, so treat these paths as assumptions):
from disco.settings import DiscoSettings
from disco.util import inputlist, proxy_url, shuffled
from disco import schemes, util
from disco.worker import SerialInput, task_io
from disco.worker.classic import func


def classic_iterator(urls,
                     reader=task_io.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    An iterator over records as seen by the classic map interface.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker
    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        # rewrite each input url with proxy_url before opening it
        if isinstance(input, basestring):
            dest = proxy_url(input, to_master=False)
        elif isinstance(input, tuple):
            dest = tuple([proxy_url(i, to_master=False) for i in input])
        else:
            dest = [proxy_url(i, to_master=False) for i in input]
        notifier(dest)
        for record in Input(dest, open=worker.opener('map', 'in', params)):
            yield record

def classic_iterator(urls,
                     reader=func.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    An iterator over records as seen by the classic map interface.

    :type  reader: :func:`disco.classic.worker.func.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.classic.worker.func.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.classic.worker.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker
    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        notifier(input)
        for record in Input(input, open=worker.opener('map', 'in', params)):
            yield record

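# A minimal usage sketch for classic_iterator (hypothetical: `job` stands for
# a disco.core.Job that has already been started; job.wait() returns the
# result urls, which classic_iterator reads back as (key, value) records):
def collect_classic_results(job):
    return [record for record in classic_iterator(job.wait())]
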
def reduce_input(self, task, params):
    # master should feed only the partitioned inputs to reduce (and shuffle them?)
    from disco.worker import SerialInput
    from disco.util import inputlist, ispartitioned, shuffled
    # keep only the urls of each input's (replica_id, url) pairs
    inputs = [[url for rid, url in i.replicas] for i in self.get_inputs()]
    label = None
    if ispartitioned(inputs):
        label = task.group_label
    return self.sort(SerialInput(shuffled(inputlist(inputs, label=label)),
                                 task=task,
                                 open=self.opener('reduce', 'in', params)),
                     task)

def reduce_input(self, task, params):
    # master should feed only the partitioned inputs to reduce (and shuffle them?)
    from disco.worker import SerialInput
    from disco.util import inputlist, ispartitioned, shuffled
    inputs = [[url for rid, url in i.replicas] for i in self.get_inputs()]
    label = None
    if ispartitioned(inputs) and not self['merge_partitions']:
        label = task.group_label
    return self.sort(SerialInput(shuffled(inputlist(inputs, label=label)),
                                 task=task,
                                 open=self.opener('reduce', 'in', params)),
                     task)

def reduce_input(self, task, params):
    # master should feed only the partitioned inputs to reduce (and shuffle them?)
    from disco.worker import SerialInput
    from disco.util import inputlist, ispartitioned, shuffled
    inputs = [[url for rid, url in i.replicas] for i in self.get_inputs()]
    partition = None
    if ispartitioned(inputs) and not self['merge_partitions']:
        partition = str(task.taskid)
    return self.sort(SerialInput(shuffled(inputlist(inputs, partition=partition)),
                                 task=task,
                                 open=self.opener('reduce', 'in', params)),
                     task)

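# Illustrative sketch (hypothetical urls) of the replica flattening performed
# in reduce_input above: each input is a list of (replica_id, url) pairs, and
# only the urls are kept, one list per input, so the reader can fail over
# between replicas of the same input.
_replicas = [[(0, 'disco://node1/a'), (1, 'disco://node2/a')]]
_urls = [[url for rid, url in i] for i in _replicas]
assert _urls == [['disco://node1/a', 'disco://node2/a']]
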
def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters,
    in addition to those defined by the :class:`Worker` itself.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)

    # validate the pipeline spec: each entry is (grouping, stage) or
    # (grouping, stage, concurrent), with unique stage names
    stages, pipeline = set(), []
    for stage in get('pipeline', []):
        if len(stage) == 2:
            g, s = stage
            concurrent = False
        elif len(stage) == 3:
            g, s, concurrent = stage
        else:
            raise DiscoError("Bad Stage {0}".format(stage))
        if g not in self.group_ops:
            raise DiscoError("Unknown grouping {0}".format(g))
        if s.name in stages:
            raise DiscoError("Repeated stage {0}".format(s.name))
        stages.add(s.name)
        pipeline.append((s.name, g, concurrent))

    from disco.util import isiterable, inputlist
    job_input = get('input', [])
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input, label=None, settings=job.settings)
    pipe_input = [[0, 0, inp] for inp in input]
    jobdict = super(Worker, self).jobdict(job, **jobargs)
    jobdict.update({'worker': self.bin,
                    'pipeline': pipeline,
                    'inputs': pipe_input})
    return jobdict

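# A sketch of the 'pipeline' job parameter consumed by jobdict above. Each
# entry is (grouping, stage) or (grouping, stage, concurrent); the grouping
# must be one of the worker's group_ops and stage names must be unique.
# _Stage below is a stand-in for the worker's real stage objects (only the
# .name attribute is relied on by the code above); the real stage class and
# the exact grouping names are assumptions here.
from collections import namedtuple

_Stage = namedtuple('_Stage', ['name', 'process'])

_pipeline = [('split', _Stage('map', None)),                 # 2-tuple: concurrent defaults to False
             ('group_label', _Stage('reduce', None), True)]  # 3-tuple: run concurrently
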
def sorted_iterator(urls,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    from disco.worker import Input
    from disco.worker.classic.worker import Worker
    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    inputs = []
    for input in util.inputlist(urls, settings=settings):
        notifier(input)
        instream = Input(input, open=worker.opener('map', 'in', params))
        if instream:
            inputs.append(instream)
    return SortedIterator(inputs)

def result_iterator(urls,
                    reader=task_io.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    An iterator over records stored in either disco or ddfs.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.
    """
    from disco.worker import Input
    from disco.worker.task_io import StreamCombiner
    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for input in util.inputlist(urls, settings=settings):
        if isinstance(input, basestring):
            dest = proxy_url(input, to_master=False)
        elif isinstance(input, tuple):
            dest = tuple([proxy_url(i, to_master=False) for i in input])
        else:
            dest = [proxy_url(i, to_master=False) for i in input]
        notifier(dest)

        def open(url):
            # chain the configured input streams, ending with the reader
            streams = [s for s in input_stream]
            if reader:
                streams += [reader]
            return StreamCombiner(url, streams, params)

        for record in Input(dest, open=open):
            yield record

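# The usual consumption pattern for result_iterator (word-count flavored);
# `job` is assumed to be a disco.core.Job whose run() has already been called:
def print_results(job):
    for word, count in result_iterator(job.wait(show=True)):
        print('{0}\t{1}'.format(word, count))
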
def open(url, task=None):
    partition = str(task.taskid) if task else None
    return SerialInput(shuffled(inputlist([url], partition=partition)),
                       open=lambda url: schemes.open_chain(url, task=task))

def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters,
    in addition to those defined by the :class:`Worker` itself:

    :type  input: list of urls or list of list of urls
    :param input: used to set :attr:`jobdict.input`.
            Disco natively handles the following url schemes:

            * ``http://...`` - any HTTP address
            * ``file://...`` or no scheme - a local file.
              The file must exist on all nodes where the tasks are run.
              Due to these restrictions, this form has only limited use.
            * ``tag://...`` - a tag stored in :ref:`DDFS`
            * ``raw://...`` - pseudo-address: use the address itself as data.
            * ``dir://...`` - used by Disco internally.
            * ``disco://...`` - used by Disco internally.

            .. seealso:: :mod:`disco.schemes`.

    :type  name: string
    :param name: directly sets :attr:`jobdict.prefix`.

    :type  owner: string
    :param owner: directly sets :attr:`jobdict.owner`.
                  If not specified, uses :envvar:`DISCO_JOB_OWNER`.

    :type  scheduler: dict
    :param scheduler: directly sets :attr:`jobdict.scheduler`.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.util import isiterable, inputlist, ispartitioned, read_index
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)

    has_map = bool(get('map'))
    has_reduce = bool(get('reduce'))
    job_input = get('input', [])
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input,
                      partition=None if has_map else False,
                      settings=job.settings)

    # -- nr_reduces --
    # ignored if there is not actually a reduce specified
    # XXX: master should always handle this
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = get('partitions') or 1
    elif ispartitioned(input):
        # no map, with partitions: len(dir://) specifies nr_reduces
        nr_reduces = 1 + max(int(id)
                             for dir in input
                             for id, url in read_index(dir))
    else:
        # no map, without partitions can only have 1 reduce
        nr_reduces = 1

    if get('merge_partitions'):
        nr_reduces = 1

    return {'input': input,
            'worker': self.bin,
            'map?': has_map,
            'reduce?': has_reduce,
            'nr_reduces': nr_reduces,
            'prefix': get('name'),
            'scheduler': get('scheduler', {}),
            'owner': get('owner', job.settings['DISCO_JOB_OWNER'])}

def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters,
    in addition to those defined by the :class:`Worker` itself:

    :type  input: list of urls or list of list of urls
    :param input: used to set :attr:`jobdict.input`.
            Disco natively handles the following url schemes:

            * ``http://...`` - any HTTP address
            * ``file://...`` or no scheme - a local file.
              The file must exist on all nodes where the tasks are run.
              Due to these restrictions, this form has only limited use.
            * ``tag://...`` - a tag stored in :ref:`DDFS`
            * ``raw://...`` - pseudo-address: use the address itself as data.
            * ``dir://...`` - used by Disco internally.
            * ``disco://...`` - used by Disco internally.

            .. seealso:: :mod:`disco.schemes`.

    :type  name: string
    :param name: directly sets :attr:`jobdict.prefix`.

    :type  owner: string
    :param owner: directly sets :attr:`jobdict.owner`.
                  If not specified, uses :envvar:`DISCO_JOB_OWNER`.

    :type  scheduler: dict
    :param scheduler: directly sets :attr:`jobdict.scheduler`.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.util import inputlist, ispartitioned, read_index

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)

    has_map = bool(get('map'))
    has_reduce = bool(get('reduce'))
    input = inputlist(get('input', []),
                      partition=None if has_map else False,
                      settings=job.settings)

    # -- nr_reduces --
    # ignored if there is not actually a reduce specified
    # XXX: master should always handle this
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = get('partitions') or 1
    elif ispartitioned(input):
        # no map, with partitions: len(dir://) specifies nr_reduces
        nr_reduces = 1 + max(int(id)
                             for dir in input
                             for id, url in read_index(dir))
    else:
        # no map, without partitions can only have 1 reduce
        nr_reduces = 1

    if get('merge_partitions'):
        nr_reduces = 1

    return {'input': input,
            'worker': self.bin,
            'map?': has_map,
            'reduce?': has_reduce,
            'nr_reduces': nr_reduces,
            'prefix': get('name'),
            'scheduler': get('scheduler', {}),
            'owner': get('owner', job.settings['DISCO_JOB_OWNER'])}

def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters,
    in addition to those defined by the :class:`Worker` itself:

    :type  input: list of urls or list of list of urls
    :param input: used to set :attr:`jobdict.input`.
            Disco natively handles the following url schemes:

            * ``http://...`` - any HTTP address
            * ``file://...`` or no scheme - a local file.
              The file must exist on all nodes where the tasks are run.
              Due to these restrictions, this form has only limited use.
            * ``tag://...`` - a tag stored in :ref:`DDFS`
            * ``raw://...`` - pseudo-address: use the address itself as data.
            * ``dir://...`` - used by Disco internally.
            * ``disco://...`` - used by Disco internally.

            .. seealso:: :mod:`disco.schemes`.

    :type  scheduler: dict
    :param scheduler: directly sets :attr:`jobdict.scheduler`.

    .. deprecated:: 0.5
            *scheduler* params are now ignored.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.util import isiterable, inputlist, ispartitioned, read_index
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)

    has_map = bool(get('map'))
    has_reduce = bool(get('reduce'))
    job_input = get('input', [])
    has_save_results = get('save', False) or get('save_results', False)
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input,
                      label=None if has_map else False,
                      settings=job.settings)

    # -- nr_reduces --
    # ignored if there is not actually a reduce specified
    # XXX: master should always handle this
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = get('partitions') or 1
    elif ispartitioned(input):
        # no map, with partitions: len(dir://) specifies nr_reduces
        nr_reduces = 1 + max(int(id)
                             for dir in input
                             for id, url, size in read_index(dir))
    else:
        # no map, without partitions can only have 1 reduce
        nr_reduces = 1

    if get('merge_partitions'):
        nr_reduces = 1

    jobdict = super(Worker, self).jobdict(job, **jobargs)
    jobdict.update({'input': input,
                    'worker': self.bin,
                    'map?': has_map,
                    'reduce?': has_reduce,
                    'nr_reduces': nr_reduces,
                    'save_results': has_save_results,
                    'scheduler': get('scheduler', {})})
    return jobdict

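# Illustrative sketch (not part of the worker) of how nr_reduces falls out of
# the rules above: a map with partitions gets 'partitions' reduces, a
# reduce-only job over labeled dir:// inputs gets max label + 1, anything else
# gets 1, and merge_partitions always forces 1.
def _nr_reduces(has_map, partitions, input_labels, merge_partitions):
    if has_map:
        n = partitions or 1
    elif input_labels:
        n = 1 + max(input_labels)
    else:
        n = 1
    return 1 if merge_partitions else n

assert _nr_reduces(True, 8, [], False) == 8
assert _nr_reduces(False, None, [0, 1, 3], False) == 4
assert _nr_reduces(False, 8, [0, 1, 3], True) == 1
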
def open(url, task=None):
    label = task.group_label if task else None
    return SerialInput(shuffled(inputlist([url], label=label)),
                       open=lambda url: schemes.open_chain(url, task=task))

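# Hypothetical usage of open() above: collect all records behind a single
# result url outside a running task; with task=None, no group-label filtering
# is applied.
def read_results(url, task=None):
    return [record for record in open(url, task=task)]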