Beispiel #1
0
def classic_iterator(urls,
                     reader=task_io.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    Iterate over the records behind *urls* as the classic map interface
    reads them.

    Every entry produced by :func:`util.inputlist` is rewritten with
    :func:`proxy_url` (``to_master=False``), handed to *notifier*, and then
    read through the map input opener of a classic :class:`Worker`.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.

    :param params: forwarded to the worker's ``('map', 'in')`` opener.

    :param ddfs: when truthy, used as ``DISCO_MASTER`` of the settings
                 passed to :func:`util.inputlist`.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    def proxied(location):
        # Preserve the container kind of the entry: a string stays a
        # string, a tuple stays a tuple, anything else becomes a list.
        if isinstance(location, basestring):
            return proxy_url(location, to_master=False)
        rewritten = [proxy_url(part, to_master=False) for part in location]
        if isinstance(location, tuple):
            return tuple(rewritten)
        return rewritten

    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()
    for location in util.inputlist(urls, settings=settings):
        dest = proxied(location)
        notifier(dest)
        records = Input(dest, open=worker.opener('map', 'in', params))
        for record in records:
            yield record
Beispiel #2
0
def classic_iterator(urls,
                     reader=func.chain_reader,
                     input_stream=(func.map_input_stream, ),
                     notifier=func.notifier,
                     params=None,
                     ddfs=None):
    """
    Iterate over the records behind *urls* as the classic map interface
    reads them.

    Every entry produced by :func:`util.inputlist` is handed to *notifier*
    and then read through the map input opener of a classic
    :class:`Worker`.

    :type  reader: :func:`disco.classic.worker.func.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.classic.worker.func.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.classic.worker.func.notifier`
    :param notifier: called when the task opens a url.

    :param params: forwarded to the worker's ``('map', 'in')`` opener.

    :param ddfs: when truthy, used as ``DISCO_MASTER`` of the settings
                 passed to :func:`util.inputlist`.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()
    for location in util.inputlist(urls, settings=settings):
        notifier(location)
        records = Input(location, open=worker.opener('map', 'in', params))
        for record in records:
            yield record
Beispiel #3
0
 def reduce_input(self, task, params):
     """
     Build the sorted input of a reduce task.

     Gathers the urls of every replica of each input returned by
     :meth:`get_inputs`, expands them with :func:`disco.util.inputlist`
     (passing ``label=task.group_label`` when the inputs are partitioned,
     ``label=None`` otherwise), shuffles the result, wraps it in a
     :class:`SerialInput` and returns what :meth:`sort` makes of it.
     """
     # master should feed only the partitioned inputs to reduce (and shuffle them?)
     from disco.worker import SerialInput
     from disco.util import inputlist, ispartitioned, shuffled
     replica_urls = []
     for inp in self.get_inputs():
         replica_urls.append([url for _, url in inp.replicas])
     label = task.group_label if ispartitioned(replica_urls) else None
     locations = shuffled(inputlist(replica_urls, label=label))
     opener = self.opener('reduce', 'in', params)
     return self.sort(SerialInput(locations, task=task, open=opener), task)
Beispiel #4
0
 def reduce_input(self, task, params):
     """
     Build the sorted input of a reduce task.

     Gathers the urls of every replica of each input returned by
     :meth:`get_inputs` and expands them with :func:`disco.util.inputlist`.
     ``label=task.group_label`` is passed only when the inputs are
     partitioned and ``self['merge_partitions']`` is false; otherwise
     ``label=None``.  The result is shuffled, wrapped in a
     :class:`SerialInput` and passed to :meth:`sort`.
     """
     # master should feed only the partitioned inputs to reduce (and shuffle them?)
     from disco.worker import SerialInput
     from disco.util import inputlist, ispartitioned, shuffled
     replica_urls = []
     for inp in self.get_inputs():
         replica_urls.append([url for _, url in inp.replicas])
     keep_label = ispartitioned(replica_urls) and not self['merge_partitions']
     label = task.group_label if keep_label else None
     locations = shuffled(inputlist(replica_urls, label=label))
     opener = self.opener('reduce', 'in', params)
     return self.sort(SerialInput(locations, task=task, open=opener), task)
Beispiel #5
0
 def reduce_input(self, task, params):
     """
     Build the sorted input of a reduce task.

     Gathers the urls of every replica of each input returned by
     :meth:`get_inputs` and expands them with :func:`disco.util.inputlist`.
     ``partition=str(task.taskid)`` is passed only when the inputs are
     partitioned and ``self['merge_partitions']`` is false; otherwise
     ``partition=None``.  The result is shuffled, wrapped in a
     :class:`SerialInput` and passed to :meth:`sort`.
     """
     # master should feed only the partitioned inputs to reduce (and shuffle them?)
     from disco.worker import SerialInput
     from disco.util import inputlist, ispartitioned, shuffled
     replica_urls = []
     for inp in self.get_inputs():
         replica_urls.append([url for _, url in inp.replicas])
     per_task = ispartitioned(replica_urls) and not self['merge_partitions']
     partition = str(task.taskid) if per_task else None
     locations = shuffled(inputlist(replica_urls, partition=partition))
     opener = self.opener('reduce', 'in', params)
     return self.sort(SerialInput(locations, task=task, open=opener), task)
Beispiel #6
0
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters, in addition to those
        defined by the :class:`Worker` itself:

        :param pipeline: sequence of stages, each either
                         ``(grouping, stage)`` or
                         ``(grouping, stage, concurrent)``; *concurrent*
                         is ``False`` for the two-element form.

        :param input: iterable of input locations, expanded with
                      :func:`disco.util.inputlist`.

        Uses :meth:`getitem` to resolve the values of parameters.

        :raises DiscoError: if a stage has neither two nor three elements,
                            if its grouping is not in ``self.group_ops``,
                            if a stage name occurs twice, or if ``input``
                            is not iterable.

        :return: the :term:`job dict`.
        """
        from disco.error import DiscoError

        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)

        stages, pipeline = set(), []
        for stage in get('pipeline', []):
            if len(stage) == 2:
                g, s = stage
                concurrent = False
            elif len(stage) == 3:
                g, s, concurrent = stage
            else:
                raise DiscoError("Bad Stage {0}".format(stage))
            if g not in self.group_ops:
                raise DiscoError("Unknown grouping {0}".format(g))
            if s.name in stages:
                raise DiscoError("Repeated stage {0}".format(s.name))
            stages.add(s.name)
            pipeline.append((s.name, g, concurrent))

        from disco.util import isiterable, inputlist
        job_input = get('input', [])
        if not isiterable(job_input):
            # The two literals are concatenated by the parser, so the
            # separating space has to be part of the first one.
            raise DiscoError("Job 'input' is not a list of input locations, "
                             "or a list of such lists: {0}".format(job_input))
        locations = inputlist(job_input, label=None, settings=job.settings)
        # Each location is emitted as a ``[0, 0, location]`` triple.
        pipe_input = [[0, 0, location] for location in locations]
        result = super(Worker, self).jobdict(job, **jobargs)
        result.update({
            'worker': self.bin,
            'pipeline': pipeline,
            'inputs': pipe_input
        })
        return result
Beispiel #7
0
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters, in addition to those
        defined by the :class:`Worker` itself:

        :param pipeline: sequence of stages, each either
                         ``(grouping, stage)`` or
                         ``(grouping, stage, concurrent)``; *concurrent*
                         is ``False`` for the two-element form.

        :param input: iterable of input locations, expanded with
                      :func:`disco.util.inputlist`.

        Uses :meth:`getitem` to resolve the values of parameters.

        :raises DiscoError: if a stage has neither two nor three elements,
                            if its grouping is not in ``self.group_ops``,
                            if a stage name occurs twice, or if ``input``
                            is not iterable.

        :return: the :term:`job dict`.
        """
        from disco.error import DiscoError

        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)

        stages, pipeline = set(), []
        for stage in get("pipeline", []):
            if len(stage) == 2:
                g, s = stage
                concurrent = False
            elif len(stage) == 3:
                g, s, concurrent = stage
            else:
                raise DiscoError("Bad Stage {0}".format(stage))
            if g not in self.group_ops:
                raise DiscoError("Unknown grouping {0}".format(g))
            if s.name in stages:
                raise DiscoError("Repeated stage {0}".format(s.name))
            stages.add(s.name)
            pipeline.append((s.name, g, concurrent))

        from disco.util import isiterable, inputlist

        job_input = get("input", [])
        if not isiterable(job_input):
            # A single literal avoids the missing space that the former
            # implicit concatenation of two literals produced.
            raise DiscoError(
                "Job 'input' is not a list of input locations, or a list of such lists: {0}".format(job_input)
            )
        locations = inputlist(job_input, label=None, settings=job.settings)
        # Each location is emitted as a ``[0, 0, location]`` triple.
        pipe_input = [[0, 0, location] for location in locations]
        result = super(Worker, self).jobdict(job, **jobargs)
        result.update({"worker": self.bin, "pipeline": pipeline, "inputs": pipe_input})
        return result
Beispiel #8
0
def sorted_iterator(urls,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    Return a :class:`SortedIterator` over the input streams behind *urls*.

    Every entry produced by :func:`util.inputlist` is handed to *notifier*
    and wrapped in an :class:`Input` that uses the map input opener of a
    classic :class:`Worker`; inputs that evaluate as false are left out.

    :param reader: passed to the worker as ``map_reader``.
    :param input_stream: passed to the worker as ``map_input_stream``.
    :param notifier: called with each entry before it is opened.
    :param params: forwarded to the worker's ``('map', 'in')`` opener.
    :param ddfs: when truthy, used as ``DISCO_MASTER`` of the settings
                 passed to :func:`util.inputlist`.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()

    streams = []
    for location in util.inputlist(urls, settings=settings):
        notifier(location)
        stream = Input(location, open=worker.opener('map', 'in', params))
        if not stream:
            continue
        streams.append(stream)

    return SortedIterator(streams)
Beispiel #9
0
def sorted_iterator(urls,
                    reader=func.chain_reader,
                    input_stream=(func.map_input_stream,),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    Return a :class:`SortedIterator` over the input streams behind *urls*.

    Every entry produced by :func:`util.inputlist` is handed to *notifier*
    and wrapped in an :class:`Input` that uses the map input opener of a
    classic :class:`Worker`; inputs that evaluate as false are left out.

    :param reader: passed to the worker as ``map_reader``.
    :param input_stream: passed to the worker as ``map_input_stream``.
    :param notifier: called with each entry before it is opened.
    :param params: forwarded to the worker's ``('map', 'in')`` opener.
    :param ddfs: when truthy, used as ``DISCO_MASTER`` of the settings
                 passed to :func:`util.inputlist`.
    """
    from disco.worker import Input
    from disco.worker.classic.worker import Worker

    worker = Worker(map_reader=reader, map_input_stream=input_stream)
    if ddfs:
        settings = DiscoSettings(DISCO_MASTER=ddfs)
    else:
        settings = DiscoSettings()

    streams = []
    for location in util.inputlist(urls, settings=settings):
        notifier(location)
        stream = Input(location, open=worker.opener('map', 'in', params))
        if not stream:
            continue
        streams.append(stream)

    return SortedIterator(streams)
Beispiel #10
0
def result_iterator(urls,
                    reader=task_io.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    An iterator over records stored in either disco or ddfs.

    Every entry produced by :func:`util.inputlist` is rewritten with
    :func:`proxy_url` (``to_master=False``), handed to *notifier*, and read
    through an :class:`Input` whose opener builds a
    :class:`StreamCombiner` from *input_stream* followed by *reader*.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.

    :param params: passed to every :class:`StreamCombiner`.

    :param ddfs: when truthy, used as ``DISCO_MASTER`` of the settings
                 passed to :func:`util.inputlist`.
    """
    from disco.worker import Input
    from disco.worker.task_io import StreamCombiner

    def open_url(url):
        # Depends only on the function arguments, so it is defined once
        # instead of per input; the name avoids shadowing builtin open().
        streams = list(input_stream)
        if reader:
            streams.append(reader)
        return StreamCombiner(url, streams, params)

    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for location in util.inputlist(urls, settings=settings):
        # Keep the container kind of the entry while proxying its urls.
        if isinstance(location, basestring):
            dest = proxy_url(location, to_master=False)
        elif isinstance(location, tuple):
            dest = tuple(proxy_url(i, to_master=False) for i in location)
        else:
            dest = [proxy_url(i, to_master=False) for i in location]
        notifier(dest)

        for record in Input(dest, open=open_url):
            yield record
def result_iterator(urls,
                    reader=task_io.chain_reader,
                    input_stream=(func.map_input_stream, ),
                    notifier=func.notifier,
                    params=None,
                    ddfs=None):
    """
    An iterator over records stored in either disco or ddfs.

    Every entry produced by :func:`util.inputlist` is rewritten with
    :func:`proxy_url` (``to_master=False``), handed to *notifier*, and read
    through an :class:`Input` whose opener builds a
    :class:`StreamCombiner` from *input_stream* followed by *reader*.

    :type  reader: :func:`disco.worker.task_io.input_stream`
    :param reader: shortcut for the last input stream applied.

    :type  input_stream: sequence of :func:`disco.worker.task_io.input_stream`
    :param input_stream: used to read from a custom file format.

    :type  notifier: :func:`disco.func.notifier`
    :param notifier: called when the task opens a url.

    :param params: passed to every :class:`StreamCombiner`.

    :param ddfs: when truthy, used as ``DISCO_MASTER`` of the settings
                 passed to :func:`util.inputlist`.
    """
    from disco.worker import Input
    from disco.worker.task_io import StreamCombiner

    def open_url(url):
        # Depends only on the function arguments, so it is defined once
        # instead of per input; the name avoids shadowing builtin open().
        streams = list(input_stream)
        if reader:
            streams.append(reader)
        return StreamCombiner(url, streams, params)

    settings = DiscoSettings(DISCO_MASTER=ddfs) if ddfs else DiscoSettings()
    for location in util.inputlist(urls, settings=settings):
        # Keep the container kind of the entry while proxying its urls.
        if isinstance(location, basestring):
            dest = proxy_url(location, to_master=False)
        elif isinstance(location, tuple):
            dest = tuple(proxy_url(i, to_master=False) for i in location)
        else:
            dest = [proxy_url(i, to_master=False) for i in location]
        notifier(dest)

        for record in Input(dest, open=open_url):
            yield record
def open(url, task=None):
    """
    Return a :class:`SerialInput` over the shuffled input list of *url*.

    ``[url]`` is expanded with :func:`inputlist`, passing
    ``partition=str(task.taskid)`` when *task* is truthy and
    ``partition=None`` otherwise.  The opener given to the
    :class:`SerialInput` calls :func:`schemes.open_chain` with the same
    *task*.
    """
    if task:
        partition = str(task.taskid)
    else:
        partition = None

    def open_location(location):
        return schemes.open_chain(location, task=task)

    locations = shuffled(inputlist([url], partition=partition))
    return SerialInput(locations, open=open_location)
Beispiel #13
0
def open(url, task=None):
    """
    Return a :class:`SerialInput` over the shuffled input list of *url*.

    ``[url]`` is expanded with :func:`inputlist`, passing
    ``partition=str(task.taskid)`` when *task* is truthy and
    ``partition=None`` otherwise.  The opener given to the
    :class:`SerialInput` calls :func:`schemes.open_chain` with the same
    *task*.
    """
    if task:
        partition = str(task.taskid)
    else:
        partition = None

    def open_location(location):
        return schemes.open_chain(location, task=task)

    locations = shuffled(inputlist([url], partition=partition))
    return SerialInput(locations, open=open_location)
Beispiel #14
0
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters,
        in addition to those defined by the :class:`Worker` itself:

        :type  input: list of urls or list of list of urls
        :param input: used to set :attr:`jobdict.input`.
                Disco natively handles the following url schemes:

                * ``http://...`` - any HTTP address
                * ``file://...`` or no scheme - a local file.
                    The file must exist on all nodes where the tasks are run.
                    Due to these restrictions, this form has only limited use.
                * ``tag://...`` - a tag stored in :ref:`DDFS`
                * ``raw://...`` - pseudo-address: use the address itself as data.
                * ``dir://...`` - used by Disco internally.
                * ``disco://...`` - used by Disco internally.

                .. seealso:: :mod:`disco.schemes`.

        :type  name: string
        :param name: directly sets :attr:`jobdict.prefix`.

        :type  owner: string
        :param owner: directly sets :attr:`jobdict.owner`.
                      If not specified, uses :envvar:`DISCO_JOB_OWNER`.

        :type  scheduler: dict
        :param scheduler: directly sets :attr:`jobdict.scheduler`.

        Uses :meth:`getitem` to resolve the values of parameters.

        :raises DiscoError: if the resolved ``input`` is not iterable.

        :return: the :term:`job dict`.
        """
        from disco.util import isiterable, inputlist, ispartitioned, read_index
        from disco.error import DiscoError

        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)

        has_map = bool(get('map'))
        has_reduce = bool(get('reduce'))
        job_input = get('input', [])
        if not isiterable(job_input):
            # The two literals are concatenated by the parser, so the
            # separating space has to be part of the first one.
            raise DiscoError("Job 'input' is not a list of input locations, "
                             "or a list of such lists: {0}".format(job_input))
        inputs = inputlist(job_input,
                           partition=None if has_map else False,
                           settings=job.settings)

        # -- nr_reduces --
        # ignored if there is not actually a reduce specified
        # XXX: master should always handle this
        if has_map:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            nr_reduces = get('partitions') or 1
        elif ispartitioned(inputs):
            # no map, with partitions: len(dir://) specifies nr_reduces
            nr_reduces = 1 + max(int(part_id)
                                 for index in inputs
                                 for part_id, _url in read_index(index))
        else:
            # no map, without partitions can only have 1 reduce
            nr_reduces = 1

        # merge_partitions overrides whatever was computed above
        if get('merge_partitions'):
            nr_reduces = 1

        return {'input': inputs,
                'worker': self.bin,
                'map?': has_map,
                'reduce?': has_reduce,
                'nr_reduces': nr_reduces,
                'prefix': get('name'),
                'scheduler': get('scheduler', {}),
                'owner': get('owner', job.settings['DISCO_JOB_OWNER'])}
Beispiel #15
0
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Besides the parameters defined by the :class:`Worker` itself,
        the following ones are used:

        :type  input: list of urls or list of list of urls
        :param input: used to set :attr:`jobdict.input`.
                Disco natively handles the following url schemes:

                * ``http://...`` - any HTTP address
                * ``file://...`` or no scheme - a local file.
                    The file must exist on all nodes where the tasks are run.
                    Due to these restrictions, this form has only limited use.
                * ``tag://...`` - a tag stored in :ref:`DDFS`
                * ``raw://...`` - pseudo-address: use the address itself as data.
                * ``dir://...`` - used by Disco internally.
                * ``disco://...`` - used by Disco internally.

                .. seealso:: :mod:`disco.schemes`.

        :type  name: string
        :param name: directly sets :attr:`jobdict.prefix`.

        :type  owner: string
        :param owner: directly sets :attr:`jobdict.owner`.
                      If not specified, uses :envvar:`DISCO_JOB_OWNER`.

        :type  scheduler: dict
        :param scheduler: directly sets :attr:`jobdict.scheduler`.

        Parameter values are resolved through :meth:`getitem`.

        :return: the :term:`job dict`.
        """
        from disco.util import inputlist, ispartitioned, read_index

        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)

        has_map = bool(get('map'))
        has_reduce = bool(get('reduce'))
        partition = None if has_map else False
        locations = inputlist(get('input', []),
                              partition=partition,
                              settings=job.settings)

        # -- nr_reduces --
        # ignored if there is not actually a reduce specified
        # XXX: master should always handle this
        if has_map:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            nr_reduces = get('partitions') or 1
        elif ispartitioned(locations):
            # no map, with partitions: len(dir://) specifies nr_reduces
            part_ids = [int(part_id)
                        for index in locations
                        for part_id, _ in read_index(index)]
            nr_reduces = 1 + max(part_ids)
        else:
            # no map, without partitions can only have 1 reduce
            nr_reduces = 1

        # merge_partitions overrides whatever was computed above
        if get('merge_partitions'):
            nr_reduces = 1

        # Filled in one key at a time, in the order the job dict lists them.
        result = {}
        result['input'] = locations
        result['worker'] = self.bin
        result['map?'] = has_map
        result['reduce?'] = has_reduce
        result['nr_reduces'] = nr_reduces
        result['prefix'] = get('name')
        result['scheduler'] = get('scheduler', {})
        result['owner'] = get('owner', job.settings['DISCO_JOB_OWNER'])
        return result
Beispiel #16
0
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters, in addition to those
        defined by the :class:`Worker` itself:

        :type  input: list of urls or list of list of urls
        :param input: used to set :attr:`jobdict.input`.
                Disco natively handles the following url schemes:

                * ``http://...`` - any HTTP address
                * ``file://...`` or no scheme - a local file.
                    The file must exist on all nodes where the tasks are run.
                    Due to these restrictions, this form has only limited use.
                * ``tag://...`` - a tag stored in :ref:`DDFS`
                * ``raw://...`` - pseudo-address: use the address itself as data.
                * ``dir://...`` - used by Disco internally.
                * ``disco://...`` - used by Disco internally.

                .. seealso:: :mod:`disco.schemes`.

        :type  scheduler: dict
        :param scheduler: directly sets :attr:`jobdict.scheduler`.

                          .. deprecated:: 0.5
                                  *scheduler* params are now ignored.

        Uses :meth:`getitem` to resolve the values of parameters.

        :raises DiscoError: if the resolved ``input`` is not iterable.

        :return: the :term:`job dict`.
        """
        from disco.util import isiterable, inputlist, ispartitioned, read_index
        from disco.error import DiscoError

        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)

        has_map = bool(get('map'))
        has_reduce = bool(get('reduce'))
        job_input = get('input', [])
        # Either of the two parameter names turns result saving on.
        has_save_results = get('save', False) or get('save_results', False)
        if not isiterable(job_input):
            # The two literals are concatenated by the parser, so the
            # separating space has to be part of the first one.
            raise DiscoError("Job 'input' is not a list of input locations, "
                             "or a list of such lists: {0}".format(job_input))
        inputs = inputlist(job_input,
                           label=None if has_map else False,
                           settings=job.settings)

        # -- nr_reduces --
        # ignored if there is not actually a reduce specified
        # XXX: master should always handle this
        if has_map:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            nr_reduces = get('partitions') or 1
        elif ispartitioned(inputs):
            # no map, with partitions: len(dir://) specifies nr_reduces
            nr_reduces = 1 + max(int(part_id)
                                 for index in inputs
                                 for part_id, _url, _size in read_index(index))
        else:
            # no map, without partitions can only have 1 reduce
            nr_reduces = 1

        # merge_partitions overrides whatever was computed above
        if get('merge_partitions'):
            nr_reduces = 1

        result = super(Worker, self).jobdict(job, **jobargs)
        result.update({'input': inputs,
                       'worker': self.bin,
                       'map?': has_map,
                       'reduce?': has_reduce,
                       'nr_reduces': nr_reduces,
                       'save_results': has_save_results,
                       'scheduler': get('scheduler', {})})
        return result
Beispiel #17
0
def open(url, task=None):
    """
    Return a :class:`SerialInput` over the shuffled input list of *url*.

    ``[url]`` is expanded with :func:`inputlist`, passing
    ``label=task.group_label`` when *task* is truthy and ``label=None``
    otherwise.  The opener given to the :class:`SerialInput` calls
    :func:`schemes.open_chain` with the same *task*.
    """
    if task:
        label = task.group_label
    else:
        label = None

    def open_location(location):
        return schemes.open_chain(location, task=task)

    locations = shuffled(inputlist([url], label=label))
    return SerialInput(locations, open=open_location)
def open(url, task=None):
    """
    Return a :class:`SerialInput` over the shuffled input list of *url*.

    ``[url]`` is expanded with :func:`inputlist`, passing
    ``label=task.group_label`` when *task* is truthy and ``label=None``
    otherwise.  The opener given to the :class:`SerialInput` calls
    :func:`schemes.open_chain` with the same *task*.
    """
    if task:
        label = task.group_label
    else:
        label = None

    def open_location(location):
        return schemes.open_chain(location, task=task)

    locations = shuffled(inputlist([url], label=label))
    return SerialInput(locations, open=open_location)