Example #1
    def __init__(self, *args, **kwargs):
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            warn("Writers are deprecated - use output_stream.add() instead",
                    DeprecationWarning)

        # -- required modules and files --
        if self['required_modules'] is None:
            functions = util.flatten(util.iterify(self[f])
                                     for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules([f for f in functions
                                                     if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        ddfs = self.pop('ddfs', None)
        self['input'] = [list(util.iterify(url))
                         for i in self['input']
                         for url in util.urllist(i, listdirs=bool(self['map']),
                                                 ddfs=ddfs)]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0
        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces
            self['nr_reduces'] = 1 + max(id for dir in self['input']
                                         for id, url in util.read_index(dir[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)
Example #2
def update_label_map(lm, i):
    # One url per replica of this input.
    reps = [url for rid, url in i.replicas]
    if ispartitioned(reps):
        # Partitioned (dir://) input: expand the index and group urls
        # by label, honoring an 'all' wildcard label.
        for l, url, size in read_index(reps[0]):
            if i.label in ('all', l):
                lm[l].append([url])
    else:
        # Unpartitioned input: all replicas go under the input's own label.
        lm[i.label].append(reps)
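The label map lm maps labels to lists of replica url lists. A self-contained toy harness for the unpartitioned branch above; Input and ispartitioned are stand-ins defined here, not Disco API:

from collections import defaultdict, namedtuple

# Hypothetical stand-in for the worker's input objects.
Input = namedtuple('Input', ['label', 'replicas'])

def ispartitioned(reps):
    # Disco treats dir:// urls as partitioned index inputs; stubbed here.
    return bool(reps) and reps[0].startswith('dir://')

lm = defaultdict(list)
inp = Input(label=0, replicas=[(0, 'http://node-a/x'), (1, 'http://node-b/x')])
update_label_map(lm, inp)
print(dict(lm))   # {0: [['http://node-a/x', 'http://node-b/x']]}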
Example #3
def labelexpand(self, task, stage, input, params):
    def input_open(url):
        return task_io.ClassicFile(url, stage.input_chain, params)
    def make_input(inp):
        return worker.Input(inp, task=task, open=input_open)
    if input.isindex:
        # Index (dir://) input: expand into one input per partition,
        # honoring an 'all' wildcard label.
        for l, url, sz in sorted(util.read_index(input.locations[0])):
            if input.label in ('all', l):
                yield l, make_input(url)
    else:
        # Plain input: yield it unchanged under its own label.
        yield input.label, make_input(input)
Example #4
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters,
        in addition to those defined by the :class:`Worker` itself:

        :type  input: list of urls or list of lists of urls
        :param input: used to set :attr:`jobdict.input`.
                Disco natively handles the following url schemes:

                * ``http://...`` - any HTTP address
                * ``file://...`` or no scheme - a local file.
                    The file must exist on all nodes where the tasks are run.
                    Due to this restriction, this form has only limited use.
                * ``tag://...`` - a tag stored in :ref:`DDFS`
                * ``raw://...`` - pseudo-address: use the address itself as data.
                * ``dir://...`` - used by Disco internally.
                * ``disco://...`` - used by Disco internally.

                .. seealso:: :mod:`disco.schemes`.

        :type  name: string
        :param name: directly sets :attr:`jobdict.prefix`.

        :type  owner: string
        :param owner: directly sets :attr:`jobdict.owner`.
                      If not specified, uses :envvar:`DISCO_JOB_OWNER`.

        :type  scheduler: dict
        :param scheduler: directly sets :attr:`jobdict.scheduler`.

        Uses :meth:`getitem` to resolve the values of parameters.

        :return: the :term:`job dict`.
        """
        from disco.util import isiterable, inputlist, ispartitioned, read_index
        from disco.error import DiscoError
        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)
        has_map = bool(get('map'))
        has_reduce = bool(get('reduce'))
        job_input = get('input', [])
        if not isiterable(job_input):
            raise DiscoError("Job 'input' is not a list of input locations,"
                             "or a list of such lists: {0}".format(job_input))
        input = inputlist(job_input,
                          partition=None if has_map else False,
                          settings=job.settings)

        # -- nr_reduces --
        # ignored if there is not actually a reduce specified
        # XXX: master should always handle this
        if has_map:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            nr_reduces = get('partitions') or 1
        elif ispartitioned(input):
            # no map, with partitions: len(dir://) specifies nr_reduces
            nr_reduces = 1 + max(int(id)
                                 for dir in input
                                 for id, url in read_index(dir))
        else:
            # no map, without partitions can only have 1 reduce
            nr_reduces = 1

        if get('merge_partitions'):
            nr_reduces = 1

        return {'input': input,
                'worker': self.bin,
                'map?': has_map,
                'reduce?': has_reduce,
                'nr_reduces': nr_reduces,
                'prefix': get('name'),
                'scheduler': get('scheduler', {}),
                'owner': get('owner', job.settings['DISCO_JOB_OWNER'])}
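The docstring enumerates the url schemes accepted for input. A sketch of what a caller might pass (the addresses and tag name are made up for illustration):

inputs = ['http://example.com/data.txt',   # any HTTP address
          'file:///var/data/local.txt',    # must exist on every node running tasks
          'tag://mydata',                  # a tag stored in DDFS
          'raw://hello']                   # the bytes after raw:// are the data
# These would be supplied as the job's 'input' parameter, either as a flat
# list or as a list of lists of replica urls.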
Example #5
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters,
        in addition to those defined by the :class:`Worker` itself:

        :type  input: list of urls or list of lists of urls
        :param input: used to set :attr:`jobdict.input`.
                Disco natively handles the following url schemes:

                * ``http://...`` - any HTTP address
                * ``file://...`` or no scheme - a local file.
                    The file must exist on all nodes where the tasks are run.
                    Due to this restriction, this form has only limited use.
                * ``tag://...`` - a tag stored in :ref:`DDFS`
                * ``raw://...`` - pseudo-address: use the address itself as data.
                * ``dir://...`` - used by Disco internally.
                * ``disco://...`` - used by Disco internally.

                .. seealso:: :mod:`disco.schemes`.

        :type  name: string
        :param name: directly sets :attr:`jobdict.prefix`.

        :type  owner: string
        :param owner: directly sets :attr:`jobdict.owner`.
                      If not specified, uses :envvar:`DISCO_JOB_OWNER`.

        :type  scheduler: dict
        :param scheduler: directly sets :attr:`jobdict.scheduler`.

        Uses :meth:`getitem` to resolve the values of parameters.

        :return: the :term:`job dict`.
        """
        from disco.util import inputlist, ispartitioned, read_index

        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)

        has_map = bool(get('map'))
        has_reduce = bool(get('reduce'))
        input = inputlist(get('input', []),
                          partition=None if has_map else False,
                          settings=job.settings)

        # -- nr_reduces --
        # ignored if there is not actually a reduce specified
        # XXX: master should always handle this
        if has_map:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            nr_reduces = get('partitions') or 1
        elif ispartitioned(input):
            # no map, with partitions: len(dir://) specifies nr_reduces
            nr_reduces = 1 + max(
                int(id) for dir in input for id, url in read_index(dir))
        else:
            # no map, without partitions can only have 1 reduce
            nr_reduces = 1

        if get('merge_partitions'):
            nr_reduces = 1

        return {
            'input': input,
            'worker': self.bin,
            'map?': has_map,
            'reduce?': has_reduce,
            'nr_reduces': nr_reduces,
            'prefix': get('name'),
            'scheduler': get('scheduler', {}),
            'owner': get('owner', job.settings['DISCO_JOB_OWNER'])
        }
Example #6
    def jobdict(self, job, **jobargs):
        """
        Creates :ref:`jobdict` for the :class:`Worker`.

        Makes use of the following parameters, in addition to those
        defined by the :class:`Worker` itself:

        :type  input: list of urls or list of lists of urls
        :param input: used to set :attr:`jobdict.input`.
                Disco natively handles the following url schemes:

                * ``http://...`` - any HTTP address
                * ``file://...`` or no scheme - a local file.
                    The file must exist on all nodes where the tasks are run.
                    Due to this restriction, this form has only limited use.
                * ``tag://...`` - a tag stored in :ref:`DDFS`
                * ``raw://...`` - pseudo-address: use the address itself as data.
                * ``dir://...`` - used by Disco internally.
                * ``disco://...`` - used by Disco internally.

                .. seealso:: :mod:`disco.schemes`.

        :type  scheduler: dict
        :param scheduler: directly sets :attr:`jobdict.scheduler`.

                          .. deprecated:: 0.5
                                  *scheduler* params are now ignored.

        Uses :meth:`getitem` to resolve the values of parameters.

        :return: the :term:`job dict`.
        """
        from disco.util import isiterable, inputlist, ispartitioned, read_index
        from disco.error import DiscoError
        def get(key, default=None):
            return self.getitem(key, job, jobargs, default)
        has_map = bool(get('map'))
        has_reduce = bool(get('reduce'))
        job_input = get('input', [])
        has_save_results = get('save', False) or get('save_results', False)
        if not isiterable(job_input):
            raise DiscoError("Job 'input' is not a list of input locations,"
                             "or a list of such lists: {0}".format(job_input))
        input = inputlist(job_input,
                          label=None if has_map else False,
                          settings=job.settings)

        # -- nr_reduces --
        # ignored if there is not actually a reduce specified
        # XXX: master should always handle this
        if has_map:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            nr_reduces = get('partitions') or 1
        elif ispartitioned(input):
            # no map, with partitions: len(dir://) specifies nr_reduces
            nr_reduces = 1 + max(int(id)
                                 for dir in input
                                 for id, url, size in read_index(dir))
        else:
            # no map, without partitions can only have 1 reduce
            nr_reduces = 1

        if get('merge_partitions'):
            nr_reduces = 1

        jobdict = super(Worker, self).jobdict(job, **jobargs)
        jobdict.update({'input': input,
                        'worker': self.bin,
                        'map?': has_map,
                        'reduce?': has_reduce,
                        'nr_reduces': nr_reduces,
                        'save_results': has_save_results,
                        'scheduler': get('scheduler', {})})
        return jobdict
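Compared with the earlier versions, read_index here yields (id, url, size) triples, the scheduler parameter is deprecated, and the result is merged into the parent class's jobdict. A toy check of the partition-counting expression, with made-up index contents:

# Hypothetical contents of one dir:// index: (id, url, size) triples.
index = [(0, 'http://node/part-0', 10),
         (1, 'http://node/part-1', 12),
         (2, 'http://node/part-2', 9)]

# The expression from the code above, applied to a single partitioned input:
nr_reduces = 1 + max(int(id) for id, url, size in index)
assert nr_reduces == 3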