Example #1
0
    def __init__(self, *args, **kwargs):
        """Build the job dictionary and normalise its derived entries.

        After calling the parent constructor with the given arguments, this
        fills in ``required_modules`` when it is ``None``, sets the
        ``ext_map`` / ``ext_reduce`` flags for dict-valued ``map`` /
        ``reduce``, expands ``input`` through ``util.urllist``, computes
        ``partitions`` and ``nr_reduces``, layers the ``scheduler`` entry over
        the class-level scheduler defaults, and finally rejects unknown keys.

        Raises:
            DiscoError: if ``merge_partitions`` is set but there are no
                partitions, if the scheduler's ``max_cores`` is below 1, or
                if a key is not present in ``self.defaults``.
        """
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            # stacklevel=2 attributes the warning to the code constructing the
            # job instead of to this line inside the library.
            warn("Writers are deprecated - use output_stream.add() instead",
                 DeprecationWarning, stacklevel=2)

        # -- required modules and files --
        if self['required_modules'] is None:
            # Collect every value stored under the function and stack keys,
            # then keep only the callables for module discovery.
            functions = util.flatten(util.iterify(self[f])
                                     for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules([f for f in functions
                                                     if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        # 'ddfs' is removed from the dict here and only passed to urllist.
        ddfs = self.pop('ddfs', None)
        self['input'] = [list(util.iterify(url))
                         for i in self['input']
                         for url in util.urllist(i, listdirs=bool(self['map']),
                                                 ddfs=ddfs)]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0
        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces.
            # The highest index read from the first url of each input entry,
            # plus one, is used as the count.
            self['nr_reduces'] = 1 + max(
                index for entry in self['input']
                for index, _url in util.read_index(entry[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        # Copy the class defaults so the shared dict is never mutated.
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)
Example #2
0
 def required_modules(self):
     """Return what ``modutil.find_modules`` reports for ``self.map``."""
     map_functions = [self.map]
     return modutil.find_modules(map_functions)
Example #3
0
 def test_missing(self):
     """Check that ``find_modules`` raises ``ModUtilImportError`` for ``missing_module``."""
     def find_missing():
         # Deferred so the call happens inside assertRaises.
         return modutil.find_modules([missing_module])

     self.assertRaises(ModUtilImportError, find_missing)
Example #4
0
 def assertFindsModules(self, functions, modules, send_modules=True, recurse=True):
     """Assert that ``modutil.find_modules`` finds exactly ``modules``.

     ``functions``, ``send_modules`` and ``recurse`` are forwarded to
     ``modutil.find_modules``. Both the result and ``modules`` are sorted
     before comparison, so their ordering does not matter.
     """
     found = modutil.find_modules(functions,
                                  send_modules=send_modules,
                                  recurse=recurse)
     # assertEqual replaces the deprecated assertEquals alias, which newer
     # unittest versions no longer provide.
     self.assertEqual(sorted(found), sorted(modules))
Example #5
0
File: core.py Project: mshron/disco
    def __init__(self, *args, **kwargs):
        """Build the job dictionary and normalise its derived entries.

        After calling the parent constructor with the given arguments, this
        fills in ``required_modules`` when it is ``None``, sets the
        ``ext_map`` / ``ext_reduce`` flags for dict-valued ``map`` /
        ``reduce``, expands ``input`` through ``util.urllist``, computes
        ``partitions`` and ``nr_reduces``, layers the ``scheduler`` entry over
        the class-level scheduler defaults, and finally rejects unknown keys.

        Raises:
            DiscoError: if ``merge_partitions`` is set but there are no
                partitions, if the scheduler's ``max_cores`` is below 1, or
                if a key is not present in ``self.defaults``.
        """
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            # stacklevel=2 attributes the warning to the code constructing the
            # job instead of to this line inside the library.
            warn("Writers are deprecated - use output_stream.add() instead",
                 DeprecationWarning, stacklevel=2)

        # -- required modules and files --
        if self['required_modules'] is None:
            # Collect every value stored under the function and stack keys,
            # then keep only the callables for module discovery.
            functions = util.flatten(
                util.iterify(self[f])
                for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules(
                [f for f in functions if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        # 'ddfs' is removed from the dict here and only passed to urllist.
        ddfs = self.pop('ddfs', None)
        self['input'] = [
            list(util.iterify(url)) for i in self['input']
            for url in util.urllist(i, listdirs=bool(self['map']), ddfs=ddfs)
        ]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0
        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces.
            # The highest index read from the first url of each input entry,
            # plus one, is used as the count.
            self['nr_reduces'] = 1 + max(
                index for entry in self['input']
                for index, _url in util.read_index(entry[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        # Copy the class defaults so the shared dict is never mutated.
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)
Example #6
0
 def required_modules(self):
     """Return what ``modutil.find_modules`` reports for ``self.map``."""
     map_functions = [self.map]
     return modutil.find_modules(map_functions)
Example #7
0
 def test_missing(self):
     """Check that ``find_modules`` raises ``ModUtilImportError`` for ``missing_module``."""
     def find_missing():
         # Deferred so the call happens inside assertRaises.
         return modutil.find_modules([missing_module])

     self.assertRaises(ModUtilImportError, find_missing)
Example #8
0
 def assertFindsModules(self, functions, modules, send_modules=True, recurse=True):
     """Assert that ``modutil.find_modules`` finds exactly ``modules``.

     ``functions``, ``send_modules`` and ``recurse`` are forwarded to
     ``modutil.find_modules``. Both the result and ``modules`` are sorted
     before comparison, so their ordering does not matter.
     """
     found = modutil.find_modules(functions,
                                  send_modules=send_modules,
                                  recurse=recurse)
     # assertEqual replaces the deprecated assertEquals alias, which newer
     # unittest versions no longer provide.
     self.assertEqual(sorted(found), sorted(modules))