Example #1
    def run(self, task, job, **jobargs):
        # Entry point into the executing pipeline worker task.  This
        # initializes the task environment, sets up the current stage,
        # and then executes it.
        for key in self:
            self[key] = self.getitem(key, job, jobargs)
        sys_version = '{0[0]}.{0[1]}'.format(sys.version_info[:2])
        assert self['version'] == sys_version, "Python version mismatch"

        # Set up the task environment.
        globals_ = globals().copy()
        for module in self['required_modules']:
            name = module[0] if util.iskv(module) else module
            globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
        for obj in util.flatten(self.values()):
            util.globalize(obj, globals_)

        # Set up the stage.
        params = self.getitem('params', job, jobargs, worker.Params())
        pipeline = dict([(s.name, (idx, s))
                         for idx, (g, s) in enumerate(self['pipeline'])])
        pipe_idx, stage = pipeline[task.stage]
        stage.taskinfo = TaskInfo(jobname=task.jobname, host=task.host,
                                  stage=task.stage, group=task.group,
                                  label=task.group_label)
        if not stage.input_chain:
            stage.input_chain = Stage.default_input_chain(pipe_idx)
        if not stage.output_chain:
            stage.output_chain = Stage.default_output_chain
        # And now run it.
        self.run_stage(task, stage, params)
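The environment setup above relies on two small util helpers that are not
shown in the example. A minimal sketch consistent with how they are called
(the bodies here are assumptions, not disco's actual implementations):

def iskv(obj):
    # Assumed: a required_modules entry is either a plain module name
    # or a (name, path) pair; iskv() distinguishes the two cases.
    return isinstance(obj, tuple) and len(obj) == 2

def globalize(obj, globals_):
    # Assumed: make the freshly imported modules visible to a
    # user-supplied function by updating its global namespace;
    # anything without one is left untouched.
    func_globals = getattr(obj, '__globals__', None)
    if func_globals is not None:
        func_globals.update(globals_)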
Example #2
    def run(self, task, job, **jobargs):
        # Entry point into the executing pipeline worker task.  This
        # initializes the task environment, sets up the current stage,
        # and then executes it.
        worker.active_task = task
        for key in self:
            self[key] = self.getitem(key, job, jobargs)
        sys_version = '{0[0]}.{0[1]}'.format(sys.version_info[:2])
        assert self['version'] == sys_version, "Python version mismatch"

        # Set up the task environment.
        globals_ = globals().copy()
        for module in self['required_modules']:
            name = module[0] if util.iskv(module) else module
            globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
        for obj in util.flatten(self.values()):
            util.globalize(obj, globals_)

        # Set up the stage.
        params = self.getitem('params', job, jobargs, worker.Params())
        pipeline = dict([(s.name, (idx, s))
                         for idx, (g, s) in enumerate(self['pipeline'])])
        pipe_idx, stage = pipeline[task.stage]
        stage.taskinfo = TaskInfo(jobname=task.jobname,
                                  host=task.host,
                                  stage=task.stage,
                                  group=task.group,
                                  label=task.group_label)
        if not stage.input_chain:
            stage.input_chain = Stage.default_input_chain(pipe_idx)
        if not stage.output_chain:
            stage.output_chain = Stage.default_output_chain
        # And now run it.
        self.run_stage(task, stage, params)
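Both variants of run() resolve every setting through self.getitem(), whose
signature can be read off the calls above. A plausible lookup chain (an
assumption, not the verbatim disco code): per-run jobargs override
attributes set on the Job object, which override the worker's stored
defaults.

    def getitem(self, key, job, jobargs, default=None):
        # Assumed resolution order, sketched from the call sites above.
        if key in jobargs:
            return jobargs[key]
        if hasattr(job, key):
            return getattr(job, key)
        return self.get(key, default)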
Example #3
    def __init__(self, *args, **kwargs):
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            warn("Writers are deprecated - use output_stream.add() instead",
                 DeprecationWarning)

        # -- required modules and files --
        if self['required_modules'] is None:
            functions = util.flatten(util.iterify(self[f])
                                     for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules([f for f in functions
                                                     if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        ddfs = self.pop('ddfs', None)
        self['input'] = [list(util.iterify(url))
                         for i in self['input']
                         for url in util.urllist(i, listdirs=bool(self['map']),
                                                 ddfs=ddfs)]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0
        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces
            self['nr_reduces'] = 1 + max(id for dir in self['input']
                                         for id, url in util.read_index(dir[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)
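The nr_reduces branching above is the subtle part of __init__. Restated as
a standalone function (a paraphrase of the logic above, with the
hypothetical max_partition_id standing in for the max() scan over
util.read_index):

def nr_reduces(has_map, partitions, input_is_partitioned, max_partition_id=0):
    if has_map:
        # A partitioned map feeds `partitions` reduces; an
        # unpartitioned map feeds exactly one.
        return partitions or 1
    if input_is_partitioned:
        # Reduce-only over dir:// inputs: partition ids are 0-based,
        # so the highest id seen determines the reduce count.
        return 1 + max_partition_id
    # Reduce-only over unpartitioned inputs can only have one reduce.
    return 1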
Example #4
    def setUp(self):
        host, port = self.test_server_address
        # assumption: scheduler starts scheduling tasks in the
        # order specified by self.input
        self.blacklisted = sorted(self.nodes.keys())
        self.input = flatten([N * ['http://%s/%s:%d' % (node, host, port)]
                              for node in self.blacklisted])
        self.whitelist = {}
        for i in range(len(self.blacklisted) - 1):
            self.disco.blacklist(self.blacklisted[i + 1])
            self.whitelist[self.blacklisted[i]] = (N, self.blacklisted[i + 1])
        super(BlacklistTestCase, self).setUp()
Example #5
    def run(self, task, job, **jobargs):
        # Classic (map/reduce) worker entry point: expose the running
        # task as a module-level global, resolve all job settings, and
        # dispatch on task.mode.
        global Task
        Task = task
        for key in self:
            self[key] = self.getitem(key, job, jobargs)
        assert self['version'] == '%s.%s' % sys.version_info[:2], "Python version mismatch"

        params = self['params']
        if isinstance(self[task.mode], dict):
            # A dict-valued map/reduce denotes an external (non-Python)
            # task; swap in its parameters and prepare it for execution.
            params = self['ext_params']
            self[task.mode] = external.prepare(params, task.mode)

        globals_ = globals().copy()
        for module in self['required_modules']:
            name = module[0] if util.iskv(module) else module
            globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
        for obj in util.flatten(self.values()):
            util.globalize(obj, globals_)

        getattr(self, task.mode)(task, params)
        external.close()
Example #6
    def run(self, task, job, **jobargs):
        # Stage-based variant of the same worker entry point: settings
        # are resolved identically, but dispatch is on task.stage.
        global Task
        Task = task
        for key in self:
            self[key] = self.getitem(key, job, jobargs)
        assert self['version'] == '{0[0]}.{0[1]}'.format(sys.version_info[:2]), "Python version mismatch"

        params = self['params']
        if isinstance(self[task.stage], dict):
            params = self['ext_params']
            self[task.stage] = external.prepare(params, task.stage)

        globals_ = globals().copy()
        for module in self['required_modules']:
            name = module[0] if util.iskv(module) else module
            globals_[name.split('.')[-1]] = __import__(name, fromlist=[name])
        for obj in util.flatten(self.values()):
            util.globalize(obj, globals_)

        getattr(self, task.stage)(task, params)
        external.close()
Example #7
    def test_flatten(self):
        self.assertEquals(list(range(7)), list(flatten(sequence)))
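flatten itself is not shown on this page. A minimal recursive generator
consistent with the tests (the real disco.util.flatten may differ) treats
anything with __iter__ as a nested level, the same test these examples use
elsewhere:

def flatten(items):
    # Yield the leaves of an arbitrarily nested iterable, left to
    # right; e.g. [0, [1, [2, 3]], [4, 5, 6]] yields 0 through 6.
    for item in items:
        if hasattr(item, '__iter__'):
            for leaf in flatten(item):
                yield leaf
        else:
            yield item

With a nested sequence like [0, [1, [2, 3]], [4, 5, 6]],
list(flatten(sequence)) equals list(range(7)), which is exactly what
test_flatten asserts.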
Example #8
    def runTest(self):
        self.assertEquals(
            list(flatten(N * [b] for b in self.blacklisted)),
            sorted([n for n, v in self.results]))
Example #9
    def test_rapply(self):
        for x, y in zip(xrange(7), flatten(rapply(sequence, function))):
            self.assertEquals(function(x), y)
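rapply appears to be flatten's structure-preserving counterpart: it applies
a function to every leaf while keeping the nesting intact, which is why the
test can flatten the result and compare it leaf-for-leaf against function(x)
for x in xrange(7). A plausible sketch (an assumption, not the verbatim
disco code):

def rapply(items, func):
    # Apply func to each leaf of a nested iterable, preserving shape;
    # nested levels come back as generators, which flatten() accepts.
    for item in items:
        if hasattr(item, '__iter__'):
            yield rapply(item, func)
        else:
            yield func(item)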
Example #10
File: core.py Project: mshron/disco
    def __init__(self, *args, **kwargs):
        super(JobDict, self).__init__(*args, **kwargs)

        # -- backwards compatibility --
        if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
            warn("Writers are deprecated - use output_stream.add() instead",
                 DeprecationWarning)

        # -- required modules and files --
        if self['required_modules'] is None:
            functions = util.flatten(
                util.iterify(self[f])
                for f in chain(self.functions, self.stacks))
            self['required_modules'] = find_modules(
                [f for f in functions if callable(f)])

        # -- external flags --
        if isinstance(self['map'], dict):
            self['ext_map'] = True
        if isinstance(self['reduce'], dict):
            self['ext_reduce'] = True

        # -- input --
        ddfs = self.pop('ddfs', None)
        self['input'] = [
            list(util.iterify(url)) for i in self['input']
            for url in util.urllist(i, listdirs=bool(self['map']), ddfs=ddfs)
        ]

        # partitions must be an integer internally
        self['partitions'] = self['partitions'] or 0
        # set nr_reduces: ignored if there is not actually a reduce specified
        if self['map']:
            # partitioned map has N reduces; non-partitioned map has 1 reduce
            self['nr_reduces'] = self['partitions'] or 1
        elif self.input_is_partitioned:
            # Only reduce, with partitions: len(dir://) specifies nr_reduces
            self['nr_reduces'] = 1 + max(
                id for dir in self['input']
                for id, url in util.read_index(dir[0]))
        else:
            # Only reduce, without partitions can only have 1 reduce
            self['nr_reduces'] = 1

        # merge_partitions iff the inputs to reduce are partitioned
        if self['merge_partitions']:
            if self['partitions'] or self.input_is_partitioned:
                self['nr_reduces'] = 1
            else:
                raise DiscoError("Can't merge partitions without partitions")

        # -- scheduler --
        scheduler = self.__class__.defaults['scheduler'].copy()
        scheduler.update(self['scheduler'])
        if int(scheduler['max_cores']) < 1:
            raise DiscoError("max_cores must be >= 1")
        self['scheduler'] = scheduler

        # -- sanity checks --
        for key in self:
            if key not in self.defaults:
                raise DiscoError("Unknown job argument: %s" % key)
Example #11
    def _run(self, **kwargs):
        jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

        # -- check parameters --

        # Backwards compatibility
        # (fun_map == map, input_files == input)
        if "fun_map" in kwargs:
            kwargs["map"] = kwargs["fun_map"]

        if "input_files" in kwargs:
            kwargs["input"] = kwargs["input_files"]

        if "chunked" in kwargs:
            raise DeprecationWarning("Argument 'chunked' is deprecated")

        if "nr_maps" in kwargs:
            sys.stderr.write("Warning: nr_maps is deprecated. "
                             "Use scheduler = {'max_cores': N} instead.\n")
            sched = jobargs["scheduler"].copy()
            if "max_cores" not in sched:
                sched["max_cores"] = int(jobargs["nr_maps"])
            jobargs["scheduler"] = sched

        if "input" not in kwargs:
            raise DiscoError("Argument input is required")

        if not ("map" in kwargs or "reduce" in kwargs):
            raise DiscoError("Specify map and/or reduce")

        for p in kwargs:
            if p not in Job.defaults:
                raise DiscoError("Unknown argument: %s" % p)

        input = kwargs["input"]

        # -- initialize request --

        request = {
            "prefix": self.name,
            "version": ".".join(map(str, sys.version_info[:2])),
            "params": cPickle.dumps(jobargs["params"], cPickle.HIGHEST_PROTOCOL),
            "sort": str(int(jobargs["sort"])),
            "mem_sort_limit": str(jobargs["mem_sort_limit"]),
            "status_interval": str(jobargs["status_interval"]),
            "profile": str(int(jobargs["profile"])),
        }

        # -- required modules --

        if "required_modules" in kwargs:
            rm = kwargs["required_modules"]
        else:
            functions = util.flatten(util.iterify(jobargs[f]) for f in self.mapreduce_functions)
            rm = modutil.find_modules([f for f in functions if callable(f)])

        send_mod = []
        imp_mod = []
        for mod in rm:
            if type(mod) == tuple:
                send_mod.append(mod[1])
                mod = mod[0]
            imp_mod.append(mod)

        request["required_modules"] = " ".join(imp_mod)
        rf = util.pack_files(send_mod)

        # -- input & output streams --

        for stream in ["map_input_stream", "map_output_stream", "reduce_input_stream", "reduce_output_stream"]:
            self.pack_stack(kwargs, request, stream)

        # -- required files --

        if "required_files" in kwargs:
            if isinstance(kwargs["required_files"], dict):
                rf.update(kwargs["required_files"])
            else:
                rf.update(util.pack_files(kwargs["required_files"]))
        if rf:
            request["required_files"] = util.pack(rf)

        # -- scheduler --

        sched = jobargs["scheduler"]
        sched_keys = ["max_cores", "force_local", "force_remote"]

        if "max_cores" not in sched:
            sched["max_cores"] = 2 ** 31
        elif sched["max_cores"] < 1:
            raise DiscoError("max_cores must be >= 1")

        for k in sched_keys:
            if k in sched:
                request["sched_" + k] = str(sched[k])

        # -- map --

        if "map" in kwargs:
            k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
            request[k] = util.pack(kwargs["map"])

            for function_name in ("map_init", "map_reader", "map_writer", "partition", "combiner"):
                function = jobargs[function_name]
                if function:
                    request[function_name] = util.pack(function)

            def inputlist(input):
                if hasattr(input, "__iter__"):
                    return ["\n".join(reversed(list(input)))]
                return util.urllist(input)

            input = [e for i in input for e in inputlist(i)]

        # -- only reduce --

        else:
            # XXX: Check for redundant inputs, external &
            # partitioned inputs
            input = [url for i in input for url in util.urllist(i)]

        request["input"] = " ".join(input)

        if "ext_params" in kwargs:
            e = kwargs["ext_params"]
            request["ext_params"] = encode_netstring_fd(e) if isinstance(e, dict) else e

        # -- reduce --

        nr_reduces = jobargs["nr_reduces"]
        if "reduce" in kwargs:
            k = "ext_reduce" if isinstance(kwargs["reduce"], dict) else "reduce"
            request[k] = util.pack(kwargs["reduce"])

            for function_name in ("reduce_reader", "reduce_writer", "reduce_init"):
                function = jobargs[function_name]
                if function:
                    request[function_name] = util.pack(function)

        request["nr_reduces"] = str(nr_reduces)

        # -- encode and send the request --

        reply = self.master.request("/disco/job/new", encode_netstring_fd(request))

        if not reply.startswith("job started:"):
            raise DiscoError("Failed to start a job. Server replied: " + reply)
        self.name = reply.split(":", 1)[1]
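The util.DefaultDict built at the top of _run() behaves like a dict that
computes missing keys with a callable, so jobargs[k] returns kwargs[k] when
the caller supplied it and falls back to Job.defaults[k] otherwise. A
minimal sketch of that behavior (assumed, not disco's exact class):

class DefaultDict(dict):
    # dict whose missing keys are computed by a user-supplied callable,
    # here Job.defaults.__getitem__.
    def __init__(self, default_factory, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        return self.default_factory(key)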