Exemple #1
0
    def pack(self):
        """Serialize this :class:`JobDict` into a job pack for the wire.

        ``self['required_files']`` is normalized in place to a dict: a
        non-dict value goes through ``util.pack_files`` and a falsy value
        becomes ``{}``.  It is then extended with the files of every
        ``required_modules`` entry accepted by ``util.iskv``.  Finally each
        key of ``self.defaults`` is encoded according to its kind and the
        resulting mapping is handed to ``encode_netstring_fd``.
        """
        files = self['required_files']
        if not files:
            self['required_files'] = {}
        elif not isinstance(files, dict):
            self['required_files'] = util.pack_files(files)

        self['required_files'].update(util.pack_files(
            module[1] for module in self['required_modules']
            if util.iskv(module)))

        packed = {}
        for name in self.defaults:
            if name == 'input':
                # One space-separated entry per input; the parts of each
                # entry are newline-joined in reversed order.
                entries = ['\n'.join(reversed(list(util.iterify(url))))
                           for url in self['input']]
                packed['input'] = ' '.join(entries)
                continue
            if name in ('nr_reduces', 'prefix'):
                packed[name] = str(self[name])
                continue
            if name == 'scheduler':
                # Scheduler options are flattened into 'sched_<option>' keys.
                options = self['scheduler']
                for option in options:
                    packed['sched_%s' % option] = str(options[option])
                continue
            value = self[name]
            if value is None:
                # Unset values are left out of the job pack.
                continue
            if name in self.stacks:
                packed[name] = util.pack_stack(value)
            else:
                packed[name] = util.pack(value)
        return encode_netstring_fd(packed)
Exemple #2
0
    def pack(self):
        """Serialize this :class:`JobDict` into a job pack for the wire.

        ``self['required_files']`` is first normalized in place to a dict
        (non-dict values go through ``util.pack_files``, falsy values become
        ``{}``) and extended with the files of each ``required_modules``
        entry accepted by ``util.iskv``.  Every key of ``self.defaults`` is
        then encoded according to its kind; ``map`` and ``reduce`` are
        skipped when they are ``None``.  The result is passed to
        ``encode_netstring_fd``.
        """
        files = self['required_files']
        if not files:
            self['required_files'] = {}
        elif not isinstance(files, dict):
            self['required_files'] = util.pack_files(files)

        self['required_files'].update(util.pack_files(
            module[1] for module in self['required_modules']
            if util.iskv(module)))

        packed = {}
        for name in self.defaults:
            if name in ('map', 'reduce') and self[name] is None:
                # An absent map or reduce phase is not sent at all.
                continue
            if name == 'input':
                # One space-separated entry per input; the parts of each
                # entry are newline-joined in reversed order.
                entries = ['\n'.join(reversed(list(util.iterify(url))))
                           for url in self['input']]
                packed['input'] = ' '.join(entries)
            elif name in ('username', 'nr_reduces', 'prefix'):
                packed[name] = str(self[name])
            elif name == 'scheduler':
                # Scheduler options are flattened into 'sched_<option>' keys.
                options = self['scheduler']
                for option in options:
                    packed['sched_%s' % option] = str(options[option])
            elif name in self.stacks:
                packed[name] = util.pack_stack(self[name])
            else:
                packed[name] = util.pack(self[name])
        return encode_netstring_fd(packed)
Exemple #3
0
 def __getstate__(self):
     """Return the state to pickle: each public attribute, packed.

     Attributes whose names start with an underscore are left out; every
     remaining value is passed through ``util.pack``.
     """
     state = {}
     for name, value in self.__dict__.iteritems():
         if name.startswith('_'):
             continue
         state[name] = util.pack(value)
     return state
Exemple #4
0
 def test_pack(self):
     """Check that ``unpack(pack(x))`` round-trips several kinds of values."""
     now = datetime.now()
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(now, unpack(pack(now)))
     self.assertEqual(666, unpack(pack(666)))
     # Functions are compared through their code objects.
     self.assertEqual(function.func_code, unpack(pack(function)).func_code)
Exemple #5
0
 def __getstate__(self):
     """Build the pickled state from the instance's public attributes.

     Names beginning with an underscore are excluded; the other values are
     each run through ``util.pack`` before being stored.
     """
     packed = {}
     for attr, value in self.__dict__.iteritems():
         if not attr.startswith('_'):
             packed[attr] = util.pack(value)
     return packed
Exemple #6
0
    def _run(self, **kwargs):
        """Validate the job arguments, build the job request and submit it.

        Arguments are read either directly from ``kwargs`` or through
        ``jobargs``, a ``util.DefaultDict`` built from ``kwargs`` and
        ``self.defaults`` (presumably falling back to the defaults for
        missing keys -- confirm against ``util.DefaultDict``).  The request
        is netstring-encoded and posted to ``/disco/job/new`` on
        ``self.master``; on success ``self.name`` is replaced by the job
        name taken from the reply.

        Raises:
            DeprecationWarning: if the ``chunked`` argument is given.
            DiscoError: if ``input`` is missing, if neither ``map`` nor
                ``reduce`` is given, if an argument is not a key of
                ``Job.defaults``, if ``max_cores`` is below 1, or if the
                master's reply does not start with ``"job started:"``.
        """
        jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

        # -- check parameters --

        # Backwards compatibility
        # (fun_map == map, input_files == input)
        if "fun_map" in kwargs:
            kwargs["map"] = kwargs["fun_map"]

        if "input_files" in kwargs:
            kwargs["input"] = kwargs["input_files"]

        if "chunked" in kwargs:
            raise DeprecationWarning("Argument 'chunked' is deprecated")

        if "nr_maps" in kwargs:
            sys.stderr.write("Warning: nr_maps is deprecated. "
                             "Use scheduler = {'max_cores': N} instead.\n")
            # An explicit max_cores wins over the deprecated nr_maps.
            sched = jobargs["scheduler"].copy()
            if "max_cores" not in sched:
                sched["max_cores"] = int(jobargs["nr_maps"])
            jobargs["scheduler"] = sched

        if "input" not in kwargs:
            raise DiscoError("Argument input is required")

        if not ("map" in kwargs or "reduce" in kwargs):
            raise DiscoError("Specify map and/or reduce")

        for p in kwargs:
            if p not in Job.defaults:
                raise DiscoError("Unknown argument: %s" % p)

        # Named 'inputs' so the builtin input() is not shadowed.
        inputs = kwargs["input"]

        # -- initialize request --

        request = {
            "prefix": self.name,
            "version": ".".join(map(str, sys.version_info[:2])),
            "params": cPickle.dumps(jobargs["params"],
                                    cPickle.HIGHEST_PROTOCOL),
            "sort": str(int(jobargs["sort"])),
            "mem_sort_limit": str(jobargs["mem_sort_limit"]),
            "status_interval": str(jobargs["status_interval"]),
            "profile": str(int(jobargs["profile"])),
        }

        # -- required modules --

        if "required_modules" in kwargs:
            rm = kwargs["required_modules"]
        else:
            # No explicit list: derive the modules from the callable
            # map/reduce functions.
            functions = util.flatten(util.iterify(jobargs[f])
                                     for f in self.mapreduce_functions)
            rm = modutil.find_modules([f for f in functions if callable(f)])

        send_mod = []
        imp_mod = []
        for mod in rm:
            # A tuple entry carries the module name first and, second, a
            # value that is sent along via util.pack_files.
            if isinstance(mod, tuple):
                send_mod.append(mod[1])
                mod = mod[0]
            imp_mod.append(mod)

        request["required_modules"] = " ".join(imp_mod)
        rf = util.pack_files(send_mod)

        # -- input & output streams --

        for stream in ["map_input_stream", "map_output_stream",
                       "reduce_input_stream", "reduce_output_stream"]:
            self.pack_stack(kwargs, request, stream)

        # -- required files --

        if "required_files" in kwargs:
            if isinstance(kwargs["required_files"], dict):
                rf.update(kwargs["required_files"])
            else:
                rf.update(util.pack_files(kwargs["required_files"]))
        if rf:
            request["required_files"] = util.pack(rf)

        # -- scheduler --

        # Work on a copy: the dict behind jobargs["scheduler"] may be the
        # caller's own or a shared default, and max_cores may be filled in
        # below.  Mutating it in place would leak into later jobs.
        sched = jobargs["scheduler"].copy()
        sched_keys = ["max_cores", "force_local", "force_remote"]

        if "max_cores" not in sched:
            sched["max_cores"] = 2 ** 31
        elif sched["max_cores"] < 1:
            raise DiscoError("max_cores must be >= 1")

        for k in sched_keys:
            if k in sched:
                request["sched_" + k] = str(sched[k])

        # -- map --

        if "map" in kwargs:
            # A dict-valued map is sent under the "ext_map" key instead.
            k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
            request[k] = util.pack(kwargs["map"])

            for function_name in ("map_init", "map_reader", "map_writer",
                                  "partition", "combiner"):
                function = jobargs[function_name]
                if function:
                    request[function_name] = util.pack(function)

            def inputlist(item):
                # An iterable entry becomes a single newline-joined element
                # (in reversed order); anything else goes through
                # util.urllist.
                if hasattr(item, "__iter__"):
                    return ["\n".join(reversed(list(item)))]
                return util.urllist(item)

            inputs = [e for i in inputs for e in inputlist(i)]

        # -- only reduce --

        else:
            # XXX: Check for redundant inputs, external &
            # partitioned inputs
            inputs = [url for i in inputs for url in util.urllist(i)]

        request["input"] = " ".join(inputs)

        if "ext_params" in kwargs:
            e = kwargs["ext_params"]
            # Dict parameters are netstring-encoded; other values are sent
            # as given.
            request["ext_params"] = (encode_netstring_fd(e)
                                     if isinstance(e, dict) else e)

        # -- reduce --

        nr_reduces = jobargs["nr_reduces"]
        if "reduce" in kwargs:
            # A dict-valued reduce is sent under the "ext_reduce" key.
            k = ("ext_reduce" if isinstance(kwargs["reduce"], dict)
                 else "reduce")
            request[k] = util.pack(kwargs["reduce"])

            for function_name in ("reduce_reader", "reduce_writer",
                                  "reduce_init"):
                function = jobargs[function_name]
                if function:
                    request[function_name] = util.pack(function)

        request["nr_reduces"] = str(nr_reduces)

        # -- encode and send the request --

        reply = self.master.request("/disco/job/new",
                                    encode_netstring_fd(request))

        if not reply.startswith("job started:"):
            raise DiscoError("Failed to start a job. Server replied: " + reply)
        # The job name is whatever follows the first colon of the reply.
        self.name = reply.split(":", 1)[1]
Exemple #7
0
 def pack_stack(self, kw, req, stream):
     """Encode the functions given for *stream* in *kw* into ``req[stream]``.

     Does nothing when *stream* is not a key of *kw*.  Otherwise each
     function yielded by ``util.iterify(kw[stream])`` becomes a
     ``(func_name, packed function)`` pair, and the pairs are encoded with
     ``encode_netstring_str``.
     """
     if stream not in kw:
         return
     functions = util.iterify(kw[stream])
     pairs = ((fn.func_name, util.pack(fn)) for fn in functions)
     req[stream] = encode_netstring_str(pairs)