def pack(self):
    """Pack up the :class:`JobDict` for sending over the wire."""
    payload = {}
    # Normalize required_files into packed-dict form, then merge in the
    # files carried by (name, path) pairs inside required_modules.
    files = self['required_files']
    if files:
        if not isinstance(files, dict):
            self['required_files'] = util.pack_files(files)
    else:
        self['required_files'] = {}
    module_files = util.pack_files(m[1] for m in self['required_modules']
                                   if util.iskv(m))
    self['required_files'].update(module_files)
    for field in self.defaults:
        value = self[field]
        if field == 'input':
            # Each input may be a single url or a set of replica urls;
            # replicas are newline-joined (reversed), inputs space-joined.
            urls = ('\n'.join(reversed(list(util.iterify(u))))
                    for u in value)
            payload['input'] = ' '.join(urls)
        elif field in ('nr_reduces', 'prefix'):
            payload[field] = str(value)
        elif field == 'scheduler':
            # Scheduler options are flattened into 'sched_<option>' keys.
            for opt, setting in value.items():
                payload['sched_%s' % opt] = str(setting)
        elif value is None:
            # Unset optional entries are simply omitted from the payload.
            pass
        elif field in self.stacks:
            payload[field] = util.pack_stack(value)
        else:
            payload[field] = util.pack(value)
    return encode_netstring_fd(payload)
def pack(self):
    """Pack up the :class:`JobDict` for sending over the wire."""
    payload = {}
    # Normalize required_files into packed-dict form, then merge in the
    # files carried by (name, path) pairs inside required_modules.
    files = self['required_files']
    if files:
        if not isinstance(files, dict):
            self['required_files'] = util.pack_files(files)
    else:
        self['required_files'] = {}
    self['required_files'].update(
        util.pack_files(m[1] for m in self['required_modules']
                        if util.iskv(m)))
    for field in self.defaults:
        # map/reduce are optional: skip them entirely when unset.
        if field in ('map', 'reduce') and self[field] is None:
            continue
        if field == 'input':
            # Each input may be a single url or a set of replica urls;
            # replicas are newline-joined (reversed), inputs space-joined.
            payload['input'] = ' '.join(
                '\n'.join(reversed(list(util.iterify(u))))
                for u in self['input'])
        elif field == 'username':
            payload['username'] = str(self['username'])
        elif field in ('nr_reduces', 'prefix'):
            payload[field] = str(self[field])
        elif field == 'scheduler':
            # Scheduler options are flattened into 'sched_<option>' keys.
            for opt, setting in self['scheduler'].items():
                payload['sched_%s' % opt] = str(setting)
        elif field in self.stacks:
            payload[field] = util.pack_stack(self[field])
        else:
            payload[field] = util.pack(self[field])
    return encode_netstring_fd(payload)
def __getstate__(self):
    # Persist only public attributes, packing each value so the object
    # can be pickled; names starting with '_' are dropped.
    state = {}
    for name, value in self.__dict__.iteritems():
        if name.startswith('_'):
            continue
        state[name] = util.pack(value)
    return state
def test_pack(self):
    """pack/unpack must round-trip datetimes, ints and function code."""
    # assertEquals is a deprecated alias; use assertEqual.
    now = datetime.now()
    self.assertEqual(now, unpack(pack(now)))
    self.assertEqual(666, unpack(pack(666)))
    # Functions don't compare equal after a round-trip, so compare the
    # underlying code objects instead.
    self.assertEqual(function.func_code, unpack(pack(function)).func_code)
def _run(self, **kwargs):
    """Validate *kwargs*, build the job request and submit it to the master.

    On success, ``self.name`` is updated to the job name assigned by the
    master. Raises :class:`DiscoError` on invalid arguments or if the
    master refuses the job.
    """
    # Unknown keys fall back to the class-level defaults.
    jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

    # -- check parameters --

    # Backwards compatibility
    # (fun_map == map, input_files == input)
    if "fun_map" in kwargs:
        kwargs["map"] = kwargs["fun_map"]
    if "input_files" in kwargs:
        kwargs["input"] = kwargs["input_files"]

    if "chunked" in kwargs:
        raise DeprecationWarning("Argument 'chunked' is deprecated")

    # nr_maps is translated into the scheduler's max_cores setting,
    # unless the caller already set max_cores explicitly.
    if "nr_maps" in kwargs:
        sys.stderr.write("Warning: nr_maps is deprecated. "
                         "Use scheduler = {'max_cores': N} instead.\n")
        sched = jobargs["scheduler"].copy()
        if "max_cores" not in sched:
            sched["max_cores"] = int(jobargs["nr_maps"])
        jobargs["scheduler"] = sched

    if not "input" in kwargs:
        raise DiscoError("Argument input is required")
    if not ("map" in kwargs or "reduce" in kwargs):
        raise DiscoError("Specify map and/or reduce")

    for p in kwargs:
        if p not in Job.defaults:
            raise DiscoError("Unknown argument: %s" % p)

    input = kwargs["input"]

    # -- initialize request --
    # All request values must be strings; they are netstring-encoded below.
    request = {
        "prefix": self.name,
        "version": ".".join(map(str, sys.version_info[:2])),
        "params": cPickle.dumps(jobargs["params"], cPickle.HIGHEST_PROTOCOL),
        "sort": str(int(jobargs["sort"])),
        "mem_sort_limit": str(jobargs["mem_sort_limit"]),
        "status_interval": str(jobargs["status_interval"]),
        "profile": str(int(jobargs["profile"])),
    }

    # -- required modules --
    # If not given explicitly, discover modules used by the callable
    # map/reduce functions.
    if "required_modules" in kwargs:
        rm = kwargs["required_modules"]
    else:
        functions = util.flatten(util.iterify(jobargs[f])
                                 for f in self.mapreduce_functions)
        rm = modutil.find_modules([f for f in functions if callable(f)])

    # A tuple entry is (import_name, file_to_send); a bare entry is just
    # an import name.
    send_mod = []
    imp_mod = []
    for mod in rm:
        if type(mod) == tuple:
            send_mod.append(mod[1])
            mod = mod[0]
        imp_mod.append(mod)

    request["required_modules"] = " ".join(imp_mod)
    rf = util.pack_files(send_mod)

    # -- input & output streams --
    for stream in ["map_input_stream", "map_output_stream",
                   "reduce_input_stream", "reduce_output_stream"]:
        self.pack_stack(kwargs, request, stream)

    # -- required files --
    # Explicit required_files are merged over the module files packed above.
    if "required_files" in kwargs:
        if isinstance(kwargs["required_files"], dict):
            rf.update(kwargs["required_files"])
        else:
            rf.update(util.pack_files(kwargs["required_files"]))
    if rf:
        request["required_files"] = util.pack(rf)

    # -- scheduler --
    sched = jobargs["scheduler"]
    sched_keys = ["max_cores", "force_local", "force_remote"]
    # Default max_cores is effectively unlimited.
    if "max_cores" not in sched:
        sched["max_cores"] = 2 ** 31
    elif sched["max_cores"] < 1:
        raise DiscoError("max_cores must be >= 1")
    for k in sched_keys:
        if k in sched:
            request["sched_" + k] = str(sched[k])

    # -- map --
    if "map" in kwargs:
        # A dict-valued map denotes an external (non-Python) map.
        k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
        request[k] = util.pack(kwargs["map"])

        for function_name in ("map_init", "map_reader", "map_writer",
                              "partition", "combiner"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)

        # An iterable input denotes a set of replica urls, joined with
        # newlines (reversed); otherwise expand via urllist.
        def inputlist(input):
            if hasattr(input, "__iter__"):
                return ["\n".join(reversed(list(input)))]
            return util.urllist(input)
        input = [e for i in input for e in inputlist(i)]

    # -- only reduce --
    else:
        # XXX: Check for redundant inputs, external &
        # partitioned inputs
        input = [url for i in input for url in util.urllist(i)]

    request["input"] = " ".join(input)

    if "ext_params" in kwargs:
        e = kwargs["ext_params"]
        request["ext_params"] = encode_netstring_fd(e) if isinstance(e, dict) else e

    # -- reduce --
    nr_reduces = jobargs["nr_reduces"]
    if "reduce" in kwargs:
        # A dict-valued reduce denotes an external (non-Python) reduce.
        k = "ext_reduce" if isinstance(kwargs["reduce"], dict) else "reduce"
        request[k] = util.pack(kwargs["reduce"])
        for function_name in ("reduce_reader", "reduce_writer", "reduce_init"):
            function = jobargs[function_name]
            if function:
                request[function_name] = util.pack(function)
    request["nr_reduces"] = str(nr_reduces)

    # -- encode and send the request --
    reply = self.master.request("/disco/job/new", encode_netstring_fd(request))
    if not reply.startswith("job started:"):
        raise DiscoError("Failed to start a job. Server replied: " + reply)
    # The master replies "job started:<name>"; adopt the assigned name.
    self.name = reply.split(":", 1)[1]
def pack_stack(self, kw, req, stream):
    # Encode a user-supplied stream-function stack into the request under
    # the stream's name; a no-op when the caller didn't provide one.
    if stream not in kw:
        return
    packed = ((f.func_name, util.pack(f))
              for f in util.iterify(kw[stream]))
    req[stream] = encode_netstring_str(packed)