Example #1
    def _run(self, **kwargs):
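        # jobargs resolves each argument from kwargs, falling back to
        # the Job defaults for any key that was not supplied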
        jobargs = util.DefaultDict(self.defaults.__getitem__, kwargs)

        # -- check parameters --

        # Backwards compatibility
        # (fun_map == map, input_files == input)
        if "fun_map" in kwargs:
            kwargs["map"] = kwargs["fun_map"]

        if "input_files" in kwargs:
            kwargs["input"] = kwargs["input_files"]

        if "chunked" in kwargs:
            raise DeprecationWarning("Argument 'chunked' is deprecated")

        if "nr_maps" in kwargs:
            sys.stderr.write("Warning: nr_maps is deprecated. " "Use scheduler = {'max_cores': N} instead.\n")
            sched = jobargs["scheduler"].copy()
            if "max_cores" not in sched:
                sched["max_cores"] = int(jobargs["nr_maps"])
            jobargs["scheduler"] = sched

        if not "input" in kwargs:
            raise DiscoError("Argument input is required")

        if not ("map" in kwargs or "reduce" in kwargs):
            raise DiscoError("Specify map and/or reduce")

        for p in kwargs:
            if p not in Job.defaults:
                raise DiscoError("Unknown argument: %s" % p)

        input = kwargs["input"]

        # -- initialize request --

        request = {
            "prefix": self.name,
            "version": ".".join(map(str, sys.version_info[:2])),
            "params": cPickle.dumps(jobargs["params"], cPickle.HIGHEST_PROTOCOL),
            "sort": str(int(jobargs["sort"])),
            "mem_sort_limit": str(jobargs["mem_sort_limit"]),
            "status_interval": str(jobargs["status_interval"]),
            "profile": str(int(jobargs["profile"])),
        }

        # -- required modules --

        if "required_modules" in kwargs:
            rm = kwargs["required_modules"]
        else:
            functions = util.flatten(util.iterify(jobargs[f]) for f in self.mapreduce_functions)
            rm = modutil.find_modules([f for f in functions if callable(f)])

        send_mod = []
        imp_mod = []
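        # entries in rm are either plain module names or (name, path)
        # tuples; the paths are collected so the module files can be
        # shipped to the workers alongside the request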
        for mod in rm:
            if isinstance(mod, tuple):
                send_mod.append(mod[1])
                mod = mod[0]
            imp_mod.append(mod)

        request["required_modules"] = " ".join(imp_mod)
        rf = util.pack_files(send_mod)

        # -- input & output streams --

        for stream in ["map_input_stream", "map_output_stream", "reduce_input_stream", "reduce_output_stream"]:
            self.pack_stack(kwargs, request, stream)

        # -- required files --

        if "required_files" in kwargs:
            if isinstance(kwargs["required_files"], dict):
                rf.update(kwargs["required_files"])
            else:
                rf.update(util.pack_files(kwargs["required_files"]))
        if rf:
            request["required_files"] = util.pack(rf)

        # -- scheduler --

        sched = jobargs["scheduler"]
        sched_keys = ["max_cores", "force_local", "force_remote"]

        if "max_cores" not in sched:
            sched["max_cores"] = 2 ** 31
        elif sched["max_cores"] < 1:
            raise DiscoError("max_cores must be >= 1")

        for k in sched_keys:
            if k in sched:
                request["sched_" + k] = str(sched[k])

        # -- map --

        if "map" in kwargs:
            k = "ext_map" if isinstance(kwargs["map"], dict) else "map"
            request[k] = util.pack(kwargs["map"])

            for function_name in ("map_init", "map_reader", "map_writer", "partition", "combiner"):
                function = jobargs[function_name]
                if function:
                    request[function_name] = util.pack(function)

            def inputlist(input):
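                # an iterable stands for redundant replicas of a single
                # input, packed into one newline-separated entry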
                if hasattr(input, "__iter__"):
                    return ["\n".join(reversed(list(input)))]
                return util.urllist(input)

            input = [e for i in input for e in inputlist(i)]

        # -- only reduce --

        else:
            # XXX: Check for redundant inputs, external &
            # partitioned inputs
            input = [url for i in input for url in util.urllist(i)]

        request["input"] = " ".join(input)

        if "ext_params" in kwargs:
            e = kwargs["ext_params"]
            request["ext_params"] = encode_netstring_fd(e) if isinstance(e, dict) else e

        # -- reduce --

        nr_reduces = jobargs["nr_reduces"]
        if "reduce" in kwargs:
            k = "ext_reduce" if isinstance(kwargs["reduce"], dict) else "reduce"
            request[k] = util.pack(kwargs["reduce"])

            for function_name in ("reduce_reader", "reduce_writer", "reduce_init"):
                function = jobargs[function_name]
                if function:
                    request[function_name] = util.pack(function)

        request["nr_reduces"] = str(nr_reduces)

        # -- encode and send the request --

        reply = self.master.request("/disco/job/new", encode_netstring_fd(request))

        if not reply.startswith("job started:"):
            raise DiscoError("Failed to start a job. Server replied: " + reply)
        self.name = reply.split(":", 1)[1]
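
For context, here is a minimal sketch of how a job that ends up in `_run` is typically started from user code. It is illustrative only: the master URL, the input URL, and the word-count functions are assumptions, not taken from the examples above. `_run` is the step that packs these arguments into the netstring-encoded request POSTed to /disco/job/new:

    from disco.core import Disco, result_iterator

    def fun_map(e, params):
        # emit a (word, 1) pair for every word on the input line
        return [(w, 1) for w in e.split()]

    def fun_reduce(iter, out, params):
        # sum the per-word counts gathered from the map phase
        totals = {}
        for word, count in iter:
            totals[word] = totals.get(word, 0) + int(count)
        for word, total in totals.iteritems():
            out.add(word, total)

    job = Disco("disco://localhost").new_job(
            name="wordcount",
            input=["http://example.com/words.txt"],
            map=fun_map,
            reduce=fun_reduce)

    for word, total in result_iterator(job.wait()):
        print word, total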
Example #2
        def _run(self, **kw):
                d = lambda x: kw.get(x, Job.defaults[x])

                # -- check parameters --

                # Backwards compatibility
                # (fun_map == map, input_files == input)
                if "fun_map" in kw:
                        kw["map"] = kw["fun_map"]

                if "input_files" in kw:
                        kw["input"] = kw["input_files"]

                if "chunked" in kw:
                        raise DiscoError("Argument 'chunked' is deprecated")

                if not "input" in kw:
                        raise DiscoError("input is required")

                if not ("map" in kw or "reduce" in kw):
                        raise DiscoError("Specify map and/or reduce")

                for p in kw:
                        if p not in Job.defaults:
                                raise DiscoError("Unknown argument: %s" % p)

                inputs = kw["input"]

                # -- initialize request --

                req = {"name": self.name,
                       "version": ".".join(map(str, sys.version_info[:2])),
                       "params": cPickle.dumps(d("params"), cPickle.HIGHEST_PROTOCOL),
                       "sort": str(int(d("sort"))),
                       "mem_sort_limit": str(d("mem_sort_limit")),
                       "status_interval": str(d("status_interval")),
                       "profile": str(int(d("profile")))}

                # -- required modules --

                if "required_modules" in kw:
                        rm = kw["required_modules"]
                else:
                        funlist = []
                        for f in Job.funs:
                                df = d(f)
                                if type(df) == types.FunctionType:
                                        funlist.append(df)
                                elif type(df) == list:
                                        funlist += df
                        rm = modutil.find_modules(funlist)
                send_mod = []
                imp_mod = []
                for mod in rm:
                        if type(mod) == tuple:
                                send_mod.append(mod[1])
                                mod = mod[0]
                        imp_mod.append(mod)

                req["required_modules"] = " ".join(imp_mod)
                rf = util.pack_files(send_mod)

                # -- required files --

                if "required_files" in kw:
                        if type(kw["required_files"]) == dict:
                                rf.update(kw["required_files"])
                        else:
                                rf.update(util.pack_files(\
                                        kw["required_files"]))
                if rf:
                        req["required_files"] = marshal.dumps(rf)

                # -- map --

                if "map" in kw:
                        if type(kw["map"]) == dict:
                                req["ext_map"] = marshal.dumps(kw["map"])
                        else:
                                req["map"] = marshal.dumps(kw["map"].func_code)

                        if "map_init" in kw:
                                req["map_init"] = marshal.dumps(\
                                        kw["map_init"].func_code)

                        req["map_reader"] =\
                                marshal.dumps(d("map_reader").func_code)
                        req["map_writer"] =\
                                marshal.dumps(d("map_writer").func_code)
                        req["partition"] =\
                                marshal.dumps(d("partition").func_code)

                        if "combiner" in kw:
                                req["combiner"] =\
                                        marshal.dumps(kw["combiner"].func_code)

                        parsed_inputs = []
                        for inp in inputs:
                                if type(inp) == list:
                                        parsed_inputs.append(
                                                "\n".join(reversed(inp)))
                                elif inp.startswith("dir://"):
                                        parsed_inputs += util.parse_dir(inp)
                                else:
                                        parsed_inputs.append(inp)
                        inputs = parsed_inputs

                        if "nr_maps" not in kw or kw["nr_maps"] > len(inputs):
                                nr_maps = len(inputs)
                        else:
                                nr_maps = kw["nr_maps"]

                # -- only reduce --

                else:
                        nr_maps = 0
                        ext_inputs = []
                        red_inputs = []
                        for inp in inputs:
                                if type(inp) == list:
                                        raise DiscoError("Reduce doesn't "\
                                                "accept redundant inputs")
                                elif inp.startswith("dir://"):
                                        if inp.endswith(".txt"):
                                                ext_inputs.append(inp)
                                        else:
                                                red_inputs.append(inp)
                                else:
                                        ext_inputs.append(inp)

                        if ext_inputs and red_inputs:
                                raise DiscoError("Can't mix partitioned "\
                                        "inputs with other inputs")
                        elif red_inputs:
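                                # partitioned inputs end in ':<id>';
                                # the partition count is that id plus one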
                                q = lambda x: int(x.split(":")[-1]) + 1
                                nr_red = q(red_inputs[0])
                                for x in red_inputs:
                                        if q(x) != nr_red:
                                                raise DiscoError(\
                                                "Number of partitions must "\
                                                "match in all inputs")
                                n = d("nr_reduces") or nr_red
                                if n != nr_red:
                                        raise DiscoError(
                                        "Specified nr_reduces = %d but "\
                                        "number of partitions in the input "\
                                        "is %d" % (n, nr_red))
                                kw["nr_reduces"] = nr_red
                                inputs = red_inputs
                        elif d("nr_reduces") != 1:
                                raise DiscoError("nr_reduces must be 1 when "\
                                        "using non-partitioned inputs "\
                                        "without the map phase")
                        else:
                                inputs = ext_inputs

                # shuffle fixes a pathological case in the fifo scheduler:
                # if inputs for a node are consequent, data locality will be
                # lost after K inputs where K is the number of cores.
                # Randomizing the order of inputs makes this pathological case
                # unlikely. This issue will be fixed in the new scheduler.
                random.shuffle(inputs)

                req["input"] = " ".join(inputs)
                req["nr_maps"] = str(nr_maps)

                if "ext_params" in kw:
                        if type(kw["ext_params"]) == dict:
                                req["ext_params"] =\
                                        encode_netstring_fd(kw["ext_params"])
                        else:
                                req["ext_params"] = kw["ext_params"]

                # -- reduce --

                nr_reduces = d("nr_reduces")
                if "reduce" in kw:
                        if type(kw["reduce"]) == dict:
                                req["ext_reduce"] = marshal.dumps(kw["reduce"])
                                req["reduce"] = ""
                        else:
                                req["reduce"] = marshal.dumps(
                                        kw["reduce"].func_code)
                        nr_reduces = nr_reduces or min(max(nr_maps / 2, 1), 100)

                        req["reduce_reader"] =\
                                marshal.dumps(d("reduce_reader").func_code)
                        req["reduce_writer"] =\
                                marshal.dumps(d("reduce_writer").func_code)

                        if "reduce_init" in kw:
                                req["reduce_init"] = marshal.dumps(\
                                        kw["reduce_init"].func_code)
                else:
                        nr_reduces = nr_reduces or 0

                req["nr_reduces"] = str(nr_reduces)

                # -- encode and send the request --

                self.msg = encode_netstring_fd(req)
                reply = self.master.request("/disco/job/new", self.msg)

                if reply != "job started":
                        raise DiscoError("Failed to start a job. Server replied: " + reply)