Example #1
    def _run(self):
        red_out, out_url, fd_list = self.connect_output()
        red_in                    = iter(ReduceReader(self))
        params                    = self.params

        if self.ext_reduce:
            path = self.path('EXT_REDUCE')
            external.prepare(self.reduce, self.ext_params, path)
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        Message("Starting reduce")
        self.init(red_in, params)
        self.reduce(red_in, red_out, params)
        Message("Reduce done")

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Message("Results pushed to DDFS")
        else:
            index, index_url = self.reduce_index
            safe_update(index, {'%d %s' % (self.id, out_url): True})
            OutputURL(index_url)
Example #2
    def _run(self):
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Message("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Message("Results pushed to DDFS")
        else:
            index, index_url = self.reduce_index
            safe_update(index, ['%d %s' % (self.id, out_url)])
            OutputURL(index_url)
Example #3
    def _run(self):
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        if util.argcount(self.reduce) < 3:
            for k, v in self.reduce(entries, *(params, )):
                red_out.add(k, v)
        else:
            self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            index, index_url = self.reduce_index
            f = file(index, 'w')
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
            f.close()
            OutputURL(index_url)
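
The `util.argcount(self.reduce) < 3` branch in Example #3 implies two shapes for the user-supplied reduce callable: a two-argument generator whose yielded pairs the framework forwards to `red_out.add`, and a three-argument function that writes to the output object itself. The sketch below illustrates both shapes; the word-count-style bodies are illustrative assumptions, not code from the listing.

from itertools import groupby
from operator import itemgetter

def reduce_yielding(entries, params):
    # Two-argument form (argcount < 3): the framework iterates the
    # yielded (key, value) pairs and calls red_out.add(k, v) itself.
    # Summing string counts per key is an assumed, word-count-like body.
    for key, group in groupby(sorted(entries), key=itemgetter(0)):
        yield key, sum(int(v) for _, v in group)

def reduce_writing(entries, red_out, params):
    # Three-argument form: receives the output object and writes to it
    # directly, mirroring self.reduce(entries, red_out, params) above.
    for key, value in entries:
        red_out.add(key, value)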
Example #4
def op_map(job):
    msg("Received a new map job!")

    if len(Task.inputs) != 1:
        err("Map can only handle one input. Got: %s" %
            " ".join(Task.inputs))

    global fun_reader, fun_writer, fun_partition
    fun_reader = util.unpack(job['map_reader'], globals=globals())
    fun_writer = util.unpack(job['map_writer'], globals=globals())
    fun_partition = util.unpack(job['partition'], globals=globals())

    global fun_init
    if 'map_init' in job:
        fun_init = util.unpack(job['map_init'], globals=globals())

    global fun_map
    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"

        path = Task.path("EXT_MAP")
        external.prepare(job['ext_map'], map_params, path)
        fun_map = external.ext_map
    else:
        map_params = util.unpack(job['params'], globals=globals())
        fun_map = util.unpack(job['map'], globals=globals())

    global fun_combiner
    if 'combiner' in job:
        fun_combiner = util.unpack(job['combiner'], globals=globals())

    init_common(job)

    nr_part = max(1, Task.num_partitions)

    if 'combiner' in job:
        partitions = [MapOutput(i, map_params, fun_combiner)\
            for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(Task.inputs[0], partitions, map_params)
    external.close_ext()

    urls = {}
    for i, p in enumerate(partitions):
        p.close()
        urls["%d %s" % (i, p.url())] = True

    index, index_url = Task.map_index
    safe_update(index, urls)
    OutputURL(index_url)
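
`safe_update` itself is not part of the listing, so the helper below is only a hedged sketch of the index convention these examples share: one `"<partition id> <url>"` line per partition, written atomically. It is modelled on the `urls` dict built in Example #4 and on the explicit `print >> f, '%d %s' % (...)` writes in Examples #3 and #7, not on the framework's actual implementation.

import os
import tempfile

def write_index(index_path, url_lines):
    # url_lines is a dict shaped like the `urls` built above,
    # e.g. {"0 http://node/map-part-0": True, ...}.
    # Dump to a temporary file, fsync, then rename over the target --
    # one plausible reading of what a "safe" index update needs to do.
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(index_path) or '.')
    with os.fdopen(fd, 'w') as f:
        for line in sorted(url_lines):
            f.write(line + '\n')
        f.flush()
        os.fsync(f.fileno())
    os.rename(tmp, index_path)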
Example #5
def op_reduce(job):
    msg("Received a new reduce job!")

    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])

    global fun_init
    if 'reduce_init' in job:
        fun_init = util.unpack(job['reduce_init'], globals=globals())

    global fun_reader, fun_writer
    fun_reader = util.unpack(job['reduce_reader'], globals=globals())
    fun_writer = util.unpack(job['reduce_writer'], globals=globals())

    global fun_reduce
    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"

        path = Task.path("EXT_REDUCE")
        external.prepare(job['ext_reduce'], red_params, path)
        fun_reduce = external.ext_reduce
    else:
        fun_reduce = util.unpack(job['reduce'], globals=globals())
        red_params = util.unpack(job['params'], globals=globals())

    init_common(job)

    red_in = ReduceReader(Task.inputs, do_sort,
            mem_sort_limit, red_params).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()

    index, index_url = Task.reduce_index
    safe_update(index, {"%d %s" % (Task.id, red_out.url()): True})
    OutputURL(index_url)
Example #6
    def _run(self):
        if len(self.inputs) != 1:
            TaskFailed("Map can only handle one input. Got: %s" % ' '.join(self.inputs))

        if self.ext_map:
            external.prepare(self.map, self.ext_params, self.path('EXT_MAP'))
            self.map = FunctionType(external.ext_map.func_code,
                                    globals=external.__dict__)
            self.insert_globals([self.map])

        partitions = [MapOutput(self, i) for i in xrange(self.num_partitions)]
        reader, sze, url = self.connect_input(self.inputs[0])
        params = self.params
        self.init(reader, params)

        entries = (self.map(entry, params) for entry in reader)
        for kvs in self.track_status(entries, "%s entries mapped"):
            for k, v in kvs:
                p = self.partition(k, self.num_partitions, params)
                partitions[p].add(k, v)

        external.close_ext()

        urls = {}
        for i, partition in enumerate(partitions):
            partition.close()
            urls['%d %s' % (i, partition.url)] = True

        index, index_url = self.map_index
        safe_update(index, urls)

        if self.save and not self.reduce:
            if self.ispartitioned:
                TaskFailed("Storing partitioned outputs in DDFS is not yet supported")
            else:
                OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
                Message("Results pushed to DDFS")
        else:
            OutputURL(index_url)
Example #7
    def _run(self):
        if len(self.inputs) != 1:
            TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs))

        if self.save and not self.reduce and self.ispartitioned:
            TaskFailed("Storing partitioned outputs in DDFS is not yet supported")

        if self.ext_map:
            external.prepare(self.map, self.ext_params, self.path('ext.map'))
            self.map = FunctionType(external.ext_map.func_code,
                                    globals=external.__dict__)
            self.insert_globals([self.map])

        entries = self.track_status(self, "%s entries mapped")
        params  = self.params
        outputs = [MapOutput(self, i)
                   for i in xrange(max(1, int(self.jobdict['partitions'])))]

        self.init(entries, params)
        for entry in entries:
            for k, v in self.map(entry, params):
                outputs[self.partition(k, len(outputs), params)].add(k, v)

        external.close_ext()

        index, index_url = self.map_index

        f = file(index, 'w')
        for i, output in enumerate(outputs):
            print >> f, '%d %s' % (i, output.url)
            output.close()
        sync(f)
        f.close()

        if self.save and not self.reduce:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            OutputURL(index_url)
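
Examples #6 and #7 call `self.map(entry, params)` expecting an iterable of (key, value) pairs and `self.partition(key, n, params)` expecting a bucket index in `range(n)`. A minimal sketch of callables with those shapes follows; the word-count body and the modular-hash partitioner are assumptions for illustration, not taken from the listing.

def word_count_map(entry, params):
    # Must return an iterable of (key, value) pairs for each input entry;
    # splitting a text line into words is an assumed example payload.
    for word in entry.split():
        yield word, 1

def modulo_partition(key, nr_partitions, params):
    # Must return an integer in range(nr_partitions); the framework uses
    # it to choose which MapOutput bucket receives the pair.
    return hash(str(key)) % nr_partitions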