Example #1
    def _run(self):
        # Open the reduce output stream and an iterator over the merged
        # input entries.
        red_out, out_url, fd_list = self.connect_output()
        red_in                    = iter(ReduceReader(self))
        params                    = self.params

        # External (non-Python) reduce: prepare the binary and swap
        # self.reduce for the generic ext_reduce wrapper that drives it.
        if self.ext_reduce:
            path = self.path('EXT_REDUCE')
            external.prepare(self.reduce, self.ext_params, path)
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        Message("Starting reduce")
        self.init(red_in, params)
        self.reduce(red_in, red_out, params)
        Message("Reduce done")

        self.close_output(fd_list)
        external.close_ext()

        # Either push the results to DDFS or record this task's output URL
        # in the shared reduce index.
        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Message("Results pushed to DDFS")
        else:
            index, index_url = self.reduce_index
            safe_update(index, {'%d %s' % (self.id, out_url): True})
            OutputURL(index_url)
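This task calls a user-supplied function with the signature reduce(iter, out, params). A minimal word-count reduce compatible with that call, a sketch in the style of the classic Disco tutorial (kvgroup from disco.util groups a sorted (key, value) stream by key):

    from disco.util import kvgroup

    def reduce(iter, out, params):
        # Inputs arrive as (word, count) pairs; sort so equal keys are
        # adjacent, then sum the counts per word and emit the totals.
        for word, counts in kvgroup(sorted(iter)):
            out.add(word, sum(counts))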
Example #2
    def _run(self):
        # Wrap the input so progress ("N entries reduced") is reported back
        # to the master as entries are consumed.
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        # External (non-Python) reduce: swap self.reduce for the generic
        # ext_reduce wrapper that drives the prepared binary.
        if self.ext_reduce:
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        # A reduce taking fewer than three arguments is generator-style:
        # it yields (key, value) pairs instead of writing to red_out itself.
        if util.argcount(self.reduce) < 3:
            for k, v in self.reduce(entries, params):
                red_out.add(k, v)
        else:
            self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        # Either push the results to DDFS or write this task's output URL
        # into the reduce index, syncing it to disk before publishing.
        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            index, index_url = self.reduce_index
            f = open(index, 'w')
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
            f.close()
            OutputURL(index_url)
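The argcount check above supports two calling conventions. A two-argument reduce yields its results instead of writing them; a sketch of the generator style, with the same word-count logic as before (again assuming kvgroup from disco.util):

    from disco.util import kvgroup

    def reduce(iter, params):
        # Yield (word, total) pairs; the framework adds them to red_out.
        for word, counts in kvgroup(sorted(iter)):
            yield word, sum(counts)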
Example #3
    def _run(self):
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        # Same external-reduce hook as in the previous examples.
        if self.ext_reduce:
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Message("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        # Only the three-argument convention here: the reduce writes
        # directly to red_out.
        self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Message("Results pushed to DDFS")
        else:
            # safe_update merges this entry into the shared index so
            # concurrent reduce tasks do not clobber each other's entries.
            index, index_url = self.reduce_index
            safe_update(index, ['%d %s' % (self.id, out_url)])
            OutputURL(index_url)
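safe_update itself is a Disco internal not shown in these examples. A hypothetical sketch of one way such an index update can be made atomic, writing the merged entries to a temporary file and renaming it over the index (POSIX rename is atomic; a real implementation would also need locking around the read-modify-write):

    import os

    def safe_update(index, entries):
        # Hypothetical sketch, not Disco's actual code: merge new entries
        # (a list of lines, or the keys of a dict) with any existing ones
        # and atomically replace the index file.
        lines = set(entries)
        if os.path.exists(index):
            lines.update(line.strip() for line in open(index))
        tmp = '%s.tmp.%d' % (index, os.getpid())
        f = open(tmp, 'w')
        f.write('\n'.join(sorted(lines)) + '\n')
        f.flush()
        os.fsync(f.fileno())
        f.close()
        os.rename(tmp, index)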
Example #4
    def _run(self):
        # A map task handles exactly one input; report failure otherwise.
        if len(self.inputs) != 1:
            TaskFailed("Map can only handle one input. Got: %s" % ' '.join(self.inputs))

        # External (non-Python) map: swap self.map for the generic ext_map
        # wrapper that drives the prepared binary.
        if self.ext_map:
            external.prepare(self.map, self.ext_params, self.path('EXT_MAP'))
            self.map = FunctionType(external.ext_map.func_code,
                                    globals=external.__dict__)
            self.insert_globals([self.map])

        # One output file per partition; each mapped (k, v) pair is routed
        # to the bucket chosen by the partition function.
        partitions = [MapOutput(self, i) for i in xrange(self.num_partitions)]
        reader, size, url = self.connect_input(self.inputs[0])
        params = self.params
        self.init(reader, params)

        entries = (self.map(entry, params) for entry in reader)
        for kvs in self.track_status(entries, "%s entries mapped"):
            for k, v in kvs:
                p = self.partition(k, self.num_partitions, params)
                partitions[p].add(k, v)

        external.close_ext()

        # Close every partition file and collect its URL into the map index.
        urls = {}
        for i, partition in enumerate(partitions):
            partition.close()
            urls['%d %s' % (i, partition.url)] = True

        index, index_url = self.map_index
        safe_update(index, urls)

        if self.save and not self.reduce:
            if self.ispartitioned:
                TaskFailed("Storing partitioned outputs in DDFS is not yet supported")
            else:
                OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
                Message("Results pushed to DDFS")
        else:
            OutputURL(index_url)
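The inner loop above expects self.map to return an iterable of (key, value) pairs per input entry, and self.partition to pick an output bucket. Minimal sketches of both: the map follows the classic Disco word count, and the partition mirrors what a default hash partitioner typically does (an assumption about its behavior, not a copy of Disco's code):

    def map(entry, params):
        # Emit one (word, 1) pair per word on the input line.
        for word in entry.split():
            yield word, 1

    def partition(key, nr_partitions, params):
        # Route equal keys to the same bucket via a stable hash.
        return hash(str(key)) % nr_partitions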
Example #5
    def _run(self):
        if len(self.inputs) != 1:
            TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs))

        # Fail early instead of discovering the limitation after the work
        # is done.
        if self.save and not self.reduce and self.ispartitioned:
            TaskFailed("Storing partitioned outputs in DDFS is not yet supported")

        if self.ext_map:
            external.prepare(self.map, self.ext_params, self.path('ext.map'))
            self.map = FunctionType(external.ext_map.func_code,
                                    globals=external.__dict__)
            self.insert_globals([self.map])

        entries = self.track_status(self, "%s entries mapped")
        params  = self.params
        # At least one output, even when the job requests zero partitions.
        outputs = [MapOutput(self, i)
                   for i in xrange(max(1, int(self.jobdict['partitions'])))]

        self.init(entries, params)
        for entry in entries:
            for k, v in self.map(entry, params):
                outputs[self.partition(k, len(outputs), params)].add(k, v)

        external.close_ext()

        index, index_url = self.map_index

        # Write the map index, one "partition url" line per output, and
        # sync it to disk before publishing the URL.
        f = open(index, 'w')
        for i, output in enumerate(outputs):
            print >> f, '%d %s' % (i, output.url)
            output.close()
        sync(f)
        f.close()

        if self.save and not self.reduce:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            OutputURL(index_url)
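For context, the map and reduce callables that these _run methods invoke are supplied when a job is submitted. A sketch of an end-to-end word count in the style of the classic Disco tutorial (the submission API varies between Disco versions, so treat the Job and result_iterator names as an assumption):

    from disco.core import Job, result_iterator
    from disco.util import kvgroup

    def map(line, params):
        for word in line.split():
            yield word, 1

    def reduce(iter, params):
        for word, counts in kvgroup(sorted(iter)):
            yield word, sum(counts)

    if __name__ == '__main__':
        # Submit the job and print the aggregated counts once it finishes.
        job = Job().run(input=["http://discoproject.org/media/text/chekhov.txt"],
                        map=map,
                        reduce=reduce)
        for word, count in result_iterator(job.wait(show=True)):
            print word, count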