Example #1
    def _run(self):
        # Wrap the input iterator so progress is reported to the master
        # as "%s entries reduced".
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            # External (non-Python) reduce: prepare the executable and swap
            # self.reduce for the driver function from the external module.
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        # Report the total size of the connected inputs to the master.
        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        # A reduce with fewer than three arguments is a generator of
        # (key, value) pairs; otherwise it writes to red_out itself.
        if util.argcount(self.reduce) < 3:
            for k, v in self.reduce(entries, params):
                red_out.add(k, v)
        else:
            self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            # Push the results to DDFS and report the resulting tag.
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            # Write the reduce index: a single "<id> <output url>" line,
            # then report the index URL to the master.
            index, index_url = self.reduce_index
            f = file(index, 'w')
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
            f.close()
            OutputURL(index_url)
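
The runner above dispatches on the arity of the user's reduce function: with fewer than three arguments it is treated as a generator of (key, value) pairs, otherwise it is handed the output writer and must call add() itself. A minimal sketch of the two shapes such a function could take, assuming only the calling conventions visible in the code above (the names and bodies here are illustrative, not part of the example):

    # Generator style: argcount < 3, the runner collects the yielded pairs.
    def reduce_generator(entries, params):
        for key, value in entries:
            yield key, value

    # Writer style: argcount >= 3, the function adds results to red_out itself.
    def reduce_writer(entries, red_out, params):
        for key, value in entries:
            red_out.add(key, value)
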
Example #2
    def _run(self):
        # A map task consumes exactly one input.
        if len(self.inputs) != 1:
            TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs))

        # Partitioned map output cannot be saved directly to DDFS.
        if self.save and not self.reduce and self.ispartitioned:
            TaskFailed("Storing partitioned outputs in DDFS is not yet supported")

        if self.ext_map:
            # External (non-Python) map: prepare the executable and swap
            # self.map for the driver function from the external module.
            external.prepare(self.map, self.ext_params, self.path('ext.map'))
            self.map = FunctionType(external.ext_map.func_code,
                                    globals=external.__dict__)
            self.insert_globals([self.map])

        entries = self.track_status(self, "%s entries mapped")
        params  = self.params
        # One output file per partition (at least one, even if the job
        # is not partitioned).
        outputs = [MapOutput(self, i)
                   for i in xrange(max(1, int(self.jobdict['partitions'])))]

        self.init(entries, params)
        # map yields (key, value) pairs; the partition function picks
        # which output bucket each pair is written to.
        for entry in entries:
            for k, v in self.map(entry, params):
                outputs[self.partition(k, len(outputs), params)].add(k, v)

        external.close_ext()

        index, index_url = self.map_index

        # Write the map index: one "<partition> <url>" line per output.
        f = file(index, 'w')
        for i, output in enumerate(outputs):
            print >> f, '%d %s' % (i, output.url)
            output.close()
        sync(f)
        f.close()

        if self.save and not self.reduce:
            # Map-only job with save: push the results straight to DDFS.
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            OutputURL(index_url)
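
Example #2 calls the user's map as self.map(entry, params) and routes each yielded pair with self.partition(k, len(outputs), params). A rough sketch of functions matching those two signatures, assuming only what the calls above imply (the names and bodies are illustrative):

    # Map: called once per input entry, yields (key, value) pairs.
    def map_words(entry, params):
        for word in entry.split():
            yield word, 1

    # Partitioner: maps a key to one of nr_partitions output buckets.
    def partition_by_hash(key, nr_partitions, params):
        return hash(str(key)) % nr_partitions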