Example #1
    def _run(self):
        # wrap the input iterator so progress is reported periodically
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            # rebind self.reduce to the external (non-Python) reduce stub
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        # a two-argument reduce yields (key, value) pairs; a three-argument
        # reduce writes to the output itself via red_out.add()
        if util.argcount(self.reduce) < 3:
            for k, v in self.reduce(entries, *(params, )):
                red_out.add(k, v)
        else:
            self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            # write a one-line index mapping this task's id to its output URL
            index, index_url = self.reduce_index
            f = file(index, 'w')
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
            f.close()
            OutputURL(index_url)
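
The `util.argcount` branch above supports two reduce signatures: a two-argument `reduce(entries, params)` that yields pairs, and a three-argument `reduce(entries, out, params)` that writes through `out.add`. Below is a minimal standalone sketch of that dispatch; the `argcount` helper and `ListOutput` sink are illustrative stand-ins, not Disco's own:

def argcount(func):
    # stand-in for util.argcount: number of declared positional parameters
    return func.__code__.co_argcount

class ListOutput(object):
    # stand-in for the red_out sink; collects pairs in memory
    def __init__(self):
        self.pairs = []
    def add(self, k, v):
        self.pairs.append((k, v))

def reduce_gen(entries, params):
    # two-argument style: yields pairs
    for k, v in entries:
        yield k, v

def reduce_sink(entries, out, params):
    # three-argument style: writes pairs itself
    for k, v in entries:
        out.add(k, v)

def run_reduce(reduce_fn, entries, params=None):
    out = ListOutput()
    if argcount(reduce_fn) < 3:
        for k, v in reduce_fn(entries, params):
            out.add(k, v)
    else:
        reduce_fn(entries, out, params)
    return out.pairs

print(run_reduce(reduce_gen, [('a', 1)]))   # [('a', 1)]
print(run_reduce(reduce_sink, [('a', 1)]))  # [('a', 1)]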
Example #2
    def track_status(self, iterator, message_template):
        status_interval = self.status_interval
        # n = -1 keeps the final "Done" count correct for empty inputs
        n = -1
        for n, item in enumerate(iterator):
            if status_interval and (n + 1) % status_interval == 0:
                Status(message_template % (n + 1))
            yield item
        Status("Done: %s" % (message_template % (n + 1)))
Example #3
    def __init__(self, mode, **taskargs):
        from disco import task
        AnnouncePID(os.getpid())
        Status("Received a new %s task!" % mode)
        # 'map' -> task.Map, 'reduce' -> task.Reduce
        self.task = getattr(task, mode.capitalize())(**taskargs)
        self.task.run()
        WorkerDone("Worker done")
Example #4
class Reduce(Task):
    def _run(self):
        # wrap the input iterator so progress is reported periodically
        entries = self.track_status(self, "%s entries reduced")
        red_out, out_url, fd_list = self.connect_output()
        params = self.params

        if self.ext_reduce:
            # rebind self.reduce to the external (non-Python) reduce stub
            external.prepare(self.reduce, self.ext_params, self.path('ext.reduce'))
            self.reduce = FunctionType(external.ext_reduce.func_code,
                                       globals=external.__dict__)
            self.insert_globals([self.reduce])

        total_size = sum(size for fd, size, url in self.connected_inputs)
        Status("Input is %s" % (util.format_size(total_size)))

        self.init(entries, params)
        # a two-argument reduce yields (key, value) pairs; a three-argument
        # reduce writes to the output itself via red_out.add()
        if util.argcount(self.reduce) < 3:
            for k, v in self.reduce(entries, *(params, )):
                red_out.add(k, v)
        else:
            self.reduce(entries, red_out, params)

        self.close_output(fd_list)
        external.close_ext()

        if self.save:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            # write a one-line index mapping this task's id to its output URL
            index, index_url = self.reduce_index
            f = file(index, 'w')
            print >> f, '%d %s' % (self.id, out_url)
            sync(f)
            f.close()
            OutputURL(index_url)

    def __iter__(self):
        # the sort mode selects which entry iterator feeds the reduce
        if self.sort == 'merge':
            return self.merge_sorted_entries
        elif self.sort:
            return self.sorted_entries
        return self.entries

    def disk_sort(self, filename):
        Status("Sorting %s..." % filename)
        try:
            # GNU sort: -z ends records at \x00, -t '\xff' splits the key from
            # the value, -k 1,1 compares keys only, -T . keeps temp files local
            subprocess.check_call(['sort',
                                   '-z',
                                   '-t', '\xff',
                                   '-k', '1,1',
                                   '-T', '.',
                                   '-S', self.sort_buffer_size,
                                   '-o', filename,
                                   filename])
        except subprocess.CalledProcessError, e:
            raise DataError("Sorting %s failed: %s" % (filename, e), filename)
        Status("Finished sorting")
Example #5
    def sorted_entries(self):
        dlname = self.path('reduce-in-%d.dl' % self.id)
        Status("Downloading %s" % dlname)
        out_fd = AtomicFile(dlname, 'w')
        for key, value in self.entries:
            if not isinstance(key, str):
                raise ValueError("Keys must be strings for external sort", key)
            if '\xff' in key or '\x00' in key:
                raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
            else:
                # value pickled using protocol 0 will always be printable ASCII,
                # so the key\xff value\x00 framing stays unambiguous
                out_fd.write('%s\xff%s\x00' % (key, cPickle.dumps(value, 0)))
        out_fd.close()
        Status("Downloaded OK")

        self.disk_sort(dlname)
        # stream the sorted records back, splitting on the same delimiters
        fd, size, url = comm.open_local(dlname)
        for k, v in func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url):
            yield k, cPickle.loads(v)
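
The pattern handed to `func.re_reader` mirrors the writer exactly: a non-greedy key up to \xff, a non-greedy pickled value up to \x00, with (?s) so values containing newlines (protocol-0 pickles do) still match. A standalone round trip using the same pattern, with `re.findall` as a stand-in for `re_reader` (Python 2):

import re
import cPickle

entries = [('a', 1), ('b', [2, 3])]
buf = ''.join('%s\xff%s\x00' % (k, cPickle.dumps(v, 0)) for k, v in entries)

# (?s) lets . cross the newlines inside protocol-0 pickles
for k, v in re.findall('(?s)(.*?)\xff(.*?)\x00', buf):
    print((k, cPickle.loads(v)))  # ('a', 1) then ('b', [2, 3])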
Example #6
    def disk_sort(self, filename):
        Status("Sorting %s..." % filename)
        try:
            subprocess.check_call(['sort',
                                   '-z',
                                   '-t', '\xff',
                                   '-k', '1,1',
                                   '-T', '.',
                                   '-S', self.sort_buffer_size,
                                   '-o', filename,
                                   filename])
        except subprocess.CalledProcessError, e:
            raise DataError("Sorting %s failed: %s" % (filename, e), filename)
Example #7
    def _run(self):
        if len(self.inputs) != 1:
            TaskFailed("Map takes 1 input, got: %s" % ' '.join(self.inputs))

        if self.save and not self.reduce and self.ispartitioned:
            TaskFailed("Storing partitioned outputs in DDFS is not yet supported")

        if self.ext_map:
            # rebind self.map to the external (non-Python) map stub
            external.prepare(self.map, self.ext_params, self.path('ext.map'))
            self.map = FunctionType(external.ext_map.func_code,
                                    globals=external.__dict__)
            self.insert_globals([self.map])

        entries = self.track_status(self, "%s entries mapped")
        params  = self.params
        # one output file per partition, at least one even if unpartitioned
        outputs = [MapOutput(self, i)
                   for i in xrange(max(1, int(self.jobdict['partitions'])))]

        self.init(entries, params)
        for entry in entries:
            # route each mapped pair to the partition chosen by the job's
            # partition function
            for k, v in self.map(entry, params):
                outputs[self.partition(k, len(outputs), params)].add(k, v)

        external.close_ext()

        index, index_url = self.map_index

        # the index maps each partition number to the URL of its output
        f = file(index, 'w')
        for i, output in enumerate(outputs):
            print >> f, '%d %s' % (i, output.url)
            output.close()
        sync(f)
        f.close()

        if self.save and not self.reduce:
            OutputURL(util.ddfs_save(self.blobs, self.jobname, self.master))
            Status("Results pushed to DDFS")
        else:
            OutputURL(index_url)
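
Each mapped pair is routed by `self.partition(k, len(outputs), params)` to one of the per-partition outputs. The sketch below shows a hash partitioner of that shape; the function is an illustrative stand-in, not necessarily Disco's default:

def hash_partition(key, nr_partitions, params):
    # map a key to a stable partition index in [0, nr_partitions)
    return hash(str(key)) % nr_partitions

outputs = [[] for _ in range(3)]
for k, v in [('x', 1), ('y', 2), ('x', 3)]:
    outputs[hash_partition(k, len(outputs), None)].append((k, v))

# pairs sharing a key always land in the same partition
print(outputs)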
Example #8
def map(e, params):
    # a map may emit no pairs at all and only report status
    Status("Internal msg")
    return []
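
This map is used purely for its Status side effect and emits nothing. A more typical user map returns a list of (key, value) pairs; for instance, a word-count map (illustrative, not from the source):

def map(e, params):
    # one (word, 1) pair per whitespace-separated token in the input entry
    return [(word, 1) for word in e.split()]

print(map("to be or not to be", None))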