Example #1
def task_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` which returns a handle to a task output.
    The handle ensures that if a task fails, partially written data is ignored.
    """
    from disco.fileutils import AtomicFile
    return AtomicFile(url)
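
The atomicity here comes from AtomicFile itself (disco.fileutils). Below is a minimal sketch of the usual write-to-temp-then-rename pattern behind such handles; the class and attribute names are illustrative only, and Disco's actual implementation may differ:

import os
import tempfile

class AtomicWriteSketch(object):
    """Write to a temp file; rename into place only on a clean close().

    If the task dies before close(), the destination path never appears,
    so readers cannot observe partially written data.
    """
    def __init__(self, path):
        self.path = path
        fd, self.tmppath = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
        self.file = os.fdopen(fd, 'wb')

    def write(self, data):
        self.file.write(data)

    def close(self):
        self.file.flush()
        os.fsync(self.file.fileno())
        self.file.close()
        os.rename(self.tmppath, self.path)  # atomic on POSIX filesystems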
Example #2
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    from os.path import getsize
    from disco.comm import open_local
    from disco.compat import pickle_dumps, pickle_loads
    from disco.util import format_size
    from disco.fileutils import AtomicFile
    # unix_sort and re_reader are helpers defined alongside disk_sort in
    # disco.worker.task_io.
    worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(pickle_dumps(value, 0) + b'\x00')
    out_fd.close()
    worker.send('MSG',
                "Downloaded {0:s} OK".format(format_size(getsize(filename))))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")
    fd = open_local(filename)
    for k, v in re_reader(b"(?s)(.*?)\xff(.*?)\x00", fd, len(fd), fd.url):
        yield k, pickle_loads(v)
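
Why the 0xFF/0x00 sentinels are safe: pickle protocol 0 emits only printable ASCII, so neither byte can appear inside a serialized value, and keys containing them are rejected up front. A small self-contained round-trip of the record format and of the same regex disk_sort passes to re_reader:

import pickle
import re

key, value = b'user:42', {'clicks': 7}
# One sorted record: key 0xFF protocol-0-pickle(value) 0x00
record = key + b'\xff' + pickle.dumps(value, 0) + b'\x00'

# (?s) lets . match the newlines that protocol-0 pickles contain.
k, v = re.match(b'(?s)(.*?)\xff(.*?)\x00', record).groups()
assert k == key and pickle.loads(v) == value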
Example #3
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    import cPickle
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile
    # unix_sort and delimited_reader are module-level helpers in this
    # version of the worker.
    worker.send('MSG', "Downloading %s" % filename)
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if '\xff' in key or '\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write('%s\xff%s\x00' % (key, cPickle.dumps(value, 0)))
    out_fd.close()
    worker.send('MSG', "Downloaded %s OK" % format_size(getsize(filename)))
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', "Finished sorting")
    fd = open_local(filename)
    for k, v in delimited_reader(fd,
                                 len(fd),
                                 fd.url,
                                 delimiter='\xff',
                                 line_terminator='\x00'):
        yield k, cPickle.loads(v)
Example #4
def disk_sort(input, filename, sort_keys, binaries=(), sort_buffer_size='10%',
              desc=False):
    import base64
    import ujson
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    # unix_sort and sort_reader are module-level helpers in this version
    # of the worker.
    out_fd = AtomicFile(filename)
    key_types = None
    MPT = ()  # empty tuple yielded as the value for every sorted key
    for key, _ in input:
        if isinstance(key, (str, unicode)):
            raise ValueError("Keys must be sequences", key)

        # determine if the key is numeric
        if key_types is None:
            key_types = []
            for kt in key:
                try:
                    float(kt)
                    key_types.append('n')
                except (TypeError, ValueError):
                    key_types.append('')

        # Serialize each key column: a NUL byte for None, base64 for binary
        # columns, JSON for everything else.
        for i, each_key in enumerate(key):
            if each_key is None:
                ukey = b'\x00'
            elif i in binaries and key_types[i] != 'n':
                ukey = base64.b64encode(each_key)
            else:
                ukey = ujson.dumps(each_key)
            out_fd.write(ukey)
            out_fd.write(b'\xff')
        out_fd.write(b'\n')
    out_fd.flush()
    out_fd.close()
    unix_sort(filename,
              [(sk, key_types[sk]) for sk in sort_keys],
              sort_buffer_size=sort_buffer_size,
              desc=desc)
    fd = open_local(filename)
    for k in sort_reader(fd, fd.url):
        rval = []
        for i, key in enumerate(k):
            if key == b'\x00':
                rkey = None
            elif i in binaries:
                rkey = base64.b64decode(key)
            else:
                rkey = ujson.loads(key)
            rval.append(rkey)
        yield rval, MPT
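
A condensed round-trip of the per-column key encoding this example uses (a NUL byte for None, base64 for designated binary columns, JSON for everything else), written here for Python 3 with the stdlib json module standing in for ujson:

import base64
import json

binaries = {1}  # illustrative: column 1 holds raw bytes

def encode_key(columns):
    parts = []
    for i, col in enumerate(columns):
        if col is None:
            parts.append(b'\x00')          # NUL marks a missing column
        elif i in binaries:
            parts.append(base64.b64encode(col))
        else:
            parts.append(json.dumps(col).encode('ascii'))
    return b'\xff'.join(parts) + b'\xff'   # 0xFF terminates each column

def decode_key(line):
    columns = []
    for i, part in enumerate(line.split(b'\xff')[:-1]):
        if part == b'\x00':
            columns.append(None)
        elif i in binaries:
            columns.append(base64.b64decode(part))
        else:
            columns.append(json.loads(part))
    return columns

row = ['alice', b'\x01\x02', 3.5, None]
assert decode_key(encode_key(row)) == row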
Example #5
def reduce_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` which returns a handle to a reduce output.
    The handle ensures that if a task fails, partially written data is ignored.
    """
    from disco.fileutils import AtomicFile
    path, url = Task.reduce_output
    Task.blobs.append(path)
    return AtomicFile(path, 'w'), url
Example #6
def map_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` which returns a handle to a partition output.
    The handle ensures that if a task fails, partially written data is ignored.
    """
    from disco.fileutils import AtomicFile
    if Task.ispartitioned:
        path, url = Task.partition_output(partition)
    else:
        path, url = Task.map_output(partition)
    Task.blobs.append(path)
    return AtomicFile(path, 'w'), url
Example #7
def concat_input(cls, task, output_label, replicas):
    # AtomicFile (disco.fileutils) and open_url come from module-level
    # imports in the original source.
    output = AtomicFile(task.output_path(output_label))
    BUFFER_SIZE = 1024 * 1024
    for reps in replicas:
        # Use only the first replica for now, since a set of one
        # is the most common case.
        # TODO: handle falling back to alternative replicas.
        inp = open_url(reps[0])
        buf = inp.read(BUFFER_SIZE)
        while buf:
            output.write(buf)
            buf = inp.read(BUFFER_SIZE)
        inp.close()
    output.close()
    return output.path, output.size()
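
The inner loop is the standard buffered-copy idiom; for file-like objects it is equivalent to shutil.copyfileobj, as this small stand-alone sketch (concat_streams is a hypothetical helper, not part of Disco) shows:

import io
import shutil

def concat_streams(inputs, output, buffer_size=1024 * 1024):
    # Same chunked read/write loop as concat_input, via the stdlib.
    for inp in inputs:
        shutil.copyfileobj(inp, output, buffer_size)

out = io.BytesIO()
concat_streams([io.BytesIO(b'part1;'), io.BytesIO(b'part2')], out)
assert out.getvalue() == b'part1;part2'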
Example #8
    def sorted_entries(self):
        # AtomicFile, Status, comm, func and cPickle come from module-level
        # imports in the original source.
        dlname = self.path('reduce-in-%d.dl' % self.id)
        Status("Downloading %s" % dlname)
        out_fd = AtomicFile(dlname, 'w')
        for key, value in self.entries:
            if not isinstance(key, str):
                raise ValueError("Keys must be strings for external sort", key)
            if '\xff' in key or '\x00' in key:
                raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
            else:
                # value pickled using protocol 0 will always be printable ASCII
                out_fd.write('%s\xff%s\x00' % (key, cPickle.dumps(value, 0)))
        out_fd.close()
        Status("Downloaded OK")

        self.disk_sort(dlname)
        fd, size, url = comm.open_local(dlname)
        for k, v in func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url):
            yield k, cPickle.loads(v)