def download_and_sort(self, params):
    """
    Spool all reduce inputs into one local file, sort that file with the
    external ``sort`` program and return an iterator over the sorted file.

    Each record is written as ``<key> <value>\\0``. ``err`` is called for
    any key containing a space and for any value containing a zero byte,
    since those characters are used as the field and record separators.

    :param params: passed through to ``connect_input`` and to
                   ``self.multi_file_iterator``.
    :return: whatever ``self.multi_file_iterator`` returns for the sorted
             file.
    """
    download_path = Task.path("REDUCE_DL", Task.id)
    msg("Reduce will be downloaded to %s" % download_path)

    spool = AtomicFile(download_path, "w")
    for url in self.inputs:
        fd, sze, url = connect_input(url, params)
        for key, value in fun_reader(fd, sze, url):
            if " " in key:
                err("Spaces are not allowed in keys "
                    "with external sort.")
            if "\0" in value:
                err("Zero bytes are not allowed in "
                    "values with external sort. "
                    "Consider using base64 encoding.")
            spool.write("%s %s\0" % (key, value))
    spool.close()
    msg("Reduce input downloaded ok")

    msg("Starting external sort")
    sorted_path = Task.path("REDUCE_SORTED", Task.id)
    ensure_path(os.path.dirname(sorted_path))

    # sort(1): numeric sort on the first space-separated field of
    # NUL-terminated records, written to sorted_path.
    sort_cmd = ["sort", "-n", "-k", "1,1", "-z",
                "-t", " ", "-o", sorted_path, download_path]
    exit_status = subprocess.call(sort_cmd)
    if exit_status:
        err("Sorting %s to %s failed (%d)" %
            (download_path, sorted_path, exit_status))
    msg("External sort done: %s" % sorted_path)

    def sorted_record_reader(fd, sze, url):
        # Split the sorted file back into (key, value) pairs.
        return re_reader("(?s)(.*?) (.*?)\000", fd, sze, url)

    return self.multi_file_iterator([sorted_path], params,
                                    reader=sorted_record_reader)
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Generator: spool ``(key, value)`` pairs from `input` into `filename`,
    sort the file with ``unix_sort`` and yield the pairs back from the
    sorted file.

    Records are stored as ``key 0xFF pickled-value 0x00``.

    :param worker: object whose ``send('MSG', text)`` receives progress
                   messages.
    :param input: iterable of ``(key, value)`` pairs; keys must be
                  ``bytes`` without 0xFF or 0x00 bytes.
    :param filename: path of the spool file that is written and sorted.
    :param sort_buffer_size: forwarded to ``unix_sort``.
    :raises ValueError: for a non-bytes key or a key containing a
                        separator byte.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if any(separator in key for separator in (b'\xff', b'\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # Values are pickled with protocol 0 before being written.
        spool.write(key + b'\xff')
        spool.write(pickle_dumps(value, 0) + b'\x00')
    spool.close()

    size_text = format_size(getsize(filename))
    worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
    worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    record_pattern = b"(?s)(.*?)\xff(.*?)\x00"
    records = re_reader(record_pattern, sorted_fd, len(sorted_fd), sorted_fd.url)
    for sorted_key, pickled_value in records:
        yield sorted_key, pickle_loads(pickled_value)
def task_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` that opens the task output at `url`.

    The output is wrapped in an ``AtomicFile`` so that partially written
    data is ignored if the task fails. `stream`, `partition` and `params`
    are accepted for interface compatibility and are not used here.
    """
    from disco.fileutils import AtomicFile

    output_handle = AtomicFile(url)
    return output_handle
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Generator: spool ``(key, value)`` pairs from `input` into `filename`,
    sort the file with ``unix_sort`` and yield the pairs back from the
    sorted file.

    Records are stored as ``key 0xFF encode(pickled-value) 0x00``.

    :param worker: optional; when truthy, progress messages are sent with
                   ``worker.send('MSG', text)``.
    :param input: iterable of ``(key, value)`` pairs; keys must be
                  ``bytes`` without 0xFF or 0x00 bytes.
    :param filename: path of the spool file that is written and sorted.
    :param sort_buffer_size: forwarded to ``unix_sort``.
    :raises ValueError: for a non-bytes key or a key containing a
                        separator byte.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader

    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if any(separator in key for separator in (b'\xff', b'\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # The pickled value goes through encode() before it is stored.
        pickled_value = pickle_dumps(value, 0)
        spool.write(key + b'\xff')
        spool.write(encode(pickled_value) + b'\x00')
    spool.close()

    if worker:
        size_text = format_size(getsize(filename))
        worker.send('MSG', "Downloaded {0:s} OK".format(size_text))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    for sorted_key, stored_value in sort_reader(sorted_fd, sorted_fd.url):
        # NOTE(review): writing applies encode() after pickling, while
        # reading applies decode() after unpickling -- confirm against
        # the helpers that this ordering is intended.
        unpickled = pickle_loads(stored_value)
        yield sorted_key, bytes_to_str(decode(str_to_bytes(unpickled)))
def reduce_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` that opens the reduce output of the task.

    The path and URL come from ``Task.reduce_output``; the path is also
    recorded in ``Task.blobs``. The output is wrapped in an ``AtomicFile``
    so that partially written data is ignored if the task fails.

    :return: ``(AtomicFile, url)``. The `url` argument passed in is
             replaced by the one from ``Task.reduce_output``.
    """
    from disco.fileutils import AtomicFile

    output_path, output_url = Task.reduce_output
    Task.blobs.append(output_path)
    output_handle = AtomicFile(output_path, 'w')
    return output_handle, output_url
def sorted_entries(self):
    """
    Generator: spool ``self.entries`` to a local file, sort it with
    ``self.disk_sort`` and yield the ``(key, value)`` pairs back from the
    sorted file.

    Records are stored as ``key 0xFF pickled-value 0x00``.

    :raises ValueError: if a key is not a ``str`` or contains a 0xFF or
                        0x00 byte.
    """
    spool_path = self.path('reduce-in-%d.dl' % self.id)
    Message("Downloading %s" % spool_path)

    spool = AtomicFile(spool_path, 'w')
    for key, value in self.entries:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort")
        if any(separator in key for separator in ('\xff', '\x00')):
            raise ValueError("Cannot sort keys with 0xFF or 0x00 bytes")
        # Values are pickled with protocol 0 before being written.
        pickled_value = cPickle.dumps(value, 0)
        spool.write('%s\xff%s\x00' % (key, pickled_value))
    spool.close()
    Message("Downloaded OK")

    self.disk_sort(spool_path)

    fd, size, url = comm.open_local(spool_path)
    records = func.re_reader("(?s)(.*?)\xff(.*?)\x00", fd, size, url)
    for sorted_key, pickled_value in records:
        yield sorted_key, cPickle.loads(pickled_value)
def map_output_stream(stream, partition, url, params):
    """
    An :func:`output_stream` that opens the map output for `partition`.

    ``Task.partition_output`` supplies the path and URL when
    ``Task.ispartitioned`` is true, ``Task.map_output`` otherwise. The
    path is also recorded in ``Task.blobs``. The output is wrapped in an
    ``AtomicFile`` so that partially written data is ignored if the task
    fails.

    :return: ``(AtomicFile, url)``. The `url` argument passed in is
             replaced by the one computed here.
    """
    from disco.fileutils import AtomicFile

    locate_output = (Task.partition_output if Task.ispartitioned
                     else Task.map_output)
    output_path, output_url = locate_output(partition)
    Task.blobs.append(output_path)
    output_handle = AtomicFile(output_path, 'w')
    return output_handle, output_url
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    """
    Generator: spool ``(key, value)`` pairs from `input` into `filename`,
    sort the file with ``unix_sort`` and yield the pairs back from the
    sorted file.

    Records are stored as ``key 0xFF pickled-value 0x00``.

    :param worker: object whose ``send('MSG', text)`` receives progress
                   messages.
    :param input: iterable of ``(key, value)`` pairs; keys must be
                  ``str`` without 0xFF or 0x00 bytes.
    :param filename: path of the spool file that is written and sorted.
    :param sort_buffer_size: forwarded to ``unix_sort``.
    :raises ValueError: for a non-string key or a key containing a
                        separator byte.
    """
    from os.path import getsize
    from disco.comm import open_local
    from disco.util import format_size
    from disco.fileutils import AtomicFile

    worker.send('MSG', "Downloading %s" % filename)

    spool = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        if any(separator in key for separator in ('\xff', '\x00')):
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # Values are pickled with protocol 0 before being written.
        pickled_value = cPickle.dumps(value, 0)
        spool.write('%s\xff%s\x00' % (key, pickled_value))
    spool.close()

    size_text = format_size(getsize(filename))
    worker.send('MSG', "Downloaded %s OK" % size_text)
    worker.send('MSG', "Sorting %s..." % filename)
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    worker.send('MSG', ("Finished sorting"))

    sorted_fd = open_local(filename)
    records = delimited_reader(sorted_fd,
                               len(sorted_fd),
                               sorted_fd.url,
                               delimiter='\xff',
                               line_terminator='\x00')
    for sorted_key, pickled_value in records:
        yield sorted_key, cPickle.loads(pickled_value)
def download_and_sort(self):
    """
    Spool all reduce inputs into one local file, sort that file with the
    external ``sort`` program and return an iterator over the sorted file.

    Every ``(key, value)`` pair from every input is written with
    ``self.sort_writer``; the sorted file is read back with
    ``self.sort_reader``.

    :return: whatever ``self.multi_file_iterator`` returns for the sorted
             file.
    """
    download_path = self.task.path('REDUCE_DL', self.task.id)
    Message("Reduce will be downloaded to %s" % download_path)

    spool = AtomicFile(download_path, 'w')
    for input_url in self.inputs:
        reader, _size, _url = self.task.connect_input(input_url)
        for key, value in reader:
            self.sort_writer(spool, key, value)
    spool.close()
    Message("Reduce input downloaded ok")

    Message("Starting external sort")
    sorted_path = self.task.path('REDUCE_SORTED', self.task.id)
    ensure_path(os.path.dirname(sorted_path))

    # sort(1): numeric sort on the first 0xFF-separated field of
    # NUL-terminated records, temp files in the current directory.
    sort_cmd = ['sort', '-n', '-k', '1,1', '-T', '.', '-z',
                '-t', '\xff', '-o', sorted_path, download_path]
    exit_status = subprocess.call(sort_cmd)
    if exit_status:
        # NOTE(review): TaskFailed is instantiated, not raised -- presumably
        # it reports the failure on construction like Message; confirm.
        TaskFailed("Sorting %s to %s failed (%d)" %
                   (download_path, sorted_path, exit_status))
    Message("External sort done: %s" % sorted_path)

    return self.multi_file_iterator(self.sort_reader, inputs=[sorted_path])
def concat_input(cls, task, output_label, replicas):
    """
    Concatenate the inputs described by `replicas` into a single task
    output file.

    :param task: object whose ``output_path(output_label)`` gives the
                 destination path.
    :param output_label: label passed to ``task.output_path``.
    :param replicas: iterable of replica lists; only the first URL of
                     each list is read.
    :return: ``(path, size)`` of the written output.
    """
    output = AtomicFile(task.output_path(output_label))
    BUFFER_SIZE = 1024 * 1024
    for reps in replicas:
        # Use only the first replica for now, since a set of one
        # is the most common case.
        # TODO: handle falling back to alternative replicas.
        inp = open_url(reps[0])
        try:
            # Copy in fixed-size chunks until read() returns nothing.
            buf = inp.read(BUFFER_SIZE)
            while buf:
                output.write(buf)
                buf = inp.read(BUFFER_SIZE)
        finally:
            # Release the input handle even if reading or writing fails.
            inp.close()
    # NOTE(review): output is deliberately only closed on success --
    # presumably AtomicFile publishes the file on close(); confirm.
    output.close()
    return output.path, output.size()
def concat_input(cls, task, output_label, replicas):
    """
    Build one task output file by appending the contents of each input.

    :param task: object whose ``output_path(output_label)`` gives the
                 destination path.
    :param output_label: label passed to ``task.output_path``.
    :param replicas: iterable of replica lists; only the first URL of
                     each list is read.
    :return: ``(path, size)`` of the written output.
    """
    output = AtomicFile(task.output_path(output_label))
    BUFFER_SIZE = 1024 * 1024
    for reps in replicas:
        # Use only the first replica for now, since a set of one
        # is the most common case.
        # TODO: handle falling back to alternative replicas.
        inp = open_url(reps[0])
        try:
            while True:
                buf = inp.read(BUFFER_SIZE)
                if not buf:
                    # An empty read marks the end of this input.
                    break
                output.write(buf)
        finally:
            # Always close the input, including when a read or write
            # raises part-way through the copy.
            inp.close()
    # NOTE(review): output is only closed on the success path --
    # presumably so a failed copy never publishes the file; confirm.
    output.close()
    return output.path, output.size()
def sorted_entries(self):
    """
    Generator over ``self.entries`` in the order produced by
    ``self.disk_sort``.

    The entries are first written to a local spool file as
    ``key 0xFF pickled-value 0x00`` records, the file is sorted, and the
    records are then parsed back and yielded as ``(key, value)`` pairs.

    :raises ValueError: if a key is not a ``str`` or contains a 0xFF or
                        0x00 byte; the offending key is included in the
                        exception arguments.
    """
    record_format = '%s\xff%s\x00'
    record_pattern = "(?s)(.*?)\xff(.*?)\x00"

    spool_path = self.path('reduce-in-%d.dl' % self.id)
    Status("Downloading %s" % spool_path)

    spool = AtomicFile(spool_path, 'w')
    for key, value in self.entries:
        if not isinstance(key, str):
            raise ValueError("Keys must be strings for external sort", key)
        has_separator = '\xff' in key or '\x00' in key
        if has_separator:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        # Values are pickled with protocol 0 before being written.
        spool.write(record_format % (key, cPickle.dumps(value, 0)))
    spool.close()
    Status("Downloaded OK")

    self.disk_sort(spool_path)

    fd, size, url = comm.open_local(spool_path)
    for sorted_key, pickled_value in func.re_reader(record_pattern, fd, size, url):
        yield sorted_key, cPickle.loads(pickled_value)
def disk_sort(input, filename, sort_keys, binaries=(), sort_buffer_size='10%', desc=False):
    """
    Generator: spool the keys of `input` into `filename`, sort the file
    with ``unix_sort`` and yield the keys back from the sorted file.

    Each key is a sequence of fields. Every field is written followed by
    a 0xFF byte, and every key is terminated by a newline. A field is
    stored as a single 0x00 byte for ``None``, as base64 for a
    non-numeric column listed in `binaries`, and as JSON otherwise.
    Whether a column is numeric is decided once, from the first key, by
    trying ``float()`` on each of its fields.

    :param input: iterable of ``(key, value)`` pairs; the values are
                  ignored.
    :param filename: path of the spool file that is written and sorted.
    :param sort_keys: column indexes to sort on; passed to ``unix_sort``
                      as ``(index, type)`` pairs, where type is ``'n'``
                      for numeric columns and ``''`` otherwise.
    :param binaries: column indexes whose fields are base64 encoded.
    :param sort_buffer_size: forwarded to ``unix_sort``.
    :param desc: forwarded to ``unix_sort``.
    :return: yields ``(key_fields, ())`` tuples, ``key_fields`` being a
             list of decoded fields.
    :raises ValueError: if a key is a string rather than a sequence of
                        fields.
    """
    import ujson
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    import base64

    out_fd = AtomicFile(filename)
    key_types = None
    MPT = ()
    for key, _ in input:
        if isinstance(key, (str, unicode)):
            raise ValueError("Keys must be sequences", key)

        # Classify each column as numeric ('n') or not ('') from the
        # first key only.
        if key_types is None:
            key_types = []
            for kt in key:
                try:
                    float(kt)
                except Exception:
                    # Anything float() rejects is a non-numeric column.
                    key_types.append('')
                else:
                    key_types.append('n')

        # Serialize each field as NULL marker, base64 or JSON.
        for i, each_key in enumerate(key):
            if each_key is None:
                ukey = b'\x00'
            elif i in binaries and key_types[i] != 'n':
                ukey = base64.b64encode(each_key)
            else:
                ukey = ujson.dumps(each_key)
            out_fd.write(ukey)
            out_fd.write(b'\xff')
        out_fd.write('\n')
    out_fd.flush()
    out_fd.close()

    if key_types is None:
        # Empty input: no column types are known and there is nothing
        # to sort or yield.
        return

    unix_sort(filename,
              [(sk, key_types[sk]) for sk in sort_keys],
              sort_buffer_size=sort_buffer_size,
              desc=desc)

    fd = open_local(filename)
    for k in sort_reader(fd, fd.url):
        rval = []
        for i, key in enumerate(k):
            if key == b'\x00':
                rkey = None
            elif i in binaries and key_types[i] != 'n':
                # Must mirror the write side: numeric binary columns
                # were stored as JSON, not base64.
                rkey = base64.b64decode(key)
            else:
                rkey = ujson.loads(key)
            rval.append(rkey)
        yield rval, MPT