def _push(self, source_target, replicas=None, forceon=[], exclude=[], **kwargs):
    source, target = source_target
    qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                        ('include', ','.join(forceon)),
                                        ('replicas', replicas)) if v])
    urls = self._download('{0}/ddfs/new_blob/{1}?{2}'
                          .format(self.master, target, qs))
    try:
        return [json.loads(bytes_to_str(url))
                for url in self._upload(urls, source, to_master=False, **kwargs)]
    except CommError as e:
        scheme, (host, port), path = urlsplit(e.url)
        if hasattr(source, "seek"):
            # source will be read again; seek to the beginning
            source.seek(0)
        else:
            print("{0} is not seekable, retrying".format(source))
        return self._push((source, target),
                          replicas=replicas,
                          forceon=forceon,
                          exclude=exclude + [host],
                          **kwargs)

def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader
    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(encode(pickle_dumps(value, 0)) + b'\x00')
    out_fd.close()
    if worker:
        worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', "Finished sorting")
    fd = open_local(filename)
    for k, v in sort_reader(fd, fd.url):
        # invert the write path above: bytes -> decode -> unpickle
        yield k, pickle_loads(decode(str_to_bytes(v)))

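# A sketch of the record framing written above, assuming 'encode' and
# 'pickle_dumps' are the same helpers disk_sort uses: each record is the
# raw key, a 0xFF separator, the encoded protocol-0 pickle of the value,
# and a 0x00 terminator -- which is why keys may not contain 0xFF or 0x00.
#
#   record = b'mykey' + b'\xff' + encode(pickle_dumps(42, 0)) + b'\x00'
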
def t_read_until(self, delim, spent=0, bytes=''):
    while not bytes.endswith(delim):
        spent += self.select(spent)
        read_bytes = os.read(self.fd, 1)
        raise_if_empty(read_bytes)
        bytes += bytes_to_str(read_bytes)
    return spent, bytes

def read_index(dir):
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        yield bytes_to_str(line).split()

def cat(program, *urls):
    """Usage: [url ...]

    Concatenate the contents of all url[s] and print to stdout.
    If any of the url[s] are tags,
    the blobs reachable from the tags will be printed after any non-tag url[s].
    """
    from itertools import chain
    from disco.comm import download
    from disco.util import deref, urlresolve, proxy_url
    from disco.compat import bytes_to_str
    ignore_missing = program.options.ignore_missing
    tags, urls = program.separate_tags(*urls)

    def curl(replicas):
        for replica in replicas:
            try:
                return download(proxy_url(urlresolve(replica, master=program.ddfs.master),
                                          to_master=False))
            except Exception as e:
                sys.stderr.write("{0}\n".format(e))
        if not ignore_missing:
            raise Exception("Failed downloading all replicas: {0}".format(replicas))
        return ''

    for replicas in deref(chain(urls, program.blobs(*tags))):
        sys.stdout.write(bytes_to_str(curl(replicas)))

def _download(self, url, data=None, token=None, method="GET", to_master=True): byts = download( self._resolve(proxy_url(url, proxy=self.proxy, meth=method, to_master=to_master)), data=data, method=method, token=self._token(url, token, method), ) return json.loads(bytes_to_str(byts))
def t_read(self, nbytes, spent=0, bytes=''):
    while True:
        spent += self.select(spent)
        read_bytes = os.read(self.fd, nbytes - len(bytes))
        raise_if_empty(read_bytes)
        bytes += bytes_to_str(read_bytes)
        if nbytes <= len(bytes):
            return spent, bytes

def test_large(self):
    self.job = LargeOOBJob().run(input=['raw://{0}'.format(i)
                                        for i in range(self.num_workers)])
    self.assertResults(self.job, [])
    self.assertEquals(sorted((key, bytes_to_str(self.job.oob_get(key)))
                             for key in self.job.oob_list()),
                      sorted(('{0}-{1}'.format(i, j), 'val:{0}-{1}'.format(i, j))
                             for i in range(self.num_workers)
                             for j in range(10)))

def read_index(dir):
    # We might be given replicas of dirs; choose the first.
    if isiterable(dir):
        dir = dir[0]
    from disco.comm import open_url
    file = open_url(proxy_url(dir, to_master=False))
    if dir.endswith(".gz"):
        file = gzip.GzipFile(fileobj=file)
    for line in file:
        label, url, size = bytes_to_str(line).split()
        yield int(label), url, int(size)

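# Each index line holds a label, a URL, and a size separated by whitespace.
# A hypothetical line and the tuple read_index would yield for it:
#
#   b'0 disco://node01/path/to/blob 4096\n'
#   # -> (0, 'disco://node01/path/to/blob', 4096)
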
def input_stream(fd, sze, url, params):
    """Opens a StringIO whose data is everything after the url scheme.

    For example, `raw://hello_world` would return `hello_world` when
    read by the task.
    """
    from disco.compat import StringIO, bytes_to_str
    from disco.util import schemesplit
    scheme, string = schemesplit(url)
    ascii = bytes_to_str(string)
    return (StringIO(ascii), len(ascii), url)

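# A minimal sketch of the behaviour described in the docstring above
# (fd, sze and params are unused, so None stands in for them here):
#
#   stream, size, url = input_stream(None, None, 'raw://hello_world', None)
#   assert stream.read() == 'hello_world' and size == 11
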
def parse_message(msg):
    msg = bytes_to_str(msg)
    try:
        type, payload = msg.split('>', 1)
        payload = payload.strip()
        if type == '**<MSG':
            Worker.send('MSG', payload)
        elif type == '**<ERR':
            Worker.send('FATAL', payload)
        else:
            raise Exception
    except:
        # let master handle erroneous output
        sys.stderr.write(msg)

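# Hypothetical inputs illustrating the two line prefixes parse_message
# recognizes (anything else falls through to stderr):
#
#   parse_message(b'**<MSG> checkpoint written')   # forwarded as a MSG
#   parse_message(b'**<ERR> worker died')          # forwarded as FATAL
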
def _push(self, source_target, replicas=None, exclude=[], **kwargs):
    source, target = source_target
    qs = urlencode([(k, v) for k, v in (('exclude', ','.join(exclude)),
                                        ('replicas', replicas)) if v])
    urls = self._download('{0}/ddfs/new_blob/{1}?{2}'
                          .format(self.master, target, qs))
    try:
        return [json.loads(bytes_to_str(url))
                for url in self._upload(urls, source, to_master=False, **kwargs)]
    except CommError as e:
        scheme, (host, port), path = urlsplit(e.url)
        return self._push((source, target),
                          replicas=replicas,
                          exclude=exclude + [host],
                          **kwargs)

def old_netstr_reader(fd, size, fname, head=b''):
    """
    Reader for Disco's default/internal key-value format.

    Reads output of a map / reduce job as the input for a new job.
    Specify this function as your :func:`map_reader`
    to use the output of a previous job as input to another job.
    """
    if size is None:
        raise ValueError("Content-length must be defined")

    def read_netstr(idx, data, tot):
        ldata = len(data)
        i = 0
        lenstr = ''
        if ldata - idx < 11:
            data = data[idx:] + bytes_to_str(fd.read(8192))
            ldata = len(data)
            idx = 0

        i = data.find(' ', idx, idx + 11)
        if i == -1:
            raise DataError("Corrupted input: "
                            "Could not parse a value length at {0} bytes."
                            .format(tot), fname)
        else:
            lenstr = data[idx:i + 1]
            idx = i + 1

        if ldata < i + 1:
            raise DataError("Truncated input: "
                            "Expected {0} bytes, got {1}"
                            .format(size, tot), fname)

        try:
            llen = int(lenstr)
        except ValueError:
            raise DataError("Corrupted input: "
                            "Could not parse a value length at {0} bytes."
                            .format(tot), fname)

        tot += len(lenstr)

        if ldata - idx < llen + 1:
            data = data[idx:] + bytes_to_str(fd.read(llen + 8193))
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            raise DataError("Truncated input: "
                            "Expected a value of {0} bytes (offset {1} bytes)"
                            .format(llen + 1, tot), fname)

        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg

    data = bytes_to_str(head + fd.read(8192))
    tot = idx = 0
    while tot < size:
        key = val = ''
        idx, data, tot, key = read_netstr(idx, data, tot)
        idx, data, tot, val = read_netstr(idx, data, tot)
        yield key, val

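# Sketch of the length-prefixed format old_netstr_reader consumes: each
# field is an ASCII length, a space, the payload, and one trailing byte
# (a newline in the stream assumed here); keys and values alternate.
# A hypothetical stream and its parse:
#
#   import io
#   raw = b'3 foo\n5 hello\n'
#   list(old_netstr_reader(io.BytesIO(raw), len(raw), 'example'))
#   # -> [('foo', 'hello')]
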
def schemesplit(url):
    url_str = bytes_to_str(url)
    return url_str.split('://', 1) if '://' in url_str else ('', url)

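# Both branches of schemesplit (note the asymmetry: a two-element list
# when a scheme is present, a tuple when it is not):
#
#   schemesplit('raw://hello_world')  # -> ['raw', 'hello_world']
#   schemesplit('hello_world')        # -> ('', 'hello_world')
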
def jobenvs(self):
    dict_offset, envs_offset, home_offset, data_offset = self.offsets(self.jobfile)
    self.jobfile.seek(envs_offset)
    return json.loads(bytes_to_str(self.jobfile.read(home_offset - envs_offset)))

def map(k_v, params):
    yield bytes_to_str(k_v[0]), k_v[1]

def t_read(self, nbytes, spent=0, bytes=''):
    while True:
        spent += self.select(spent)
        bytes += bytes_to_str(os.read(self.fd, nbytes - len(bytes)))
        if nbytes <= len(bytes):
            return spent, bytes

def map(interface, state, label, inp):
    out = interface.output(0)
    for e in inp:
        out.add(int(e), bytes_to_str(e).strip())

def map(e, params):
    yield bytes_to_str(e), ''

def map(e, params):
    return [(w, 1) for w in re.sub(r'\W', ' ', bytes_to_str(e)).lower().split()]

def map(string, params):
    return shuffled((base64.encodestring(str_to_bytes(c)), b'')
                    for c in bytes_to_str(string * 10))

def reduce(iter, params):
    for k, vs in kvgroup(iter):
        yield bytes_to_str(base64.decodestring(k)), len(list(vs))

def map_input_stream2(stream, size, url, params):
    return StringIO('b' + bytes_to_str(stream.read()))

def reduce(interface, state, label, inp):
    for rec in sorted(inp):
        state.append((int(rec), bytes_to_str(rec).strip()))

def Map(interface, state, label, inp):
    out = interface.output(0)
    for i in inp:
        for k, v in shuffled((base64.encodestring(str_to_bytes(c)), b'')
                             for c in bytes_to_str(str_to_bytes(i) * 10)):
            out.add(k, v)

def map(e, params):
    x = bytes_to_str(load_oob(Task.master, params['job'], e))
    assert x == 'value:{0}'.format(e)
    yield 'good', ''

def map(e, params):
    k = bytes_to_str(e)
    v = str_to_bytes('value:{0}'.format(k))
    put(k, v)
    yield k, v

def t_read_until(self, delim, spent=0, bytes=''):
    while not bytes.endswith(delim):
        spent += self.select(spent)
        bytes += bytes_to_str(os.read(self.fd, 1))
    return spent, bytes

def map(e, params):
    x, y = [float(x) for x in bytes_to_str(e).split('|')]
    yield mod1.plusceil(x, y) + math.ceil(1.5), ''

def safe_name(cls, name):
    return unsafe_re.sub('_', bytes_to_str(name))

def map_input_stream1(stream, size, url, params):
    return StringIO("a" + bytes_to_str(stream.read()))

def map(e, params):
    yield int(e), bytes_to_str(e).strip()