def pack_kv(e):
    if isinstance(e, tuple):
        k, v = e
    else:
        k = b''
        v = e
    return struct.pack("I", len(k)) + str_to_bytes(k) \
        + struct.pack("I", len(v)) + str_to_bytes(v)
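# A minimal inverse sketch (unpack_kv is hypothetical, not part of the original
# module), assuming the same native-endian "I" length prefixes that pack_kv
# writes; handy for checking that the framing round-trips:
import struct

def unpack_kv(buf):
    I = struct.calcsize("I")          # native unsigned int size (4 on common ABIs)
    klen, = struct.unpack_from("I", buf, 0)
    key = bytes(buf[I:I + klen])
    vlen, = struct.unpack_from("I", buf, I + klen)
    value = bytes(buf[2 * I + klen:2 * I + klen + vlen])
    return key, value

# unpack_kv(pack_kv((b'k', b'v'))) == (b'k', b'v')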
def disk_sort(worker, input, filename, sort_buffer_size='10%'):
    from os.path import getsize
    from disco.comm import open_local
    from disco.fileutils import AtomicFile
    from disco.worker.task_io import re_reader
    if worker:
        worker.send('MSG', "Downloading {0}".format(filename))
    out_fd = AtomicFile(filename)
    for key, value in input:
        if not isinstance(key, bytes):
            raise ValueError("Keys must be bytes for external sort", key)
        if b'\xff' in key or b'\x00' in key:
            raise ValueError("Cannot sort key with 0xFF or 0x00 bytes", key)
        else:
            # value pickled using protocol 0 will always be printable ASCII
            out_fd.write(key + b'\xff')
            out_fd.write(encode(pickle_dumps(value, 0)) + b'\x00')
    out_fd.close()
    if worker:
        worker.send('MSG', "Downloaded {0:s} OK".format(format_size(getsize(filename))))
        worker.send('MSG', "Sorting {0}...".format(filename))
    unix_sort(filename, sort_buffer_size=sort_buffer_size)
    if worker:
        worker.send('MSG', "Finished sorting")
    fd = open_local(filename)
    for k, v in sort_reader(fd, fd.url):
        yield k, bytes_to_str(decode(str_to_bytes(pickle_loads(v))))
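# A minimal sketch of how the 0xFF/0x00 framing written by disk_sort can be
# split back into records (this stands in for disco's sort_reader, whose real
# implementation is not shown here); it is also why keys must not contain
# 0xFF or 0x00 bytes:
def iter_sorted_records(data):
    for record in data.split(b'\x00'):
        if record:
            key, _, value = record.partition(b'\xff')
            yield key, value

# list(iter_sorted_records(b'a\xffV1\x00b\xffV2\x00')) == [(b'a', b'V1'), (b'b', b'V2')]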
def test_save_map(self):
    input = range(10)
    self.job = SaveMapJob().run(input=self.test_server.urls(input))
    results = sorted(self.results(self.job))
    self.tag = self.disco.results(self.job.name)[1][0]
    self.assertEquals(len(list(self.ddfs.blobs(self.tag))), len(input))
    self.assertEquals(results,
                      [(str_to_bytes(str(e) + '!'), '') for e in input])
def test_save(self):
    ducks = ['dewey', 'huey', 'louie']
    a, b = SaveJob1(), SaveJob2()
    self.job = JobChain({a: self.test_server.urls(ducks), b: a}).wait()
    self.tag = self.disco.results(b)[1][0]
    self.assertAllEqual(sorted(self.results(b)),
                        [(str_to_bytes('{0}!?!?'.format(d)), '') for d in ducks])
def test_extreduce(self):
    self.job = ExternalJob().run(input=self.test_server.urls(self.inputs),
                                 map=lambda e, params: [('', e)],
                                 reduce=external([self.binary]))
    ans = str_to_bytes(str(sum(map(ord, ''.join('test_{0}\n'.format(i)
                                                for i in self.inputs)))))
    self.assertEquals([(ans, ans)] * 10, list(self.results(self.job)))
def prepare(params, mode):
    global proc
    # op -> worker
    # find required files
    path = os.path.join('ext.{0}'.format(mode), 'op')
    os.chmod(path, stat.S_IEXEC)
    proc = Popen([path, mode], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    register_poll()
    if params and isinstance(params, dict):
        proc.stdin.write(str_to_bytes(encode_netstring_fd(params)))
    else:
        # empty parameter set; write bytes, since the subprocess pipe is binary
        proc.stdin.write(b'0\n')
    return globals()[mode]
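# Hedged usage sketch for prepare(): dict params are framed with disco's
# encode_netstring_fd and written to the external binary's stdin, while any
# falsy or non-dict params are signalled with the bare b'0\n' seen above. The
# calls below are illustrative, not part of the original module:
op_map = prepare({'delimiter': ','}, 'map')   # spawns ext.map/op, returns this module's map
op_reduce = prepare(None, 'reduce')           # no params: sends b'0\n'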
def test_save_map(self):
    input = range(10)
    self.job = SaveMapJob().run(input=self.test_server.urls(input))
    results = sorted(self.results(self.job))
    self.tag = self.disco.results(self.job.name)[1][0]
    # Previously, each map would save one blob into DDFS. Now, the pipeline
    # termination does it, using the output of the shuffle stage. So the
    # number of blobs in the tag depends on the grouping used for shuffle and
    # on the number of nodes used; hence, we can no longer assert on the
    # number of blobs in the tag.
    # self.assertEquals(len(list(self.ddfs.blobs(self.tag))), len(input))
    self.assertEquals(results,
                      [(str_to_bytes(str(e) + '!'), '') for e in input])
def read(interface, state, label, inp):
    from disco import util
    for e in inp:
        scheme, netloc, _ = util.urlsplit(e)
        fileName, joinColumn = str(netloc).split('?')
        File = open(PREFIX + fileName, 'r')
        col = int(joinColumn)
        reader = csv.reader(File)
        firstRow = True
        for row in reader:
            if firstRow:
                tableName = row[0]
                firstRow = False
            else:
                fullName = tableName + '?' + str(col)
                Hash = int(hashlib.md5(str_to_bytes(row[col])).hexdigest(), 16) % 160
                interface.output(Hash).add(fullName, row)
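# Illustrative check (the row values are hypothetical) that rows sharing a
# join-column value are always routed to the same label, which is what makes
# the downstream join possible:
import hashlib
row_a = ['42', 'widget']
row_b = ['42', 'gadget']
label = lambda row, col: int(hashlib.md5(str_to_bytes(row[col])).hexdigest(), 16) % 160
assert label(row_a, 0) == label(row_b, 0)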
def runTest(self):
    self.job = RawJob().run(input=self.test_server.urls(self.input))
    self.assertEqual(sorted(self.results(self.job)),
                     sorted((str_to_bytes(i), '') for i in self.input))
def send_data(self, data):
    self.send_response(OK)
    self.send_header('Content-length', len(data or ''))
    self.end_headers()
    # only write a body when there is one; str_to_bytes(None) would fail
    if data:
        self.wfile.write(str_to_bytes(data))
def ansi_text(self, text, bgcolor=WHITE, fgcolor=BLACK):
    return self.background(bgcolor) + self.foreground(fgcolor) + str_to_bytes(text)
def map(e, params):
    for i in range(10):
        put('{0}-{1}'.format(e, i), str_to_bytes('val:{0}-{1}'.format(e, i)))
    return []
def map(e, params):
    k = bytes_to_str(e)
    v = str_to_bytes('value:{0}'.format(k))
    put(k, v)
    yield k, v
def Map(interface, state, label, inp):
    out = interface.output(0)
    for i in inp:
        # encodebytes is the Python 3 name for the removed base64.encodestring
        for k, v in shuffled((base64.encodebytes(str_to_bytes(c)), b'')
                             for c in bytes_to_str(str_to_bytes(i) * 10)):
            out.add(k, v)
def checkAnswers(self, job, input):
    self.assertEquals(sorted(self.results(job)),
                      sorted((str_to_bytes(str(i)), '') for i in input))
def runTest(self):
    self.job = SortJob().run(input=self.test_server.urls([''] * 100))
    result = [i for i in self.results(self.job)]
    self.assertResults(self.job, sorted((str_to_bytes(c), 1000) for c in alphanum))
def map(string, params):
    # encodebytes is the Python 3 name for the removed base64.encodestring
    return shuffled((base64.encodebytes(str_to_bytes(c)), b'')
                    for c in bytes_to_str(string * 10))
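# A quick round-trip sketch (not part of the test): regardless of the shuffle
# order, decoding the keys recovers the original characters, ten copies of
# each, since the function expands `string` tenfold before encoding:
import base64
pairs = map('ab', None)                    # the map() defined above; params unused
decoded = sorted(base64.decodebytes(k) for k, _ in pairs)
assert decoded == [b'a'] * 10 + [b'b'] * 10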
def Map(interface, state, label, inp):
    out = interface.output(0)
    for i in inp:
        out.add(str_to_bytes(i), u'\x00\x00')
def Reduce(interface, state, label, inp):
    out = interface.output(0)
    for k, vs in kvgroup(inp):
        out.add(str_to_bytes(k), 0)
def add(self, k, v):
    k, v = str(k), str(v)
    self.stream.write(str_to_bytes("%d %s %d %s\n" % (len(k), k, len(v), v)))
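# A minimal reader sketch for the records written above (read_kv is
# hypothetical, not the original counterpart); add('key', 'value') emits
# b'3 key 5 value\n', and the explicit lengths let keys and values contain
# spaces safely:
def read_kv(stream):
    def read_len():
        digits = b''
        c = stream.read(1)
        while c and c != b' ':
            digits += c
            c = stream.read(1)
        return int(digits)
    k = stream.read(read_len())
    stream.read(1)                  # skip the space after the key
    v = stream.read(read_len())
    stream.read(1)                  # skip the trailing newline
    return k, v

# from io import BytesIO; read_kv(BytesIO(b'3 key 5 value\n')) == (b'key', b'value')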
def read(self):
    if self.isopen:
        return BytesIO(str_to_bytes(self.source)).read
    return open(self.source, 'rb').read
def reduce(iter, params):
    for k, v in iter:
        assert v == get(k)
    x = 'reduce:{0}'.format(this_partition())
    put(x, str_to_bytes('value:{0}'.format(x)))
    yield 'all', 'ok'
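# A hedged trace of the out-of-band store at work (key names illustrative),
# tying this reduce to the map above that stored each pair with put():
#
#   map:    put('k', b'value:k')       and yields ('k', b'value:k')
#   reduce: get('k') == b'value:k'     the assertion checks the OOB round trip
#           put('reduce:0', b'value:reduce:0')   one marker per partition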
def getHash(line):
    return int(hashlib.md5(str_to_bytes(line)).hexdigest(), 16) % 128
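# Usage sketch: unlike Python's salted built-in hash(), the md5-derived label
# is stable across processes and runs, so the same line is always routed to
# the same one of the 128 partitions:
assert getHash('dewey') == getHash('dewey')
assert 0 <= getHash('huey') < 128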