def save(self, path, items):
    """Stream *items* into the cache file at *path*, yielding each item through.

    The file layout is: a 4-byte record count (back-patched at the end),
    followed by one msgpack-packed ``(codec, payload)`` pair per item, where
    codec 0 = marshal and 1 = cPickle.

    Generator: caller must consume it for the file to be written.
    """
    # TODO: purge old cache
    with atomic_file(path) as f:
        c = 0
        f.write(struct.pack("I", c))  # placeholder count, patched at the end
        # check is marshalable and compatible with broadcast
        can_marshal = marshalable(items)
        for v in items:
            if can_marshal:
                try:
                    r = 0, marshal.dumps(v)
                except Exception:
                    # marshal failed on this value; fall back to pickle for
                    # this and every subsequent item
                    r = 1, six.moves.cPickle.dumps(v, -1)
                    can_marshal = False
            else:
                r = 1, six.moves.cPickle.dumps(v, -1)
            f.write(msgpack.packb(r))
            c += 1
            yield v

        # renamed from `bytes` to avoid shadowing the builtin
        size = f.tell()
        if size > 10 << 20:
            logger.warning("cached result is %dMB (larger than 10MB)", size >> 20)
        # count
        f.seek(0)
        f.write(struct.pack("I", c))
def save(self, path, items):
    """Write *items* to the cache file at *path*, yielding each one through.

    File layout: a 4-byte count header (rewritten once the stream is
    exhausted), then one msgpack-encoded ``(codec, payload)`` record per
    item (codec 0 = marshal, 1 = cPickle).
    """
    # TODO: purge old cache
    with atomic_file(path) as f:
        count = 0
        f.write(struct.pack("I", count))  # reserve room for the final count
        use_marshal = True
        for item in items:
            if use_marshal:
                try:
                    record = 0, marshal.dumps(item)
                except Exception:
                    # marshal rejected this value; pickle it and stop
                    # attempting marshal for the rest of the stream
                    record = 1, cPickle.dumps(item, -1)
                    use_marshal = False
            else:
                record = 1, cPickle.dumps(item, -1)
            f.write(msgpack.packb(record))
            count += 1
            yield item

        written = f.tell()
        if written > 10 << 20:
            logger.warning("cached result is %dMB (larger than 10MB)", written >> 20)
        # back-patch the record count at the head of the file
        f.seek(0)
        f.write(struct.pack("I", count))
def run(self, attempId):
    """Serialize each shuffle bucket of this partition and write it to disk.

    Each bucket is marshal- or pickle-encoded (flag 'm'/'p'), compressed,
    and written as ``flag + 4-byte total length + payload``. Writes are
    retried up to 3 times with a growing size hint; the last IOError is
    re-raised if all attempts fail.
    """
    logger.debug("shuffling %d of %s", self.partition, self.rdd)
    for i, bucket in self._prepare_shuffle(self.rdd):
        try:
            if marshalable(bucket):
                flag, d = 'm', marshal.dumps(bucket)
            else:
                flag, d = 'p', cPickle.dumps(bucket, -1)
        except ValueError:
            flag, d = 'p', cPickle.dumps(bucket, -1)
        cd = compress(d)
        last_exc = None
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(
                    self.shuffleId, self.partition, i, len(cd) * tried)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                break
            # `except IOError, e` is Py2-only syntax; `as` works on 2.6+ and 3
            except IOError as e:
                # keep a reference: a bare `raise` in the for-else has no
                # active exception (and Py3 unbinds `e` after the handler)
                last_exc = e
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise last_exc
def __init__(self, items):
    """Sort *items* by key and spill them to a gzip-compressed temp file.

    Records are written as ``4-byte length + serialized record``, using
    marshal when possible and falling back to cPickle; ``self.loads`` is
    set to the matching deserializer for later reads.
    """
    self.bufsize = 4096 * 1024
    self.buf = None
    self.offset = 0
    dirs = LocalFileShuffle.shuffleDir
    # Spread load: pick a random non-primary shuffle dir when several are
    # configured, otherwise use the only one.
    self.path = path = os.path.join(
        random.choice(dirs[1:]) if dirs[1:] else dirs[0],
        'shuffle-%s.tmp.gz' % uuid.uuid4().hex)
    with atomic_file(path, bufsize=self.bufsize) as f:
        # wrap the atomic file in a gzip stream; note `f` is rebound here
        f = gzip.GzipFile(fileobj=f)
        items = sorted(items, key=lambda k_v: k_v[0])
        try:
            # fast path: marshal every record
            for i in items:
                s = marshal.dumps(i)
                f.write(struct.pack("I", len(s)))
                f.write(s)
            self.loads = marshal.loads
        except Exception:
            # NOTE(review): GzipFile.rewind() is documented to work only in
            # read mode, and the underlying file is not truncated here —
            # confirm this mid-stream fallback actually produces a valid file.
            f.rewind()
            for i in items:
                s = cPickle.dumps(i)
                f.write(struct.pack("I", len(s)))
                f.write(s)
            self.loads = cPickle.loads
        f.close()
def save(self, path, items):
    """Cache *items* at *path* while yielding each one to the caller.

    Layout: 4-byte record count (patched in after the stream ends), then a
    msgpack-packed ``(codec, payload)`` tuple per item — codec 0 for
    marshal, 1 for cPickle.
    """
    # TODO: purge old cache
    with atomic_file(path) as f:
        n = 0
        f.write(struct.pack("I", n))  # placeholder; rewritten at the end
        prefer_marshal = True
        for value in items:
            rec = None
            if prefer_marshal:
                try:
                    rec = 0, marshal.dumps(value)
                except Exception:
                    # give up on marshal for the remainder of the stream
                    prefer_marshal = False
            if rec is None:
                rec = 1, cPickle.dumps(value, -1)
            f.write(msgpack.packb(rec))
            n += 1
            yield value

        total = f.tell()
        if total > 10 << 20:
            logger.warning("cached result is %dMB (larger than 10MB)", total >> 20)
        # patch the real record count into the header
        f.seek(0)
        f.write(struct.pack("I", n))
def run_without_sorted(self, it):
    """Write each (bucket-index, bucket) pair from *it* as a shuffle output file.

    Buckets are marshal- or pickle-encoded (flag b'm'/b'p'), compressed, and
    written as ``flag + 4-byte total length + payload``. Each write is
    retried up to 3 times with a growing size hint; if all attempts fail the
    last IOError is re-raised.

    Returns the local shuffle server URI.
    """
    for i, bucket in it:
        try:
            if marshalable(bucket):
                flag, d = b'm', marshal.dumps(bucket)
            else:
                flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
        except ValueError:
            flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
        cd = compress(d)
        env.task_stats.bytes_shuffle_write += len(cd)
        last_exc = None
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(
                    self.shuffleId, self.partition, i, len(cd) * tried)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                break
            except IOError as e:
                # Py3 unbinds `e` when the except block exits, so the
                # original `raise e` in the for-else raised NameError;
                # keep an explicit reference instead.
                last_exc = e
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise last_exc
    return LocalFileShuffle.getServerUri()
def compute(self, split):
    """Write this split's rows into a striped columnar file and yield its path.

    Rows are buffered per column; when the (compressed) stripe would exceed
    STRIPE_DATA_SIZE the buffers are flushed as one padded stripe. A footer
    with pickled indices and marshalled field names, plus their lengths, is
    appended at the end.
    """
    buffers = [list() for i in self.fields]
    remain_size = STRIPE_DATA_SIZE
    path = os.path.join(self.path, '%04d.dt' % split.index)
    indices = dict((i, AdaptiveIndex()) for i in self.indices)

    def write_stripe(f, compressed, header, padding=True):
        # One stripe = compressed header (length-prefixed) + column blobs,
        # zero-padded to STRIPE_SIZE unless this is the final stripe.
        h = compress(marshal.dumps(header))
        assert len(h) < STRIPE_HEADER_SIZE
        f.write(struct.pack('I', len(h)))
        f.write(h)
        padding_size = STRIPE_SIZE - len(h) - 4
        for c in compressed:
            f.write(c)
            padding_size -= len(c)
        if padding:
            f.write('\0' * padding_size)

    with atomic_file(path) as f:
        stripe_id = 0
        # NOTE(review): chain(genexpr) concatenates nothing — it yields the
        # per-split iterables themselves, one element per split; if
        # self.prev.iterator(sp) yields individual rows, this probably meant
        # chain.from_iterable(...). Confirm against the iterator contract.
        for it in chain(self.prev.iterator(sp) for sp in split.splits):
            row = it[:len(self.fields)]
            # `size` is the uncompressed estimate used for stripe budgeting
            size = len(marshal.dumps(tuple(row)))
            if size > STRIPE_DATA_SIZE:
                raise RuntimeError('Row too big')
            if size > remain_size:
                # Optimistic check: the compressed buffers may still fit,
                # so recompute the budget before actually flushing.
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                if size > _remain_size:
                    write_stripe(f, compressed, _sizes)
                    buffers = [list() for i in self.fields]
                    remain_size = STRIPE_DATA_SIZE
                    stripe_id += 1
                else:
                    remain_size = _remain_size
            remain_size -= size
            for i, value in enumerate(row):
                buffers[i].append(value)
                field = self.fields[i]
                if field in self.indices:
                    indices[field].add(value, stripe_id)

        if any(buffers):
            # flush the trailing partial stripe, without padding
            compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
            _sizes = tuple(map(len, compressed))
            write_stripe(f, compressed, _sizes, False)

        # footer: indices blob, fields blob, then both lengths for seek-back
        footer_indices = zlib.compress(cPickle.dumps(indices, -1))
        footer_fields = compress(marshal.dumps(self.fields))
        f.write(footer_indices)
        f.write(footer_fields)
        f.write(struct.pack('II', len(footer_fields), len(footer_indices)))
    yield path
def __init__(self, items, rddconf):
    """Sort *items* by key and serialize the stream into a local temp file."""
    self.path = path = LocalFileShuffle.get_tmp()
    with atomic_file(path, bufsize=4096) as f:
        data = items if isinstance(items, list) else list(items)
        data.sort(key=itemgetter(0))
        serializer = get_serializer(rddconf)
        serializer.dump_stream(data, f)
        self.size = f.tell()
        self.num_batch = serializer.num_batch
def _flush(self):
    """Persist pending mutable-dict updates to per-key files and publish them.

    Groups ``self.updated`` by bucket key, merges each group into the
    freshly fetched remote value (None deletes), writes a length-prefixed
    compressed pickle per bucket, registers it with the tracker, and prunes
    files older than the previous generation. No-op when nothing changed.
    """
    if not self.updated:
        return

    updated_keys = {}
    path = self._get_path()
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))

    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}

    # uuid4().get_hex() is Py2-only (removed in Py3); the .hex attribute
    # exists on both 2 and 3.
    uid = uuid.uuid4().hex
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)
            else:
                new[k] = v
        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')
        url = '%s/%s' % (server_uri, filename)
        with atomic_file(fn) as f:
            data = compress(six.moves.cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)
        env.trackerClient.call(
            AddItemMessage('mutable_dict_new:%s' % key, url))

    # NOTE(review): files are named with underscores but globbed with
    # '%s-*' — presumably matches a different naming scheme; verify.
    files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
    for f in files:
        # generation is the second-to-last '_' component of the filename
        if int(f.split('_')[-2]) < self.generation - 1:
            try:
                os.remove(f)
            except OSError:
                pass  # best-effort cleanup

    self.updated.clear()
    self.data = LRUDict(self.cacheLimit)
def _flush(self):
    """Flush pending mutable-dict updates to disk and announce them.

    Buckets ``self.updated`` by key, merges each bucket into the latest
    fetched value (None removes the entry), writes each merged dict as a
    length-prefixed compressed pickle, notifies the tracker, and removes
    stale generation files. Returns immediately when there is nothing to do.
    """
    if not self.updated:
        return

    updated_keys = {}
    path = self._get_path()
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))

    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}

    # .hex works on Py2 and Py3; get_hex() was removed in Py3
    uid = uuid.uuid4().hex
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)
            else:
                new[k] = v
        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')
        url = '%s/%s' % (server_uri, filename)
        with atomic_file(fn) as f:
            data = compress(cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)
        env.trackerClient.call(AddItemMessage('mutable_dict_new:%s' % key, url))

    files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
    for f in files:
        # keep current and previous generation; drop anything older
        if int(f.split('_')[-2]) < self.generation - 1:
            try:
                os.remove(f)
            except OSError:
                pass  # best-effort cleanup

    self.updated.clear()
    self.data = LRUDict(self.cacheLimit)
def run_with_sorted(self, it):
    """Sort each bucket by key and stream it to its shuffle output file.

    Uses a grouped or plain auto-batched serializer depending on
    ``self.iter_values``. Each write is retried up to 3 times; if all
    attempts fail the last IOError is re-raised.

    Returns the local shuffle server URI.
    """
    serializer = GroupByAutoBatchedSerializer(
    ) if self.iter_values else AutoBatchedSerializer()
    for i, bucket in it:
        last_exc = None
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(
                    self.shuffleId, self.partition, i)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    items = sorted(bucket.items(), key=lambda x: x[0])
                    serializer.dump_stream(items, f)
                    env.task_stats.bytes_shuffle_write += f.tell()
                break
            except IOError as e:
                # Py3 unbinds `e` after the except block, so the original
                # `raise e` in the for-else raised NameError; keep a
                # reference to re-raise instead.
                last_exc = e
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise last_exc
    return LocalFileShuffle.getServerUri()
def run(self, attempId):
    """Serialize and write every shuffle bucket of this partition.

    Buckets are marshal- or pickle-encoded (flag 'm'/'p'), compressed, and
    stored as ``flag + 4-byte total length + payload``, retrying each write
    up to 3 times with a growing size hint before re-raising the last
    IOError.
    """
    logger.debug("shuffling %d of %s", self.partition, self.rdd)
    for i, bucket in self._prepare_shuffle(self.rdd):
        try:
            if marshalable(bucket):
                flag, d = 'm', marshal.dumps(bucket)
            else:
                flag, d = 'p', cPickle.dumps(bucket, -1)
        except ValueError:
            flag, d = 'p', cPickle.dumps(bucket, -1)
        cd = compress(d)
        last_exc = None
        for tried in range(1, 4):
            try:
                path = LocalFileShuffle.getOutputFile(
                    self.shuffleId, self.partition, i, len(cd) * tried)
                with atomic_file(path, bufsize=1024 * 4096) as f:
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                break
            # `except IOError, e` is Py2-only; `as` is valid on 2.6+ and 3
            except IOError as e:
                # bare `raise` in the for-else has no active exception on
                # Py3; keep a reference and re-raise it explicitly
                last_exc = e
                logger.warning("write %s failed: %s, try again (%d)",
                               path, e, tried)
        else:
            raise last_exc
def write(self, path):
    """Pickle this object into the 'metadata' file inside directory *path*."""
    metadata_path = os.path.join(path, 'metadata')
    with atomic_file(metadata_path) as out:
        out.write(pickle.dumps(self, -1))