def save(self, path, items):
    # TODO: purge old cache
    with atomic_file(path) as f:
        c = 0
        f.write(struct.pack("I", c))
        # check whether the items are marshalable and compatible with broadcast
        can_marshal = marshalable(items)
        for v in items:
            if can_marshal:
                try:
                    r = 0, marshal.dumps(v)
                except Exception:
                    r = 1, cPickle.dumps(v, -1)
                    can_marshal = False
            else:
                r = 1, cPickle.dumps(v, -1)
            f.write(msgpack.packb(r))
            c += 1
            yield v

        bytes = f.tell()
        if bytes > 10 << 20:
            logger.warning("cached result is %dMB (larger than 10MB)", bytes >> 20)
        # go back and overwrite the leading placeholder with the real item count
        f.seek(0)
        f.write(struct.pack("I", c))
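# Every snippet in this file writes through atomic_file, which is not defined
# here.  Below is a minimal sketch of such a helper, assuming the usual
# write-to-a-temp-file-then-rename pattern; the temp-file naming, the bufsize
# handling and the fsync are assumptions, not the actual implementation.
import os
from contextlib import contextmanager

@contextmanager
def atomic_file(path, mode='wb', bufsize=-1):
    # Write into a temporary file in the same directory, then rename it over
    # the target, so readers never observe a partially written file.
    tmp = path + '.tmp.%d' % os.getpid()
    f = open(tmp, mode, bufsize)
    try:
        yield f
        f.flush()
        os.fsync(f.fileno())
        f.close()
        os.rename(tmp, path)  # atomic on POSIX when tmp and path share a filesystem
    except Exception:
        f.close()
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise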
def compute(self, split):
    buffers = [list() for i in self.fields]
    remain_size = STRIPE_DATA_SIZE
    path = os.path.join(self.path, '%04d.dt' % split.index)
    indices = dict((i, AdaptiveIndex()) for i in self.indices)

    def write_stripe(f, compressed, header, padding=True):
        # a stripe is: a 4-byte header length, the compressed header, the
        # compressed column chunks, then zero padding up to STRIPE_SIZE
        # (padding is skipped for the final stripe)
        h = compress(marshal.dumps(header))
        assert len(h) < STRIPE_HEADER_SIZE
        f.write(struct.pack('I', len(h)))
        f.write(h)
        padding_size = STRIPE_SIZE - len(h) - 4
        for c in compressed:
            f.write(c)
            padding_size -= len(c)
        if padding:
            f.write('\0' * padding_size)

    with atomic_file(path) as f:
        stripe_id = 0
        for it in chain(self.prev.iterator(sp) for sp in split.splits):
            row = it[:len(self.fields)]
            size = len(marshal.dumps(tuple(row)))
            if size > STRIPE_DATA_SIZE:
                raise RuntimeError('Row too big')

            if size > remain_size:
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                if size > _remain_size:
                    # flush the current stripe and start a new one
                    write_stripe(f, compressed, _sizes)
                    buffers = [list() for i in self.fields]
                    remain_size = STRIPE_DATA_SIZE
                    stripe_id += 1
                else:
                    remain_size = _remain_size

            remain_size -= size
            for i, value in enumerate(row):
                buffers[i].append(value)
                field = self.fields[i]
                if field in self.indices:
                    indices[field].add(value, stripe_id)

        if any(buffers):
            # flush the last stripe without padding
            compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
            _sizes = tuple(map(len, compressed))
            write_stripe(f, compressed, _sizes, False)

        # footer: pickled per-field indices, marshaled field names, and an
        # 8-byte trailer holding their lengths
        footer_indices = zlib.compress(cPickle.dumps(indices, -1))
        footer_fields = compress(marshal.dumps(self.fields))
        f.write(footer_indices)
        f.write(footer_fields)
        f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

    yield path
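# The writer above produces a file laid out as
#   [stripe 0][stripe 1]...[footer_indices][footer_fields][8-byte 'II' trailer]
# where the trailer stores len(footer_fields) and len(footer_indices).
# A hedged sketch of reading the footer back: read_footer is a hypothetical
# name, and decompress is passed in because the inverse of the compress()
# used above is not shown in these snippets (Python 2 assumed, matching the
# writer's use of cPickle).
import os
import struct
import zlib
import marshal
import cPickle

def read_footer(path, decompress):
    with open(path, 'rb') as f:
        f.seek(-8, os.SEEK_END)
        fields_len, indices_len = struct.unpack('II', f.read(8))
        f.seek(-(8 + fields_len + indices_len), os.SEEK_END)
        footer_indices = f.read(indices_len)
        footer_fields = f.read(fields_len)
    fields = marshal.loads(decompress(footer_fields))
    indices = cPickle.loads(zlib.decompress(footer_indices))
    return fields, indices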
def __init__(self, items, rddconf):
    self.path = path = LocalFileShuffle.get_tmp()
    with atomic_file(path, bufsize=4096) as f:
        if not isinstance(items, list):
            items = list(items)
        items.sort(key=itemgetter(0))
        serializer = get_serializer(rddconf)
        serializer.dump_stream(items, f)
        self.size = f.tell()
        self.num_batch = serializer.num_batch
def __init__(self, items, rddconf):
    self.path = path = env.workdir.alloc_tmp("sorted_items")
    with atomic_file(path, bufsize=4096) as f:
        if not isinstance(items, list):
            items = list(items)
        items.sort(key=itemgetter(0))
        serializer = get_serializer(rddconf)
        serializer.dump_stream(items, f)
        self.size = f.tell()
        self.num_batch = serializer.num_batch
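# Both constructors above spill the key-sorted (key, value) pairs to disk via
# serializer.dump_stream.  A hedged sketch of the read side: iter_sorted_items
# is a hypothetical helper, and it assumes the serializer returned by
# get_serializer(rddconf) exposes a load_stream method mirroring dump_stream
# (PySpark-style naming); only the dump side is shown in the snippets above.
def iter_sorted_items(path, serializer):
    with open(path, 'rb') as f:
        for item in serializer.load_stream(f):
            yield item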
def _flush(self):
    if not self.updated:
        return

    updated_keys = {}
    dirname = "mutable_dict"
    tmppath = env.workdir.alloc_tmp_dir(dirname)
    path = env.workdir.export(tmppath, dirname)
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))

    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}

    uid = uuid_pkg.uuid4().get_hex()
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)
            else:
                new[k] = v

        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')

        url = '%s/%s' % (server_uri, filename)
        with atomic_file(fn) as f:
            data = compress(cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)

        env.trackerClient.call(
            AddItemMessage('mutable_dict_new:%s' % key, url))

    files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
    for f in files:
        if int(f.split('_')[-2]) < self.generation - 1:
            try:
                os.remove(f)
            except OSError:
                pass

    self.updated.clear()
    self.data = LRUDict(self.cacheLimit)
def _flush(self):
    if not self.updated:
        return

    updated_keys = {}
    dirname = "mutable_dict"
    tmppath = env.workdir.alloc_tmp_dir(dirname)
    path = env.workdir.export(tmppath, dirname)
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))

    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}

    uid = uuid_pkg.uuid4().get_hex()
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)
            else:
                new[k] = v

        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')

        url = '%s/%s' % (server_uri, filename)
        with atomic_file(fn) as f:
            data = compress(cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)

        env.trackerClient.call(AddItemMessage('mutable_dict_new:%s' % key, url))

    files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
    for f in files:
        if int(f.split('_')[-2]) < self.generation - 1:
            try:
                os.remove(f)
            except OSError:
                pass

    self.updated.clear()
    self.data = LRUDict(self.cacheLimit)
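# Each file written by _flush above is a single length-prefixed record: a
# little-endian uint32 holding the total size (the 4-byte prefix included),
# followed by compress(cPickle.dumps(new)).  A hedged sketch of the matching
# parse step; the function name is hypothetical, and decompress is passed in
# because the inverse of the writer's compress() is not shown here.
import struct
import cPickle

def load_mutable_dict_blob(blob, decompress):
    (total_len,) = struct.unpack('<I', blob[:4])
    data = blob[4:total_len]
    return cPickle.loads(decompress(data))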
def write(self, path):
    output_file = os.path.join(path, 'metadata')
    with atomic_file(output_file) as f:
        f.write(pickle.dumps(self, -1))
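# For completeness, a minimal sketch of reading the metadata back.  The name
# read_metadata is an assumption; the original class presumably has its own
# loader, which is not shown in these snippets.
import os
import pickle

def read_metadata(path):
    with open(os.path.join(path, 'metadata'), 'rb') as f:
        return pickle.load(f)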