Beispiel #1
0
    def save(self, path, items):
        """Cache *items* to the file at *path* while streaming them through.

        Generator: each item is written to the cache file and then yielded
        unchanged, so callers can consume the data as it is cached.
        File layout: a native-order unsigned int record count (patched in at
        the end via seek(0)), followed by one msgpack-packed (flag, payload)
        pair per item, where flag 0 means marshal-encoded and flag 1 means
        cPickle-encoded.
        """
        # TODO: purge old cache
        with atomic_file(path) as f:
            c = 0
            # Placeholder count; overwritten with the real value below once
            # all items have been written.
            f.write(struct.pack("I", c))
            # check is marshalable and compatible with broadcast
            can_marshal = marshalable(items)
            for v in items:
                if can_marshal:
                    try:
                        r = 0, marshal.dumps(v)
                    except Exception:
                        # Fall back to pickle for this and all later items.
                        r = 1, cPickle.dumps(v, -1)
                        can_marshal = False
                else:
                    r = 1, cPickle.dumps(v, -1)
                f.write(msgpack.packb(r))
                c += 1
                yield v

            bytes = f.tell()  # NOTE(review): shadows the `bytes` builtin
            if bytes > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)", bytes >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", c))
Beispiel #2
0
    def save(self, path, items):
        """Stream *items* into the cache file at *path*, yielding each one.

        The file starts with a native-order unsigned int record count
        (rewritten at the end), then holds one msgpack-packed (flag, payload)
        record per item: flag 0 for marshal payloads, flag 1 for cPickle.
        """
        # TODO: purge old cache
        with atomic_file(path) as f:
            count = 0
            f.write(struct.pack("I", count))
            # check is marshalable and compatible with broadcast
            use_marshal = marshalable(items)
            for item in items:
                record = None
                if use_marshal:
                    try:
                        record = 0, marshal.dumps(item)
                    except Exception:
                        # Stop trying marshal for the remaining items.
                        use_marshal = False
                if record is None:
                    record = 1, cPickle.dumps(item, -1)
                f.write(msgpack.packb(record))
                count += 1
                yield item

            total = f.tell()
            if total > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)",
                               total >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", count))
Beispiel #3
0
    def compute(self, split):
        """Write the rows of *split* into a striped columnar file and yield
        the resulting file path (``self.path/NNNN.dt``).

        Rows are buffered column-wise; when the next row would overflow the
        current stripe's data budget, the buffered columns are compressed and
        flushed as a fixed-size stripe.  After all rows, a footer with the
        pickled indices, the marshalled field list, and both their lengths is
        appended so readers can seek backwards from the end.
        """
        buffers = [list() for i in self.fields]
        remain_size = STRIPE_DATA_SIZE
        path = os.path.join(self.path, '%04d.dt' % split.index)
        indices = dict((i, AdaptiveIndex()) for i in self.indices)

        def write_stripe(f, compressed, header, padding=True):
            # Stripe layout: 4-byte header length, compressed header, column
            # blobs, then zero padding up to STRIPE_SIZE (padding is skipped
            # for the final, partial stripe).
            h = compress(marshal.dumps(header))
            assert len(h) < STRIPE_HEADER_SIZE
            f.write(struct.pack('I', len(h)))
            f.write(h)
            padding_size = STRIPE_SIZE - len(h) - 4
            for c in compressed:
                f.write(c)
                padding_size -= len(c)

            if padding:
                f.write('\0' * padding_size)

        with atomic_file(path) as f:
            stripe_id = 0
            # FIX: chain(generator) iterates the per-split iterator OBJECTS,
            # not their rows, so `it` was never a row.  chain.from_iterable
            # flattens the per-split iterators into a single stream of rows.
            for it in chain.from_iterable(self.prev.iterator(sp)
                                          for sp in split.splits):
                row = it[:len(self.fields)]
                size = len(marshal.dumps(tuple(row)))
                if size > STRIPE_DATA_SIZE:
                    raise RuntimeError('Row too big')

                if size > remain_size:
                    # The stripe looks full: compress what we have and check
                    # the actual compressed size before deciding to flush.
                    compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                    _sizes = tuple(map(len, compressed))
                    _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                    if size > _remain_size:
                        write_stripe(f, compressed, _sizes)
                        buffers = [list() for i in self.fields]
                        remain_size = STRIPE_DATA_SIZE
                        stripe_id += 1
                    else:
                        # Compression bought enough room; keep buffering.
                        remain_size = _remain_size

                remain_size -= size
                for i, value in enumerate(row):
                    buffers[i].append(value)
                    field = self.fields[i]
                    if field in self.indices:
                        indices[field].add(value, stripe_id)

            # Flush the final partial stripe without padding.
            if any(buffers):
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                write_stripe(f, compressed, _sizes, False)

            # Footer: indices, fields, then both lengths (for reverse seek).
            footer_indices = zlib.compress(cPickle.dumps(indices, -1))
            footer_fields = compress(marshal.dumps(self.fields))
            f.write(footer_indices)
            f.write(footer_fields)
            f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

        yield path
Beispiel #4
0
 def __init__(self, items, rddconf):
     """Sort *items* by their first element (the key) and dump the sorted
     stream into a temporary local shuffle file.

     Note: when *items* is already a list it is sorted in place.
     """
     self.path = path = LocalFileShuffle.get_tmp()
     with atomic_file(path, bufsize=4096) as f:
         if not isinstance(items, list):
             items = list(items)  # materialize iterators so we can sort
         items.sort(key=itemgetter(0))
         serializer = get_serializer(rddconf)
         serializer.dump_stream(items, f)
         self.size = f.tell()  # bytes written so far
         self.num_batch = serializer.num_batch
Beispiel #5
0
 def __init__(self, items, rddconf):
     """Sort *items* by key (first element) and spill them to a temp file.

     Lists are sorted in place; other iterables are materialized first.
     """
     self.path = path = LocalFileShuffle.get_tmp()
     with atomic_file(path, bufsize=4096) as out:
         sorted_items = items if isinstance(items, list) else list(items)
         sorted_items.sort(key=itemgetter(0))
         writer = get_serializer(rddconf)
         writer.dump_stream(sorted_items, out)
         self.size = out.tell()
         self.num_batch = writer.num_batch
Beispiel #6
0
 def __init__(self, items, rddconf):
     """Sort *items* by their first element (the key) and dump the sorted
     stream into a temporary work-dir file named "sorted_items".

     Note: when *items* is already a list it is sorted in place.
     """
     self.path = path = env.workdir.alloc_tmp("sorted_items")
     with atomic_file(path, bufsize=4096) as f:
         if not isinstance(items, list):
             items = list(items)  # materialize iterators so we can sort
         items.sort(key=itemgetter(0))
         serializer = get_serializer(rddconf)
         serializer.dump_stream(items, f)
         self.size = f.tell()  # bytes written so far
         self.num_batch = serializer.num_batch
Beispiel #7
0
 def __init__(self, items, rddconf):
     """Sort *items* by key (first element) and spill them to a temp file.

     Lists are sorted in place; other iterables are materialized first.
     """
     self.path = path = env.workdir.alloc_tmp("sorted_items")
     with atomic_file(path, bufsize=4096) as out:
         sorted_items = items if isinstance(items, list) else list(items)
         sorted_items.sort(key=itemgetter(0))
         writer = get_serializer(rddconf)
         writer.dump_stream(sorted_items, out)
         self.size = out.tell()
         self.num_batch = writer.num_batch
Beispiel #8
0
    def _flush(self):
        """Persist pending updates to per-key files and publish their URLs.

        Groups self.updated by partition key, merges each group into the
        value fetched via self._fetch_missing, writes the merged dict as a
        length-prefixed compressed pickle into an exported work directory,
        and registers each file's URL with the tracker.  Old-generation
        files are removed best-effort.  Finally clears the pending updates
        and resets the local LRU cache.
        """
        if not self.updated:
            return

        updated_keys = {}
        dirname = "mutable_dict"
        tmppath = env.workdir.alloc_tmp_dir(dirname)
        path = env.workdir.export(tmppath, dirname)
        uri = env.get('SERVER_URI')
        server_uri = '%s/%s' % (uri, os.path.basename(path))

        # Group pending updates by their partition key.
        for k, v in self.updated.items():
            key = self._get_key(k)
            if key in updated_keys:
                updated_keys[key][k] = v
            else:
                updated_keys[key] = {k: v}

        uid = uuid_pkg.uuid4().get_hex()
        for key, updated in updated_keys.items():
            # Merge this group's updates into the fetched value; a value of
            # None deletes the entry.
            new = self._fetch_missing(key)
            for k, v in updated.items():
                if v is None:
                    new.pop(k)
                else:
                    new[k] = v

            filename = '%s_%s_%s' % (key, self.generation, uid)
            fn = os.path.join(path, filename)
            if os.path.exists(fn):
                raise RuntimeError('conflict uuid for mutable_dict')

            url = '%s/%s' % (server_uri, filename)
            with atomic_file(fn) as f:
                # Payload: little-endian length prefix (includes the 4-byte
                # prefix itself) followed by the compressed pickle.
                data = compress(cPickle.dumps(new))
                f.write(struct.pack('<I', len(data) + 4) + data)

            env.trackerClient.call(
                AddItemMessage('mutable_dict_new:%s' % key, url))

            # NOTE(review): files above are named with '_' separators and a
            # local uid, but this glob matches '%s-*' (dash) against
            # self.uuid — it looks like it may never match; confirm the
            # intended pattern before relying on this cleanup.
            files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
            for f in files:
                if int(f.split('_')[-2]) < self.generation - 1:
                    try:
                        os.remove(f)
                    except OSError:
                        pass

        self.updated.clear()
        self.data = LRUDict(self.cacheLimit)
Beispiel #9
0
    def _flush(self):
        """Write pending updates out as per-key files and announce them.

        Pending entries are grouped by partition key; every group is merged
        into the value returned by self._fetch_missing (None deletes a key),
        serialized as a length-prefixed compressed pickle, and its URL is
        registered with the tracker.  Stale generation files are pruned
        best-effort, then the pending set and local cache are reset.
        """
        if not self.updated:
            return

        grouped = {}
        dirname = "mutable_dict"
        tmppath = env.workdir.alloc_tmp_dir(dirname)
        path = env.workdir.export(tmppath, dirname)
        uri = env.get('SERVER_URI')
        server_uri = '%s/%s' % (uri, os.path.basename(path))

        for k, v in self.updated.items():
            grouped.setdefault(self._get_key(k), {})[k] = v

        uid = uuid_pkg.uuid4().get_hex()
        for key, pending in grouped.items():
            merged = self._fetch_missing(key)
            for k, v in pending.items():
                if v is None:
                    merged.pop(k)
                else:
                    merged[k] = v

            filename = '%s_%s_%s' % (key, self.generation, uid)
            fn = os.path.join(path, filename)
            if os.path.exists(fn):
                raise RuntimeError('conflict uuid for mutable_dict')

            url = '%s/%s' % (server_uri, filename)
            with atomic_file(fn) as f:
                payload = compress(cPickle.dumps(merged))
                f.write(struct.pack('<I', len(payload) + 4) + payload)

            env.trackerClient.call(
                AddItemMessage('mutable_dict_new:%s' % key, url))

            # Best-effort cleanup of files older than the previous generation.
            for stale in glob.glob(os.path.join(path, '%s-*' % self.uuid)):
                if int(stale.split('_')[-2]) < self.generation - 1:
                    try:
                        os.remove(stale)
                    except OSError:
                        pass

        self.updated.clear()
        self.data = LRUDict(self.cacheLimit)
Beispiel #10
0
 def write(self, path):
     """Pickle this object and write it to <path>/metadata via atomic_file."""
     output_file = os.path.join(path, 'metadata')
     with atomic_file(output_file) as f:
         # Protocol -1 selects the highest available pickle protocol.
         f.write(pickle.dumps(self, -1))
Beispiel #11
0
 def write(self, path):
     """Serialize this object with pickle into the 'metadata' file under *path*."""
     target = os.path.join(path, 'metadata')
     payload = pickle.dumps(self, -1)  # -1: highest pickle protocol
     with atomic_file(target) as out:
         out.write(payload)