Example #1
    def save(self, path, items):
        # TODO: purge old cache
        with atomic_file(path) as f:
            c = 0
            f.write(struct.pack("I", c))
            # check whether items are marshalable and compatible with broadcast
            can_marshal = marshalable(items)
            for v in items:
                if can_marshal:
                    try:
                        r = 0, marshal.dumps(v)
                    except Exception:
                        r = 1, six.moves.cPickle.dumps(v, -1)
                        can_marshal = False
                else:
                    r = 1, six.moves.cPickle.dumps(v, -1)
                f.write(msgpack.packb(r))
                c += 1
                yield v

            bytes = f.tell()
            if bytes > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)",
                               bytes >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", c))
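Every snippet on this page writes through atomic_file, whose implementation is not shown. A minimal sketch of what such a helper is commonly assumed to do (illustrative only, not the project's actual code) is to write into a temporary file in the same directory and rename it over the target only when the with-block completes, so readers never observe a half-written file:

    import os
    import tempfile
    from contextlib import contextmanager

    @contextmanager
    def atomic_file(path, mode='wb', bufsize=-1):
        # Hypothetical stand-in: buffer writes in a temp file next to `path`,
        # then atomically rename it into place if the block finishes cleanly.
        d = os.path.dirname(path) or '.'
        fd, tmp = tempfile.mkstemp(dir=d, prefix=os.path.basename(path) + '.')
        f = os.fdopen(fd, mode, bufsize)
        try:
            yield f
            f.flush()
            os.fsync(f.fileno())
            f.close()
            os.rename(tmp, path)  # atomic on POSIX within one filesystem
        except BaseException:
            f.close()
            os.unlink(tmp)
            raise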
Example #2
    def save(self, path, items):
        # TODO: purge old cache
        with atomic_file(path) as f:
            c = 0
            f.write(struct.pack("I", c))
            try_marshal = True
            for v in items:
                if try_marshal:
                    try:
                        r = 0, marshal.dumps(v)
                    except Exception:
                        r = 1, cPickle.dumps(v, -1)
                        try_marshal = False
                else:
                    r = 1, cPickle.dumps(v, -1)
                f.write(msgpack.packb(r))
                c += 1
                yield v

            bytes = f.tell()
            if bytes > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)",
                               bytes >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", c))
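The file produced by save() above is a 4-byte record count followed by one msgpack-encoded (flag, payload) pair per item, where flag 0 marks a marshal payload and flag 1 a pickle payload. A hypothetical reader for that layout (a load() counterpart is not part of the snippet) might look like:

    import marshal
    import pickle
    import struct

    import msgpack

    def load(path):
        # Hypothetical counterpart to save(): read the count header, then
        # decode each (flag, payload) record with marshal or pickle.
        with open(path, 'rb') as f:
            count, = struct.unpack("I", f.read(4))
            unpacker = msgpack.Unpacker(f, raw=True)
            for _ in range(count):
                flag, payload = next(unpacker)
                yield marshal.loads(payload) if flag == 0 else pickle.loads(payload)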
Example #3
    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        for i, bucket in self._prepare_shuffle(self.rdd):
            try:
                if marshalable(bucket):
                    flag, d = 'm', marshal.dumps(bucket)
                else:
                    flag, d = 'p', cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(bucket, -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i,
                        len(cd) * tried)
                    with atomic_file(path, bufsize=1024 * 4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)

                    break
                except IOError as e:
                    logger.warning("write %s failed: %s, try again (%d)", path,
                                   e, tried)
            else:
                raise e
Example #4
    def __init__(self, items):
        self.bufsize = 4096 * 1024
        self.buf = None
        self.offset = 0
        dirs = LocalFileShuffle.shuffleDir
        self.path = path = os.path.join(
            random.choice(dirs[1:]) if dirs[1:] else dirs[0],
            'shuffle-%s.tmp.gz' % uuid.uuid4().hex)

        with atomic_file(path, bufsize=self.bufsize) as raw:
            f = gzip.GzipFile(fileobj=raw)
            items = sorted(items, key=lambda k_v: k_v[0])
            try:
                for i in items:
                    s = marshal.dumps(i)
                    f.write(struct.pack("I", len(s)))
                    f.write(s)
                self.loads = marshal.loads
            except Exception:
                # marshal failed: restart the gzip stream on a truncated file
                # and fall back to pickle (GzipFile cannot rewind in write mode)
                f.close()
                raw.seek(0)
                raw.truncate()
                f = gzip.GzipFile(fileobj=raw)
                for i in items:
                    s = cPickle.dumps(i)
                    f.write(struct.pack("I", len(s)))
                    f.write(s)
                self.loads = cPickle.loads
            f.close()
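The gzip stream written above holds length-prefixed records: a 4-byte size followed by the marshaled or pickled item, with self.loads remembering which serializer was used. A hypothetical reader for that format (not shown in the snippet):

    import gzip
    import struct

    def read_records(path, loads):
        # Hypothetical reader: walk the length-prefixed records in the gzip file.
        with gzip.open(path, 'rb') as f:
            while True:
                head = f.read(4)
                if len(head) < 4:
                    break
                size, = struct.unpack("I", head)
                yield loads(f.read(size))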
Example #5
    def save(self, path, items):
        # TODO: purge old cache
        with atomic_file(path) as f:
            c = 0
            f.write(struct.pack("I", c))
            try_marshal = True
            for v in items:
                if try_marshal:
                    try:
                        r = 0, marshal.dumps(v)
                    except Exception:
                        r = 1, cPickle.dumps(v, -1)
                        try_marshal = False
                else:
                    r = 1, cPickle.dumps(v, -1)
                f.write(msgpack.packb(r))
                c += 1
                yield v

            bytes = f.tell()
            if bytes > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)", bytes >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", c))
Example #6
    def run_without_sorted(self, it):
        for i, bucket in it:
            try:
                if marshalable(bucket):
                    flag, d = b'm', marshal.dumps(bucket)
                else:
                    flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
            cd = compress(d)
            env.task_stats.bytes_shuffle_write += len(cd)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i,
                        len(cd) * tried)
                    with atomic_file(path, bufsize=1024 * 4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)

                    break
                except IOError as e:
                    # keep a reference: 'e' is cleared when the except block ends
                    last_err = e
                    logger.warning("write %s failed: %s, try again (%d)", path,
                                   e, tried)
            else:
                raise last_err

        return LocalFileShuffle.getServerUri()
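Each bucket file written by run_without_sorted() starts with a one-byte flag (b'm' or b'p'), then a 4-byte length equal to 5 plus the size of the compressed payload, then the payload itself. A hypothetical reader is sketched below; zlib.decompress merely stands in for whatever inverts the project's compress(), which is an assumption:

    import marshal
    import pickle
    import struct
    import zlib

    decompress = zlib.decompress  # assumption: stand-in for the inverse of compress()

    def read_bucket(path):
        # Hypothetical reader for the flag + length + compressed-payload layout.
        with open(path, 'rb') as f:
            flag = f.read(1)
            total, = struct.unpack("I", f.read(4))  # stored as 5 + len(compressed)
            data = decompress(f.read(total - 5))
            return marshal.loads(data) if flag == b'm' else pickle.loads(data)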
Example #7
    def compute(self, split):
        buffers = [list() for i in self.fields]
        remain_size = STRIPE_DATA_SIZE
        path = os.path.join(self.path, '%04d.dt' % split.index)
        indices = dict((i, AdaptiveIndex()) for i in self.indices)

        def write_stripe(f, compressed, header, padding=True):
            h = compress(marshal.dumps(header))
            assert len(h) < STRIPE_HEADER_SIZE
            f.write(struct.pack('I', len(h)))
            f.write(h)
            padding_size = STRIPE_SIZE - len(h) - 4
            for c in compressed:
                f.write(c)
                padding_size -= len(c)

            if padding:
                f.write('\0' * padding_size)

        with atomic_file(path) as f:
            stripe_id = 0
            for it in chain.from_iterable(self.prev.iterator(sp) for sp in split.splits):
                row = it[:len(self.fields)]
                size = len(marshal.dumps(tuple(row)))
                if size > STRIPE_DATA_SIZE:
                    raise RuntimeError('Row too big')

                if size > remain_size:
                    compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                    _sizes = tuple(map(len, compressed))
                    _remain_size = STRIPE_DATA_SIZE - sum(_sizes)
                    if size > _remain_size:
                        write_stripe(f, compressed, _sizes)
                        buffers = [list() for i in self.fields]
                        remain_size = STRIPE_DATA_SIZE
                        stripe_id += 1
                    else:
                        remain_size = _remain_size

                remain_size -= size
                for i, value in enumerate(row):
                    buffers[i].append(value)
                    field = self.fields[i]
                    if field in self.indices:
                        indices[field].add(value, stripe_id)

            if any(buffers):
                compressed = [compress(marshal.dumps(tuple(b))) for b in buffers]
                _sizes = tuple(map(len, compressed))
                write_stripe(f, compressed, _sizes, False)

            footer_indices = zlib.compress(cPickle.dumps(indices, -1))
            footer_fields = compress(marshal.dumps(self.fields))
            f.write(footer_indices)
            f.write(footer_fields)
            f.write(struct.pack('II', len(footer_fields), len(footer_indices)))

        yield path
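The stripe file ends with a zlib-compressed pickle of the indices, a compressed marshal of the field names, and an 8-byte trailer holding their two lengths, so a reader can locate the footer by seeking back from the end of the file. A hypothetical sketch (zlib.decompress again stands in for the inverse of compress()):

    import marshal
    import pickle
    import struct
    import zlib

    decompress = zlib.decompress  # assumption: stand-in for the inverse of compress()

    def read_footer(path):
        # Hypothetical reader for the trailer written by compute():
        # ... stripes ... | footer_indices | footer_fields | len(fields), len(indices)
        with open(path, 'rb') as f:
            f.seek(-8, 2)
            len_fields, len_indices = struct.unpack('II', f.read(8))
            f.seek(-(8 + len_fields + len_indices), 2)
            indices = pickle.loads(zlib.decompress(f.read(len_indices)))
            fields = marshal.loads(decompress(f.read(len_fields)))
            return fields, indices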
Example #8
    def __init__(self, items, rddconf):
        self.path = path = LocalFileShuffle.get_tmp()
        with atomic_file(path, bufsize=4096) as f:
            if not isinstance(items, list):
                items = list(items)
            items.sort(key=itemgetter(0))
            serializer = get_serializer(rddconf)
            serializer.dump_stream(items, f)
            self.size = f.tell()
            self.num_batch = serializer.num_batch
Example #9
    def _flush(self):
        if not self.updated:
            return

        updated_keys = {}
        path = self._get_path()
        uri = env.get('SERVER_URI')
        server_uri = '%s/%s' % (uri, os.path.basename(path))

        for k, v in self.updated.items():
            key = self._get_key(k)
            if key in updated_keys:
                updated_keys[key][k] = v
            else:
                updated_keys[key] = {k: v}

        uid = uuid.uuid4().hex
        for key, updated in updated_keys.items():
            new = self._fetch_missing(key)
            for k, v in updated.items():
                if v is None:
                    new.pop(k)
                else:
                    new[k] = v

            filename = '%s_%s_%s' % (key, self.generation, uid)
            fn = os.path.join(path, filename)
            if os.path.exists(fn):
                raise RuntimeError('conflict uuid for mutable_dict')

            url = '%s/%s' % (server_uri, filename)
            with atomic_file(fn) as f:
                data = compress(six.moves.cPickle.dumps(new))
                f.write(struct.pack('<I', len(data) + 4) + data)

            env.trackerClient.call(
                AddItemMessage('mutable_dict_new:%s' % key, url))

            files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
            for f in files:
                if int(f.split('_')[-2]) < self.generation - 1:
                    try:
                        os.remove(f)
                    except OSError:
                        pass

        self.updated.clear()
        self.data = LRUDict(self.cacheLimit)
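Each chunk written by _flush() is a little-endian 4-byte length (covering the header itself plus the compressed pickle of the updated dict) followed by the data. A hypothetical reader, with zlib standing in for the project's compress():

    import pickle
    import struct
    import zlib

    decompress = zlib.decompress  # assumption: stand-in for the inverse of compress()

    def read_chunk(path):
        # Hypothetical reader: the '<I' header stores len(data) + 4.
        with open(path, 'rb') as f:
            total, = struct.unpack('<I', f.read(4))
            return pickle.loads(decompress(f.read(total - 4)))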
Example #10
    def _flush(self):
        if not self.updated:
            return

        updated_keys = {}
        path = self._get_path()
        uri = env.get('SERVER_URI')
        server_uri = '%s/%s' % (uri, os.path.basename(path))

        for k,v in self.updated.items():
            key = self._get_key(k)
            if key in updated_keys:
                updated_keys[key][k] = v
            else:
                updated_keys[key] = {k:v}

        uid = uuid.uuid4().hex
        for key, updated in updated_keys.items():
            new = self._fetch_missing(key)
            for k,v in updated.items():
                if v is None:
                    new.pop(k)
                else:
                    new[k] = v

            filename = '%s_%s_%s' % (key, self.generation, uid)
            fn = os.path.join(path, filename)
            if os.path.exists(fn):
                raise RuntimeError('conflict uuid for mutable_dict')

            url = '%s/%s' % (server_uri, filename)
            with atomic_file(fn) as f:
                data = compress(cPickle.dumps(new))
                f.write(struct.pack('<I', len(data)+4) + data)

            env.trackerClient.call(AddItemMessage('mutable_dict_new:%s' % key, url))

            files = glob.glob(os.path.join(path, '%s-*' % self.uuid ))
            for f in files:
                if int(f.split('_')[-2]) < self.generation -1:
                    try:
                        os.remove(f)
                    except OSError:
                        pass

        self.updated.clear()
        self.data = LRUDict(self.cacheLimit)
Example #11
    def run_with_sorted(self, it):
        serializer = (GroupByAutoBatchedSerializer() if self.iter_values
                      else AutoBatchedSerializer())
        for i, bucket in it:
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i)
                    with atomic_file(path, bufsize=1024 * 4096) as f:
                        items = sorted(bucket.items(), key=lambda x: x[0])
                        serializer.dump_stream(items, f)
                        env.task_stats.bytes_shuffle_write += f.tell()
                    break
                except IOError as e:
                    # keep a reference: 'e' is cleared when the except block ends
                    last_err = e
                    logger.warning("write %s failed: %s, try again (%d)", path,
                                   e, tried)
            else:
                raise last_err
        return LocalFileShuffle.getServerUri()
Example #12
    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        for i, bucket in self._prepare_shuffle(self.rdd):
            try:
                if marshalable(bucket):
                    flag, d = 'm', marshal.dumps(bucket)
                else:
                    flag, d = 'p', cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(bucket, -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i, len(cd) * tried)
                    with atomic_file(path, bufsize=1024*4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)

                    break
                except IOError as e:
                    logger.warning("write %s failed: %s, try again (%d)", path, e, tried)
            else:
                raise e
Example #13
    def write(self, path):
        output_file = os.path.join(path, 'metadata')
        with atomic_file(output_file) as f:
            f.write(pickle.dumps(self, -1))
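A matching reader (hypothetical, not part of the snippet) simply unpickles the metadata file back:

    import os
    import pickle

    def read_metadata(path):
        # Hypothetical counterpart to write(): load the pickled object back.
        with open(os.path.join(path, 'metadata'), 'rb') as f:
            return pickle.load(f)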