Ejemplo n.º 1
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name) 
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps((task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps((task.id, OtherFailure(msg), None, None), -1)
Ejemplo n.º 2
0
    def save(self, path, items):
        # TODO: purge old cache
        with atomic_file(path) as f:
            c = 0
            f.write(struct.pack("I", c))
            # check is marshalable and compatible with broadcast
            can_marshal = marshalable(items)
            for v in items:
                if can_marshal:
                    try:
                        r = 0, marshal.dumps(v)
                    except Exception:
                        r = 1, cPickle.dumps(v, -1)
                        can_marshal = False
                else:
                    r = 1, cPickle.dumps(v, -1)
                f.write(msgpack.packb(r))
                c += 1
                yield v

            bytes = f.tell()
            if bytes > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)", bytes >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", c))
Ejemplo n.º 3
0
 def run(self, attempId):
     logger.debug("shuffling %d of %s", self.partition, self.rdd)
     for i, bucket in self.rdd._prepare_shuffle(self.split,
                                                self.partitioner,
                                                self.aggregator):
         try:
             if marshalable(bucket):
                 flag, d = 'm', marshal.dumps(bucket)
             else:
                 flag, d = 'p', cPickle.dumps(bucket, -1)
         except ValueError:
             flag, d = 'p', cPickle.dumps(bucket, -1)
         cd = compress(d)
         for tried in range(1, 4):
             try:
                 path = LocalFileShuffle.getOutputFile(
                     self.shuffleId, self.partition, i,
                     len(cd) * tried)
                 tpath = path + ".%s.%s" % (socket.gethostname(),
                                            os.getpid())
                 f = open(tpath, 'wb', 1024 * 4096)
                 f.write(flag + struct.pack("I", 5 + len(cd)))
                 f.write(cd)
                 f.close()
                 os.rename(tpath, path)
                 break
             except IOError, e:
                 logger.warning("write %s failed: %s, try again (%d)", path,
                                e, tried)
                 try:
                     os.remove(tpath)
                 except OSError:
                     pass
         else:
             raise
Ejemplo n.º 4
0
    def run_without_sorted(self, it):
        for i, bucket in it:
            try:
                if marshalable(bucket):
                    flag, d = b'm', marshal.dumps(bucket)
                else:
                    flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = b'p', six.moves.cPickle.dumps(bucket, -1)
            cd = compress(d)
            env.task_stats.bytes_shuffle_write += len(cd)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i,
                        len(cd) * tried)
                    with atomic_file(path, bufsize=1024 * 4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)

                    break
                except IOError as e:
                    logger.warning("write %s failed: %s, try again (%d)", path,
                                   e, tried)
            else:
                raise e

        return LocalFileShuffle.getServerUri()
Ejemplo n.º 5
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))

        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] + path.split('/')[-3:])
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps((Success(), (flag, data), accUpdate), -1)
    except FetchFailed, e:
        return mesos_pb2.TASK_FAILED, cPickle.dumps((e, None, None), -1)
Ejemplo n.º 6
0
    def save(self, path, items):
        # TODO: purge old cache
        with atomic_file(path) as f:
            c = 0
            f.write(struct.pack("I", c))
            # check is marshalable and compatible with broadcast
            can_marshal = marshalable(items)
            for v in items:
                if can_marshal:
                    try:
                        r = 0, marshal.dumps(v)
                    except Exception:
                        r = 1, six.moves.cPickle.dumps(v, -1)
                        can_marshal = False
                else:
                    r = 1, six.moves.cPickle.dumps(v, -1)
                f.write(msgpack.packb(r))
                c += 1
                yield v

            bytes = f.tell()
            if bytes > 10 << 20:
                logger.warning("cached result is %dMB (larger than 10MB)",
                               bytes >> 20)
            # count
            f.seek(0)
            f.write(struct.pack("I", c))
Ejemplo n.º 7
0
    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        for i, bucket in self._prepare_shuffle(self.rdd):
            try:
                if marshalable(bucket):
                    flag, d = 'm', marshal.dumps(bucket)
                else:
                    flag, d = 'p', cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(bucket, -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i,
                        len(cd) * tried)
                    with atomic_file(path, bufsize=1024 * 4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)

                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)", path,
                                   e, tried)
            else:
                raise
Ejemplo n.º 8
0
def run_task(task, ntry):
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        gc.disable()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2

        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception, e:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
Ejemplo n.º 9
0
 def run(self, attempId):
     logger.debug("shuffling %d of %s", self.partition, self.rdd)
     for i, bucket in self.rdd._prepare_shuffle(self.split, self.partitioner, self.aggregator):
         try:
             if marshalable(bucket):
                 flag, d = 'm', marshal.dumps(bucket)
             else:
                 flag, d = 'p', cPickle.dumps(bucket, -1)
         except ValueError:
             flag, d = 'p', cPickle.dumps(bucket, -1)
         cd = compress(d)
         for tried in range(1, 4):
             try:
                 path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i, len(cd) * tried)
                 tpath = path + ".%s.%s" % (socket.gethostname(), os.getpid())
                 f = open(tpath, 'wb', 1024*4096)
                 f.write(flag + struct.pack("I", 5 + len(cd)))
                 f.write(cd)
                 f.close()
                 os.rename(tpath, path)
                 break
             except IOError, e:
                 logger.warning("write %s failed: %s, try again (%d)", path, e, tried)
                 try: os.remove(tpath)
                 except OSError: pass
         else:
             raise
Ejemplo n.º 10
0
 def _prepare(self, items):
     items = list(items)
     try:
         if marshalable(items):
             is_marshal, d = True, marshal.dumps(items)
         else:
             is_marshal, d = False, six.moves.cPickle.dumps(items, -1)
     except ValueError:
         is_marshal, d = False, six.moves.cPickle.dumps(items, -1)
     data = compress(d)
     size = len(data)
     return (is_marshal, data), size
Ejemplo n.º 11
0
Archivo: task.py Proyecto: douban/dpark
 def _prepare(self, items):
     items = list(items)
     try:
         if marshalable(items):
             is_marshal, d = True, marshal.dumps(items)
         else:
             is_marshal, d = False, cPickle.dumps(items, -1)
     except ValueError:
         is_marshal, d = False, cPickle.dumps(items, -1)
     data = compress(d)
     size = len(data)
     return (is_marshal, data), size
Ejemplo n.º 12
0
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
                f.close()
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError, ValueError,
                            LookupError, SyntaxError, TypeError,
                            AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(
            prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
Ejemplo n.º 13
0
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
                f.close()
            path = swd.export(tmppath)
            data = '/'.join(
                [env.server_uri] + path.split('/')[-3:]
            )
            flag += 2

        return TaskState.finished, cPickle.dumps(((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError,
                            ValueError, LookupError, SyntaxError,
                            TypeError, AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
Ejemplo n.º 14
0
    def blockifyObject(self, obj):
        try:
            if marshalable(obj):
                buf = '0'+marshal.dumps(obj)
            else:
                buf = '1'+cPickle.dumps(obj, -1)

        except Exception:
            buf = '1'+cPickle.dumps(obj, -1)

        N = self.BlockSize
        blockNum = len(buf) / N + 1
        val = [Block(i, compress(buf[i*N:i*N+N])) 
                    for i in range(blockNum)]
        return val, len(buf)
Ejemplo n.º 15
0
    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        numOutputSplits = self.partitioner.numPartitions
        getPartition = self.partitioner.getPartition
        mergeValue = self.aggregator.mergeValue
        createCombiner = self.aggregator.createCombiner

        buckets = [{} for i in range(numOutputSplits)]
        for k, v in self.rdd.iterator(self.split):
            bucketId = getPartition(k)
            bucket = buckets[bucketId]
            r = bucket.get(k, None)
            if r is not None:
                bucket[k] = mergeValue(r, v)
            else:
                bucket[k] = createCombiner(v)

        for i in range(numOutputSplits):
            try:
                if marshalable(buckets[i]):
                    flag, d = 'm', marshal.dumps(buckets[i])
                else:
                    flag, d = 'p', cPickle.dumps(buckets[i], -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(buckets[i], -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(
                        self.shuffleId, self.partition, i,
                        len(cd) * tried)
                    tpath = path + ".%s.%s" % (socket.gethostname(),
                                               os.getpid())
                    f = open(tpath, 'wb', 1024 * 4096)
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                    f.close()
                    os.rename(tpath, path)
                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)", path,
                                   e, tried)
                    try:
                        os.remove(tpath)
                    except OSError:
                        pass
            else:
                raise
Ejemplo n.º 16
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
Ejemplo n.º 17
0
    def blockifyObject(self, obj):
        try:
            if marshalable(obj):
                buf = '0' + marshal.dumps(obj)
            else:
                buf = '1' + cPickle.dumps(obj, -1)

        except Exception:
            buf = '1' + cPickle.dumps(obj, -1)

        N = self.BlockSize
        blockNum = len(buf) / N + 1
        val = [
            Block(i, compress(buf[i * N:i * N + N])) for i in range(blockNum)
        ]
        return val, len(buf)
Ejemplo n.º 18
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
Ejemplo n.º 19
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))

        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
Ejemplo n.º 20
0
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return TaskState.failed, 'FAILED_EXCEPTION_{}'.format(
            ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
Ejemplo n.º 21
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))

        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception, e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
Ejemplo n.º 22
0
    def to_blocks(self, uuid, obj):
        try:
            if marshalable(obj):
                buf = marshal.dumps((uuid, obj))
                type = MARSHAL_TYPE
            else:
                buf = cPickle.dumps((uuid, obj), -1)
                type = PICKLE_TYPE

        except Exception:
            buf = cPickle.dumps((uuid, obj), -1)
            type = PICKLE_TYPE

        checksum = binascii.crc32(buf) & 0xFFFF
        stream = struct.pack(self.header_fmt, type, checksum) + buf
        blockNum = (len(stream) + (BLOCK_SIZE - 1)) >> BLOCK_SHIFT
        blocks = [compress(stream[i*BLOCK_SIZE:(i+1)*BLOCK_SIZE]) for i in range(blockNum)]
        return blocks
Ejemplo n.º 23
0
    def to_blocks(self, uuid, obj):
        try:
            if marshalable(obj):
                buf = marshal.dumps((uuid, obj))
                type = MARSHAL_TYPE
            else:
                buf = cPickle.dumps((uuid, obj), -1)
                type = PICKLE_TYPE

        except Exception:
            buf = cPickle.dumps((uuid, obj), -1)
            type = PICKLE_TYPE

        checksum = binascii.crc32(buf) & 0xFFFF
        stream = struct.pack(self.header_fmt, type, checksum) + buf
        blockNum = (len(stream) + (BLOCK_SIZE - 1)) >> BLOCK_SHIFT
        blocks = [compress(stream[i*BLOCK_SIZE:(i+1)*BLOCK_SIZE]) for i in range(blockNum)]
        return blocks
Ejemplo n.º 24
0
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            f = open(path, 'wb')
            f.write(data)
            f.close()
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
Ejemplo n.º 25
0
def run_task(task_data):
    try:
        gc.disable()
        task, ntry = loads(decompress(task_data))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception as e:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, ntry, task.id, len(data))
            f = open(path, 'w')
            f.write(data)
            f.close()
            data = '/'.join(
                [LocalFileShuffle.getServerUri()] + path.split('/')[-3:]
            )
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None), -1)
    except:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None), -1)
    finally:
        close_mfs()
        gc.collect()
        gc.enable()
Ejemplo n.º 26
0
    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        numOutputSplits = self.partitioner.numPartitions
        getPartition = self.partitioner.getPartition
        mergeValue = self.aggregator.mergeValue
        createCombiner = self.aggregator.createCombiner

        buckets = [{} for i in range(numOutputSplits)]
        for k,v in self.rdd.iterator(self.split):
            bucketId = getPartition(k)
            bucket = buckets[bucketId]
            r = bucket.get(k, None)
            if r is not None:
                bucket[k] = mergeValue(r, v)
            else:
                bucket[k] = createCombiner(v)

        for i in range(numOutputSplits):
            try:
                if marshalable(buckets[i]):
                    flag, d = 'm', marshal.dumps(buckets[i])
                else:
                    flag, d = 'p', cPickle.dumps(buckets[i], -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(buckets[i], -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i, len(cd) * tried)
                    tpath = path + ".%s.%s" % (socket.gethostname(), os.getpid())
                    f = open(tpath, 'wb', 1024*4096)
                    f.write(flag + struct.pack("I", 5 + len(cd)))
                    f.write(cd)
                    f.close()
                    os.rename(tpath, path)
                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)", path, e, tried)
                    try: os.remove(tpath)
                    except OSError: pass
            else:
                raise
Ejemplo n.º 27
0
    def to_blocks(self, uuid, obj):
        try:
            if marshalable(obj):
                buf = marshal.dumps((uuid, obj))
                type_ = MARSHAL_TYPE
            else:
                buf = cPickle.dumps((uuid, obj), -1)
                type_ = PICKLE_TYPE

        except Exception:
            buf = cPickle.dumps((uuid, obj), -1)
            type_ = PICKLE_TYPE

        checksum = binascii.crc32(buf) & 0xFFFF
        stream = struct.pack(self.header_fmt, type_, checksum) + buf
        blockNum = (len(stream) + (BLOCK_SIZE - 1)) >> BLOCK_SHIFT
        blocks = [compress(stream[i * BLOCK_SIZE:(i + 1) * BLOCK_SIZE]) for i in range(blockNum)]
        sizes = [len(block) for block in blocks]
        size_l = accumulate_list(sizes)
        block_map = list(izip(size_l[:-1], sizes))
        return blocks, size_l[-1], block_map
Ejemplo n.º 28
0
    def run(self, attempId):
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        for i, bucket in self._prepare_shuffle(self.rdd):
            try:
                if marshalable(bucket):
                    flag, d = 'm', marshal.dumps(bucket)
                else:
                    flag, d = 'p', cPickle.dumps(bucket, -1)
            except ValueError:
                flag, d = 'p', cPickle.dumps(bucket, -1)
            cd = compress(d)
            for tried in range(1, 4):
                try:
                    path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i, len(cd) * tried)
                    with atomic_file(path, bufsize=1024*4096) as f:
                        f.write(flag + struct.pack("I", 5 + len(cd)))
                        f.write(cd)

                    break
                except IOError, e:
                    logger.warning("write %s failed: %s, try again (%d)", path, e, tried)
            else:
                raise