Example #1
0
    def run(self, attempId):
        """Combine this task's split per key and write one shuffle file per output split.

        Every ``(key, value)`` pair produced by ``self.rdd.iterator(self.split)``
        is routed to a bucket chosen by ``self.partitioner.getPartition`` and
        folded into that bucket with the aggregator (``createCombiner`` for the
        first value of a key, ``mergeValue`` afterwards).  Each bucket is then
        serialized, passed through ``comp.compress`` and written to a temporary
        file that is renamed onto the path returned by
        ``LocalFileShuffle.getOutputFile``.

        ``attempId`` is not used by this method.

        Returns whatever ``LocalFileShuffle.getServerUri()`` returns.
        """
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        numOutputSplits = self.partitioner.numPartitions
        getPartition = self.partitioner.getPartition
        mergeValue = self.aggregator.mergeValue
        createCombiner = self.aggregator.createCombiner

        # A private sentinel marks "key not seen yet".  Using None for that
        # would mistake a combiner whose value is legitimately None for a
        # missing key and throw away what had been merged so far.
        missing = object()
        buckets = [{} for i in range(numOutputSplits)]
        for k, v in self.rdd.iterator(self.split):
            bucket = buckets[getPartition(k)]
            r = bucket.get(k, missing)
            if r is not missing:
                bucket[k] = mergeValue(r, v)
            else:
                bucket[k] = createCombiner(v)

        for i in range(numOutputSplits):
            path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i)
            if os.path.exists(path):
                # Output for this split is already present; keep it.
                continue
            # Host name and pid keep temporary files of different writers apart.
            tpath = path + ".%s.%s" % (socket.gethostname(), os.getpid())
            try:
                # marshal raises ValueError for types it cannot serialize;
                # fall back to cPickle in that case.  The one-character flag
                # written first records which serializer was used.
                flag, d = 'm', marshal.dumps(buckets[i])
            except ValueError:
                flag, d = 'p', cPickle.dumps(buckets[i], -1)
            f = open(tpath, 'wb', 1024*4096)
            try:
                f.write(flag)
                f.write(comp.compress(d, 1))
            finally:
                # Close the handle even when a write fails.
                f.close()
            # Publish via rename so the final path never holds a partial file;
            # if the path appeared in the meantime, drop our copy instead.
            if not os.path.exists(path):
                os.rename(tpath, path)
            else:
                os.unlink(tpath)
        return LocalFileShuffle.getServerUri()
Example #2
0
    def run(self, attempId):
        """Combine this task's split per key and write one shuffle file per output split.

        Every ``(key, value)`` pair produced by ``self.rdd.iterator(self.split)``
        is routed to a bucket chosen by ``self.partitioner.getPartition`` and
        folded into that bucket with the aggregator (``createCombiner`` for the
        first value of a key, ``mergeValue`` afterwards).  Each bucket is then
        serialized and written, uncompressed, to a temporary file that is
        renamed onto the path returned by ``LocalFileShuffle.getOutputFile``.

        ``attempId`` is not used by this method.

        Returns whatever ``LocalFileShuffle.getServerUri()`` returns.
        """
        logger.debug("shuffling %d of %s", self.partition, self.rdd)
        numOutputSplits = self.partitioner.numPartitions
        getPartition = self.partitioner.getPartition
        mergeValue = self.aggregator.mergeValue
        createCombiner = self.aggregator.createCombiner

        # A private sentinel marks "key not seen yet".  Using None for that
        # would mistake a combiner whose value is legitimately None for a
        # missing key and throw away what had been merged so far.
        missing = object()
        buckets = [{} for i in range(numOutputSplits)]
        for k, v in self.rdd.iterator(self.split):
            bucket = buckets[getPartition(k)]
            r = bucket.get(k, missing)
            if r is not missing:
                bucket[k] = mergeValue(r, v)
            else:
                bucket[k] = createCombiner(v)

        for i in range(numOutputSplits):
            path = LocalFileShuffle.getOutputFile(self.shuffleId, self.partition, i)
            if os.path.exists(path):
                # Output for this split is already present; keep it.
                continue
            # Host name and pid keep temporary files of different writers apart.
            tpath = path + ".%s.%s" % (socket.gethostname(), os.getpid())
            try:
                # marshal raises ValueError for types it cannot serialize;
                # fall back to cPickle in that case.  The one-character flag
                # written first records which serializer was used.
                flag, d = 'm', marshal.dumps(buckets[i])
            except ValueError:
                flag, d = 'p', cPickle.dumps(buckets[i], -1)
            f = open(tpath, 'wb', 1024*4096)
            try:
                f.write(flag)
                f.write(d)
            finally:
                # Close the handle even when a write fails.
                f.close()
            # Publish via rename so the final path never holds a partial file;
            # if the path appeared in the meantime, drop our copy instead.
            if not os.path.exists(path):
                os.rename(tpath, path)
            else:
                os.unlink(tpath)
        return LocalFileShuffle.getServerUri()
Example #3
0
    def start(self, isMaster, environ={}, isLocal=False):
        """Start the environment; repeated calls after the first are no-ops.

        On the master a fresh work directory named
        ``<timestamp>-<hostname>-<pid>`` is created under the shuffle root
        (``/tmp/dpark`` when ``isLocal`` is true, otherwise the value of the
        ``DPARK_SHARE_DIR`` environment variable) and stored in
        ``self.workdir`` and ``self.environ['WORKDIR']``.  On a non-master the
        given ``environ`` mapping is merged into ``self.environ``; it is only
        read, never modified.

        Afterwards the cache tracker, local file shuffle, map output tracker,
        shuffle fetcher and broadcast are initialised, in that order.

        Raises:
            Exception: on a non-local master when ``DPARK_SHARE_DIR`` is unset
                or empty.
            OSError: if the root or work directory cannot be created.
        """
        if getattr(self, 'started', False):
            return
        logger.debug("start env in %s: %s %s", os.getpid(), isMaster, environ)
        if isMaster:
            import errno

            root = '/tmp/dpark'
            if not isLocal:
                root = os.environ.get("DPARK_SHARE_DIR")
            if not root:
                raise Exception("no shuffle dir exists")
            # Create-then-handle-EEXIST instead of exists()+mkdir(): another
            # process may create the shared root between the check and mkdir.
            try:
                os.mkdir(root, 0o777)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            else:
                os.chmod(root, 0o777)  # because of umask
            name = '%s-%s-%d' % (time.strftime("%Y%m%d-%H%M%S"),
                                 socket.gethostname(), os.getpid())
            self.workdir = os.path.join(root, name)
            os.makedirs(self.workdir)
            self.environ['WORKDIR'] = self.workdir
        else:
            self.environ.update(environ)

        from cache import CacheTracker
        self.cacheTracker = CacheTracker(isMaster)

        from shuffle import LocalFileShuffle, MapOutputTracker, SimpleShuffleFetcher
        LocalFileShuffle.initialize(isMaster)
        self.mapOutputTracker = MapOutputTracker(isMaster)
        self.shuffleFetcher = SimpleShuffleFetcher()

        from broadcast import Broadcast
        Broadcast.initialize(isMaster)

        self.started = True
        logger.debug("env started")
Example #4
0
    def start(self, isMaster, environ={}, isLocal=False):
        """Start the environment; does nothing if ``self.started`` is already true.

        On the master a fresh work directory named
        ``<timestamp>-<hostname>-<pid>`` is created under the root taken from
        the ``DPARK_WORK_DIR`` environment variable (``/tmp/dpark`` when it is
        not set), and ``WORKDIR`` and ``COMPRESS`` are published in
        ``self.environ``.  On a non-master the given ``environ`` mapping is
        merged into ``self.environ`` (it is only read, never modified) and its
        ``COMPRESS`` entry must equal ``util.COMPRESS``.

        Afterwards the zmq context, cache tracker, local file shuffle, map
        output tracker, shuffle fetcher and broadcast are initialised, in that
        order; ``isLocal`` selects the local tracker variants.

        Raises:
            Exception: on a non-master whose ``COMPRESS`` setting differs from
                ``util.COMPRESS``.
            OSError: if the root or work directory cannot be created.
        """
        if self.started:
            return
        logger.debug("start env in %s: %s %s", os.getpid(),
                isMaster, environ)
        self.isMaster = isMaster
        self.isLocal = isLocal
        if isMaster:
            import errno

            if 'DPARK_WORK_DIR' in os.environ:
                root = os.environ['DPARK_WORK_DIR']
            else:
                root = '/tmp/dpark'

            # Create-then-handle-EEXIST instead of exists()+mkdir(): another
            # process may create the shared root between the check and mkdir.
            try:
                os.mkdir(root, 0o777)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            else:
                os.chmod(root, 0o777) # because of umask
            name = '%s-%s-%d' % (time.strftime("%Y%m%d-%H%M%S"),
                socket.gethostname(), os.getpid())
            self.workdir = os.path.join(root, name)
            os.makedirs(self.workdir)
            self.environ['WORKDIR'] = self.workdir
            self.environ['COMPRESS'] = util.COMPRESS
        else:
            self.environ.update(environ)
            # Refuse to start when this process's compression setting does not
            # match the one received in environ.
            if self.environ['COMPRESS'] != util.COMPRESS:
                raise Exception("no %s available" % self.environ['COMPRESS'])

        self.ctx = zmq.Context()

        from cache import CacheTracker, LocalCacheTracker
        if isLocal:
            self.cacheTracker = LocalCacheTracker(isMaster)
        else:
            self.cacheTracker = CacheTracker(isMaster)

        from shuffle import LocalFileShuffle, MapOutputTracker, LocalMapOutputTracker
        LocalFileShuffle.initialize(isMaster)
        if isLocal:
            self.mapOutputTracker = LocalMapOutputTracker(isMaster)
        else:
            self.mapOutputTracker = MapOutputTracker(isMaster)
        from shuffle import SimpleShuffleFetcher, ParallelShuffleFetcher
        #self.shuffleFetcher = SimpleShuffleFetcher()
        self.shuffleFetcher = ParallelShuffleFetcher(2)

        from broadcast import Broadcast
        Broadcast.initialize(isMaster)

        self.started = True
        logger.debug("env started")
Example #5
0
    def start(self, isMaster, environ={}, isLocal=False):
        """Start the environment; does nothing if ``self.started`` is already true.

        On the master a fresh work directory named
        ``<timestamp>-<hostname>-<pid>`` is created under the root taken from
        the ``DPARK_WORK_DIR`` environment variable (``/tmp/dpark`` when it is
        not set), and ``WORKDIR`` and ``COMPRESS`` are published in
        ``self.environ``.  On a non-master the given ``environ`` mapping is
        merged into ``self.environ`` (it is only read, never modified) and its
        ``COMPRESS`` entry must equal ``util.COMPRESS``.

        Afterwards the zmq context, cache tracker, local file shuffle, map
        output tracker, shuffle fetcher and broadcast are initialised, in that
        order; ``isLocal`` selects the local tracker variants.

        Raises:
            Exception: on a non-master whose ``COMPRESS`` setting differs from
                ``util.COMPRESS``.
            OSError: if the root or work directory cannot be created.
        """
        if self.started:
            return
        logger.debug("start env in %s: %s %s", os.getpid(), isMaster, environ)
        self.isMaster = isMaster
        self.isLocal = isLocal
        if isMaster:
            import errno

            if 'DPARK_WORK_DIR' in os.environ:
                root = os.environ['DPARK_WORK_DIR']
            else:
                root = '/tmp/dpark'

            # Create-then-handle-EEXIST instead of exists()+mkdir(): another
            # process may create the shared root between the check and mkdir.
            try:
                os.mkdir(root, 0o777)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            else:
                os.chmod(root, 0o777)  # because of umask
            name = '%s-%s-%d' % (time.strftime("%Y%m%d-%H%M%S"),
                                 socket.gethostname(), os.getpid())
            self.workdir = os.path.join(root, name)
            os.makedirs(self.workdir)
            self.environ['WORKDIR'] = self.workdir
            self.environ['COMPRESS'] = util.COMPRESS
        else:
            self.environ.update(environ)
            # Refuse to start when this process's compression setting does not
            # match the one received in environ.
            if self.environ['COMPRESS'] != util.COMPRESS:
                raise Exception("no %s available" % self.environ['COMPRESS'])

        self.ctx = zmq.Context()

        from cache import CacheTracker, LocalCacheTracker
        if isLocal:
            self.cacheTracker = LocalCacheTracker(isMaster)
        else:
            self.cacheTracker = CacheTracker(isMaster)

        from shuffle import LocalFileShuffle, MapOutputTracker, LocalMapOutputTracker
        LocalFileShuffle.initialize(isMaster)
        if isLocal:
            self.mapOutputTracker = LocalMapOutputTracker(isMaster)
        else:
            self.mapOutputTracker = MapOutputTracker(isMaster)
        from shuffle import SimpleShuffleFetcher, ParallelShuffleFetcher
        #self.shuffleFetcher = SimpleShuffleFetcher()
        self.shuffleFetcher = ParallelShuffleFetcher(2)

        from broadcast import Broadcast
        Broadcast.initialize(isMaster)

        self.started = True
        logger.debug("env started")
Example #6
0
    def start(self, isMaster, environ={}, isLocal=False, port=None):
        """Start the environment; repeated calls after the first are no-ops.

        On the master the shuffle root is chosen in this order:
        ``/tmp/dpark`` when ``isLocal`` is true, else ``DPARK_SHARE_DIR``
        (both set ``self.dfs`` to True), else ``DPARK_WORK_DIR`` (sets
        ``self.dfs`` to False).  A fresh work directory named
        ``<timestamp>-<hostname>-<pid>`` is created under it, and ``WORKDIR``
        and ``DPARK_HAS_DFS`` are published in ``self.environ``.  On a
        non-master the given ``environ`` mapping is merged into
        ``self.environ`` (it is only read, never modified) and ``self.dfs`` is
        restored from its ``DPARK_HAS_DFS`` entry.

        Afterwards the cache tracker, local file shuffle, map output tracker,
        shuffle fetcher and broadcast are initialised, in that order.
        ``port`` is forwarded unchanged to ``LocalFileShuffle.initialize``.

        Raises:
            Exception: on a non-local master when neither ``DPARK_SHARE_DIR``
                nor ``DPARK_WORK_DIR`` is set.
            OSError: if the root or work directory cannot be created.
        """
        if getattr(self, 'started', False):
            return
        logger.debug("start env in %s: %s %s", os.getpid(),
                isMaster, environ)
        if isMaster:
            import errno

            if isLocal:
                root = '/tmp/dpark'
                self.dfs = True
            elif 'DPARK_SHARE_DIR' in os.environ:
                root = os.environ['DPARK_SHARE_DIR']
                self.dfs = True
            elif 'DPARK_WORK_DIR' in os.environ:
                root = os.environ['DPARK_WORK_DIR']
                self.dfs = False
            else:
                raise Exception("no shuffle dir exists")
            # Create-then-handle-EEXIST instead of exists()+mkdir(): another
            # process may create the shared root between the check and mkdir.
            try:
                os.mkdir(root, 0o777)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            else:
                os.chmod(root, 0o777) # because of umask
            name = '%s-%s-%d' % (time.strftime("%Y%m%d-%H%M%S"),
                socket.gethostname(), os.getpid())
            self.workdir = os.path.join(root, name)
            os.makedirs(self.workdir)
            self.environ['WORKDIR'] = self.workdir
            # Stored as a string; the non-master branch compares against 'True'.
            self.environ['DPARK_HAS_DFS'] = str(self.dfs)
        else:
            self.environ.update(environ)
            self.dfs = (self.environ['DPARK_HAS_DFS'] == 'True')

        from cache import CacheTracker
        self.cacheTracker = CacheTracker(isMaster)

        from shuffle import LocalFileShuffle, MapOutputTracker, SimpleShuffleFetcher
        LocalFileShuffle.initialize(isMaster, port)
        self.mapOutputTracker = MapOutputTracker(isMaster)
        self.shuffleFetcher = SimpleShuffleFetcher()

        from broadcast import Broadcast
        Broadcast.initialize(isMaster)

        self.started = True
        logger.debug("env started")
Example #7
0
File: env.py Project: haiger/dpark
    def start(self, isMaster, environ={}, isLocal=False):
        """Start the environment; repeated calls after the first are no-ops.

        On the master a fresh work directory named
        ``<timestamp>-<hostname>-<pid>`` is created under the shuffle root
        (``/tmp/dpark`` when ``isLocal`` is true, otherwise the value of the
        ``DPARK_SHARE_DIR`` environment variable) and stored in
        ``self.workdir`` and ``self.environ["WORKDIR"]``.  On a non-master the
        given ``environ`` mapping is merged into ``self.environ``; it is only
        read, never modified.

        Afterwards the cache tracker, local file shuffle, map output tracker,
        shuffle fetcher and broadcast are initialised, in that order.

        Raises:
            Exception: on a non-local master when ``DPARK_SHARE_DIR`` is unset
                or empty.
            OSError: if the root or work directory cannot be created.
        """
        if getattr(self, "started", False):
            return
        logger.debug("start env in %s: %s %s", os.getpid(), isMaster, environ)
        if isMaster:
            import errno

            root = "/tmp/dpark"
            if not isLocal:
                root = os.environ.get("DPARK_SHARE_DIR")
            if not root:
                raise Exception("no shuffle dir exists")
            # Create-then-handle-EEXIST instead of exists()+mkdir(): another
            # process may create the shared root between the check and mkdir.
            try:
                os.mkdir(root, 0o777)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            else:
                os.chmod(root, 0o777)  # because of umask
            name = "%s-%s-%d" % (time.strftime("%Y%m%d-%H%M%S"), socket.gethostname(), os.getpid())
            self.workdir = os.path.join(root, name)
            os.makedirs(self.workdir)
            self.environ["WORKDIR"] = self.workdir
        else:
            self.environ.update(environ)

        from cache import CacheTracker

        self.cacheTracker = CacheTracker(isMaster)

        from shuffle import LocalFileShuffle, MapOutputTracker, SimpleShuffleFetcher

        LocalFileShuffle.initialize(isMaster)
        self.mapOutputTracker = MapOutputTracker(isMaster)
        self.shuffleFetcher = SimpleShuffleFetcher()

        from broadcast import Broadcast

        Broadcast.initialize(isMaster)

        self.started = True
        logger.debug("env started")