Example #1
0
    def __init__(self, master=None):

        if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
            from executor import run
            run()
            sys.exit(0)

        options = parse_options()
        self.master = master or options.master

        if self.master.startswith('local'):
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif self.master.startswith('process'):
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = True
        elif (self.master.startswith('mesos')
              or self.master.startswith('zoo')):
            if '://' not in self.master:
                self.master = os.environ.get('MESOS_MASTER')
                if not self.master:
                    raise Exception("invalid uri of mesos master: %s" %
                                    self.master)
            self.scheduler = MesosScheduler(self.master, options)
            self.isLocal = False
        else:
            raise Exception("invalid master option: %s" % self.master)

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)

        self.started = False
Example #2
0
    def __init__(self, master=None):
        
        if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
            from executor import run
            run()
            sys.exit(0)
        
        options = parse_options()
        self.master = master or options.master

        if self.master.startswith('local'):
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif self.master.startswith('process'):
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = True
        elif (self.master.startswith('mesos')
              or self.master.startswith('zoo')):
            if '://' not in self.master:
                self.master = os.environ.get('MESOS_MASTER')
                if not self.master:
                    raise Exception("invalid uri of mesos master: %s" % self.master)
            self.scheduler = MesosScheduler(self.master, options) 
            self.isLocal = False
        else:
            raise Exception("invalid master option: %s" % self.master)

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)
       
        self.started = False
Example #3
0
    def __init__(self, master=None):
        
        if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
            from executor import run
            run()
            sys.exit(0)
        
        options = parse_options()
        self.options = options
        master = master or options.master

        if master == 'local':
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif master == 'process':
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = True
        else:
            if master == 'mesos':
                master = os.environ.get('MESOS_MASTER')
                if not master:
                    raise Exception("invalid uri of mesos master: %s" % master)
            if master.startswith('mesos://'):
                if '@' in master:
                    master = master[master.rfind('@')+1:]
                else:
                    master = master[master.rfind('//')+2:]
            elif master.startswith('zoo://'):
                master = 'zk' + master[3:]

            if ':' not in master:
                master += ':5050'
            self.scheduler = MesosScheduler(master, options) 
            self.isLocal = False
            
        self.master = master

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)
      
        try:
            from rfoo.utils import rconsole
            rconsole.spawn_server(locals(), 0)
        except ImportError:
            pass

        self.started = False
Example #4
0
    def init(self):
        if self.initialized:
            return

        #if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
        #    from executor import run
        #    run()
        #    sys.exit(0)
        
        options = parse_options()
        self.options = options
        master = self.master or options.master

        if master == 'local':
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif master == 'process':
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = False
        else:
            if master == 'mesos':
                master = os.environ.get('MESOS_MASTER')
                if not master:
                    raise Exception("invalid uri of mesos master: %s" % master)
            if master.startswith('mesos://'):
                if '@' in master:
                    master = master[master.rfind('@')+1:]
                else:
                    master = master[master.rfind('//')+2:]
            elif master.startswith('zoo://'):
                master = 'zk' + master[3:]

            if ':' not in master:
                master += ':5050'
            self.scheduler = MesosScheduler(master, options) 
            self.isLocal = False
            
        self.master = master

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)

        self.initialized = True
Example #5
0
    def init(self):
        if self.initialized:
            return

        #if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
        #    from executor import run
        #    run()
        #    sys.exit(0)

        options = parse_options()
        self.options = options
        master = self.master or options.master

        if master == 'local':
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif master == 'process':
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = False
        else:
            if master == 'mesos':
                master = os.environ.get('MESOS_MASTER')
                if not master:
                    raise Exception("invalid uri of mesos master: %s" % master)
            if master.startswith('mesos://'):
                if '@' in master:
                    master = master[master.rfind('@') + 1:]
                else:
                    master = master[master.rfind('//') + 2:]
            elif master.startswith('zoo://'):
                master = 'zk' + master[3:]

            if ':' not in master:
                master += ':5050'
            self.scheduler = MesosScheduler(master, options)
            self.isLocal = False

        self.master = master

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)

        self.initialized = True
Example #6
0
class DparkContext(object):
    nextShuffleId = 0
    def __init__(self, master=None):
        self.master = master
        self.initialized = False
        self.started = False

    def init(self):
        if self.initialized:
            return

        #if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
        #    from executor import run
        #    run()
        #    sys.exit(0)
        
        options = parse_options()
        self.options = options
        master = self.master or options.master

        if master == 'local':
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif master == 'process':
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = False
        else:
            if master == 'mesos':
                master = os.environ.get('MESOS_MASTER')
                if not master:
                    raise Exception("invalid uri of mesos master: %s" % master)
            if master.startswith('mesos://'):
                if '@' in master:
                    master = master[master.rfind('@')+1:]
                else:
                    master = master[master.rfind('//')+2:]
            elif master.startswith('zoo://'):
                master = 'zk' + master[3:]

            if ':' not in master:
                master += ':5050'
            self.scheduler = MesosScheduler(master, options) 
            self.isLocal = False
            
        self.master = master

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)

        self.initialized = True

    def newShuffleId(self):
        self.nextShuffleId += 1
        return self.nextShuffleId

    def parallelize(self, seq, numSlices=None): 
        self.init()
        if numSlices is None:
            numSlices = self.defaultParallelism
        return ParallelCollection(self, seq, numSlices)

    def makeRDD(self, seq, numSlices=None):
        return self.parallelize(seq, numSlices)
    
    def textFile(self, path, ext='', followLink=True, maxdepth=0, cls=TextFileRDD, *ka, **kws):
        if isinstance(path, (list, tuple)):
            return self.union([self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path])

        def create_rdd(cls, path, *ka, **kw):
            if cls is TextFileRDD:
                if path.endswith('.bz2'):
                    return BZip2FileRDD(self, path, *ka, **kw)
                elif path.endswith('.gz'):
                    return GZipFileRDD(self, path, *ka, **kw)
            return cls(self, path, *ka, **kw)

        if os.path.isdir(path):
            paths = []
            for root,dirs,names in os.walk(path, followlinks=followLink):
                if maxdepth > 0:
                    depth = len(filter(None, root[len(path):].split('/'))) + 1
                    if depth > maxdepth:
                        break
                for n in sorted(names):
                    if n.endswith(ext) and not n.startswith('.'):
                        p = os.path.join(root, n)
                        if followLink or not os.path.islink(p):
                            paths.append(p)
                dirs.sort()
                for d in dirs[:]:
                    if d.startswith('.'):
                        dirs.remove(d)

            rdds = [create_rdd(cls, p, *ka, **kws) 
                     for p in paths]
            return self.union(rdds)
        else:
            return create_rdd(cls, path, *ka, **kws)

    def bzip2File(self, *args, **kwargs):
        "deprecated"
        logger.warning("bzip2File() is deprecated, use textFile('xx.bz2') instead")
        return self.textFile(cls=BZip2FileRDD, *args, **kwargs)

    def csvFile(self, path, dialect='excel', *args, **kwargs):
        """ deprecated. """
        return self.textFile(path, cls=TextFileRDD, *args, **kwargs).fromCsv(dialect)

    def binaryFile(self, path, fmt=None, length=None, *args, **kwargs):
        return self.textFile(path, cls=BinaryFileRDD, fmt=fmt, length=length, *args, **kwargs)

    def union(self, rdds):
        return UnionRDD(self, rdds)

    def zip(self, rdds):
        return ZippedRDD(self, rdds)

    def accumulator(self, init=0, param=None):
        return Accumulator(init, param)

    def broadcast(self, v):
        self.start()
        return Broadcast.newBroadcast(v, self.isLocal)

    def start(self):
        if self.started:
            return
        
        self.init()

        env.start(True, isLocal=self.isLocal)
        self.scheduler.start()
        self.started = True
        atexit.register(self.stop)

        def handler(signm, frame):
            logger.error("got signal %d, exit now", signm)
            self.scheduler.shutdown()

        signal.signal(signal.SIGTERM, handler)
        signal.signal(signal.SIGHUP, handler)
        signal.signal(signal.SIGABRT, handler)
        signal.signal(signal.SIGQUIT, handler)
        
        try:
            from rfoo.utils import rconsole
            rconsole.spawn_server(locals(), 0)
        except ImportError:
            pass

    def runJob(self, rdd, func, partitions=None, allowLocal=False):
        self.start()

        if partitions is None:
            partitions = range(len(rdd))
        try:
            gc.disable()
            for it in self.scheduler.runJob(rdd, func, partitions, allowLocal):
                yield it
        finally:
            gc.collect()
            gc.enable()

    def clear(self):
        if not self.started:
            return
        
        self.scheduler.clear()
        gc.collect()

    def stop(self):
        if not self.started:
            return

        self.scheduler.stop()
        env.stop()
        self.started = False

    def __getstate__(self):
        raise ValueError("should not pickle ctx")
Example #7
0
class DparkContext:
    nextShuffleId = 0

    def __init__(self, master=None):
        
        if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
            from executor import run
            run()
            sys.exit(0)
        
        options = parse_options()
        self.master = master or options.master

        if self.master.startswith('local'):
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif self.master.startswith('process'):
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = True
        elif (self.master.startswith('mesos')
              or self.master.startswith('zoo')):
            if '://' not in self.master:
                self.master = os.environ.get('MESOS_MASTER')
                if not self.master:
                    raise Exception("invalid uri of mesos master: %s" % self.master)
            self.scheduler = MesosScheduler(self.master, options) 
            self.isLocal = False
        else:
            raise Exception("invalid master option: %s" % self.master)

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)
       
        self.started = False

    def newShuffleId(self):
        self.nextShuffleId += 1
        return self.nextShuffleId

    def parallelize(self, seq, numSlices=None): 
        if numSlices is None:
            numSlices = self.defaultParallelism
        return ParallelCollection(self, seq, numSlices)

    def makeRDD(self, seq, numSlices=None):
        return self.parallelize(seq, numSlices)
    
    def textFile(self, path, ext='', followLink=True, maxdepth=0, cls=TextFileRDD, *ka, **kws):
        if isinstance(path, (list, tuple)):
            return self.union([self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path]) 
        if os.path.isdir(path):
            paths = []
            for root,dirs,names in os.walk(path, followlinks=followLink):
                if maxdepth > 0:
                    depth = len(filter(None, root[len(path):].split('/'))) + 1
                    if depth > maxdepth:
                        break
                for n in sorted(names):
                    if n.endswith(ext) and not n.startswith('.'):
                        p = os.path.join(root, n)
                        if followLink or not os.path.islink(p):
                            paths.append(p)
                dirs.sort()
                for d in dirs[:]:
                    if d.startswith('.'):
                        dirs.remove(d)

            rdds = [cls(self, p, *ka, **kws) 
                     for p in paths]
            return self.union(rdds)
        else:
            return cls(self, path, *ka, **kws)

    def bzip2File(self, *args, **kwargs):
        return self.textFile(cls=BZip2FileRDD, *args, **kwargs)

    def csvFile(self, path, dialect='excel', *args, **kwargs):
        """ deprecated. """
        return self.textFile(path, cls=TextFileRDD, *args, **kwargs).fromCsv(dialect)

    def union(self, rdds):
        return UnionRDD(self, rdds)

    def accumulator(self, init=0, param=None):
        return Accumulator(init, param)

    def broadcast(self, v):
        self.start()
        return Broadcast.newBroadcast(v, self.master=='local')

    def start(self):
        if self.started:
            return

        env.start(True, isLocal=self.isLocal)
        self.scheduler.start()
        self.started = True
        atexit.register(self.stop)

        def handler(signm, frame):
            logger.error("got signal %d, exit now", signm)
            self.scheduler.shutdown()

        signal.signal(signal.SIGTERM, handler)
        signal.signal(signal.SIGHUP, handler)
        signal.signal(signal.SIGABRT, handler)
        signal.signal(signal.SIGQUIT, handler)

    def runJob(self, rdd, func, partitions=None, allowLocal=False):
        self.start()

        if partitions is None:
            partitions = range(len(rdd))
        return self.scheduler.runJob(rdd, func, partitions, allowLocal)

    def stop(self):
        if not self.started:
            return

        self.scheduler.stop()
        env.stop()
        self.started = False

    def __getstate__(self):
        raise ValueError("should not pickle ctx")
Example #8
0
class DparkContext:
    nextShuffleId = 0

    def __init__(self, master=None):

        if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
            from executor import run
            run()
            sys.exit(0)

        options = parse_options()
        self.master = master or options.master

        if self.master.startswith('local'):
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif self.master.startswith('process'):
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = True
        elif (self.master.startswith('mesos')
              or self.master.startswith('zoo')):
            if '://' not in self.master:
                self.master = os.environ.get('MESOS_MASTER')
                if not self.master:
                    raise Exception("invalid uri of mesos master: %s" %
                                    self.master)
            self.scheduler = MesosScheduler(self.master, options)
            self.isLocal = False
        else:
            raise Exception("invalid master option: %s" % self.master)

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)

        self.started = False

    def newShuffleId(self):
        self.nextShuffleId += 1
        return self.nextShuffleId

    def parallelize(self, seq, numSlices=None):
        if numSlices is None:
            numSlices = self.defaultParallelism
        return ParallelCollection(self, seq, numSlices)

    def makeRDD(self, seq, numSlices=None):
        return self.parallelize(seq, numSlices)

    def textFile(self,
                 path,
                 ext='',
                 followLink=True,
                 maxdepth=0,
                 cls=TextFileRDD,
                 *ka,
                 **kws):
        if isinstance(path, (list, tuple)):
            return self.union([
                self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path
            ])
        if os.path.isdir(path):
            paths = []
            for root, dirs, names in os.walk(path, followlinks=followLink):
                if maxdepth > 0:
                    depth = len(filter(None, root[len(path):].split('/'))) + 1
                    if depth > maxdepth:
                        break
                for n in sorted(names):
                    if n.endswith(ext) and not n.startswith('.'):
                        p = os.path.join(root, n)
                        if followLink or not os.path.islink(p):
                            paths.append(p)
                dirs.sort()
                for d in dirs[:]:
                    if d.startswith('.'):
                        dirs.remove(d)

            rdds = [cls(self, p, *ka, **kws) for p in paths]
            return self.union(rdds)
        else:
            return cls(self, path, *ka, **kws)

    def bzip2File(self, *args, **kwargs):
        return self.textFile(cls=BZip2FileRDD, *args, **kwargs)

    def csvFile(self, path, dialect='excel', *args, **kwargs):
        """ deprecated. """
        return self.textFile(path, cls=TextFileRDD, *args,
                             **kwargs).fromCsv(dialect)

    def union(self, rdds):
        return UnionRDD(self, rdds)

    def accumulator(self, init=0, param=None):
        return Accumulator(init, param)

    def broadcast(self, v):
        self.start()
        return Broadcast.newBroadcast(v, self.master == 'local')

    def start(self):
        if self.started:
            return

        env.start(True, isLocal=self.isLocal)
        self.scheduler.start()
        self.started = True
        atexit.register(self.stop)

        def handler(signm, frame):
            logger.error("got signal %d, exit now", signm)
            self.scheduler.shutdown()

        signal.signal(signal.SIGTERM, handler)
        signal.signal(signal.SIGHUP, handler)
        signal.signal(signal.SIGABRT, handler)
        signal.signal(signal.SIGQUIT, handler)

    def runJob(self, rdd, func, partitions=None, allowLocal=False):
        self.start()

        if partitions is None:
            partitions = range(len(rdd))
        return self.scheduler.runJob(rdd, func, partitions, allowLocal)

    def stop(self):
        if not self.started:
            return

        self.scheduler.stop()
        env.stop()
        self.started = False

    def __getstate__(self):
        raise ValueError("should not pickle ctx")
Example #9
0
class DparkContext(object):
    nextShuffleId = 0

    def __init__(self, master=None):
        self.master = master
        self.initialized = False
        self.started = False

    def init(self):
        if self.initialized:
            return

        #if 'MESOS_SLAVE_PID' in os.environ and 'DRUN_SIZE' not in os.environ:
        #    from executor import run
        #    run()
        #    sys.exit(0)

        options = parse_options()
        self.options = options
        master = self.master or options.master

        if master == 'local':
            self.scheduler = LocalScheduler()
            self.isLocal = True
        elif master == 'process':
            self.scheduler = MultiProcessScheduler(options.parallel)
            self.isLocal = False
        else:
            if master == 'mesos':
                master = os.environ.get('MESOS_MASTER')
                if not master:
                    raise Exception("invalid uri of mesos master: %s" % master)
            if master.startswith('mesos://'):
                if '@' in master:
                    master = master[master.rfind('@') + 1:]
                else:
                    master = master[master.rfind('//') + 2:]
            elif master.startswith('zoo://'):
                master = 'zk' + master[3:]

            if ':' not in master:
                master += ':5050'
            self.scheduler = MesosScheduler(master, options)
            self.isLocal = False

        self.master = master

        if options.parallel:
            self.defaultParallelism = options.parallel
        else:
            self.defaultParallelism = self.scheduler.defaultParallelism()
        self.defaultMinSplits = max(self.defaultParallelism, 2)

        self.initialized = True

    def newShuffleId(self):
        self.nextShuffleId += 1
        return self.nextShuffleId

    def parallelize(self, seq, numSlices=None):
        self.init()
        if numSlices is None:
            numSlices = self.defaultParallelism
        return ParallelCollection(self, seq, numSlices)

    def makeRDD(self, seq, numSlices=None):
        return self.parallelize(seq, numSlices)

    def textFile(self,
                 path,
                 ext='',
                 followLink=True,
                 maxdepth=0,
                 cls=TextFileRDD,
                 *ka,
                 **kws):
        if isinstance(path, (list, tuple)):
            return self.union([
                self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
                for p in path
            ])

        def create_rdd(cls, path, *ka, **kw):
            if cls is TextFileRDD:
                if path.endswith('.bz2'):
                    return BZip2FileRDD(self, path, *ka, **kw)
                elif path.endswith('.gz'):
                    return GZipFileRDD(self, path, *ka, **kw)
            return cls(self, path, *ka, **kw)

        if os.path.isdir(path):
            paths = []
            for root, dirs, names in os.walk(path, followlinks=followLink):
                if maxdepth > 0:
                    depth = len(filter(None, root[len(path):].split('/'))) + 1
                    if depth > maxdepth:
                        break
                for n in sorted(names):
                    if n.endswith(ext) and not n.startswith('.'):
                        p = os.path.join(root, n)
                        if followLink or not os.path.islink(p):
                            paths.append(p)
                dirs.sort()
                for d in dirs[:]:
                    if d.startswith('.'):
                        dirs.remove(d)

            rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
            return self.union(rdds)
        else:
            return create_rdd(cls, path, *ka, **kws)

    def bzip2File(self, *args, **kwargs):
        "deprecated"
        logger.warning(
            "bzip2File() is deprecated, use textFile('xx.bz2') instead")
        return self.textFile(cls=BZip2FileRDD, *args, **kwargs)

    def csvFile(self, path, dialect='excel', *args, **kwargs):
        """ deprecated. """
        return self.textFile(path, cls=TextFileRDD, *args,
                             **kwargs).fromCsv(dialect)

    def binaryFile(self, path, fmt=None, length=None, *args, **kwargs):
        return self.textFile(path,
                             cls=BinaryFileRDD,
                             fmt=fmt,
                             length=length,
                             *args,
                             **kwargs)

    def union(self, rdds):
        return UnionRDD(self, rdds)

    def zip(self, rdds):
        return ZippedRDD(self, rdds)

    def accumulator(self, init=0, param=None):
        return Accumulator(init, param)

    def broadcast(self, v):
        self.start()
        return Broadcast.newBroadcast(v, self.isLocal)

    def start(self):
        if self.started:
            return

        self.init()

        env.start(True, isLocal=self.isLocal)
        self.scheduler.start()
        self.started = True
        atexit.register(self.stop)

        def handler(signm, frame):
            logger.error("got signal %d, exit now", signm)
            self.scheduler.shutdown()

        signal.signal(signal.SIGTERM, handler)
        signal.signal(signal.SIGHUP, handler)
        signal.signal(signal.SIGABRT, handler)
        signal.signal(signal.SIGQUIT, handler)

        try:
            from rfoo.utils import rconsole
            rconsole.spawn_server(locals(), 0)
        except ImportError:
            pass

    def runJob(self, rdd, func, partitions=None, allowLocal=False):
        self.start()

        if partitions is None:
            partitions = range(len(rdd))
        try:
            gc.disable()
            for it in self.scheduler.runJob(rdd, func, partitions, allowLocal):
                yield it
        finally:
            gc.collect()
            gc.enable()

    def clear(self):
        if not self.started:
            return

        self.scheduler.clear()
        gc.collect()

    def stop(self):
        if not self.started:
            return

        self.scheduler.stop()
        env.stop()
        self.started = False

    def __getstate__(self):
        raise ValueError("should not pickle ctx")