def gen_broadcast_path(work_dirs, uuid):
    """Build a per-process file path for broadcast data.

    Chooses a work dir via decide_dir(), makes sure its 'broadcast'
    subdirectory exists, and names the file '<uuid>_<pid>' so that
    concurrent processes on the same host never collide.
    """
    base = os.path.join(decide_dir(work_dirs), 'broadcast')
    mkdir_p(base)
    file_name = '%s_%d' % (uuid, os.getpid())
    return os.path.join(base, file_name)
def __init__(self, tracker, path):
    """Remember the tracker and root path, creating the root dir.

    Directory creation is best-effort: a failure here (e.g. the dir
    already exists or a sibling process is racing to create it) is
    ignored and will surface later on first file access.
    """
    try:
        mkdir_p(path)
    # Was a bare `except:` — that also swallowed KeyboardInterrupt and
    # SystemExit. mkdir failures are OSError, so catch only that.
    except OSError:
        pass
    self.tracker = tracker
    self.root = path
def start(self):
    """Bring up the per-process environment: work dirs, tracker client,
    cache/map-output trackers, shuffle fetcher and broadcast guide.

    Idempotent — a second call returns immediately.
    """
    if self.started:
        return
    self.started = True
    logger.debug("start env in %s", os.getpid())
    # Make sure every configured work dir exists before anything uses it.
    for d in self.workdir:
        util.mkdir_p(d)
    # No tracker address registered yet -> we are the first/master
    # process: start a TrackerServer locally and publish its address.
    if 'TRACKER_ADDR' not in self.environ:
        from dpark.tracker import TrackerServer
        trackerServer = self.trackerServer = TrackerServer()
        self.trackerServer.start()
        self.register('TRACKER_ADDR', trackerServer.addr)
    # Everyone (including the master) talks to the tracker via a client.
    from dpark.tracker import TrackerClient
    addr = self.get('TRACKER_ADDR')
    self.trackerClient = TrackerClient(addr)
    from dpark.cache import CacheTracker
    self.cacheTracker = CacheTracker()
    from dpark.shuffle import MapOutputTracker
    self.mapOutputTracker = MapOutputTracker()
    from dpark.shuffle import ParallelShuffleFetcher
    # 2 parallel fetch threads; presumably a deliberate small default —
    # TODO(review) confirm against ParallelShuffleFetcher docs.
    self.shuffleFetcher = ParallelShuffleFetcher(2)
    # Start the broadcast guide manager only if none is announced yet.
    from dpark.broadcast import start_guide_manager, GUIDE_ADDR
    if GUIDE_ADDR not in self.environ:
        start_guide_manager()
    logger.debug("env started")
def _get_path(self):
    """Return the directory backing the mutable dict, creating it if needed.

    Strategy: use <workdir[0]>/mutable_dict if it already exists, or
    create it there when the filesystem still has >= 66% free blocks.
    Otherwise create the dir on the first alternative work dir and
    symlink it into place under workdir[0].

    Raises:
        RuntimeError: when no work dirs are configured, or none is usable.
    """
    dirs = env.get('WORKDIR')
    if not dirs:
        raise RuntimeError('No available workdir')
    path = os.path.join(dirs[0], 'mutable_dict')
    if os.path.exists(path):
        return path
    st = os.statvfs(dirs[0])
    ratio = st.f_bfree * 1.0 / st.f_blocks
    if ratio >= 0.66:
        mkdir_p(path)
        return path
    for d in dirs[1:]:
        p = os.path.join(d, 'mutable_dict')
        try:
            os.makedirs(p)
            os.symlink(p, path)
        except OSError:  # was `as e` with e unused
            pass
        # NOTE: returns on the FIRST alternative dir, created or not —
        # the loop never advances. Kept as-is to preserve behavior.
        return path
    raise RuntimeError('Cannot find suitable workdir')
def getOutputFile(cls, shuffle_id, input_id, output_id, datasize=0):
    """ datasize < 0: disk first
        datasize > 0: memfirst
        datasize = 0: read only, use link
    """
    shuffleDir = env.get('WORKDIR')
    # Canonical location lives under the primary work dir.
    path = os.path.join(shuffleDir[0], str(shuffle_id), str(input_id))
    mkdir_p(path)
    p = os.path.join(path, str(output_id))
    # Writers (datasize != 0) may spill to a secondary work dir.
    if datasize != 0 and len(shuffleDir) > 1:
        use_disk = datasize < 0
        if datasize > 0:
            # mem-first: fall back to disk only when the primary fs is
            # low on space (< max(datasize, 1GB) free or < 66% blocks free).
            st = os.statvfs(path)
            free = st.f_bfree * st.f_bsize
            ratio = st.f_bfree * 1.0 / st.f_blocks
            use_disk = free < max(datasize, 1 << 30) or ratio < 0.66
        if use_disk:
            # Real file goes to a random secondary dir; the canonical
            # path becomes a symlink pointing at it.
            d2 = os.path.join(random.choice(shuffleDir[1:]),
                              str(shuffle_id), str(input_id))
            mkdir_p(d2)
            p2 = os.path.join(d2, str(output_id))
            # Drop any stale file/link so os.symlink cannot fail with EEXIST.
            if os.path.exists(p):
                os.remove(p)
            os.symlink(p2, p)
            if os.path.islink(p2):
                os.unlink(p2)  # p == p2
            return p2
    return p
def registered(self, driver, executorInfo, frameworkInfo, agent_info):
    """Mesos executor registration callback: unpack the driver-side
    config from executorInfo.data and set up the executor process.

    Any failure is logged with a full traceback and re-raised.
    """
    try:
        global Script
        # The scheduler marshals this exact 9-tuple into executor data.
        (
            Script, cwd, python_path, osenv, self.parallel,
            out_logger, err_logger, logLevel, args
        ) = marshal.loads(decode_data(executorInfo.data))
        self.init_args = args
        sys.path = python_path
        os.environ.update(osenv)
        setproctitle('[Executor]' + Script)
        prefix = '[%s] ' % socket.gethostname()
        fmt = '%(asctime)-15s [%(levelname)s] [%(name)-9s] %(message)s'
        logging.basicConfig(format=fmt, level=logLevel)
        # Redirect fds 1/2 so task stdout/stderr is forwarded to the
        # configured loggers, prefixed with the hostname.
        r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
        sys.stdout = r1.pipe_wfile
        r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
        sys.stderr = r2.pipe_wfile
        # Best-effort chdir into the scheduler's cwd (may not exist here).
        if os.path.exists(cwd):
            try:
                os.chdir(cwd)
            except Exception as e:
                logger.warning('change cwd to %s failed: %s', cwd, e)
        else:
            logger.warning('cwd (%s) not exists', cwd)
        self.workdir = args['WORKDIR']
        main_workdir = self.workdir[0]
        root = os.path.dirname(main_workdir)
        if not os.path.exists(root):
            os.mkdir(root)
            os.chmod(root, 0o777)  # because umask
        mkdir_p(main_workdir)
        self._try_flock(main_workdir)
        args['SERVER_URI'] = startWebServer(main_workdir)
        if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
            setup_cleaner_process(self.workdir)
        # Background greenlets/threads: memory watchdog and reply loop.
        spawn(self.check_memory, driver)
        spawn(self.replier, driver)
        logger.debug('executor started at %s', agent_info.hostname)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        logger.error('init executor failed: %s', msg)
        raise
def start(self, isMaster, environ=None):
    """Start the dpark environment, as master or as worker.

    Master: creates timestamped work dirs under conf.DPARK_WORK_DIR,
    publishes SERVER_URI/WORKDIR/COMPRESS and starts a TrackerServer.
    Worker: adopts the given environ and connects to the master's
    tracker. Idempotent — a second call returns immediately.

    Args:
        isMaster: True for the driver process, False for workers.
        environ: settings published by the master (worker side only).
            Default changed from a shared mutable ``{}`` to ``None``;
            behavior is identical since it was only ever read.
    """
    if self.started:
        return
    if environ is None:
        environ = {}
    logger.debug("start env in %s: %s %s", os.getpid(), isMaster, environ)
    self.isMaster = isMaster
    if isMaster:
        roots = conf.DPARK_WORK_DIR
        if isinstance(roots, str):
            roots = roots.split(',')
        # Unique per-run dir name: timestamp-host-pid.
        name = '%s-%s-%d' % (time.strftime("%Y%m%d-%H%M%S"),
                             socket.gethostname(), os.getpid())
        self.workdir = [os.path.join(root, name) for root in roots]
        try:
            for d in self.workdir:
                util.mkdir_p(d)
        except OSError:
            # Only fatal when running locally; on a cluster the worker
            # hosts create their own dirs.
            if environ.get('is_local', False):
                raise  # bare raise keeps the original traceback (was `raise e`)
        self.environ['SERVER_URI'] = 'file://' + self.workdir[0]
        self.environ['WORKDIR'] = self.workdir
        self.environ['COMPRESS'] = util.COMPRESS
    else:
        self.environ.update(environ)
        # Master and worker must agree on the compression codec.
        if self.environ['COMPRESS'] != util.COMPRESS:
            raise Exception("no %s available" % self.environ['COMPRESS'])
    self.ctx = zmq.Context()
    from dpark.tracker import TrackerServer, TrackerClient
    if isMaster:
        self.trackerServer = TrackerServer()
        self.trackerServer.start()
        addr = self.trackerServer.addr
        env.register('TrackerAddr', addr)
    else:
        addr = env.get('TrackerAddr')
    self.trackerClient = TrackerClient(addr)
    from dpark.cache import CacheTracker
    self.cacheTracker = CacheTracker()
    from dpark.shuffle import LocalFileShuffle, MapOutputTracker
    LocalFileShuffle.initialize(isMaster)
    self.mapOutputTracker = MapOutputTracker()
    from dpark.shuffle import ParallelShuffleFetcher
    self.shuffleFetcher = ParallelShuffleFetcher(2)
    from dpark.broadcast import start_manager
    start_manager(isMaster)
    self.started = True
    logger.debug("env started")
def write(self, path):
    """Atomically persist this object as <path>/metadata.

    The pickle is written to a temp file in the same directory and then
    renamed into place, so a reader never observes a partial file.
    """
    mkdir_p(path)
    tmp_fd, tmp_name = tempfile.mkstemp(dir=path)
    target = os.path.join(path, 'metadata')
    try:
        with os.fdopen(tmp_fd, 'wb+') as out:
            out.write(pickle.dumps(self, -1))
        os.rename(tmp_name, target)
    finally:
        # After a successful rename the temp name no longer exists;
        # the OSError from that (or any earlier failure) is ignored.
        try:
            os.remove(tmp_name)
        except OSError:
            pass
def getOutputFile(cls, shuffleId, inputId, outputId, datasize=0):
    """Return the path for a shuffle output file, spilling to a
    secondary work dir when the primary filesystem is low on space.

    When spilling, the real file lives in a random secondary dir and
    the canonical path under shuffleDir[0] becomes a symlink to it.
    datasize > 0 indicates a write of roughly that many bytes.
    """
    path = os.path.join(cls.shuffleDir[0], str(shuffleId), str(inputId))
    mkdir_p(path)
    p = os.path.join(path, str(outputId))
    if datasize > 0 and len(cls.shuffleDir) > 1:
        st = os.statvfs(path)
        free = st.f_bfree * st.f_bsize
        ratio = st.f_bfree * 1.0 / st.f_blocks
        # Spill when < max(datasize, 1GB) bytes or < 66% blocks free.
        if free < max(datasize, 1 << 30) or ratio < 0.66:
            d2 = os.path.join(random.choice(cls.shuffleDir[1:]),
                              str(shuffleId), str(inputId))
            mkdir_p(d2)
            p2 = os.path.join(d2, str(outputId))
            # Fix: remove any stale file/link first, otherwise
            # os.symlink raises EEXIST (matches the newer variant of
            # this method elsewhere in the file).
            if os.path.exists(p):
                os.remove(p)
            os.symlink(p2, p)
            if os.path.islink(p2):
                os.unlink(p2)  # p == p2
            return p2
    return p
def getOutputFile(cls, shuffle_id, input_id, output_id, datasize=0):
    """Return the path for a shuffle output file.

    The canonical path lives under WORKDIR[0]; when writing
    (datasize > 0) and the primary filesystem is low on space, the real
    file is placed in a random secondary work dir and the canonical
    path becomes a symlink to it.
    """
    shuffleDir = env.get('WORKDIR')
    path = os.path.join(shuffleDir[0], str(shuffle_id), str(input_id))
    mkdir_p(path)
    p = os.path.join(path, str(output_id))
    if datasize > 0 and len(shuffleDir) > 1:
        # datasize > 0 means its writing
        st = os.statvfs(path)
        free = st.f_bfree * st.f_bsize
        ratio = st.f_bfree * 1.0 / st.f_blocks
        # Spill when < max(datasize, 1GB) bytes free or < 66% blocks free.
        if free < max(datasize, 1 << 30) or ratio < 0.66:
            d2 = os.path.join(random.choice(shuffleDir[1:]),
                              str(shuffle_id), str(input_id))
            mkdir_p(d2)
            p2 = os.path.join(d2, str(output_id))
            # Clear any stale entry so os.symlink cannot fail with EEXIST.
            if os.path.exists(p):
                os.remove(p)
            os.symlink(p2, p)
            if os.path.islink(p2):
                os.unlink(p2)  # p == p2
            return p2
    return p
def _get_path(self):
    """Return the directory backing the mutable dict, creating it if needed.

    Uses <workdir[0]>/mutable_dict when it exists or when the primary
    filesystem has >= 66% free blocks; otherwise creates the dir on the
    first alternative work dir and symlinks it into place.

    Raises:
        RuntimeError: when no work dirs are configured.
    """
    dirs = env.get('WORKDIR')
    if not dirs:
        raise RuntimeError('No available workdir')
    path = os.path.join(dirs[0], 'mutable_dict')
    if os.path.exists(path):
        return path
    st = os.statvfs(dirs[0])
    ratio = st.f_bfree * 1.0 / st.f_blocks
    if ratio >= 0.66:
        mkdir_p(path)
        return path
    for d in dirs[1:]:
        p = os.path.join(d, 'mutable_dict')
        try:
            os.makedirs(p)
            os.symlink(p, path)
        # Fix: `except OSError, e:` is Python-2-only syntax and fails to
        # parse under Python 3 (the rest of the file uses `as e` / 0o777).
        except OSError:
            pass
        # NOTE: returns on the FIRST alternative dir, created or not —
        # kept as-is to preserve the original behavior.
        return path
def getExecutorInfo(self, framework_id):
    """Build the mesos_pb2.ExecutorInfo describing the dpark executor.

    Configures the executor command (self or executor.py), UID/GID
    environment, optional Docker container with volume mounts, the
    memory resource, and marshals the scheduler config into info.data.
    """
    info = mesos_pb2.ExecutorInfo()
    # hasattr guards keep this working across mesos proto versions.
    if hasattr(info, 'framework_id'):
        info.framework_id.value = framework_id
    if self.use_self_as_exec:
        info.command.value = os.path.abspath(sys.argv[0])
        info.executor_id.value = sys.argv[0]
    else:
        info.command.value = '%s %s' % (
            sys.executable,
            os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'executor.py')))
        info.executor_id.value = "default"
    # Run the executor as the submitting user, not the agent's user.
    v = info.command.environment.variables.add()
    v.name = 'UID'
    v.value = str(os.getuid())
    v = info.command.environment.variables.add()
    v.name = 'GID'
    v.value = str(os.getgid())
    if self.options.image and hasattr(info, 'container'):
        info.container.type = mesos_pb2.ContainerInfo.DOCKER
        info.container.docker.image = self.options.image
        # Mount passwd/group read-only so UID/GID resolve in-container.
        for path in ['/etc/passwd', '/etc/group']:
            v = info.container.volumes.add()
            v.host_path = v.container_path = path
            v.mode = mesos_pb2.Volume.RO
        for path in conf.MOOSEFS_MOUNT_POINTS:
            v = info.container.volumes.add()
            v.host_path = v.container_path = path
            v.mode = mesos_pb2.Volume.RW
        for path in conf.DPARK_WORK_DIR.split(','):
            v = info.container.volumes.add()
            v.host_path = v.container_path = path
            v.mode = mesos_pb2.Volume.RW
        if self.options.volumes:
            # User volumes: host:container:mode | host:container | container
            for volume in self.options.volumes.split(','):
                fields = volume.split(':')
                if len(fields) == 3:
                    host_path, container_path, mode = fields
                    mode = (mesos_pb2.Volume.RO if mode.lower() == 'ro'
                            else mesos_pb2.Volume.RW)
                elif len(fields) == 2:
                    host_path, container_path = fields
                    mode = mesos_pb2.Volume.RW
                elif len(fields) == 1:
                    container_path, = fields
                    host_path = ''
                    mode = mesos_pb2.Volume.RW
                else:
                    raise Exception("cannot parse volume %s", volume)
                # NOTE(review): called even when host_path == '' in the
                # single-field case — looks suspicious, confirm mkdir_p('')
                # is a no-op before changing.
                mkdir_p(host_path)
                v = info.container.volumes.add()
                v.container_path = container_path
                v.mode = mode
                if host_path:
                    v.host_path = host_path
    mem = info.resources.add()
    mem.name = 'mem'
    # Fix: use the proto enum instead of the hard-coded literal 0 (the
    # old line's own comment named it); matches the sibling version of
    # this method elsewhere in the file. SCALAR == 0, so the wire value
    # is unchanged.
    mem.type = mesos_pb2.Value.SCALAR
    mem.scalar.value = EXECUTOR_MEMORY
    Script = os.path.realpath(sys.argv[0])
    if hasattr(info, 'name'):
        info.name = Script
    # The executor's registered() unpacks this exact 9-tuple.
    info.data = marshal.dumps(
        (Script, os.getcwd(), sys.path, dict(os.environ),
         self.task_per_node, self.out_logger, self.err_logger,
         self.logLevel, env.environ))
    return info
def getExecutorInfo(self, framework_id):
    """Build the pymesos Dict-based ExecutorInfo for the dpark executor.

    Configures the executor command (self or executor.py), UID/GID
    environment, optional Docker container with volume mounts, the
    mem/cpus resources, and encodes the scheduler config into info.data.
    """
    info = Dict()
    info.framework_id.value = framework_id
    if self.use_self_as_exec:
        info.command.value = os.path.abspath(sys.argv[0])
        info.executor_id.value = sys.argv[0]
    else:
        info.command.value = '%s %s' % (
            sys.executable,
            os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'executor.py')))
        info.executor_id.value = 'default'
    # Run the executor as the submitting user, not the agent's user.
    info.command.environment.variables = variables = []
    v = Dict()
    variables.append(v)
    v.name = 'UID'
    v.value = str(os.getuid())
    v = Dict()
    variables.append(v)
    v.name = 'GID'
    v.value = str(os.getgid())
    if self.options.image:
        info.container.type = 'DOCKER'
        info.container.docker.image = self.options.image
        info.container.volumes = volumes = []
        # Mount passwd/group read-only so UID/GID resolve in-container.
        for path in ['/etc/passwd', '/etc/group']:
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RO'
        for path in conf.MOOSEFS_MOUNT_POINTS:
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RW'
        for path in conf.DPARK_WORK_DIR.split(','):
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RW'
        if self.options.volumes:
            # User volumes: host:container:mode | host:container | container
            for volume in self.options.volumes.split(','):
                fields = volume.split(':')
                if len(fields) == 3:
                    host_path, container_path, mode = fields
                    mode = mode.upper()
                    assert mode in ('RO', 'RW')
                elif len(fields) == 2:
                    host_path, container_path = fields
                    mode = 'RW'
                elif len(fields) == 1:
                    container_path, = fields
                    host_path = ''
                    mode = 'RW'
                else:
                    raise Exception('cannot parse volume %s', volume)
                # NOTE(review): called even when host_path == '' in the
                # single-field case — confirm mkdir_p('') is harmless.
                mkdir_p(host_path)
                v = Dict()
                volumes.append(v)
                v.container_path = container_path
                v.mode = mode
                if host_path:
                    v.host_path = host_path
    info.resources = resources = []
    mem = Dict()
    resources.append(mem)
    mem.name = 'mem'
    mem.type = 'SCALAR'
    mem.scalar.value = EXECUTOR_MEMORY
    cpus = Dict()
    resources.append(cpus)
    cpus.name = 'cpus'
    cpus.type = 'SCALAR'
    cpus.scalar.value = EXECUTOR_CPUS
    Script = os.path.realpath(sys.argv[0])
    info.name = Script
    # The executor's registered() unpacks this exact 9-tuple.
    info.data = encode_data(
        marshal.dumps((Script, os.getcwd(), sys.path, dict(os.environ),
                       self.task_per_node, self.out_logger,
                       self.err_logger, self.logLevel, env.environ)))
    return info
try: os.chdir(cwd) except Exception, e: logger.warning("change cwd to %s failed: %s", cwd, e) else: logger.warning("cwd (%s) not exists", cwd) self.workdir = args['WORKDIR'] main_workdir = self.workdir[0] root = os.path.dirname(main_workdir) if not os.path.exists(root): os.mkdir(root) os.chmod(root, 0777) # because umask mkdir_p(main_workdir) self._try_flock(main_workdir) args['SERVER_URI'] = startWebServer(main_workdir) if 'MESOS_SLAVE_PID' in os.environ: # make unit test happy setup_cleaner_process(self.workdir) spawn(self.check_memory, driver) logger.debug("executor started at %s", slaveInfo.hostname) except Exception, e: import traceback msg = traceback.format_exc() logger.error("init executor failed: %s", msg) raise
def get_tmp(cls):
    """Return a fresh temp-file path for shuffle data.

    Prefers a randomly chosen secondary work dir when one is
    configured, otherwise falls back to the primary one; the chosen
    dir is created on demand.
    """
    workdirs = env.get('WORKDIR')
    alternatives = workdirs[1:]
    chosen = random.choice(alternatives) if alternatives else workdirs[0]
    mkdir_p(chosen)
    return os.path.join(chosen, 'shuffle-%s.tmp' % uuid.uuid4().hex)
def registered(self, driver, executorInfo, frameworkInfo, agent_info):
    """pymesos executor registration callback: unpack the driver-side
    config from executorInfo.data and set up the executor process.

    Any failure is logged with a full traceback and re-raised.
    """
    try:
        global Script
        # The scheduler marshals this exact 10-tuple into executor data.
        (Script, cwd, python_path, osenv, self.parallel, out_logger,
         err_logger, logLevel, use_color,
         args) = marshal.loads(decode_data(executorInfo.data))
        sys.path = python_path
        os.environ.update(osenv)
        setproctitle('[Executor]' + Script)
        prefix = formatter_message(
            '{MAGENTA}[%s]{RESET} ' % socket.gethostname().ljust(10),
            use_color)
        init_dpark_logger(logLevel, use_color=use_color)
        logging.root.setLevel(logLevel)
        # Redirect fds 1/2 so task stdout/stderr is forwarded to the
        # configured loggers, prefixed with the (colored) hostname.
        r1 = self.stdout_redirect = Redirect(1, out_logger, prefix)
        sys.stdout = r1.pipe_wfile
        r2 = self.stderr_redirect = Redirect(2, err_logger, prefix)
        sys.stderr = r2.pipe_wfile
        # Best-effort chdir into the scheduler's cwd (may not exist here).
        if os.path.exists(cwd):
            try:
                os.chdir(cwd)
            except Exception as e:
                logger.warning('change cwd to %s failed: %s', cwd, e)
        else:
            logger.warning('cwd (%s) not exists', cwd)
        self.workdir = args['WORKDIR']
        main_workdir = self.workdir[0]
        root = os.path.dirname(main_workdir)
        if not os.path.exists(root):
            os.mkdir(root)
            os.chmod(root, 0o777)  # because umask
        mkdir_p(main_workdir)
        self._try_flock(main_workdir)
        args['SERVER_URI'] = startWebServer(main_workdir)
        if 'MESOS_SLAVE_PID' in os.environ:  # make unit test happy
            setup_cleaner_process(self.workdir)
        # Background tasks: memory watchdog and reply loop.
        spawn(self.check_memory, driver)
        spawn(self.replier, driver)
        # Publish the driver-provided settings process-wide, then start
        # the broadcast download manager.
        env.environ.update(args)
        from dpark.broadcast import start_download_manager
        start_download_manager()
        logger.debug('executor started at %s', agent_info.hostname)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        logger.error('init executor failed: %s', msg)
        raise
def getExecutorInfo(self, framework_id):
    """Build the mesos_pb2.ExecutorInfo describing the dpark executor.

    Configures the executor command (self or executor.py), UID/GID
    environment, optional Docker container with volume mounts, the
    mem/cpus resources, and marshals the scheduler config into info.data.
    """
    info = mesos_pb2.ExecutorInfo()
    # hasattr guards keep this working across mesos proto versions.
    if hasattr(info, 'framework_id'):
        info.framework_id.value = framework_id
    if self.use_self_as_exec:
        info.command.value = os.path.abspath(sys.argv[0])
        info.executor_id.value = sys.argv[0]
    else:
        info.command.value = '%s %s' % (
            sys.executable,
            os.path.abspath(os.path.join(os.path.dirname(__file__),
                                         'executor.py'))
        )
        info.executor_id.value = "default"
    # Run the executor as the submitting user, not the agent's user.
    v = info.command.environment.variables.add()
    v.name = 'UID'
    v.value = str(os.getuid())
    v = info.command.environment.variables.add()
    v.name = 'GID'
    v.value = str(os.getgid())
    if self.options.image and hasattr(info, 'container'):
        info.container.type = mesos_pb2.ContainerInfo.DOCKER
        info.container.docker.image = self.options.image
        # Mount passwd/group read-only so UID/GID resolve in-container.
        for path in ['/etc/passwd', '/etc/group']:
            v = info.container.volumes.add()
            v.host_path = v.container_path = path
            v.mode = mesos_pb2.Volume.RO
        for path in conf.MOOSEFS_MOUNT_POINTS:
            v = info.container.volumes.add()
            v.host_path = v.container_path = path
            v.mode = mesos_pb2.Volume.RW
        for path in conf.DPARK_WORK_DIR.split(','):
            v = info.container.volumes.add()
            v.host_path = v.container_path = path
            v.mode = mesos_pb2.Volume.RW
        if self.options.volumes:
            # User volumes: host:container:mode | host:container | container
            for volume in self.options.volumes.split(','):
                fields = volume.split(':')
                if len(fields) == 3:
                    host_path, container_path, mode = fields
                    mode = mesos_pb2.Volume.RO if mode.lower() == 'ro' else mesos_pb2.Volume.RW
                elif len(fields) == 2:
                    host_path, container_path = fields
                    mode = mesos_pb2.Volume.RW
                elif len(fields) == 1:
                    container_path, = fields
                    host_path = ''
                    mode = mesos_pb2.Volume.RW
                else:
                    raise Exception("cannot parse volume %s", volume)
                # NOTE(review): called even when host_path == '' in the
                # single-field case — confirm mkdir_p('') is harmless.
                mkdir_p(host_path)
                v = info.container.volumes.add()
                v.container_path = container_path
                v.mode = mode
                if host_path:
                    v.host_path = host_path
    mem = info.resources.add()
    mem.name = 'mem'
    mem.type = mesos_pb2.Value.SCALAR
    mem.scalar.value = EXECUTOR_MEMORY
    cpus = info.resources.add()
    cpus.name = 'cpus'
    cpus.type = mesos_pb2.Value.SCALAR
    cpus.scalar.value = EXECUTOR_CPUS
    Script = os.path.realpath(sys.argv[0])
    if hasattr(info, 'name'):
        info.name = Script
    # The executor's registered() unpacks this exact 9-tuple.
    info.data = marshal.dumps((Script, os.getcwd(), sys.path,
                               dict(os.environ), self.task_per_node,
                               self.out_logger, self.err_logger,
                               self.logLevel, env.environ))
    return info
def __init__(self, tracker, path):
    """Create the root directory and remember the tracker and path."""
    mkdir_p(path)
    self.root = path
    self.tracker = tracker
def getExecutorInfo(self, framework_id):
    """Build the pymesos Dict-based ExecutorInfo for the dpark executor.

    Configures the executor command (self or executor.py), UID/GID
    environment, optional Docker container with volume mounts, the
    mem/cpus resources, and encodes the scheduler config into info.data.
    """
    info = Dict()
    info.framework_id.value = framework_id
    if self.use_self_as_exec:
        info.command.value = os.path.abspath(sys.argv[0])
        info.executor_id.value = sys.argv[0]
    else:
        info.command.value = '%s %s' % (
            sys.executable,
            os.path.abspath(
                os.path.join(
                    os.path.dirname(__file__),
                    'executor.py'))
        )
        info.executor_id.value = 'default'
    # Run the executor as the submitting user, not the agent's user.
    info.command.environment.variables = variables = []
    v = Dict()
    variables.append(v)
    v.name = 'UID'
    v.value = str(os.getuid())
    v = Dict()
    variables.append(v)
    v.name = 'GID'
    v.value = str(os.getgid())
    if self.options.image:
        info.container.type = 'DOCKER'
        info.container.docker.image = self.options.image
        info.container.volumes = volumes = []
        # Mount passwd/group read-only so UID/GID resolve in-container.
        for path in ['/etc/passwd', '/etc/group']:
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RO'
        for path in conf.MOOSEFS_MOUNT_POINTS:
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RW'
        for path in conf.DPARK_WORK_DIR.split(','):
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RW'
        if self.options.volumes:
            # User volumes: host:container:mode | host:container | container
            for volume in self.options.volumes.split(','):
                fields = volume.split(':')
                if len(fields) == 3:
                    host_path, container_path, mode = fields
                    mode = mode.upper()
                    assert mode in ('RO', 'RW')
                elif len(fields) == 2:
                    host_path, container_path = fields
                    mode = 'RW'
                elif len(fields) == 1:
                    container_path, = fields
                    host_path = ''
                    mode = 'RW'
                else:
                    raise Exception('cannot parse volume %s', volume)
                # NOTE(review): called even when host_path == '' in the
                # single-field case — confirm mkdir_p('') is harmless.
                mkdir_p(host_path)
                v = Dict()
                volumes.append(v)
                v.container_path = container_path
                v.mode = mode
                if host_path:
                    v.host_path = host_path
    info.resources = resources = []
    mem = Dict()
    resources.append(mem)
    mem.name = 'mem'
    mem.type = 'SCALAR'
    mem.scalar.value = EXECUTOR_MEMORY
    cpus = Dict()
    resources.append(cpus)
    cpus.name = 'cpus'
    cpus.type = 'SCALAR'
    cpus.scalar.value = EXECUTOR_CPUS
    Script = os.path.realpath(sys.argv[0])
    info.name = Script
    # The executor's registered() unpacks this exact 9-tuple.
    info.data = encode_data(marshal.dumps(
        (
            Script, os.getcwd(), sys.path, dict(os.environ),
            self.task_per_node, self.out_logger, self.err_logger,
            self.logLevel, env.environ
        )
    ))
    return info