def __init__(self, autodetect=True, packages=None, ip=None, env=None,
             channels=None, conda_pars=None, **kwargs):
    ip = ip or socket.gethostbyname(socket.gethostname())
    self.env = env
    self.application_master_container = None
    self.app_id = None
    self.channels = channels
    self.conda_pars = conda_pars
    try:
        self.local_cluster = LocalCluster(n_workers=0, ip=ip)
    except (OSError, IOError):
        self.local_cluster = LocalCluster(n_workers=0, scheduler_port=0,
                                          ip=ip)
    self.packages = list(
        sorted(unique((packages or []) + global_packages, key=first_word)))
    self.knit = Knit(autodetect=autodetect, **kwargs)
    atexit.register(self.stop)
def setup_cluster(config):
    if 'scheduler.ip' not in config:
        scheduler_ip = socket.gethostbyname(socket.gethostname())
    else:
        scheduler_ip = config['scheduler.ip']
    cluster = LocalCluster(n_workers=0, ip=scheduler_ip,
                           port=config['scheduler.port'],
                           diagnostics_port=config['scheduler.bokeh_port'])
    if hdfs3 is not None:
        hdfs = hdfs3.HDFileSystem(host=config.get('hdfs.host'),
                                  port=config.get('hdfs.port'))
    else:
        hdfs = None
    knit = Knit(hdfs=hdfs, hdfs_home=config.get('hdfs.home'),
                rm=config.get('yarn.host'), rm_port=config.get('yarn.port'))

    command = ('$PYTHON_BIN $CONDA_PREFIX/bin/dask-worker '
               '--nprocs={nprocs:d} '
               '--nthreads={nthreads:d} '
               '--memory-limit={memory_limit:d} '
               '{scheduler_address} '
               '> /tmp/worker-log.out '
               '2> /tmp/worker-log.err').format(
                   nprocs=config['worker.processes'],
                   nthreads=config['worker.threads_per_process'],
                   memory_limit=int(config['worker.memory'] * 1e6),
                   scheduler_address=cluster.scheduler.address)

    app_id = knit.start(command,
                        env=config['cluster.env'],
                        num_containers=config['cluster.count'],
                        virtual_cores=config['worker.cpus'],
                        memory=config['worker.memory'],
                        queue=config['yarn.queue'],
                        app_name='dask',
                        checks=False)

    # Add a few missing fields to config before writing to disk
    config2 = config.copy()

    # The ip is optional, the port may be chosen dynamically
    config2['scheduler.ip'] = cluster.scheduler.ip
    config2['scheduler.port'] = cluster.scheduler.port

    # Fill in optional parameters with auto-detected versions
    config2['yarn.host'] = knit.conf['rm']
    config2['yarn.port'] = knit.conf['rm_port']
    config2['hdfs.home'] = knit.hdfs_home

    # Add in runtime information like app_id and daemon pid
    config2['application.id'] = app_id
    config2['application.pid'] = os.getpid()

    return cluster, knit, config2
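# A usage sketch for setup_cluster() above. The flat config keys mirror the
# ones the function reads; the concrete values and the output filename are
# illustrative assumptions, not defaults of any real deployment.
import json

from distributed import Client

example_config = {
    'scheduler.port': 8786,
    'scheduler.bokeh_port': 8787,
    'cluster.env': 'dask-env.zip',
    'cluster.count': 4,
    'worker.processes': 1,
    'worker.threads_per_process': 2,
    'worker.cpus': 2,
    'worker.memory': 2048,        # in MB; converted to bytes for --memory-limit
    'yarn.queue': 'default',
}

cluster, knit, runtime_config = setup_cluster(example_config)
with open('dask-yarn-state.json', 'w') as f:
    json.dump(runtime_config, f)  # persist the filled-in config, as the comments suggest

client = Client(cluster.scheduler.address)   # submit dask work to the YARN workers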
def test_password_pass():
    with pytest.raises(KnitException):
        k = Knit(pars={
            'hadoop.http.authentication.type': 'simple',
            'hadoop.http.authentication.simple.anonymous.allowed': 'false'})
    k = Knit(pars={
        'hadoop.http.authentication.type': 'simple',
        'hadoop.http.authentication.simple.anonymous.allowed': 'false'},
        password='hello')
    assert k.yarn_api.auth[1] == 'hello'
def k():
    knitter = Knit(nn='localhost', rm='localhost', nn_port=8020,
                   rm_port=8088, replication_factor=1)
    try:
        yield knitter
    finally:
        # always kill, to avoid follow-on resource pressure
        try:
            knitter.kill()
        except Exception:
            pass
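# A hypothetical test sketching how the ``k`` generator above could be used,
# assuming it is registered as a pytest fixture. The command and resource
# numbers are illustrative and rely only on Knit methods that appear
# elsewhere in this file (start, wait_for_completion, kill).
def test_simple_command(k):
    app_id = k.start('env', num_containers=1, memory=128, checks=False)
    assert app_id                      # YARN accepted the application
    if not k.wait_for_completion(30):
        k.kill()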
class YarnPool(RemotePool):
    """The YARN pool manager."""

    def __init__(self, processes=None, port=0, authkey=None):
        super(YarnPool, self).__init__(processes=processes,
                                       port=port,
                                       authkey=authkey,
                                       workerscript=None)
        self.stopping = False
        self.knit = Knit(autodetect=True)

        cmd = ('python remoteworker.py --host {} --port {} --key {}'
               .format(socket.gethostname(),
                       self.server.address[1],
                       self.authkey))
        self.app_id = self.knit.start(
            cmd,
            num_containers=self._processes,
            files=['joblibhadoop/yarn/remoteworker.py', ])

        self.thread = Thread(target=self._monitor_appid)
        self.thread.daemon = True
        self.thread.start()

    def _start_remote_worker(self, pid):
        remote_worker = RemoteWorker(pid)
        self._pool.append(remote_worker)

    def _monitor_appid(self):
        while not self.stopping:
            try:
                status = self.knit.status()
                yarn_state = status['app']['state']
                print("YARN application is {}".format(yarn_state))
                # if yarn_state == 'FINISHED':
                #     self.terminate()
            except Exception:
                pass
            sleep(1)

    def terminate(self):
        self.stopping = True
        super(YarnPool, self).terminate()
        self.knit.kill()

    def __reduce__(self):
        pass
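# A usage sketch for YarnPool above. It assumes RemotePool (not shown here)
# provides the multiprocessing.Pool-style map interface; the worker count
# and work function are illustrative.
def square(x):
    return x * x


pool = YarnPool(processes=4)      # launches remoteworker.py containers on YARN
try:
    print(pool.map(square, range(10)))
finally:
    pool.terminate()              # stops the monitor thread and kills the YARN app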
class YarnPool(RemotePool):
    def __init__(self, processes=None, port=0, authkey=None):
        super(YarnPool, self).__init__(processes=processes,
                                       port=port,
                                       authkey=authkey,
                                       workerscript=None)
        self.stopping = False
        from knit import Knit
        self.k = Knit(autodetect=True)
        cmd = "python remoteworker.py --port %d --key %s" % (
            self.s.address[1], self.authkey)
        self.app_id = self.k.start(cmd, self._processes,
                                   files=['remoteworker.py', ])
        self.t = Thread(target=self._monitor_appid)
        self.t.daemon = True
        self.t.start()

    def _start_remote_worker(self, pid):
        rw = RemoteWorker(pid)
        self._pool.append(rw)

    def spinning_cursor(self):
        while True:
            for cursor in '|/-\\':
                yield cursor

    def _monitor_appid(self):
        cursor = self.spinning_cursor()
        while not self.stopping:
            try:
                status = self.k.status(self.app_id)
                yarnState = status['app']['state']
                print("YARN application is", yarnState)
            except Exception:
                pass
            sleep(1)

    def terminate(self):
        self.stopping = True
        super(YarnPool, self).terminate()
        self.k.kill(self.app_id)
def __init__(self, processes=None, port=0, authkey=None):
    super(YarnPool, self).__init__(processes=processes,
                                   port=port,
                                   authkey=authkey,
                                   workerscript=None)
    self.stopping = False
    self.knit = Knit(autodetect=True)

    cmd = ('python remoteworker.py --host {} --port {} --key {}'
           .format(socket.gethostname(),
                   self.server.address[1],
                   self.authkey))
    self.app_id = self.knit.start(
        cmd,
        num_containers=self._processes,
        files=['joblibhadoop/yarn/remoteworker.py', ])

    self.thread = Thread(target=self._monitor_appid)
    self.thread.daemon = True
    self.thread.start()
def test_hdfs_home():
    hdfs3 = pytest.importorskip('hdfs3')
    hdfs = hdfs3.HDFileSystem()
    d = '/tmp/test'
    try:
        hdfs.mkdir(d)
        k = Knit(nn='localhost', rm='localhost', nn_port=8020, rm_port=8088,
                 replication_factor=1, hdfs_home=d)
        env_zip = k.create_env(env_name='dev', packages=['python=2.7'],
                               remove=True)
        k.start('env', files=[env_zip], memory=128)
        assert d + '/.knitDeps' in hdfs.ls(d, False)
        assert d + "/.knitDeps/knit-1.0-SNAPSHOT.jar" in hdfs.ls(
            d + '/.knitDeps', False)
        assert d + "/.knitDeps/dev.zip" in hdfs.ls(d + '/.knitDeps', False)
        if not k.wait_for_completion(30):
            k.kill()
    finally:
        hdfs.rm(d, True)
        k.kill()
def clear():
    c = CondaCreator()
    try:
        yield
    finally:
        shutil.rmtree(c.conda_envs)
        try:
            k = Knit()
            import hdfs3
            hdfs = hdfs3.HDFileSystem()
            hdfs.rm(k.hdfs_home, recursive=True)
        except Exception:
            pass
def __init__(self, processes=None, port=0, authkey=None):
    super(YarnPool, self).__init__(processes=processes,
                                   port=port,
                                   authkey=authkey,
                                   workerscript=None)
    self.stopping = False
    from knit import Knit
    self.k = Knit(autodetect=True)
    cmd = "python remoteworker.py --port %d --key %s" % (
        self.s.address[1], self.authkey)
    self.app_id = self.k.start(cmd, self._processes,
                               files=['remoteworker.py', ])
    self.t = Thread(target=self._monitor_appid)
    self.t.daemon = True
    self.t.start()
class DaskYARNCluster(object):
    """
    Implements a dask cluster with YARN containers running the worker
    processes.

    A dask scheduler is started locally upon instantiation, but you must
    call ``start()`` to initiate the building of containers by YARN.

    Parameters
    ----------
    nn, nn_port, rm, rm_port, user, autodetect: see knit.Knit
    env: str or None
        If provided, the path of a zipped conda env to put in containers
    packages: list of str
        Packages to install in the env to provide to containers *if* env
        is None. Uses conda spec for pinning versions. dask and distributed
        will always be included.
    channels: list of str
        If building an environment, pass these extra channels to conda
        using ``-c`` (i.e., in addition to, and with higher priority than,
        any system default channels).
    conda_pars: dict
        Things to pass to CondaCreator
    ip: IP-like string or None
        Address for the scheduler to listen on. If not given, uses the
        system IP.
    """

    def __init__(self, autodetect=True, packages=None, ip=None, env=None,
                 channels=None, conda_pars=None, **kwargs):
        ip = ip or socket.gethostbyname(socket.gethostname())
        self.env = env
        self.application_master_container = None
        self.app_id = None
        self.channels = channels
        self.conda_pars = conda_pars
        try:
            self.local_cluster = LocalCluster(n_workers=0, ip=ip)
        except (OSError, IOError):
            self.local_cluster = LocalCluster(n_workers=0, scheduler_port=0,
                                              ip=ip)
        self.packages = list(
            sorted(unique((packages or []) + global_packages,
                          key=first_word)))
        self.knit = Knit(autodetect=autodetect, **kwargs)
        atexit.register(self.stop)

    @property
    def scheduler_address(self):
        return self.local_cluster.scheduler_address

    def start(self, n_workers=1, cpus=1, memory=2048, checks=True, **kwargs):
        """
        Initiate workers. If required, the environment is first built and
        uploaded to HDFS, and then a YARN application with the required
        number of containers is created.

        Parameters
        ----------
        n_workers: int
            How many containers to create
        cpus: int=1
            How many CPU cores are available in each container
        memory: int=2048
            Memory available to each dask worker (in MB)
        checks: bool=True
            Whether to run pre-flight checks before submitting app to YARN
        kwargs: passed to ``Knit.start()``

        Returns
        -------
        YARN application ID.
        """
        c = CondaCreator(channels=self.channels, **(self.conda_pars or {}))
        if self.env is None:
            env_name = 'dask-' + sha1('-'.join(
                self.packages).encode()).hexdigest()
            env_path = os.path.join(c.conda_envs, env_name)

            if os.path.exists(env_path + '.zip'):
                # zipfile exists, ready to upload
                self.env = env_path + '.zip'
            elif os.path.exists(env_path):
                # environment exists, can zip and upload
                c.zip_env(env_path)
                self.env = env_path + '.zip'
            else:
                # create env from scratch
                self.env = c.create_env(env_name=env_name,
                                        packages=self.packages)
        elif not self.env.endswith('.zip'):
            # given env directory, so zip it
            c.zip_env(self.env)
            self.env = self.env + '.zip'

        # TODO: memory should not be total available?
        command = '$PYTHON_BIN $CONDA_PREFIX/bin/dask-worker --nprocs=1 ' \
                  '--nthreads=%d --memory-limit=%d %s > ' \
                  '/tmp/worker-log.out 2> /tmp/worker-log.err' % (
                      cpus, memory * 1e6,
                      self.local_cluster.scheduler.address)

        app_id = self.knit.start(command, env=self.env,
                                 num_containers=n_workers,
                                 virtual_cores=cpus, memory=memory,
                                 checks=checks, **kwargs)
        self.app_id = app_id
        return app_id

    def remove_worker(self, container_id):
        """
        Stop worker and remove container

        Parameters
        ----------
        container_id

        Returns
        -------
        None
        """
        self.knit.remove_containers(container_id)

    @property
    def workers(self):
        """
        list of running container ids
        """
        # remove container ...00001 -- this is the applicationMaster's
        # container and should not be removed or counted as a worker
        containers = list(self.knit.get_container_statuses())
        containers.sort()
        self.application_master_container = containers.pop(0)
        return containers

    @gen.coroutine
    def _start(self):
        pass

    def stop(self):
        """Kill the YARN application and all workers"""
        if self.knit:
            self.knit.kill()

    def add_workers(self, n_workers=1, cpus=1, memory=2048):
        """
        Non-blocking function to ask YARN for more containers/dask-workers

        Parameters
        ----------
        n_workers: int
            number of containers to add (default: 1)
        cpus: int
            number of cpus (default: 1)
        memory: int
            amount of memory to allocate per container

        Returns
        -------
        None
        """
        self.knit.add_containers(num_containers=n_workers,
                                 virtual_cores=cpus,
                                 memory=memory)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """Stop the scheduler and workers"""
        self.stop()
        self.local_cluster.close()
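# A usage sketch for DaskYARNCluster above. The package list and resource
# numbers are illustrative; everything else uses only members defined on the
# class (start, scheduler_address, add_workers, the context-manager protocol).
from distributed import Client

with DaskYARNCluster(packages=['numpy', 'pandas']) as cluster:
    cluster.start(n_workers=4, cpus=2, memory=4096)   # returns the YARN application id
    client = Client(cluster.scheduler_address)        # connect to the local scheduler
    print(client.submit(lambda x: x + 1, 10).result())
    cluster.add_workers(n_workers=2)                  # ask YARN for more containers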
def test_connection_error():
    k = Knit(rm_port=8089)
    with pytest.raises(YARNException) as e:
        k.start('ls')
    assert 'proxy' in str(e)