def dask_client():
    cluster = LocalCluster(n_workers=NUM_WORKERS, threads_per_worker=2)
    client = Client(cluster)
    yield client
    # teardown
    client.close()
    cluster.close()
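A minimal sketch of how the generator above could be consumed, assuming it is registered as a pytest fixture; the @pytest.fixture decorator, the NUM_WORKERS constant, and test_square below are illustrative assumptions, not from the original source.

import pytest
from distributed import Client, LocalCluster

NUM_WORKERS = 2  # assumed value, chosen only for illustration


@pytest.fixture
def dask_client():
    cluster = LocalCluster(n_workers=NUM_WORKERS, threads_per_worker=2)
    client = Client(cluster)
    yield client
    # teardown runs after the test that requested the fixture finishes
    client.close()
    cluster.close()


def test_square(dask_client):
    # the fixture hands the test a ready-to-use Client
    futures = dask_client.map(lambda x: x * x, range(4))
    assert dask_client.gather(futures) == [0, 1, 4, 9]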
class TestDaskExecutor(TestBaseDask):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    @pytest.mark.quarantined
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dag = self.dagbag.get_dag('example_bash_operator')

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True,
                          executor=DaskExecutor(
                              cluster_address=self.cluster.scheduler_address))
        job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
def test_dask_read_combine_instastack(self):
    from distributed import Client, LocalCluster
    from dask.distributed import wait
    cluster = LocalCluster(n_workers=1, threads_per_worker=1)
    c = Client(cluster)

    anxcor = Anxcor()
    anxcor.set_window_length(120.0)
    times = anxcor.get_starttimes(starttime_stamp, endtime_stamp, 0.5)
    bank = WavebankWrapper(source_dir)
    anxcor.add_dataset(bank, 'nodals')
    anxcor.save_at_task(target_dir, 'combine')
    result = anxcor.process(times, dask_client=c, stack=True)

    anxcor = Anxcor()
    anxcor.set_window_length(120.0)
    bank = WavebankWrapper(source_dir)
    anxcor.add_dataset(bank, 'nodals')
    anxcor.load_at_task(target_dir, 'combine')
    result = anxcor.process(times, dask_client=c, stack=True)

    how_many_nc = _how_many_fmt(target_dir, format='.nc')
    _clean_files_in_dir(target_dir)
    c.close()
    cluster.close()
    assert 48 == how_many_nc
class DaskExecutorTest(BaseDaskTest):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
def test_get_batch_size_distributed():
    cluster = LocalCluster(processes=False)
    with DistributedHandler(cluster.scheduler_address) as handler:
        assert handler._get_batch_size(handler.client) == DEFAULT_MAX_THREADS
    cluster.close()
def test_backfill_integration(self):
    """
    Test that DaskExecutor can be used to backfill example dags
    """
    cluster = LocalCluster()

    dags = [
        dag for dag in self.dagbag.dags.values()
        if dag.dag_id in [
            'example_bash_operator',
            # 'example_python_operator',
        ]
    ]

    for dag in dags:
        dag.clear(
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

    for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True,
            executor=DaskExecutor(
                cluster_address=cluster.scheduler_address))
        job.run()

    cluster.close()
def test_backfill_integration(self):
    """
    Test that DaskExecutor can be used to backfill example dags
    """
    cluster = LocalCluster()

    dags = [
        dag for dag in self.dagbag.dags.values()
        if dag.dag_id in [
            'example_bash_operator',
            # 'example_python_operator',
        ]
    ]

    for dag in dags:
        dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

    for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True,
                          executor=DaskExecutor(
                              cluster_address=cluster.scheduler_address))
        job.run()

    cluster.close()
class StartCluster():
    def __init__(self, n_cores=None):
        if n_cores is None:
            n_cores = psutil.cpu_count() - 2
        # note: n_cores is computed but not currently forwarded to LocalCluster
        self.cluster = LocalCluster(processes=True, n_workers=1)
        self.client = Client(self.cluster)

    def __enter__(self):
        # required so the class can actually be used in a with-statement
        return self

    def __exit__(self, type, value, traceback):
        self.client.close()
        self.cluster.close()
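A brief usage sketch for the context manager above. It relies on the __enter__ method added in the rewrite, and the square function is a hypothetical workload used only for illustration.

def square(x):
    # trivial stand-in for real work
    return x * x

with StartCluster() as sc:
    future = sc.client.submit(square, 3)
    assert future.result() == 9
# client and cluster are closed automatically when the with-block exits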
def test_submit_task_instance_to_dask_cluster(self):
    """
    Test that the DaskExecutor properly submits tasks to the cluster
    """
    cluster = LocalCluster(nanny=False)

    executor = DaskExecutor(cluster_address=cluster.scheduler_address)
    args = dict(start_date=DEFAULT_DATE)

    def fail():
        raise ValueError('Intentional failure.')

    with DAG('test-dag', default_args=args) as dag:
        # queue should be allowed, but ignored
        success_operator = PythonOperator(task_id='success',
                                          python_callable=lambda: True,
                                          queue='queue')
        fail_operator = PythonOperator(task_id='fail', python_callable=fail)

    success_ti = TaskInstance(success_operator, execution_date=DEFAULT_DATE)
    fail_ti = TaskInstance(fail_operator, execution_date=DEFAULT_DATE)

    # queue the tasks
    executor.queue_task_instance(success_ti)
    executor.queue_task_instance(fail_ti)

    # the tasks haven't been submitted to the cluster yet
    self.assertTrue(len(executor.futures) == 0)

    # after the heartbeat, they have been submitted
    executor.heartbeat()
    self.assertTrue(len(executor.futures) == 2)

    # wait a reasonable amount of time for the tasks to complete
    for _ in range(2):
        time.sleep(0.25)
        executor.heartbeat()

    # check that the futures were completed
    if len(executor.futures) == 2:
        raise ValueError('Failed to reach cluster before timeout.')
    self.assertTrue(len(executor.futures) == 0)

    # check that the taskinstances were updated
    success_ti.refresh_from_db()
    self.assertTrue(success_ti.state == State.SUCCESS)
    fail_ti.refresh_from_db()
    self.assertTrue(fail_ti.state == State.FAILED)

    cluster.close()
class DRMAACluster(object):
    def __init__(self, **kwargs):
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        self.session = drmaa.Session()
        self.session.initialize()

        self.worker_template = self.session.createJobTemplate()
        self.worker_template.remoteCommand = os.path.join(
            sys.exec_prefix, 'bin', 'dask-worker')
        self.worker_template.jobName = 'dask-worker'
        self.worker_template.args = [
            '%s:%d' % (socket.gethostname(), self.local_cluster.scheduler.port)
        ]
        self.worker_template.outputPath = ':/%s/out' % os.getcwd()
        self.worker_template.errorPath = ':/%s/err' % os.getcwd()
        self.worker_template.workingDirectory = os.getcwd()

        self.workers = []

    @property
    def scheduler_address(self):
        return self.local_cluster.scheduler_address

    def start_workers(self, n=1):
        ids = self.session.runBulkJobs(self.worker_template, 1, n, 1)
        self.workers.extend(ids)

    def stop_workers(self, worker_ids, sync=False):
        for wid in worker_ids:
            try:
                self.session.control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass

        if sync:
            self.session.synchronize(worker_ids, dispose=True)

    def close(self):
        if self.workers:
            self.stop_workers(self.workers, sync=True)
        self.local_cluster.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        try:
            self.close()
        except:
            pass
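A hedged usage sketch for the minimal DRMAA cluster above: start the local scheduler, launch workers through DRMAA, connect a client, and clean up. It assumes a working DRMAA session on the submitting host; the lambda workload is illustrative only.

from distributed import Client

# only runs on a host where a DRMAA-compatible scheduler (e.g. SGE or SLURM via drmaa) is configured
with DRMAACluster() as cluster:
    cluster.start_workers(2)                    # submit two dask-worker jobs
    client = Client(cluster.scheduler_address)  # connect to the local scheduler
    result = client.submit(lambda x: x + 1, 10).result()
    assert result == 11
    client.close()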
def test_client(self):
    lc = LocalCluster(diagnostics_port=None)
    passed = Client(lc)
    client, shutdown_callback = _prepare_client(passed)
    self.assertEquals(client, passed)

    shutdown_callback()
    lc.close()
    self.assertEquals(lc.status, 'closed')
def test_dask_cluster_client(self):
    port = 8788
    cluster = LocalCluster(processes=False, scheduler_port=port)

    client = configure_dask_cluster(address=f"localhost:{port}")
    self.assertEqual(None, client.cluster)
    self.assertEqual("running", client.status)

    close_dask_client()
    self.assertEqual("closed", client.status)
    self.assertEqual("running", cluster.status)

    cluster.close(timeout=10)
    self.assertEqual("closed", cluster.status)
def test_dask_execution(self):
    from distributed import Client, LocalCluster
    cluster = LocalCluster(n_workers=1, threads_per_worker=1)
    c = Client(cluster)

    anxcor = Anxcor()
    anxcor.set_window_length(120.0)
    times = anxcor.get_starttimes(starttime_stamp, endtime_stamp, 0.5)
    bank = WavebankWrapper(source_dir)
    anxcor.add_dataset(bank, 'nodals')
    result = anxcor.process(times, dask_client=c)

    pairs = list(result.coords['rec'].values) + list(result.coords['src'].values)
    c.close()
    cluster.close()
    assert 6 == len(pairs)
def test_dask_execution_exclude_with_stack_number(self):
    from distributed import Client, LocalCluster
    cluster = LocalCluster(n_workers=1, threads_per_worker=1)
    c = Client(cluster)

    anxcor = Anxcor()
    anxcor.set_window_length(120)
    anxcor.set_task_kwargs('crosscorrelate', dict(max_tau_shift=20.0))
    times = anxcor.get_starttimes(starttime_stamp, endtime_stamp, 0.5)
    bank = WavebankWrapper(source_dir)
    anxcor.set_must_exclude_single_stations('AX.1')
    anxcor.add_dataset(bank, 'nodals')
    result = anxcor.process(times, dask_client=c, stack=10)

    pairs = list(result.coords['rec'].values) + list(result.coords['src'].values)
    c.close()
    cluster.close()
    assert 4 == len(pairs)
def test_twpice_case():
    """ Use a test case from TWP-ICE """
    Grid0 = pyart.io.read_grid(pydda.tests.EXAMPLE_RADAR0)
    Grid1 = pyart.io.read_grid(pydda.tests.EXAMPLE_RADAR1)
    sounding = pyart.io.read_arm_sonde(pydda.tests.SOUNDING_PATH)

    u_init, v_init, w_init = pydda.initialization.make_wind_field_from_profile(
        Grid0, sounding[1], vel_field='corrected_velocity')

    Grids = pydda.retrieval.get_dd_wind_field(
        [Grid0, Grid1], u_init, v_init, w_init,
        Co=100, Cm=1500.0, Cz=0, Cmod=0.0,
        vel_name='corrected_velocity', refl_field='reflectivity',
        frz=5000.0, filt_iterations=0, mask_outside_opt=True, upper_bc=1)

    # In this test grid, we expect the mean flow to be to the southeast
    # Maximum updrafts should be at least 10 m/s
    u_mean = np.nanmean(Grids[0].fields['u']['data'])
    v_mean = np.nanmean(Grids[0].fields['v']['data'])
    # the updraft check needs the vertical wind component 'w', not 'v'
    w_max = np.max(Grids[0].fields['w']['data'])

    assert u_mean > 0
    assert v_mean < 0
    assert w_max > 10

    # Now we will test the nesting. Do the same retrieval, and make sure
    # that we get the same result within a prescribed tolerance
    cluster = LocalCluster(n_workers=2, processes=True)
    client = Client(cluster)
    Grids2 = pydda.retrieval.get_dd_wind_field_nested(
        [Grid0, Grid1], u_init, v_init, w_init, client,
        Co=100, Cm=1500.0, Cz=0, Cmod=0.0,
        vel_name='corrected_velocity', refl_field='reflectivity',
        frz=5000.0, filt_iterations=0, mask_outside_opt=True, upper_bc=1)

    # Make sure features are correlated between both versions. No reason
    # to expect the same answer, but features should be correlated.
    # Nesting tends to make the updrafts a bit better resolved, so expect
    # less of an outright correlation (but still strong)
    assert np.corrcoef(Grids2[0].fields["u"]["data"].flatten(),
                       Grids[0].fields["u"]["data"].flatten())[0, 1] > 0.9
    assert np.corrcoef(Grids2[0].fields["v"]["data"].flatten(),
                       Grids[0].fields["v"]["data"].flatten())[0, 1] > 0.9
    assert np.corrcoef(Grids2[0].fields["w"]["data"].flatten(),
                       Grids[0].fields["w"]["data"].flatten())[0, 1] > 0.5

    cluster.close()
    client.close()
def test_with_distributed_client(self):
    lc = LocalCluster(diagnostics_port=None)
    client = Client(lc)

    graph = create_graph(net1_ex_matrix,
                         net1_gene_names,
                         net1_tf_names,
                         "GBM",
                         SGBM_KWARGS,
                         target_genes=list(self.test_range),
                         client=client)

    network_df = client.compute(graph, sync=True)

    self.assertEquals(len(self.test_range), len(network_df['target'].unique()))

    client.close()
    lc.close()
class DaskExecutorTest(BaseDaskTest):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
def test_dask_cluster_extraction_one_worker(self):
    cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None)
    client = Client(cluster)
    address = client.scheduler_info()['address']
    Distributor = ClusterDaskDistributor(address=address)

    df = self.create_test_data_sample()
    extracted_features = extract_features(df,
                                          column_id="id",
                                          column_sort="sort",
                                          column_kind="kind",
                                          column_value="val",
                                          distributor=Distributor)

    self.assertIsInstance(extracted_features, pd.DataFrame)
    self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77])))
    self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017])))
    self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167])))
    self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695])))
    self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1])))
    self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483])))
    self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75])))
    self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0])))

    cluster.close()
def test_distributed_handler_distributed(values, expected_values):
    cluster = LocalCluster(processes=False)

    with DistributedHandler(cluster.scheduler_address) as handler:
        futures = handler.client.map(lambda x: x + 1, values)
        handler_map_results = handler.gather(futures)

    with DistributedHandler(cluster.scheduler_address) as handler:
        handler_batched_results = handler.batched_map(lambda x: x + 1, values)

    client = Client(cluster)
    futures = client.map(lambda x: x + 1, values)
    distributed_results = client.gather(futures)

    handler_map_results = set(handler_map_results)
    handler_batched_results = set(handler_batched_results)
    distributed_results = set(distributed_results)

    assert (handler_map_results == handler_batched_results
            and handler_map_results == distributed_results)

    cluster.close()
def test_dask_executor_functions(self):
    cluster = LocalCluster(nanny=False)

    executor = DaskExecutor(cluster_address=cluster.scheduler_address)

    success_command = 'echo 1'
    fail_command = 'exit 1'

    executor.execute_async(key='success', command=success_command)
    executor.execute_async(key='fail', command=fail_command)

    success_future = next(k for k, v in executor.futures.items() if v == 'success')
    fail_future = next(k for k, v in executor.futures.items() if v == 'fail')

    # wait for the futures to execute, with a timeout
    timeout = datetime.datetime.now() + datetime.timedelta(seconds=0.5)
    while not (success_future.done() and fail_future.done()):
        if datetime.datetime.now() > timeout:
            raise ValueError(
                'The futures should have finished; there is probably '
                'an error communicating with the Dask cluster.')

    # both tasks should have finished
    self.assertTrue(success_future.done())
    self.assertTrue(fail_future.done())

    # check task exceptions
    self.assertTrue(success_future.exception() is None)
    self.assertTrue(fail_future.exception() is not None)

    # tell the executor to shut down
    executor.end()
    self.assertTrue(len(executor.futures) == 0)

    cluster.close()
def test_dask_executor_functions(self):
    cluster = LocalCluster()

    executor = DaskExecutor(cluster_address=cluster.scheduler_address)

    # start the executor
    executor.start()

    success_command = 'echo 1'
    fail_command = 'exit 1'

    executor.execute_async(key='success', command=success_command)
    executor.execute_async(key='fail', command=fail_command)

    success_future = next(
        k for k, v in executor.futures.items() if v == 'success')
    fail_future = next(
        k for k, v in executor.futures.items() if v == 'fail')

    # wait for the futures to execute, with a timeout
    timeout = datetime.datetime.now() + datetime.timedelta(seconds=30)
    while not (success_future.done() and fail_future.done()):
        if datetime.datetime.now() > timeout:
            raise ValueError(
                'The futures should have finished; there is probably '
                'an error communicating with the Dask cluster.')

    # both tasks should have finished
    self.assertTrue(success_future.done())
    self.assertTrue(fail_future.done())

    # check task exceptions
    self.assertTrue(success_future.exception() is None)
    self.assertTrue(fail_future.exception() is not None)

    cluster.close()
import os
import pandas as pd
import argparse
from distributed import Client, LocalCluster

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_line', nargs=1, type=str, help='cell line to run on')
    parser.add_argument('--name', nargs=1, type=str, help='name of dataset')
    args = parser.parse_args()

    cl = args.cell_line[0]
    name = args.name[0]

    from arboreto.algo import grnboost2, genie3
    from arboreto.utils import load_tf_names

    ex_matrix = pd.read_csv('~/data/spate116/GCN/%s/%s_expression_matrix_imputed.tsv' % (cl, name),
                            sep='\t').transpose()

    cluster = LocalCluster()
    client = Client(cluster)
    print('here')

    network = grnboost2(expression_data=ex_matrix.to_numpy(),
                        gene_names=ex_matrix.columns,
                        client_or_address=client)

    network.to_csv('~/data/spate116/GCN/%s/%s_GRN.tsv' % (cl, name),
                   sep='\t', header=True, index=False)

    client.close()
    cluster.close()
class DaskExecutorTest(unittest.TestCase):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)

        # start the executor
        executor.start()

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(k for k, v in executor.futures.items() if v == 'success')
        fail_future = next(k for k, v in executor.futures.items() if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = timezone.utcnow() + timedelta(seconds=30)
        while not (success_future.done() and fail_future.done()):
            if timezone.utcnow() > timeout:
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
class JobQueueCluster(Cluster): """ Base class to launch Dask Clusters for Job queues This class should not be used directly, use inherited class appropriate for your queueing system (e.g. PBScluster or SLURMCluster) Parameters ---------- name : str Name of Dask workers. cores : int Total number of cores per job memory: str Total amount of memory per job processes : int Number of processes per job interface : str Network interface like 'eth0' or 'ib0'. death_timeout : float Seconds to wait for a scheduler before closing workers local_directory : str Dask worker local directory for file spilling. extra : str Additional arguments to pass to `dask-worker` env_extra : list Other commands to add to script before launching worker. kwargs : dict Additional keyword arguments to pass to `LocalCluster` Attributes ---------- submit_command: str Abstract attribute for job scheduler submit command, should be overriden cancel_command: str Abstract attribute for job scheduler cancel command, should be overriden See Also -------- PBSCluster SLURMCluster """ _script_template = """ #!/bin/bash %(job_header)s %(env_header)s %(worker_command)s """.lstrip() # Following class attributes should be overriden by extending classes. submit_command = None cancel_command = None scheduler_name = '' _adaptive_options = {'worker_key': lambda ws: _job_id_from_worker_name(ws.name)} job_id_regexp = r'(?P<job_id>\d+)' def __init__(self, name=None, cores=None, memory=None, processes=None, interface=None, death_timeout=None, local_directory=None, extra=None, env_extra=None, walltime=None, threads=None, **kwargs ): """ """ # """ # This initializer should be considered as Abstract, and never used directly. # """ if threads is not None: raise ValueError(threads_deprecation_message) if not self.scheduler_name: raise NotImplementedError('JobQueueCluster is an abstract class that should not be instanciated.') if name is None: name = dask.config.get('jobqueue.%s.name' % self.scheduler_name) if cores is None: cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name) if memory is None: memory = dask.config.get('jobqueue.%s.memory' % self.scheduler_name) if processes is None: processes = dask.config.get('jobqueue.%s.processes' % self.scheduler_name) if interface is None: interface = dask.config.get('jobqueue.%s.interface' % self.scheduler_name) if death_timeout is None: death_timeout = dask.config.get('jobqueue.%s.death-timeout' % self.scheduler_name) if local_directory is None: local_directory = dask.config.get('jobqueue.%s.local-directory' % self.scheduler_name) if extra is None: extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name) if env_extra is None: env_extra = dask.config.get('jobqueue.%s.env-extra' % self.scheduler_name) if dask.config.get('jobqueue.%s.threads', None): warnings.warn(threads_deprecation_message) if cores is None: raise ValueError("You must specify how many cores to use per job like ``cores=8``") if memory is None: raise ValueError("You must specify how much memory to use per job like ``memory='24 GB'``") # This attribute should be overriden self.job_header = None if interface: extra += ' --interface %s ' % interface kwargs.setdefault('ip', get_ip_interface(interface)) else: kwargs.setdefault('ip', '') # Bokeh diagnostics server should listen on all interfaces diagnostics_ip_and_port = ('', 8787) self.local_cluster = LocalCluster(n_workers=0, diagnostics_port=diagnostics_ip_and_port, **kwargs) # Keep information on process, threads and memory, for use in # subclasses self.worker_memory = 
parse_bytes(memory) if memory is not None else None self.worker_processes = processes self.worker_cores = cores self.name = name # plugin for tracking job status self._scheduler_plugin = JobQueuePlugin() self.local_cluster.scheduler.add_plugin(self._scheduler_plugin) self._adaptive = None self._env_header = '\n'.join(env_extra) # dask-worker command line build dask_worker_command = '%(python)s -m distributed.cli.dask_worker' % dict(python=sys.executable) self._command_template = ' '.join([dask_worker_command, self.scheduler.address]) self._command_template += " --nthreads %d" % self.worker_threads if processes is not None and processes > 1: self._command_template += " --nprocs %d" % processes mem = format_bytes(self.worker_memory / self.worker_processes) mem = mem.replace(' ', '') self._command_template += " --memory-limit %s" % mem self._command_template += " --name %s--${JOB_ID}--" % name if death_timeout is not None: self._command_template += " --death-timeout %s" % death_timeout if local_directory is not None: self._command_template += " --local-directory %s" % local_directory if extra is not None: self._command_template += extra def __repr__(self): running_workers = sum(len(value) for value in self.running_jobs.values()) running_cores = running_workers * self.worker_threads total_jobs = len(self.pending_jobs) + len(self.running_jobs) total_workers = total_jobs * self.worker_processes running_memory = running_workers * self.worker_memory / self.worker_processes return (self.__class__.__name__ + '(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)' % (running_cores, format_bytes(running_memory), running_workers, total_workers, len(self.running_jobs), total_jobs) ) @property def pending_jobs(self): """ Jobs pending in the queue """ return self._scheduler_plugin.pending_jobs @property def running_jobs(self): """ Jobs with currenly active workers """ return self._scheduler_plugin.running_jobs @property def finished_jobs(self): """ Jobs that have finished """ return self._scheduler_plugin.finished_jobs @property def worker_threads(self): return int(self.worker_cores / self.worker_processes) def job_script(self): """ Construct a job submission script """ pieces = {'job_header': self.job_header, 'env_header': self._env_header, 'worker_command': self._command_template} return self._script_template % pieces @contextmanager def job_file(self): """ Write job submission script to temporary file """ with tmpfile(extension='sh') as fn: with open(fn, 'w') as f: logger.debug("writing job script: \n%s" % self.job_script()) f.write(self.job_script()) yield fn def _submit_job(self, script_filename): return self._call(shlex.split(self.submit_command) + [script_filename]) def start_workers(self, n=1): """ Start workers and point them to our local scheduler """ logger.debug('starting %s workers' % n) num_jobs = math.ceil(n / self.worker_processes) for _ in range(num_jobs): with self.job_file() as fn: out = self._submit_job(fn) job = self._job_id_from_submit_output(out.decode()) logger.debug("started job: %s" % job) self.pending_jobs[job] = {} @property def scheduler(self): """ The scheduler of this cluster """ return self.local_cluster.scheduler def _calls(self, cmds, **kwargs): """ Call a command using subprocess.communicate This centralizes calls out to the command line, providing consistent outputs, logging, and an opportunity to go asynchronous in the future Parameters ---------- cmd: List(List(str)) A list of commands, each of which is a list of strings to hand to subprocess.communicate Examples -------- 
>>> self._calls([['ls'], ['ls', '/foo']]) Returns ------- The stdout result as a string Also logs any stderr information """ logger.debug("Submitting the following calls to command line") procs = [] for cmd in cmds: logger.debug(' '.join(cmd)) procs.append(subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)) result = [] for proc in procs: out, err = proc.communicate() if err: logger.error(err.decode()) result.append(out) return result def _call(self, cmd, **kwargs): """ Singular version of _calls """ return self._calls([cmd], **kwargs)[0] def stop_workers(self, workers): """ Stop a list of workers""" logger.debug("Stopping workers: %s" % workers) if not workers: return jobs = self._del_pending_jobs() # stop pending jobs too for w in workers: if isinstance(w, dict): jobs.append(_job_id_from_worker_name(w['name'])) else: jobs.append(_job_id_from_worker_name(w.name)) self.stop_jobs(set(jobs)) def stop_jobs(self, jobs): """ Stop a list of jobs""" logger.debug("Stopping jobs: %s" % jobs) if jobs: jobs = list(jobs) self._call([self.cancel_command] + list(set(jobs))) def scale_up(self, n, **kwargs): """ Brings total worker count up to ``n`` """ logger.debug("Scaling up to %d workers." % n) active_and_pending = sum([len(j) for j in self.running_jobs.values()]) active_and_pending += self.worker_processes * len(self.pending_jobs) logger.debug("Found %d active/pending workers." % active_and_pending) self.start_workers(n - active_and_pending) def scale_down(self, workers): ''' Close the workers with the given addresses ''' logger.debug("Scaling down. Workers: %s" % workers) worker_states = [] for w in workers: try: # Get the actual WorkerState worker_states.append(self.scheduler.workers[w]) except KeyError: logger.debug('worker %s is already gone' % w) self.stop_workers(worker_states) def stop_all_jobs(self): ''' Stops all running and pending jobs ''' jobs = self._del_pending_jobs() jobs += list(self.running_jobs.keys()) self.stop_jobs(set(jobs)) def close(self): ''' Stops all running and pending jobs and stops scheduler ''' self.stop_all_jobs() self.local_cluster.close() def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() self.local_cluster.__exit__(type, value, traceback) def _del_pending_jobs(self): jobs = list(self.pending_jobs.keys()) logger.debug("Deleting pending jobs %s" % jobs) for job_id in jobs: del self.pending_jobs[job_id] return jobs def _job_id_from_submit_output(self, out): match = re.search(self.job_id_regexp, out) if match is None: msg = ('Could not parse job id from submission command ' "output.\nJob id regexp is {!r}\nSubmission command " 'output is:\n{}'.format(self.job_id_regexp, out)) raise ValueError(msg) job_id = match.groupdict().get('job_id') if job_id is None: msg = ("You need to use a 'job_id' named group in your regexp, e.g. " "r'(?P<job_id>\d+)', in your regexp. Your regexp was: " "{!r}".format(self.job_id_regexp)) raise ValueError(msg) return job_id
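The JobQueueCluster class above never submits anything itself; subclasses provide the submit/cancel commands and a scheduler-specific job_header, and job_script() simply fills _script_template with those pieces plus the dask-worker command line built in __init__. A minimal sketch of that composition is below; the PBS-style header, the module-load line, and the scheduler address are hypothetical values shown via plain string formatting rather than by instantiating the abstract class (which would also require the jobqueue dask.config entries).

# how job_script() assembles a submission script from its three pieces
_script_template = """
#!/bin/bash

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

pieces = {
    'job_header': '#PBS -l select=1:ncpus=8:mem=24GB',   # hypothetical scheduler header
    'env_header': 'module load python',                   # would come from env_extra
    'worker_command': ('/usr/bin/python -m distributed.cli.dask_worker '
                       'tcp://10.0.0.1:8786 --nthreads 8 --memory-limit 24GB '
                       '--name dask-worker--${JOB_ID}--'),  # hypothetical scheduler address
}

print(_script_template % pieces)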
class DRMAACluster(Cluster): def __init__(self, template=None, cleanup_interval=1000, hostname=None, script=None, preexec_commands=(), copy_script=True, ip='', **kwargs): """ Dask workers launched by a DRMAA-compatible cluster Parameters ---------- template: dict Dictionary specifying options to pass to the DRMAA cluster and the worker. Relevant items are: jobName: string Name of the job as known by the DRMAA cluster. args: list Extra string arguments to pass to dask-worker outputPath: string Path to the dask-worker stdout. Must start with ':'. Defaults to worker.JOBID.TASKID.out in current directory. errorPath: string Path to the dask-worker stderr. Must start with ':' Defaults to worker.JOBID.TASKID.err in current directory. workingDirectory: string Where dask-worker runs, defaults to current directory nativeSpecification: string Options native to the job scheduler cleanup_interval: int Time interval in seconds at which closed workers are cleaned. Defaults to 1000 hostname: string Host on which to start the local scheduler, defaults to localhost script: string (optional) Path to the dask-worker executable script. A temporary file will be made if none is provided (recommended) preexec_commands: tuple (optional) Commands to be executed first by temporary script. Cannot be specified at the same time as script. copy_script: bool Whether should copy the passed script to the current working directory. This is primarily to work around an issue with SGE. ip: string IP of the scheduler, default is the empty string which will listen on the primary ip address of the host **kwargs: Additional keyword arguments to be passed to the local scheduler Examples -------- >>> from dask_drmaa import DRMAACluster # doctest: +SKIP >>> cluster = DRMAACluster() # doctest: +SKIP >>> cluster.start_workers(10) # doctest: +SKIP >>> from distributed import Client # doctest: +SKIP >>> client = Client(cluster) # doctest: +SKIP >>> future = client.submit(lambda x: x + 1, 10) # doctest: +SKIP >>> future.result() # doctest: +SKIP 11 """ self.hostname = hostname or socket.gethostname() logger.info("Start local scheduler at %s", self.hostname) self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs) if script is None: fn = os.path.abspath(tempfile.mktemp( suffix='.sh', prefix='dask-worker-script-', dir=os.path.curdir, )) self.script = fn self._should_cleanup_script = True script_contents = make_job_script(executable=worker_bin_path, name='%s.%s' % (JOB_ID, TASK_ID), preexec=preexec_commands) with open(fn, 'wt') as f: f.write(script_contents) @atexit.register def remove_script(): if os.path.exists(fn): os.remove(fn) os.chmod(self.script, 0o777) else: self._should_cleanup_script = False if copy_script: with ignoring(EnvironmentError): # may be in the same path shutil.copy(script, os.path.curdir) # python 2.x returns None script = os.path.join(os.path.curdir, os.path.basename(script)) self._should_cleanup_script = True self.script = os.path.abspath(script) assert not preexec_commands, "Cannot specify both script and preexec_commands" # TODO: check that user-provided script is executable self.template = merge(default_template, {'remoteCommand': self.script}, template or {}) self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers, callback_time=cleanup_interval, io_loop=self.scheduler.loop) self._cleanup_callback.start() self.workers = {} # {job-id: WorkerSpec} def adapt(self, **kwargs): """ Turn on adaptivity For keyword arguments see dask_drmaa.adaptive.Adaptive Examples -------- >>> 
cluster.adapt(minimum=0, maximum=10, interval='500ms') See Also -------- Cluster: an interface for other clusters to inherit from """ from .adaptive import Adaptive with ignoring(AttributeError): self._adaptive.stop() if not hasattr(self, '_adaptive_options'): self._adaptive_options = {} self._adaptive_options.update(kwargs) self._adaptive = Adaptive( self, self.scheduler, **self._adaptive_options ) return self._adaptive @gen.coroutine def _start(self): pass @property def scheduler(self): return self.local_cluster.scheduler def create_job_template(self, **kwargs): template = self.template.copy() if kwargs: template.update(kwargs) template['args'] = [self.scheduler_address] + template['args'] jt = get_session().createJobTemplate() valid_attributes = dir(jt) for key, value in template.items(): if key not in valid_attributes: raise ValueError("Invalid job template attribute %s" % key) setattr(jt, key, value) return jt def start_workers(self, n=1, **kwargs): if n == 0: return with log_errors(): with self.create_job_template(**kwargs) as jt: ids = get_session().runBulkJobs(jt, 1, n, 1) logger.info("Start %d workers. Job ID: %s", len(ids), ids[0].split('.')[0]) self.workers.update( {jid: WorkerSpec(job_id=jid, kwargs=kwargs, stdout=worker_out_path_template % dict(jid=jid, ext='out'), stderr=worker_out_path_template % dict(jid=jid, ext='err'), ) for jid in ids}) @gen.coroutine def stop_workers(self, worker_ids, sync=False): if isinstance(worker_ids, str): worker_ids = [worker_ids] elif worker_ids: worker_ids = list(worker_ids) else: return # Let the scheduler gracefully retire workers first ids_to_ips = { v['name']: k for k, v in self.scheduler.worker_info.items() } worker_ips = [ids_to_ips[wid] for wid in worker_ids if wid in ids_to_ips] retired = yield self.scheduler.retire_workers(workers=worker_ips, close_workers=True) logger.info("Retired workers %s", retired) for wid in list(worker_ids): try: get_session().control(wid, drmaa.JobControlAction.TERMINATE) except drmaa.errors.InvalidJobException: pass try: self.workers.pop(wid) except KeyError: # If we have multiple callers at once, it may have already # been popped off pass logger.info("Stop workers %s", worker_ids) if sync: get_session().synchronize(worker_ids, dispose=True) @gen.coroutine def scale_up(self, n, **kwargs): yield [self.start_workers(**kwargs) for _ in range(n - len(self.workers))] @gen.coroutine def scale_down(self, workers): workers = set(workers) yield self.scheduler.retire_workers(workers=workers) def close(self): logger.info("Closing DRMAA cluster") self.stop_workers(self.workers, sync=True) self.local_cluster.close() if self._should_cleanup_script and os.path.exists(self.script): os.remove(self.script) def __enter__(self): return self def __exit__(self, *args): self.close() def cleanup_closed_workers(self): for jid in list(self.workers): if get_session().jobStatus(jid) in ('closed', 'done'): logger.info("Removing closed worker %s", jid) del self.workers[jid] def __del__(self): try: self.close() except: pass def __str__(self): return "<%s: %d workers>" % (self.__class__.__name__, len(self.workers)) __repr__ = __str__
# load TF list from file
tf_names = load_tf_names(net1_tf_path)

# quick inspection
tf_names[:5]
len(tf_names)

# Set up the local computational environment
# Observation: fewer assertion errors when the cluster is less busy
from distributed import LocalCluster, Client
local_cluster = LocalCluster(n_workers=6, threads_per_worker=1)
custom_client = Client(local_cluster)
custom_client

# Start job
network = grnboost2(expression_data=ex_matrix,
                    tf_names=tf_names,
                    client_or_address=custom_client)

# QC job
network.head()
len(network)

# Save output
wd_output = '/home/pezoldt/NAS2/pezoldt/Analysis/scRNAseq/scenic/' + sample_ID + '/' + cell_type + '/int/GRNBoost_linklist.tsv'
network.to_csv(wd_output, sep='\t', header=False, index=False)

# close client
custom_client.close()
local_cluster.close()
class DRMAACluster(object): def __init__( self, jobName='dask-worker', remoteCommand=os.path.join(sys.exec_prefix, 'bin', 'dask-worker'), args=(), outputPath=':%s/out' % os.getcwd(), errorPath=':%s/err' % os.getcwd(), workingDirectory=os.getcwd(), nativeSpecification='', max_runtime='1:00:00', #1 hour **kwargs): """ Dask workers launched by a DRMAA-compatible cluster Parameters ---------- jobName: string Name of the job as known by the DRMAA cluster. remoteCommand: string Path to the dask-worker executable args: list Extra string arguments to pass to dask-worker outputPath: string errorPath: string workingDirectory: string Where dask-worker runs, defaults to current directory nativeSpecification: string Options native to the job scheduler max_runtime: string Maximum runtime of worker jobs in format ``"HH:MM:SS"`` Examples -------- >>> from dask_drmaa import DRMAACluster # doctest: +SKIP >>> cluster = DRMAACluster() # doctest: +SKIP >>> cluster.start_workers(10) # doctest: +SKIP >>> from distributed import Client # doctest: +SKIP >>> client = Client(cluster) # doctest: +SKIP >>> future = client.submit(lambda x: x + 1, 10) # doctest: +SKIP >>> future.result() # doctest: +SKIP 11 """ logger.info("Start local scheduler") self.local_cluster = LocalCluster(n_workers=0, **kwargs) logger.info("Initialize connection to job scheduler") self.jobName = jobName self.remoteCommand = remoteCommand self.args = [ '%s:%d' % (socket.gethostname(), self.local_cluster.scheduler.port) ] + list(args) self.outputPath = outputPath self.errorPath = errorPath self.nativeSpecification = nativeSpecification self.max_runtime = max_runtime self._cleanup_callback = PeriodicCallback( callback=self.cleanup_closed_workers, callback_time=1000, io_loop=self.scheduler.loop) # self._cleanup_callback.start() self.workers = {} # {job-id: {'resource': quanitty}} @property def scheduler(self): return self.local_cluster.scheduler @property def scheduler_address(self): return self.scheduler.address def createJobTemplate(self, nativeSpecification=''): wt = get_session().createJobTemplate() wt.jobName = self.jobName wt.remoteCommand = self.remoteCommand wt.args = self.args wt.outputPath = self.outputPath wt.errorPath = self.errorPath wt.nativeSpecification = self.nativeSpecification + ' ' + nativeSpecification return wt def start_workers(self, n=1, **kwargs): with log_errors(): wt = self.createJobTemplate(**kwargs) ids = get_session().runBulkJobs(wt, 1, n, 1) logger.info("Start %d workers. Job ID: %s", len(ids), ids[0].split('.')[0]) self.workers.update({jid: kwargs for jid in ids}) def stop_workers(self, worker_ids, sync=False): worker_ids = list(worker_ids) for wid in worker_ids: try: get_session().control(wid, drmaa.JobControlAction.TERMINATE) except drmaa.errors.InvalidJobException: pass self.workers.pop(wid) logger.info("Stop workers %s", worker_ids) if sync: get_session().synchronize(worker_ids, dispose=True) def close(self): logger.info("Closing DRMAA cluster") self.local_cluster.close() if self.workers: self.stop_workers(self.workers, sync=True) def __enter__(self): return self def __exit__(self, *args): self.close() def cleanup_closed_workers(self): for jid in list(self.workers): if get_session().jobStatus(jid) == 'closed': logger.info("Removing closed worker %s", jid) del self.workers[jid] def __del__(self): try: self.close() except: pass def __str__(self): return "<%s: %d workers>" % (self.__class__.__name__, len( self.workers)) __repr__ = __str__
class DaskYARNCluster(object): """ Implements a dask cluster with YARN containers running the worker processes. A dask scheduler is started locally upon instantiation, but you must call ``start()`` to initiate the building of containers by YARN. Parameters ---------- nn, nn_port, rm, rm_port, user, autodetect: see knit.Knit env: str or None If provided, the path of a zipped conda env to put in containers packages: list of str Packages to install in the env to provide to containers *if* env is None. Uses conda spec for pinning versions. dask and distributed will always be included. channels: list of str If building an environment, pass these extra channels to conda using ``-c`` (i.e., in addition but of superior priority to any system default channels). conda_pars: dict Things to pass to CondaCreator ip: IP-like string or None Address for the scheduler to listen on. If not given, uses the system IP. """ def __init__(self, autodetect=True, packages=None, ip=None, env=None, channels=None, conda_pars=None, **kwargs): ip = ip or socket.gethostbyname(socket.gethostname()) self.env = env self.application_master_container = None self.app_id = None self.channels = channels self.conda_pars = conda_pars try: self.local_cluster = LocalCluster(n_workers=0, ip=ip) except (OSError, IOError): self.local_cluster = LocalCluster(n_workers=0, scheduler_port=0, ip=ip) self.packages = list( sorted(unique((packages or []) + global_packages, key=first_word))) self.knit = Knit(autodetect=autodetect, **kwargs) atexit.register(self.stop) @property def scheduler_address(self): return self.local_cluster.scheduler_address def start(self, n_workers=1, cpus=1, memory=2048, checks=True, **kwargs): """ Initiate workers. If required, environment is first built and uploaded to HDFS, and then a YARN application with the required number of containers is created. Parameters ---------- n_workers: int How many containers to create cpus: int=1 How many CPU cores is available in each container memory: int=2048 Memory available to each dask worker (in MB) checks: bool=True Whether to run pre-flight checks before submitting app to YARN kwargs: passed to ``Knit.start()`` Returns ------- YARN application ID. """ c = CondaCreator(channels=self.channels, **(self.conda_pars or {})) if self.env is None: env_name = 'dask-' + sha1('-'.join( self.packages).encode()).hexdigest() env_path = os.path.join(c.conda_envs, env_name) if os.path.exists(env_path + '.zip'): # zipfile exists, ready to upload self.env = env_path + '.zip' elif os.path.exists(env_path): # environment exists, can zip and upload c.zip_env(env_path) self.env = env_path + '.zip' else: # create env from scratch self.env = c.create_env(env_name=env_name, packages=self.packages) elif not self.env.endswith('.zip'): # given env directory, so zip it c.zip_env(self.env) self.env = self.env + '.zip' # TODO: memory should not be total available? 
command = '$PYTHON_BIN $CONDA_PREFIX/bin/dask-worker --nprocs=1 ' \ '--nthreads=%d --memory-limit=%d %s > ' \ '/tmp/worker-log.out 2> /tmp/worker-log.err' % ( cpus, memory * 1e6, self.local_cluster.scheduler.address) app_id = self.knit.start(command, env=self.env, num_containers=n_workers, virtual_cores=cpus, memory=memory, checks=checks, **kwargs) self.app_id = app_id return app_id def remove_worker(self, container_id): """ Stop worker and remove container Parameters ---------- container_id Returns ------- None """ self.knit.remove_containers(container_id) @property def workers(self): """ list of running container ids """ # remove container ...00001 -- this is applicationMaster's container and # should not be removed or counted as a worker containers = list(self.knit.get_container_statuses()) containers.sort() self.application_master_container = containers.pop(0) return containers @gen.coroutine def _start(self): pass def stop(self): """Kill the YARN application and all workers""" if self.knit: self.knit.kill() def add_workers(self, n_workers=1, cpus=1, memory=2048): """ Non-blocking function to ask Yarn for more containers/dask-workers Parameters ---------- n_workers: int number of containers to add (default: 1) cpus: int number of cpus (default: 1) memory: int amount of memory to allocate per container Returns ------- None """ self.knit.add_containers(num_containers=n_workers, virtual_cores=cpus, memory=memory) def __enter__(self): return self def __exit__(self, *args): self.close() def close(self): """Stop the scheduler and workers""" self.stop() self.local_cluster.close()
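A hedged usage sketch for DaskYARNCluster above, based only on the methods it defines (start, scheduler_address, close via the context manager). It assumes a reachable YARN cluster and HDFS, and that a conda environment can be built or is supplied through the env argument.

from distributed import Client

with DaskYARNCluster(packages=['numpy']) as yarn_cluster:
    yarn_cluster.start(n_workers=2, cpus=1, memory=2048)  # ask YARN for two containers
    client = Client(yarn_cluster.scheduler_address)
    total = client.submit(sum, range(10)).result()
    assert total == 45
    client.close()
# leaving the with-block kills the YARN application and closes the local scheduler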
def parallel_calculate_chunks(chunks, features, approximate, training_window, verbose, save_progress, entityset, n_jobs, no_unapproximated_aggs, cutoff_df_time_var, target_time, pass_columns, dask_kwargs=None): from distributed import Client, LocalCluster, as_completed from dask.base import tokenize client = None cluster = None try: if 'cluster' in dask_kwargs: cluster = dask_kwargs['cluster'] else: diagnostics_port = None if 'diagnostics_port' in dask_kwargs: diagnostics_port = dask_kwargs['diagnostics_port'] del dask_kwargs['diagnostics_port'] workers = n_jobs_to_workers(n_jobs) workers = min(workers, len(chunks)) cluster = LocalCluster(n_workers=workers, threads_per_worker=1, diagnostics_port=diagnostics_port, **dask_kwargs) # if cluster has bokeh port, notify user if unxepected port number if diagnostics_port is not None: if hasattr(cluster, 'scheduler') and cluster.scheduler: info = cluster.scheduler.identity() if 'bokeh' in info['services']: msg = "Dashboard started on port {}" print(msg.format(info['services']['bokeh'])) client = Client(cluster) # scatter the entityset # denote future with leading underscore start = time.time() es_token = "EntitySet-{}".format(tokenize(entityset)) if es_token in client.list_datasets(): print("Using EntitySet persisted on the cluster as dataset %s" % (es_token)) _es = client.get_dataset(es_token) else: _es = client.scatter([entityset])[0] client.publish_dataset(**{_es.key: _es}) # save features to a tempfile and scatter it pickled_feats = cloudpickle.dumps(features) _saved_features = client.scatter(pickled_feats) client.replicate([_es, _saved_features]) end = time.time() scatter_time = end - start scatter_string = "EntitySet scattered to workers in {:.3f} seconds" print(scatter_string.format(scatter_time)) # map chunks # TODO: consider handling task submission dask kwargs _chunks = client.map(calculate_chunk, chunks, features=_saved_features, entityset=_es, approximate=approximate, training_window=training_window, profile=False, verbose=False, save_progress=save_progress, no_unapproximated_aggs=no_unapproximated_aggs, cutoff_df_time_var=cutoff_df_time_var, target_time=target_time, pass_columns=pass_columns) feature_matrix = [] iterator = as_completed(_chunks).batches() if verbose: pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | " "Progress: {l_bar}{bar}| " "Calculated: {n}/{total} chunks") pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str) for batch in iterator: results = client.gather(batch) for result in results: feature_matrix.append(result) if verbose: pbar.update() if verbose: pbar.close() except Exception: raise finally: if 'cluster' not in dask_kwargs and cluster is not None: cluster.close() if client is not None: client.close() return feature_matrix
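A hedged sketch of reusing an existing cluster with the helper above: when dask_kwargs contains a 'cluster' entry, the function connects to it instead of creating its own, and it deliberately skips closing externally supplied clusters in the finally block. The featuretools calculate_feature_matrix entry point and the feature_defs/es/cutoff_times names below are assumptions for illustration, not taken from the source.

from distributed import LocalCluster
import featuretools as ft

cluster = LocalCluster(n_workers=4, threads_per_worker=1)
try:
    feature_matrix = ft.calculate_feature_matrix(
        features=feature_defs,             # assumed: previously built feature definitions
        entityset=es,                      # assumed: an existing EntitySet
        cutoff_time=cutoff_times,          # assumed: cutoff-time dataframe
        dask_kwargs={'cluster': cluster},  # reuse this cluster; it will not be closed for us
    )
finally:
    cluster.close()  # the caller owns externally supplied clusters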
class JobQueueCluster(ClusterManager): """ Base class to launch Dask Clusters for Job queues This class should not be used directly, use inherited class appropriate for your queueing system (e.g. PBScluster or SLURMCluster) Parameters ---------- name : str Name of Dask workers. cores : int Total number of cores per job memory: str Total amount of memory per job processes : int Number of processes per job interface : str Network interface like 'eth0' or 'ib0'. death_timeout : float Seconds to wait for a scheduler before closing workers local_directory : str Dask worker local directory for file spilling. extra : list Additional arguments to pass to `dask-worker` env_extra : list Other commands to add to script before launching worker. log_directory : str Directory to use for job scheduler logs. shebang : str Path to desired interpreter for your batch submission script. python : str Python executable used to launch Dask workers. config_name : str Section to use from jobqueue.yaml configuration file. kwargs : dict Additional keyword arguments to pass to `LocalCluster` Attributes ---------- submit_command: str Abstract attribute for job scheduler submit command, should be overridden cancel_command: str Abstract attribute for job scheduler cancel command, should be overridden See Also -------- PBSCluster SLURMCluster SGECluster OARCluster LSFCluster MoabCluster """ _script_template = """ %(shebang)s %(job_header)s %(env_header)s %(worker_command)s """.lstrip() # Following class attributes should be overridden by extending classes. submit_command = None cancel_command = None job_id_regexp = r"(?P<job_id>\d+)" def __init__(self, name=None, cores=None, memory=None, processes=None, interface=None, death_timeout=None, local_directory=None, extra=None, env_extra=None, log_directory=None, threads=None, shebang=None, python=sys.executable, config_name=None, **kwargs): """ """ # """ # This initializer should be considered as Abstract, and never used directly. # """ super(JobQueueCluster, self).__init__() if threads is not None: raise ValueError(threads_deprecation_message) if config_name is None: raise NotImplementedError( "JobQueueCluster is an abstract class that should not be instantiated." 
) if name is None: name = dask.config.get("jobqueue.%s.name" % config_name) if cores is None: cores = dask.config.get("jobqueue.%s.cores" % config_name) if memory is None: memory = dask.config.get("jobqueue.%s.memory" % config_name) if processes is None: processes = dask.config.get("jobqueue.%s.processes" % config_name) if interface is None: interface = dask.config.get("jobqueue.%s.interface" % config_name) if death_timeout is None: death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name) if local_directory is None: local_directory = dask.config.get("jobqueue.%s.local-directory" % config_name) if extra is None: extra = dask.config.get("jobqueue.%s.extra" % config_name) if env_extra is None: env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name) if log_directory is None: log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name) if shebang is None: shebang = dask.config.get("jobqueue.%s.shebang" % config_name) if dask.config.get("jobqueue.%s.threads", None): warnings.warn(threads_deprecation_message) if cores is None: raise ValueError( "You must specify how many cores to use per job like ``cores=8``" ) if memory is None: raise ValueError( "You must specify how much memory to use per job like ``memory='24 GB'``" ) # This attribute should be overridden self.job_header = None if interface: extra += ["--interface", interface] kwargs.setdefault("ip", get_ip_interface(interface)) else: kwargs.setdefault("ip", "") # Bokeh diagnostics server should listen on all interfaces kwargs.setdefault("dashboard_address", ("", 8787)) self.local_cluster = LocalCluster(n_workers=0, **kwargs) # Keep information on process, cores, and memory, for use in subclasses self.worker_memory = parse_bytes( memory) if memory is not None else None self.worker_processes = processes self.worker_cores = cores self.name = name # plugin for tracking job status self._scheduler_plugin = JobQueuePlugin() self.local_cluster.scheduler.add_plugin(self._scheduler_plugin) self._adaptive = None self.shebang = shebang self._env_header = "\n".join(env_extra) # dask-worker command line build dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict( python=python) command_args = [dask_worker_command, self.scheduler.address] command_args += ["--nthreads", self.worker_process_threads] if processes is not None and processes > 1: command_args += ["--nprocs", processes] command_args += ["--memory-limit", self.worker_process_memory] command_args += ["--name", "%s--${JOB_ID}--" % name] if death_timeout is not None: command_args += ["--death-timeout", death_timeout] if local_directory is not None: command_args += ["--local-directory", local_directory] if extra is not None: command_args += extra self._command_template = " ".join(map(str, command_args)) self.log_directory = log_directory if self.log_directory is not None: if not os.path.exists(self.log_directory): os.makedirs(self.log_directory) def __repr__(self): running_workers = self._count_active_workers() running_cores = running_workers * self.worker_process_threads total_jobs = len(self.pending_jobs) + len(self.running_jobs) total_workers = total_jobs * self.worker_processes running_memory = running_workers * self.worker_memory / self.worker_processes return (self.__class__.__name__ + "(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)" % ( running_cores, format_bytes(running_memory), running_workers, total_workers, len(self.running_jobs), total_jobs, )) @property def pending_jobs(self): """ Jobs pending in the queue """ return 
self._scheduler_plugin.pending_jobs @property def running_jobs(self): """ Jobs with currently active workers """ return self._scheduler_plugin.running_jobs @property def finished_jobs(self): """ Jobs that have finished """ return self._scheduler_plugin.finished_jobs @property def worker_process_threads(self): return int(self.worker_cores / self.worker_processes) @property def worker_process_memory(self): mem = format_bytes(self.worker_memory / self.worker_processes) mem = mem.replace(" ", "") return mem @property def jobqueue_worker_spec(self): """ single worker process info needed for scaling on cores or memory """ return { "cores": self.worker_process_threads, "memory": self.worker_process_memory, } @property def workers(self): """ workers currently connected to the scheduler """ return self.scheduler.workers def job_script(self): """ Construct a job submission script """ pieces = { "shebang": self.shebang, "job_header": self.job_header, "env_header": self._env_header, "worker_command": self._command_template, } return self._script_template % pieces @contextmanager def job_file(self): """ Write job submission script to temporary file """ with tmpfile(extension="sh") as fn: with open(fn, "w") as f: logger.debug("writing job script: \n%s", self.job_script()) f.write(self.job_script()) yield fn def _submit_job(self, script_filename): return self._call(shlex.split(self.submit_command) + [script_filename]) def start_workers(self, n=1): """ Start workers and point them to our local scheduler """ logger.debug("starting %s workers", n) num_jobs = int(math.ceil(n / self.worker_processes)) for _ in range(num_jobs): with self.job_file() as fn: out = self._submit_job(fn) job = self._job_id_from_submit_output(out) if not job: raise ValueError( "Unable to parse jobid from output of %s" % out) logger.debug("started job: %s", job) self.pending_jobs[job] = {} @property def scheduler(self): """ The scheduler of this cluster """ return self.local_cluster.scheduler def _call(self, cmd, **kwargs): """ Call a command using subprocess.Popen. This centralizes calls out to the command line, providing consistent outputs, logging, and an opportunity to go asynchronous in the future. Parameters ---------- cmd: List(str)) A command, each of which is a list of strings to hand to subprocess.Popen Examples -------- >>> self._call(['ls', '/foo']) Returns ------- The stdout produced by the command, as string. 
Raises ------ RuntimeError if the command exits with a non-zero exit code """ cmd_str = " ".join(cmd) logger.debug( "Executing the following command to command line\n{}".format( cmd_str)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs) out, err = proc.communicate() if six.PY3: out, err = out.decode(), err.decode() if proc.returncode != 0: raise RuntimeError("Command exited with non-zero exit code.\n" "Exit code: {}\n" "Command:\n{}\n" "stdout:\n{}\n" "stderr:\n{}\n".format(proc.returncode, cmd_str, out, err)) return out def stop_workers(self, workers): """ Stop a list of workers""" logger.debug("Stopping workers: %s", workers) if not workers: return jobs = self._del_pending_jobs() # stop pending jobs too for w in workers: if isinstance(w, dict): jobs.append(_job_id_from_worker_name(w["name"])) else: jobs.append(_job_id_from_worker_name(w.name)) self.stop_jobs(jobs) def stop_jobs(self, jobs): """ Stop a list of jobs""" logger.debug("Stopping jobs: %s", jobs) if jobs: jobs = list(jobs) self._call(shlex.split(self.cancel_command) + list(set(jobs))) # if any of these jobs were pending, we should remove those now for job_id in jobs: if job_id in self.pending_jobs: del self.pending_jobs[job_id] def scale_up(self, n, **kwargs): """ Brings total worker count up to ``n`` """ active_and_pending = self._count_active_and_pending_workers() if n >= active_and_pending: logger.debug("Scaling up to %d workers.", n) self.start_workers(n - active_and_pending) else: # scale_up should not be called if n < active + pending jobs logger.warning("JobQueueCluster.scale_up was called with a" " number of workers lower that what is already" " running or pending") def _count_active_and_pending_workers(self): active_and_pending = (self._count_active_workers() + self._count_pending_workers()) logger.debug("Found %d active/pending workers.", active_and_pending) assert len(self.scheduler.workers) <= active_and_pending return active_and_pending def _count_active_workers(self): active_workers = sum([len(j) for j in self.running_jobs.values()]) assert len(self.scheduler.workers) == active_workers return active_workers def _count_pending_workers(self): return self.worker_processes * len(self.pending_jobs) def scale_down(self, workers, n=None): """ Close the workers with the given addresses """ if n is None: # Adaptive currently calls directly scale_down, we need to handle this # Need to only keep active workers minus those adaptive wants to stop n = self._count_active_workers() - len(workers) logger.debug("Scaling down to %d Workers: %s", n, workers) active_and_pending = self._count_active_and_pending_workers() n_to_close = active_and_pending - n if n_to_close < 0: logger.warning("JobQueueCluster.scale_down was called with" " a number of worker greater than what is" " already running or pending.") elif n_to_close <= self._count_pending_workers(): # We only need to kill some pending jobs, to_kill = int(n_to_close / self.worker_processes) jobs = list(self.pending_jobs.keys())[-to_kill:] logger.debug("%d jobs to stop, stopping jobs %s", to_kill, jobs) self.stop_jobs(jobs) else: worker_states = [] for w in workers: try: # Get the actual WorkerState worker_states.append(self.scheduler.workers[w]) except KeyError: logger.debug("worker %s is already gone", w) self.stop_workers(worker_states) def stop_all_jobs(self): """ Stops all running and pending jobs """ jobs = self._del_pending_jobs() jobs += list(self.running_jobs.keys()) self.stop_jobs(set(jobs)) def close(self, **kwargs): """ Stops all 
running and pending jobs and stops the scheduler """ self.stop_all_jobs() return self.local_cluster.close(**kwargs) def __enter__(self): return self def __exit__(self, type, value, traceback): self.close() self.local_cluster.__exit__(type, value, traceback) def _del_pending_jobs(self): jobs = list(self.pending_jobs.keys()) logger.debug("Deleting pending jobs %s" % jobs) for job_id in jobs: del self.pending_jobs[job_id] return jobs def _job_id_from_submit_output(self, out): match = re.search(self.job_id_regexp, out) if match is None: msg = ("Could not parse job id from submission command " "output.\nJob id regexp is {!r}\nSubmission command " "output is:\n{}".format(self.job_id_regexp, out)) raise ValueError(msg) job_id = match.groupdict().get("job_id") if job_id is None: msg = ( "You need to use a 'job_id' named group in your regexp, e.g. " "r'(?P<job_id>\\d+)'. Your regexp was: " "{!r}".format(self.job_id_regexp)) raise ValueError(msg) return job_id def worker_key(self, worker_state): return _job_id_from_worker_name(worker_state.name)
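# A minimal, standalone sketch (not part of the class above) of the 'job_id'
# named-group convention that _job_id_from_submit_output relies on. The sample
# submission output and the regexp below are illustrative assumptions, not
# taken from any particular scheduler.
import re

job_id_regexp = r"(?P<job_id>\d+)"             # must define a 'job_id' named group
submit_output = "Submitted batch job 123456"   # hypothetical sbatch-style output

match = re.search(job_id_regexp, submit_output)
if match is None:
    raise ValueError("Could not parse job id from submission output")
job_id = match.groupdict().get("job_id")
assert job_id == "123456"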
class Cluster: def _wait_workers_start(self, n=1, timeout=None): """ Wait until the number of workers seen by the scheduler is at least n, then return True. If the timeout is reached before that happens, return False. """ dt = timeout or self._wait_timeout end_time = time() + dt while True: if self.n_workers >= n: return True if time() > end_time: return False sleep(self._wait_timestep) def _start_local_cluster(self, **kwargs): ip = kwargs.pop("ip", socket.gethostbyname(self._hostname)) scheduler_port = kwargs.pop("scheduler_port", 0) self._local_cluster = LocalCluster(n_workers=0, ip=ip, scheduler_port=scheduler_port, **kwargs) logger.info("Started local scheduler at {addr}".format( addr=self.scheduler_address)) def __init__(self, slurm_kwargs=None, hostname=None, task_name=None, nanny=True, bokeh=True, bokeh_port=None, timeout=10., extra_path=None, tmp_dir=None, **kwargs): """ Dask.Distributed workers launched via the Slurm workload manager Parameters ---------- slurm_kwargs : dict A dictionary with arguments passed to the Slurm batch script (see Examples). If None, defaults to an empty dictionary. hostname : None or string Hostname of a controller node, visible to other Slurm nodes. If None, determined automatically through 'socket.gethostname()'. task_name : string or None Name of the job, passed to Slurm. If None, defaults to 'dask-workers'. nanny : boolean Start Dask workers in a nanny process for management. Default is True. bokeh : boolean Whether to launch the Bokeh web UI. Default is True. bokeh_port: None or int Bokeh port for dask-worker. None means the default. timeout : float Default time to wait until workers start (see ``self.start_workers``). extra_path : None or str or List of str Extra module path values that are injected into the workers via the PYTHONPATH environment variable. tmp_dir : str or None Directory for temporary files. If not specified, defaults to "slurmified_files" in the working directory. For now it is assumed that it is accessible from all nodes of the cluster. If you need more clever behaviour, please file a bug. **kwargs: dict Keyword arguments passed directly to the 'distributed.LocalCluster' constructor. Examples -------- >>> from slurmified import Cluster >>> slurm_kwargs = { ... 'partition': 'default', ... 'mem-per-cpu': '100', ... 'time': '1-00:00:00' ... 
} >>> cluster = Cluster(slurm_kwargs) >>> cluster.start_workers(10) >>> from distributed import Client >>> client = Client(cluster) >>> future = client.submit(lambda x: x + 1, 10) >>> future.result() 11 """ self._hostname = hostname or socket.gethostname() self._start_local_cluster(**kwargs) self._slurm_kwargs = slurm_kwargs.copy() if slurm_kwargs else {} nthreads1 = self._slurm_kwargs.pop("cpus-per-task", None) nthreads2 = self._slurm_kwargs.pop("c", None) self._nthreads = nthreads1 or nthreads2 or 1 self._jobid = None self._task_name = task_name or "dask-workers" self._wait_timeout = timeout self._wait_timestep = 1 self._worker_exec = os.path.join(sys.exec_prefix, 'bin', 'dask-worker') logger.info("Using dask-worker executable '{exe}'".format( exe=self._worker_exec)) self._nanny = nanny self._bokeh = bokeh self._bokeh_port = bokeh_port if isinstance(extra_path, str): self._extra_path = [extra_path] else: self._extra_path = extra_path self._tmp_dir = tmp_dir or os.path.abspath("slurmified_files") if not os.path.exists(self._tmp_dir): os.makedirs(self._tmp_dir) self._remove_tmp_dir = True else: self._remove_tmp_dir = False @property def scheduler(self): return self._local_cluster.scheduler @property def scheduler_address(self): return ('{hostname}:{port}'.format(hostname=self._hostname, port=self.scheduler.port)) @property def n_workers(self): return len(self.scheduler.workers) def start_workers(self, n=1, n_min=None, timeout=None, **kwargs): """Start Dask workers via a Slurm batch script. If workers are already started, they are terminated first. Returns self. Parameters ---------- n: int Number of workers to start. n_min: None or int Minimal number of launched workers needed to start calculations. The function waits until it is reached, then exits. If it is not reached within ``timeout``, a RuntimeError is raised. If None, the function will wait for all ``n`` workers to start, but only a warning (never an error) is emitted. timeout: None or int Time in seconds to wait for workers to start. If it is reached and the workers have not started, a warning is emitted. If None, the default provided in the constructor is used. **kwargs: dict Dictionary with strings as keys and values; can be used to override the Slurm kwargs passed to the constructor. 
""" if self._jobid: self.stop_workers() slurm_kwargs = merge(self._slurm_kwargs, kwargs or {}, { "array": "0-{}".format(n - 1), "cpus-per-task": self._nthreads }) if self._extra_path: pythonpath_cmd = ( "[[ -z \"$PYTHONPATH\" ]] && " "export PYTHONPATH=\"{new_entries}\" || " "export PYTHONPATH=\"{new_entries}:$PYTHONPATH\"".format( new_entries=":".join(self._extra_path))) else: pythonpath_cmd = "" s = slurmpy.Slurm(self._task_name, slurm_kwargs=slurm_kwargs, scripts_dir=self._tmp_dir) # This command puts Jobid to stderr, that is likely nice to suppress with redirect_stderr(_Sink): self._jobid = s.run(pythonpath_cmd + "\n" + " ".join(( self._worker_exec, "--nthreads", str(self._nthreads), "--nprocs", "1", "--reconnect", "--nanny" if self._nanny else "--no-nanny", "--bokeh" if self._bokeh else "--no-bokeh", ("--bokeh-port {}".format(self._bokeh_port) if self._bokeh_port else ""), "--local-directory \"{}\"".format(self._tmp_dir), self.scheduler_address))) if self._wait_workers_start(n_min or n, timeout): m = ("Started {n} workers, job number {jobid}".format( n=self.n_workers, jobid=self._jobid)) logger.info(m) elif n_min: m = ("Not enough workers to continue " "({n}, minimal provided {n_min})".format(n=self.n_workers, n_min=n_min)) self.stop_workers() raise RuntimeError(m) else: m = ("Timeout is reached while waiting for {n} workers to start. " "{n_started} actually started. Job number {jobid}.".format( n=n, n_started=self.n_workers, jobid=self._jobid)) logger.warning(m) return self def stop_workers(self): """ Stop running workers. """ # Sometimes retire_workers command throws a lot of exceptions, that # also vary from update to update, so we just suppress everything here. # Anyway we just kill all the workers later using Slurm, # so it is just an attempt to do this in polite manner. with warnings.catch_warnings(): warnings.simplefilter('ignore') with suppress(Exception): sync(loop=self._local_cluster.loop, func=self.scheduler.retire_workers, remove=True) if self._jobid: try: subprocess.check_call(("scancel", str(self._jobid))) except subprocess.CalledProcessError as ex: m = ("scancel returned non-zero exit status {code} while " "stopping Slurm job number {jobid} for workers. " "You should check manually whether they are " "terminated successfully.".format(code=ex.returncode, jobid=self._jobid)) logger.error(m) finally: self._jobid = None def _start(self): return self._local_cluster._start() def close(self): """ Close the cluster. """ logger.info("Closing workers and cluster") if self._jobid: self.stop_workers() self._local_cluster.close() if self._remove_tmp_dir: shutil.rmtree(self._tmp_dir) self._remove_tmp_dir = False def __enter__(self): return self def __exit__(self, *args): self.close() def __del__(self): self.close()
class DaskExecutorTest(unittest.TestCase): def setUp(self): self.dagbag = DagBag(include_examples=True) self.cluster = LocalCluster() @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration') def test_dask_executor_functions(self): executor = DaskExecutor(cluster_address=self.cluster.scheduler_address) # start the executor executor.start() success_command = 'echo 1' fail_command = 'exit 1' executor.execute_async(key='success', command=success_command) executor.execute_async(key='fail', command=fail_command) success_future = next( k for k, v in executor.futures.items() if v == 'success') fail_future = next( k for k, v in executor.futures.items() if v == 'fail') # wait for the futures to execute, with a timeout timeout = timezone.utcnow() + timedelta(seconds=30) while not (success_future.done() and fail_future.done()): if timezone.utcnow() > timeout: raise ValueError( 'The futures should have finished; there is probably ' 'an error communicating with the Dask cluster.') # both tasks should have finished self.assertTrue(success_future.done()) self.assertTrue(fail_future.done()) # check task exceptions self.assertTrue(success_future.exception() is None) self.assertTrue(fail_future.exception() is not None) @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration') def test_backfill_integration(self): """ Test that DaskExecutor can be used to backfill example dags """ dags = [ dag for dag in self.dagbag.dags.values() if dag.dag_id in [ 'example_bash_operator', # 'example_python_operator', ] ] for dag in dags: dag.clear( start_date=DEFAULT_DATE, end_date=DEFAULT_DATE) for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)): job = BackfillJob( dag=dag, start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_first_depends_on_past=True, executor=DaskExecutor( cluster_address=self.cluster.scheduler_address)) job.run() def tearDown(self): self.cluster.close(timeout=5)
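# A standalone sketch of the wait-with-timeout pattern exercised by the test
# above, using dask.distributed.wait instead of a manual polling loop. The
# commands and timeout are illustrative; this is not the Airflow executor API.
import subprocess
from dask.distributed import Client, LocalCluster, wait

with LocalCluster(n_workers=1, threads_per_worker=1) as cluster, Client(cluster) as client:
    ok = client.submit(subprocess.check_call, ['echo', '1'], pure=False)
    bad = client.submit(subprocess.check_call, ['false'], pure=False)
    done, not_done = wait([ok, bad], timeout=30)  # raises if the timeout expires
    assert ok.exception() is None
    assert bad.exception() is not None            # 'false' exits non-zero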
class DRMAACluster(object): def __init__(self, template=None, cleanup_interval=1000, hostname=None, script=None, preexec_commands=(), **kwargs): """ Dask workers launched by a DRMAA-compatible cluster Parameters ---------- jobName: string Name of the job as known by the DRMAA cluster. script: string (optional) Path to the dask-worker executable script. A temporary file will be made if none is provided (recommended) args: list Extra string arguments to pass to dask-worker outputPath: string errorPath: string workingDirectory: string Where dask-worker runs, defaults to current directory nativeSpecification: string Options native to the job scheduler Examples -------- >>> from dask_drmaa import DRMAACluster # doctest: +SKIP >>> cluster = DRMAACluster() # doctest: +SKIP >>> cluster.start_workers(10) # doctest: +SKIP >>> from distributed import Client # doctest: +SKIP >>> client = Client(cluster) # doctest: +SKIP >>> future = client.submit(lambda x: x + 1, 10) # doctest: +SKIP >>> future.result() # doctest: +SKIP 11 """ self.hostname = hostname or socket.gethostname() logger.info("Start local scheduler at %s", self.hostname) self.local_cluster = LocalCluster(n_workers=0, ip='', **kwargs) if script is None: fn = tempfile.mktemp(suffix='sh', prefix='dask-worker-script', dir=os.path.curdir) self.script = fn script_contents = make_job_script(executable=worker_bin_path, name='%s.%s' % (JOB_ID, TASK_ID), preexec=preexec_commands) with open(fn, 'wt') as f: f.write(script_contents) @atexit.register def remove_script(): if os.path.exists(fn): os.remove(fn) os.chmod(self.script, 0o777) else: assert not preexec_commands, "Cannot specify both script and preexec_commands" # TODO: check that user-provided script is executable self.template = merge(default_template, {'remoteCommand': self.script}, template or {}) self._cleanup_callback = PeriodicCallback( callback=self.cleanup_closed_workers, callback_time=cleanup_interval, io_loop=self.scheduler.loop) self._cleanup_callback.start() self.workers = {} # {job-id: WorkerSpec} @gen.coroutine def _start(self): pass @property def scheduler(self): return self.local_cluster.scheduler @property def scheduler_address(self): return self.scheduler.address def create_job_template(self, **kwargs): template = self.template.copy() if kwargs: template.update(kwargs) template['args'] = [self.scheduler_address] + template['args'] jt = get_session().createJobTemplate() valid_attributes = dir(jt) for key, value in template.items(): if key not in valid_attributes: raise ValueError("Invalid job template attribute %s" % key) setattr(jt, key, value) return jt def start_workers(self, n=1, **kwargs): with log_errors(): with self.create_job_template(**kwargs) as jt: ids = get_session().runBulkJobs(jt, 1, n, 1) logger.info("Start %d workers. 
Job ID: %s", len(ids), ids[0].split('.')[0]) self.workers.update({ jid: WorkerSpec( job_id=jid, kwargs=kwargs, stdout=worker_out_path_template % dict(jid=jid, kind='out'), stderr=worker_out_path_template % dict(jid=jid, kind='err'), ) for jid in ids }) def stop_workers(self, worker_ids, sync=False): if isinstance(worker_ids, str): worker_ids = [worker_ids] for wid in list(worker_ids): try: get_session().control(wid, drmaa.JobControlAction.TERMINATE) except drmaa.errors.InvalidJobException: pass self.workers.pop(wid) logger.info("Stop workers %s", worker_ids) if sync: get_session().synchronize(worker_ids, dispose=True) def close(self): logger.info("Closing DRMAA cluster") self.local_cluster.close() if self.workers: self.stop_workers(self.workers, sync=True) if os.path.exists(self.script): os.remove(self.script) def __enter__(self): return self def __exit__(self, *args): self.close() def cleanup_closed_workers(self): for jid in list(self.workers): if get_session().jobStatus(jid) in ('closed', 'done'): logger.info("Removing closed worker %s", jid) del self.workers[jid] def __del__(self): try: self.close() except: pass def __str__(self): return "<%s: %d workers>" % (self.__class__.__name__, len( self.workers)) __repr__ = __str__
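# A hedged usage sketch for DRMAACluster, assuming dask_drmaa is importable and
# a working DRMAA session (e.g. libdrmaa pointed at SGE or Slurm) is available.
from distributed import Client
from dask_drmaa import DRMAACluster

with DRMAACluster() as cluster:
    cluster.start_workers(2)  # submits one bulk job with two worker tasks
    client = Client(cluster.scheduler_address)
    assert client.submit(lambda x: x + 1, 10).result() == 11
    client.close()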
class DaskRunner(object): def __init__(self, config): self.config = config self.scheduler_mode = config.getvalue("dask_scheduler_mode") remote_cluster_address = config.getvalue('dask_scheduler_address') if remote_cluster_address: self.client = Client(remote_cluster_address) else: self.cluster = LocalCluster( ip='127.0.0.1', n_workers=int(config.getvalue('dask_nworkers')), processes=config.getvalue('dask_scheduler_mode') == 'process') self.client = Client(self.cluster, set_as_default=True) def __getstate__(self): return {'config': None} def __setstate__(self, state): for k in state: pass def pytest_runtestloop(self, session): if (session.testsfailed and not session.config.option.continue_on_collection_errors): raise session.Interrupted("%d errors during collection" % session.testsfailed) unregister_plugins = ['debugging', 'terminalreporter'] for p in unregister_plugins: session.config.pluginmanager.unregister(p) if session.config.option.collectonly: return True def generate_tasks(session): for i, item in enumerate(session.items): # @delayed(pure=False) def run_test(_item): # ensure that the plugin manager gets recreated appropriately. _item.config.pluginmanager.__recreate__() results = self.pytest_runtest_protocol(item=_item, nextitem=None) return results # hook = item.ihook # try to ensure that the module gets treated as a dynamic module that does not # exist. # delattr(item.module, '__file__') # setup = hook.pytest_runtest_setup # make_report = hook.pytest_runtest_makereport fut = self.client.submit(run_test, item, pure=False) yield fut with self.remote_syspath_ctx(): tasks = generate_tasks(session) # log these reports to the console. for resolved in as_completed(tasks): t = resolved.result() for report in t: session.ihook.pytest_runtest_logreport(report=report) return True @contextmanager def remote_syspath_ctx(self): # Due to test directories being dynamic in certain cases we should make sure that our # workers are using the same pythonpath that we are using here. 
original_sys_path = self.client.run(get_imports) logger.debug("Original remote sys path %s", original_sys_path) updated_sys_path = self.client.run(update_syspath, sys.path) logger.debug("Updated remote sys path %s", updated_sys_path) try: yield finally: # restore correct syspath for worker, value in original_sys_path.items(): self.client.run(restore_syspath, value, workers=[worker]) original_sys_path2 = self.client.run(get_imports) assert original_sys_path == original_sys_path2 def call_and_report(self, item, when, log=True, **kwds): call = self.call_runtest_hook(item, when, **kwds) hook = item.ihook report = hook.pytest_runtest_makereport(item=item, call=call) return report def call_runtest_hook(self, item, when, **kwds): hookname = "pytest_runtest_" + when ihook = getattr(item.ihook, hookname) return CallInfo(lambda: ihook(item=item, **kwds), when=when) # VENDORED so that we have access to the report objects and not just T/F def pytest_runtest_protocol(self, item, log=True, nextitem=None): hasrequest = hasattr(item, "_request") if hasrequest and not item._request: item._initrequest() rep = self.call_and_report(item, "setup", log) reports = [rep] if rep.passed: if item.config.option.setupshow: # TODO figure out how to pass this test # show_test_item(item) pass if not item.config.option.setuponly: rep = self.call_and_report(item, "call", log) reports.append(rep) rep = self.call_and_report(item, "teardown", log, nextitem=None) reports.append(rep) # after all teardown hooks have been called # want funcargs and request info to go away if hasrequest: item._request = False item.funcargs = None return reports def pytest_runtest_setup(self, item): item.session._setupstate.prepare(item) def pytest_unconfigure(self, config): """ called before test process is exited. """ if hasattr(self, 'cluster'): self.cluster.close()
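# A hedged sketch of how a runner like DaskRunner might be wired up from a
# conftest.py. The option names mirror the config.getvalue() keys used above;
# the module path for DaskRunner and the registration details are assumptions,
# not a documented plugin API.
from pytest_dask import DaskRunner  # hypothetical import path for the class above


def pytest_addoption(parser):
    parser.addoption('--dask-scheduler-address', dest='dask_scheduler_address',
                     default=None, help='Address of an existing Dask scheduler')
    parser.addoption('--dask-scheduler-mode', dest='dask_scheduler_mode',
                     default='process', help="Local cluster mode: 'process' or 'thread'")
    parser.addoption('--dask-nworkers', dest='dask_nworkers',
                     type=int, default=4, help='Number of local Dask workers to start')


def pytest_configure(config):
    # Register the runner so its pytest_runtestloop hook takes over test execution.
    if not config.option.collectonly:
        config.pluginmanager.register(DaskRunner(config), 'dask_runner')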