def schedule(self, dsk, result, **kwargs):
    """Execute a dask graph against the attached workers and return results.

    Parameters
    ----------
    dsk : dict
        Dask task graph.
    result : list
        Key or (possibly nested) list of keys whose values are returned.

    Examples
    --------
    >>> scheduler.get({'x': 1, 'y': (add, 'x', 2)}, 'y')  # doctest: +SKIP
    3

    Protocol
    --------
    1.  Scheduler scatters precomputed data in graph to workers
        e.g. nodes like ``{'x': 1}``.  See Scheduler.scatter
    2.
    """
    with self._schedule_lock:
        log(self.address_to_workers, "Scheduling dask")

        # Normalize the requested keys into a flat, mutable set.
        if isinstance(result, list):
            results = set(flatten(result))
        else:
            results = set([result])

        cache = {}
        dag_state = dag_state_from_dask(dsk, cache=cache)
        if cache:
            # Pre-seed workers with literal data found in the graph.
            self.scatter(cache.items())

        tick = [0]  # heartbeat counter, mutated by the fire_task closure

        if dag_state['waiting'] and not dag_state['ready']:
            raise ValueError("Found no accessible jobs in dask graph")

        event_queue = Queue()
        qkey = str(uuid.uuid1())
        self.queues[qkey] = event_queue

        def fire_task():
            # Pop one runnable task and hand it to a worker.
            tick[0] += 1  # update heartbeat

            key = dag_state['ready'].pop()
            dag_state['ready-set'].remove(key)
            dag_state['running'].add(key)

            self.trigger_task(dsk, key, qkey)  # fire

        # Verify at least one worker is alive before seeding work.
        try:
            worker = self.available_workers.get(timeout=20)
            self.available_workers.put(worker)  # put him back in
        except Empty:
            raise ValueError("Waited 20 seconds. No workers found")

        # Seed initial tasks up to the number of available workers.
        while dag_state['ready'] and self.available_workers.qsize() > 0:
            fire_task()

        # Main loop: wait for completions, keep workers saturated.
        while dag_state['waiting'] or dag_state['ready'] or dag_state['running']:
            payload = event_queue.get()

            if isinstance(payload['status'], Exception):
                raise payload['status']

            finish_task(dsk, payload['key'], dag_state, results, sortkey,
                        release_data=self._release_data)

            while dag_state['ready'] and self.available_workers.qsize() > 0:
                fire_task()

        out = self.gather(result)
        # Release result data from workers now that we hold it locally.
        for key in flatten(result):
            self.release_key(key)

    return out
def schedule(self, dsk, result, keep_results=False, **kwargs):
    """Execute a dask graph against the attached workers and return results.

    Parameters
    ----------
    dsk : dict
        Dask task graph.
    result : list
        Key or (possibly nested) list of keys whose values are returned.
    keep_results : bool, optional
        If True, leave computed result data on the workers after gathering
        instead of releasing it.

    Examples
    --------
    >>> scheduler.get({'x': 1, 'y': (add, 'x', 2)}, 'y')  # doctest: +SKIP
    3

    Protocol
    --------
    1.  Scheduler scatters precomputed data in graph to workers
        e.g. nodes like ``{'x': 1}``.  See Scheduler.scatter
    2.
    """
    with self._schedule_lock:
        log(self.address_to_workers, "Scheduling dask")

        # Normalize the requested keys into a flat, mutable set.
        if isinstance(result, list):
            results = set(flatten(result))
        else:
            results = set([result])

        # Remove keys whose values some worker already holds.
        for k in self.who_has:
            if self.who_has[k]:
                del dsk[k]
                if k in results:
                    results.remove(k)

        dsk = cull(dsk, results)

        preexisting_data = {k for k, v in self.who_has.items() if v}
        cache = {k: None for k in preexisting_data}
        dag_state = dag_state_from_dask(dsk, cache=cache)
        del dag_state['cache']

        # Scatter only data that no worker holds yet.
        new_data = {k: v for k, v in cache.items()
                    if not (k in self.who_has and self.who_has[k])}
        if new_data:
            self.scatter(new_data.items())  # send data in dask up to workers

        tick = [0]  # heartbeat counter, mutated by the fire_task closure

        event_queue = Queue()
        qkey = str(uuid.uuid1())
        self.queues[qkey] = event_queue

        def fire_task():
            # Pop one runnable task and hand it to a worker.
            tick[0] += 1  # update heartbeat

            key = dag_state['ready'].pop()
            dag_state['running'].add(key)

            self.trigger_task(key, dsk[key],
                              dag_state['dependencies'][key], qkey)  # fire

        # Verify at least one worker is alive before seeding work.
        try:
            worker = self.available_workers.get(timeout=20)
            self.available_workers.put(worker)  # put him back in
        except Empty:
            raise ValueError("Waited 20 seconds. No workers found")

        # Seed initial tasks up to the number of available workers.
        while dag_state['ready'] and self.available_workers.qsize() > 0:
            fire_task()

        # Main loop: wait for completions, keep workers saturated.
        # Pre-existing data is protected from release during the run.
        release_data = partial(self._release_data, protected=preexisting_data)
        while dag_state['waiting'] or dag_state['ready'] or dag_state['running']:
            payload = event_queue.get()

            if isinstance(payload['status'], Exception):
                raise payload['status']

            key = payload['key']
            finish_task(dsk, key, dag_state, results, sortkey,
                        release_data=release_data,
                        delete=key not in preexisting_data)

            while dag_state['ready'] and self.available_workers.qsize() > 0:
                fire_task()

        out = self.gather(result)
        if not keep_results:
            # Release result data from workers, keeping pre-existing data.
            for key in flatten(result):
                if key not in preexisting_data:
                    self.release_key(key)
        self.cull_redundant_data(3)

    return out
def schedule(self, dsk, result, keep_results=False, **kwargs):
    """Execute a dask graph against the attached workers and return results.

    Parameters
    ----------
    dsk : dict
        Dask task graph.
    result : list
        Key or (possibly nested) list of keys whose values are returned.
    keep_results : bool, optional
        If True, leave computed result data on the workers after gathering
        instead of releasing it.

    Examples
    --------
    >>> scheduler.get({'x': 1, 'y': (add, 'x', 2)}, 'y')  # doctest: +SKIP
    3

    Protocol
    --------
    1.  Scheduler scatters precomputed data in graph to workers
        e.g. nodes like ``{'x': 1}``.  See Scheduler.scatter
    2.
    """
    with self._schedule_lock:
        log(self.address_to_workers, "Scheduling dask")

        # Normalize the requested keys into a flat, mutable set.
        if isinstance(result, list):
            results = set(flatten(result))
        else:
            results = set([result])

        # Remove keys whose values some worker already holds.
        for k in self.who_has:
            if self.who_has[k]:
                del dsk[k]
                if k in results:
                    results.remove(k)

        dsk = cull(dsk, results)

        preexisting_data = {k for k, v in self.who_has.items() if v}
        cache = {k: None for k in preexisting_data}
        dag_state = dag_state_from_dask(dsk, cache=cache)
        del dag_state['cache']

        # Scatter only data that no worker holds yet.
        new_data = {k: v for k, v in cache.items()
                    if not (k in self.who_has and self.who_has[k])}
        if new_data:
            self.scatter(new_data.items())  # send data in dask up to workers

        tick = [0]  # heartbeat counter, mutated by the fire_task closure

        event_queue = Queue()
        qkey = str(uuid.uuid1())
        self.queues[qkey] = event_queue

        def fire_task():
            # Pop one runnable task and hand it to a worker.
            tick[0] += 1  # update heartbeat

            key = dag_state['ready'].pop()
            dag_state['ready-set'].remove(key)
            dag_state['running'].add(key)

            self.trigger_task(key, dsk[key],
                              dag_state['dependencies'][key], qkey)  # fire

        # Verify at least one worker is alive before seeding work.
        try:
            worker = self.available_workers.get(timeout=20)
            self.available_workers.put(worker)  # put him back in
        except Empty:
            raise ValueError("Waited 20 seconds. No workers found")

        # Seed initial tasks up to the number of available workers.
        while dag_state['ready'] and self.available_workers.qsize() > 0:
            fire_task()

        # Main loop: wait for completions, keep workers saturated.
        # Pre-existing data is protected from release during the run.
        release_data = partial(self._release_data, protected=preexisting_data)
        while dag_state['waiting'] or dag_state['ready'] or dag_state['running']:
            payload = event_queue.get()

            if isinstance(payload['status'], Exception):
                raise payload['status']

            key = payload['key']
            finish_task(dsk, key, dag_state, results, sortkey,
                        release_data=release_data,
                        delete=key not in preexisting_data)

            while dag_state['ready'] and self.available_workers.qsize() > 0:
                fire_task()

        out = self.gather(result)
        if not keep_results:
            # Release result data from workers, keeping pre-existing data.
            for key in flatten(result):
                if key not in preexisting_data:
                    self.release_key(key)
        self.cull_redundant_data(3)

    return out
def schedule(self, dsk, result, **kwargs):
    """Execute a dask graph against the attached workers and return results.

    Parameters
    ----------
    dsk : dict
        Dask task graph.
    result : list
        Key or (possibly nested) list of keys whose values are returned.

    Examples
    --------
    >>> scheduler.get({'x': 1, 'y': (add, 'x', 2)}, 'y')  # doctest: +SKIP
    3

    Protocol
    --------
    1.  Scheduler scatters precomputed data in graph to workers
        e.g. nodes like ``{'x': 1}``.  See Scheduler.scatter
    2.
    """
    with self._schedule_lock:
        log(self.address_to_workers, "Scheduling dask")

        # Normalize the requested keys into a flat, mutable set.
        if isinstance(result, list):
            results = set(flatten(result))
        else:
            results = set([result])

        cache = {}
        dag_state = dag_state_from_dask(dsk, cache=cache)
        if cache:
            # Pre-seed workers with literal data found in the graph.
            self.scatter(cache.items())

        tick = [0]  # heartbeat counter, mutated by the fire_task closure

        if dag_state['waiting'] and not dag_state['ready']:
            raise ValueError("Found no accessible jobs in dask graph")

        event_queue = Queue()
        qkey = str(uuid.uuid1())
        self.queues[qkey] = event_queue

        def fire_task():
            # Pop one runnable task and hand it to a worker.
            tick[0] += 1  # update heartbeat

            key = dag_state['ready'].pop()
            dag_state['ready-set'].remove(key)
            dag_state['running'].add(key)

            self.trigger_task(dsk, key, qkey)  # fire

        # Verify at least one worker is alive before seeding work.
        try:
            worker = self.available_workers.get(timeout=20)
            self.available_workers.put(worker)  # put him back in
        except Empty:
            raise ValueError("Waited 20 seconds. No workers found")

        # Seed initial tasks up to the number of available workers.
        while dag_state['ready'] and self.available_workers.qsize() > 0:
            fire_task()

        # Main loop: wait for completions, keep workers saturated.
        while dag_state['waiting'] or dag_state['ready'] or dag_state['running']:
            payload = event_queue.get()

            if isinstance(payload['status'], Exception):
                raise payload['status']

            finish_task(dsk, payload['key'], dag_state, results,
                        release_data=self._release_data)

            while dag_state['ready'] and self.available_workers.qsize() > 0:
                fire_task()

        out = self.gather(result)
        # Release result data from workers now that we hold it locally.
        for key in flatten(result):
            self.release_key(key)

    return out