Example #1
0
    def schedule(self, dsk, result, **kwargs):
        """ Execute dask graph against workers

        Parameters
        ----------

        dsk: dict
            Dask graph
        result: list
            keys to return (possibly nested)

        Example
        -------

        >>> scheduler.get({'x': 1, 'y': (add, 'x', 2)}, 'y')  # doctest: +SKIP
        3

        Protocol
        --------

        1.  Scheduler scatters precomputed data in graph to workers
            e.g. nodes like ``{'x': 1}``.  See Scheduler.scatter
        2.  Scheduler fires ready tasks at available workers, then waits on
            an event queue for completion reports, firing newly unblocked
            tasks as their dependencies finish, until the graph is done
        """
        # Serialize concurrent schedule() calls; this method mutates shared
        # scheduler state (self.queues, workers, remote data).
        with self._schedule_lock:
            log(self.address_to_workers, "Scheduling dask")
            # Normalize the requested keys to a flat set, whether `result`
            # is a single key or a (possibly nested) list of keys.
            if isinstance(result, list):
                result_flat = set(flatten(result))
            else:
                result_flat = set([result])
            results = set(result_flat)

            cache = dict()
            # Build scheduling state (waiting/ready/running bookkeeping);
            # literal data nodes from dsk are extracted into `cache`.
            dag_state = dag_state_from_dask(dsk, cache=cache)
            if cache:
                self.scatter(cache.items())  # send data in dask up to workers

            # Heartbeat counter, boxed in a list so fire_task() can mutate it.
            tick = [0]

            # Tasks are waiting but nothing is runnable: the graph has no
            # entry point (e.g. unmet external dependencies or a cycle).
            if dag_state['waiting'] and not dag_state['ready']:
                raise ValueError("Found no accessible jobs in dask graph")

            # Per-call queue on which workers report task completions;
            # registered under a unique key so workers can address it.
            event_queue = Queue()
            qkey = str(uuid.uuid1())
            self.queues[qkey] = event_queue

            def fire_task():
                # Pop one runnable task, mark it running, and dispatch it.
                tick[0] += 1  # Update heartbeat

                # Choose a good task to compute
                key = dag_state['ready'].pop()
                dag_state['ready-set'].remove(key)
                dag_state['running'].add(key)

                self.trigger_task(dsk, key, qkey)  # Fire

            # Fail fast if no worker is available within 20 seconds.
            try:
                worker = self.available_workers.get(timeout=20)
                self.available_workers.put(worker)  # put him back in
            except Empty:
                raise ValueError("Waited 20 seconds. No workers found")

            # Seed initial tasks
            while dag_state['ready'] and self.available_workers.qsize() > 0:
                fire_task()

            # Main loop, wait on tasks to finish, insert new ones
            while dag_state['waiting'] or dag_state['ready'] or dag_state['running']:
                payload = event_queue.get()

                # Workers report failure by sending the exception object
                # itself as the status; re-raise it on the scheduler side.
                if isinstance(payload['status'], Exception):
                    raise payload['status']

                key = payload['key']
                # Record `key` as finished: release inputs that are no longer
                # needed and promote newly unblocked tasks to ready.
                finish_task(dsk, key, dag_state, results, sortkey,
                            release_data=self._release_data)

                while dag_state['ready'] and self.available_workers.qsize() > 0:
                    fire_task()

            # Pull the requested keys back from the workers, then free them
            # remotely since this variant never keeps results on the cluster.
            result2 = self.gather(result)
            for key in flatten(result):  # release result data from workers
                self.release_key(key)
        return result2
Example #2
0
    def schedule(self, dsk, result, keep_results=False, **kwargs):
        """ Execute dask graph against workers

        Parameters
        ----------

        dsk: dict
            Dask graph
        result: list
            keys to return (possibly nested)
        keep_results: bool, optional
            If True, leave the result data on the workers after gathering
            instead of releasing it (default False)

        Example
        -------

        >>> scheduler.get({'x': 1, 'y': (add, 'x', 2)}, 'y')  # doctest: +SKIP
        3

        Protocol
        --------

        1.  Scheduler scatters precomputed data in graph to workers
            e.g. nodes like ``{'x': 1}``.  See Scheduler.scatter
        2.  Scheduler fires ready tasks at available workers, then waits on
            an event queue for completion reports, firing newly unblocked
            tasks as their dependencies finish, until the graph is done
        """
        # Serialize concurrent schedule() calls; this method mutates shared
        # scheduler state (self.queues, self.who_has, workers, remote data).
        with self._schedule_lock:
            log(self.address_to_workers, "Scheduling dask")
            # Normalize the requested keys to a flat set, whether `result`
            # is a single key or a (possibly nested) list of keys.
            if isinstance(result, list):
                result_flat = set(flatten(result))
            else:
                result_flat = set([result])
            results = set(result_flat)

            # Skip recomputing keys whose data already lives on some worker.
            # NOTE(review): `del dsk[k]` assumes every known key appears in
            # dsk — would raise KeyError otherwise; confirm against callers.
            for k in self.who_has:  # remove keys that we already know about
                if self.who_has[k]:
                    del dsk[k]
                    if k in results:
                        results.remove(k)

            # Drop tasks that no requested key depends on.
            dsk = cull(dsk, results)

            # Keys whose data already exists on the cluster; seeded into the
            # cache as placeholders so the DAG state treats them as done.
            preexisting_data = set(k for k, v in self.who_has.items() if v)
            cache = dict((k, None) for k in preexisting_data)
            # Build scheduling state (waiting/ready/running bookkeeping);
            # literal data nodes from dsk are extracted into `cache`.
            dag_state = dag_state_from_dask(dsk, cache=cache)
            del dag_state['cache']

            # Only ship data the cluster does not already hold.
            new_data = dict((k, v) for k, v in cache.items()
                                   if not (k in self.who_has and
                                           self.who_has[k]))
            if new_data:
                self.scatter(new_data.items())  # send data in dask up to workers

            # Heartbeat counter, boxed in a list so fire_task() can mutate it.
            tick = [0]

            # Per-call queue on which workers report task completions;
            # registered under a unique key so workers can address it.
            event_queue = Queue()
            qkey = str(uuid.uuid1())
            self.queues[qkey] = event_queue

            def fire_task():
                # Pop one runnable task, mark it running, and dispatch it
                # along with its task definition and dependency keys.
                tick[0] += 1  # Update heartbeat

                # Choose a good task to compute
                key = dag_state['ready'].pop()
                dag_state['ready-set'].remove(key)
                dag_state['running'].add(key)

                self.trigger_task(key, dsk[key],
                        dag_state['dependencies'][key], qkey)  # Fire

            # Fail fast if no worker is available within 20 seconds.
            try:
                worker = self.available_workers.get(timeout=20)
                self.available_workers.put(worker)  # put him back in
            except Empty:
                raise ValueError("Waited 20 seconds. No workers found")

            # Seed initial tasks
            while dag_state['ready'] and self.available_workers.qsize() > 0:
                fire_task()

            # Main loop, wait on tasks to finish, insert new ones
            # Data that predated this call is protected from being released.
            release_data = partial(self._release_data, protected=preexisting_data)
            while dag_state['waiting'] or dag_state['ready'] or dag_state['running']:
                payload = event_queue.get()

                # Workers report failure by sending the exception object
                # itself as the status; re-raise it on the scheduler side.
                if isinstance(payload['status'], Exception):
                    raise payload['status']

                key = payload['key']
                # Record `key` as finished: release inputs that are no longer
                # needed (but never preexisting data) and promote newly
                # unblocked tasks to ready.
                finish_task(dsk, key, dag_state, results, sortkey,
                            release_data=release_data,
                            delete=key not in preexisting_data)

                while dag_state['ready'] and self.available_workers.qsize() > 0:
                    fire_task()

            # Pull the requested keys back from the workers; optionally free
            # them remotely (preexisting data is always left in place).
            result2 = self.gather(result)
            if not keep_results:  # release result data from workers
                for key in flatten(result):
                    if key not in preexisting_data:
                        self.release_key(key)

            # Trim replicas: keep at most 3 copies of each piece of data.
            # NOTE(review): the meaning of the argument `3` is inferred from
            # the method name — confirm against cull_redundant_data.
            self.cull_redundant_data(3)

        return result2