Example #1
def test_multiple_clients():
    with scheduler_and_workers(1) as (s, (a, )):
        c = Client(s.address_to_clients)
        d = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x') == d.get({'x': (inc, 1)}, 'x')

        pool = ThreadPool(2)

        future1 = pool.apply_async(c.get,
                                   args=({
                                       'x': 1,
                                       'y': (inc, 'x')
                                   }, 'y'))
        future2 = pool.apply_async(d.get,
                                   args=({
                                       'a': 1,
                                       'b': (inc, 'a')
                                   }, 'b'))

        while not (future1.ready() and future2.ready()):
            sleep(1e-6)

        assert future1.get() == future2.get()
        c.close()
        d.close()
Example #2
def test_multiple_clients():
    with scheduler_and_workers(1) as (s, (a, )):
        c = Client(s.address_to_clients)
        d = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x') == d.get({'x': (inc, 1)}, 'x')

        def sleep_inc(x):
            sleep(0.5)
            return x + 1

        pool = ThreadPool(2)

        future1 = pool.apply_async(c.get,
                                   args=({
                                       'x': 1,
                                       'y': (sleep_inc, 'x')
                                   }, 'y'))
        future2 = pool.apply_async(d.get,
                                   args=({
                                       'a': 1,
                                       'b': (sleep_inc, 'a')
                                   }, 'b'))

        assert future1.get() == future2.get()
        c.close()
        d.close()
Example #3
class Pipeline:
    def __init__(self, root_dir, png_path=None):
        self.session_dirs = list(find_session_dirs((root_dir,)))
        d = {}
        for session_dir in self.session_dirs:
            session_dir = str(session_dir)
            for task in missing_tasks(session_dir):
                d[('task_name', task.name)] = task.name
                dependencies = [(dt, session_dir) for dt in task.depends_on]
                d[(task.name, session_dir)] = (
                    run_task, ('task_name', task.name), session_dir, dependencies)
            d[('end', session_dir)] = (
                _count, [(task_name, session_dir) for task_name in TASK_CLASSES.keys()])
        if png_path:
            visualize(d, filename=png_path)
        self.graph = d
        self.create_cluster()

    def create_cluster(self):
        self.cluster = LocalCluster(
            n_workers=1, processes=False, silence_logs=logging.DEBUG)
        self.client = Client(self.cluster)

    def run(self):
        # TODO: check priority, IO, etc.
        # TODO: keep the dashboard server running continuously
        return self.client.get(
            self.graph, [('end', session_dir) for session_dir in self.session_dirs])
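The Pipeline above hand-builds a raw dask graph: a plain dict mapping keys (strings or tuples) to task tuples of the form (function, arg1, arg2, ...), where any argument that is itself a key in the dict is replaced by that key's computed result before the function is called. A minimal, self-contained sketch of this convention submitted through a local distributed Client (load and combine are hypothetical stand-ins, not functions from the example above):

# Minimal sketch of the raw dask graph convention used by Pipeline above.
# `load` and `combine` are hypothetical stand-ins for the project tasks.
from dask.distributed import Client

def load(path):
    return len(path)

def combine(a, b):
    return a + b

graph = {
    ('load', 'a'): (load, 'file_a'),   # 'file_a' is a literal argument
    ('load', 'b'): (load, 'file_b'),
    'total': (combine, ('load', 'a'), ('load', 'b')),  # graph keys are replaced by their results
}

if __name__ == '__main__':
    client = Client(processes=False)    # small in-process cluster, for illustration
    print(client.get(graph, 'total'))   # combine(load('file_a'), load('file_b'))
    client.close()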
Example #4
def test_keep_results():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x', keep_results=True) == 2

        assert 'x' in a.data or 'x' in b.data
Example #5
def test_error():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        assert raises(TypeError,
                lambda: c.get({'x': 1, 'y': (inc, 'x', 'x')}, 'y'))
        assert 'y' not in s.data
        c.close()
Example #6
def test_get_with_dill():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        dsk = {'x': 1, 'y': (partial(add, 1), 'x')}
        keys = 'y'

        assert c.get(dsk, keys) == 2
        c.close()
Example #7
def test_get():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        dsk = {'x': 1, 'y': (add, 'x', 'x'), 'z': (inc, 'y')}
        keys = ['y', 'z']

        assert c.get(dsk, keys) == [2, 3]
        c.close()
Example #8
def test_error():
    with scheduler_and_workers() as (s, (a, b)):
        c = Client(s.address_to_clients)

        assert raises(TypeError, lambda: c.get({
            'x': 1,
            'y': (inc, 'x', 'x')
        }, 'y'))
        assert 'y' not in s.data
        c.close()
Example #9
def test_multiple_clients():
    with scheduler_and_workers(1) as (s, (a,)):
        c = Client(s.address_to_clients)
        d = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x') == d.get({'x': (inc, 1)}, 'x')

        pool = ThreadPool(2)

        future1 = pool.apply_async(c.get,
                                   args=({'x': 1, 'y': (inc, 'x')}, 'y'))
        future2 = pool.apply_async(d.get,
                                   args=({'a': 1, 'b': (inc, 'a')}, 'b'))

        while not (future1.ready() and future2.ready()):
            sleep(1e-6)

        assert future1.get() == future2.get()
        c.close()
        d.close()
Example #10
def test_multiple_clients():
    with scheduler_and_workers(1) as (s, (a,)):
        c = Client(s.address_to_clients)
        d = Client(s.address_to_clients)

        assert c.get({'x': (inc, 1)}, 'x') == d.get({'x': (inc, 1)}, 'x')

        def sleep_inc(x):
            sleep(0.5)
            return x + 1

        pool = ThreadPool(2)

        future1 = pool.apply_async(c.get,
                                   args=({'x': 1, 'y': (sleep_inc, 'x')}, 'y'))
        future2 = pool.apply_async(d.get,
                                   args=({'a': 1, 'b': (sleep_inc, 'a')}, 'b'))

        assert future1.get() == future2.get()
        c.close()
        d.close()
Example #11
        'men_bow2_cossim': (Analyzer.cossim, 'analyzer', 'load_bow2model',
                            'load_men'),
        'men_bow5_cossim': (Analyzer.cossim, 'analyzer', 'load_bow5model',
                            'load_men'),
        'men_deps_cossim': (Analyzer.cossim, 'analyzer', 'load_depsmodel',
                            'load_men'),
        'simlex_bow2_stats': (Analyzer.stats, 'analyzer',
                              'simlex_bow2_cossim'),
        'simlex_bow5_stats': (Analyzer.stats, 'analyzer',
                              'simlex_bow5_cossim'),
        'simlex_deps_stats': (Analyzer.stats, 'analyzer',
                              'simlex_deps_cossim'),
        'men_bow2_stats': (Analyzer.stats, 'analyzer', 'men_bow2_cossim'),
        'men_bow5_stats': (Analyzer.stats, 'analyzer', 'men_bow5_cossim'),
        'men_deps_stats': (Analyzer.stats, 'analyzer', 'men_deps_cossim')
    }

    start = time.time()

    #generate data
    dask.get(tasks, ['bow2words', 'bow5words', 'depswords', 'simlex', 'men'])

    #calculate statistics
    client = Client()
    client.get(tasks, [
        'simlex_bow2_stats', 'simlex_bow5_stats', 'simlex_deps_stats',
        'men_bow2_stats', 'men_bow5_stats', 'men_deps_stats'
    ])
    end = time.time()
    logger.info('Time Elapsed {0} Seconds'.format(round(end - start, 2)))
Example #12
def start():
    t = time()
    isins = get_isins()

    # task graph (initially just the list of ISINs)
    graph = {'isin_{}'.format(i): isin for i, isin in enumerate(isins)}

    # add tasks to the graph that load data from the DB
    # and split the loaded data into parameters
    for i, isin in enumerate(isins):
        graph['data_{}'.format(i)] = (load_data, 'isin_{}'.format(i))
        graph['param_a_{}'.format(i)] = (
            get_param, 'data_{}'.format(i), 'param_a')
        graph['param_b_{}'.format(i)] = (
            get_param, 'data_{}'.format(i), 'param_b')

    # add task_a to the graph
    for i, isin in enumerate(isins):
        graph['task_a_res_{}'.format(i)] = (
            task_a,
            'isin_{}'.format(i),
            'param_a_{}'.format(i),
            'param_b_{}'.format(i))

    # Add the input parameter for the group task to the graph:
    param_list = []
    for i in range(len(isins)):
        param_list.append('isin_{}'.format(i))
        param_list.append('task_a_res_{}'.format(i))
        param_list.append('param_b_{}'.format(i))
    graph['group_data'] = (grouper,) + tuple(param_list)

    # Add the group task to the graph
    graph['group_res'] = (task_group, 'group_data')

    # add task_b to the graph
    for i, isin in enumerate(isins):
        graph['task_b_res_{}'.format(i)] = (
            task_b,
            'isin_{}'.format(i),
            'param_b_{}'.format(i),
            'group_res')

    # add task_c to the graph
    for i, isin in enumerate(isins):
        graph['task_c_res_{}'.format(i)] = (
            task_c,
            'isin_{}'.format(i),
            'param_b_{}'.format(i))

    # Build the list of result keys to fetch
    get_list = ['group_data']
    for i, isin in enumerate(isins):
        get_list.append('task_a_res_{}'.format(i))
        get_list.append('task_b_res_{}'.format(i))
        get_list.append('task_c_res_{}'.format(i))

    # Create the client
    client = Client('127.0.0.1:8786')

    # Fetch the results
    result = client.get(graph, get_list)

    total = time() - t
    print(total)
    print(len(result))
    with open('/Users/vladimirmarunov/git/dask-test/res1.txt', 'w') as f:
        f.write('{}\n'.format(total))
        json.dump(result, f, indent=4)
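This start() function generates one small sub-graph per ISIN, funnels all per-ISIN results into a single grouper task by concatenating their keys onto the task tuple, and finally fetches many keys with one client.get call, which returns results in the same order as the requested key list. A cut-down sketch of that fan-out/fan-in shape (load_data, task_a and grouper here are hypothetical stand-ins, and a local in-process client replaces the external scheduler at 127.0.0.1:8786):

# Cut-down sketch of the fan-out/fan-in graph shape built in start() above.
# load_data, task_a and grouper are hypothetical stand-ins.
from dask.distributed import Client

def load_data(isin):
    return {'isin': isin, 'value': len(isin)}

def task_a(isin, data):
    return data['value'] * 2

def grouper(*results):
    return sum(results)

isins = ['ISIN_0001', 'ISIN_0002']
graph = {}
for i, isin in enumerate(isins):
    graph['isin_{}'.format(i)] = isin                                  # plain data node
    graph['data_{}'.format(i)] = (load_data, 'isin_{}'.format(i))      # fan-out per ISIN
    graph['res_{}'.format(i)] = (task_a, 'isin_{}'.format(i), 'data_{}'.format(i))

# fan-in: grouper receives every res_i result as a positional argument
graph['group'] = (grouper,) + tuple('res_{}'.format(i) for i in range(len(isins)))

if __name__ == '__main__':
    client = Client(processes=False)
    keys = ['group'] + ['res_{}'.format(i) for i in range(len(isins))]
    results = client.get(graph, keys)   # list of results, same order as keys
    print(results)
    client.close()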
Example #13
def run_blockwise(total_roi,
                  read_roi,
                  write_roi,
                  process_function,
                  check_function=None,
                  read_write_conflict=True,
                  fit='valid',
                  num_workers=None,
                  processes=True,
                  client=None):
    '''Run block-wise tasks with dask.

    Args:

        total_roi (`class:daisy.Roi`):

            The region of interest (ROI) of the complete volume to process.

        read_roi (`class:daisy.Roi`):

            The ROI every block needs to read data from. Will be shifted over
            the ``total_roi`` to cover the whole volume.

        write_roi (`class:daisy.Roi`):

            The ROI every block writes data to. Will be shifted over the
            ``total_roi`` to cover the whole volume.

        process_function (function):

            A function that will be called as::

                process_function(read_roi, write_roi)

            with ``read_roi`` and ``write_roi`` shifted for each block to
            process.

            The callee can assume that there are no read/write concurrencies,
            i.e., at any given point in time the ``read_roi`` does not overlap
            with the ``write_roi`` of another process.

        check_function (function, optional):

            A function that will be called as::

                check_function(write_roi)

            with ``write_roi`` shifted for each block to process.

            This function should return ``True`` if the block represented by
            ``write_roi`` was completed. This is used internally to avoid
            processing blocks that are already done and to check if a block was
            correctly processed.

            If a tuple of two functions is given, the first one will be called
            to check if the block needs to be run, and if so, the second after
            it was run to check if the run succeeded.

        read_write_conflict (``bool``, optional):

            Whether the read and write ROIs are conflicting, i.e., accessing
            the same resource. If set to ``False``, all blocks can run at the
            same time in parallel. In this case, providing a ``read_roi`` is
            simply a convenience to prevent out-of-bounds accesses and to
            avoid recomputing the read ROI for each block.

        fit (``string``, optional):

            How to handle cases where shifting blocks by the size of
            ``write_roi`` does not tile the ``total_roi``. Possible
            options are:

            "valid": Skip blocks that would lie outside of ``total_roi``. This
            is the default::

                |---------------------------|     total ROI

                |rrrr|wwwwww|rrrr|                block 1
                       |rrrr|wwwwww|rrrr|         block 2
                                                  no further block

            "overhang": Add all blocks that overlap with ``total_roi``, even if
            they leave it. Client code has to take care of safe access beyond
            ``total_roi`` in this case::

                |---------------------------|     total ROI

                |rrrr|wwwwww|rrrr|                block 1
                       |rrrr|wwwwww|rrrr|         block 2
                              |rrrr|wwwwww|rrrr|  block 3 (overhanging)

            "shrink": Like "overhang", but shrink the boundary blocks' read and
            write ROIs such that they are guaranteed to lie within
            ``total_roi``. The shrinking will preserve the context, i.e., the
            difference between the read ROI and write ROI stays the same::

                |---------------------------|     total ROI

                |rrrr|wwwwww|rrrr|                block 1
                       |rrrr|wwwwww|rrrr|         block 2
                              |rrrr|www|rrrr|     block 3 (shrunk)

        num_workers (int, optional):

            The number of parallel processes or threads to run. Only effective
            if ``client`` is ``None``.

        processes (bool, optional):

            If ``True`` (default), spawns a process per worker, otherwise a
            thread.

        client (optional):

            The dask client to submit jobs to. If ``None``, a client will be
            created from ``dask.distributed.Client`` with ``num_workers``
            workers.

    Returns:

        True, if all tasks succeeded (or were skipped because they were already
        completed in an earlier run).
    '''

    blocks = create_dependency_graph(total_roi, read_roi, write_roi,
                                     read_write_conflict, fit)

    if check_function is not None:

        try:
            pre_check, post_check = check_function
        except (TypeError, ValueError):
            # a single function was given; use it for both checks
            pre_check = check_function
            post_check = check_function

    else:

        pre_check = lambda _: False
        post_check = lambda _: True

    # dask requires strings for task names, string representation of
    # `class:Roi` is assumed to be unique.
    tasks = {
        block_to_dask_name(block):
        (check_and_run, block, process_function, pre_check, post_check,
         [block_to_dask_name(ups) for ups in upstream_blocks])
        for block, upstream_blocks in blocks
    }

    own_client = client is None

    if own_client:

        if num_workers is not None:
            print("Creating local cluster with %d workers..." % num_workers)

        if processes:
            cluster = LocalCluster(n_workers=num_workers,
                                   threads_per_worker=1,
                                   memory_limit=0,
                                   diagnostics_port=None)
        else:
            cluster = LocalCluster(n_workers=1,
                                   threads_per_worker=num_workers,
                                   processes=False,
                                   memory_limit=0,
                                   diagnostics_port=None)

        client = Client(cluster)

    logger.info("Scheduling %d tasks...", len(tasks))

    # don't show dask performance warnings (too verbose, probably not
    # applicable to our use-case)
    logging.getLogger('distributed.utils_perf').setLevel(logging.ERROR)

    # run all tasks
    results = client.get(tasks, list(tasks.keys()))

    if own_client:

        try:

            # don't show dask distributed warnings during shutdown
            logging.getLogger('distributed').setLevel(logging.ERROR)

            client.close()

        # ignore exceptions during shutdown
        except Exception:
            pass

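    # result codes as tallied in the log message below:
    # 1 succeeded, 0 skipped, -1 failed the check, -2 errored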
    succeeded = [t for t, r in zip(tasks, results) if r == 1]
    skipped = [t for t, r in zip(tasks, results) if r == 0]
    failed = [t for t, r in zip(tasks, results) if r == -1]
    errored = [t for t, r in zip(tasks, results) if r == -2]

    logger.info(
        "Ran %d tasks, of which %d succeeded, %d were skipped, %d failed (%d "
        "failed check, %d errored)", len(tasks), len(succeeded), len(skipped),
        len(failed) + len(errored), len(failed), len(errored))

    if len(failed) > 0:
        logger.info("Failed blocks: %s",
                    " ".join([str(t[1]) for _, t in failed]))

    return len(failed) + len(errored) == 0
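For reference, a hedged sketch of how run_blockwise might be called, based only on the docstring above. daisy.Roi(offset, shape) is assumed from the `class:daisy.Roi` references, and process_block/block_done are hypothetical placeholders for the user's processing and check logic:

# Hedged usage sketch for run_blockwise, following its docstring.
# Roi(offset, shape) is assumed; process_block and block_done are placeholders.
from daisy import Roi

def process_block(read_roi, write_roi):
    # read the data covered by read_roi (write_roi plus context),
    # compute, and write results restricted to write_roi
    pass

def block_done(write_roi):
    # return True if the block covered by write_roi was already processed
    return False

all_done = run_blockwise(
    total_roi=Roi((0, 0, 0), (100, 100, 100)),   # the whole volume
    read_roi=Roi((0, 0, 0), (20, 20, 20)),       # block plus surrounding context
    write_roi=Roi((5, 5, 5), (10, 10, 10)),      # inner region actually written
    process_function=process_block,
    check_function=block_done,
    fit='shrink',
    num_workers=4)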