Example #1
def check_parallel_context_manager(backend):
    lst = range(10)
    expected = [f(x, y=1) for x in lst]
    with Parallel(n_jobs=4, backend=backend) as p:
        # Internally a pool instance has been eagerly created and is managed
        # via the context manager protocol
        managed_pool = p._pool
        if mp is not None:
            assert_true(managed_pool is not None)

        # We make call with the managed parallel object several times inside
        # the managed block:
        assert_equal(expected, p(delayed(f)(x, y=1) for x in lst))
        assert_equal(expected, p(delayed(f)(x, y=1) for x in lst))

        # Those calls have all used the same pool instance:
        if mp is not None:
            assert_true(managed_pool is p._pool)

    # As soon as we exit the context manager block, the pool is terminated and
    # no longer referenced from the parallel object:
    assert_true(p._pool is None)

    # It's still possible to use the parallel instance in non-managed mode:
    assert_equal(expected, p(delayed(f)(x, y=1) for x in lst))
    assert_true(p._pool is None)
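A minimal sketch of the same pattern in application code, for readers who want the idiom without the test harness; the add_one helper is illustrative, not part of joblib:

from joblib import Parallel, delayed

def add_one(x):
    return x + 1

# Reusing one managed worker pool across several consecutive calls
# avoids re-creating workers for each call.
with Parallel(n_jobs=2) as parallel:
    first = parallel(delayed(add_one)(i) for i in range(5))
    second = parallel(delayed(add_one)(i) for i in first)
# On exit, the pool is terminated and detached from the Parallel object.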
Example #2
def check_simple_parallel(backend):
    X = range(5)
    for n_jobs in (1, 2, -1, -2):
        nose.tools.assert_equal([square(x) for x in X], Parallel(n_jobs=n_jobs)(delayed(square)(x) for x in X))
    try:
        # To smoke-test verbosity, we capture stdout
        orig_stdout = sys.stdout
        orig_stderr = sys.stderr
        if sys.version_info[0] == 3:
            sys.stdout = io.StringIO()
            sys.stderr = io.StringIO()
        else:
            sys.stdout = io.BytesIO()
            sys.stderr = io.BytesIO()
        for verbose in (2, 11, 100):
            Parallel(n_jobs=-1, verbose=verbose, backend=backend)(delayed(square)(x) for x in X)
            Parallel(n_jobs=1, verbose=verbose, backend=backend)(delayed(square)(x) for x in X)
            Parallel(n_jobs=2, verbose=verbose, pre_dispatch=2, backend=backend)(delayed(square)(x) for x in X)
            Parallel(n_jobs=2, verbose=verbose, backend=backend)(delayed(square)(x) for x in X)
    except Exception as e:
        my_stdout = sys.stdout
        my_stderr = sys.stderr
        sys.stdout = orig_stdout
        sys.stderr = orig_stderr
        print(my_stdout.getvalue())
        print(my_stderr.getvalue())
        raise
    finally:
        sys.stdout = orig_stdout
        sys.stderr = orig_stderr
Example #3
def test_error_capture():
    # Check that errors are captured, and that the correct exceptions
    # are raised.
    if mp is not None:
        # A JoblibException will be raised only if there is indeed
        # multiprocessing
        nose.tools.assert_raises(
            JoblibException, Parallel(n_jobs=2),
            [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))])
        nose.tools.assert_raises(
            WorkerInterrupt, Parallel(n_jobs=2),
            [delayed(interrupt_raiser)(x) for x in (1, 0)])
    else:
        nose.tools.assert_raises(
            KeyboardInterrupt, Parallel(n_jobs=2),
            [delayed(interrupt_raiser)(x) for x in (1, 0)])
    nose.tools.assert_raises(
        ZeroDivisionError, Parallel(n_jobs=2),
        [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))])
    try:
        # JoblibException wrapping is disabled in sequential mode:
        ex = JoblibException()
        Parallel(n_jobs=1)(
            delayed(division)(x, y) for x, y in zip((0, 1), (1, 0)))
    except Exception as ex:
        nose.tools.assert_false(isinstance(ex, JoblibException))
Example #4
def test_error_capture():
    # Check that errors are captured, and that the correct exceptions
    # are raised.
    if mp is not None:
        # A JoblibException will be raised only if there is indeed
        # multiprocessing
        assert_raises(JoblibException, Parallel(n_jobs=2),
                      [delayed(division)(x, y)
                       for x, y in zip((0, 1), (1, 0))])
        assert_raises(WorkerInterrupt, Parallel(n_jobs=2),
                      [delayed(interrupt_raiser)(x) for x in (1, 0)])

        # Try again with the context manager API
        with Parallel(n_jobs=2) as parallel:
            assert_true(parallel._pool is not None)

            assert_raises(JoblibException, parallel,
                          [delayed(division)(x, y)
                           for x, y in zip((0, 1), (1, 0))])

            # The managed pool should still be available and be in a working
            # state despite the previously raised (and caught) exception
            assert_true(parallel._pool is not None)
            assert_equal([f(x, y=1) for x in range(10)],
                         parallel(delayed(f)(x, y=1) for x in range(10)))

            assert_raises(WorkerInterrupt, parallel,
                          [delayed(interrupt_raiser)(x) for x in (1, 0)])

            # The pool should still be available despite the exception
            assert_true(parallel._pool is not None)
            assert_equal([f(x, y=1) for x in range(10)],
                         parallel(delayed(f)(x, y=1) for x in range(10)))

        # Check that the inner pool has been terminated when exiting the
        # context manager
        assert_true(parallel._pool is None)
    else:
        assert_raises(KeyboardInterrupt, Parallel(n_jobs=2),
                      [delayed(interrupt_raiser)(x) for x in (1, 0)])

    # wrapped exceptions should inherit from the class of the original
    # exception to make it easy to catch them
    assert_raises(ZeroDivisionError, Parallel(n_jobs=2),
                  [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))])

    assert_raises(
        MyExceptionWithFinickyInit,
        Parallel(n_jobs=2, verbose=0),
        (delayed(exception_raiser)(i, custom_exception=True)
         for i in range(30)))

    try:
        # JoblibException wrapping is disabled in sequential mode:
        ex = JoblibException()
        Parallel(n_jobs=1)(
            delayed(division)(x, y) for x, y in zip((0, 1), (1, 0)))
    except Exception as ex:
        assert_false(isinstance(ex, JoblibException))
Example #5
def check_dispatch_one_job(backend):
    """ Test that with only one job, Parallel does act as a iterator.
    """
    queue = list()

    def producer():
        for i in range(6):
            queue.append("Produced %i" % i)
            yield i

    # disable batching
    Parallel(n_jobs=1, batch_size=1, backend=backend)(delayed(consumer)(queue, x) for x in producer())
    nose.tools.assert_equal(
        queue,
        [
            "Produced 0",
            "Consumed 0",
            "Produced 1",
            "Consumed 1",
            "Produced 2",
            "Consumed 2",
            "Produced 3",
            "Consumed 3",
            "Produced 4",
            "Consumed 4",
            "Produced 5",
            "Consumed 5",
        ],
    )
    nose.tools.assert_equal(len(queue), 12)

    # empty the queue for the next check
    queue[:] = []

    # enable batching
    Parallel(n_jobs=1, batch_size=4, backend=backend)(delayed(consumer)(queue, x) for x in producer())
    nose.tools.assert_equal(
        queue,
        [
            # First batch
            "Produced 0",
            "Produced 1",
            "Produced 2",
            "Produced 3",
            "Consumed 0",
            "Consumed 1",
            "Consumed 2",
            "Consumed 3",
            # Second batch
            "Produced 4",
            "Produced 5",
            "Consumed 4",
            "Consumed 5",
        ],
    )
    nose.tools.assert_equal(len(queue), 12)
Example #6
def check_dispatch_multiprocessing(backend):
    """ Check that using pre_dispatch Parallel does indeed dispatch items
        lazily.
    """
    if mp is None:
        raise nose.SkipTest()
    manager = mp.Manager()
    queue = manager.list()

    def producer():
        for i in range(6):
            queue.append("Produced %i" % i)
            yield i

    Parallel(n_jobs=2, batch_size=1, pre_dispatch=3, backend=backend)(
        delayed(consumer)(queue, "any") for _ in producer()
    )

    # Only 3 tasks are dispatched out of 6. The 4th task is dispatched only
    # after any of the first 3 jobs have completed.
    first_four = list(queue)[:4]
    # The first consumption event can sometimes happen before the end of
    # the dispatching, hence pop it before introspecting the "Produced" events
    first_four.remove("Consumed any")
    nose.tools.assert_equal(first_four, ["Produced 0", "Produced 1", "Produced 2"])
    nose.tools.assert_equal(len(queue), 12)
Example #7
def test_batching_auto_threading():
    # batch_size='auto' with the threading backend leaves the effective batch
    # size at 1 (no batching) as it has been found to never be beneficial with
    # this low-overhead backend.
    p = Parallel(n_jobs=2, batch_size="auto", backend="threading")
    p(delayed(id)(i) for i in range(5000))  # many very fast tasks
    assert_equal(p._effective_batch_size, 1)
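For contrast with batch_size='auto', a short sketch of pinning an explicit batch size (the value 10 is illustrative): tasks are then grouped into fixed-size chunks before being handed to a worker, which amortizes dispatch overhead for very fast tasks.

from joblib import Parallel, delayed

# Tasks are sent to workers in chunks of 10 instead of one by one.
results = Parallel(n_jobs=2, batch_size=10, backend="threading")(
    delayed(abs)(-i) for i in range(100))
assert results == list(range(100))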
Example #8
def test_backend_context_manager():
    all_test_backends = ['test_backend_%d' % i for i in range(3)]
    for test_backend in all_test_backends:
        register_parallel_backend(test_backend, FakeParallelBackend)
    all_backends = ['multiprocessing', 'threading'] + all_test_backends

    try:
        assert _active_backend_type() == MultiprocessingBackend
        # check that it is possible to switch parallel backends sequentially
        for test_backend in all_backends:
            # TODO: parametrize this block later
            # yield check_backend_context_manager, test_backend
            check_backend_context_manager(test_backend)

        # The default backend is restored
        assert _active_backend_type() == MultiprocessingBackend

        # Check that context manager switching is thread safe:
        Parallel(n_jobs=2, backend='threading')(
            delayed(check_backend_context_manager)(b)
            for b in all_backends)

        # The default backend is again restored
        assert _active_backend_type() == MultiprocessingBackend
    finally:
        for backend_name in list(BACKENDS.keys()):
            if backend_name.startswith('test_'):
                del BACKENDS[backend_name]
Example #9
def test_parallel_timeout_success():
    # Check that timeout isn't thrown when function is fast enough
    for backend in ['multiprocessing', 'threading']:
        nose.tools.assert_equal(
            10,
            len(Parallel(n_jobs=2, backend=backend, timeout=10)
                (delayed(sleep)(0.001) for x in range(10))))
Example #10
def parallel_search(k_pot, pots, lambdas, n_jobs=4):
    """Method for Parallel L-curve computation

    Parameters
    ----------
    k_pot : np.array
    pots : list
    lambdas : list
    Returns
    -------
    modelnormseq : list
    residualseq : list

    """
    if PARALLEL_AVAILABLE:
        jobs = (delayed(L_model_fast)(k_pot, pots, lamb, i)
                for i, lamb in enumerate(lambdas))
        modelvsres = Parallel(n_jobs=n_jobs, backend='threading')(jobs)
    else:
        # Please verify this!
        modelvsres = []
        for i, lamb in enumerate(lambdas):
            modelvsres.append(L_model_fast(k_pot, pots, lamb, i))
    modelnormseq, residualseq = zip(*modelvsres)
    return modelnormseq, residualseq
Example #11
def test_backend_context_manager():
    all_test_backends = ["test_backend_%d" % i for i in range(3)]
    for test_backend in all_test_backends:
        register_parallel_backend(test_backend, FakeParallelBackend)
    all_backends = ["multiprocessing", "threading"] + all_test_backends

    try:
        assert_equal(_active_backend_type(), MultiprocessingBackend)
        # check that it is possible to switch parallel backends sequentially
        for test_backend in all_backends:
            yield check_backend_context_manager, test_backend

        # The default backend is restored
        assert_equal(_active_backend_type(), MultiprocessingBackend)

        # Check that context manager switching is thread safe:
        Parallel(n_jobs=2, backend="threading")(
            delayed(check_backend_context_manager)(b) for b in all_backends
        )

        # The default backend is again restored
        assert_equal(_active_backend_type(), MultiprocessingBackend)
    finally:
        for backend_name in list(BACKENDS.keys()):
            if backend_name.startswith("test_"):
                del BACKENDS[backend_name]
Example #12
def test_parallel_kwargs():
    """Check the keyword argument processing of pmap."""
    lst = range(10)
    for n_jobs in (1, 4):
        yield (assert_equal,
               [f(x, y=1) for x in lst],
               Parallel(n_jobs=n_jobs)(delayed(f)(x, y=1) for x in lst))
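The delayed helper used throughout these examples does not call the function; it captures the function together with its arguments so that Parallel can execute the call later. A quick illustration (the (function, args, kwargs) layout matches joblib's documented behavior, as far as I know):

from joblib import delayed

def f(x, y=0):
    return x + y

# delayed(f)(2, y=1) returns a (function, args, kwargs) triple
# instead of invoking f immediately.
func, args, kwargs = delayed(f)(2, y=1)
assert func is f and args == (2,) and kwargs == {"y": 1}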
Example #13
def test_nested_exception_dispatch():
    # Ensure TransportableException objects for nested joblib cases get
    # propagated.
    assert_raises(
        JoblibException,
        Parallel(n_jobs=2, pre_dispatch=16, verbose=0),
        (delayed(SafeFunction(exception_raiser))(i) for i in range(30)))
Example #14
def test_exception_dispatch():
    "Make sure that exception raised during dispatch are indeed captured"
    nose.tools.assert_raises(
            ValueError,
            Parallel(n_jobs=2, pre_dispatch=16, verbose=0),
                    (delayed(exception_raiser)(i) for i in range(30)),
            )
Example #15
def test_multiple_spawning():
    # Test that attempting to launch a new Python interpreter after
    # subprocesses have been spawned raises an error, to avoid infinite
    # loops on systems that do not support fork
    if not int(os.environ.get("JOBLIB_MULTIPROCESSING", 1)):
        raise nose.SkipTest()
    assert_raises(ImportError, Parallel(n_jobs=2, pre_dispatch="all"),
                  [delayed(_reload_joblib)() for i in range(10)])
Example #16
def test_batching_auto_multiprocessing():
    with Parallel(n_jobs=2, batch_size="auto", backend="multiprocessing") as p:
        p(delayed(id)(i) for i in range(5000))  # many very fast tasks

        # It should be strictly larger than 1 but as we don't want heisen
        # failures on clogged CI worker environments, be safe and only check
        # that it's a strictly positive number.
        assert_true(p._backend.compute_batch_size() > 0)
Example #17
 def run_trials(self, per_doc):
     results = Parallel(n_jobs=self.n_jobs)(delayed(bootstrap_trials)(per_doc, share, self.metrics)
                                            for share in _job_shares(self.n_jobs, self.trials))
     history = defaultdict(list)
     for res in results:
         for metric in self.metrics:
             history[metric].extend(res[metric])
     return history
Example #18
def test_parallel_pickling():
    """ Check that pmap captures the errors when it is passed an object
        that cannot be pickled.
    """
    def g(x):
        return x ** 2

    assert_raises(PickleError, Parallel(), (delayed(g)(x) for x in range(10)))
Example #19
def test_pre_dispatch_race_condition():
    # Check that using pre-dispatch does not cause a race condition on the
    # task generator, which is not natively thread-safe.
    # This is a non-regression test for the "Pool seems closed" class of error
    for n_tasks in [2, 10, 20]:
        for n_jobs in [2, 4]:
            Parallel(n_jobs=n_jobs, pre_dispatch="2 * n_jobs")(
                delayed(square)(i) for i in range(n_tasks))
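A hedged sketch of what pre_dispatch buys with a generator input: tasks are materialized lazily, so only a bounded number of them are queued ahead of the workers. slow_square is an illustrative stand-in for a real task:

import time
from joblib import Parallel, delayed

def slow_square(x):
    time.sleep(0.01)
    return x ** 2

# With pre_dispatch='2 * n_jobs', the generator below is consumed
# lazily: roughly 2 * n_jobs tasks are in flight at any time.
results = Parallel(n_jobs=2, pre_dispatch='2 * n_jobs')(
    delayed(slow_square)(i) for i in range(100))
assert results == [i ** 2 for i in range(100)]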
Example #20
def test_parallel_timeout_fail():
    # Check that timeout properly fails when function is too slow
    for backend in ['multiprocessing', 'threading']:
        nose.tools.assert_raises(
            TimeoutError,
            Parallel(n_jobs=2, backend=backend, timeout=0.01),
            (delayed(sleep)(10) for x in range(10))
        )
Example #21
def test_batching_auto_threading():
    # batch_size='auto' with the threading backend leaves the effective batch
    # size at 1 (no batching) as it has been found to never be beneficial with
    # this low-overhead backend.

    with Parallel(n_jobs=2, batch_size='auto', backend='threading') as p:
        p(delayed(id)(i) for i in range(5000))  # many very fast tasks
        assert p._backend.compute_batch_size() == 1
Example #22
def compute_geodesic_distance_matrix(verts, tris):
    print("precomputing geodesic distance...")
    n_chunks = cpu_count()
    chunk_size = int(np.ceil(len(verts) / float(n_chunks)))
    sources = np.arange(len(verts))
    D = Parallel(n_chunks)(
        delayed(compute_geodesic_distances)(verts, tris, sources[i: i + chunk_size])
        for i in range(0, len(verts), chunk_size))
    return np.vstack(D)
Example #23
def check_main_thread_renamed_no_warning(backend):
    with warnings.catch_warnings(record=True) as caught_warnings:
        warnings.simplefilter("always")
        results = Parallel(n_jobs=2, backend=backend)(
            delayed(square)(x) for x in range(3))
        assert results == [0, 1, 4]
    # The multiprocessing backend will raise a warning when detecting that it
    # is started from a non-main thread. Let's check that there is no false
    # positive because of the name change.
    assert caught_warnings == []
Example #24
def test_batching_auto_multiprocessing():
    p = Parallel(n_jobs=2, batch_size="auto", backend="multiprocessing")
    p(delayed(id)(i) for i in range(5000))  # many very fast tasks

    # When the auto-tuning of the batch size kicks in, the following
    # attribute gets updated.
    assert_true(hasattr(p, "_effective_batch_size"))

    # It should be strictly larger than 1 but as we don't want heisen
    # failures on clogged CI worker environments, be safe and only check that
    # it's a strictly positive number.
    assert_true(p._effective_batch_size > 0)
Example #25
def test_numpy_arrays_use_different_memory(mmap_mode):
    def func(arr, value):
        arr[:] = value
        return arr

    arrays = [np.zeros((10, 10), dtype='float64') for i in range(10)]

    results = Parallel(mmap_mode=mmap_mode, max_nbytes=0, n_jobs=2)(
        delayed(func)(arr, i) for i, arr in enumerate(arrays))

    for i, arr in enumerate(results):
        np.testing.assert_array_equal(arr, i)
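For reference, a minimal sketch of the two knobs exercised above, assuming numpy is installed: max_nbytes is the size threshold above which input arrays are memory-mapped before being sent to workers, and mmap_mode controls how the workers open those memmaps.

import numpy as np
from joblib import Parallel, delayed

big = np.zeros((1000, 1000))

# max_nbytes=0 forces memmapping of every array argument;
# mmap_mode='r' makes workers open the memmaps read-only.
sums = Parallel(n_jobs=2, max_nbytes=0, mmap_mode='r')(
    delayed(np.sum)(big) for _ in range(2))
assert sums == [0.0, 0.0]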
Example #26
def check_dispatch_one_job(backend):
    """ Test that with only one job, Parallel does act as a iterator.
    """
    queue = list()

    def producer():
        for i in range(6):
            queue.append('Produced %i' % i)
            yield i

    # disable batching
    Parallel(n_jobs=1, batch_size=1, backend=backend)(
        delayed(consumer)(queue, x) for x in producer())
    assert_equal(queue, [
        'Produced 0', 'Consumed 0',
        'Produced 1', 'Consumed 1',
        'Produced 2', 'Consumed 2',
        'Produced 3', 'Consumed 3',
        'Produced 4', 'Consumed 4',
        'Produced 5', 'Consumed 5',
    ])
    assert_equal(len(queue), 12)

    # empty the queue for the next check
    queue[:] = []

    # enable batching
    Parallel(n_jobs=1, batch_size=4, backend=backend)(
        delayed(consumer)(queue, x) for x in producer())
    assert_equal(queue, [
        # First batch
        'Produced 0', 'Produced 1', 'Produced 2', 'Produced 3',
        'Consumed 0', 'Consumed 1', 'Consumed 2', 'Consumed 3',

        # Second batch
        'Produced 4', 'Produced 5', 'Consumed 4', 'Consumed 5',
    ])
    assert_equal(len(queue), 12)
Example #27
def test_auto_memmap_on_arrays_from_generator():
    # Non-regression test for a problem with a bad interaction between the
    # GC collecting arrays recently created during iteration inside the
    # parallel dispatch loop and the auto-memmap feature of Parallel.
    # See: https://github.com/joblib/joblib/pull/294
    def generate_arrays(n):
        for i in range(n):
            yield np.ones(10, dtype=np.float32) * i

    # Use max_nbytes=1 to force the use of memory-mapping even for small
    # arrays
    results = Parallel(n_jobs=2, max_nbytes=1)(delayed(check_memmap)(a) for a in generate_arrays(100))
    for result, expected in zip(results, generate_arrays(len(results))):
        np.testing.assert_array_equal(expected, result)
Example #28
def run_in_parallel(common_classes, number_of_activations, function_to_run,
                    parallel, number_of_parallel):

    avg_number_of_generations = sum(parallel(delayed(function_to_run)(common_classes, number_of_activations / number_of_parallel)
                                             for _ in range(number_of_parallel))) / number_of_parallel

    HGT_factor, mutation_factor, population_factor = common_classes.get_simulation_variables()

    with open('results.txt', 'a') as f:
        f.write("results for the following parameters: HGT_factor=" + str(HGT_factor) +
                " mutation factor=" + str(mutation_factor) + " population_factor=" + str(population_factor) +
                " is: " + str(avg_number_of_generations) + "\n")
Example #29
def test_parallel_pickling():
    """ Check that pmap captures the errors when it is passed an object
        that cannot be pickled.
    """

    def g(x):
        return x ** 2

    try:
        # pickling a local function always fails but the exception
        # raised is a PickleError for python <= 3.4 and AttributeError
        # for python >= 3.5
        pickle.dumps(g)
    except Exception as exc:
        exception_class = exc.__class__

    assert_raises(exception_class, Parallel(), (delayed(g)(x) for x in range(10)))
Example #30
def test_cached_function_race_condition_when_persisting_output(tmpdir, capfd):
    # Test race condition where multiple processes are writing into
    # the same output.pkl. See
    # https://github.com/joblib/joblib/issues/490 for more details.
    memory = Memory(location=tmpdir.strpath)
    func_cached = memory.cache(fast_func_with_complex_output)

    Parallel(n_jobs=2)(delayed(func_cached)() for i in range(3))

    stdout, stderr = capfd.readouterr()

    # Checking both stdout and stderr (ongoing PR #434 may change
    # logging destination) to make sure there is no exception while
    # loading the results
    exception_msg = 'Exception while loading results'
    assert exception_msg not in stdout
    assert exception_msg not in stderr
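In ordinary use, the Memory/Parallel combination stress-tested above looks like the following sketch (the cache location and the expensive function are illustrative):

from joblib import Memory, Parallel, delayed

memory = Memory(location='/tmp/joblib_cache', verbose=0)

@memory.cache
def expensive(x):
    return x ** 2

# Several workers may race to persist the same cached output; the test
# above checks that such races do not corrupt the on-disk cache.
results = Parallel(n_jobs=2)(delayed(expensive)(i % 3) for i in range(6))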
Example #31
def test_main_thread_renamed_no_warning(backend, monkeypatch):
    # Check that no default backend relies on the name of the main thread:
    # https://github.com/joblib/joblib/issues/180#issuecomment-253266247
    # Some programs use a different name for the main thread. This is the case
    # for uWSGI apps for instance.
    monkeypatch.setattr(target=threading.current_thread(), name='name',
                        value='some_new_name_for_the_main_thread')

    with warns(None) as warninfo:
        results = Parallel(n_jobs=2, backend=backend)(
            delayed(square)(x) for x in range(3))
        assert results == [0, 1, 4]

    # Due to the default parameters of LokyBackend, there is a chance that
    # warninfo catches Warnings from worker timeouts. We remove it if it exists
    warninfo = [w for w in warninfo if "worker timeout" not in str(w.message)]

    # The multiprocessing backend will raise a warning when detecting that it
    # is started from a non-main thread. Let's check that there is no false
    # positive because of the name change.
    assert len(warninfo) == 0
Example #32
def test_cached_function_race_condition_when_persisting_output_2(
        tmpdir, capfd):
    # Test race condition in first attempt at solving
    # https://github.com/joblib/joblib/issues/490. The race condition
    # was due to the delay between seeing the cache directory created
    # (interpreted as the result being cached) and the output.pkl being
    # pickled.
    memory = Memory(location=tmpdir.strpath)
    func_cached = memory.cache(fast_func_with_conditional_complex_output)

    Parallel(n_jobs=2)(delayed(func_cached)(True if i % 2 == 0 else False)
                       for i in range(3))

    stdout, stderr = capfd.readouterr()

    # Checking both stdout and stderr (ongoing PR #434 may change
    # logging destination) to make sure there is no exception while
    # loading the results
    exception_msg = 'Exception while loading results'
    assert exception_msg not in stdout
    assert exception_msg not in stderr
Example #33
def run(allpool, test):
    # return [runOptLearn(allpool[0])]
    if test:
        result = [runOptLearn(p, test) for p in allpool]
    else:
        with parallel_backend('threading'):
            result = (Parallel()(delayed(runOptLearn)(p) for p in allpool))

    success = []
    fail = []
    for i in range(len(result)):
        if (result[i].success):
            success.append(result[i])
        else:
            logger.error("!!!FAILED: %d %s", i, result[i].shortname)
            fail.append(result[i])
    # for pool in allpool:
    #     runOptLearn(pool)
    #     break;
    logger.debug('finish')
    return success, fail
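The parallel_backend context manager used in run() sets the default backend for every Parallel call created in its scope, which is why the inner call can omit the backend argument. A minimal sketch:

from joblib import Parallel, delayed, parallel_backend

# Inside the block, Parallel() with no explicit backend argument
# uses the threading backend selected here.
with parallel_backend('threading', n_jobs=2):
    results = Parallel()(delayed(len)([0] * i) for i in range(5))
assert results == [0, 1, 2, 3, 4]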
Example #34
def test_retrieval_context():
    import contextlib

    class MyBackend(ThreadingBackend):
        i = 0

        @contextlib.contextmanager
        def retrieval_context(self):
            self.i += 1
            yield

    register_parallel_backend("retrieval", MyBackend)

    def nested_call(n):
        return Parallel(n_jobs=2)(delayed(id)(i) for i in range(n))

    with parallel_backend("retrieval") as (ba, _):
        Parallel(n_jobs=2)(
            delayed(nested_call, check_pickle=False)(i)
            for i in range(5)
        )
        assert ba.i == 1
Example #35
def test_no_blas_crash_or_freeze_with_multiprocessing():
    if sys.version_info < (3, 4):
        raise SkipTest('multiprocessing can cause BLAS freeze on old Python')

    # Use the spawn backend that is both robust and available on all platforms
    spawn_backend = mp.get_context('spawn')

    # Check that on recent Python versions, the 'spawn' start method makes it
    # possible to use multiprocessing in conjunction with any BLAS
    # implementation that happens to be used by numpy, without causing a
    # freeze or a crash
    rng = np.random.RandomState(42)

    # call BLAS DGEMM to force the initialization of the internal thread-pool
    # in the main process
    a = rng.randn(1000, 1000)
    np.dot(a, a.T)

    # check that the internal BLAS thread-pool is not in an inconsistent state
    # in the worker processes managed by multiprocessing
    Parallel(n_jobs=2,
             backend=spawn_backend)(delayed(np.dot)(a, a.T) for i in range(2))
Example #36
def junction_search(directory, junction_folder, input_data_folder,
                    blast_results_folder, junction_sequence,
                    exclusion_sequence, threads):
    unmap_files = get_sam_filelist(directory, input_data_folder)
    if not len(unmap_files):
        click.echo(
            red_fg("\n>>> ERROR: No .sam files found in directory %s." %
                   directory))
        sys.exit(1)
    junction_seqs = make_search_junctions(junction_sequence)
    click.echo(
        cyan_fg(
            "\n>>> The primary, secondary, and tertiary sequences searched are:"
        ))
    for j in junction_seqs:
        click.echo(yellow_fg("    %s" % j))
    click.echo(cyan_fg('\n>>> Starting junction search on %s cores.' %
                       threads))
    parallel.Parallel(n_jobs=threads)(parallel.delayed(
        jsearch)(directory, f, input_data_folder, junction_folder,
                 junction_seqs, exclusion_sequence) for f in unmap_files)
    multi_convert(directory, junction_folder, blast_results_folder)
Example #37
    def significance(self, pair1, pair2):
        per_doc1, overall1 = pair1
        per_doc2, overall2 = pair2
        # TODO: limit to metrics
        base_diff = _result_diff(overall1, overall2)
        randomized_diffs = functools.partial(self.METHODS[self.method],
                                             per_doc1, per_doc2, base_diff)
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(randomized_diffs)(share)
            for share in _job_shares(self.n_jobs, self.trials))
        all_counts = []
        for result in results:
            metrics, counts = zip(*result.items())
            all_counts.append(counts)

        return {
            metric: {
                'diff': base_diff[metric],
                'p': (sum(counts) + 1) / (self.trials + 1)
            }
            for metric, counts in zip(metrics, zip(*all_counts))
        }
Example #38
def test_dispatch_multiprocessing(backend):
    """ Check that using pre_dispatch Parallel does indeed dispatch items
        lazily.
    """
    manager = mp.Manager()
    queue = manager.list()

    def producer():
        for i in range(6):
            queue.append('Produced %i' % i)
            yield i

    Parallel(n_jobs=2, batch_size=1, pre_dispatch=3,
             backend=backend)(delayed(consumer)(queue, 'any')
                              for _ in producer())

    # Only 3 tasks are dispatched out of 6. The 4th task is dispatched only
    # after any of the first 3 jobs have completed.
    first_four = list(queue)[:4]
    # The first consumption event can sometimes happen before the end of
    # the dispatching, hence pop it before introspecting the "Produced" events
    first_four.remove('Consumed any')
    assert first_four == ['Produced 0', 'Produced 1', 'Produced 2']
    assert len(queue) == 12
Example #39
def test_simple_parallel(backend, n_jobs, verbose):
    assert ([square(x) for x in range(5)
             ] == Parallel(n_jobs=n_jobs, backend=backend,
                           verbose=verbose)(delayed(square)(x)
                                            for x in range(5)))
Example #40
    def __call__(self):
        if not self.models.is_dir():
            raise ValueError(f"{self.models} is not a directory.")
        if not self.gt.is_dir():
            raise ValueError(f"{self.gt} is not a directory.")

        self.threshold = np.linspace(
            0, 1,
            self.extra.get('interp', self.extra.get('interp_points', 1000)))

        if self.dataset.lower() == 'mobius':
            from datasets import MOBIUS
            dataset = MOBIUS
        elif self.dataset.lower() == 'sbvpi':
            from datasets import SBVPI
            dataset = SBVPI
        else:
            from datasets import Dataset
            dataset = Dataset

        dataset = dataset.from_dir(self.gt, mask_dir=None)
        dataset.shuffle()

        with tqdm_joblib(tqdm(desc="Reading GT", total=len(dataset))):
            gt = dict(
                Parallel(n_jobs=-1)(delayed(self._load_gt)(gt_sample)
                                    for gt_sample in dataset))

        for self._model in self.models.iterdir():
            self._predictions = self._model / 'Predictions'
            self._binarised = self._model / 'Binarised'
            if not self._predictions.is_dir():
                raise ValueError(f"{self._predictions} is not a directory.")
            if not self._binarised.is_dir():
                raise ValueError(f"{self._binarised} is not a directory.")

            # Check if all pickles already exist
            flat_attrs = tuple()
            for attr in ATTR_EXP:
                try:
                    flat_attrs += attr
                except TypeError:
                    flat_attrs += attr,
            unique_attr_values = {
                attr: {getattr(sample, attr)
                       for sample in dataset}
                for attr in set(flat_attrs)
            }
            exp_attr_values = [{
                attr: unique_attr_values[attr]
                for attr in ensure_iterable(attrs, True)
            } for attrs in ATTR_EXP]
            attr_experiments = {
                ', '.join(f'{attr.title()}={val.name.title()}'
                          for attr, val in current_values.items()):
                current_values
                for current_exp in exp_attr_values
                for current_values in dict_product(current_exp)
            }
            all_names = ['Overall'] + list(attr_experiments)
            if not self.extra.get('overwrite', False) and all(
                (self._model / f'Pickles/{name}.pkl').is_file()
                    for name in all_names):
                print(
                    f"All pickles already exist, skipping {self._model.name}")
                continue

            # TODO: Move folds here and only load one fold's predictions at a
            # time. We can't do this because experiment2 needs to have
            # different splits. If we absolutely need this, we'll have to
            # reread the images for each sub-experiment anew. We can cache the
            # images for each split until the end of the split - that way
            # we'll only need to read some of the images anew.

            print(f"Evaluating model {self._model.name}")
            with tqdm_joblib(
                    tqdm(desc="Reading predictions", total=len(dataset))):
                pred_bin = dict(
                    Parallel(n_jobs=-1)(delayed(self._process_image)(gt_sample)
                                        for gt_sample in dataset))
            # This will filter out non-existing predictions, so the code will
            # still work, but missing predictions should be addressed
            # (otherwise evaluation is unfair)
            pred_bin_gt = {
                gt_sample: (*pred_bin[gt_sample], gt[gt_sample])
                for gt_sample in dataset if pred_bin[gt_sample] is not None
            }

            # Overall
            self._experiment1(pred_bin_gt)

            # Split by lighting, phones, and gaze
            for attrs in ATTR_EXP:
                self._experiment2(pred_bin_gt, attrs)
Example #41
    def createMISPEventFromHash(self, _hash, filename, additional_hash=False):
        LibIoC_DK.debugging("Creating the MISP hash event: %s" % (_hash),
                            main._DEBUG_, main._LOGGING_, main.hFile)

        _hash = _hash.lower()
        # if the MISP event for the hash value already exists, stop further
        # processing.
        if self.checkMISPHashEventExist(_hash):
            LibIoC_DK.debugging("The MISP hash event ALREADY exists",
                                main._DEBUG_, main._LOGGING_, main.hFile)
            return False

        result = self.malware_repo_connector.getMalwareInfo(_hash)

        if not result:
            return False

        event = self.misp_connection.new_event(0, 1, 2, _hash)
        self.misp_connection.add_named_attribute(
            event,
            category='Other',
            type_value='comment',
            value=LibIoC_DK.getFileName(
                filename))  # this will work as the ground truth of IoCs

        # the first three attributes in the result are the md5, sha1, and
        # sha256 hashes of the malware, so manually store them in the event.
        md5 = result.pop(0)[2]
        sha1 = result.pop(0)[2]
        sha256 = result.pop(0)[2]

        self.misp_connection.add_hashes(event,
                                        category='Payload installation',
                                        md5=md5)
        if main._STAT_:
            self.ioc_stat.addCategory2('hash')
        self.misp_connection.add_hashes(event,
                                        category='Payload installation',
                                        sha1=sha1)
        if main._STAT_:
            self.ioc_stat.addCategory2('hash')
        self.misp_connection.add_hashes(event,
                                        category='Payload installation',
                                        sha256=sha256)
        if main._STAT_:
            self.ioc_stat.addCategory2('hash')

        if main._DOWNLOAD_MALWARE_:
            sample_path = self.config[
                'SampleRoot'] + '/' + LibIoC_DK.getReportPublicationYear(
                    filename)
            if not os.path.exists(sample_path + '/' + _hash):
                malware_buffer = self.malware_repo_connector.downloadMalware(
                    sha256, False)
                if malware_buffer:
                    if not os.path.exists(sample_path):
                        os.makedirs(sample_path)
                    with open(sample_path + '/' + _hash, 'wb') as f:
                        f.write(malware_buffer)
                    extracted = self.malware_repo_connector.unzipMalware(
                        sample_path + '/' + _hash, sample_path)
                    if extracted.lower() != _hash.lower():
                        os.remove(sample_path + '/' + _hash)
                        _hash = extracted

                header_info = LibIoC_DK.getMalwareHeaderInfo(sample_path +
                                                             '/' + _hash)
                if header_info is not None:
                    self.addMalwareHeaderInfo(header_info, event)

        if main._STAT_ and self.ioc_stat.report_name != filename:
            self.ioc_stat.setReportBuffer(filename)

        if main._PARALLELIZE_ATTRIB_ADDITION_:
            # Parallelized Version
            print('[Hash] Parallelized attribute storing...')
            joblib.Parallel(joblib.cpu_count())(
                delayed(addAttribute)(self, event, attr) for attr in result)
        else:
            # Sequential Version
            for attr in result:
                self.addAttribute(event, attr, filename)

        LibIoC_DK.debugging("The MISP hash event created", main._DEBUG_,
                            main._LOGGING_, main.hFile)
        return True
Example #42
    def createMISPEventFromReport(self, ioc, filename, _date):
        LibIoC_DK.debugging(
            "Creating the MISP report event: %s" %
            (LibIoC_DK.getFileName(filename)), main._DEBUG_, main._LOGGING_,
            main.hFile)
        '''
        # if the MISP event for the filename already exists, add ioc in the event
        eid = self.getMISPEventID(LibIoC_DK.getFileName(filename))
        if eid:
            event = self.misp_connection.get_event(eid)
            for attr in ioc:
                self.addAttribute(event, attr)
            return False           
        '''
        if self.checkMISPReportEventExist(filename):
            LibIoC_DK.debugging("The MISP report event ALREADY exists",
                                main._DEBUG_, main._LOGGING_, main.hFile)
            return False

        try:
            if _date is not None:
                #_date = datetime.datetime.strptime(_date, "%m/%d/%Y").strftime("%Y-%m-%d")
                _date = _date.split("/")
                if int(_date[0]) > 12:
                    _date = datetime.date(int(_date[0]), int(_date[1]),
                                          int(_date[2])).isoformat()
                else:
                    _date = datetime.date(int(_date[2]), int(_date[0]),
                                          int(_date[1])).isoformat()
        except Exception as e:
            print(e)

        event = self.misp_connection.new_event(0,
                                               1,
                                               2,
                                               LibIoC_DK.getFileName(filename),
                                               date=_date)
        self.misp_connection.add_named_attribute(
            event,
            category='Other',
            type_value='comment',
            value=LibIoC_DK.getFileName(
                filename))  # this will work as the ground truth of IoC

        if main._STAT_ and self.ioc_stat.report_name != filename:
            self.ioc_stat.setReportBuffer(filename)

        if main._PARALLELIZE_ATTRIB_ADDITION_:
            # Parallelized Version
            print('[Report] Parallelized attribute storing...')
            attr_added = joblib.Parallel(joblib.cpu_count())(
                delayed(addAttribute)(self, event, attr, filename)
                for attr in ioc)
        else:
            # Sequential Version
            attr_added = False
            for attr in ioc:
                attr_added = self.addAttribute(event, attr,
                                               filename) or attr_added

        if (type(attr_added) is bool and not attr_added):
            self.misp_connection.delete_event(event['Event']['id'])
            LibIoC_DK.debugging(
                "NO attribute added for the report: %s" %
                (LibIoC_DK.getFileName(filename)), main._DEBUG_,
                main._LOGGING_, main.hFile)
            return False
        if type(attr_added) is list:
            if not (True in attr_added):
                self.misp_connection.delete_event(event['Event']['id'])
                LibIoC_DK.debugging(
                    "NO attribute added for the report: %s" %
                    (LibIoC_DK.getFileName(filename)), main._DEBUG_,
                    main._LOGGING_, main.hFile)
                return False

        LibIoC_DK.debugging("The MISP report event created", main._DEBUG_,
                            main._LOGGING_, main.hFile)
        return True
Example #43
 def nested_call(n):
     return Parallel(n_jobs=2)(delayed(id)(i) for i in range(n))
Example #44
 def get_nested_pids():
     assert _active_backend_type() == ThreadingBackend
     return Parallel(n_jobs=2)(delayed(os.getpid)() for _ in range(2))
Example #45
def test_exception_dispatch():
    """Make sure that exception raised during dispatch are indeed captured"""
    with raises(ValueError):
        Parallel(n_jobs=2, pre_dispatch=16,
                 verbose=0)(delayed(exception_raiser)(i) for i in range(30))
Example #46
def test_error_capture(backend):
    # Check that errors are captured, and that the correct exceptions
    # are raised.
    if mp is not None:
        # A JoblibException will be raised only if there is indeed
        # multiprocessing
        with raises(JoblibException):
            Parallel(n_jobs=2, backend=backend)(
                [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))])
        with raises(WorkerInterrupt):
            Parallel(n_jobs=2, backend=backend)(
                [delayed(interrupt_raiser)(x) for x in (1, 0)])

        # Try again with the context manager API
        with Parallel(n_jobs=2, backend=backend) as parallel:
            assert get_workers(parallel._backend) is not None
            original_workers = get_workers(parallel._backend)

            with raises(JoblibException):
                parallel(
                    [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))])

            # The managed pool should still be available and be in a working
            # state despite the previously raised (and caught) exception
            assert get_workers(parallel._backend) is not None

            # The pool should have been interrupted and restarted:
            assert get_workers(parallel._backend) is not original_workers

            assert ([f(x, y=1) for x in range(10)
                     ] == parallel(delayed(f)(x, y=1) for x in range(10)))

            original_workers = get_workers(parallel._backend)
            with raises(WorkerInterrupt):
                parallel([delayed(interrupt_raiser)(x) for x in (1, 0)])

            # The pool should still be available despite the exception
            assert get_workers(parallel._backend) is not None

            # The pool should have been interrupted and restarted:
            assert get_workers(parallel._backend) is not original_workers

            assert ([f(x, y=1) for x in range(10)
                     ] == parallel(delayed(f)(x, y=1) for x in range(10)))

        # Check that the inner pool has been terminated when exiting the
        # context manager
        assert get_workers(parallel._backend) is None
    else:
        with raises(KeyboardInterrupt):
            Parallel(n_jobs=2)([delayed(interrupt_raiser)(x) for x in (1, 0)])

    # wrapped exceptions should inherit from the class of the original
    # exception to make it easy to catch them
    with raises(ZeroDivisionError):
        Parallel(n_jobs=2)(
            [delayed(division)(x, y) for x, y in zip((0, 1), (1, 0))])

    with raises(MyExceptionWithFinickyInit):
        Parallel(n_jobs=2,
                 verbose=0)((delayed(exception_raiser)(i,
                                                       custom_exception=True)
                             for i in range(30)))

    try:
        # JoblibException wrapping is disabled in sequential mode:
        ex = JoblibException()
        Parallel(n_jobs=1)(delayed(division)(x, y)
                           for x, y in zip((0, 1), (1, 0)))
    except Exception as ex:
        assert not isinstance(ex, JoblibException)
Example #47
def test_parallel_timeout_fail(backend):
    # Check that timeout properly fails when function is too slow
    with raises(TimeoutError):
        Parallel(n_jobs=2, backend=backend,
                 timeout=0.01)(delayed(sleep)(10) for x in range(10))
Example #48
def test_parallel_timeout_success(backend):
    # Check that timeout isn't thrown when function is fast enough
    assert len(
        Parallel(n_jobs=2, backend=backend,
                 timeout=10)(delayed(sleep)(0.001) for x in range(10))) == 10
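Taken together, the two timeout tests pin down the contract: timeout applies per task, and a task exceeding it raises a TimeoutError. A hedged sketch of handling it in user code; the TimeoutError import path mirrors what these tests appear to use and should be treated as an assumption:

import time
from multiprocessing import TimeoutError  # assumed to be the class joblib raises
from joblib import Parallel, delayed

try:
    # Each task sleeps far longer than the allowed per-task timeout.
    Parallel(n_jobs=2, backend='threading', timeout=0.01)(
        delayed(time.sleep)(1) for _ in range(4))
except TimeoutError:
    print("a task exceeded the 0.01 second timeout")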
Example #49
def get_timestamps(blocks):
    data = Parallel(10, 'threading')(delayed(get_block_timestamp)(block)
                                     for block in blocks)
    return pd.to_datetime([x * 1e9 for x in data])
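Note the positional form Parallel(10, 'threading') above: n_jobs and backend are the first two constructor arguments, so it is equivalent to the explicit keyword form sketched here:

from joblib import Parallel, delayed

p = Parallel(n_jobs=10, backend='threading')
assert p(delayed(int)(c) for c in '0123') == [0, 1, 2, 3]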
Example #50
def test_parallel_call_cached_function_defined_in_jupyter(
        tmpdir, call_before_reducing):
    # Calling an interactively defined memory.cache()'d function inside a
    # Parallel call used to clear the existing cache related to the said
    # function (https://github.com/joblib/joblib/issues/1035)

    # This test checks that this is no longer the case.

    # TODO: test that the cache related to the function cache persists across
    # ipython sessions (provided that no code change were made to the
    # function's source)?

    # The first part of the test makes the necessary low-level calls to emulate
    # the definition of a function in a jupyter notebook cell. Joblib has
    # some custom code to treat functions defined specifically in jupyter
    # notebooks/ipython session -- we want to test this code, which requires
    # the emulation to be rigorous.
    for session_no in [0, 1]:
        ipython_cell_source = '''
        def f(x):
            return x
        '''

        ipython_cell_id = '<ipython-input-{}-000000000000>'.format(session_no)

        exec(
            compile(textwrap.dedent(ipython_cell_source),
                    filename=ipython_cell_id,
                    mode='exec'))
        # f is now accessible in the locals mapping - but for some unknown
        # reason, f = locals()['f'] throws a KeyError at runtime, we need to
        # bind locals()['f'] to a different name in the local namespace
        aliased_f = locals()['f']
        aliased_f.__module__ = "__main__"

        # Preliminary sanity checks, and tests checking that joblib properly
        # identified f as an interactive function defined in a jupyter notebook
        assert aliased_f(1) == 1
        assert aliased_f.__code__.co_filename == ipython_cell_id

        memory = Memory(location=tmpdir.strpath, verbose=0)
        cached_f = memory.cache(aliased_f)

        assert len(os.listdir(tmpdir / 'joblib')) == 1
        f_cache_relative_directory = os.listdir(tmpdir / 'joblib')[0]
        assert 'ipython-input' in f_cache_relative_directory

        f_cache_directory = tmpdir / 'joblib' / f_cache_relative_directory

        if session_no == 0:
            # The cache should be empty as cached_f has not been called yet.
            assert os.listdir(f_cache_directory) == ['f']
            assert os.listdir(f_cache_directory / 'f') == []

            if call_before_reducing:
                cached_f(3)
                # Two files were just created: func_code.py, and a folder
                # containing the information (input hash/output) of
                # cached_f(3)
                assert len(os.listdir(f_cache_directory / 'f')) == 2

                # Now, testing  #1035: when calling a cached function, joblib
                # used to dynamically inspect the underlying function to
                # extract its source code (to verify it matches the source code
                # of the function as last inspected by joblib) -- however,
                # source code introspection fails for dynamic functions sent to
                # child processes - which would eventually make joblib clear
                # the cache associated to f
                res = Parallel(n_jobs=2)(delayed(cached_f)(i) for i in [1, 2])
            else:
                # Submit the function to the joblib child processes, although
                # the function has never been called in the parent yet. This
                # triggers a specific code branch inside
                # MemorizedFunc.__reduce__.
                res = Parallel(n_jobs=2)(delayed(cached_f)(i) for i in [1, 2])
                assert len(os.listdir(f_cache_directory / 'f')) == 3

                cached_f(3)

            # Making sure f's cache does not get cleared after the parallel
            # calls, and contains ALL cached functions calls (f(1), f(2), f(3))
            # and 'func_code.py'
            assert len(os.listdir(f_cache_directory / 'f')) == 4
        else:
            # For the second session, there should be an already existing cache
            assert len(os.listdir(f_cache_directory / 'f')) == 4

            cached_f(3)

            # The previous cache should not be invalidated after calling the
            # function in a new session
            assert len(os.listdir(f_cache_directory / 'f')) == 4
Example #51
def _cross_validate_with_warm_start(
    estimators,
    X,
    y=None,
    *,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=None,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
    return_train_score=False,
    return_estimator=False,
    error_score=np.nan,
):
    """Evaluate metric(s) by cross-validation and also record fit/score times.

    Read more in the :ref:`User Guide <multimetric_cross_validation>`.

    Parameters
    ----------
    estimators : list of estimator objects implementing 'fit'
        The objects to use to fit the data.

    X : array-like of shape (n_samples, n_features)
        The data to fit. Can be for example a list, or an array.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
            default=None
        The target variable to try to predict in the case of
        supervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).

    scoring : str, callable, list/tuple, or dict, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's score method is used.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For int/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    n_jobs : int, default=None
        The number of CPUs to use to do the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        The verbosity level.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator.

    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A str, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    return_train_score : bool, default=False
        Whether to include train scores.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

        .. versionadded:: 0.19

        .. versionchanged:: 0.21
            Default value was changed from ``True`` to ``False``

    return_estimator : bool, default=False
        Whether to return the estimators fitted on each split.

        .. versionadded:: 0.20

    error_score : 'raise' or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.

        .. versionadded:: 0.20

    Returns
    -------
    scores : dict of float arrays of shape (n_splits,)
        Array of scores of the estimator for each run of the cross validation.

        A dict of arrays containing the score/time arrays for each scorer is
        returned. The possible keys for this ``dict`` are:

            ``test_score``
                The score array for test scores on each cv split.
                Suffix ``_score`` in ``test_score`` changes to a specific
                metric like ``test_r2`` or ``test_auc`` if there are
                multiple scoring metrics in the scoring parameter.
            ``train_score``
                The score array for train scores on each cv split.
                Suffix ``_score`` in ``train_score`` changes to a specific
                metric like ``train_r2`` or ``train_auc`` if there are
                multiple scoring metrics in the scoring parameter.
                This is available only if ``return_train_score`` parameter
                is ``True``.
            ``fit_time``
                The time for fitting the estimator on the train
                set for each cv split.
            ``score_time``
                The time for scoring the estimator on the test set for each
                cv split. (Note: time for scoring on the train set is not
                included even if ``return_train_score`` is set to ``True``.)
            ``estimator``
                The estimator objects for each cv split.
                This is available only if ``return_estimator`` parameter
                is set to ``True``.

    Examples
    --------
    >>> from sklearn import datasets, linear_model
    >>> from sklearn.model_selection import cross_validate
    >>> from sklearn.metrics import make_scorer
    >>> from sklearn.metrics import confusion_matrix
    >>> from sklearn.svm import LinearSVC
    >>> diabetes = datasets.load_diabetes()
    >>> X = diabetes.data[:150]
    >>> y = diabetes.target[:150]
    >>> lasso = linear_model.Lasso()

    Single metric evaluation using ``cross_validate``

    >>> cv_results = cross_validate(lasso, X, y, cv=3)
    >>> sorted(cv_results.keys())
    ['fit_time', 'score_time', 'test_score']
    >>> cv_results['test_score']
    array([0.33150734, 0.08022311, 0.03531764])

    Multiple metric evaluation using ``cross_validate``
    (please refer the ``scoring`` parameter doc for more information)

    >>> scores = cross_validate(lasso, X, y, cv=3,
    ...                         scoring=('r2', 'neg_mean_squared_error'),
    ...                         return_train_score=True)
    >>> print(scores['test_neg_mean_squared_error'])
    [-3635.5... -3573.3... -6114.7...]
    >>> print(scores['train_r2'])
    [0.28010158 0.39088426 0.22784852]

    See Also
    --------
    :func:`sklearn.model_selection.cross_val_score`:
        Run cross-validation for single metric evaluation.

    :func:`sklearn.model_selection.cross_val_predict`:
        Get predictions from each split of cross-validation for diagnostic
        purposes.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimators[0]))
    if callable(scoring):
        scorers = {"score": scoring}
    elif scoring is None or isinstance(scoring, str):
        scorers = {"score": check_scoring(estimators[0], scoring=scoring)}
    else:
        try:
            scorers = _check_multimetric_scoring(estimators[0], scoring=scoring)
            # sklearn < 0.24.0 compatibility
            if isinstance(scorers, tuple):
                scorers = scorers[0]
        except KeyError:
            pass

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.

    parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
    results_org = parallel(
        delayed(_fit_and_score)(
            estimators[i],
            X,
            y,
            scorers,
            train_test_tuple[0],
            train_test_tuple[1],
            verbose,
            None,
            fit_params[i] if isinstance(fit_params, list) else fit_params,
            return_train_score=return_train_score,
            return_times=True,
            return_n_test_samples=True,
            return_estimator=return_estimator,
            error_score=error_score,
        )
        for i, train_test_tuple in enumerate(cv.split(X, y, groups))
    )

    results = _aggregate_score_dicts(results_org)

    ret = {}
    ret["fit_time"] = results["fit_time"]
    ret["score_time"] = results["score_time"]

    if return_estimator:
        ret["estimator"] = results["estimator"]

    test_scores_dict = _normalize_score_results(results["test_scores"])
    if return_train_score:
        train_scores_dict = _normalize_score_results(results["train_scores"])

    for name in test_scores_dict:
        ret["test_%s" % name] = test_scores_dict[name]
        if return_train_score:
            key = "train_%s" % name
            ret[key] = train_scores_dict[name]

    return (ret, results_org)
Example #52
def parallel_func(inner_n_jobs, backend):
    return Parallel(n_jobs=inner_n_jobs,
                    backend=backend)(delayed(square)(i) for i in range(3))
Example #53
                # each task-config-fidelity would have been evaluated on a different seed
                obj = {
                    config_hash: {
                        fidelity_hash: {
                            seed: v
                        }
                    }
                }
                task_datas[task_id] = update_table_with_new_entry(task_datas[task_id], obj)
                file_count += 1

            try:
                # deleting data file that was processed
                os.remove(os.path.join(dump_path, filename))
                # os.remove(os.path.join(dump_path, filename))
            except FileNotFoundError:
                continue

        logger.info("\tFinished batch processing in {:.3f} seconds".format(time.time() - start))
        logger.info("\tUpdating benchmark data files...")

        with parallel_backend(backend="loky", n_jobs=args.n_jobs):
            Parallel()(
                delayed(save_task_file)(task_id, obj, output_path) for task_id, obj in task_datas.items()
            )
        logger.info("\tContinuing to next batch")
        logger.info("\t{}".format("-" * 25))

    logger.info("Done!")
    logger.info("Total files processed: {}".format(file_count))
Example #54
def nested_loop(backend):
    Parallel(n_jobs=2, backend=backend)(delayed(square)(.01) for _ in range(2))
Example #55
 def concurrent_get_filename(array, temp_dirs):
     with Parallel(backend='loky', n_jobs=2, max_nbytes=10) as p:
         for i in range(10):
             [filename
              ] = p(delayed(getattr)(array, 'filename') for _ in range(1))
             temp_dirs.add(os.path.dirname(filename))
Example #56
def test_nested_loop(parent_backend, child_backend):
    Parallel(n_jobs=2,
             backend=parent_backend)(delayed(nested_loop)(child_backend)
                                     for _ in range(2))
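Nested Parallel calls like the ones above are supported. As a sketch of the usual behavior (the sequential fallback is the commonly documented default and is stated here as an assumption), an inner Parallel running inside process-based workers typically degrades to sequential execution to avoid oversubscription:

from joblib import Parallel, delayed

def inner(i):
    # Inner Parallel call; inside process-based workers joblib
    # typically runs this with an effective n_jobs of 1.
    return sum(Parallel(n_jobs=2, backend='threading')(
        delayed(abs)(j) for j in range(i)))

outer = Parallel(n_jobs=2, backend='threading')(
    delayed(inner)(i) for i in range(4))
assert outer == [0, 0, 1, 3]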
Example #57
                        help="The path where all models directories are")
    parser.add_argument("--output_path",
                        default=None,
                        type=str,
                        help="The path to dump yaml file")
    parser.add_argument("--n_jobs",
                        default=4,
                        type=int,
                        help="number of cores")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = input_arguments()

    benches = load_benchmark_settings()
    benches = {k: v for k, v in benches.items() if check_key(k)}

    with parallel_backend(backend="multiprocessing", n_jobs=args.n_jobs):
        dicts = Parallel()(delayed(update_dict_entry)(key, args.path)
                           for key, value in benches.items())
    tabular_dict = dict()
    for entry in dicts:
        tabular_dict.update(entry)
    print("Collected {} keys...".format(len(tabular_dict)))
    with open(os.path.join(args.output_path, "tabular_plot_config.yaml"),
              "w") as f:
        f.writelines(yaml.dump(tabular_dict))
    print("Done!")
Example #58
def test_mutate_input_with_threads():
    """Input is mutable when using the threading backend"""
    q = Queue(maxsize=5)
    Parallel(n_jobs=2,
             backend="threading")(delayed(q.put)(1) for _ in range(5))
    assert q.full()
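The mutation guarantee in this test is specific to the threading backend, where workers share the parent's memory. With a process-based backend each worker receives a pickled copy, so the parent object is expected to stay unchanged; a sketch under that assumption:

from joblib import Parallel, delayed

shared = []
# Threads share memory with the parent, so the list really grows.
Parallel(n_jobs=2, backend='threading')(
    delayed(shared.append)(i) for i in range(5))
assert sorted(shared) == [0, 1, 2, 3, 4]

copied = []
# Worker processes operate on pickled copies, so the parent list
# is expected to stay empty.
Parallel(n_jobs=2, backend='multiprocessing')(
    delayed(copied.append)(i) for i in range(5))
assert copied == []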
Example #59
 def vault_prices(self, blocks):
     prices = Parallel(10, 'threading')(
         delayed(magic.get_price)(self.vault, block=block)
         for block in blocks)
     return prices
Example #60
def test_parallel_kwargs(n_jobs):
    """Check the keyword argument processing of pmap."""
    lst = range(10)
    assert ([f(x, y=1) for x in lst
             ] == Parallel(n_jobs=n_jobs)(delayed(f)(x, y=1) for x in lst))