def test_memory_numpy(tmpdir, mmap_mode):
    " Test memory with a function with numpy arrays."
    accumulator = list()

    def n(l=None):
        accumulator.append(1)
        return l

    memory = Memory(location=tmpdir.strpath, mmap_mode=mmap_mode,
                    verbose=0)
    cached_n = memory.cache(n)

    rnd = np.random.RandomState(0)
    for i in range(3):
        a = rnd.random_sample((10, 10))
        for _ in range(3):
            assert np.all(cached_n(a) == a)
            assert len(accumulator) == i + 1
def test_memory_ignore(tmpdir):
    " Test the ignore feature of memory "
    memory = Memory(location=tmpdir.strpath, verbose=0)
    accumulator = list()

    @memory.cache(ignore=['y'])
    def z(x, y=1):
        accumulator.append(1)

    assert z.ignore == ['y']

    z(0, y=1)
    assert len(accumulator) == 1
    z(0, y=1)
    assert len(accumulator) == 1
    z(0, y=2)
    assert len(accumulator) == 1
def test_partial_decoration():
    "Check cache may be called with kwargs before decorating"
    memory = Memory(cachedir=env['dir'], verbose=0)

    test_values = [
        (['x'], 100, 'r'),
        ([], 10, None),
    ]
    for ignore, verbose, mmap_mode in test_values:
        @memory.cache(ignore=ignore, verbose=verbose, mmap_mode=mmap_mode)
        def z(x):
            pass

        yield nose.tools.assert_equal, z.ignore, ignore
        yield nose.tools.assert_equal, z._verbose, verbose
        yield nose.tools.assert_equal, z.mmap_mode, mmap_mode
def test_cached_function_race_condition_when_persisting_output(tmpdir, capfd):
    # Test race condition where multiple processes are writing into
    # the same output.pkl. See
    # https://github.com/joblib/joblib/issues/490 for more details.
    memory = Memory(location=tmpdir.strpath)
    func_cached = memory.cache(fast_func_with_complex_output)

    Parallel(n_jobs=2)(delayed(func_cached)() for i in range(3))

    stdout, stderr = capfd.readouterr()

    # Checking both stdout and stderr (ongoing PR #434 may change
    # logging destination) to make sure there is no exception while
    # loading the results
    exception_msg = 'Exception while loading results'
    assert exception_msg not in stdout
    assert exception_msg not in stderr
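# The test above relies on a module-level helper, fast_func_with_complex_output,
# that is not shown in this excerpt. A minimal, hypothetical stand-in is sketched
# below; the only properties the test needs are that the function is cheap to run
# and returns an output that takes a noticeable amount of time to pickle, so that
# concurrent workers can race while persisting output.pkl.
def fast_func_with_complex_output():
    # A large nested list is quick to build but comparatively slow to serialize.
    complex_obj = ['a' * 1000] * 1000
    return complex_obj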
def test_memory_ignore():
    " Test the ignore feature of memory "
    memory = Memory(cachedir=env['dir'], verbose=0)
    accumulator = list()

    @memory.cache(ignore=['y'])
    def z(x, y=1):
        accumulator.append(1)

    yield nose.tools.assert_equal, z.ignore, ['y']

    z(0, y=1)
    yield nose.tools.assert_equal, len(accumulator), 1
    z(0, y=1)
    yield nose.tools.assert_equal, len(accumulator), 1
    z(0, y=2)
    yield nose.tools.assert_equal, len(accumulator), 1
def test_memory_recomputes_after_an_error_while_loading_results(
        tmpdir, monkeypatch):
    memory = Memory(location=tmpdir.strpath)

    def func(arg):
        # This makes sure that the timestamp returned by two calls of
        # func are different. This is needed on Windows where
        # time.time resolution may not be accurate enough
        time.sleep(0.01)
        return arg, time.time()

    cached_func = memory.cache(func)
    input_arg = 'arg'
    arg, timestamp = cached_func(input_arg)

    # Make sure the function is correctly cached
    assert arg == input_arg

    # Corrupting output.pkl to make sure that an error happens when
    # loading the cached result
    corrupt_single_cache_item(memory)

    # Make sure that corrupting the file causes recomputation and that
    # a warning is issued.
    recorded_warnings = monkeypatch_cached_func_warn(cached_func, monkeypatch)
    recomputed_arg, recomputed_timestamp = cached_func(arg)
    assert len(recorded_warnings) == 1
    exception_msg = 'Exception while loading results'
    assert exception_msg in recorded_warnings[0]
    assert recomputed_arg == arg
    assert recomputed_timestamp > timestamp

    # Corrupting output.pkl again to make sure that an error happens when
    # loading the cached result through call_and_shelve
    corrupt_single_cache_item(memory)
    reference = cached_func.call_and_shelve(arg)
    try:
        reference.get()
        raise AssertionError(
            "It should not be possible to load a corrupted"
            " MemorizedResult"
        )
    except KeyError as e:
        message = "is corrupted"
        assert message in str(e.args)
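# The helpers corrupt_single_cache_item and monkeypatch_cached_func_warn used
# above are not part of this excerpt. A minimal sketch, assuming the same
# behaviour as the logic inlined in the older variant of this test further down:
def corrupt_single_cache_item(memory):
    # Overwrite the single cached output.pkl with garbage so loading it fails.
    single_cache_item, = memory.store_backend.get_items()
    output_filename = os.path.join(single_cache_item.path, 'output.pkl')
    with open(output_filename, 'w') as f:
        f.write('garbage')


def monkeypatch_cached_func_warn(func, monkeypatch_fixture):
    # Record warnings emitted by the cached function. Monkeypatching is needed
    # because pytest does not capture stdlib logging output
    # (see https://github.com/pytest-dev/pytest/issues/2079).
    recorded = []
    monkeypatch_fixture.setattr(func, 'warn', recorded.append)
    return recorded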
def _setup_toy_cache(tmpdir, num_inputs=10):
    memory = Memory(cachedir=tmpdir.strpath, verbose=0)

    @memory.cache()
    def get_1000_bytes(arg):
        return 'a' * 1000

    inputs = list(range(num_inputs))
    for arg in inputs:
        get_1000_bytes(arg)

    hash_dirnames = [get_1000_bytes._get_output_dir(arg)[0]
                     for arg in inputs]

    full_hashdirs = [
        os.path.join(get_1000_bytes.cachedir, dirname)
        for dirname in hash_dirnames
    ]
    return memory, full_hashdirs, get_1000_bytes
def test_memory_args_as_kwargs(tmpdir):
    """Non-regression test against 0.12.0 changes.

    https://github.com/joblib/joblib/pull/751
    """
    memory = Memory(location=tmpdir.strpath, verbose=0)

    @memory.cache
    def plus_one(a):
        return a + 1

    # It's possible to call a positional arg as a kwarg.
    assert plus_one(1) == 2
    assert plus_one(a=1) == 2

    # However, a positional argument that joblib hadn't seen
    # before would cause a failure if it was passed as a kwarg.
    assert plus_one(a=2) == 3
def test_memory_in_memory_function_code_change():
    _function_to_cache.__code__ = _sum.__code__

    mem = Memory(cachedir=env['dir'], verbose=0)
    f = mem.cache(_function_to_cache)

    nose.tools.assert_equal(f(1, 2), 3)
    nose.tools.assert_equal(f(1, 2), 3)

    with warnings.catch_warnings(record=True):
        # ignore name collision warnings
        warnings.simplefilter("always")

        # Check that inline function modification triggers a cache invalidation
        _function_to_cache.__code__ = _product.__code__
        nose.tools.assert_equal(f(1, 2), 2)
        nose.tools.assert_equal(f(1, 2), 2)
def _setup_toy_cache(tmpdir, num_inputs=10):
    memory = Memory(location=tmpdir.strpath, verbose=0)

    @memory.cache()
    def get_1000_bytes(arg):
        return 'a' * 1000

    inputs = list(range(num_inputs))
    for arg in inputs:
        get_1000_bytes(arg)

    func_id = _build_func_identifier(get_1000_bytes)
    hash_dirnames = [get_1000_bytes._get_output_identifiers(arg)[1]
                     for arg in inputs]

    full_hashdirs = [os.path.join(get_1000_bytes.store_backend.location,
                                  func_id, dirname)
                     for dirname in hash_dirnames]
    return memory, full_hashdirs, get_1000_bytes
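# _build_func_identifier is not defined in this excerpt. It is assumed to be
# joblib's internal helper that returns the cache sub-directory name for a
# function, imported in the full test module roughly as:
# from joblib.memory import _build_func_identifier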
def test_memory_numpy_check_mmap_mode(tmpdir):
    """Check that mmap_mode is respected even at the first call"""
    memory = Memory(location=tmpdir.strpath, mmap_mode='r', verbose=0)

    @memory.cache()
    def twice(a):
        return a * 2

    a = np.ones(3)

    b = twice(a)
    c = twice(a)

    assert isinstance(c, np.memmap)
    assert c.mode == 'r'

    assert isinstance(b, np.memmap)
    assert b.mode == 'r'
def test_memory_exception():
    """ Smoketest the exception handling of Memory. """
    memory = Memory(cachedir=env['dir'], verbose=0)

    class MyException(Exception):
        pass

    @memory.cache
    def h(exc=0):
        if exc:
            raise MyException

    # Call once, to initialise the cache
    h()

    for _ in range(3):
        # Call 3 times, to be sure that the Exception is always raised
        yield nose.tools.assert_raises, MyException, h, 1
def test_memory_numpy_check_mmap_mode():
    """Check that mmap_mode is respected even at the first call"""
    memory = Memory(cachedir=env['dir'], mmap_mode='r', verbose=0)
    memory.clear(warn=False)

    @memory.cache()
    def twice(a):
        return a * 2

    a = np.ones(3)

    b = twice(a)
    c = twice(a)

    nose.tools.assert_true(isinstance(c, np.memmap))
    nose.tools.assert_equal(c.mode, 'r')

    nose.tools.assert_true(isinstance(b, np.memmap))
    nose.tools.assert_equal(b.mode, 'r')
def test_memory_numpy():
    " Test memory with a function with numpy arrays."
    # Check with memmapping and without.
    for mmap_mode in (None, 'r'):
        accumulator = list()

        def n(l=None):
            accumulator.append(1)
            return l

        memory = Memory(cachedir=env['dir'], mmap_mode=mmap_mode,
                        verbose=0)
        memory.clear(warn=False)
        cached_n = memory.cache(n)

        rnd = np.random.RandomState(0)
        for i in range(3):
            a = rnd.random_sample((10, 10))
            for _ in range(3):
                yield nose.tools.assert_true, np.all(cached_n(a) == a)
                yield nose.tools.assert_equal, len(accumulator), i + 1
def test_memory_exception(tmpdir):
    """ Smoketest the exception handling of Memory. """
    memory = Memory(location=tmpdir.strpath, verbose=0)

    class MyException(Exception):
        pass

    @memory.cache
    def h(exc=0):
        if exc:
            raise MyException

    # Call once, to initialise the cache
    h()

    for _ in range(3):
        # Call 3 times, to be sure that the Exception is always raised
        with raises(MyException):
            h(1)
def test_memorized_result_pickle(tmpdir):
    # Verify a MemorizedResult object can be pickled/depickled. Non-regression
    # test introduced following issue
    # https://github.com/joblib/joblib/issues/747
    memory = Memory(location=tmpdir.strpath)

    @memory.cache
    def g(x):
        return x ** 2

    memorized_result = g.call_and_shelve(4)
    memorized_result_pickle = pickle.dumps(memorized_result)
    memorized_result_loads = pickle.loads(memorized_result_pickle)

    assert memorized_result.store_backend.location == \
        memorized_result_loads.store_backend.location
    assert memorized_result.func == memorized_result_loads.func
    assert memorized_result.args_id == memorized_result_loads.args_id
    assert str(memorized_result) == str(memorized_result_loads)
def test_memory_recomputes_after_an_error_while_loading_results(
        tmpdir, monkeypatch):
    memory = Memory(location=tmpdir.strpath)

    def func(arg):
        # This makes sure that the timestamp returned by two calls of
        # func are different. This is needed on Windows where
        # time.time resolution may not be accurate enough
        time.sleep(0.01)
        return arg, time.time()

    cached_func = memory.cache(func)
    input_arg = 'arg'
    arg, timestamp = cached_func(input_arg)

    # Make sure the function is correctly cached
    assert arg == input_arg

    # Corrupting output.pkl to make sure that an error happens when
    # loading the cached result
    single_cache_item, = memory.store_backend.get_items()
    output_filename = os.path.join(single_cache_item.path, 'output.pkl')
    with open(output_filename, 'w') as f:
        f.write('garbage')

    recorded_warnings = []

    def append_to_record(item):
        recorded_warnings.append(item)

    # Make sure that corrupting the file causes recomputation and that
    # a warning is issued. Need monkeypatch because pytest does not
    # capture stdlib logging output (see
    # https://github.com/pytest-dev/pytest/issues/2079)
    monkeypatch.setattr(cached_func, 'warn', append_to_record)
    recomputed_arg, recomputed_timestamp = cached_func(arg)
    assert len(recorded_warnings) == 1
    exception_msg = 'Exception while loading results'
    assert exception_msg in recorded_warnings[0]
    assert recomputed_arg == arg
    assert recomputed_timestamp > timestamp
def test_memory_numpy_check_mmap_mode_async(tmpdir, monkeypatch):
    """Check that mmap_mode is respected even at the first call"""
    memory = Memory(location=tmpdir.strpath, mmap_mode='r', verbose=0)

    @memory.cache()
    @asyncio.coroutine
    def twice(a):
        return a * 2

    @asyncio.coroutine
    def main():
        a = np.ones(3)
        b = yield from twice(a)
        c = yield from twice(a)

        assert isinstance(c, np.memmap)
        assert c.mode == 'r'

        assert isinstance(b, np.memmap)
        assert b.mode == 'r'

        # Corrupts the file. Deleting the b and c mmaps
        # is necessary to be able to edit the file.
        del b
        del c
        corrupt_single_cache_item(memory)

        # Make sure that corrupting the file causes recomputation and that
        # a warning is issued.
        recorded_warnings = monkeypatch_cached_func_warn(twice, monkeypatch)
        d = yield from twice(a)
        assert len(recorded_warnings) == 1
        exception_msg = 'Exception while loading results'
        assert exception_msg in recorded_warnings[0]
        # Assert that the recomputation returns a mmap
        assert isinstance(d, np.memmap)
        assert d.mode == 'r'

    asyncio.get_event_loop().run_until_complete(main())
def test_cached_function_race_condition_when_persisting_output_2(tmpdir,
                                                                  capfd):
    # Test race condition in first attempt at solving
    # https://github.com/joblib/joblib/issues/490. The race condition
    # was due to the delay between seeing the cache directory created
    # (interpreted as the result being cached) and the output.pkl being
    # pickled.
    memory = Memory(location=tmpdir.strpath)
    func_cached = memory.cache(fast_func_with_conditional_complex_output)

    Parallel(n_jobs=2)(delayed(func_cached)(True if i % 2 == 0 else False)
                       for i in range(3))

    stdout, stderr = capfd.readouterr()

    # Checking both stdout and stderr (ongoing PR #434 may change
    # logging destination) to make sure there is no exception while
    # loading the results
    exception_msg = 'Exception while loading results'
    assert exception_msg not in stdout
    assert exception_msg not in stderr
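# fast_func_with_conditional_complex_output is another module-level helper that
# this excerpt does not show. A hypothetical stand-in, assuming it mirrors the
# unconditional helper above but lets the caller choose between a slow-to-pickle
# and a trivial return value:
def fast_func_with_conditional_complex_output(complex_output=True):
    complex_obj = ['a' * 1000] * 1000
    return complex_obj if complex_output else 'simple output'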
def _setup_temporary_cache_folder(num_inputs=10):
    # Use separate cache dir to avoid side-effects from other tests
    # that do not use _setup_temporary_cache_folder
    mem = Memory(cachedir=os.path.join(env['dir'], 'separate_cache'),
                 verbose=0)

    @mem.cache()
    def get_1000_bytes(arg):
        return 'a' * 1000

    inputs = list(range(num_inputs))
    for arg in inputs:
        get_1000_bytes(arg)

    hash_dirnames = [get_1000_bytes._get_output_dir(arg)[0]
                     for arg in inputs]

    full_hashdirs = [
        os.path.join(get_1000_bytes.cachedir, dirname)
        for dirname in hash_dirnames
    ]
    return mem, full_hashdirs, get_1000_bytes
def main():
    logging.basicConfig(
        format="[%(asctime)s] %(levelname)s %(threadName)s: %(message)s",
        level=logging.INFO)

    cli_parser = ArgumentParser()
    cli_parser.add_argument("-c", "--conf", type=str, required=True)
    cli_parser.add_argument("-d", "--dataset", type=str, required=True)
    cli_parser.add_argument("-r", "--repeats", type=int, default=15)
    cli_parser.add_argument("-p", "--parallel", type=int, default=1)
    cli_parser.add_argument("--cache", type=str, default="cache")
    args = cli_parser.parse_args()

    parser = ArgumentParser()
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--fraction", type=float, default=0.001)
    parser.add_argument("--epochs", type=int, default=10)

    with open(args.conf, "rt") as f:
        configs = [
            parser.parse_args(line.strip().split(" "))
            for line in f.readlines()
        ]

    with TaskExecutor(max_workers=args.parallel,
                      memory=Memory(args.cache, compress=6)):
        results = [
            evaluate_config(args.dataset, conf.lr, conf.fraction,
                            conf.epochs, args.repeats)
            for conf in configs
        ]

    results = [r.result for r in results]

    for config, result in sorted(zip(configs, results),
                                 key=lambda e: e[1]['conf'][0],
                                 reverse=True):
        logging.info(
            f"{args.dataset} baseline: {result['mean']:.5f} "
            f"+/- {result['std']:.5f} => {result['conf'][0]:.5f} "
            f"(lr = {result['lr']})"
        )
def test_memory_warning_lambda_collisions():
    # Check that multiple use of lambda will raise collisions
    memory = Memory(cachedir=env['dir'], verbose=0)
    # For isolation with other tests
    memory.clear()
    a = lambda x: x
    a = memory.cache(a)
    b = lambda x: x + 1
    b = memory.cache(b)

    with warnings.catch_warnings(record=True) as w:
        # Cause all warnings to always be triggered.
        warnings.simplefilter("always")
        # This is a temporary workaround until we get rid of
        # inspect.getargspec, see
        # https://github.com/joblib/joblib/issues/247
        warnings.simplefilter("ignore", DeprecationWarning)
        nose.tools.assert_equal(0, a(0))
        nose.tools.assert_equal(2, b(1))
        nose.tools.assert_equal(1, a(1))

    # In recent Python versions, we can retrieve the code of lambdas,
    # thus nothing is raised
    nose.tools.assert_equal(len(w), 4)
def test_memory_arg_lambda():
    " Test memory with a lambda argument."
    memory = Memory(cachedir=env['dir'], verbose=0)
    memory.clear(warn=False)

    accum = {'value': 0}

    @memory.cache()
    def run_func(func):
        accum['value'] += 1
        return func()

    lambda_1 = lambda: 1
    lambda_2 = lambda: 2

    a = run_func(lambda_1)
    b = run_func(lambda_1)
    c = run_func(lambda_2)

    nose.tools.assert_equal(a, 1)
    nose.tools.assert_equal(b, 1)
    nose.tools.assert_equal(c, 2)
    nose.tools.assert_equal(accum['value'], 2)
def test_memory_name_collision():
    " Check that name collisions with functions will raise warnings"
    memory = Memory(cachedir=env['dir'], verbose=0)

    @memory.cache
    def name_collision(x):
        """ A first function called name_collision """
        return x

    a = name_collision

    @memory.cache
    def name_collision(x):
        """ A second function called name_collision """
        return x

    b = name_collision

    if not hasattr(warnings, 'catch_warnings'):
        # catch_warnings is new in Python 2.6
        return

    with warnings.catch_warnings(record=True) as w:
        # Cause all warnings to always be triggered.
        warnings.simplefilter("always")
        # This is a temporary workaround until we get rid of
        # inspect.getargspec, see
        # https://github.com/joblib/joblib/issues/247
        warnings.simplefilter("ignore", DeprecationWarning)
        a(1)
        b(1)

        yield nose.tools.assert_equal, len(w), 1
        yield nose.tools.assert_true, "collision" in str(w[-1].message)
def test_memory_warning_collision_detection():
    # Check that collisions impossible to detect will raise appropriate
    # warnings.
    memory = Memory(cachedir=env['dir'], verbose=0)
    # For isolation with other tests
    memory.clear()
    a1 = eval('lambda x: x')
    a1 = memory.cache(a1)
    b1 = eval('lambda x: x+1')
    b1 = memory.cache(b1)

    with warnings.catch_warnings(record=True) as w:
        # Cause all warnings to always be triggered.
        warnings.simplefilter("always")
        # This is a temporary workaround until we get rid of
        # inspect.getargspec, see
        # https://github.com/joblib/joblib/issues/247
        warnings.simplefilter("ignore", DeprecationWarning)
        a1(1)
        b1(1)
        a1(0)

    assert len(w) == 2
    assert "cannot detect" in str(w[-1].message).lower()
import numpy
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rc

from joblib.memory import Memory

rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
## for Palatino and other serif fonts use:
# rc('font', **{'family': 'serif', 'serif': ['Palatino']})
rc('text', usetex=True)
# matplotlib.rcParams['text.latex.preamble'] = [r"\usepackage{siunitx}\sisetup{detect-weight=true, detect-family=true}"]
matplotlib.rcParams['text.latex.preamble'] = [
    r"\usepackage{bm}\usepackage{siunitx}\sisetup{detect-weight=true, detect-family=true}"
]

memory = Memory(cachedir="plotsperformance", verbose=0, compress=9)


def loadtest(folder):
    result = []
    for fold in range(10):
        result.append(
            numpy.load(folder + "/fold_" + str(fold) + "_test.npz")['arr_0'])
    # return numpy.ravel(numpy.array(result))
    return result


@memory.cache
def loadLL(folder):
    result = []
    for fold in range(10):
def test():
    n_topics = 10

    with open("acceptedoralpaperstext.txt") as f:
        content = []
        ids = []
        for line in f.readlines():
            cols = line.split("\t")
            content.append(cols[1].strip())
            ids.append(cols[0].strip())

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=100,
                                    stop_words='english')
    # print(content)
    bow = tf_vectorizer.fit_transform(content)
    feature_names = tf_vectorizer.get_feature_names()

    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=2000,
                                    # learning_method='online',
                                    # learning_offset=50.,
                                    random_state=0)

    topics = lda.fit_transform(bow, bow)

    print(print_top_words(lda, feature_names, 10))
    print(topics)
    print(bow.shape)

    f = numpy.array(feature_names)
    data = numpy.array(bow.todense())

    featureTypes = ["discrete"] * data.shape[1]

    domains = []
    for i, ft in enumerate(featureTypes):
        domain = numpy.unique(data[:, i])
        # print(i, ft, domain)
        domains.append(domain)

    memory = Memory(cachedir="/tmp/spntopics", verbose=0, compress=9)

    @memory.cache
    def learn(data, min_instances_slice, feature_names, domains, featureTypes):
        spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                                 row_split_method=Splitting.KmeansRows(),
                                 col_split_method=Splitting.RDCTest(threshold=0.1,
                                                                    linear=True),
                                 featureNames=feature_names,
                                 domains=domains,
                                 # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                                 # domains, families=families, row_split_method=Splitting.KmeansRows(),
                                 # col_split_method=Splitting.RDCTest(),
                                 min_instances_slice=min_instances_slice)
        return spn

    print(data.shape)
    print(type(data))
    # 0/0

    spn = learn(data, 5, f, domains, featureTypes)

    spn.root.validate()

    prodNodes = spn.get_nodes_by_type(ProductNode)

    for pn in prodNodes:
        leaves = pn.get_leaves()
        words = set()
        for leaf in leaves:
            # assuming pwl node:
            _x = numpy.argmax(leaf.y_range)
            max_x = leaf.x_range[_x]
            if max_x < 1.0:
                continue
            words.add(feature_names[leaf.featureIdx])
        # ll = pn.eval()
        if len(words) < 4:
            continue
        print(pn.rows, words)
def test_clear_memory_with_none_location():
    memory = Memory(location=None)
    memory.clear()
def test_memory_func_with_signature(tmpdir):
    memory = Memory(location=tmpdir.strpath, verbose=0)
    func_cached = memory.cache(func_with_signature)

    assert func_cached(1, 2.) == 3.
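# func_with_signature is defined elsewhere in the test module. A plausible
# minimal definition, assuming the point of the test is simply that a function
# carrying type annotations can be cached and called through Memory:
def func_with_signature(a: int, b: float) -> float:
    return a + b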
def test_memory_default_store_backend():
    # test an unknown backend falls back into a FileSystemStoreBackend
    with raises(TypeError) as excinfo:
        Memory(location='/tmp/joblib', backend='unknown')
    excinfo.match(r"Unknown location*")