def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface

    Returns
    -------
    memory : object with the joblib.Memory interface

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    if memory is None or isinstance(memory, str):
        if LooseVersion(joblib.__version__) < '0.12':
            memory = joblib.Memory(cachedir=memory, verbose=0)
        else:
            memory = joblib.Memory(location=memory, verbose=0)
    elif not hasattr(memory, 'cache'):
        raise ValueError("'memory' should be None, a string or have the same"
                         " interface as joblib.Memory."
                         " Got memory='{}' instead.".format(memory))
    return memory
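# Usage sketch (not from the original source; the cache path is hypothetical):
# the three kinds of input check_memory accepts.
mem = check_memory(None)               # wrapped in a transparent Memory
mem = check_memory("/tmp/demo_cache")  # a str becomes the cache location
mem = check_memory(mem)                # anything with a .cache method passes through

def _square(x):
    return x * x

cached_square = mem.cache(_square)     # standard joblib memoization
assert cached_square(3) == 9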
def test_make_pipeline_memory():
    cachedir = mkdtemp()
    if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        memory = joblib.Memory(cachedir=cachedir, verbose=10)
    else:
        memory = joblib.Memory(location=cachedir, verbose=10)
    pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
    assert pipeline.memory is memory
    pipeline = make_pipeline(DummyTransf(), SVC())
    assert pipeline.memory is None
    assert len(pipeline) == 2

    shutil.rmtree(cachedir)
def __init__(self, cache_dir):
    # TODO TdR 10/08/16: change url to post-processed data source.
    self.url = 'https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/'
    self.file_listing = None

    cache_engine = joblib.Memory(cachedir=cache_dir, verbose=0)
    self.cached_crawl_page = cache_engine.cache(self.crawl_page)
def test_compile_string():
    import stan_utility.cache
    with tempfile.TemporaryDirectory() as cachedir:
        print("using cachedir:", cachedir)
        stan_utility.cache.path = cachedir
        stan_utility.cache.mem = joblib.Memory(cachedir, verbose=False)
        import stan_utility

        model_code = open(os.path.join(os.path.dirname(__file__), 'test.stan')).read()
        model = stan_utility.compile_model_code(model_code, model_name="mytest")
        data = dict(
            mean=1,
            unused=np.random.normal(size=(4, 42)),
        )
        if os.path.exists("mytest_fitfit.hdf5"):
            os.unlink("mytest_fitfit.hdf5")
        samples = stan_utility.sample_model(
            model, data, outprefix="mytest_fit", chains=2, iter=346)
        assert os.path.exists("mytest_fitfit.hdf5")
        os.unlink("mytest_fitfit.hdf5")

        if os.path.exists("mytest_fit_corner.pdf"):
            os.unlink("mytest_fit_corner.pdf")
        stan_utility.plot_corner(samples, outprefix="mytest_fit")
        assert os.path.exists("mytest_fit_corner.pdf")
        os.unlink("mytest_fit_corner.pdf")

        flat_samples = stan_utility.get_flat_posterior(samples)
        assert set(flat_samples.keys()) == {"x", "y"}, flat_samples.keys()
        assert flat_samples['x'].shape == (346,), flat_samples['x'].shape
        assert flat_samples['y'].shape == (346, 10), flat_samples['y'].shape
def test_compile_file():
    import stan_utility.cache
    with tempfile.TemporaryDirectory() as cachedir:
        print("using cachedir:", cachedir)
        stan_utility.cache.path = cachedir
        stan_utility.cache.mem = joblib.Memory(cachedir, verbose=False)
        import stan_utility

        model = stan_utility.compile_model(
            os.path.join(os.path.dirname(__file__), 'test.stan'))
        data = dict(
            mean=1,
            unused=np.random.normal(size=(4, 42)),
        )
        stan_utility.sample_model(model, data, chains=2)

        files = os.listdir(stan_utility.cache.get_path())
        assert "joblib" in files
        assert any(f for f in files
                   if f.startswith("cached-") and f.endswith('.pkl')), files
        assert len(files) > 1, files

        stan_utility.cache.clear()
        files = os.listdir(stan_utility.cache.get_path())
        assert files == ["joblib"], files
def set_cachedir(cachedir=None, verbose=0):
    """Set root directory for the joblib cache.

    :Parameters:
     cachedir
      the cache directory name; if ``None``, a temporary directory
      is created using `TemporaryDirectory`
     verbose
      an integer number, controls the verbosity of the cache
      (default is 0, i.e., not verbose)
    """
    global _cachedir
    global _cacheobj
    global _cached_methods
    global _memory

    if cachedir is None:
        _cacheobj = TemporaryDirectory(prefix='mdp-joblib-cache.')
        cachedir = _cacheobj.name

    # only reset if the directory changes
    if cachedir != _cachedir:
        _cachedir = cachedir
        _memory = joblib.Memory(cachedir, verbose=verbose)
        # reset cached methods
        _cached_methods.clear()
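# Usage sketch (assumes the module-level _cachedir, _cacheobj, _memory and
# _cached_methods globals above are initialised elsewhere in this module;
# the path is hypothetical).
set_cachedir('/tmp/mdp-cache', verbose=1)  # pin the cache to a known directory
set_cachedir()                             # or use a fresh TemporaryDirectory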
def __init__(self, basedir='.cache'):
    if not basedir:
        raise ValueError('`basedir` is empty')
    location = str(Path(__file__).resolve().parent.parent / basedir)
    mem = joblib.Memory(location=location, verbose=logging.DEBUG)
    self.load_svmlight_file = mem.cache(sklearn.datasets.load_svmlight_file)
def cached(*args, **kargs):
    import joblib as jb
    from .. import CACHE

    memo = getattr(cached, 'memo', None)
    if memo is None:
        cached.memo = memo = jb.Memory(CACHE, verbose=0)
    return memo.cache(*args, **kargs)
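# Usage sketch (hypothetical function): `cached` above lazily builds one
# shared joblib.Memory under CACHE on first use, then behaves like mem.cache,
# so it can be used directly as a decorator.
@cached
def normalize(text):
    # stand-in for expensive work whose result is worth persisting
    return text.strip().lower()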
def generate_tsdiffana_thumbnail(image_files, sessions, subject_id,
                                 output_dir, results_gallery=None,
                                 tooltips=None):
    """Generate tsdiffana thumbnails

    Parameters
    ----------
    image_files: list of strings (4D case) or list of lists of paths (3D case)
        paths to the images under inspection

    output_dir: string
        dir to which all output will be written

    subject_id: string
        id of subject under inspection

    sessions: list
        list of session ids, one per element of image_files

    results_gallery: ResultsGallery instance (optional)
        gallery to which thumbnails will be committed
    """
    # plot figures
    qa_cache_dir = os.path.join(output_dir, "QA")
    if not os.path.exists(qa_cache_dir):
        os.makedirs(qa_cache_dir)
    qa_mem = joblib.Memory(cachedir=qa_cache_dir, verbose=5)
    results = qa_mem.cache(multi_session_time_slice_diffs)(image_files)
    axes = plot_tsdiffs(results, use_same_figure=False)
    figures = [ax.get_figure() for ax in axes]
    output_filename_template = os.path.join(output_dir,
                                            "tsdiffana_plot_{0}.png")
    output_filenames = [output_filename_template.format(i)
                        for i in range(len(figures))]
    for fig, output_filename in zip(figures, output_filenames):
        fig.savefig(output_filename, bbox_inches="tight", dpi=200)
        pl.close(fig)
    if tooltips is None:
        tooltips = [None] * len(output_filenames)

    # create thumbnails
    thumbnails = []
    for output_filename, tooltip in zip(output_filenames, tooltips):
        thumbnail = Thumbnail(tooltip=tooltip)
        thumbnail.a = a(href=os.path.basename(output_filename))
        thumbnail.img = img(src=os.path.basename(output_filename),
                            height="250px", width="600px")
        thumbnail.description = "tsdiffana ({0} sessions)".format(
            len(sessions))
        thumbnails.append(thumbnail)
    if results_gallery:
        results_gallery.commit_thumbnails(thumbnails)
    return thumbnails
def __init__(self, grid, shell_model, cache_dir=None):
    # final distance r
    r = grid.r_spherical
    # initial distance r0
    r0 = shell_model.initial_radius(r)
    # dr/dr0
    dr_dr0 = shell_model.dr_dr0(r, r0)

    # initial Lagrangian mesh
    x0 = r0 * grid.x / r
    y0 = r0 * grid.y / r
    z0 = r0 * grid.z / r

    # Computes and stores the components of the Jacobian matrix
    self.dx_dx0 = dr_dr0 * (x0 / r0)**2 + r / r0 * (1 - (x0 / r0)**2)
    self.dy_dy0 = dr_dr0 * (y0 / r0)**2 + r / r0 * (1 - (y0 / r0)**2)
    self.dz_dz0 = dr_dr0 * (z0 / r0)**2 + r / r0 * (1 - (z0 / r0)**2)
    self.dx_dy0 = (dr_dr0 - r / r0) * x0 * y0 / r0**2
    self.dy_dx0 = self.dx_dy0
    self.dx_dz0 = (dr_dr0 - r / r0) * x0 * z0 / r0**2
    self.dz_dx0 = self.dx_dz0
    self.dy_dz0 = (dr_dr0 - r / r0) * y0 * z0 / r0**2
    self.dz_dy0 = self.dy_dz0
    self._inv_J = None

    if cache_dir is not None:
        self._mem = joblib.Memory(cache_dir, verbose=0)
    else:
        self._mem = None
def hdb_predict(data, cl_size):
    cl = hdbscan.HDBSCAN(
        min_samples=1,
        min_cluster_size=cl_size,
        core_dist_n_jobs=threads,
        memory=joblib.Memory(location=".DAJIN_temp/clustering", verbose=0),
    )
    return cl.fit_predict(data) + 1
def __init__(self, paths_name, cache=True):
    self.paths_name = paths_name
    self.paths = self._get_paths()
    if not on_cloud and cache:
        memory = joblib.Memory(cachedir=self.cache_dir)
        self.get_le_features = memory.cache(self.get_le_features)
        self.get_features = memory.cache(self.get_features)
    self.is_log = False
def init_data(data, client_options=None, plugin=None, cache_waveforms=False):
    """Return appropriate get_waveforms function

    See example configuration file for a description of the options"""
    if client_options is None:
        client_options = {}
    is_webservice = data in ('arclink', 'fdsn', 'seishub')
    if is_webservice:
        webservice_module = import_module('obspy.%s' % data)
        Client = getattr(webservice_module, 'Client')
        client = Client(**client_options)
        if data == 'fdsn':
            get_waveforms_orig = client.get_waveforms
        else:
            get_waveforms_orig = client.getWaveform

        def get_waveforms(event=None, **args):
            return get_waveforms_orig(**args)
    elif data == 'plugin':
        modulename, funcname = plugin.split(':')
        get_waveforms = load_func(modulename.strip(), funcname.strip())
    else:
        from obspy import read
        stream = read(data)

        def get_waveforms(network, station, location, channel,
                          starttime, endtime, event=None):
            st = stream.select(network=network, station=station,
                               location=location, channel=channel)
            st = st.slice(starttime, endtime)
            return st

    def wrapper(**kwargs):
        try:
            return get_waveforms(**kwargs)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as ex:
            seedid = '.'.join((kwargs['network'], kwargs['station'],
                               kwargs['location'], kwargs['channel']))
            msg = 'channel %s: error while retrieving data: %s'
            log.debug(msg, seedid, ex)

    use_cache = cache_waveforms and (is_webservice or data == 'plugin')
    if use_cache and joblib:
        log.info('use waveform cache in %s', cache_waveforms)
        memory = joblib.Memory(cachedir=cache_waveforms, verbose=0)
        return memory.cache(wrapper)
    elif use_cache:
        log.warning('install joblib to use cache_waveforms option')
    return wrapper
def __init__(self, user_settings):
    self.tokenKey = user_settings.quandl_token
    self.user_settings = user_settings
    quandl.ApiConfig.api_key = self.tokenKey
    self.temp_dir = getTempPath(userSettings=user_settings)
    self.cacher = joblib.Memory(self.temp_dir, compress=9, verbose=0)
def hdb_cl_num(cl_size):
    cl = hdbscan.HDBSCAN(
        min_samples=1,
        min_cluster_size=cl_size,
        core_dist_n_jobs=threads,
        memory=joblib.Memory(location=".DAJIN_temp/clustering", verbose=0),
    )
    tmp = cl.fit_predict(pc_score)
    return len(np.unique(tmp))
def __init__(self, user_settings):
    self.user_settings = user_settings
    self.financial_downloader = gm.FinancialsDownloader()
    # self.key_ratios_downloader = gm.KeyRatiosDownloader()
    self.temp_dir = getTempPath(userSettings=user_settings)
    self.cacher = joblib.Memory(self.temp_dir)
    # kr = gm.KeyRatiosDownloader()
def _memoize(self, func, verbose=0):
    '''helper method for memory cache.'''
    if not hasattr(self, '_mymem'):
        self._mymem = joblib.Memory(cachedir=self.cachedir)
    memoized_func = self._mymem.cache(func, verbose=verbose)
    memoized_func.__doc__ = func.__doc__
    return memoized_func
def setup_cachedir(cachedir, mmap_mode=None, bytes_limit=None):
    """This function injects a joblib.Memory object in the cache() function
    (in a thread-specific slot of its 'memories' attribute)."""
    if not hasattr(cache, 'memories'):
        cache.memories = {}
    memory = joblib.Memory(location=cachedir, verbose=0,
                           mmap_mode=mmap_mode, bytes_limit=bytes_limit)
    cache.memories[current_thread().name] = memory
    return memory
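# Usage sketch (hypothetical path; assumes the cache() function this module
# patches is defined elsewhere): each thread gets its own Memory slot.
mem = setup_cachedir('/tmp/thread-cache', mmap_mode='r')
assert cache.memories[current_thread().name] is mem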
def test_data_cache():
    # First, get the raw features
    crema_input = crema.pre.CQTensor()
    data = crema.data.make_task_data(TEST_FILE, TEST_JAMS, [], crema_input)

    # Then create a cache
    cache = joblib.Memory(cachedir='./crema_cache/', verbose=0)
    data2 = crema.data.make_task_data(TEST_FILE, TEST_JAMS, [], crema_input,
                                      cache=cache)
    data3 = crema.data.make_task_data(TEST_FILE, TEST_JAMS, [], crema_input,
                                      cache=cache)

    assert np.all(data['input_cqtensor'] == data2['input_cqtensor'])
    assert np.all(data2['input_cqtensor'] == data3['input_cqtensor'])
def memory(name) -> joblib.Memory:
    """
    Return the joblib Memory object with the given name.
    """
    if isinstance(name, joblib.Memory):
        return name
    path = user_path() / "cache" / name
    path.mkdir(parents=True, exist_ok=True)
    opts = CACHE_OPTIONS.get(name, {})
    opts.setdefault("verbose", 0)
    return joblib.Memory(path, **opts)
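# Usage sketch: memory() above resolves a named on-disk cache under
# user_path()/cache; passing an existing Memory returns it unchanged.
# The cache name and fetch() are hypothetical.
mem = memory("downloads")
assert memory(mem) is mem

@mem.cache
def fetch(url):
    return url.upper()  # stand-in for real work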
def __init__(self, user_settings):
    self.user_settings = user_settings
    self.vectorDao = VectorDao(self.user_settings)
    self.vectorizedDataService = VectorOnlineService(self.user_settings)
    self.vectorizedDataService.parallelDownloadInstruments = False
    if self.threads > 1:
        self.vectorizedDataService.parallelDownloadRatios = False
    else:
        self.vectorizedDataService.parallelDownloadRatios = False
    self.temp_dir = getTempPath(userSettings=user_settings)
    self.cacher = joblib.Memory(self.temp_dir, compress=9, verbose=0)
def resample_img(input_img_filename, new_vox_dims, output_filename=None):
    """Resamples an image to a new resolution.

    Parameters
    ----------
    input_img_filename: string
        path to image to be resampled

    new_vox_dims: list or tuple of positive floats
        new voxel dimensions to which the image is to be resampled

    output_filename: string (optional)
        where output image will be written

    Returns
    -------
    output_filename: string
        where the resampled img has been written
    """
    try:
        from nilearn.image import resample_img as ni_resample_img
    except ImportError:
        raise RuntimeError(
            "nilearn not found on your system; can't do resampling!")

    # sanity
    if output_filename is None:
        output_filename = os.path.join(
            os.path.dirname(input_img_filename),
            "resample_" + os.path.basename(input_img_filename))

    # prepare for smart-caching
    output_dir = os.path.dirname(output_filename)
    cache_dir = os.path.join(output_dir, "resample_img_cache")
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    mem = joblib.Memory(cachedir=cache_dir, verbose=5)

    # resample input img to new resolution
    resampled_img = mem.cache(ni_resample_img)(
        input_img_filename, target_affine=np.diag(new_vox_dims))

    # save resampled img
    nibabel.save(resampled_img, output_filename)

    return output_filename
def set_cache_dir(location=None, compress=True, verbose=0, **kwargs):
    """
    Set up a cache directory for use with requests.

    Parameters
    ----------
    location: str or None or False
        The path of the base directory to use as a data store, or None, or
        False. If None, a default directory is created using
        appdirs.user_cache_dir. If False is given, no caching is done and
        the Memory object is completely transparent.
    compress: boolean, or integer, optional
        Whether to zip the stored data on disk. If an integer is given, it
        should be between 1 and 9, and sets the amount of compression.
    verbose: int, optional
        Verbosity flag, controls the debug messages that are issued as
        functions are evaluated.
    bytes_limit: int, optional
        Limit in bytes of the size of the cache.
    """
    global memory, cache_dir

    if location is None:
        location = appdirs.user_cache_dir('cached_requests')
    if location is False:
        location = None

    memory = joblib.Memory(location, compress=compress, verbose=verbose,
                           **kwargs)

    make_cache = (
        (requests, 'get'),
        (requests, 'post'),
    )
    for module, func_name in make_cache:
        try:
            func = getattr(module, f"_{func_name}_orig")
        except AttributeError:
            func = getattr(module, func_name)
        setattr(module, f"_{func_name}_orig", func)
        setattr(module, func_name, memory.cache(func))
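# Usage sketch (hypothetical path and URL): after set_cache_dir() above,
# requests.get/post are replaced by memoized wrappers, so a repeated
# identical call is served from disk instead of the network.
set_cache_dir('/tmp/req-cache', compress=3)
r1 = requests.get('https://example.org/data.json')
r2 = requests.get('https://example.org/data.json')  # joblib cache hit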
def adult_data_demo():
    xtrain, ytrain, xtest, ytest = load_adult_income_data(
        test_size=0.2, random_state=1337)

    # feature selection options
    fs_dict = adult_data_feature_selectors()

    # modelers
    model_dict = adult_data_modellers()

    # create a pipeline from the above
    cachedir = tempfile.mkdtemp()
    memory = joblib.Memory(cachedir, verbose=0)
    pipelines = [
        sklearn.pipeline.Pipeline(
            steps=[(fsname, fs), (modelname, model)],
            memory=memory,
        )
        for (fsname, fs) in fs_dict.items()
        for (modelname, model) in model_dict.items()
    ]

    # cross validation
    df_scores = cross_validate_scores(pipelines, xtrain, ytrain)

    # select the pair with the best overall negative log loss
    fs, m = df_scores.groupby(['fs', 'm']).mean().test_neg_log_loss.idxmax()
    p_best = [
        p for p in pipelines
        if fs in p.named_steps and m in p.named_steps
    ][0]

    # re-fit this model to the *entire* train data (it has only ever been
    # fitted to bootstrapped sub-samples)
    p_best.fit(xtrain, ytrain)

    # get ccr values on test data
    df_ccr = get_ccr_df(p_best, xtest, ytest)
    ccr_fig = make_ccr_plot(df_ccr)

    # clean up our pipeline memory
    shutil.rmtree(cachedir)

    return p_best, df_scores, df_ccr, ccr_fig
def setup_preprocessing(tokenizer, use_idf, ngram):
    memory = joblib.Memory(cache_dir, verbose=0)
    clf = ExtraTreesClassifier(max_depth=10, n_estimators=2000,
                               random_state=RANDOM_SEED, n_jobs=N_CORES)
    return Pipeline(
        [
            ("vect", CountVectorizer(tokenizer=tokenizer, ngram_range=ngram)),
            ("tfidf", TfidfTransformer(use_idf=use_idf)),
            ("selectfrommodel", SelectFromModel(clf)),
        ],
        memory=memory,
    )
def __init__(self, cachedir='tmp', verbose=1):
    """
    Parameters
    ----------
    cachedir: str
        Name of directory where objects are stored in files.
    verbose: bool, int
        Let joblib and this class speak when storing files to disk.
    """
    import joblib
    self.memory = joblib.Memory(cachedir=cachedir, verbose=verbose)
    self.verbose = verbose
    self.retrieve = self.memory.cache(self.retrieve, ignore=['data'])
    self.save = self.retrieve
def cache_to_disk(fn=None, *, cachedir=_DEFAULT_FN_CACHE_PATH, **kwargs):
    """Cache this function to disk, using joblib."""
    mem = joblib.Memory(cachedir=cachedir, verbose=0, **kwargs)

    # bare decorator
    if fn:
        return mem.cache(fn)
    # decorator called with kwargs
    else:
        def cache_to_disk_decorator(fn):
            return mem.cache(fn)
        return cache_to_disk_decorator
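# Usage sketch (hypothetical functions and path): cache_to_disk above works
# both as a bare decorator and when called with keyword arguments.
@cache_to_disk
def tokenize(text):
    return text.split()

@cache_to_disk(cachedir='/tmp/fn-cache')
def double(x):
    return 2 * x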
def __init__(self, steps: list = [], cache_dir=None):
    '''Helper function that just calls build_graph_recur with an empty graph.

    Params
    ------
    steps: list
        a list of ModuleSet instances
    cache_dir: str, default=None
        The directory to use as data store by joblib. If None, won't do
        caching.

    Returns
    -------
    G: nx.DiGraph()
    '''
    self.steps = steps

    # set up the cache
    self.memory = joblib.Memory(location=cache_dir)
def solver_scaled(I, dt, C, T):
    """
    Solve 1D wave equation in dimensionless form.
    """
    # Make a hash of the arguments
    import inspect, hashlib
    data = inspect.getsource(I) + '_' + str(dt) + '_' + \
           str(C) + '_' + str(T)
    # Not foolproof: if x0 changes value, the source of I is still the same...
    hashed_input = hashlib.sha1(data.encode()).hexdigest()

    cachedir = 'tmp_%s' % hashed_input
    is_computed = os.path.isdir(cachedir)

    import joblib
    memory = joblib.Memory(cachedir=cachedir, verbose=1)

    def retrieve(name, data=None):
        print('joblib save of', name)
        return data

    retrieve = memory.cache(retrieve, ignore=['data'])
    save = retrieve

    def action(u, x, t, n):
        if n == 0:
            save('x', x)
            save('t', t)
        save('u%d' % n, u)

    if is_computed:
        print('No need to compute the numerical solution')
        return retrieve
    else:
        print('Computing the numerical solution')
        solver_unscaled(
            I=I, V=0, f=0, c=1, L=1, dt=dt, C=C, T=T,
            user_action=action)
        return retrieve
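# Usage sketch: solver_scaled above returns the memoized `retrieve`, so a
# second call with the same arguments can replay the stored arrays without
# recomputation. The initial-condition lambda is hypothetical; it assumes
# solver_unscaled is defined in this module.
retrieve = solver_scaled(I=lambda x: 0, dt=0.01, C=0.9, T=1)
x = retrieve('x')
u0 = retrieve('u0')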
def slice_snapshot(
    snapshot: md.Trajectory,
    project_dir: str,
    run: int,
    cache_dir: Optional[str],
) -> Dict[str, md.Trajectory]:
    """
    Slice snapshot to specified state in-place

    .. TODO ::

       The htf.npz file is very slow to load.
       Replace this with a JSON file containing relevant ligand indices only.

    Parameters
    ----------
    snapshot : mdtraj.Trajectory
        Snapshot to slice
    project_dir : str
        Path to project directory (e.g. '/home/server/server2/projects/13422')
    run : int
        Run (e.g. '0')
    cache_dir : str or None
        If specified, cache relevant parts of "htf.npz" file in a local
        directory of this name

    Returns
    -------
    sliced_snapshot : dict of str : mdtraj.Trajectory
        sliced_snapshot[x] where x is one of
        ['protein', 'old_ligand', 'new_ligand', 'old_complex', 'new_complex']
    """
    get_stored_atom_indices_cached = (
        get_stored_atom_indices if cache_dir is None
        else joblib.Memory(cachedir=cache_dir, verbose=0).cache(
            get_stored_atom_indices))
    stored_atom_indices = get_stored_atom_indices_cached(project_dir, run)

    sliced_snapshot = dict()
    for key, atom_indices in stored_atom_indices.items():
        sliced_snapshot[key] = md.Trajectory(
            snapshot.xyz[:, atom_indices, :],
            snapshot.topology.subset(atom_indices))
    return sliced_snapshot