def test_new_img_like_side_effect():
    img1 = Nifti1Image(np.ones((2, 2, 2, 2)), affine=np.eye(4))
    hash1 = joblib.hash(img1)
    new_img_like(img1, np.ones((2, 2, 2, 2)), img1.affine.copy(),
                 copy_header=True)
    hash2 = joblib.hash(img1)
    assert_equal(hash1, hash2)
def test_joblib_cache():
    if not LooseVersion(nibabel.__version__) > LooseVersion('1.1.0'):
        # Old nibabel does not pickle
        raise SkipTest
    from sklearn.externals.joblib import hash, Memory
    mask = np.zeros((40, 40, 40))
    mask[20, 20, 20] = 1
    mask_img = Nifti1Image(mask, np.eye(4))

    with testing.write_tmp_imgs(mask_img, create_files=True) as filename:
        masker = NiftiMasker(mask_img=filename)
        masker.fit()
        mask_hash = hash(masker.mask_img_)
        masker.mask_img_.get_data()
        assert_true(mask_hash == hash(masker.mask_img_))

    # Test a tricky issue with memmapped joblib.memory that makes
    # imgs returned by inverse_transform impossible to save
    cachedir = mkdtemp()
    try:
        masker.memory = Memory(cachedir=cachedir, mmap_mode='r', verbose=0)
        X = masker.transform(mask_img)
        # inverse_transform a first time, so that the result is cached
        out_img = masker.inverse_transform(X)
        out_img = masker.inverse_transform(X)
        out_img.to_filename(os.path.join(cachedir, 'test.nii'))
    finally:
        shutil.rmtree(cachedir, ignore_errors=True)
def hash_X_y(X, y, n_samples=10, n_features=5):
    """Compute hash of the input arrays.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        The ``X`` array.

    y : ndarray, shape (n_samples)
        The ``y`` array.

    n_samples : int, optional
        The number of samples to use to compute the hash. Default is 10.

    n_features : int, optional
        The number of features to use to compute the hash. Default is 5.

    Returns
    -------
    X_hash : str
        Hash identifier of the ``X`` matrix.

    y_hash : str
        Hash identifier of the ``y`` matrix.
    """
    row_idx = slice(None, None, max(1, X.shape[0] // n_samples))
    col_idx = slice(None, None, max(1, X.shape[1] // n_features))
    return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx])
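A minimal usage sketch of the strided-subsampling ``hash_X_y`` above, assuming only ``numpy`` and ``joblib`` are installed; the array values are made up for illustration:

import joblib
import numpy as np

X = np.arange(200.0).reshape(20, 10)
y = np.arange(20)
# The hash covers only a strided subsample, so it is cheap to compute,
# and joblib.hash is content-based: equal data always yields the same
# (X_hash, y_hash) pair, regardless of object identity.
assert hash_X_y(X, y) == hash_X_y(X.copy(), y.copy())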
def test_new_img_like_side_effect():
    img1 = Nifti1Image(np.ones((2, 2, 2, 2)), affine=np.eye(4))
    hash1 = joblib.hash(img1)
    # get_affine() is deprecated in recent nibabel in favor of the
    # ``affine`` attribute (see the variant of this test above)
    new_img_like(img1, np.ones((2, 2, 2, 2)), img1.get_affine().copy(),
                 copy_header=True)
    hash2 = joblib.hash(img1)
    assert_equal(hash1, hash2)
def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()

    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before
    # fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash
        # function that introspects recursively any subobjects to
        # compute a checksum. The only exception to this rule of
        # immutable constructor parameters is possible RandomState
        # instances, but in this check we explicitly fixed the
        # random_state params recursively to be integer seeds.
        assert_equal(hash(new_value), hash(original_value),
                     "Estimator %s should not change or mutate "
                     "the parameter %s from %s to %s during fit."
                     % (name, param_name, original_value, new_value))
def test_check_estimator_clones():
    # check that check_estimator doesn't modify the estimator it receives
    from sklearn.datasets import load_iris
    iris = load_iris()

    for Estimator in [GaussianMixture, LinearRegression,
                      RandomForestClassifier, NMF, SGDClassifier,
                      MiniBatchKMeans]:
        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
        set_checking_parameters(est)
        set_random_state(est)
        # without fitting
        old_hash = joblib.hash(est)
        check_estimator(est)
        assert_equal(old_hash, joblib.hash(est))

        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
        set_checking_parameters(est)
        set_random_state(est)
        # with fitting
        est.fit(iris.data + 10, iris.target)
        old_hash = joblib.hash(est)
        check_estimator(est)
        assert_equal(old_hash, joblib.hash(est))
def _check_integrity_atlas(atlas):
    # check that the folder exists
    atlas_directory = os.path.abspath(
        os.path.join('.', 'data', 'fmri', atlas))
    if os.path.isdir(atlas_directory):
        # compute the hash of the data currently present on disk
        filenames_atlas_current = np.array(
            glob.glob(os.path.join(atlas_directory, '*', '*', '*')),
            dtype=object)
        filenames_atlas_current.sort()
        current_hash = joblib.hash(filenames_atlas_current)

        # compute the expected hash from the data set which we provide
        filenames_atlas_expected = pd.read_csv(
            os.path.abspath(os.path.join('.', 'data', 'fmri_filename.csv')),
            index_col=0)[atlas].values
        for idx in range(filenames_atlas_expected.size):
            filenames_atlas_expected[idx] = os.path.abspath(
                filenames_atlas_expected[idx])
        filenames_atlas_expected.sort()
        expected_hash = joblib.hash(filenames_atlas_expected)

        if current_hash == expected_hash:
            return

        shutil.rmtree(atlas_directory)

    _download_fmri_data(atlas)
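A stripped-down sketch of the same integrity idea, assuming nothing beyond the standard library plus joblib; ``directory_fingerprint`` is an illustrative helper, not part of the original code:

import glob
import os
import joblib

def directory_fingerprint(directory):
    # A stable fingerprint of a directory tree: the joblib hash of its
    # sorted file listing. Sorting removes glob ordering differences
    # across platforms and filesystems.
    filenames = sorted(glob.glob(os.path.join(directory, '**', '*'),
                                 recursive=True))
    return joblib.hash(filenames)

# re-download (or re-generate) the data only when the fingerprint drifts
# from a known-good digest, e.g.:
# if directory_fingerprint('data/fmri/some_atlas') != expected_hash: ...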
def test_hash_X_y():
    rng = check_random_state(0)
    X = rng.randn(2000, 20)
    y = np.array([0] * 500 + [1] * 1500)
    assert hash_X_y(X, y, 10, 10) == (joblib.hash(X[::200, ::2]),
                                      joblib.hash(y[::200]))

    X = rng.randn(5, 2)
    y = np.array([0] * 2 + [1] * 3)
    # all data will be used in this case
    assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
def test_hash_X_y_pandas():
    pd = pytest.importorskip("pandas")
    rng = check_random_state(0)
    X = pd.DataFrame(rng.randn(2000, 20))
    y = pd.Series([0] * 500 + [1] * 1500)
    assert hash_X_y(X, y, 10, 10) == (joblib.hash(X.iloc[::200, ::2]),
                                      joblib.hash(y.iloc[::200]))

    X = pd.DataFrame(rng.randn(5, 2))
    y = pd.Series([0] * 2 + [1] * 3)
    # all data will be used in this case
    assert hash_X_y(X, y) == (joblib.hash(X), joblib.hash(y))
def test_joblib_cache():
    from sklearn.externals.joblib import hash
    # Dummy mask
    mask = np.zeros((40, 40, 40))
    mask[20, 20, 20] = 1
    mask_img = Nifti1Image(mask, np.eye(4))

    with write_tmp_imgs(mask_img, create_files=True) as filename:
        masker = MultiNiftiMasker(mask_img=filename)
        masker.fit()
        mask_hash = hash(masker.mask_img_)
        masker.mask_img_.get_data()
        assert_true(mask_hash == hash(masker.mask_img_))

        # allow "filename" to be deleted on Windows
        del masker
def __init__(self, tmp_dir, array):
    # the joblib hash of the array doubles as its on-disk filename
    h = hash(array)
    hashs[h] = array
    output_filename = os.path.join(tmp_dir, h)
    # serialize concurrent writers; only the first one dumps the array
    with lockfile.LockFile(output_filename):
        if not os.path.exists(output_filename):
            dump(array, output_filename)
    self.filename = output_filename
def test_joblib_cache():
    if not LooseVersion(nibabel.__version__) > LooseVersion('1.1.0'):
        # Old nibabel does not pickle
        raise SkipTest
    from sklearn.externals.joblib import hash
    # Dummy mask
    mask = np.zeros((40, 40, 40))
    mask[20, 20, 20] = 1
    mask_img = Nifti1Image(mask, np.eye(4))

    with write_tmp_imgs(mask_img, create_files=True) as filename:
        masker = MultiNiftiMasker(mask_img=filename)
        masker.fit()
        mask_hash = hash(masker.mask_img_)
        masker.mask_img_.get_data()
        assert_true(mask_hash == hash(masker.mask_img_))
def test_joblib_cache():
    if not LooseVersion(nibabel.__version__) > LooseVersion('1.1.0'):
        # Old nibabel does not pickle
        raise SkipTest
    from sklearn.externals.joblib import hash
    # Dummy mask
    mask = np.zeros((40, 40, 40))
    mask[20, 20, 20] = 1
    mask_img = Nifti1Image(mask, np.eye(4))

    with testing.write_tmp_imgs(mask_img, create_files=True) as filename:
        masker = MultiNiftiMasker(mask_img=filename)
        masker.fit()
        mask_hash = hash(masker.mask_img_)
        masker.mask_img_.get_data()
        assert_true(mask_hash == hash(masker.mask_img_))
def evaluate_one(model_class, parameters, cv_split):
    split_idx, (X_train, X_val, y_train, y_val) = cv_split
    model = model_class(**parameters).fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    validation_score = model.score(X_val, y_val)
    results = {
        'train_score': train_score,
        'val_score': validation_score,
        'parameters': parameters,
        'parameters_hash': hash(parameters),
    }
    return results
def hash_X_y(X, y, n_samples=1000):
    """Compute hash of the input arrays.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        The ``X`` array.

    y : ndarray, shape (n_samples)
        The ``y`` array.

    n_samples : int, optional
        The number of randomly drawn entries used to compute the hash.
        Default is 1000.

    Returns
    -------
    X_hash : str
        Hash identifier of the ``X`` matrix.

    y_hash : str
        Hash identifier of the ``y`` matrix.
    """
    rng = np.random.RandomState(0)
    # fancy indexing with a fixed seed picks the same n_samples random
    # (row, column) entries on every call, so equal data hashes equally
    row_idx = rng.randint(X.shape[0], size=n_samples)
    col_idx = rng.randint(X.shape[1], size=n_samples)
    return joblib.hash(X[row_idx, col_idx]), joblib.hash(y[row_idx])
def test_copy_img_side_effect():
    img1 = Nifti1Image(np.ones((2, 2, 2, 2)), affine=np.eye(4))
    hash1 = joblib.hash(img1)
    niimg.copy_img(img1)
    hash2 = joblib.hash(img1)
    assert_equal(hash1, hash2)
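The side-effect tests above all follow the same pattern: hash the input, run the operation, hash again. A minimal self-contained sketch of that pattern on a plain NumPy array, with a hypothetical ``normalize`` helper standing in for the function under test:

import joblib
import numpy as np

def normalize(arr):
    # hypothetical function under test; a correct implementation
    # returns a new array instead of mutating its input
    return arr / np.abs(arr).max()

arr = np.arange(1.0, 9.0).reshape(2, 4)
before = joblib.hash(arr)
normalize(arr)
# if normalize had mutated arr in place, the two digests would differ
assert joblib.hash(arr) == before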
joblib.dump((X_vali, y_vali, qid_vali), VALI_DATA)

if not os.path.exists(GRID_JOBS_FOLDER):
    os.makedirs(GRID_JOBS_FOLDER)

params = {
    'max_features': [10, 20, 50, 100],
    'max_depth': [2, 3, 4, 5],
    'subsample': [0.5, 0.8, 1.0],
    'loss': ['ls', 'huber', 'quantile'],
    'learning_rate': [0.05, 0.1, 0.5],
}

for i, param in enumerate(ParameterGrid(params)):
    params_description = json.dumps(param)
    # the hash of the JSON-serialized parameters gives each grid point
    # a short, stable job folder name
    job_id = joblib.hash(params_description)
    job_folder = GRID_JOBS_FOLDER + '/' + job_id
    if not os.path.exists(job_folder):
        os.makedirs(job_folder)
    with open(job_folder + '/parameters.json', 'wb') as f:
        f.write(params_description.encode('utf-8'))
    data_filenames = {'train': TRAIN_SAMPLE_DATA,
                      'validation': VALI_DATA}
    with open(job_folder + '/data.json', 'wb') as f:
        f.write(json.dumps(data_filenames).encode('utf-8'))
    cmd = 'qsub -V -cwd letor_gridpoint.py {}'.format(job_folder)
    os.system(cmd)
    # if i > 100:
    #     break
def __hash__(self):
    if self._hashvalue is None:
        # joblib.hash returns a hex digest string; convert it to an
        # integer once and cache it for subsequent __hash__ calls
        self._hashvalue = int(joblib.hash(self._hashkey), 16)
    return self._hashvalue
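A self-contained sketch of the lazy ``__hash__`` pattern above; the ``CacheKey`` class name and ``payload`` argument are illustrative, not from the original code:

import joblib
import numpy as np

class CacheKey(object):
    """Hashable wrapper keyed on the joblib hash of an arbitrary payload."""

    def __init__(self, payload):
        self._hashkey = payload
        self._hashvalue = None  # computed lazily on first use

    def __hash__(self):
        if self._hashvalue is None:
            self._hashvalue = int(joblib.hash(self._hashkey), 16)
        return self._hashvalue

    def __eq__(self, other):
        # objects used as dict keys must define __eq__ consistently
        # with __hash__; here equality is also digest-based
        return isinstance(other, CacheKey) and hash(self) == hash(other)

# makes otherwise-unhashable payloads (e.g. arrays) usable as dict keys
key = CacheKey(np.arange(10))
cache = {key: 'expensive result'}
assert cache[CacheKey(np.arange(10))] == 'expensive result'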