def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100,
                               n_informative=3, random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=distributions.expon(scale=10),
                  gamma=distributions.expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter,
                                param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores),
                            cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())),
                     list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_,
                                     key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores
                        if s.mean_validation_score == best_score]
    assert_true(search.best_params_ in tied_best_params,
                "best_params_={0} is not part of the"
                " tied best models: {1}".format(search.best_params_,
                                                tied_best_params))
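# A note on the iid=False assertion above: with iid=False the
# mean_validation_score is the plain mean of the per-fold scores; with
# iid=True it is weighted by fold size. A small illustration with made-up
# fold scores and sizes (all numbers below are hypothetical):
import numpy as np

fold_scores = np.array([0.80, 0.90, 0.70])
fold_sizes = np.array([50, 30, 20])
unweighted = fold_scores.mean()                         # 0.80, the iid=False behaviour
weighted = np.average(fold_scores, weights=fold_sizes)  # 0.81, the iid=True behaviour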
def get_param_space():
    param_space = {}
    param_types = {}
    param_space["svc"] = {"C": expon(scale=100),
                          "gamma": expon(scale=0.1),
                          "probability": [True],
                          "kernel": ["linear"]}
    param_types["svc"] = {"C": "real",
                          "gamma": "real",
                          "probability": "int",
                          "kernel": "categorical"}
    param_space["rfc"] = {"n_estimators": randint(50, 600),
                          "max_features": [1, 2]}
    param_types["rfc"] = {"n_estimators": "int",
                          "max_features": "int"}
    # clfs maps string-names to a cloneable clf instance.
    clfs = {"svc": SVC(), "rfc": RFC()}
    return (clfs, param_space, param_types)
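# The space above mixes scipy frozen distributions (sampled via .rvs) with
# plain lists (sampled uniformly at random). sklearn's ParameterSampler
# accepts both kinds of entry; a quick illustrative draw from the "svc"
# sub-space (printed values are examples, not fixed outputs):
from scipy.stats import expon
from sklearn.model_selection import ParameterSampler

svc_space = {"C": expon(scale=100), "gamma": expon(scale=0.1),
             "probability": [True], "kernel": ["linear"]}
for candidate in ParameterSampler(svc_space, n_iter=3, random_state=0):
    print(candidate)  # e.g. {'C': 103.6, 'gamma': 0.07, 'kernel': 'linear', ...}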
def test_randomized_search():
    # very basic smoke test
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)

    params = dict(C=distributions.expon())
    search = RandomizedSearchCV(LinearSVC(), param_distributions=params)
    search.fit(X, y)

    assert_equal(len(search.cv_scores_), 10)
def ExpPrior(rate=1.):
    """Exponential prior.

    rate : exponential rate parameter (inverse scale)
    """
    return Prior(distributions.expon(scale=1. / rate))
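# scipy.stats.expon is parameterized by scale = 1 / rate, so ExpPrior(rate=2.)
# wraps a distribution with mean 0.5. A minimal standalone sanity check of
# that correspondence, assuming only scipy:
from scipy.stats import expon

rate = 2.0
dist_check = expon(scale=1. / rate)
assert abs(dist_check.mean() - 1. / rate) < 1e-12  # mean of expon == scale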
def test_lstats_expon(self):
    "Try lstats on the exponential distribution"
    _exp = [1., 0.5, 0.33333333, 0.16666667, 0.1,
            0.06666667, 0.04761905, 0.03571429, 0.02777778, 0.02222222,
            0.01818182, 0.01515152, 0.01282051, 0.01098901, 0.00952381,
            0.00833333, 0.00735294, 0.00653595, 0.00584795, 0.00526316]
    assert_almost_equal(dist.expon.lstats(20), np.array(_exp))
    _exp = (4., 0.5, 0.33333333, 0.16666667, 0.1)
    assert_almost_equal(dist.expon.lstats(5, loc=3., scale=1.),
                        np.array(_exp))
    assert_almost_equal(dist.expon(3., 1.).lstats(5), np.array(_exp))
def LCB(self, n_sample, gpr=None, Xd=None):
    gpr = self._get_gpr(gpr)
    if Xd is None:
        Xd = self.Xd
    preds = gpr.predict(Xd, return_std=True)
    preds = pd.DataFrame({"prediction": preds[0], "std_dev": preds[1]})
    # n.b. lambda is a keyword so change vector of values to alpha
    alpha = ParameterSampler({"alpha": expon()}, n_iter=n_sample)
    lcb = pd.DataFrame({"lcb_{}".format(i):
                        preds.prediction - (li["alpha"] * preds.std_dev)
                        for i, li in enumerate(alpha)})
    # TODO: include X in lcb, to look up parameters from selected values
    return lcb
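# A minimal, self-contained sketch of the lower-confidence-bound idea used
# above: lcb = mu - alpha * sigma, with the exploration weight alpha drawn
# from an exponential distribution via ParameterSampler. The gpr/Xd names in
# LCB come from its surrounding class; everything below is illustrative.
import numpy as np
import pandas as pd
from scipy.stats import expon
from sklearn.model_selection import ParameterSampler

mu = np.array([0.2, 0.5, 0.1])      # hypothetical GP predictive means
sigma = np.array([0.05, 0.2, 0.3])  # hypothetical GP predictive std devs
alphas = ParameterSampler({"alpha": expon()}, n_iter=4, random_state=0)
lcb = pd.DataFrame({"lcb_{}".format(i): mu - s["alpha"] * sigma
                    for i, s in enumerate(alphas)})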
def create_testing_points_regular_transformed():
    """Create array of testing points from regular wind turbine arrays.

    Discard any test points where turbines are not in the correct order
    and any test points where turbines are closer than 2D.

    Returns
    -------
    X_test: ndarray of shape (variable, 6)
        array containing valid test points
    X_test_tran: ndarray of shape (variable, 6)
        array containing valid transformed test points
    y_test: ndarray of shape (variable,)
        value of CT* at test points
    """
    X_test_real = regular_array_monte_carlo(20000)
    y_test = np.zeros(len(X_test_real))
    for i in range(len(X_test_real)):
        y_test[i] = simulator6d_halved(X_test_real[i, :])
    X_test = X_test_real
    X_test_tran = np.zeros((20000, 6))
    X_test_tran[:, 0] = expon(scale=10).cdf(X_test_real[:, 0])
    X_test_tran[:, 2] = expon(scale=10).cdf(X_test_real[:, 2])
    X_test_tran[:, 4] = expon(scale=10).cdf(X_test_real[:, 4])
    X_test_tran[:, 1] = norm(0, 2.5).cdf(X_test_real[:, 1])
    X_test_tran[:, 3] = norm(0, 2.5).cdf(X_test_real[:, 3])
    X_test_tran[:, 5] = norm(0, 2.5).cdf(X_test_real[:, 5])
    np.savetxt('regular_arrays_no_rot_transformed.txt', X_test_tran)
    return X_test, X_test_tran, y_test
def test_multimodel():
    X = np.array([[1, 1, 1], [0, 0, 0], [-1, -1, -1]] * 100)
    y = np.array([0, 0, 1] * 100)
    models = [("rf", RandomForestClassifier(n_estimators=10), {
                  "max_depth": [5, 10],
                  "min_samples_split": range(2, 10)
              }),
              ("lr0", LogisticRegression(multi_class="auto", solver="liblinear"), {
                  "C": [0.1, 1.0, 10]
              }),
              ("lr1", LogisticRegression(multi_class="auto", solver="liblinear"), {
                  "fit_intercept": [True, False],
                  "C": expon()
              }),
              ("nb", GaussianNB(), {})]
    clf = DistMultiModelSearch(models, n=2)
    clf.fit(X, y)
    preds = clf.predict(X[:3])
    assert np.allclose(preds, np.array([0, 0, 1]))
def create_training_points_regular_transformed(n_target, noise_level, cand_points):
    """Create array of training points from regular turbine arrays.

    Returns
    -------
    X_train: ndarray of shape (variable, 6)
        array containing valid training points
    X_train_tran: ndarray of shape (variable, 6)
        array containing valid transformed training points
    y_train: ndarray of shape (variable,)
        value of CT* at test points
    n_train: int
        number of valid training points
    """
    cand_points_tran = np.zeros((len(cand_points), 6))
    cand_points_tran[:, 0] = expon(scale=10).cdf(cand_points[:, 0])
    cand_points_tran[:, 2] = expon(scale=10).cdf(cand_points[:, 2])
    cand_points_tran[:, 4] = expon(scale=10).cdf(cand_points[:, 4])
    cand_points_tran[:, 1] = norm(0, 2.5).cdf(cand_points[:, 1])
    cand_points_tran[:, 3] = norm(0, 2.5).cdf(cand_points[:, 3])
    cand_points_tran[:, 5] = norm(0, 2.5).cdf(cand_points[:, 5])
    X_train_tran = sb.select_greedy_maximin(cand_points_tran, n_target)
    X_train = np.zeros((len(X_train_tran), 6))
    X_train[:, 0] = expon(scale=10).ppf(X_train_tran[:, 0])
    X_train[:, 1] = norm(0, 2.5).ppf(X_train_tran[:, 1])
    X_train[:, 2] = expon(scale=10).ppf(X_train_tran[:, 2])
    X_train[:, 3] = norm(0, 2.5).ppf(X_train_tran[:, 3])
    X_train[:, 4] = expon(scale=10).ppf(X_train_tran[:, 4])
    X_train[:, 5] = norm(0, 2.5).ppf(X_train_tran[:, 5])
    y_train = np.zeros(len(X_train))
    for i in range(len(X_train)):
        y_train[i] = simulator6d_halved(X_train[i, :], noise_level)
    n_train = n_target
    return X_train, X_train_tran, y_train, n_train
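# Both array-transform functions above rely on the probability integral
# transform: cdf() maps a physical coordinate into [0, 1] and ppf() (the
# inverse CDF) maps it back, so maximin selection can happen on the unit
# hypercube. A minimal round-trip check, assuming only scipy and numpy:
import numpy as np
from scipy.stats import expon, norm

x = np.array([1.0, 5.0, 25.0])
u = expon(scale=10).cdf(x)                     # physical -> unit interval
assert np.allclose(expon(scale=10).ppf(u), x)  # unit interval -> physical
assert np.allclose(norm(0, 2.5).ppf(norm(0, 2.5).cdf(x)), x)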
import numpy as np
from scipy.stats.distributions import expon, gamma, rayleigh, norm, t, uniform
from posteriori import between


def RMSE(predicted, expected):
    return np.linalg.norm(predicted - expected) / np.sqrt(len(predicted))


distributions = [
    norm(),
    t(df=5),
    gamma(a=2),
    gamma(a=4),
    gamma(a=8),
    expon(scale=1/0.5),
    expon(scale=1/1),
    expon(scale=1/2),
    rayleigh(),
    uniform(),
]

errors = []
for distribution in distributions:
    parameters = [k + '=' + str(v) for k, v in distribution.kwds.items()]
    name = "{name}({parameters})".format(
        name=distribution.dist.name,
        parameters=', '.join(parameters)
    )
    l, lm, lt, m, ut, um, u = distribution.ppf(
        [0.05, 0.2625, 0.342, 0.5, 0.658, 0.7375, 0.95])
    ]
    test_data = np.concatenate(test_window, axis=1)
    is_normal = reduce(np.logical_and, is_normal_window)
    print('test data size :', test_data.shape)
    if args.debug >= 4:
        print('window\n', window)
        print('data\n', data)
    pp_end_time = time.time()

    print("start learning...")
    params = ParameterSampler(
        {'nu': expon(scale=args.nu),
         'gamma': expon(scale=args.gamma)},
        n_iter=1000)
    for param in params:
        nu = param['nu']
        gamma = param['gamma']
        print('nu =', nu)
        print('gamma =', gamma)
        if args.debug >= 1:
            print('kernel =', args.kernel)
            print('data :', data.shape)
        learn_start_time = time.time()
" scale,"\ " b, c, d)") from numpy.ma.testutils import * import scipy.special as special import scipy.integrate as integrate if __name__ == "__main__": if 1: print "normal" print "Generic:", [dist.norm()._lmomg(_) for _ in (1, 2, 3, 4)] print "Standard:", dist.norm().lmoments(4) if 1: print "expon" print "Generic:", [dist.expon()._lmomg(_) for _ in (1, 2, 3, 4)] print "Standard:", dist.expon().lmoments(4) if 1: reorg = lambda (m, s, g): (g, m, s) lmoms = (lmom_1, lmom_2, tau_3) = (0., 1., 0.) params = extradist.pearson3.lmparams(lmoms) assert_almost_equal(np.array(params), (0.000000, 1.772454, 0.000000)) assert_almost_equal( pearson3(*reorg(params)).lmoments(3), np.array(lmoms), ) # lmoms = (lmom_1, lmom_2, tau_3) = (0., 1., 0.5) params = extradist.pearson3.lmparams(lmoms) assert_almost_equal(np.array(params), (0.000000, 2.299931, 3.079345), 6)
def exp(lam=1.0):
    return dists.expon(scale=1. / lam)
    'seed': 1,
}
booster_params = {
    'eta': 0.1,             # default=0.3
    'gamma': 0.,            # default=0.; larger => more conservative
    'max_depth': 6,         # default=6
    'min_child_weight': 1,  # default=1; larger => more conservative
    'subsample': 1.,        # default=1.; proportion of points to sample each round
    'lambda': 1,            # default=1, L2 regularization
    'alpha': 0,             # default=0, L1 regularization
}

# Parameter space to search over
param_dist = {
    'eta': [0.1],
    'gamma': expon(),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'lambda': expon(),
    'alpha': expon()
}
sampler = ParameterSampler(param_dist, n_iter=32, random_state=1)

# Perform the search
best_score = np.inf
best_params = {**general_params, **booster_params}

# Repeatedly sample parameters from the above distributions
print('Testing hyperparameters...')
for point in tqdm(sampler):
def D(self):
    return D.expon(self._lambda)
def sample_pspace(model, param_list=None, bounds=None, samples=100, seed=None):
    """
    A DataFrame where each row represents a location in the parameter
    space, locations distributed to exercise the full range of values
    that each parameter can take on.

    This is useful for quick and dirty application of tests to a bunch
    of locations in the sample space. Kind-of a fuzz-testing for the
    model.

    Uses latin hypercube sampling, with random values within the sample
    bins. The LHS sampler shuffles the bins each time, so a subsequent
    call will yield a different sample from the parameter space.

    When a variable has both upper and lower bounds, use a uniform
    sample between those bounds.

    When a variable has only one bound, use an exponential distribution
    with the scale set to be the difference between the bound and the
    current model value (1 if they are the same).

    When the variable has neither bound, use a normal distribution
    centered on the current model value, with scale equal to the
    absolute value of the model value (1 if that magnitude is 0).

    Parameters
    ----------
    model: pysd.Model object

    param_list: None or list of strings
        The real names of parameters to include in the explored parameter
        space. If None, uses all of the constants in the model except
        TIME STEP, INITIAL TIME, etc.

    bounds: DataFrame, string filename, or None
        A range test matrix as used for bounds checking. If None, creates
        one from the model. These bounds can also place artificial limits
        on the parameter space you want to explore, even if the theoretical
        bounds on the variable are infinite.

    samples: int
        How many samples (rows) to include in the result.

    seed: int, optional
        Seed for the numpy random number generator, for reproducible draws.

    Returns
    -------
    lhs : pandas DataFrame
        distribution-weighted latin hypercube samples

    Note
    ----
    Executes the model by 1 time-step to get the current value of parameters.
""" if param_list is None: doc = model.doc() param_list = sorted( list( set(doc[doc['Type'] == 'constant']['Real Name']) - {'FINAL TIME', 'INITIAL TIME', 'TIME STEP', 'TIME STEP'})) if isinstance(bounds, _pd.DataFrame): bounds = bounds.set_index('Real Name') elif bounds is None: bounds = create_bounds_test_matrix(model).set_index('Real Name') elif isinstance(bounds, str): if bounds.split('.')[-1] in ['xls', 'xlsx']: bounds = _pd.read_excel(bounds, sheetname='Bounds', index_col='Real Name') elif bounds.split('.')[-1] == 'csv': bounds = _pd.read_csv(bounds, index_col='Real Name', encoding='UTF-8') elif bounds.split('.')[-1] == 'tab': bounds = _pd.read_csv(bounds, sep='\t', index_col='Real Name', encoding='UTF-8') else: raise ValueError('Unknown file type: bounds') else: raise ValueError('Unknown type: bounds') if seed is not None: _np.random.seed(seed) unit_lhs = _pd.DataFrame(_pyDOE.lhs(n=len(param_list), samples=samples), columns=param_list) # raw latin hypercube sample res = model.run(return_timestamps=[model.components.initial_time()]) lhs = _pd.DataFrame(index=unit_lhs.index) for param in param_list: lower, upper = bounds[['Min', 'Max']].loc[param] value = res[param].iloc[0] if lower == upper: lhs[param] = lower elif _np.isfinite(lower) and _np.isfinite( upper): # np.isfinite(0)==True scale = upper - lower lhs[param] = _dist.uniform(lower, scale).ppf(unit_lhs[param]) elif _np.isfinite(lower) and _np.isinf(upper): if lower == value: scale = 1 else: scale = value - lower lhs[param] = _dist.expon(lower, scale).ppf(unit_lhs[param]) elif _np.isinf(lower) and _np.isfinite( upper): # np.isinf(-np.inf)==True if upper == value: scale = 1 else: scale = upper - value lhs[param] = upper - _dist.expon(0, scale).ppf(unit_lhs[param]) elif _np.isinf(lower) and _np.isinf(upper): # np.isinf(-np.inf)==True if value == 0: scale = 1 else: scale = abs(value) lhs[param] = _dist.norm(value, scale).ppf(unit_lhs[param]) else: raise ValueError('Problem with lower: %s or upper: %s bounds' % (lower, upper)) return lhs
from numpy.ma.testutils import *
import scipy.special as special
import scipy.integrate as integrate

if __name__ == "__main__":
    if 1:
        print("normal")
        print("Generic:", [dist.norm()._lmomg(_) for _ in (1, 2, 3, 4)])
        print("Standard:", dist.norm().lmoments(4))
    if 1:
        print("expon")
        print("Generic:", [dist.expon()._lmomg(_) for _ in (1, 2, 3, 4)])
        print("Standard:", dist.expon().lmoments(4))
    if 1:
        # tuple-unpacking lambdas are Python 2 only; unpack by index instead
        reorg = lambda p: (p[2], p[0], p[1])  # (m, s, g) -> (g, m, s)
        lmoms = (lmom_1, lmom_2, tau_3) = (0., 1., 0.)
        params = extradist.pearson3.lmparams(lmoms)
        assert_almost_equal(np.array(params), (0.000000, 1.772454, 0.000000))
        assert_almost_equal(pearson3(*reorg(params)).lmoments(3),
                            np.array(lmoms))
        #
        lmoms = (lmom_1, lmom_2, tau_3) = (0., 1., 0.5)
        params = extradist.pearson3.lmparams(lmoms)
        assert_almost_equal(np.array(params), (0.000000, 2.299931, 3.079345), 6)
        assert_almost_equal(pearson3(*reorg(params)).lmoments(3),
                            np.array(lmoms), 5)
def __init__(self, _lambda=1):
    self._lambda = _lambda
    self.D = D.expon(_lambda)
""" if param_list is None: doc = model.doc() param_list = sorted(list(set(doc[doc['Type'] == 'constant']['Real Name']) - {'FINAL TIME', 'INITIAL TIME', 'TIME STEP', 'TIME STEP'})) if isinstance(bounds, _pd.DataFrame): bounds = bounds.set_index('Real Name') elif bounds is None: bounds = create_bounds_test_matrix(model).set_index('Real Name') elif isinstance(bounds, str): if bounds.split('.')[-1] in ['xls', 'xlsx']: bounds = _pd.read_excel(bounds, sheetname='Bounds', index_col='Real Name') elif bounds.split('.')[-1] == 'csv': bounds = _pd.read_csv(bounds, index_col='Real Name', encoding='UTF-8') elif bounds.split('.')[-1] == 'tab': bounds = _pd.read_csv(bounds, sep='\t', index_col='Real Name', encoding='UTF-8') else: raise ValueError('Unknown file type: bounds') else: raise ValueError('Unknown type: bounds') if seed is not None: _np.random.seed(seed) unit_lhs = _pd.DataFrame(_pyDOE.lhs(n=len(param_list), samples=samples), columns=param_list) # raw latin hypercube sample res = model.run(return_timestamps=[model.components.initial_time()]) lhs = _pd.DataFrame(index=unit_lhs.index) for param in param_list: lower, upper = bounds[['Min', 'Max']].loc[param] value = res[param].iloc[0] if lower == upper: lhs[param] = lower elif _np.isfinite(lower) and _np.isfinite(upper): # np.isfinite(0)==True scale = upper - lower lhs[param] = _dist.uniform(lower, scale).ppf(unit_lhs[param]) elif _np.isfinite(lower) and _np.isinf(upper): if lower == value: scale = 1 else: scale = value - lower lhs[param] = _dist.expon(lower, scale).ppf(unit_lhs[param]) elif _np.isinf(lower) and _np.isfinite(upper): # np.isinf(-np.inf)==True if upper == value: scale = 1 else: scale = upper - value lhs[param] = upper - _dist.expon(0, scale).ppf(unit_lhs[param]) elif _np.isinf(lower) and _np.isinf(upper): # np.isinf(-np.inf)==True if value == 0: scale = 1 else: scale = abs(value) lhs[param] = _dist.norm(value, scale).ppf(unit_lhs[param]) else: raise ValueError('Problem with lower: %s or upper: %s bounds' % (lower, upper)) return lhs