def random_PolynomialLosses(dom, T, M, L, m_max, exponents, dist=uniform(), high_ratio=False): """ Creates T random L-Lipschitz PolynomialLossFunctions uniformly bounded (in dual norm) by M, with Lipschitz constant uniformly bounded by L. Here exponents is a (finite) set of possible exponents. This is a brute force implementation and horribly inefficient. """ lossfuncs = [] while len(lossfuncs) < T: if high_ratio: weights = np.ones(len(exponents)) else: weights = np.linspace(1, 10, len(exponents)) expon = [tuple(np.random.choice(exponents, size=dom.n, p=weights/np.sum(weights))) for i in range(np.random.choice(np.arange(2,m_max)))] if high_ratio: coeffs = np.array([uniform(scale=np.max(expo)).rvs(1) for expo in expon]).flatten() else: coeffs = dist.rvs(len(expon)) lossfunc = PolynomialLossFunction(dom, coeffs, expon) Ml, Ll = lossfunc.max(grad=True) ml = lossfunc.min() if (Ml-ml)>0: scaling = dist.rvs()*np.minimum(M/(Ml-ml), L/Ll) else: scaling = 1 lossfuncs.append(scaling*(lossfunc + PolynomialLossFunction(dom, [-ml], [(0,)*dom.n]))) return lossfuncs
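# A brief justification of the rescaling step above, with illustrative (assumed) numbers:
# both the range max - min and the Lipschitz constant scale linearly under scalar
# multiplication, so a single factor enforces both bounds at once; the extra dist.rvs()
# jitter in [0, 1] only shrinks the function further and preserves the bounds.
Ml, ml, Ll = 7.0, 2.0, 10.0   # assumed max, min and Lipschitz constant of a sampled loss
M, L = 1.0, 4.0               # target bounds, as in the docstring
s = min(M / (Ml - ml), L / Ll)
assert s * (Ml - ml) <= M and s * Ll <= L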
def fit_estimator(estimator, positive_data_matrix=None, negative_data_matrix=None, target=None, cv=10, n_jobs=-1, n_iter_search=40, random_state=1): # hyperparameter optimization param_dist = {"n_iter": randint(5, 100), "power_t": uniform(0.1), "alpha": uniform(1e-08, 1e-03), "eta0": uniform(1e-03, 1), "penalty": ["l1", "l2", "elasticnet"], "learning_rate": ["invscaling", "constant", "optimal"]} scoring = 'roc_auc' random_search = RandomizedSearchCV(estimator, param_distributions=param_dist, n_iter=n_iter_search, cv=cv, scoring=scoring, n_jobs=n_jobs, random_state=random_state, refit=True) X, y = make_data_matrix(positive_data_matrix=positive_data_matrix, negative_data_matrix=negative_data_matrix, target=target) random_search.fit(X, y) logger.debug('\nClassifier:') logger.debug('%s' % random_search.best_estimator_) logger.debug('\nPredictive performance:') # assess the generalization capacity of the model via cross-validation for scoring in ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']: scores = cross_validation.cross_val_score(random_search.best_estimator_, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs) logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores), np.std(scores))) return random_search.best_estimator_
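# Note on scipy's parameterization, since it trips people up: uniform(a, b) is
# uniform(loc=a, scale=b), i.e. the support is [a, a + b], not [a, b]. So
# "alpha": uniform(1e-08, 1e-03) above samples alpha from [1e-08, 1e-03 + 1e-08].
from scipy.stats import uniform
d = uniform(1e-08, 1e-03)
assert d.ppf(0.0) == 1e-08
assert abs(d.ppf(1.0) - (1e-03 + 1e-08)) < 1e-15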
def train(XTrain, yTrain, testsize): XTrain = np.array(XTrain, dtype=float) yTrain = np.array(yTrain, dtype=float) params = {'C': uniform(1, 99), 'gamma': uniform(0.01, 0.29), 'kernel': ['rbf', 'poly']} kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False) models = [] for i in range(len(yTrain[0])): svr = svm.SVR() clf = grid_search.RandomizedSearchCV(svr, param_distributions=params, n_iter=30, cv=kfold, scoring='mean_squared_error', n_jobs=-1, verbose=1) clf.fit(transArray(XTrain), yTrain[:, i]) models.append(clf.best_estimator_) yPredict = [] XPredict = copy(XTrain[-1]) for i in range(testsize): XPredict = np.delete(XPredict, 0, axis=0) XPredict = np.insert(XPredict, len(XPredict), yTrain[-1], axis=0) subyPredict = np.array([]) for j in range(len(models)): models[j].fit(transArray(XTrain), yTrain[:, j]) # retrain the model at every step newPredict = models[j].predict([transRow(XPredict)]) subyPredict = np.hstack((subyPredict, newPredict)) XTrain = np.delete(XTrain, 0, axis=0) XTrain = np.insert(XTrain, len(XTrain), copy(XPredict), axis=0) yTrain = np.delete(yTrain, 0, axis=0) yTrain = np.insert(yTrain, len(yTrain), copy(subyPredict), axis=0) yPredict.append(copy(subyPredict[0])) return np.array(yPredict)
def nb_params(text, names): tfidf_base_params = [('__tfidf__min_df', [2, 3]), ('__tfidf__max_df', ss.uniform(.8, .2)), ('__tfidf__use_idf', [True, False]), #('__tfidf__ngram_range', [(1,1), (2,2), (3,3)], # ('__tfidf__norm', ['l1', None]), ] params = {} for x in text: for param, dist in tfidf_base_params: if (('thomas' not in x) and ((x.split('_')[1] in names))): params['data__'+x+param] = dist for x in ['speaking_order']:#, 'text_advocate_petitioner', 'text_advocate_respondent']: for param, dist in tfidf_base_params: params['data__'+x+param] = dist params.update({'select__param': ss.uniform(0.05, .25)}) params.update({#'predict__n_estimators':[5,10], 'predict__max_samples':ss.uniform(.8, .2), 'predict__max_features':ss.uniform(.5, .5)}) return params
def test_randomizedsearchcv_same_splits(): """Ensure that all parameter combinations are tested on the same splits (we check their RMSE scores are the same once averaged over the splits, which should be enough). We use as much parallelism as possible.""" data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') data = Dataset.load_from_file(data_file, reader=Reader('ml-100k'), rating_scale=(1, 5)) kf = KFold(3, shuffle=True, random_state=4) # all RMSE should be the same (as param combinations are the same) param_distributions = {'n_epochs': [5], 'lr_all': uniform(.2, 0), 'reg_all': uniform(.4, 0), 'n_factors': [5], 'random_state': [0]} rs = RandomizedSearchCV(SVD, param_distributions, measures=['RMSE'], cv=kf, n_jobs=1) rs.fit(data) rmse_scores = [m for m in rs.cv_results['mean_test_rmse']] assert len(set(rmse_scores)) == 1 # assert rmse_scores are all equal # Note: actually, even when setting random_state=None in kf, the same folds # are used because we use product(param_comb, kf.split(...)). However, it's # needed to have the same folds when calling fit again: rs.fit(data) rmse_scores += [m for m in rs.cv_results['mean_test_rmse']] assert len(set(rmse_scores)) == 1 # assert rmse_scores are all equal
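# The zero-scale trick used above, spelled out: uniform(loc, scale=0) is a
# degenerate distribution whose draws all equal loc, which forces
# RandomizedSearchCV to sample identical parameter combinations every time.
from scipy.stats import uniform
assert all(x == .2 for x in uniform(.2, 0).rvs(10))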
def train(array, embedDim, interval): XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, 1) kfold = cross_validation.KFold(len(XTrain), n_folds=5, shuffle=False) params = {'n_estimators': randint(20, 200), 'loss': ['ls', 'lad', 'huber'], 'learning_rate': uniform(0.01, 0.19), 'subsample': uniform(0.5, 0.5), 'max_depth': randint(1, 5), 'min_samples_split': randint(1, 3), 'min_samples_leaf': randint(1, 3), 'max_features': randint(1, len(XTrain[0]))} bestModels = [] for i in range(len(yTrain[0])): gbrt = GradientBoostingRegressor() clf = grid_search.RandomizedSearchCV(gbrt, param_distributions=params, n_iter=20, scoring='mean_squared_error', cv=kfold, n_jobs=-1) clf.fit(XTrain, yTrain[:, i]) bestModels.append(clf.best_estimator_) for i in range(1, 12): XTrain, yTrain = pp.makeTrainset(array, embedDim, interval, i) # the forecast horizon grows with each round XPredict = pp.makeXPredict(array, embedDim, interval, i) # the prediction input grows accordingly subyPredict = [] for j in range(len(yTrain[0])): bestModels[j].fit(XTrain, yTrain[:, j]) subyPredict.append(bestModels[j].predict(XPredict)) array = np.hstack((array, np.array(copy(subyPredict)))) # feed one model's predictions back as known data for training the next yPredict = array[0, -65:-5] # 66 days can be forecast in total; take the corresponding slice return yPredict
def main(): #TODO Choose new random seed after testing np.random.seed(917) steps = 5 eps = 0.25 min_part = 10 #stars = pickle.load(file('stars.pkl')) stars = pickle.load(file('stars_trimmed.pkl')) #obs = pickle.load(file('data.pkl')) model = simple_model.MyModel(stars) model.set_prior([stats.uniform(0.5, 1.0), stats.uniform(0, 1.0)]) theta = (0.513265306122, 0.1) obs = model.generate_data(theta) model.set_data(obs) n_procs = [1, 2, 3, 4, 5, 6, 7, 8] start = time.time() OT = simple_abc.pmc_abc(model, obs, epsilon_0=eps, min_particles=min_part, steps=steps, target_epsilon=eps, parallel=False, plot=True) end = time.time() print 'Serial took {}s'.format(end - start) out_pickle = file('simptest.pkl', 'w') pickle.dump(OT, out_pickle) out_pickle.close()
def f3TruncNormRVSnp(parameters): N = parameters['N'] target = parameters['target'] rv1, rv2, rv3 = ndarray(shape = (N,), dtype=float), ndarray(shape = (N,), dtype=float), ndarray(shape = (N,), dtype=float) # if parameters['ncpu']: # ncpu = parameters['ncpu'] # else: # ncpu = mp.cpu_count() # # pool = mp.Pool(ncpu) # workers = [] if not parameters['distribution']: print 'No distribution set...abort' exit(1) elif parameters['distribution'] == 'truncnorm': a1, b1 = (parameters['min_intrv1'] - parameters['mu1']) / parameters['sigma1'], (parameters['max_intrv1'] - parameters['mu1']) / parameters['sigma1'] a2, b2 = (parameters['min_intrv2'] - parameters['mu2']) / parameters['sigma2'], (parameters['max_intrv2'] - parameters['mu2']) / parameters['sigma2'] a3, b3 = (parameters['min_intrv3'] - parameters['mu3']) / parameters['sigma3'], (parameters['max_intrv3'] - parameters['mu3']) / parameters['sigma3'] rv1 = truncnorm(a1, b1, loc=parameters['mu1'], scale=parameters['sigma1']).rvs(N) rv2 = truncnorm(a2, b2, loc=parameters['mu2'], scale=parameters['sigma2']).rvs(N) rv3 = truncnorm(a3, b3, loc=parameters['mu3'], scale=parameters['sigma3']).rvs(N) elif parameters['distribution'] == 'norm': rv1 = norm(loc=parameters['mu1'], scale=parameters['sigma1']).rvs(N) rv2 = norm(loc=parameters['mu2'], scale=parameters['sigma2']).rvs(N) rv3 = norm(loc=parameters['mu3'], scale=parameters['sigma3']).rvs(N) elif parameters['distribution'] == 'uniform': rv1 = uniform(loc=parameters['mu1'], scale=parameters['sigma1']).rvs(N) rv2 = uniform(loc=parameters['mu2'], scale=parameters['sigma2']).rvs(N) rv3 = uniform(loc=parameters['mu3'], scale=parameters['sigma3']).rvs(N) elif parameters['distribution'] == 'beta': rv1 = beta(a=parameters['min_intrv1'], b=parameters['max_intrv1'], loc=parameters['mu1'], scale=parameters['sigma1']).rvs(N) rv2 = beta(a=parameters['min_intrv2'], b=parameters['max_intrv2'], loc=parameters['mu2'], scale=parameters['sigma2']).rvs(N) rv3 = beta(a=parameters['min_intrv3'], b=parameters['max_intrv3'], loc=parameters['mu3'], scale=parameters['sigma3']).rvs(N) elif parameters['distribution'] == 'triang': rv1 = triang(loc=parameters['min_intrv1'], scale=parameters['max_intrv1'], c=parameters['mu1']).rvs(N) rv2 = triang(loc=parameters['min_intrv2'], scale=parameters['max_intrv2'], c=parameters['mu2']).rvs(N) rv3 = triang(loc=parameters['min_intrv3'], scale=parameters['max_intrv3'], c=parameters['mu3']).rvs(N) else: print 'Distribution not recognized...abort' exit(1) if parameters['scaling']: #scale the values of Qs in the allowed range such that sum(Q_i) = A r = ABS(parameters['Q1']) + ABS(parameters['Q2']) + ABS(parameters['Q3']) if r == 0.0: r = 1. # rounding the values, the sum could exceed A Q1 = ABS(parameters['Q1']) * parameters['A'] / r Q2 = ABS(parameters['Q2']) * parameters['A'] / r Q3 = parameters['A'] - Q1 - Q2 else: # print "scaling = False" Q1 = parameters['Q1'] Q2 = parameters['Q2'] Q3 = parameters['Q3'] return _f3(rv1, rv2, rv3, Q1, Q2, Q3, target)
def __setstate__(self, state): np.random.seed() self.__dict__ = state new_prior = [stats.uniform(**state['prior'][0]), stats.uniform(**state['prior'][1]), stats.uniform(**state['prior'][2]), stats.uniform(**state['prior'][3])] self.__dict__['prior'] = new_prior
def greedy_allocation3(parameters): """ Greedy heuristic for 3 supplier (the same as heu_allocation3 but with different parameters) Does not write on the file but returns the solution :param df: dataframe containing the data from the excel file :param parameters: parameters dict :return: write o the df and save on the file """ if not parameters['distribution']: print 'No distribution set...abort' exit(1) elif parameters['distribution'] == 'truncnorm': rv1 = truncnorm_custom(parameters['min_intrv1'], parameters['max_intrv1'], parameters['mu1'], parameters['sigma1']) rv2 = truncnorm_custom(parameters['min_intrv2'], parameters['max_intrv2'], parameters['mu2'], parameters['sigma2']) rv3 = truncnorm_custom(parameters['min_intrv3'], parameters['max_intrv3'], parameters['mu3'], parameters['sigma3']) elif parameters['distribution'] == 'norm': rv1 = norm(parameters['mu1'], parameters['sigma1']) rv2 = norm(parameters['mu2'], parameters['sigma2']) rv3 = norm(parameters['mu3'], parameters['sigma3']) elif parameters['distribution'] == 'uniform': rv1 = uniform(loc=parameters['mu1'], scale=parameters['sigma1']) rv2 = uniform(loc=parameters['mu2'], scale=parameters['sigma2']) rv3 = uniform(loc=parameters['mu3'], scale=parameters['sigma3']) elif parameters['distribution'] == 'beta': rv1 = beta(a=parameters['min_intrv1'], b=parameters['max_intrv1'], loc=parameters['mu1'], scale=parameters['sigma1']) rv2 = beta(a=parameters['min_intrv2'], b=parameters['max_intrv2'], loc=parameters['mu2'], scale=parameters['sigma2']) rv3 = beta(a=parameters['min_intrv3'], b=parameters['max_intrv3'], loc=parameters['mu3'], scale=parameters['sigma3']) elif parameters['distribution'] == 'triang': rv1 = triang(loc=parameters['min_intrv1'], scale=parameters['max_intrv1'], c=parameters['mu1']) rv2 = triang(loc=parameters['min_intrv2'], scale=parameters['max_intrv2'], c=parameters['mu2']) rv3 = triang(loc=parameters['min_intrv3'], scale=parameters['max_intrv3'], c=parameters['mu3']) else: print 'Distribution not recognized...abort' exit(1) A = parameters['A'] Q = {i: 0 for i in xrange(3)} while A > 0: best_probability = -1 best_retailer = -1 for n, r in enumerate([rv1, rv2, rv3]): p = 1 - r.cdf(Q[n]+1) if p > best_probability: best_probability = p best_retailer = n Q[best_retailer] += 1 A -= 1 parameters['Q1'] = Q[0] parameters['Q2'] = Q[1] parameters['Q3'] = Q[2] return {'Q1': Q[0], 'Q2': Q[1], 'Q3': Q[2], 'PROB': f3TruncNormRVSnp(parameters)}
def test_randomizedsearchcv_cv_results(): """Test the cv_results attribute""" f = os.path.join(os.path.dirname(__file__), './u1_ml100k_test') data = Dataset.load_from_file(f, Reader('ml-100k'), rating_scale=(1, 5)) kf = KFold(3, shuffle=True, random_state=4) param_distributions = {'n_epochs': [5], 'lr_all': uniform(.2, .3), 'reg_all': uniform(.4, .3), 'n_factors': [5], 'random_state': [0]} n_iter = 5 rs = RandomizedSearchCV(SVD, param_distributions, n_iter=n_iter, measures=['RMSE', 'mae'], cv=kf, return_train_measures=True) rs.fit(data) # test keys split*_test_rmse, mean and std dev. assert rs.cv_results['split0_test_rmse'].shape == (n_iter,) assert rs.cv_results['split1_test_rmse'].shape == (n_iter,) assert rs.cv_results['split2_test_rmse'].shape == (n_iter,) assert rs.cv_results['mean_test_rmse'].shape == (n_iter,) assert np.allclose(rs.cv_results['mean_test_rmse'], np.mean([rs.cv_results['split0_test_rmse'], rs.cv_results['split1_test_rmse'], rs.cv_results['split2_test_rmse']], axis=0)) assert np.allclose(rs.cv_results['std_test_rmse'], np.std([rs.cv_results['split0_test_rmse'], rs.cv_results['split1_test_rmse'], rs.cv_results['split2_test_rmse']], axis=0)) # test keys split*_train_mae, mean and std dev. assert rs.cv_results['split0_train_rmse'].shape == (n_iter,) assert rs.cv_results['split1_train_rmse'].shape == (n_iter,) assert rs.cv_results['split2_train_rmse'].shape == (n_iter,) assert rs.cv_results['mean_train_rmse'].shape == (n_iter,) assert np.allclose(rs.cv_results['mean_train_rmse'], np.mean([rs.cv_results['split0_train_rmse'], rs.cv_results['split1_train_rmse'], rs.cv_results['split2_train_rmse']], axis=0)) assert np.allclose(rs.cv_results['std_train_rmse'], np.std([rs.cv_results['split0_train_rmse'], rs.cv_results['split1_train_rmse'], rs.cv_results['split2_train_rmse']], axis=0)) # test fit and train times dimensions. assert rs.cv_results['mean_fit_time'].shape == (n_iter,) assert rs.cv_results['std_fit_time'].shape == (n_iter,) assert rs.cv_results['mean_test_time'].shape == (n_iter,) assert rs.cv_results['std_test_time'].shape == (n_iter,) assert rs.cv_results['params'] is rs.param_combinations # assert that best parameter in rs.cv_results['rank_test_measure'] is # indeed the best_param attribute. best_index = np.argmin(rs.cv_results['rank_test_rmse']) assert rs.cv_results['params'][best_index] == rs.best_params['rmse'] best_index = np.argmin(rs.cv_results['rank_test_mae']) assert rs.cv_results['params'][best_index] == rs.best_params['mae']
def _random_pos(self, z_offset=0): """Random within-extent position generator. Returns: (tuple) of X,Y,Z uniform distributions. """ return (stats.uniform(self.extent[0], self.extent[1]), stats.uniform(self.extent[2], self.extent[3]), stats.uniform(self.extent[4] + z_offset, self.extent[5]))
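# A standalone sketch of the same pattern (hypothetical extent layout
# (x0, dx, y0, dy, z0, dz), matching the loc/scale convention above): the
# method returns frozen distributions, so a concrete position is one .rvs()
# draw per axis.
from scipy import stats
extent = (0.0, 10.0, 0.0, 5.0, -1.0, 2.0)
z_offset = 0.5
xd, yd, zd = (stats.uniform(extent[0], extent[1]),
              stats.uniform(extent[2], extent[3]),
              stats.uniform(extent[4] + z_offset, extent[5]))
x, y, z = xd.rvs(), yd.rvs(), zd.rvs()  # one uniformly drawn position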
def optimize_predictor(predictor = None, data_matrix = None, target = None, n_iter_search = 20, cv = 3, scoring = "roc_auc", n_jobs = -1): param_dist = {"n_iter": randint(5, 100), "power_t": uniform(0.1), "alpha": uniform(1e-08,1e-03), "eta0" : uniform(1e-03,10), "penalty": ["l1", "l2", "elasticnet"], "learning_rate": ["invscaling", "constant","optimal"]} optclf = RandomizedSearchCV(predictor, param_distributions = param_dist, n_iter = n_iter_search, cv = cv, scoring = scoring, refit = True, n_jobs = n_jobs) optclf.fit(data_matrix, target) return optclf.best_estimator_
def rvs(self, random_state=None): if random_state is None: exp = uniform(loc=self.lo, scale=self.scale).rvs() else: exp = uniform(loc=self.lo, scale=self.scale).rvs(random_state=random_state) if self.mass_on_zero > 0.0 and np.random.uniform() < self.mass_on_zero: return 0.0 return self.base ** exp
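# A self-contained sketch of the idea above, a log-uniform variable with a
# point mass at zero (attribute values assumed for illustration: base=10,
# exponent uniform on [lo, lo + scale], probability mass_on_zero of
# returning exactly 0):
import numpy as np
from scipy.stats import uniform

def log_uniform_with_zero(lo=-3.0, scale=3.0, base=10.0, mass_on_zero=0.1):
    if mass_on_zero > 0.0 and np.random.uniform() < mass_on_zero:
        return 0.0  # the point mass
    return base ** uniform(loc=lo, scale=scale).rvs()

samples = [log_uniform_with_zero() for _ in range(5)]  # values in [1e-3, 1] or exactly 0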
def test_ad_test(self): # Versus ad.test() from R goftest. result = ad_test((.1, .4, .7), uniform(0, 1)) assert allclose(result, (.366028, .875957)) result = ad_test((.1, .4, .7), norm(0, 1)) assert allclose(result, (.921699, .390938)) # Poles of the weight function. result = ad_test((0., .5), uniform(0, 1)) assert allclose(result, (float('inf'), 0)) result = ad_test((1., .5), uniform(0, 1)) assert allclose(result, (float('inf'), 0))
def train(XTrain, yTrain, XPredict): params = {'C': uniform(1, 999), 'gamma': uniform(0.01, 0.29), 'kernel': ['rbf', 'poly']} kfold = cross_validation.KFold(len(XTrain), n_folds=3, shuffle=False) svr = svm.SVR() clf = grid_search.RandomizedSearchCV(svr, param_distributions=params, n_iter=20, cv=kfold, scoring='mean_squared_error', n_jobs=-1) clf.fit(XTrain, yTrain) # train the model in a single pass # print clf.best_score_, clf.best_estimator_ yPredict = clf.predict(XPredict) return yPredict, clf.best_params_
def testUniform4(self): distV = uniform() distW = uniform(scale=2) v = 0.7 w = 1.4 expected = ([0,1],[0,1.2]) bids = [v,w] distributions = [distV,distW] obtained = myersonAuction(bids,distributions) self.assertAlmostEqual(expected, obtained, msg="Myerson auction with inputs: " + str(bids) + ", " + str(distributions) + ". Expected " + str(expected) + " but obtained " + str(obtained) + ".")
def test_legendre_consistency(): import scipy.stats as stats dist = stats.uniform(-1, 1 - -1) p = LegendrePolynomials(normalised=False) _check_poly_consistency(p, dist) p = LegendrePolynomials(a=2, b=5, normalised=False) dist = stats.uniform(2, 5 - 2) _check_poly_consistency(p, dist) p = LegendrePolynomials(a= -2.5, b= -1.2, normalised=True) assert_equal(p.norm(4, False), 1) dist = stats.uniform(-2.5, -1.2 - -2.5) _check_poly_consistency(p, dist)
def train(XTrain, yTrain, XPredict): XTrain = np.array(XTrain, dtype=float) yTrain = np.array(yTrain, dtype=float) params = {'C': uniform(1, 999), 'gamma': uniform(0.01, 0.29), 'kernel': ['rbf', 'poly']} kfold = cross_validation.KFold(len(XTrain), n_folds=4, shuffle=False) svr = svm.SVR() clf = grid_search.RandomizedSearchCV(svr, param_distributions=params, n_iter=20, cv=kfold, scoring='mean_squared_error', n_jobs=-1) yPredict = [] for i in range(yTrain.shape[1]): clf.fit(XTrain, yTrain[:, i]) # train one model per output column yPredict.extend(clf.predict(XPredict)) return np.array(yPredict)
def generateToy(): np.random.seed(12345) fig,ax = plt.subplots(4,sharex=True) #fig,ax = plt.subplots(2) powerlaw_arg = 2 triang_arg=0.7 n_samples = 500 #generate simple line with slope 1, from 0 to 1 frozen_powerlaw = powerlaw(powerlaw_arg) #powerlaw.pdf(x, a) = a * x**(a-1) #generate triangle with peak at 0.7 frozen_triangle = triang(triang_arg) #up-sloping line from loc to (loc + c*scale) and then downsloping for (loc + c*scale) to (loc+scale). frozen_uniform = uniform(0.2,0.5) frozen_uniform2 = uniform(0.3,0.2) x = np.linspace(0,1) signal = np.random.normal(0.5, 0.1, n_samples/2) data_frame = pd.DataFrame({'powerlaw':powerlaw.rvs(powerlaw_arg,size=n_samples), 'triangle':triang.rvs(triang_arg,size=n_samples), 'uniform':np.concatenate((uniform.rvs(0.2,0.5,size=n_samples/2),uniform.rvs(0.3,0.2,size=n_samples/2))), 'powerlaw_signal':np.concatenate((powerlaw.rvs(powerlaw_arg,size=n_samples/2),signal))}) ax[0].plot(x, frozen_powerlaw.pdf(x), 'k-', lw=2, label='powerlaw pdf') hist(data_frame['powerlaw'],bins=100,normed=True,histtype='stepfilled',alpha=0.2,label='100 bins',ax=ax[0]) #hist(data_frame['powerlaw'],bins='blocks',fitness='poly_events',normed=True,histtype='stepfilled',alpha=0.2,label='b blocks',ax=ax[0]) ax[0].legend(loc = 'best') ax[1].plot(x, frozen_triangle.pdf(x), 'k-', lw=2, label='triangle pdf') hist(data_frame['triangle'],bins=100,normed=True,histtype='stepfilled',alpha=0.2,label='100 bins',ax=ax[1]) hist(data_frame['triangle'],bins='blocks',fitness='poly_events',normed=True,histtype='stepfilled',alpha=0.2,label='b blocks',ax=ax[1]) ax[1].legend(loc = 'best') #ax[0].plot(x, frozen_powerlaw.pdf(x), 'k-', lw=2, label='powerlaw pdf') hist(data_frame['powerlaw_signal'],bins=100,normed=True,histtype='stepfilled',alpha=0.2,label='100 bins',ax=ax[2]) #hist(data_frame['powerlaw_signal'],bins='blocks',normed=True,histtype='stepfilled',alpha=0.2,label='b blocks',ax=ax[2]) ax[2].legend(loc = 'best') ax[3].plot(x, frozen_uniform.pdf(x)+frozen_uniform2.pdf(x), 'k-', lw=2, label='uniform pdf') hist(data_frame['uniform'],bins=100,normed=True,histtype='stepfilled',alpha=0.2,label='100 bins',ax=ax[3]) #hist(data_frame['uniform'],bins='blocks',fitness = 'poly_events',p0=0.05,normed=True,histtype='stepfilled',alpha=0.2,label='b blocks',ax=ax[3]) ax[3].legend(loc = 'best') plt.show() fig.savefig('plots/toy_plots.png')
def unpack(self, relation_a_b_beginning, relation_a_b_ending): before_a_b_beginning, same_a_b_beginning, after_a_b_beginning = relation_a_b_beginning before_a_b_ending, same_a_b_ending, after_a_b_ending = relation_a_b_ending if almost_equals(before_a_b_beginning + same_a_b_beginning + same_a_b_ending + after_a_b_ending, 2.0, epsilon): return [], uniform_reference # Inconsistent if almost_equals(before_a_b_beginning + before_a_b_ending, 2.0, epsilon): return [], uniform_reference if almost_equals(after_a_b_beginning + after_a_b_ending, 2.0, epsilon): return [], uniform_reference if almost_equals(before_a_b_ending, 1.0, epsilon): return self.unpack_partial(relation_a_b_beginning), uniform_reference if almost_equals(after_a_b_beginning, 1.0, epsilon): return self.unpack_partial(relation_a_b_ending), uniform_reference a_possibilities = self.unpack_partial(relation_a_b_beginning) if len(a_possibilities) == 1: a_possibility = a_possibilities[0] return else: a_possibility_1, a_possibility_2 = a_possibilities a_possibility = a_possibility_1 if a_possibility.args[1] < a_possibility_2.args[1]: a_possibility = a_possibility_2 a_start_point, length_a = a_possibility.args if before_a_b_ending * same_a_b_ending * after_a_b_ending > 0: if almost_equals(before_a_b_beginning, 0, epsilon): length_b_ending = length_a * same_a_b_ending ** 2 else: length_b_ending = same_a_b_ending ** 2 / same_a_b_beginning ** 2 b_ending = uniform(a_start_point + before_a_b_ending * (length_a - length_b_ending) / (before_a_b_ending + after_a_b_ending), length_b_ending) return [a_possibility], b_ending else: denominator = length_a * same_a_b_ending ** 2 length_b_ending_lower_bound = (length_a * same_a_b_ending ** 2) ** 2 / denominator length_b_ending_upper_bound = (a_start_point + length_a - 1) ** 2 / denominator length_b_ending = (length_b_ending_lower_bound + length_b_ending_upper_bound) / 2.0 b_start_point = a_start_point + length_a - same_a_b_ending * sqrt(length_b_ending * length_a) b_ending = uniform(b_start_point, length_b_ending) return [a_possibility], b_ending
def test_random_grid(): # get our train/test X_train, X_test, y_train, y_test = train_test_split(X, iris.target, train_size=0.75, random_state=42) # default CV does not shuffle, so we define our own custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True, random_state=42) # build a pipeline pipe = Pipeline([ ('retainer' , FeatureRetainer()), # will retain all ('dropper' , FeatureDropper()), # won't drop any ('mapper' , FunctionMapper()), # pass through ('encoder' , OneHotCategoricalEncoder()), # no object dtypes, so will pass through ('collinearity', MulticollinearityFilterer(threshold=0.85)), ('imputer' , SelectiveImputer()), # pass through ('scaler' , SelectiveScaler()), ('boxcox' , BoxCoxTransformer()), ('nzv' , NearZeroVarianceFilterer(threshold=1e-4)), ('pca' , SelectivePCA(n_components=0.9)), ('model' , RandomForestClassifier(n_jobs=1)) ]) # let's define a set of hyper-parameters over which to search hp = { 'collinearity__threshold' : uniform(loc=.8, scale=.15), 'collinearity__method' : ['pearson','kendall','spearman'], 'scaler__scaler' : [StandardScaler(), RobustScaler()], 'pca__n_components' : uniform(loc=.75, scale=.2), 'pca__whiten' : [True, False], 'model__n_estimators' : randint(5,100), 'model__max_depth' : randint(2,25), 'model__min_samples_leaf' : randint(1,15), 'model__max_features' : uniform(loc=.5, scale=.5), 'model__max_leaf_nodes' : randint(10,75) } # define the gridsearch search = RandomizedSearchCV(pipe, hp, n_iter=2, # just to test it even works scoring='accuracy', cv=custom_cv, random_state=42) # fit the search search.fit(X_train, y_train) # test the report the_report = report_grid_score_detail(search, charts=False)
def _lhs(self, dist, parms, siz=100): """ Latin Hypercube sampling of any distribution. modified from code found `online <http://code.google.com/p/bayesian-inference/source/browse/trunk/BIP/Bayes/lhs.py?r=3cfbbaa5806f2b8cc9e2457d967b0a58a3ce459c>`_. :param dist: random number generator from `scipy.stats <http://docs.scipy.org/doc/scipy/reference/stats.html>`_ :param parms: tuple of parameters as required for dist. :param siz: number or shape tuple for the output sample """ if not isinstance(dist, (stats.rv_discrete, stats.rv_continuous)): raise TypeError("dist is not a scipy.stats distribution object") # number of samples n = siz if isinstance(siz, (tuple, list)): n = np.product(siz) perc = np.arange(0, 1.0, 1.0 / n) np.random.shuffle(perc) smp = [stats.uniform(i, 1.0 / n).rvs() for i in perc] v = dist(*parms).ppf(smp) if isinstance(siz, (tuple, list)): v.shape = siz return v
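# The stratification used by _lhs, inlined as a standalone example: shuffle
# the n stratum left edges, jitter uniformly inside each stratum, then map
# through the target distribution's inverse CDF (here N(0, 1)); each of the
# n equal-probability strata contributes exactly one sample.
import numpy as np
from scipy import stats
n = 100
perc = np.arange(0, 1.0, 1.0 / n)
np.random.shuffle(perc)
smp = [stats.uniform(i, 1.0 / n).rvs() for i in perc]
v = stats.norm(0, 1).ppf(smp)  # one Latin-hypercube sample per stratum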
def make_theta_gen(): """ Generates prior for theta = angle between x axis and semi-MAJOR axis for the gaussian population. Returns angle in radians """ # Will allow angle between 0 and 180 degrees (in radians) return stats.uniform(loc=0, scale=3.14159)
def gillespie_logistique2(taille_ini, b1, b2, d1, d2, temps): """Another implementation of the Gillespie algorithm; the population size is only recorded at predefined time points.""" taille = zeros(temps.size) # preallocate memory # initialize current time and population size temps_courant, taille_courante = 0.0, taille_ini t_nais = (b1 + b2 * taille_courante) * taille_courante # birth rate t_mort = (d1 + d2 * taille_courante) * taille_courante # death rate tau = t_nais + t_mort # total rate ee = expon() uu = uniform() delta_temps = ee.rvs() / tau for k in range(temps.size): # simulate without going past temps[k] while temps_courant + delta_temps < temps[k]: temps_courant += delta_temps # update current time if uu.rvs() < (b1 * taille_courante) / tau: taille_courante += 1 # birth else: taille_courante -= 1 # death t_nais = (b1 + b2 * taille_courante) * taille_courante # birth rate t_mort = (d1 + d2 * taille_courante) * taille_courante # death rate tau = t_nais + t_mort # total rate delta_temps = ee.rvs() / tau # sojourn time taille[k] = taille_courante return taille
def testUniformStd(self): with self.test_session(): a = 10.0 b = 100.0 uniform = uniform_lib.Uniform(a=a, b=b) s_uniform = stats.uniform(loc=a, scale=b - a) self.assertAllClose(uniform.stddev().eval(), s_uniform.std())
def _rvs_helper(self): num_samples = 10000 xs = gauss(0, 1).rvs((num_samples, 3)) xs = divide(xs, reshape(norm(xs, 1), (num_samples, 1))) pvalues = self.pdf(xs, normalize=False) fmax = self.pdf_max(normalize=False) return xs[uniform(0, fmax).rvs(num_samples) < pvalues]
def create_training_set ( parameters, minvals, maxvals, n_train=200 ): """Creates a traning set for a set of parameters specified by ``parameters`` (not actually used, but useful for debugging maybe). Parameters are assumed to be uniformly distributed between ``minvals`` and ``maxvals``. ``n_train`` input parameter sets will be produced, and returned with the actual distributions list. The latter is useful to create validation sets. Parameters ------------- parameters: list A list of parameter names minvals: list The minimum value of the parameters. Same order as ``parameters`` maxvals: list The maximum value of the parameters. Same order as ``parameters`` n_train: int How many training points to produce Returns --------- The training set and a distributions object that can be used by ``create_validation_set`` """ distributions = [] for i,p in enumerate(parameters): distributions.append ( ss.uniform ( loc=minvals[i], \ scale=(maxvals[i]-minvals[i] ) ) ) samples = lhd ( dist=distributions, size=n_train ) return samples, distributions
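# Standalone check of the distribution construction used above: a uniform on
# [minval, maxval] is ss.uniform(loc=minval, scale=maxval - minval), so the
# endpoints map exactly to the 0 and 1 quantiles.
import scipy.stats as ss
d = ss.uniform(loc=10.0, scale=80.0 - 10.0)
assert d.ppf(0.0) == 10.0 and d.ppf(1.0) == 80.0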
def prior(self): """ Loops through the parameters """ priorz = [] for key in self.ordered_keys: prior_key = self.prior_dict[key] if prior_key['shape'] == 'uniform': loc = prior_key['min'] scale = prior_key['max'] - prior_key['min'] priorz.append( uniform(loc, scale)) elif prior_key['shape'] == 'gauss': loc = prior_key['mean'] scale = prior_key['stddev'] priorz.append( norm(loc, scale) ) #else: # raise ValueError("Not specified") return priorz
def lincombo_hierregress_taubybeta(yy, stderrs, XX, maxtau=None, guess_range=False, draws=100): yy, stdvars, XX = helpers.check_arguments(yy, stderrs, XX) nummus = XX.shape[1] print "Sampling tau..." if maxtau is None: maxtau = pooling.estimated_maxlintau(yy, stderrs, XX) print "Using maximum tau =", maxtau if maxtau[0] > 0: probability_prior_tau = uniform(0, maxtau) # Prepare to sample from from p(tau | yy) # Create pdf for p(tau | yy) def pdf(tau): # Requires mus, but is invarient to them return probability_tau([np.mean(yy)] * nummus, tau, yy, stdvars, XX, probability_prior_tau) dist = ContinuousSampled(pdf, 2) if guess_range: mini, maxi = dist.guess_ranges([0, 0], maxtau, draws * 10) else: mini, maxi = 0, maxtau dist.prepare_draws(mini, maxi, count=draws) else: # maxtau == 0 dist = MultivariateDelta(np.zeros(2)) print "Sampling mus..." return sample_posterior(yy, stderrs, XX, dist, draws)
def __init__(self): self.prv = uniform(0, 1) self.qfx = np.vectorize(self.qf)
net = NeuralNetClassifier( MLP, criterion=nn.CrossEntropyLoss, max_epochs=30, lr=0.1, module__input_size=11, module__num_classes=3, device='cuda' ) from sklearn.model_selection import RandomizedSearchCV from scipy.stats import randint from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from scipy.stats import uniform params = { 'net__lr': uniform(loc=0, scale=0.2), 'net__module__hidden_size': randint(100, 1000), 'net__optimizer__weight_decay': uniform(loc=0, scale=0.1), 'net__batch_size': randint(10, 200) } model = Pipeline(steps=[("scaler",StandardScaler()), ("net",net)]) rs = RandomizedSearchCV(model, params, refit=True, cv=3, scoring='accuracy', n_iter=100, n_jobs=-1) rs.fit(train_x, train_y) print(rs.best_score_, rs.best_params_) print(rs.score(test_x, test_y))
from matplotlib import pyplot from scipy.stats import norm, laplace, poisson, cauchy, uniform import numpy if __name__ == "__main__": sizes = [10, 50, 1000] densities = [ norm(loc=0, scale=1), laplace(scale=1 / numpy.sqrt(2), loc=0), poisson(10), cauchy(), uniform(loc=-numpy.sqrt(3), scale=2 * numpy.sqrt(3)) ] names = ["Normal", "Laplace", "Poisson", "Cauchy", "Uniform"] for size in sizes: n = norm.rvs(loc=0, scale=1, size=size) l = laplace.rvs(scale=1 / numpy.sqrt(2), loc=0, size=size) p = poisson.rvs(10, size=size) c = cauchy.rvs(size=size) u = uniform.rvs(loc=-numpy.sqrt(3), scale=2 * numpy.sqrt(3), size=size) distributions = [n, l, p, c, u] build = list(zip(distributions, densities, names)) for histogram, density, name in build: fig, ax = pyplot.subplots(1, 1) ax.hist(histogram, density=True, histtype='stepfilled', alpha=0.6,
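# The uniform above is parameterized to be standardized like the other
# densities in the list: for uniform(loc, scale), mean = loc + scale/2 and
# var = scale**2 / 12, so loc = -sqrt(3), scale = 2*sqrt(3) gives mean 0 and
# variance (2*sqrt(3))**2 / 12 = 1.
import numpy
from scipy.stats import uniform
u = uniform(loc=-numpy.sqrt(3), scale=2 * numpy.sqrt(3))
assert abs(u.mean()) < 1e-12 and abs(u.var() - 1.0) < 1e-12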
def set_pipeline(self): if self.reference == 'a': if self.year == '2009': categorical_features_1 = [ 'category_code', 'country_code', 'state_code', 'founded_at', 'timediff_founded_series_a', 'time_diff_series_a_now' ] elif self.year == '2014': categorical_features_1 = [ 'category_code', 'country_code', 'state_code', 'founded_at', 'timediff_founded_series_a', 'time_diff_series_a_now' ] # impute first, then one-hot encode categorical_features_2 = [ 'participants_a', 'raised_amount_usd_a', 'rounds_before_a', 'mean_comp_worked_before', 'founder_count', 'degree_count' ] # impute first, then ordinal-encode booleans_features = [ 'graduate', 'undergrad', 'professional', 'MBA_bool', 'cs_bool', 'phd_bool', 'top_20_bool', 'mean_comp_founded_before', 'female_ratio' ] # ordinals/binaries #Defining imputers imputer = self.get_imputer() imputer_2 = SimpleImputer(strategy='most_frequent') #pipes for each feature pipe_1 = Pipeline([('imputer', imputer_2), ('ohe', OneHotEncoder(handle_unknown='ignore')) ]) pipe_2 = Pipeline([('imputer_ord', imputer), ('ord_encoder', OneHotEncoder(handle_unknown='ignore'))]) pipe_bool = Pipeline([('imputer_bool', imputer_2), ('ord_encoder', OneHotEncoder(handle_unknown='ignore'))]) #process feateng_blocks = [('cat_ohe', pipe_1, categorical_features_1), ('cat_ord', pipe_2, categorical_features_2), ('cat_bool', pipe_bool, booleans_features)] elif self.reference == 0: if self.year == '2014': categorical_features_1 = [ 'category_code', 'country_code', 'state_code', 'founded_at', 'timediff_founded_series_0', 'time_diff_series_0_now' ] # impute first, then one-hot encode categorical_features_2 = [ 'participants_0', 'raised_amount_usd_0', 'participants_0', 'mean_comp_worked_before', 'founder_count', 'degree_count' ] # impute first, then ordinal-encode booleans_features = [ 'graduate', 'undergrad', 'professional', 'MBA_bool', 'cs_bool', 'phd_bool', 'top_20_bool', 'mean_comp_founded_before', 'female_ratio' ] # ordinals/binaries #Defining imputers imputer = self.get_imputer() imputer_2 = SimpleImputer(strategy='most_frequent') #pipes for each feature pipe_1 = Pipeline([('imputer', imputer_2), ('ohe', OneHotEncoder(handle_unknown='ignore')) ]) pipe_2 = Pipeline([('imputer_ord', imputer), ('ord_encoder', OneHotEncoder(handle_unknown='ignore'))]) pipe_bool = Pipeline([('imputer_bool', imputer_2), ('ord_encoder', OneHotEncoder(handle_unknown='ignore'))]) #process feateng_blocks = [('cat_1', pipe_1, categorical_features_1), ('cat_2', pipe_2, categorical_features_2), ('cat_bool', pipe_bool, booleans_features)] #Columntransformer keeping order preprocessor = ColumnTransformer(feateng_blocks, remainder='passthrough') #final_pipeline self.pipeline = Pipeline( steps=[('preprocessing', preprocessor), ('model_use', self.get_estimator())]) if self.smote: smote = ADASYN(sampling_strategy='minority', n_neighbors=20) self.pipeline = Pipeline_imb([('prep', preprocessor), ('smote', smote), ('model_use', self.get_estimator())]) # Random search if self.grid_search_choice: grid_search = RandomizedSearchCV( self.pipeline, param_distributions={ "model_use__learning_rate": uniform(0, 1), "model_use__gamma": uniform(0, 2), "model_use__max_depth": randint(1, 15), "model_use__colsample_bytree": uniform(0.1, 0.9), # float fraction in [0.1, 1.0]; randint cannot sample floats "model_use__subsample": [0.2, 0.4, 0.5], "model_use__reg_alpha": uniform(0, 1), "model_use__reg_lambda": uniform(1, 10), "model_use__min_child_weight": randint(1, 10), "model_use__n_estimators": randint(1000, 3000) }, # params depend on the model in use cv=35, scoring='f1', n_iter=10, n_jobs=-1) grid_search.fit(self.X_train, 
self.y_train) self.pipeline = grid_search.best_estimator_ self.grid_params = grid_search.get_params self.set_tag('model_used', self.pipeline)
def init_classifier(model="LDA"): if model == "LDA": clf = LinearDiscriminantAnalysis() distributions = dict() elif model == "KNN": clf = KNeighborsClassifier() distributions = dict( classifier__n_neighbors=np.arange(1, 16, 1), classifier__weights=["uniform", "distance"], classifier__metric=["minkowski", "euclidean", "manhattan"], ) elif model == "SVM": clf = SVC(kernel="linear") distributions = { "classifier__C": [ 0.1, 0.5, 1, 3, 10, 50, 100, 200, 500, 1000, ], # uniform(loc=0, scale=100), "classifier__gamma": [5, 2, 1, 0.01, 0.001, 0.0001, 0.00001], "classifier__kernel": ["linear"], # "rbf", "poly", "sigmoid", "linear"], "classifier__max_iter": [100, 500, 1000], # , 200, 300, 400, 500, 1000], } elif model == "DT": clf = DecisionTreeClassifier() distributions = dict( classifier__criterion=["gini", "entropy"], classifier__splitter=["best", "random"], ) elif model == "LR": clf = LogisticRegression() distributions = dict( classifier__C=uniform(loc=0, scale=4), classifier__penalty=["l2", "l1", "elasticnet", "none"], classifier__solver=[ "newton-cg", "lbfgs", "liblinear", "sag", "saga" ], classifier__multi_class=["auto", "ovr", "multinomial"], classifier__max_iter=[100, 200, 300, 400, 500, 1000], ) elif model == "XGBC": clf = XGBClassifier() distributions = dict() elif model == "RF": clf = RandomForestClassifier() distributions = { "classifier__n_estimators": [10], # mettre dautres valeurs "classifier__max_depth": [1, 2, 4, 8, 12, 16], "classifier__min_samples_split": [2, 4, 6, 8, 10, 12, 14, 16], "classifier__max_features": [0.25], } return clf, distributions
def augmented_data( f_sub=None, beta=None, f_sub_alt=None, beta_alt=None, f_sub_ref=None, beta_ref=None, f_sub_prior=uniform(0.001, 0.199), beta_prior=uniform(-2.5, 1.0), n_images=None, n_thetas_marginal=1000, draw_host_mass=True, draw_host_redshift=True, draw_alignment=True, mine_gold=True, calculate_dx_dm=False, return_dx_dm=False, roi_size=2., ): """ Wraps around the population simulation, starts the simulation with parameters drawn from the prior and "mines the gold" appropriately """ # Input if (f_sub is None or beta is None) and n_images is None: raise ValueError( "Either f_sub and beta or n_images have to be different from None") if n_images is None: n_images = len(f_sub) n_verbose = max(1, n_images // 100) # Hypothesis for sampling beta, f_sub = _draw_params(beta, beta_prior, f_sub, f_sub_prior, n_images) # Alternate hypothesis (test hypothesis when swapping num - den) beta_alt, f_sub_alt = _draw_params(beta_alt, beta_prior, f_sub_alt, f_sub_prior, n_images) # Reference hypothesis beta_ref, f_sub_ref = _draw_params(beta_alt, beta_prior, f_sub_alt, f_sub_prior, n_thetas_marginal - 1) params_ref = np.vstack((f_sub_ref, beta_ref)).T # Output all_params, all_params_alt, all_images = [], [], [] all_t_xz, all_t_xz_alt, all_log_r_xz, all_log_r_xz_alt = [], [], [], [] all_sub_latents, all_global_latents = [], [] all_dx_dm = [] # Main loop for i_sim in range(n_images): if (i_sim + 1) % n_verbose == 0: logger.info("Simulating image %s / %s", i_sim + 1, n_images) else: logger.debug("Simulating image %s / %s", i_sim + 1, n_images) # Prepare params this_f_sub = _pick_param(f_sub, i_sim, n_images) this_beta = _pick_param(beta, i_sim, n_images) this_f_sub_alt = _pick_param(f_sub_alt, i_sim, n_images) this_beta_alt = _pick_param(beta_alt, i_sim, n_images) params = np.asarray([this_f_sub, this_beta]).reshape((1, 2)) params_alt = np.asarray([this_f_sub_alt, this_beta_alt]).reshape( (1, 2)) params_eval = np.vstack( (params, params_alt, params_ref)) if mine_gold else None logger.debug("Numerator hypothesis: f_sub = %s, beta = %s", this_f_sub, this_beta) if mine_gold: logger.debug("Evaluating joint log likelihood at %s", params_eval) # Simulate sim = LensingObservationWithSubhalos( m_200_min_sub=1.0e7 * M_s, m_200_max_sub_div_M_hst=0.01, m_min_calib=1.0e7 * M_s, m_max_sub_div_M_hst_calib=0.01, f_sub=this_f_sub, beta=this_beta, params_eval=params_eval, calculate_joint_score=mine_gold, draw_host_mass=draw_host_mass, draw_host_redshift=draw_host_redshift, draw_alignment=draw_alignment, calculate_msub_derivatives=calculate_dx_dm, roi_size=roi_size, ) # Store information if calculate_dx_dm: sum_abs_dx_dm = np.sum(np.abs(sim.grad_msub_image).reshape( sim.grad_msub_image.shape[0], -1), axis=1) sub_latents = np.vstack( (sim.m_subs, sim.theta_xs, sim.theta_ys, sum_abs_dx_dm)).T if return_dx_dm: all_dx_dm.append(sim.grad_msub_image) else: sub_latents = np.vstack((sim.m_subs, sim.theta_xs, sim.theta_ys)).T global_latents = [ sim.M_200_hst, # Host mass sim.D_l, # Host distance sim.z_l, # Host redshift sim.sigma_v, # sigma_V sim.theta_x_0, # Source offset x sim.theta_y_0, # Source offset y sim.theta_E, # Host Einstein radius sim.n_sub_roi, # Number of subhalos sim.f_sub_realiz, # Fraction of halo mass in subhalos sim. n_sub_in_ring, # Number of subhalos with r < 90% of host Einstein radius sim. f_sub_in_ring, # Fraction of halo mass in subhalos with r < 90% of host Einstein radius sim. n_sub_near_ring, # Number of subhalos with r within 10% of host Einstein radius sim. 
f_sub_near_ring, # Fraction of halo mass in subhalos with r within 10% of host Einstein radius ] global_latents = np.asarray(global_latents) all_params.append(params) all_params_alt.append(params_alt) all_images.append(sim.image_poiss_psf) all_sub_latents.append(sub_latents) all_global_latents.append(global_latents) if mine_gold: all_log_r_xz.append(_extract_log_r(sim, 0, n_thetas_marginal)) all_log_r_xz_alt.append(_extract_log_r(sim, 1, n_thetas_marginal)) all_t_xz.append(sim.joint_scores[0]) all_t_xz_alt.append(sim.joint_scores[1]) if calculate_dx_dm and return_dx_dm: return ( np.array(all_params).reshape((-1, 2)), np.array(all_params_alt).reshape((-1, 2)), np.array(all_images), np.array(all_t_xz) if mine_gold else None, np.array(all_t_xz_alt) if mine_gold else None, np.array(all_log_r_xz) if mine_gold else None, np.array(all_log_r_xz_alt) if mine_gold else None, all_sub_latents, np.array(all_global_latents), ) return ( np.array(all_params).reshape((-1, 2)), np.array(all_params_alt).reshape((-1, 2)), np.array(all_images), np.array(all_t_xz) if mine_gold else None, np.array(all_t_xz_alt) if mine_gold else None, np.array(all_log_r_xz) if mine_gold else None, np.array(all_log_r_xz_alt) if mine_gold else None, all_sub_latents, np.array(all_global_latents), )
def __init__(self, target_F): self.target_f = target_F self.uniform = stats.uniform()
while i<n: d = min(n-i, 750) u = self.uniform.rvs(size=d) sample = np.concatenate( [sample, fsolve( lambda y:self.target_f(y) - u, 0.5*np.ones(d) )] ) i+=d return sample if __name__=="__main__": F = lambda x: (x+x**2 + x**(5))/3. f = lambda x: (1 + 2*x + 5*x**(4))/3. N = 1e4 print "Testing AR Method." print "Generate %d variables:"%N g = stats.uniform() M = f(1) ar = AR_method( target_f = f, sample_g = g, M = M) start = time.clock() ar_test = ar.generateIII( N ) print "Mean: %.3f, time taken: %.2f seconds"%(ar_test.mean(), time.clock() - start ) print print "Testing Inverse Method." print "Generate %d variables:"%N iv = Inversion_method( target_F = F) start = time.clock() iv_test = iv.generateII( N ) print "Mean: %.3f, time taken: %.2f seconds"%(iv_test.mean(), time.clock() - start )
def randomised_search(self): print_to_consol('Running randomized search to find best classifier') #create the decision forest clf1 = DecisionTreeClassifier(random_state=20, class_weight='balanced', max_features=self.numf) ada = AdaBoostClassifier(base_estimator=clf1, algorithm="SAMME.R", random_state=55) logging.info(f'Initialised classifier using balanced class weights \n') #set up randomized search param_dict = { 'base_estimator__criterion': ['gini', 'entropy'], 'n_estimators': randint(100, 10000), #number of base estimators to use 'learning_rate': uniform(0.0001, 1.0), 'base_estimator__min_samples_split': randint(2, 20), 'base_estimator__max_depth': randint(1, 10), 'base_estimator__min_samples_leaf': randint(1, 20), 'base_estimator__max_leaf_nodes': randint(10, 20) } logging.info( f'Following parameters will be explored in randomized search \n' f'{param_dict} \n') #building and running the randomized search rand_search = RandomizedSearchCV(ada, param_dict, random_state=5, cv=self.cv, n_iter=self.numc, scoring='accuracy', n_jobs=-1) rand_search_fitted = rand_search.fit(self.X_train, self.y_train) best_parameters = rand_search_fitted.best_params_ best_scores = rand_search_fitted.best_score_ logging.info( f'Running randomised search for best patameters of classifier \n' f'Best parameters found: {best_parameters} \n' f'Best accuracy scores found: {best_scores} \n') self.model = rand_search_fitted.best_estimator_ datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M') joblib.dump( self.model, os.path.join(self.directory, 'best_predictor_' + datestring + '.pkl')) logging.info(f'Writing best classifier to disk in {self.directory} \n') print_to_consol( 'Getting 95% confidence interval for uncalibrated classifier') alpha, upper, lower = get_confidence_interval( self.X_train, self.y_train, self.X_test, self.y_test, self.model, self.directory, self.bootiter, 'uncalibrated') logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n' f'for uncalibrated classifier. \n') print_to_consol('Getting feature importances for best classifier') best_clf_feat_import = self.model.feature_importances_ best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import, self.X_train.columns), reverse=True) logging.info( f'Feature importances for best classifier {best_clf_feat_import_sorted} \n' ) all_clf_feat_import_mean = np.mean( [tree.feature_importances_ for tree in self.model.estimators_], axis=0) all_clf_feat_import_mean_sorted = sorted(zip(all_clf_feat_import_mean, self.X_train.columns), reverse=True) print_to_consol('Plotting feature importances for best classifier') feature_importances_best_estimator(best_clf_feat_import_sorted, self.directory) logging.info( f'Plotting feature importances for best classifier in decreasing order \n' ) feature_importances_error_bars(self.model, self.X_train.columns, self.directory) logging.info( f'Plotting feature importances for best classifier with errorbars \n' )
def make_pi_gen(): """ piC = fraction of stars belonging to the gaussian """ return stats.uniform(loc=0, scale=1) #Uniform distribution from 0 - 1
# Let's use **logit** or **inverse sigmoid** transformation to map the support to real number line. Mathematically, $\zeta=logit(\theta)$. # # $$ # P(\zeta) = P(T^{-1}(\zeta)) |det J_{T^{-1}}(\zeta)|\\ # P(\zeta) = P(sig(\zeta)) * sig(\zeta) * (1-sig(\zeta)) # $$ # # where $sig$ is the sigmoid function. # # Converting this directly into Python code - # + theta = np.linspace(0, 1, 100) zeta = np.linspace(-5, 5, 100) dist = uniform() p_theta = dist.pdf(theta) sigmoid = sp.special.expit p_zeta = dist.pdf(sigmoid(zeta)) * sigmoid(zeta) * (1 - sigmoid(zeta)) plot_transformation(theta, zeta, p_theta, p_zeta) # - # ### Mean Field ADVI Example # # Infer $\mu$ and $\sigma$ for Normal distribution. # Generating data mu = 12 sigma = 2.2 data = np.random.normal(mu, sigma, size=200)
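# Sanity check for the logit-transformed uniform earlier in this cell: a
# change of variables must preserve total probability, and a trapezoidal
# rule over a wide grid confirms the transformed density integrates to ~1.
zeta_wide = np.linspace(-15, 15, 3001)
p = dist.pdf(sigmoid(zeta_wide)) * sigmoid(zeta_wide) * (1 - sigmoid(zeta_wide))
print(np.trapz(p, zeta_wide))  # ~1.0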
val_errors.append(mean_squared_error(y_val_predict, y_val)) plt.plot(np.sqrt(train_errors), "r-", linewidth=2, label="train") plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val") plt.ylim(0, 1) plt.legend(loc="upper right", fontsize=14) plt.xlabel("Training set size", fontsize=14) plt.ylabel("RMSE(log(y))", fontsize=14) ## Run plot_learning_curves(SVR(), train_prepared, price_labels) #%% param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)} rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42) rnd_search_cv.fit(train_prepared, price_labels) #%% rnd_search_cv.best_estimator_ #%% #Random Forest Regressor from sklearn.linear_model import Ridge
from sklearn import linear_model, datasets from sklearn.model_selection import RandomizedSearchCV # Load data iris = datasets.load_iris() X = iris.data y = iris.target # Create logistic regression logistic = linear_model.LogisticRegression() # Create regularization penalty space penalty = ['l1', 'l2'] # Create regularization hyperparameter distribution using uniform distribution C = uniform(loc=0, scale=4) # Create hyperparameter options hyperparameters = dict(C=C, penalty=penalty) # Create randomized search 5-fold cross validation and 100 iterations clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1) # Fit randomized search best_model = clf.fit(X, y)
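# Inspecting the outcome (standard scikit-learn attributes): fit() returns the
# fitted search object, so the winning hyperparameters are available via
# best_estimator_.
print('Best penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])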
def testUniformVariance(self): a = 10.0 b = 100.0 uniform = tfd.Uniform(low=a, high=b) s_uniform = sp_stats.uniform(loc=a, scale=b - a) self.assertAllClose(self.evaluate(uniform.variance()), s_uniform.var())
C = 0 for i in range(m): C += weights[i] * obj_functions[i] return C if __name__ == "__main__": from scipy.stats import uniform n_design_variables = 3 n_obj_functions = 2 n_simulations = 100000 x_sampler = [uniform(-5, 10) for i in range(n_design_variables)] column_names = ['id'] \ + ['x{}'.format(i+1) for i in range(n_design_variables)] \ + ['f{}'.format(i+1) for i in range(n_obj_functions)] data = [] for i in range(n_simulations): x = [u.rvs() for u in x_sampler] f1, f2 = kurasawe(x) row = [i] + x + [f1, f2] data.append([i] + x + [f1, f2]) df = pd.DataFrame(data, columns=column_names) print(df) import matplotlib.pyplot as plt
def testUniformStd(self): a = 10.0 b = 100.0 uniform = tfd.Uniform(low=a, high=b) s_uniform = sp_stats.uniform(loc=a, scale=b - a) self.assertAllClose(self.evaluate(uniform.stddev()), s_uniform.std())
import matplotlib.pyplot as plt import numpy as np import pandas as pd import pymc3 as pm from scipy.stats import norm, uniform import seaborn as sns # Config os.chdir("/home/jovyan/work") %config InlineBackend.figure_format = 'retina' %matplotlib inline plt.rcParams["figure.figsize"] = (10, 5) np.random.seed(42) # Prepare the data x = uniform(0, 20).rvs(30) eps = norm(0, 4).rvs(30) y = 11 + 3*x + eps plt.scatter(x, y); # Sampling with pm.Model() as model: b_0 = pm.Normal("b_0", mu=0, sd=10) b_1 = pm.Normal("b_1", mu=0, sd=2) e = pm.HalfCauchy("e", 2) mu = pm.Deterministic("mu", b_0 + b_1*x) Y = pm.Normal("Y", mu=mu, sd=e, observed=y) trace = pm.sample(10000, step=pm.Metropolis()) pm.traceplot(trace[2000:]); plt.savefig("./results/4-11-regression-half-cauchy.png")
data_x = np.array([1., 2., 3.]) data_y = np.array([1.4, 1.7, 4.1]) data_yerr = np.array([0.2, 0.15, 0.2]) # Define the loglikelihood function def loglikelihood(theta): y = theta[1] * data_x + theta[0] chisq = np.sum(((data_y - y) / data_yerr)**2) return -chisq / 2. if __name__ == '__main__': # Set up the list of sampled parameters: the prior is Uniform(-5:5) -- # we are using a fixed uniform prior from scipy.stats parm_names = list(['m', 'b']) sampled_parameters = [SampledParameter(name=p, prior=uniform(loc=-5.0,scale=10.0)) for p in parm_names] # Set the active point population size population_size = 100 # Setup the Nested Sampling run n_params = len(sampled_parameters) print("Sampling a total of {} parameters".format(n_params)) #population_size = 10 print("Will use NS population size of {}".format(population_size)) # Construct the Nested Sampler MNNS = MultiNestNestedSampling(sampled_parameters, loglikelihood, population_size) #print(PCNS.likelihood(np.array([1.0]))) #quit() # run it
# coding:utf8 import numpy as np import matplotlib.pyplot as plt from pathlib import Path import pickle from scipy import stats as sts np.random.seed(0) plt.switch_backend('agg') root = Path('./ass2/savedoc/') sigmas = [0.1, 0.5, 1, 2, 5] num = int(1e4) tpdf = sts.t(1).pdf urv = sts.uniform(0, 1) datadic = {} accrates = {} if not (root / 'p6.pkl').is_file(): for sigma in sigmas: nrv = sts.norm(0, sigma) xk = 1 datalst = [] us = [] probs = [] while len(datalst) < num: eps = nrv.rvs() u = urv.rvs() y = xk + eps prob = np.min([tpdf(y) / tpdf(xk), 1]) if u <= prob:
def get_pd(**kwargs): """ Get probability distribution. Returns ------- pd : scipy.stats.rv_frozen Desired probability distribution. Other Parameters ---------------- use_hidden : bool pdf : str scale : str a : float b : float hidden_for_a : list or tuple hidden_for_b : list or tuple hidden_pdf : str """ allowed_kwargs = { 'use_hidden', 'pdf', 'scale', 'a', 'b', 'hidden_for_a', 'hidden_for_b', 'hidden_pdf' } for key in allowed_kwargs: if key not in kwargs: raise ValueError( 'You did not input enough or correct keyword argument.') use_hidden = kwargs['use_hidden'] pdf = kwargs['pdf'] scale = kwargs['scale'] a = kwargs['a'] b = kwargs['b'] hidden_for_a = kwargs['hidden_for_a'] hidden_for_b = kwargs['hidden_for_b'] hidden_pdf = kwargs['hidden_pdf'] if use_hidden: if pdf == 'uniform': pd = RandUniform(hidden_for_a, hidden_for_b, scale, hidden_pdf) elif pdf == 'normal': if scale == 'linear': pd = RandTruncnorm(hidden_for_a, hidden_for_b, 0, np.inf, scale, hidden_pdf) elif scale == 'log10': pd = RandTruncnorm(hidden_for_a, hidden_for_b, -np.inf, np.inf, scale, hidden_pdf) else: raise ValueError( 'You did not input enough or correct keyword argument.') else: raise ValueError( 'You did not input enough or correct keyword argument.') else: if pdf == 'uniform': pd = uniform(a, b - a) elif pdf == 'normal': if scale == 'linear': _a = (0 - a) / b _b = (np.inf - a) / b pd = truncnorm(_a, _b, loc=a, scale=b) elif scale == 'log10': pd = norm(loc=a, scale=b) else: raise ValueError( 'You did not input enough or correct keyword argument.') else: raise ValueError( 'You did not input enough or correct keyword argument.') return pd
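# Usage sketch for the non-hidden uniform branch above: (a, b) are the
# interval endpoints, converted internally to scipy's loc/scale; the unused
# hidden_* keywords still have to be supplied (here as None) because the
# function validates that all keys are present.
pd = get_pd(use_hidden=False, pdf='uniform', scale='linear', a=1.0, b=3.0,
            hidden_for_a=None, hidden_for_b=None, hidden_pdf=None)
assert pd.ppf(0.0) == 1.0 and pd.ppf(1.0) == 3.0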
def new_pd(self): self._new_para() self.pd = uniform(loc=self.loc, scale=self.scale) self.pd.random_state.seed() # re-seed
def make_v_gen(): """ Generates prior for gaussian X velocity """ return stats.uniform(loc=-4, scale=12)
def __init__(self, a_range, b_range, scale, hidden_pdf): super(RandUniform, self).__init__(a_range, b_range, scale, hidden_pdf) self._new_para() self.pd = uniform(loc=self.loc, scale=self.scale) self.pd.random_state.seed() # re-seed
def find_best_model_parameters( X_train: np.array, y_train: np.array, X_val: np.array, y_val: np.array, model, pca: PCA, best_k: int, n_jobs: int = 1, n_iter: int = 60, ) -> dict: est = model() split, X_combined, y_combined = get_train_test_split( X_train, y_train, X_val, y_val) if model == SVR: distributions = { 'model__C': loguniform(1e-1, 1e3), 'model__gamma': loguniform(1e-4, 1e0), 'model__kernel': ['poly', 'rbf', 'sigmoid'], 'model__degree': [1, 2, 3, 4, 5, 6], 'model__cache_size': [500], } elif model == RandomForestRegressor: distributions = { 'model__n_estimators': randint(10, 2000), 'model__max_features': uniform(0.01, 0.99), # 0.01-1.0 'model__max_depth': randint(10, 110), 'model__min_samples_split': randint(2, 10), 'model__min_samples_leaf': randint(1, 10), 'model__bootstrap': [True, False], 'model__n_jobs': [n_jobs], } # I think its more efficient to parallelize each RF instead of the search as it is possible for a lot of cores # to idle when all param sets are done except for one really long running one n_jobs = 1 else: print( f'HP search for {str(model)} is not implemented. Returning default parameters.' ) return est pipe = Pipeline(steps=[ ('kbest', SelectKBest(score_func=f_regression, k=best_k)), ('pca', pca), ('model', est), ]) search = RandomizedSearchCV(pipe, distributions, cv=split, n_iter=n_iter, n_jobs=n_jobs, refit=False) search.fit(X_combined, y_combined) # with pd.option_context('display.max_rows', None, 'display.max_columns', None): # print(pd.DataFrame(search.cv_results_)) # Remove 'model__' from keys best_params = {k[7:]: v for k, v in search.best_params_.items()} # print('best model parameters:', best_params) return model(**best_params)
import numpy as np import matplotlib.pyplot as plt from scipy.stats import poisson, uniform, norm from scipy.integrate import trapz # invent a "recurrence time" T = 150 L = 1000 NPTS = 100 NSAMP = 10000 ones = np.ones(NSAMP) # set poission recurrence for earthquakes T_prior = poisson(T) X_prior = uniform(0, L) # get the samples Dt = T_prior.rvs(size=NSAMP) x = X_prior.rvs(size=NSAMP) t = np.cumsum(Dt) def likelihood_data_mu(data, mu): return np.prod([poisson.pmf(d, mu) for d in data]) def post_mu(data): mu_mean = np.average(data) mu_std = np.std(data) mu_prior = norm(mu_mean, mu_std) mu = np.linspace(mu_prior.ppf(0.01), mu_prior.ppf(0.99), NPTS)
mock_log_model.assert_called_once() query = "tags.{} = '{}'".format(MLFLOW_PARENT_RUN_ID, run.info.run_id) assert len(mlflow.search_runs([run.info.experiment_id])) == 1 assert len(mlflow.search_runs([run.info.experiment_id], query)) == 0 @pytest.mark.parametrize( "cv_class, search_space", [ (sklearn.model_selection.GridSearchCV, { "kernel": ("linear", "rbf"), "C": [1, 5, 10] }), (sklearn.model_selection.RandomizedSearchCV, { "C": uniform(loc=0, scale=4) }), ], ) @pytest.mark.parametrize("backend", [None, "threading", "loky"]) def test_parameter_search_estimators_produce_expected_outputs( cv_class, search_space, backend): mlflow.sklearn.autolog() svc = sklearn.svm.SVC() cv_model = cv_class(svc, search_space, n_jobs=5, return_train_score=True) X, y = get_iris() def train_cv_model(): if backend is None: cv_model.fit(X, y)
#First one must be zero, for the prior. DataIndices = [0, 1, 2, 100] #True regression parameters that we wish to recover. Do not set these outside the range of [-1,1] a0 = -0.3 a1 = 0.5 NPoints = 100 #Number of (x,y) training points noiseSD = 0.2 #True noise standard deviation priorPrecision = 2.0 #Fix the prior precision, alpha. We will use a zero-mean isotropic Gaussian. likelihoodSD = noiseSD # Assume the likelihood precision, beta, is known. likelihoodPrecision = 1.0 / (likelihoodSD**2) #Because of how axes are set up, x and y values should be in the same range as the coefficients. x = 2 * uniform().rvs(NPoints) - 1 y = a0 + a1 * x + norm(0, noiseSD).rvs(NPoints) def MeanCovPost(x, y): #Given data vectors x and y, this returns the posterior mean and covariance. X = np.array([[1, x1] for x1 in x]) Precision = np.diag( [priorPrecision] * 2) + likelihoodPrecision * X.T.dot(X) Cov = np.linalg.inv(Precision) Mean = likelihoodPrecision * Cov.dot(X.T.dot(y)) return {'Mean': Mean, 'Cov': Cov} def GaussPdfMaker(mean, cov): #For a given (mean, cov) pair, this returns a vectorized pdf function.
f = interpolate.interp1d(x, y, kind='quadratic') x_new = np.arange(0, 10, 0.1) y_new = f(x_new) # Generate normal distribution mu = 2.0 sigma = 0.5 norm_rv = sts.norm(loc=mu, scale=sigma) x = norm_rv.rvs(size=4) # [2.42471807, 2.89001427, 1.5406754 , 2.218372] # Generate uniform distribution a = 1 b = 4 uniform_rv = sts.uniform(a, b - a) x = uniform_rv.rvs(size=4) # [2.90068986, 1.30900927, 2.61667386, 1.82853085] # Generate Bernoulli distribution p = 0.7 bernoulli_rv = sts.bernoulli(p) x = bernoulli_rv.rvs(size=4) # [1, 1, 1, 0] # Generate binomial distribution n = 20 p = 0.7 binom_rv = sts.binom(n, p) x = binom_rv.rvs(size=4) # [13, 15, 13, 14]
steps = [ ('extract', FBCSP(fs,4,40,4,4,n_components=4)), ('select', SelectKBest()), ('classify',SVC()) ] pipeline = Pipeline(steps = steps) param_dist = {'extract__n_components':[4], 'extract__fs':[fs], 'extract__f_low':[4], 'extract__f_high':[40], 'extract__bandwidth':[4], 'extract__step':[4], 'select__score_func':[mutual_info_classif], 'select__k':randint(1,145), 'classify__C':uniform(1e-2,1e2), 'classify__kernel':['linear'] } kappa_corr = lambda target,output : (cohen_kappa_score(target,output)+1)/2 search = RandomizedSearchCV(pipeline, param_distributions=param_dist, scoring=make_scorer(kappa_corr), n_iter=5,n_jobs=5,verbose=10,cv=10) search.fit(Xdata,labels) cv_results = search.cv_results_ cv_results = pd.DataFrame.from_dict(cv_results) cv_results.to_csv(savename)
def make_sig_gen(): """ Generates prior for gaussian semi-MAJOR axis vel standard deviation """ return stats.uniform(loc=0, scale=8)