import numpy as np
import pandas as pd
from pymc import gp


def new_gpr_draws(df2, response, amplitude, prior, scale, has_data, draws):
    '''
    (DataFrame, str, float, array-like, float, bool, int) -> array

    Using the input parameters given above, runs an instance of Gaussian
    process smoothing in order to account for years where we do not have
    data. The data frame (df2) is specific to a location-age.
    '''
    all_indices = df2.index
    data_indices = df2[df2.iloc[:, -4]].index   # boolean "has data" flag, 4th column from the end
    years = df2.loc[all_indices]["year"].values

    data = pd.DataFrame({"year": years, "prior": prior})
    data.sort_values("year", inplace=True)
    data.drop_duplicates(inplace=True)

    def mean_function(x):
        return np.interp(x, data.year.values, data.prior.values)

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                      amp=amplitude, scale=scale)

    df4 = df2.loc[data_indices]
    if has_data:
        gp.observe(M=M, C=C,
                   obs_mesh=df4.year.values,
                   obs_vals=df4[response].values,
                   obs_V=(df4[response + "_sd"].values) ** 2 +
                         df4[response + "_nsv"].values)

    ca_draws = np.array([gp.Realization(M, C)(years) for d in range(draws)]).T
    return ca_draws
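# A minimal usage sketch for new_gpr_draws (not from the source). The column
# names "mort", "mort_sd", "mort_nsv", the padding column, and the position of
# the boolean has-data flag (4th column from the end) are assumptions made for
# this toy example; real inputs come from a spacetime model for one location-age.
import numpy as np
import pandas as pd

years = np.arange(1990, 2011)
toy = pd.DataFrame({
    "year": years,
    "mort": np.where(years % 5 == 0, 0.1 + 0.01 * np.random.randn(len(years)), np.nan),
})
toy["has_datapoint"] = toy["mort"].notnull()   # picked up by df2.iloc[:, -4]
toy["mort_sd"] = 0.01                          # sampling standard error
toy["mort_nsv"] = 1e-4                         # non-sampling variance
toy["pad"] = 0                                 # keeps the flag 4 columns from the end

prior = np.full(len(years), 0.1)               # stand-in spacetime prior, one value per year
ca_draws = new_gpr_draws(toy, "mort", amplitude=0.05, prior=prior,
                         scale=40, has_data=True, draws=100)
print(ca_draws.shape)                          # (n_years, n_draws)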
def _setup(self):
    '''Given the current set of params, setup the interpolator.'''
    x, y, dy = self._regularize()

    if self.diff_degree is None:
        self.diff_degree = 3

    if self.amp is None:
        self.amp = num.std(y - self.mean(x))

    if self.scale is None:
        #self.scale = (self.x.max() - self.x.min())/2
        self.scale = 30

    self.M = GP.Mean(self.mean)
    self.C = GP.Covariance(GP.matern.euclidean, diff_degree=self.diff_degree,
                           amp=self.amp, scale=self.scale)

    GP.observe(self.M, self.C, obs_mesh=x, obs_vals=y, obs_V=num.power(dy, 2))
    self.setup = True
    self.realization = None
def collect_data(self):
    obs = np.random.uniform(-10, 10, (10, 2))   # 10 points in the plane
    # A lower obs_V makes the GP track the data closely; a higher value lets
    # the fit stray further from the observations.
    V = np.array([.002, .002])
    data = self.sinxfun(obs)                    # z-values at the sampled points
    # gp.observe conditions the current mean and covariance on the data,
    # updating self.M and self.C in place.
    gp.observe(self.M, self.C,
               obs_mesh=obs,    # input locations of the data points
               obs_V=V,         # observation (noise) variance
               obs_vals=data)   # observed output values
def gaussian_process(self):
    """ return a PyMC Gaussian Process mean and covariance to interpolate the
    population-by-age mesh/value data
    """
    # TODO: make this evaluate the function on arange(MAX_AGE) and store the
    # results in the db for better performance
    M, C = uninformative_prior_gp(c=0., diff_degree=2., amp=10., scale=200.)
    gp.observe(M, C,
               self.params['mesh'] + [MAX_AGE],
               self.params['vals'] + [0.],
               0.0)
    return M, C
def gaussian_process(self):
    """ return a PyMC Gaussian Process mean and covariance to interpolate the
    population-by-age mesh/value data
    """
    from pymc import gp
    from dismod3.bayesian_models import probabilistic_utils

    M, C = probabilistic_utils.uninformative_prior_gp(c=0., diff_degree=2.,
                                                      amp=10., scale=200.)
    gp.observe(M, C, self.data['mesh'], self.data['vals'], 0.0)
    return M, C
def gp_puzzle_nub(diff_degree=2., amp=1., scale=1.5, steps=100):
    """ Generate a puzzle nub connecting point a to point b """
    M, C = uninformative_prior_gp(0., diff_degree, amp, scale)
    gp.observe(M, C, data.puzzle_t, data.puzzle_x, data.puzzle_V)
    GPx = gp.GPSubmodel('GP', M, C, pl.arange(1))
    X = GPx.value.f(pl.arange(0., 1.0001, 1. / steps))

    M, C = uninformative_prior_gp(0., diff_degree, amp, scale)
    gp.observe(M, C, data.puzzle_t, data.puzzle_y, data.puzzle_V)
    GPy = gp.GPSubmodel('GP', M, C, pl.arange(1))
    Y = GPy.value.f(pl.arange(0., 1.0001, 1. / steps))

    return X, Y
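# Several examples in this collection call a helper named uninformative_prior_gp
# that is not shown here. The sketch below is a hypothetical reconstruction
# (an assumption, not the source implementation, and the default argument
# values are invented): a GP prior with a constant mean at level c and a
# Matern covariance built from the remaining hyperparameters.
import numpy as np
from pymc import gp

def uninformative_prior_gp(c=-10., diff_degree=2., amp=10., scale=200.):
    def constant(x):
        return np.zeros(len(x)) + c          # constant mean function at level c
    M = gp.Mean(constant)
    C = gp.Covariance(gp.matern.euclidean, diff_degree=diff_degree,
                      amp=amp, scale=scale)
    return M, C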
def setK(self, k):
    if k != self._k:
        deltaK = (k - self._k)
        self._k = k
        self._adjustedBounds[0] -= deltaK
        self._adjustedBounds[1] += deltaK
        self._cov = _GP.Covariance(self._c, amp=1, r=self._k)
        self._cov_matrix = self._cov(self._fiber, self._fiber)
        self._cov_inv_matrix = linalg.inv(self._cov_matrix)
        self._alpha = numpy.asmatrix(
            numpy.dot(numpy.ones(len(self._fiber)), self._cov_inv_matrix)).T
        self._mean = _GP.Mean(_constant, val=0)
        self._gp = _GP.observe(self._mean, self._cov,
                               obs_mesh=self._fiber,
                               obs_vals=numpy.ones(len(self._fiber)),
                               obs_V=numpy.zeros(len(self._fiber)) + self._epsilon)
        if self._precomputedVariance is not None:
            self.precomputeVarianceField()
        if self._precomputedMean is not None:
            self.precomputeMeanField()
def create_gps(self):
    """ Create the smoothness GP gps and the diffusion-associated blurring GP gpd.

    ToDo: compute gpd
    """
    # Create the mean function
    self._means = gp.Mean(constant_mean, val=0)

    # Create the covariance function. The covariance is multiplied by amp**2,
    # which effectively multiplies realizations by amp: a larger amp means
    # that realizations will deviate further from their mean.
    self._covs = gp.Covariance(self._cs_pymc, amp=1, R=self._r)
    self.covs_matrix = self._covs(self._fiber, self._fiber)

    # Parameters needed to compute the inner product
    self._invcovs_matrix = numpy.linalg.inv(self.covs_matrix)
    self.alphas = numpy.asmatrix(
        numpy.dot(numpy.ones(len(self._fiber)), self._invcovs_matrix)).T

    # Normally-distributed observations on the Gaussian process distribution
    self.gps = gp.observe(
        self._means, self._covs,
        obs_mesh=self._fiber,
        obs_vals=numpy.ones(len(self._fiber)),
        obs_V=numpy.zeros(len(self._fiber)) + self._observed_variance)
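# A quick standalone illustration of the amp comment above (a sketch, not part
# of the class): realizations drawn from a covariance with amplitude a deviate
# from the mean with a spread on the order of a.
import numpy as np
from pymc import gp

def zero_mean(x):
    return np.zeros(len(x))

mesh = np.linspace(0., 10., 50)
for a in (1., 5.):
    M = gp.Mean(zero_mean)
    C = gp.Covariance(gp.matern.euclidean, diff_degree=2., amp=a, scale=3.)
    f = gp.Realization(M, C)
    print(a, np.std(f(mesh)))   # larger amp -> larger spread of the realization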
def mortality(self, key="all-cause_mortality", data=None):
    """ Calculate the all-cause mortality rate for the region and sex of
    disease_model, and return it in an array corresponding to age_mesh

    Parameters
    ----------
    key : str, optional
      of the form 'all-cause_mortality+gbd_region+year+sex'
    data : list, optional
      the data list to extract all-cause mortality from
    """
    if key in self.params.get("initial_value", {}):
        return self.get_initial_value(key)

    if not data:
        data = self.filter_data("all-cause_mortality data")

    if len(data) == 0:
        return NEARLY_ZERO * np.ones(len(self.get_estimate_age_mesh()))
    else:
        M, C = uninformative_prior_gp(c=-1.0, scale=300.0)
        age = []
        val = []
        V = []
        for d in data:
            scale = self.extract_units(d)
            a0 = d.get("age_start", MISSING)
            a1 = d.get("age_end", MISSING)
            y = self.value_per_1(d)
            se = self.se_per_1(d)

            if se == MISSING:
                se = 0.01
            if MISSING in [a0, a1, y]:
                continue

            age.append(0.5 * (a0 + a1))
            val.append(y + 0.00001)
            V.append(se ** 2.0)

        if len(data) > 0:
            gp.observe(M, C, age, mc.logit(val), V)

        normal_approx_vals = mc.invlogit(M(self.get_estimate_age_mesh()))
        self.set_initial_value(key, normal_approx_vals)
        return self.get_initial_value(key)
import pdb

import numpy as np
import pandas as pd
from pymc import gp


def fit_gpr(df, amp, obs_variable='observed_data',
            obs_var_variable='obs_data_variance',
            mean_variable='st_prediction', year_variable='year',
            scale=40, diff_degree=2, draws=0):
    initial_columns = list(df.columns)

    data = df.loc[(pd.notnull(df[obs_variable])) &
                  (pd.notnull(df[obs_var_variable]))]
    mean_prior = df[[year_variable, mean_variable]].drop_duplicates()

    def mean_function(x):
        return np.interp(x, mean_prior[year_variable], mean_prior[mean_variable])

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean,
                      diff_degree=diff_degree, amp=amp, scale=scale)

    if len(data) > 0:
        gp.observe(M=M, C=C,
                   obs_mesh=data[year_variable],
                   obs_V=data[obs_var_variable],
                   obs_vals=data[obs_variable])

    model_mean = M(mean_prior[year_variable]).T
    #model_variance = np.diagonal(C(p_years,p_years)).T
    model_variance = C(mean_prior[year_variable])

    try:
        model_lower = model_mean - np.sqrt(model_variance) * 1.96
    except:
        pdb.set_trace()
    model_upper = model_mean + np.sqrt(model_variance) * 1.96

    if draws > 0:
        realizations = [gp.Realization(M, C)(range(min(mean_prior['year']),
                                                   max(mean_prior['year']) + 1))
                        for i in range(draws)]

        real_draws = pd.DataFrame({year_variable: mean_prior[year_variable],
                                   'gpr_mean': model_mean,
                                   'gpr_var': model_variance,
                                   'gpr_lower': model_lower,
                                   'gpr_upper': model_upper})
        for i, r in enumerate(realizations):
            real_draws["draw" + str(i)] = r

        real_draws = pd.merge(df, real_draws, on=year_variable, how='left')
        gpr_columns = list(set(real_draws.columns) - set(initial_columns))
        initial_columns.extend(gpr_columns)
        return real_draws[initial_columns]
    else:
        results = pd.DataFrame({year_variable: mean_prior[year_variable],
                                'gpr_mean': model_mean,
                                'gpr_var': model_variance,
                                'gpr_lower': model_lower,
                                'gpr_upper': model_upper})
        gpr_columns = list(set(results.columns) - set(initial_columns))
        initial_columns.extend(gpr_columns)
        results = pd.merge(df, results, on=year_variable, how='left')
        return results
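# A minimal usage sketch for fit_gpr above (synthetic numbers, not from the
# source). The column names match the function's defaults; amp and the
# variances are arbitrary illustrative values.
import numpy as np
import pandas as pd

years = np.arange(1990, 2011)
toy = pd.DataFrame({
    'year': years,
    'st_prediction': 0.30 + 0.002 * (years - 1990),       # first-stage prior
    'observed_data': np.where(years % 5 == 0,
                              0.30 + 0.02 * np.random.randn(len(years)),
                              np.nan),
})
toy['obs_data_variance'] = np.where(toy['observed_data'].notnull(), 0.02 ** 2, np.nan)

out = fit_gpr(toy, amp=0.1, draws=10)
print(out[['year', 'gpr_mean', 'gpr_lower', 'gpr_upper']].head())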
def normal_approx(asrf):
    """ This 'normal approximation' of the age-specific rate function is
    formed by using each rate to produce an estimate of the age-specific
    rate, and then saying that the logit of the true rate function is a
    Gaussian process and these age-specific rates are observations of this
    Gaussian process.

    This is less valid and less accurate than using MCMC or MAP on the vars
    produced by the model_rate_list method below, but maybe it will be faster.
    """
    M, C = uninformative_prior_gp()

    # use prior to set rate near zero as requested
    for prior_str in asrf.fit.get('priors', '').split('\n'):
        prior = prior_str.split()
        if len(prior) > 0 and prior[0] == 'zero':
            age_start = int(prior[1])
            age_end = int(prior[2])
            gp.observe(M, C, range(age_start, age_end + 1), [-10.], [0.])

    for r in asrf.rates.all():
        mesh, obs, V = logit_rate_from_range(r)

        # make sure that there is something to observe
        if len(mesh) == 0:
            continue

        # uncomment the following line to make more inferences than
        # are valid from the data
        #gp.observe(M, C, mesh, obs, V)

        # uncomment the following 2 lines to make fewer inferences than
        # possible: it may be better to waste information than to have
        # false confidence
        ii = len(mesh) // 2
        gp.observe(M, C, [mesh[ii]], [obs[ii]], [V[ii]])

    x = asrf.fit['out_age_mesh']
    na_rate = mc.invlogit(M(x))
    asrf.fit['normal_approx'] = list(na_rate)
    asrf.save()

    return M, C
import numpy as np
from numpy.lib import recfunctions
from matplotlib.mlab import csv2rec, rec2csv   # csv2rec/rec2csv come from older matplotlib releases
from pymc import gp


def fit_GPR(infile, outfile, dv_list, scale, number_submodels, iters):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        all_data = np.delete(
            all_data,
            np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
            axis=0)

    # Investigate error thrown for HKG, MAC, and SGP... they don't have data,
    # but don't know why this is breaking line 62
    all_data = all_data[all_data['iso3'] != "HKG"]
    all_data = all_data[all_data['iso3'] != "MAC"]
    all_data = all_data[all_data['iso3'] != "SGP"]

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([all_data.iso3[i] for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [np.empty(len(country_age_list), 'float')
             for i in range(iters * number_submodels * 2)]
    iso3 = np.empty(len(country_age_list), '|S3')
    # age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_prev'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_prev']) == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # loop through spacetime/linear
            for x, t in enumerate(['spacetime']):

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data[t + '_' + str(m + 1)][ca_data.year == y])
                    for y in year_list])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data[t + '_amplitude_' + str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions
                # to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                                  amp=amplitude, scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M, C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed[t + '_data_variance_' + str(m + 1)],
                               obs_vals=ca_observed['lt_prev'])

                # draw realizations from the data
                realizations = [gp.Realization(M, C) for i in range(iters)]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                # age_group[country_age_list==ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(iters):
                    draws[((2 * m + x) * iters) + i][country_age_list == ca] = \
                        realizations[i](year_list)

    # save the results (age_group is not collected in this variant, so only
    # iso3 and year are written)
    print('Saving GPR results')
    names = ['iso3', 'year']
    results = np.core.records.fromarrays([iso3, year], names=names)
    for m in range(number_submodels):
        for x, t in enumerate(['spacetime']):
            for i in range(iters):
                results = recfunctions.append_fields(
                    results,
                    'gpr_' + str(m + 1) + '_' + t + '_d' + str(i + 1),
                    draws[((2 * m + x) * iters) + i])
            results = recfunctions.append_fields(
                results,
                'gpr_' + str(m + 1) + '_' + t + '_mean',
                np.mean(draws[((2 * m + x) * iters):((2 * m + x + 1) * iters)], axis=0))
    rec2csv(results, outfile)
def fit_gpr(df, amp, obs_variable='observed_data',
            obs_var_variable='obs_data_variance',
            mean_variable='st_prediction', year_variable='year_id',
            scale=10, diff_degree=2, draws=0):
    initial_columns = list(df.columns)

    data = df[(df[obs_variable].notnull()) & (df[obs_var_variable].notnull())]
    mean_prior = df[[year_variable, mean_variable]].drop_duplicates()

    def mean_function(x):
        return np.interp(x, mean_prior[year_variable], mean_prior[mean_variable])

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean,
                      diff_degree=diff_degree, amp=amp, scale=scale)

    if len(data) > 0:
        gp.observe(M=M, C=C,
                   obs_mesh=data[year_variable],
                   obs_V=data[obs_var_variable],
                   obs_vals=data[obs_variable])

    model_mean = M(mean_prior[year_variable]).T
    # model_variance = np.diagonal(C(p_years,p_years)).T
    model_variance = C(mean_prior[year_variable])
    model_lower = model_mean - np.sqrt(model_variance) * 1.96
    model_upper = model_mean + np.sqrt(model_variance) * 1.96

    if draws > 0:
        # The pymc version of drawing realizations is slower than sampling
        # directly from the MVN, but should give the same result:
        #   realizations = [gp.Realization(M, C)(range(1980, 2014))
        #                   for i in range(draws)]
        real_draws = pd.DataFrame({
            year_variable: mean_prior[year_variable],
            'gpr_mean': model_mean,
            'gpr_var': model_variance,
            'gpr_lower': model_lower,
            'gpr_upper': model_upper})
        realizations = np.random.multivariate_normal(
            model_mean,
            C(mean_prior[year_variable], mean_prior[year_variable]),
            draws)
        for i, r in enumerate(realizations):
            real_draws["draw_" + str(i)] = r
        real_draws = pd.merge(df, real_draws, on=year_variable, how='left')
        # gpr_columns = list(set(real_draws.columns) - set(initial_columns))
        gpr_columns = ['gpr_mean', 'gpr_var', 'gpr_lower', 'gpr_upper']
        draw_columns = ['draw_' + str(i) for i in range(draws)]
        initial_columns.extend(gpr_columns)
        initial_columns.extend(draw_columns)
        return real_draws[initial_columns]
    else:
        results = pd.DataFrame({
            year_variable: mean_prior[year_variable],
            'gpr_mean': model_mean,
            'gpr_var': model_variance,
            'gpr_lower': model_lower,
            'gpr_upper': model_upper})
        gpr_columns = list(set(results.columns) - set(initial_columns))
        initial_columns.extend(gpr_columns)
        results = pd.merge(df, results, on=year_variable, how='left')
        return results
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test,
            spacetime_iters, top_submodel):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m + 1)].dtype == 'float64':
            all_data = np.delete(
                all_data,
                np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
                axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i])
                            for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    total_iters = np.sum(spacetime_iters)
    draws = [np.empty(len(country_age_list), 'float') for i in range(total_iters)]
    if (top_submodel > 0):
        top_submodel_draws = [np.empty(len(country_age_list), 'float')
                              for i in range(100)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0) &
                                  (ca_data['test_' + test] == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # keep track of how many iterations have been added for this model
        iter_counter = 0

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # continue making predictions if we actually need draws for this model
            if (spacetime_iters[m] > 0) or (m + 1 == top_submodel):

                # skip models with no spacetime results
                if all_data['spacetime_' + str(m + 1)].dtype != 'float64':
                    for i in range(spacetime_iters[m]):
                        draws[iter_counter][country_age_list == ca] = np.NaN
                        iter_counter += 1
                    if (m + 1 == top_submodel):
                        for i in range(100):
                            top_submodel_draws[i][country_age_list == ca] = np.NaN
                    continue

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data['spacetime_' + str(m + 1)][ca_data.year == y])
                    for y in year_list])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions
                # to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                                  amp=amplitude, scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M, C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed['spacetime_data_variance_' + str(m + 1)],
                               obs_vals=ca_observed[dv])

                # draw realizations from the data
                realizations = [gp.Realization(M, C)
                                for i in range(spacetime_iters[m])]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                age_group[country_age_list == ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(spacetime_iters[m]):
                    try:
                        draws[iter_counter][country_age_list == ca] = \
                            realizations[i](year_list)
                    except:
                        print('Failure in ' + ca)
                    iter_counter += 1

                # if it's the top submodel, do 100 additional draws
                if (m + 1 == top_submodel):
                    realizations = [gp.Realization(M, C) for i in range(100)]
                    for i in range(100):
                        try:
                            top_submodel_draws[i][country_age_list == ca] = \
                                realizations[i](year_list)
                        except:
                            print('Failure in ' + ca)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for i in range(total_iters):
        results = recfunctions.append_fields(results, 'ensemble_d' + str(i + 1),
                                             draws[i])
    if (top_submodel > 0):
        for i in range(100):
            results = recfunctions.append_fields(results,
                                                 'top_submodel_d' + str(i + 1),
                                                 top_submodel_draws[i])
    rec2csv(results, outfile)
def plot2dsurf(self, param1, param2, ax=None, xrange=None, yrange=None,
               bins=30, smooth=False, bfac=2, sfac=1., dd=3, cmap=cm.gray_r,
               levels=[], ccolor='red', fill=False, ccmap=None, falpha=1.0,
               outfile=None, zorder=None):
    '''Plot up a 2D binned parameter plot for [param1] and [param2]. If [ax]
    is supplied, use it to plot, otherwise, open up a new figure and axes.
    You can specify [xrange] and [yrange]. [bins] will be passed to
    histogram2d. If [smooth], the binned surface is smoothed using either a
    bivariate spline or a Gaussian Process (if pymc.gp is available). If
    [cmap] is None, no image is drawn. If [levels] is specified as fractions
    (0.68, 0.95, etc), draw the contours that enclose this fraction of the
    data.'''
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        own_ax = True
    else:
        own_ax = False

    #if ccmap is not None and ccolor is not None:
    #    # Cmap takes precedence
    #    ccolor = None

    tr1 = self.get_trace0(param1)
    tr2 = self.get_trace0(param2)
    if len(tr1.shape) != 1 or len(tr2.shape) != 1:
        raise RuntimeError("Error, variables must be scalars, try using ':' notation")
    #tr1 = tr1[:,0]
    #tr2 = tr2[:,0]

    range = [[tr1.min(), tr1.max()], [tr2.min(), tr2.max()]]
    if xrange is not None:
        range[0] = list(xrange)
    if yrange is not None:
        range[1] = list(yrange)

    # first, bin up the data (all of it); bin tr1 against tr2
    grid, xs, ys = histogram2d(tr1, tr2, bins=bins, range=range)
    grid = grid.T * 1.0
    xplot = linspace(xs[0], xs[-1], 101)
    yplot = linspace(ys[0], ys[-1], 101)
    extent = [xs[0], xs[-1], ys[0], ys[-1]]
    xs = (xs[1:] + xs[:-1]) / 2
    ys = (ys[1:] + ys[:-1]) / 2
    x, y = meshgrid(xs, ys)
    tx = xs[::bfac]
    ty = ys[::bfac]

    if smooth and not gp:
        tck = bisplrep(ravel(x), ravel(y), ravel(grid), task=-1, tx=tx, ty=ty)
        x = linspace(xs[0], xs[-1], 501)
        y = linspace(ys[0], ys[-1], 501)
        grid = bisplev(x, y, tck).T
    elif smooth and gp:
        M = gp.Mean(lambda x: zeros(x.shape[:-1], dtype=float) + median(grid))
        scalerat = (tr2.max() - tr2.min()) / (tr1.max() - tr1.min())
        C = gp.Covariance(gp.matern.aniso_euclidean, diff_degree=dd,
                          scale=(tr1.max() - tr1.min()) * sfac,
                          amp=std(grid), scalerat=scalerat)
        x, y = meshgrid(xs, ys)
        mesh = vstack((ravel(x), ravel(y))).T
        gp.observe(M, C, obs_mesh=mesh, obs_vals=ravel(grid), obs_V=ravel(grid))
        dplot = dstack(meshgrid(xplot, yplot))
        grid, Vsurf = gp.point_eval(M, C, dplot)
        grid = where(grid < 0, 0, grid)

    if cmap:
        ax.imshow(grid, extent=extent, origin='lower', aspect='auto',
                  interpolation='nearest', cmap=cmap)

    if levels:
        prob = ravel(grid) / sum(grid)
        sprob = sort(prob)
        cprob = 1.0 - cumsum(sprob)
        clevels = []
        for l in levels:
            id = nonzero(greater(cprob - l, 0))[0][-1]
            clevels.append(sprob[id])
        prob.shape = grid.shape
        clevels.sort()
        norm = Normalize(clevels[0] * 0.5, clevels[-1] * 1.3)
        if fill:
            ax.contourf(prob, levels=clevels + [1], extent=extent,
                        origin='lower', alpha=falpha, cmap=ccmap, norm=norm,
                        zorder=zorder)
        ax.contour(prob, levels=clevels, colors=ccolor, extent=extent,
                   origin='lower', linewidths=2, zorder=zorder)

    if own_ax:
        ax.set_xlabel("$%s$" % param1)
        ax.set_ylabel("$%s$" % param2)
        if xrange is not None:
            ax.set_xlim(xrange[0], xrange[1])
        if yrange is not None:
            ax.set_ylim(yrange[0], yrange[1])
        plt.draw()
        if outfile is not None:
            fig.savefig(outfile)
        return fig
def plus_minus(arr, bins=30, conf=0.68, xrange=None, func='poly',
               fit_log=True, order=7, debug=False, zero_pad=False,
               end_tol=[None, None]):
    hist0, bins = histogram(arr, bins=bins, range=xrange)
    xb = (bins[1:] + bins[:-1]) / 2

    if fit_log:
        gids = greater(hist0, 0)
        xb = xb[gids]
        var = 1. / hist0[gids]
        hist = log(hist0[gids])
    else:
        var = hist0 * 1
        hist = hist0 * 1

    if xrange is None:
        xrange = (bins[0], bins[-1])
    xplot = linspace(xrange[0] * 0.9, xrange[1] * 1.1, 101)

    if debug:
        fig = plt.figure()
        if fit_log:
            y1 = hist
            y2 = exp(hist)
        else:
            y1 = log(hist)
            y2 = hist
        ax1 = fig.add_subplot(211)
        ax1.plot(xb, y1, 'o')
        ax2 = fig.add_subplot(212)
        ax2.plot(xb, y2, 'o')

    if func == 'gp' or func == 'poly':
        if func == 'gp':
            if not gp:
                raise RuntimeError("To use GP interpolation, you need to install pymc")
            scale = xb.max() - xb.min()
            M = gp.Mean(lambda x: zeros(x.shape[0], dtype=float32) + median(hist))
            C = gp.Covariance(gp.matern.euclidean, diff_degree=3,
                              scale=scale * 0.5, amp=std(hist))
            # Pad with zeros
            if zero_pad and not fit_log:
                obs_mesh = concatenate([xb.min() + (xb - xb.max())[:-1],
                                        xb,
                                        xb.max() + (xb - xb.min())[1:]])
                obs = concatenate([hist[1:] * 0, hist, hist[1:] * 0])
                var = concatenate([hist[1:] * 0, var, hist[1:] * 0])
            else:
                obs_mesh = xb
                obs = hist
            gp.observe(M, C, obs_mesh=obs_mesh, obs_vals=obs, obs_V=var)
            func = lambda x: wrap_M(x, M, xb[0], xb[-1], log=fit_log)
        else:
            x0 = xb[argmax(hist)]
            pars, epars = fit_poly.fitpoly(xb, hist, w=1. / var, x0=x0, k=order)
            func = lambda x: wrap_poly(x, x0, pars, xb[0], xb[-1], log=fit_log)

        if debug:
            ax1.plot(xplot, log(func(xplot)), '-')
            ax2.plot(xplot, func(xplot), '-')

        oneside = False
        if argmax(hist) == 0:
            mod = xb[0]
            oneside = True
        elif argmax(hist) == len(xb) - 1:
            mod = xb[-1]
            oneside = True
        else:
            mod0 = xb[argmax(hist)]
            try:
                mod = brent(lambda x: -func(x), brack=(xb.min(), mod0, xb.max()))
            except:
                # monotonic: take the extremum
                oneside = True
                if func(xb[0]) > func(xb[-1]):
                    mod = xb[0]
                else:
                    mod = xb[-1]

        fac = integrate.quad(func, xb[0], xb[-1])[0]
        prob = lambda x: func(x) / fac

        # end tolerance if requested
        lower_limit = False
        upper_limit = False
        if end_tol[0] is not None and float(hist0[0]) / hist0.max() > end_tol[0]:
            lower_limit = True
        if end_tol[1] is not None and float(hist0[-1]) / hist0.max() > end_tol[1]:
            upper_limit = True

        if lower_limit and upper_limit:
            # too flat, return mode, but no limits
            return mod, nan, nan
        elif lower_limit and not upper_limit:
            # one-sided
            tail = (1 - conf)
            upper = brentq(
                lambda x: integrate.quad(prob, x, xplot[-1])[0] - tail,
                mod, xplot[-1])
            return mod, nan, upper
        elif upper_limit and not lower_limit:
            tail = (1 - conf)
            lower = brentq(
                lambda x: integrate.quad(prob, xplot[0], x)[0] - tail,
                xplot[0], xplot[-1])
            return mod, lower, nan

        if debug:
            ax1.axvline(mod, color='red')
            ax2.axvline(mod, color='red')

        if oneside:
            tail = (1 - conf)
        else:
            tail = (1 - conf) / 2

        if integrate.quad(prob, xplot[0], mod)[0] < tail:
            # No lower bound
            minus = nan
        else:
            lower = brentq(
                lambda x: integrate.quad(prob, xplot[0], x)[0] - tail,
                xplot[0], mod)
            minus = mod - lower
            if debug:
                ax1.axvline(lower, color='orange')
                ax2.axvline(lower, color='orange')

        # test for upper bound
        if integrate.quad(prob, mod, xplot[-1])[0] < tail:
            # No upper bound
            plus = nan
        else:
            upper = brentq(
                lambda x: integrate.quad(prob, x, xplot[-1])[0] - tail,
                mod, xplot[-1])
            plus = upper - mod
            if debug:
                ax1.axvline(upper, color='orange')
                ax2.axvline(upper, color='orange')
    else:
        hist = hist * 1.0 / sum(hist)
        mid = argmax(hist)
        mod = xb[mid]
        if debug:
            ax1.axvline(mod, color='red')
            ax2.axvline(mod, color='red')
        i0 = 0
        i1 = len(hist) - 1
        prob = 0
        while (prob < (1 - conf) / 2):
            if i0 < mid:
                i0 += 1
            else:
                break
            prob = sum(hist[0:i0])
        if i0 == 0:
            lower = None
        else:
            lower = xb[i0]
            if debug:
                ax1.axvline(lower, color='orange')
                ax2.axvline(lower, color='orange')
        while (prob < 1 - conf):
            if i1 > mid:
                i1 -= 1
            else:
                break
            prob = sum(hist[0:i0]) + sum(hist[i1:])
        if i1 == len(xb) - 1:
            upper = None
        else:
            upper = xb[i1]
            if debug:
                ax1.axvline(upper, color='orange')
                ax2.axvline(upper, color='orange')
        if upper is not None:
            plus = upper - mod
        else:
            plus = nan
        if lower is not None:
            minus = mod - lower
        else:
            minus = nan

    return mod, minus, plus
    # (tail of an if/else from a validation script; the opening `if` is not
    # part of this excerpt)
    print('median over all replicates of median absolute relative error')
    print(results.unstack()['mare', '50%'].unstack())

else:
    N = int(options.numberofrows)
    delta_true = float(options.delta)
    replicate = int(options.replicate)

    print('Running random effects validation for:')
    print('N', N)
    print('delta_true', delta_true)
    print('replicate', replicate)

    M = gp.Mean(validate_consistent_model.constant)
    C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
    gp.observe(M, C, [0, 100], [-5, -5])

    true = {}
    li = gp.Realization(M, C)
    true['i'] = lambda x: pl.exp(li(x))
    lr = gp.Realization(M, C)
    true['r'] = lambda x: pl.exp(lr(x))
    lf = gp.Realization(M, C)
    true['f'] = lambda x: pl.exp(lf(x))

    model = validate_consistent_model.validate_consistent_model_sim(
        N, delta_true, true)

    model.results.to_csv('%s/%s/%s-%s-%s-%s.csv' %
                         (output_dir, validation_name, options.numberofrows,
                          options.delta, '', options.replicate))
def smooth(x):
    from pymc import gp
    M = gp.Mean(lambda x: zeros(len(x)))
    C = gp.Covariance(gp.matern.euclidean, amp=1, scale=15, diff_degree=2)
    gp.observe(M, C, range(len(x)), x, .5)
    return M(range(len(x)))
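# Minimal usage sketch for smooth() above (assumes pymc 2.x and that `zeros`
# is available in smooth's defining module, e.g. via `from numpy import *`).
import numpy as np

noisy = np.sin(np.arange(50) / 5.0) + 0.3 * np.random.randn(50)
smoothed = smooth(noisy)   # GP posterior mean evaluated at indices 0..len(noisy)-1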
    # (tail of an if/else from a validation script; the opening `if` is not
    # part of this excerpt)
    print("\ninspect with:\nresults.unstack()['mare', '50%'].unstack() # for example")
    print("or: results.unstack()['mare', '50%'].unstack(2).reindex(columns='Very Moderately Slightly'.split())")

else:
    N = int(options.numberofrows)
    delta_true = float(options.delta)
    replicate = int(options.replicate)
    bias = float(options.bias)
    sigma_prior = float(options.sigma)

    print('Running random effects validation for:')
    print('N', N)
    print('delta_true', delta_true)
    print('bias', bias)
    print('sigma_prior', sigma_prior)
    print('replicate', replicate)

    M = gp.Mean(validate_similarity.quadratic)
    C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
    gp.observe(M, C, [0, 30, 100], [-5, -3, -5])

    true = {}
    lp = gp.Realization(M, C)
    true_p = lambda x: pl.exp(lp(x))

    model = validate_similarity.generate_data(N, delta_true, true_p,
                                              'Unusable', bias, sigma_prior)

    for het in 'Very Moderately Slightly'.split():
        model.parameters['p']['heterogeneity'] = het
        validate_similarity.fit(model)
        model.results.to_csv('%s/%s/%s-%s-%s-%s-%s-%s.csv' %
                             (output_dir, validation_name,
                              options.numberofrows, options.delta, het, bias,
                              sigma_prior, options.replicate))
N = int(options.numberofrows)
delta_true = float(options.delta)
sigma_true = float(options.sigma) * pl.ones(5)
replicate = int(options.replicate)

print('Running random effects validation for:')
print('N', N)
print('delta_true', delta_true)
print('sigma_true', sigma_true)
print('replicate', replicate)

mc.np.random.seed(1234567 + replicate)

M = gp.Mean(validate_consistent_re_model.quadratic)
C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
gp.observe(M, C, [0, 25, 100], [-5, -3, -5])

true = {}
li = gp.Realization(M, C)
true['i'] = lambda x: pl.exp(li(x))
lr = gp.Realization(M, C)
true['r'] = lambda x: pl.exp(lr(x))
lf = gp.Realization(M, C)
true['f'] = lambda x: pl.exp(lf(x))

model = validate_consistent_re_model.validate_consistent_re(
    N, delta_true, sigma_true, true)
model.results.to_csv('%s/%s/%s-%s-%s-%s.csv' %
                     (output_dir, validation_name, options.numberofrows,
                      options.delta, options.sigma, options.replicate))
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m + 1)].dtype == 'float64':
            all_data = np.delete(
                all_data,
                np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
                axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i])
                            for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [np.empty(len(country_age_list), 'float')
             for i in range(number_submodels)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0) &
                                  (ca_data['test_' + test] == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # skip models with no spacetime results
            if all_data['spacetime_' + str(m + 1)].dtype != 'float64':
                draws[m][country_age_list == ca] = np.NaN
                continue

            # identify the dependent variable for this model
            dv = dv_list[m]

            # make a list of the spacetime predictions
            ca_prior = np.array([
                np.mean(ca_data['spacetime_' + str(m + 1)][ca_data.year == y])
                for y in year_list])

            # find the amplitude for this country/age
            amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m + 1)])

            # make a linear interpolation of the spatio-temporal predictions
            # to use as the mean function for GPR
            def mean_function(x):
                return np.interp(x, year_list, ca_prior)

            # setup the covariance function
            M = gp.Mean(mean_function)
            C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                              amp=amplitude, scale=scale)

            # observe the data if there is any
            if has_data:
                gp.observe(M=M, C=C,
                           obs_mesh=ca_observed.year,
                           obs_V=ca_observed['spacetime_data_variance_' + str(m + 1)],
                           obs_vals=ca_observed[dv])

            # save the data for this country/age into the results array
            iso3[country_age_list == ca] = ca[0:3]
            age_group[country_age_list == ca] = ca[4:]
            year[country_age_list == ca] = year_list.T
            draws[m][country_age_list == ca] = M(year_list)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for m in range(number_submodels):
        results = recfunctions.append_fields(
            results, 'gpr_' + str(m + 1) + '_spacetime_mean', draws[m])
    rec2csv(results, outfile)