def new_gpr_draws(df2, response, amplitude, prior, scale, has_data, draws):
    '''
    Draw Gaussian-process realizations for a single location-age.

    The prior is linearly interpolated over years and used as the GP mean;
    the covariance is a Matern (diff_degree=2) kernel. When ``has_data`` is
    truthy the GP is conditioned on the observed rows before sampling, so
    years without data are filled in by the smoothed process.

    Parameters
    ----------
    df2 : data frame
        Rows for one location-age. Must contain a ``year`` column, the
        ``response`` column, ``response + "_sd"`` and ``response + "_nsv"``.
        The fourth-from-last column is used as a boolean mask marking the
        observed rows.
        NOTE(review): the positional lookup assumes the column layout of the
        caller's frame -- confirm it is stable.
    response : str
        Name of the column holding the observed values.
    amplitude : float
        ``amp`` argument of the covariance.
    prior : array-like
        Prior mean values, one per row of ``df2`` (paired with ``year``).
    scale : number
        ``scale`` argument of the covariance.
    has_data : bool
        Whether to condition the GP on the observed rows.
    draws : int
        Number of realizations to sample.

    Returns
    -------
    array
        One row per entry of ``df2["year"]`` and one column per draw.
    '''
    all_indices = df2.index

    # Rows flagged True in the fourth-from-last column are the observed ones.
    # ``.iloc`` is the positional indexer; ``.ix`` (used previously) has been
    # removed from pandas.
    observed_mask = df2.iloc[:, -4]
    data_indices = df2[observed_mask].index

    years = df2.loc[all_indices]["year"].values

    # Year -> prior lookup table, sorted so np.interp sees increasing x.
    data = pd.DataFrame({"year": years, "prior": prior})
    data.sort_values("year", inplace=True)
    data.drop_duplicates(inplace=True)

    def mean_function(x):
        return np.interp(x, data.year.values, data.prior.values)

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                      amp=amplitude, scale=scale)

    df4 = df2.loc[data_indices]
    if has_data:
        # Observation variance = sd squared plus the "_nsv" column.
        obs_variance = ((df4[response + "_sd"].values) ** 2
                        + df4[response + "_nsv"].values)
        gp.observe(M=M, C=C,
                   obs_mesh=df4.year.values,
                   obs_vals=df4[response].values,
                   obs_V=obs_variance)

    # Each realization is evaluated at every year; transpose so that draws
    # run along the columns.
    ca_draws = np.array([gp.Realization(M, C)(years)
                         for d in range(draws)]).T
    return ca_draws
def ca_realizations(gpr_dict, years, draws):
    '''
    (dict, array, int) -> matrix

    Sample ``draws`` realizations for one country-age group.

    ``gpr_dict`` supplies the mean under key "M" and the covariance under key
    "C". Every realization is evaluated at ``years``; the result is
    transposed so each draw occupies one column.
    '''
    samples = []
    for _ in range(draws):
        realization = gp.Realization(gpr_dict["M"], gpr_dict["C"])
        samples.append(realization(years))
    return np.array(samples).T
# Read the simulation settings from the parsed command-line options.
delta_true = float(options.delta)
# The scalar sigma option is multiplied by pl.ones(5), giving one value per
# entry of a length-5 array.
sigma_true = float(options.sigma) * pl.ones(5)
replicate = int(options.replicate)

# NOTE(review): N is not assigned in this fragment; presumably it is derived
# from options.numberofrows earlier in the script -- confirm.
print 'Running random effects validation for:'
print 'N', N
print 'delta_true', delta_true
print 'sigma_true', sigma_true
print 'replicate', replicate

# Seed the random number generator with a value that depends on the
# replicate number.
mc.np.random.seed(1234567 + replicate)

# Gaussian process with the module's quadratic mean and a Matern covariance,
# conditioned on the values -5, -3, -5 at x = 0, 25, 100.
M = gp.Mean(validate_consistent_re_model.quadratic)
C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
gp.observe(M, C, [0, 25, 100], [-5, -3, -5])

# Three separate realizations of the process; each entry of `true` is the
# exponential of one realization.
true = {}
li = gp.Realization(M, C)
true['i'] = lambda x: pl.exp(li(x))
lr = gp.Realization(M, C)
true['r'] = lambda x: pl.exp(lr(x))
lf = gp.Realization(M, C)
true['f'] = lambda x: pl.exp(lf(x))

model = validate_consistent_re_model.validate_consistent_re(
    N, delta_true, sigma_true, true)

# The output file name is built from the raw option values.
model.results.to_csv(
    '%s/%s/%s-%s-%s-%s.csv' %
    (output_dir, validation_name, options.numberofrows, options.delta,
     options.sigma, options.replicate))
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, iters):
    """Run Gaussian process regression per country and save the draws as CSV.

    For every country (``iso3``) and every submodel ``m`` the spacetime
    predictions are averaged per year and linearly interpolated to form the
    GP mean; the covariance is a Matern (diff_degree=2) kernel whose
    amplitude is the mean of ``spacetime_amplitude_<m>``. When a country has
    more than one non-missing ``lt_prev`` value the GP is conditioned on
    those observations. ``iters`` realizations are then evaluated at every
    year.

    Parameters
    ----------
    infile : str
        CSV read with ``csv2rec``. Columns read: ``iso3``, ``year``,
        ``lt_prev`` and, for each submodel m (1-based), ``spacetime_<m>``,
        ``spacetime_amplitude_<m>`` and ``spacetime_data_variance_<m>``.
    outfile : str
        Destination CSV, written with ``rec2csv``.
    dv_list : sequence
        Not used by this function (``lt_prev`` is always the observed
        variable); kept so existing callers keep working.
    scale : number
        ``scale`` argument of the covariance.
    number_submodels : int
        Number of submodels present in ``infile``.
    iters : int
        Number of draws per submodel.

    Output columns: ``iso3``, ``year``, ``gpr_<m>_spacetime_d<i>`` for each
    draw and ``gpr_<m>_spacetime_mean``.
    """
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)

    # drop every row that lacks a spacetime prediction for any submodel
    for m in range(number_submodels):
        missing = np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0]
        all_data = np.delete(all_data, missing, axis=0)

    # TODO: investigate the error thrown for HKG, MAC, and SGP... they don't
    # have data, but it is not known why they break the GPR step. Excluded
    # until that is understood.
    for excluded in ('HKG', 'MAC', 'SGP'):
        all_data = all_data[all_data['iso3'] != excluded]

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of countries; each one gets len(year_list) output rows
    country_age = np.array([code for code in all_data.iso3])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))
    n_rows = len(country_age_list)

    # make empty arrays in which to store the results. The factor 2 keeps the
    # historical slot layout (2 * m + k); only the k == 0 ('spacetime') slots
    # are filled and written out.
    draws = [
        np.empty(n_rows, 'float')
        for i in range(iters * number_submodels * 2)
    ]
    iso3 = np.empty(n_rows, '|S3')
    year = np.empty(n_rows, 'int')

    # loop through countries
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)
        is_ca = country_age_list == ca

        # identifiers do not depend on the submodel, so fill them once
        iso3[is_ca] = ca[0:3]
        year[is_ca] = year_list.T

        # subset the data for this particular country
        ca_data = all_data[country_age == ca]

        # subset just the observed data; an object-typed column means there
        # is nothing numeric to observe
        has_data = False
        if ca_data['lt_prev'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_prev']) == 0)]
            has_data = len(ca_observed) > 1

        # loop through each submodel
        for m in range(number_submodels):
            for k, t in enumerate(['spacetime']):
                # per-year mean of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data[t + '_' + str(m + 1)][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country
                amplitude = np.mean(ca_data[t + '_amplitude_' + str(m + 1)])

                # linear interpolation of the spatio-temporal predictions,
                # used as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the mean and covariance functions
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(
                        M=M,
                        C=C,
                        obs_mesh=ca_observed.year,
                        obs_V=ca_observed[t + '_data_variance_' + str(m + 1)],
                        obs_vals=ca_observed['lt_prev'])

                # draw realizations and store them in this submodel's slots
                realizations = [gp.Realization(M, C) for i in range(iters)]
                first_slot = (2 * m + k) * iters
                for i in range(iters):
                    draws[first_slot + i][is_ca] = realizations[i](year_list)

    # save the results
    print('Saving GPR results')
    # Exactly one name per array: with a surplus name NumPy silently drops
    # the extra one, which labelled the year column 'age_group'.
    names = ['iso3', 'year']
    results = np.rec.fromarrays([iso3, year], names=names)
    for m in range(number_submodels):
        for k, t in enumerate(['spacetime']):
            first_slot = (2 * m + k) * iters
            prefix = 'gpr_' + str(m + 1) + '_' + t
            for i in range(iters):
                results = recfunctions.append_fields(
                    results, prefix + '_d' + str(i + 1),
                    draws[first_slot + i])
            results = recfunctions.append_fields(
                results, prefix + '_mean',
                np.mean(draws[first_slot:first_slot + iters], axis=0))
    rec2csv(results, outfile)
def draw(self):
    '''Sample a new random realization of the spline, based on the data.

    Calls ``self._setup()`` first when ``self.setup`` is falsy, then stores
    ``GP.Realization(self.M, self.C)`` in ``self.realization``.
    '''
    ready = self.setup
    if not ready:
        self._setup()
    self.realization = GP.Realization(self.M, self.C)
def fit_gpr(df,
            amp,
            obs_variable='observed_data',
            obs_var_variable='obs_data_variance',
            mean_variable='st_prediction',
            year_variable='year',
            scale=40,
            diff_degree=2,
            draws=0):
    """Fit a Gaussian process to one time series and attach the results to df.

    The distinct (year, mean) pairs of ``df`` are interpolated linearly to
    form the GP mean; the covariance is a Matern kernel. Rows where both the
    observation and its variance are non-null are used to condition the GP.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. Must contain the four columns named by ``obs_variable``,
        ``obs_var_variable``, ``mean_variable`` and ``year_variable``.
    amp : float
        ``amp`` argument of the covariance.
    obs_variable, obs_var_variable, mean_variable, year_variable : str
        Column names for the observed value, its variance, the prior mean
        and the year.
    scale, diff_degree : number
        Passed to the covariance.
    draws : int
        Number of realizations to sample; 0 returns summary columns only.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged on ``year_variable`` with ``gpr_mean``,
        ``gpr_var``, ``gpr_lower``, ``gpr_upper`` (mean -/+ 1.96 standard
        deviations) and, when ``draws > 0``, ``draw0`` .. ``draw<draws-1>``.
        The original columns of ``df`` come first.
    """
    # Rows usable as observations: value and variance both present.
    # (.loc with a boolean mask replaces the removed .ix indexer.)
    has_obs = pd.notnull(df[obs_variable]) & pd.notnull(df[obs_var_variable])
    data = df.loc[has_obs]

    # One row per distinct (year, prior mean), sorted by year because
    # np.interp requires increasing x-coordinates.
    mean_prior = df[[year_variable, mean_variable]].drop_duplicates()
    mean_prior = mean_prior.sort_values(year_variable)
    years = mean_prior[year_variable].values
    prior_values = mean_prior[mean_variable].values

    def mean_function(x):
        return np.interp(x, years, prior_values)

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean,
                      diff_degree=diff_degree,
                      amp=amp,
                      scale=scale)

    if len(data) > 0:
        gp.observe(M=M,
                   C=C,
                   obs_mesh=data[year_variable],
                   obs_V=data[obs_var_variable],
                   obs_vals=data[obs_variable])

    model_mean = M(years).T
    model_variance = C(years)
    half_width = np.sqrt(model_variance) * 1.96

    gpr_columns = {
        year_variable: years,
        'gpr_mean': model_mean,
        'gpr_var': model_variance,
        'gpr_lower': model_mean - half_width,
        'gpr_upper': model_mean + half_width,
    }

    # Evaluate every draw at exactly the years held in the summary table so
    # each value lines up with its row even when years have gaps.
    for i in range(draws):
        gpr_columns['draw' + str(i)] = gp.Realization(M, C)(years)

    summary = pd.DataFrame(gpr_columns)

    # Left merge keeps every row of df; its columns precede the GPR columns.
    return pd.merge(df, summary, on=year_variable, how='left')
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test, spacetime_iters, top_submodel): # load in the data all_data = csv2rec(infile, use_mrecords=False) for m in range(number_submodels): if all_data['spacetime_' + str(m + 1)].dtype == 'float64': all_data = np.delete( all_data, np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0], axis=0) # find the list of years for which we need to predict year_list = np.unique(all_data.year) # find the list of country/age groups country_age = np.array([ str(all_data.iso3[i]) + '_' + str(all_data.age_group[i]) for i in range(len(all_data)) ]) country_age_list = np.repeat(np.unique(country_age), len(year_list)) # make empty arrays in which to store the results total_iters = np.sum(spacetime_iters) draws = [ np.empty(len(country_age_list), 'float') for i in range(total_iters) ] if (top_submodel > 0): top_submodel_draws = [ np.empty(len(country_age_list), 'float') for i in range(100) ] iso3 = np.empty(len(country_age_list), '|S3') age_group = np.empty(len(country_age_list), 'int') year = np.empty(len(country_age_list), 'int') # loop through country/age groups for ca in np.unique(country_age_list): print('GPRing ' + ca) # subset the data for this particular country/age ca_data = all_data[country_age == ca] # subset just the observed data if ca_data['lt_cf'].dtype != '|O8': ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0) & (ca_data['test_' + test] == 0)] if len(ca_observed) > 1: has_data = True else: has_data = False else: has_data = False # keep track of how many iterations have been added for this model iter_counter = 0 # loop through each submodel for m in range(number_submodels): # identify the dependent variable for this model dv = dv_list[m] # continue making predictions if we actually need draws for this model if (spacetime_iters[m] > 0) or (m + 1 == top_submodel): # skip models with no spacetime results if all_data['spacetime_' + str(m + 1)].dtype != 'float64': for i in range(spacetime_iters[m]): 
draws[iter_counter][country_age_list == ca] = np.NaN iter_counter += 1 if (m + 1 == top_submodel): for i in range(100): top_submodel_draws[i][country_age_list == ca] = np.NaN continue # make a list of the spacetime predictions ca_prior = np.array([ np.mean(ca_data['spacetime_' + str(m + 1)][ca_data.year == y]) for y in year_list ]) # find the amplitude for this country/age amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m + 1)]) # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR def mean_function(x): return np.interp(x, year_list, ca_prior) # setup the covariance function M = gp.Mean(mean_function) C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale) # observe the data if there is any if has_data: gp.observe(M=M, C=C, obs_mesh=ca_observed.year, obs_V=ca_observed['spacetime_data_variance_' + str(m + 1)], obs_vals=ca_observed[dv]) # draw realizations from the data realizations = [ gp.Realization(M, C) for i in range(spacetime_iters[m]) ] # save the data for this country/age into the results array iso3[country_age_list == ca] = ca[0:3] age_group[country_age_list == ca] = ca[4:] year[country_age_list == ca] = year_list.T for i in range(spacetime_iters[m]): try: draws[iter_counter][country_age_list == ca] = realizations[i](year_list) except: print('Failure in ' + ca) iter_counter += 1 # if it's the top submodel, do 100 additional draws if (m + 1 == top_submodel): realizations = [gp.Realization(M, C) for i in range(100)] for i in range(100): try: top_submodel_draws[i][country_age_list == ca] = realizations[i]( year_list) except: print('Failure in ' + ca) # save the results print('Saving GPR results') names = ['iso3', 'age_group', 'year'] results = np.core.records.fromarrays([iso3, age_group, year], names=names) for i in range(total_iters): results = recfunctions.append_fields(results, 'ensemble_d' + str(i + 1), draws[i]) if (top_submodel > 0): for i in range(100): results = 
recfunctions.append_fields(results, 'top_submodel_d' + str(i + 1), top_submodel_draws[i]) rec2csv(results, outfile)
print "\ninspect with:\nresults.unstack()['mare', '50%'].unstack() # for example" print "or: results.unstack()['mare', '50%'].unstack(2).reindex(columns='Very Moderately Slightly'.split())" else: N = int(options.numberofrows) delta_true = float(options.delta) sigma_true = float(options.sigma) * pl.ones(5) replicate = int(options.replicate) smoothness = options.smoothing print 'Running random effects validation for:' print 'N', N print 'delta_true', delta_true print 'sigma_true', sigma_true print 'replicate', replicate print 'smoothness', smoothness M = gp.Mean(validate_age_integrating_re.quadratic) C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50) gp.observe(M, C, [0, 25, 100], [-5, -3, -5]) log_p = gp.Realization(M, C) true_p = lambda x: pl.exp(log_p(x)) model = validate_age_integrating_re.validate_ai_re( N, delta_true, sigma_true, true_p, smoothness) model.results.to_csv( '%s/%s/%s-%s-%s-%s.csv' % (output_dir, validation_name, options.numberofrows, options.delta, options.sigma, options.replicate))
# Read the simulation settings from the parsed command-line options.
delta_true = float(options.delta)
replicate = int(options.replicate)
bias = float(options.bias)
sigma_prior = float(options.sigma)

# NOTE(review): N is not assigned in this fragment; presumably it is derived
# from options.numberofrows earlier in the script -- confirm.
print 'Running random effects validation for:'
print 'N', N
print 'delta_true', delta_true
print 'bias', bias
print 'sigma_prior', sigma_prior
print 'replicate', replicate

# Gaussian process with the module's quadratic mean and a Matern covariance,
# conditioned on the values -5, -3, -5 at x = 0, 30, 100.
M = gp.Mean(validate_similarity.quadratic)
C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
gp.observe(M, C, [0, 30, 100], [-5, -3, -5])

# NOTE(review): `true` is not read again within this fragment.
true = {}
# true_p is the exponential of a single realization of the process.
lp = gp.Realization(M, C)
true_p = lambda x: pl.exp(lp(x))

model = validate_similarity.generate_data(N, delta_true, true_p, 'Unusable',
                                          bias, sigma_prior)

# Fit the same model once per heterogeneity setting; each fit is saved to its
# own CSV, with the setting included in the file name.
for het in 'Very Moderately Slightly'.split():
    model.parameters['p']['heterogeneity'] = het
    validate_similarity.fit(model)
    model.results.to_csv(
        '%s/%s/%s-%s-%s-%s-%s-%s.csv' %
        (output_dir, validation_name, options.numberofrows, options.delta,
         het, bias, sigma_prior, options.replicate))
def draw(self):
    '''Sample a new random realization of the spline, based on the data.

    Stores ``GP.Realization(self.M, self.C)`` in ``self.realization``.
    '''
    m, c = self.M, self.C
    self.realization = GP.Realization(m, c)