def validate_once(true_cf=[pl.ones(3)/3.0, pl.ones(3)/3.0], true_std=0.01*pl.ones(3),
                  std_bias=[1., 1., 1.], save=False, dir='', i=0):
    """
    Generate a set of simulated estimates for the provided true cause fractions;
    fit the bad model and the latent simplex model to this simulated data and
    calculate quality metrics.
    """
    # generate simulation data
    X = data.sim_data_for_validation(1000, true_cf, true_std, std_bias)

    # fit bad model, calculate fit metrics
    bad_model = models.bad_model(X)
    bad_model_metrics = calc_quality_metrics(true_cf, true_std, std_bias, bad_model)
    retrieve_estimates(bad_model, True, 'bad_model', dir, i)

    # fit latent simplex model, calculate fit metrics
    m, latent_simplex = models.fit_latent_simplex(X)
    latent_simplex_metrics = calc_quality_metrics(true_cf, true_std, std_bias, latent_simplex)
    retrieve_estimates(latent_simplex, True, 'latent_simplex', dir, i)

    # either write results to disk or return them
    if save:
        pl.rec2csv(bad_model_metrics, '%s/metrics_bad_model_%i.csv' % (dir, i))
        pl.rec2csv(latent_simplex_metrics, '%s/metrics_latent_simplex_%i.csv' % (dir, i))
    else:
        return bad_model_metrics, latent_simplex_metrics
def combine_output(J, T, model, dir, reps, save=False):
    """
    Combine output on absolute error, relative error, csmf_accuracy, and
    coverage from multiple runs of validate_once. Either saves the output
    to disk, or returns arrays for each.
    """
    cause = pl.zeros(J*T, dtype='f').view(pl.recarray)
    time = pl.zeros(J*T, dtype='f').view(pl.recarray)
    abs_err = pl.zeros(J*T, dtype='f').view(pl.recarray)
    rel_err = pl.zeros(J*T, dtype='f').view(pl.recarray)
    coverage = pl.zeros(J*T, dtype='f').view(pl.recarray)
    csmf_accuracy = pl.zeros(J*T, dtype='f').view(pl.recarray)

    for i in range(reps):
        metrics = pl.csv2rec('%s/metrics_%s_%i.csv' % (dir, model, i))
        cause = pl.vstack((cause, metrics.cause))
        time = pl.vstack((time, metrics.time))
        abs_err = pl.vstack((abs_err, metrics.abs_err))
        rel_err = pl.vstack((rel_err, metrics.rel_err))
        coverage = pl.vstack((coverage, metrics.coverage))
        csmf_accuracy = pl.vstack((csmf_accuracy, metrics.csmf_accuracy))

    # drop the empty first row left over from initialization
    cause = cause[1:,]
    time = time[1:,]
    abs_err = abs_err[1:,]
    rel_err = rel_err[1:,]
    coverage = coverage[1:,]
    csmf_accuracy = csmf_accuracy[1:,]

    mean_abs_err = abs_err.mean(0)
    median_abs_err = pl.median(abs_err, 0)
    mean_rel_err = rel_err.mean(0)
    median_rel_err = pl.median(rel_err, 0)
    mean_csmf_accuracy = csmf_accuracy.mean(0)
    median_csmf_accuracy = pl.median(csmf_accuracy, 0)
    mean_coverage_bycause = coverage.mean(0)
    mean_coverage = coverage.reshape(reps, T, J).mean(0).mean(1)
    percent_total_coverage = (coverage.reshape(reps, T, J).sum(2)==3).mean(0)
    mean_coverage = pl.array([[i for j in range(J)] for i in mean_coverage]).ravel()
    percent_total_coverage = pl.array([[i for j in range(J)] for i in percent_total_coverage]).ravel()
    models = pl.array([[model for j in range(J)] for i in range(T)]).ravel()
    true_cf = metrics.true_cf
    true_std = metrics.true_std
    std_bias = metrics.std_bias

    all = pl.np.core.records.fromarrays(
        [models, cause[0], time[0], true_cf, true_std, std_bias,
         mean_abs_err, median_abs_err, mean_rel_err, median_rel_err,
         mean_csmf_accuracy, median_csmf_accuracy,
         mean_coverage_bycause, mean_coverage, percent_total_coverage],
        names=['model', 'cause', 'time', 'true_cf', 'true_std', 'std_bias',
               'mean_abs_err', 'median_abs_err', 'mean_rel_err', 'median_rel_err',
               'mean_csmf_accuracy', 'median_csmf_accuracy',
               'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'])

    if save:
        pl.rec2csv(all, '%s/%s_summary.csv' % (dir, model))
    else:
        return all
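# A minimal driver sketch for the two functions above; reps, the output
# directory, and the J=3 causes / T=2 time points implied by the
# validate_once defaults are illustrative assumptions, not part of the source.
J, T, reps, out_dir = 3, 2, 100, 'validation_output'
for i in range(reps):
    validate_once(save=True, dir=out_dir, i=i)
bad_model_summary = combine_output(J, T, 'bad_model', out_dir, reps)
latent_simplex_summary = combine_output(J, T, 'latent_simplex', out_dir, reps)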
def sites_and_env(session, species, layer_names, glob_name, glob_channels, buffer_width,
                  n_pseudoabsences, dblock=None, simdata=False):
    """
    Queries the DB to get a list of locations. Writes it out along with matching
    extractions of the requested layers to a temporary csv file, which serves the
    dual purpose of caching the extraction and making it easier to get data into
    the BRT package.
    """
    breaks, x, found, zero, others_found, multipoints, eo = sites_as_ndarray(session, species)

    if simdata:
        print 'Process %i simulating presences for species %s.' % (multiprocessing.current_process().ident, species[1])
        x = get_pseudoabsences(eo, -1, n_pseudoabsences, layer_names, glob_name)
        found = np.ones(n_pseudoabsences)

    fname = hashlib.sha1(str(x) + found.tostring() +
                         glob_name + 'channel'.join([str(i) for i in glob_channels]) +
                         'layer'.join(layer_names)).hexdigest() + '.csv'

    pseudoabsences = get_pseudoabsences(eo, buffer_width, n_pseudoabsences, layer_names, glob_name)
    x_found = x[np.where(found)]
    x = np.vstack((x_found, pseudoabsences))
    found = np.concatenate((np.ones(len(x_found)), np.zeros(n_pseudoabsences)))

    if fname in os.listdir('anopheles-caches'):
        pass
    else:
        # Makes list of (key, value) tuples
        env_layers = map(lambda ln: extract_environment(ln, x, lock=dblock), layer_names) \
            + map(lambda ch: (os.path.basename(glob_name) + '_' + str(ch),
                              extract_environment(glob_name, x,
                                                  postproc=lambda d: d == ch,
                                                  id_=ch, lock=dblock)[1]),
                  glob_channels)

        arrays = [(found > 0).astype('int')] + [l[1] for l in env_layers]
        names = ['found'] + [l[0] for l in env_layers]
        data = np.rec.fromarrays(arrays, names=','.join(names))

        nancheck = np.array([np.any(np.isnan(row.tolist())) for row in data])
        if np.any(nancheck):
            print 'There were some NaNs in the data, probably points in the sea'

        singletons = 0
        for e in env_layers:
            if len(set(e[1][np.where(True - np.isnan(e[1]))])) == 1:
                singletons += 1
        if singletons == len(env_layers):
            raise ValueError, 'All environmental layer evaluations contained only single values.'

        data = data[np.where(True - nancheck)]
        rec2csv(data, os.path.join('anopheles-caches', fname))

    return fname, pseudoabsences, x
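# The cache-key pattern used above, shown in isolation: a sketch with made-up
# inputs ('rainfall' is a stand-in layer name) showing how identical
# extractions hash to the same cache filename.
import hashlib
import numpy as np

x = np.zeros((3, 2))
found = np.ones(3)
# deterministic: re-running with the same locations and layers reuses the cache
cache_fname = hashlib.sha1(str(x) + found.tostring() + 'rainfall').hexdigest() + '.csv'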
def knockout_uniformly_at_random(in_fname='noisy_data.csv', out_fname='missing_noisy_data.csv', pct=20.):
    """ Replace the y column of in_fname with missing entries uniformly at random

    Parameters
    ----------
    pct : float, percent to knockout
    """
    data = pl.csv2rec(in_fname)
    for i, row in enumerate(data):
        if pl.rand() < pct/100.:
            data[i].y = pl.nan
    pl.rec2csv(data, out_fname)
def compile_all_results(scenarios, dir='../data'):
    """
    Compiles the results across multiple scenarios produced by running
    run_on_cluster on each one into a single csv file. The specified directory
    must be where the results of running run_on_cluster for each scenario are
    stored (each is a sub-directory named v0, v1, etc.) and is also where the
    output from this function will be saved.
    """
    models = []
    causes = []
    time = []
    true_cf = []
    true_std = []
    std_bias = []
    mean_abs_err = []
    median_abs_err = []
    mean_rel_err = []
    median_rel_err = []
    mean_csmf_accuracy = []
    median_csmf_accuracy = []
    mean_coverage_bycause = []
    mean_coverage = []
    percent_total_coverage = []
    scenario = []

    for i in range(scenarios):
        for j in ['bad_model', 'latent_simplex']:
            read = csv.reader(open('%s/v%s/%s_summary.csv' % (dir, i, j)))
            read.next()  # skip the header row
            for row in read:
                models.append(row[0])
                causes.append(row[1])
                time.append(row[2])
                true_cf.append(row[3])
                true_std.append(row[4])
                std_bias.append(row[5])
                mean_abs_err.append(row[6])
                median_abs_err.append(row[7])
                mean_rel_err.append(row[8])
                median_rel_err.append(row[9])
                mean_csmf_accuracy.append(row[10])
                median_csmf_accuracy.append(row[11])
                mean_coverage_bycause.append(row[12])
                mean_coverage.append(row[13])
                percent_total_coverage.append(row[14])
                scenario.append(i)

    all = pl.np.core.records.fromarrays(
        [scenario, models, time, true_cf, true_std, causes,
         mean_abs_err, median_abs_err, mean_rel_err, median_rel_err,
         mean_csmf_accuracy, median_csmf_accuracy,
         mean_coverage_bycause, mean_coverage, percent_total_coverage],
        names=['scenario', 'model', 'time', 'true_cf', 'true_std', 'cause',
               'mean_abs_err', 'median_abs_err', 'mean_rel_err', 'median_rel_err',
               'mean_csmf_accuracy', 'median_csmf_accuracy',
               'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'])
    pl.rec2csv(all, fname='%s/all_summary_metrics.csv' % (dir))
def add_sampling_error(in_fname='data.csv', out_fname='noisy_data.csv', std=1.):
    """ add normally distributed noise to data.csv y column

    Parameters
    ----------
    std : float, or array of floats
          standard deviation of noise
    """
    data = pl.csv2rec(in_fname)
    if type(std) == float:
        std = std * pl.ones(len(data))
    for i, row in enumerate(data):
        data[i].y += std[i] * pl.randn(1)
        data[i].se += std[i]
    pl.rec2csv(data, out_fname)
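# Usage sketch for the two perturbation helpers above, following their default
# filenames: first add sampling noise, then knock out 20% of the y values.
add_sampling_error(in_fname='data.csv', out_fname='noisy_data.csv', std=1.)
knockout_uniformly_at_random(in_fname='noisy_data.csv',
                             out_fname='missing_noisy_data.csv', pct=20.)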
def retrieve_estimates(preds, save=False, model='', dir='', i=0):
    """
    calculates the posterior mean for pi as well as the 95% hpd region
    and optionally saves this output
    """
    T, J = preds.shape[1:]
    mean = preds.mean(0).ravel()
    hpd = mc.utils.hpd(preds, 0.05)
    lower = hpd[:,:,0].ravel()
    upper = hpd[:,:,1].ravel()
    time = pl.array([[t for j in range(J)] for t in range(T)]).ravel()
    cause = pl.array([[j for j in range(J)] for t in range(T)]).ravel()
    # note: the posterior mean is stored in the column named 'med'
    results = pl.np.core.records.fromarrays([time, cause, mean, lower, upper],
                                            names=['time', 'cause', 'med', 'lower', 'upper'])
    if save:
        pl.rec2csv(results, '%s/%s_estimates%i.csv' % (dir, model, i))
    else:
        return results
def trees_to_diagnostics(brt_evaluator, fname, species_name, n_pseudopresences, n_pseudoabsences, config_filename):
    """
    Takes the BRT evaluator and sees how well it does at predicting the
    training dataset.
    """
    from diagnostics import simple_assessments, roc, plot_roc_

    din = csv2rec(os.path.join('anopheles-caches', fname))
    found = din.found
    din = dict([(k, din[k]) for k in brt_evaluator.nice_tree_dict.iterkeys()])
    probs = pm.flib.invlogit(brt_evaluator(din))
    print 'Species %s: fraction %f correctly classified.' % \
        (species_name, ((probs > .5)*found + (probs < .5)*(True-found)).sum()/float(len(probs)))

    result_dirname = get_result_dir(config_filename)

    resdict = {}
    for f in simple_assessments:
        resdict[f.__name__] = f(probs > .5, found)
    pstack = np.array([pm.rbernoulli(probs) for i in xrange(10000)])
    fp, tp, AUC = roc(pstack, found)
    resdict['AUC'] = AUC

    fout = file(os.path.join(result_dirname, 'simple-diagnostics.txt'), 'w')
    fout.write('presences: %i\n' % (found.sum() - n_pseudopresences))
    fout.write('pseudopresences: %i\n' % n_pseudopresences)
    fout.write('pseudoabsences: %i\n' % n_pseudoabsences)
    for k in resdict.iteritems():
        fout.write('%s: %s\n' % k)
    fout.close()

    import pylab as pl
    pl.clf()
    plot_roc_(fp, tp, AUC)
    pl.savefig(os.path.join(result_dirname, 'roc.pdf'))
    r = np.rec.fromarrays([fp, tp], names='false,true')
    rec2csv(r, os.path.join(result_dirname, 'roc.csv'))
def measure_fit(self):
    '''
    Provide metrics of fit to determine how well the model performed
    '''
    # TODO: code up RMSE for non-holdout predictions
    if self.training_type == 'make predictions':
        print 'RMSE for non-holdout data not yet implemented'

    # calculate age-adjusted rates on the test data
    else:
        predicted = self.predictions[['country','year','age','pop','actual_deaths',
                                      'mean_deaths','upper_deaths','lower_deaths']].view(np.recarray)
        predicted = recfunctions.append_fields(predicted, 'mean_rate',
                predicted.mean_deaths / predicted.pop * 100000.).view(np.recarray)
        predicted = recfunctions.append_fields(predicted, 'actual_rate',
                predicted.actual_deaths / predicted.pop * 100000.).view(np.recarray)
        predicted = recfunctions.append_fields(predicted, 'weight',
                np.ones(predicted.shape[0])).view(np.recarray)
        for a in self.age_list:
            predicted.weight[np.where(predicted.age==a)[0]] = \
                self.age_weights.weight[np.where(self.age_weights.age==a)[0]]
        predicted.mean_rate = predicted.mean_rate * predicted.weight
        predicted.actual_rate = predicted.actual_rate * predicted.weight
        from matplotlib import mlab
        adj_rates = mlab.rec_groupby(predicted, ('country','year'),
                (('mean_rate', np.sum, 'adj_mean_rate'),
                 ('actual_rate', np.sum, 'adj_actual_rate')))

        # calculate RMSE/RMdSE
        err = adj_rates.adj_mean_rate - adj_rates.adj_actual_rate
        sq_err = err ** 2.
        mse = np.mean(sq_err)
        mdse = np.median(sq_err)
        rmse = np.sqrt(mse)
        rmdse = np.sqrt(mdse)

        # calculate AARE/MdARE
        abs_rel_err = np.abs(err / adj_rates.adj_actual_rate)
        aare = np.mean(abs_rel_err)
        mdare = np.median(abs_rel_err)

        # calculate coverage (age-specific, not age-adjusted)
        coverage = np.array((predicted.upper_deaths >= predicted.actual_deaths) &
                            (predicted.lower_deaths <= predicted.actual_deaths)).astype(np.int).mean()

        # output fit metrics
        print 'Root Mean Square Error: ' + str(rmse), \
              '\nRoot Median Square Error: ' + str(rmdse), \
              '\nAverage Absolute Relative Error: ' + str(aare), \
              '\nMedian Absolute Relative Error: ' + str(mdare), \
              '\nCoverage: ' + str(coverage)
        pl.rec2csv(np.core.records.fromarrays(
                [np.array(('rmse','rmdse','aare','mdare','coverage')),
                 np.array((rmse,rmdse,aare,mdare,coverage))],
                names=['metric','value']),
            '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name + '_fits_' + self.cause + '_' + self.sex + '.csv')
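# A small standalone example of the mlab.rec_groupby pattern used above;
# the column names and values here are illustrative.
import numpy as np
from matplotlib import mlab

r = np.rec.fromarrays([np.array(['a', 'a', 'b']), np.array([1., 2., 3.])],
                      names=['grp', 'val'])
# one output row per group, summing val (analogous to the rate aggregation above)
summed = mlab.rec_groupby(r, ('grp',), (('val', np.sum, 'val_sum'),))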
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

# save basic estimates
model_estimates = model.trace('estimate')[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/spatial smoothing/SWRI_with_spatial.csv')

'''
### plot diagnostics
# setup plotting
#import matplotlib.pyplot as pp
#pp.switch_backend('agg')
plot_me = [mu_si, mu_ss, mu_ci, mu_cs,
           sigma_si, sigma_ss, sigma_ci, sigma_cs,
           state_intercepts, state_slopes, cause_intercepts, cause_slopes]

# plot traces
os.chdir(proj_dir + '/outputs/model results/simple random effects by state/mcmc plots/traces/')
for p in plot_me:
    mc.Matplot.plot(p, suffix='_trace')
    if len(p.shape) == 0:
        plt.close()
        weights = np.array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i+2)
        j = i + 1
        weights = np.array([(j - index), (index - i)], float)
        wshape = [1]*sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

# save basic estimates
model_estimates = model.trace('estimate')[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/spatial smoothing/' + mod_name + '_' + str(sex) + '_' + age + '.csv')

# save draws
draws = pl.rec_append_fields(
    rec = data,
    names = ['draw_' + str(i+1) for i in range(100)],
    arrs = [model.trace('estimate')[i] for i in range(100)])
pl.rec2csv(draws, proj_dir + 'outputs/model results/spatial smoothing/' + mod_name + '_draws_' + str(sex) + '_' + age + '.csv')
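# For reference: recent NumPy versions provide np.percentile, which computes
# the same summaries as the hand-rolled percentile() above.
import numpy as np

mean_estimate = model_estimates.mean(axis=0)
lower_estimate = np.percentile(model_estimates, 2.5, axis=0)
upper_estimate = np.percentile(model_estimates, 97.5, axis=0)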
    tfr_draws[draw,:] = Realization(M, C)(predictionyears)

# collapse across draws
# note: space transformations need to be performed at the draw level
logit_est = gpr.collapse_sims(tfr_draws)
unlogit_est = gpr.collapse_sims(np.exp(tfr_draws)*tfr_bound/(1+np.exp(tfr_draws)))  # get the inverse logit

os.chdir('FILEPATH')
all_est = []
for i in range(len(predictionyears)):
    all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                    unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
pl.rec2csv(all_est, 'gpr_%s.txt' % (cc + '_' + str(best_amp2x) + '_' + str(best_scale)))

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s,
                        np.exp(tfr_draws[s][i])*tfr_bound/(1+np.exp(tfr_draws[s][i]))))
all_sim = pl.array(all_sim, [('ihme_loc', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('fert', '<f8')])
pl.rec2csv(all_sim, 'gpr_%s_sim.txt' % (cc + '_' + str(best_amp2x) + '_' + str(best_scale)))
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test, spacetime_iters, top_submodel):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m+1)].dtype == 'float64':
            all_data = np.delete(all_data, np.where(np.isnan(all_data['spacetime_' + str(m+1)]))[0], axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i]) for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    total_iters = np.sum(spacetime_iters)
    draws = [np.empty(len(country_age_list), 'float') for i in range(total_iters)]
    if (top_submodel > 0):
        top_submodel_draws = [np.empty(len(country_age_list), 'float') for i in range(100)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0) & (ca_data['test_' + test] == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # keep track of how many iterations have been added for this model
        iter_counter = 0

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # continue making predictions if we actually need draws for this model
            if (spacetime_iters[m] > 0) or (m+1 == top_submodel):

                # skip models with no spacetime results
                if all_data['spacetime_' + str(m+1)].dtype != 'float64':
                    for i in range(spacetime_iters[m]):
                        draws[iter_counter][country_age_list == ca] = np.NaN
                        iter_counter += 1
                    if (m+1 == top_submodel):
                        for i in range(100):
                            top_submodel_draws[i][country_age_list == ca] = np.NaN
                    continue

                # make a list of the spacetime predictions
                ca_prior = np.array([np.mean(ca_data['spacetime_' + str(m+1)][ca_data.year == y]) for y in year_list])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m+1)])

                # make a linear interpolation of the spatio-temporal predictions
                # to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M, C=C, obs_mesh=ca_observed.year,
                               obs_V=ca_observed['spacetime_data_variance_' + str(m+1)],
                               obs_vals=ca_observed[dv])

                # draw realizations from the data
                realizations = [gp.Realization(M, C) for i in range(spacetime_iters[m])]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                age_group[country_age_list == ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(spacetime_iters[m]):
                    try:
                        draws[iter_counter][country_age_list == ca] = realizations[i](year_list)
                    except:
                        print('Failure in ' + ca)
                    iter_counter += 1

                # if it's the top submodel, do 100 additional draws
                if (m+1 == top_submodel):
                    realizations = [gp.Realization(M, C) for i in range(100)]
                    for i in range(100):
                        try:
                            top_submodel_draws[i][country_age_list == ca] = realizations[i](year_list)
                        except:
                            print('Failure in ' + ca)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for i in range(total_iters):
        results = recfunctions.append_fields(results, 'ensemble_d' + str(i+1), draws[i])
    if (top_submodel > 0):
        for i in range(100):
            results = recfunctions.append_fields(results, 'top_submodel_d' + str(i+1), top_submodel_draws[i])
    rec2csv(results, outfile)
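# A hypothetical invocation of fit_GPR; the file names, dependent variables,
# and submodel settings below are made up for illustration.
fit_GPR(infile='spacetime_results.csv', outfile='gpr_results.csv',
        dv_list=['lt_cf', 'lt_cf'], scale=40, number_submodels=2,
        test='insample', spacetime_iters=[50, 50], top_submodel=1)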
def rec2csv_2d(Y, fname):
    """ write a 2-dimensional array to a csv file as a recarray """
    pl.rec2csv(pl.np.core.records.fromarrays(Y.T), fname)
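# Usage sketch: column f0 of the output holds Y[:, 0] and f1 holds Y[:, 1]
# (fromarrays assigns default field names); the array is illustrative.
Y = pl.np.arange(6.).reshape(3, 2)
rec2csv_2d(Y, 'Y_2d.csv')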
for i in range(len(predictionyears)):
    all_est.append((cc, ss, predictionyears[i], unlog_est['med'][i],
                    unlog_est['lower'][i], unlog_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('sex', '|S32'), ('year', '<f8'),
                             ('mort_med', '<f8'), ('mort_lower', '<f8'), ('mort_upper', '<f8')])

## no need to save the summary if we're doing the HIV draws version
if (huncert == int(1)):
    os.chdir('strPath')
else:
    os.chdir('%s/strPath' % ('/home/j' if os.name == 'posix' else 'J:'))
    pl.rec2csv(all_est, 'gpr_%s_%s_not_scaled.txt' % (cc, ss))

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(dr):
        if (transform == 'log10'):
            all_sim.append((cc, ss, predictionyears[i], s, 10**d[s][i]))  # log base 10 space
        elif (transform == 'ln'):
            all_sim.append((cc, ss, predictionyears[i], s, math.e**d[s][i]))  # natural log space
        elif (transform == 'logit'):
            all_sim.append((cc, ss, predictionyears[i], s,
                            (math.e**d[s][i]) / (1 + math.e**d[s][i])))  # logit space
        weights = np.array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i+2)
        j = i + 1
        weights = np.array([(j - index), (index - i)], float)
        wshape = [1]*sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

# save basic estimates
model_estimates = model.trace('estimate')[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/random effects plus flex time/' + mod_name + '_' + str(sex) + '_' + age + '.csv')

# save draws
draws = pl.rec_append_fields(
    rec = data,
    names = ['draw_' + str(i+1) for i in range(100)],
    arrs = [model.trace('estimate')[i] for i in range(100)])
pl.rec2csv(draws, proj_dir + 'outputs/model results/random effects plus flex time/' + mod_name + '_draws_' + str(sex) + '_' + age + '.csv')
for i in range(len(predictionyears)):
    all_est.append((cc, ss, predictionyears[i], unlog_est['med'][i],
                    unlog_est['lower'][i], unlog_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('sex', '|S32'), ('year', '<f8'),
                             ('mort_med', '<f8'), ('mort_lower', '<f8'), ('mort_upper', '<f8')])

## no need to save the summary if we're doing the HIV draws version
if (huncert == int(1)):
    os.chdir('filepath')
else:
    os.chdir('filepath')
    pl.rec2csv(all_est, 'filepath' % (cc, ss))

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(dr):
        if (transform == 'log10'):
            all_sim.append((cc, ss, predictionyears[i], s, 10**d[s][i]))  # log base 10 space
        elif (transform == 'ln'):
            all_sim.append((cc, ss, predictionyears[i], s, math.e**d[s][i]))  # natural log space
        elif (transform == 'logit'):
            all_sim.append((cc, ss, predictionyears[i], s,
                            (math.e**d[s][i]) / (1 + math.e**d[s][i])))  # logit space
def predict_test(self, save_csv=False):
    '''
    Use the MCMC traces to predict the test data
    '''
    # setup constants
    num_test_rows = self.test_data.shape[0]
    num_iters = self.mod_mc.beta.trace().shape[0]

    # indices
    t_index = dict([(t, i) for i, t in enumerate(self.year_list)])
    a_index = dict([(a, i) for i, a in enumerate(self.age_list)])

    # fixed effects
    X = np.array([self.test_data['x%d' % i] for i in range(self.mod_mc.beta.value.shape[0])])
    BX = np.dot(self.mod_mc.beta.trace(), X)

    # exposure
    '''
    if self.training_type == 'make predictions':
        E = np.ones((num_iters, num_test_rows))*self.test_data.envelope
    else:
        E = np.random.binomial(np.round(self.test_data.envelope).astype('int'),
                               (self.test_data.sample_size/self.test_data.envelope),
                               (num_iters, num_test_rows))
    '''
    E = np.ones((num_iters, num_test_rows))*self.test_data.envelope

    # pi_s
    s_index = [np.where(self.test_data.super_region == s) for s in self.super_region_list]
    t_by_s = [[t_index[self.test_data.year[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
    a_by_s = [[a_index[self.test_data.age[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
    pi_s = np.zeros((num_iters, num_test_rows))
    for s in range(len(self.super_region_list)):
        pi_s[:, s_index[s][0]] = self.mod_mc.pi_s_list.trace()[:, s][:, a_by_s[s], t_by_s[s]]
    self.test_s_index = s_index

    # pi_r
    r_index = [np.where(self.test_data.region == r) for r in self.region_list]
    t_by_r = [[t_index[self.test_data.year[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
    a_by_r = [[a_index[self.test_data.age[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
    pi_r = np.zeros((num_iters, num_test_rows))
    for r in range(len(self.region_list)):
        pi_r[:, r_index[r][0]] = self.mod_mc.pi_r_list.trace()[:, r][:, a_by_r[r], t_by_r[r]]
    self.test_r_index = r_index

    # pi_c
    c_index = [np.where(self.test_data.country == c) for c in self.country_list]
    t_by_c = [[t_index[self.test_data.year[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
    a_by_c = [[a_index[self.test_data.age[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
    pi_c = np.zeros((num_iters, num_test_rows))
    for c in range(len(self.country_list)):
        pi_c[:, c_index[c][0]] = self.mod_mc.pi_c_list.trace()[:, c][:, a_by_c[c], t_by_c[c]]
    self.test_c_index = c_index

    # make predictions
    import os
    os.chdir('/home/j/Project/Causes of Death/CoDMod/codmod2/')
    import percentile
    predictions = np.exp(BX + np.log(E) + pi_s + pi_r + pi_c)
    mean = predictions.mean(axis=0)
    lower = percentile.percentile(predictions, 2.5, axis=0)
    upper = percentile.percentile(predictions, 97.5, axis=0)
    self.predictions = self.test_data[['country','region','super_region','year','age','pop']]
    self.predictions = recfunctions.append_fields(self.predictions, 'mean_deaths', mean)
    self.predictions = recfunctions.append_fields(self.predictions, 'lower_deaths', lower)
    self.predictions = recfunctions.append_fields(self.predictions, 'upper_deaths', upper)
    if self.training_type != 'make predictions':
        self.predictions = recfunctions.append_fields(self.predictions, 'actual_deaths',
                                                      self.test_data.cf*self.test_data.envelope)
    self.predictions = self.predictions.view(np.recarray)

    # save the predictions
    if save_csv == True:
        pl.rec2csv(self.predictions, '/home/j/Project/Causes of Death/CoDMod/tmp/' +
                   self.name + '_predictions_' + self.cause + '_' + self.sex + '.csv')
def load(self, save_cache=False, use_cache=False, dir='/home/j/Project/Causes of Death/CoDMod/tmp/'):
    '''
    If use_cache=True, loads data from a previous call to the MySQL server.
    Otherwise, loads codmod data from the MySQL server. The resulting query
    will get all the data for a specified cause and sex, plus any covariates
    specified. If save_cache is True, then the results from this will be
    saved as csvs.
    '''
    # use cached data if specified
    if use_cache == True:
        self.use_cache(dir)

    # otherwise, load in the data from MySQL
    else:
        # make the sql covariate query
        covs = ''
        for i in list(set(self.covariates_untransformed)):
            if i != 'year':
                covs = covs + i + ', '
        covs = covs[0:-2]

        # load observed deaths plus covariates
        obs_sql = 'SELECT iso3 as country, a.region, a.super_region, age, year, sex, cf, sample_size, a.envelope, a.pop, ' + covs + ' FROM full_cod_database AS a LEFT JOIN all_covariates USING (iso3,year,sex,age) WHERE a.cod_id="' + self.cause + '";'
        obs = mysql_to_recarray(self.cursor, obs_sql)
        obs = obs[np.where((obs.year >= self.year_range[0]) & (obs.year <= self.year_range[1]) &
                           (obs.age >= self.age_range[0]) & (obs.age <= self.age_range[1]) &
                           (obs.sex == self.sex_num))[0]]

        # load in just covariates (for making predictions)
        all_sql = 'SELECT iso3 as country, region, super_region, age, year, sex, envelope, pop, ' + covs + ' FROM all_covariates;'
        all = mysql_to_recarray(self.cursor, all_sql)
        all = all[np.where((all.year >= self.year_range[0]) & (all.year <= self.year_range[1]) &
                           (all.age >= self.age_range[0]) & (all.age <= self.age_range[1]) &
                           (all.sex == self.sex_num))[0]]

        # get rid of rows for which covariates are unavailable
        for i in list(set(self.covariates_untransformed)):
            all = np.delete(all, np.where(np.isnan(all[i]))[0], axis=0)
            obs = np.delete(obs, np.where(np.isnan(obs[i]))[0], axis=0)

        # remove observations in which the CF is missing or outside of (0,1),
        # or where sample size/envelope is missing
        obs = np.delete(obs, np.where((np.isnan(obs.cf)) | (obs.cf > 1.) | (obs.cf < 0.) |
                                      (np.isnan(obs.sample_size)) | (obs.sample_size < 1.) |
                                      np.isnan(obs.envelope))[0], axis=0)

        # make lists of all the countries/regions/ages/years to predict for
        self.country_list = np.unique(all.country)
        self.region_list = np.unique(all.region)
        self.super_region_list = np.unique(all.super_region)
        self.age_list = np.unique(all.age)
        self.year_list = np.unique(all.year)

        # apply a moving average (5 year window) on cause fractions of 0 or 1,
        # or where sample size is less than 100
        age_lookups = {}
        for a in self.age_list:
            age_lookups[a] = np.where(obs.age == a)[0]
        country_lookups = {}
        country_age_lookups = {}
        for c in self.country_list:
            country_lookups[c] = np.where(obs.country == c)[0]
            for a in self.age_list:
                country_age_lookups[c+'_'+str(a)] = np.intersect1d(country_lookups[c], age_lookups[a])
        year_window_lookups = {}
        for y in range(self.year_range[0], self.year_range[1]+1):
            year_window_lookups[y] = np.where((obs.year >= y-2.) & (obs.year <= y+2.))[0]
        smooth_me = np.where((obs.cf==0.) | (obs.cf==1.) | (obs.sample_size<100.))[0]
        for i in smooth_me:
            obs.cf[i] = obs.cf[np.intersect1d(
                country_age_lookups[obs.country[i]+'_'+str(obs.age[i])],
                year_window_lookups[obs.year[i]])].mean()

        # for cases in which the CF is still 0 or 1 after the moving average, use the
        # smallest/largest non-0/1 CF observed in that region-age
        region_age_lookups = {}
        region_lookups = {}
        for r in self.region_list:
            region_lookups[r] = np.where(obs.region == r)[0]
            for a in self.age_list:
                region_age_lookups[str(r)+'_'+str(a)] = np.intersect1d(region_lookups[r], age_lookups[a])
        validcfs = np.where((obs.cf>0.) & (obs.cf<1.))[0]
        for i in np.where(obs.cf==0.)[0]:
            candidates = np.intersect1d(region_age_lookups[str(obs.region[i])+'_'+str(obs.age[i])], validcfs)
            if candidates.shape[0] == 0:
                obs.cf[i] = 0.
            else:
                obs.cf[i] = obs.cf[candidates].min()
        for i in np.where(obs.cf==1.)[0]:
            candidates = np.intersect1d(region_age_lookups[str(obs.region[i])+'_'+str(obs.age[i])], validcfs)
            if candidates.shape[0] == 0:
                obs.cf[i] = 1.
            else:
                obs.cf[i] = obs.cf[candidates].max()

        # finally, any CF that is still 0 or 1 after the above corrections should simply be dropped
        obs = np.delete(obs, np.where((obs.cf == 0.) | (obs.cf == 1.))[0], axis=0)

        # we treat our envelope as truth, so never allow sample size to exceed it
        shrink_me = np.where(obs.sample_size > obs.envelope*.999)[0]
        obs.sample_size[shrink_me] = obs.envelope[shrink_me]*.999

        # make covariate matrices (including transformations and normalization)
        obs_vectors = [obs.country, obs.region, obs.super_region, obs.year, obs.age,
                       obs.cf, obs.sample_size, obs.envelope, obs.pop, np.ones(obs.shape[0])]
        obs_names = ['country', 'region', 'super_region', 'year', 'age',
                     'cf', 'sample_size', 'envelope', 'pop', 'x0']
        all_vectors = [all.country, all.region, all.super_region, all.year, all.age,
                       all.envelope, all.pop, np.ones(all.shape[0])]
        all_names = ['country', 'region', 'super_region', 'year', 'age', 'envelope', 'pop', 'x0']
        self.covariate_dict = {'x0': 'constant'}
        for i in range(len(self.covariate_list)):
            a = all[self.covariates_untransformed[i]]
            o = obs[self.covariates_untransformed[i]]
            if self.covariate_transformations[i] == 'ln':
                a = np.log(a)
                o = np.log(o)
            elif self.covariate_transformations[i] == 'ln+sq':
                a = (np.log(a))**2
                o = (np.log(o))**2
            elif self.covariate_transformations[i] == 'sq':
                a = a**2
                o = o**2
            if self.normalize == True:
                cov_mean = np.mean(a)
                cov_sd = np.std(a)
                a = ((a-cov_mean)/cov_sd)
                o = ((o-cov_mean)/cov_sd)
            all_vectors.append(a)
            all_names.append('x' + str(i+1))
            obs_vectors.append(o)
            obs_names.append('x' + str(i+1))
            self.covariate_dict['x' + str(i+1)] = self.covariate_list[i]

        # create age dummies if specified
        if self.age_dummies == True:
            pre_ref = 1
            for i, j in enumerate(self.age_list):
                if j == self.age_ref:
                    pre_ref = 0
                elif pre_ref == 1:
                    all_vectors.append(np.array(all.age==j).astype(np.float))
                    all_names.append('x' + str(len(self.covariate_list)+i+1))
                    obs_vectors.append(np.array(obs.age==j).astype(np.float))
                    obs_names.append('x' + str(len(self.covariate_list)+i+1))
                    self.covariate_dict['x' + str(len(self.covariate_list)+i+1)] = 'Age ' + str(j)
                else:
                    all_vectors.append(np.array(all.age==j).astype(np.float))
                    all_names.append('x' + str(len(self.covariate_list)+i))
                    obs_vectors.append(np.array(obs.age==j).astype(np.float))
                    obs_names.append('x' + str(len(self.covariate_list)+i))
                    self.covariate_dict['x' + str(len(self.covariate_list)+i)] = 'Age ' + str(j)

        # return the prediction and observation matrices
        self.prediction_matrix = np.core.records.fromarrays(all_vectors, names=all_names)
        self.observation_matrix = np.core.records.fromarrays(obs_vectors, names=obs_names)

        # prep all the in-sample data
        self.data_rows = self.observation_matrix.shape[0]
        print 'Data Rows:', self.data_rows
        self.training_split()

        # cache the data if requested
        if save_cache == True:
            pl.rec2csv(self.prediction_matrix, '/home/j/Project/Causes of Death/CoDMod/tmp/prediction_matrix_' + self.cause + '_' + self.sex + '.csv')
            pl.rec2csv(self.observation_matrix, '/home/j/Project/Causes of Death/CoDMod/tmp/observation_matrix_' + self.cause + '_' + self.sex + '.csv')

    # load in age weights for creating age adjusted rates later
    age_weights = mysql_to_recarray(self.cursor, 'SELECT age,weight FROM age_weights;')
    age_weights = recfunctions.append_fields(age_weights, 'keep', np.zeros(age_weights.shape[0])).view(np.recarray)
    for a in self.age_list:
        age_weights.keep[np.where(age_weights.age==a)[0]] = 1
    age_weights = np.delete(age_weights, np.where(age_weights.keep==0)[0], axis=0)
    age_weights.weight = age_weights.weight/age_weights.weight.sum()
    self.age_weights = age_weights
def predict_test(self, save_csv=False):
    '''
    Use the MCMC traces to predict the test data
    '''
    # setup constants
    num_test_rows = self.test_data.shape[0]
    num_iters = self.approxs['beta'].shape[0]

    # indices
    t_index = dict([(t, i) for i, t in enumerate(self.year_list)])
    a_index = dict([(a, i) for i, a in enumerate(self.age_list)])

    # fixed effects
    X = np.array([self.test_data['x%d' % i] for i in range(self.mod_mc.beta.value.shape[0])])
    BX = np.dot(self.approxs['beta'], X)

    # exposure
    '''
    if self.training_type == 'make predictions':
        E = np.ones((num_iters, num_test_rows))*self.test_data.envelope
    else:
        E = np.random.binomial(np.round(self.test_data.envelope).astype('int'),
                               (self.test_data.sample_size/self.test_data.envelope),
                               (num_iters, num_test_rows))
    '''
    E = np.ones((num_iters, num_test_rows))*self.test_data.envelope

    # interpolation parameters
    x_samples = self.sample_points[:,0]
    y_samples = self.sample_points[:,1]
    xb = self.age_list[0]
    xe = self.age_list[-1]
    yb = self.year_list[0]
    ye = self.year_list[-1]
    kx = 3 if len(self.age_samples) > 3 else len(self.age_samples)-1
    ky = 3 if len(self.year_samples) > 3 else len(self.year_samples)-1

    # pi_s
    s_index = [np.where(self.test_data.super_region == s) for s in self.super_region_list]
    t_by_s = [[t_index[self.test_data.year[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
    a_by_s = [[a_index[self.test_data.age[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
    pi_s = np.zeros((num_iters, num_test_rows))
    for s in range(len(self.super_region_list)):
        for i in range(num_iters):
            interpolator = interpolate.bisplrep(x=x_samples, y=y_samples, z=self.approxs['pi_s_'+str(s)][i],
                                                xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
            pi_s[i, s_index[s][0]] = interpolate.bisplev(x=self.age_list, y=self.year_list,
                                                         tck=interpolator)[a_by_s[s], t_by_s[s]]
        mean_pi_s = pi_s[:, s_index[s][0]].mean(axis=1)
        pi_s[:, s_index[s][0]] = pi_s[:, s_index[s][0]][np.argsort(mean_pi_s)]

    # pi_r
    r_index = [np.where(self.test_data.region == r) for r in self.region_list]
    t_by_r = [[t_index[self.test_data.year[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
    a_by_r = [[a_index[self.test_data.age[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
    pi_r = np.zeros((num_iters, num_test_rows))
    for r in range(len(self.region_list)):
        for i in range(num_iters):
            interpolator = interpolate.bisplrep(x=x_samples, y=y_samples, z=self.approxs['pi_r_'+str(r)][i],
                                                xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
            pi_r[i, r_index[r][0]] = interpolate.bisplev(x=self.age_list, y=self.year_list,
                                                         tck=interpolator)[a_by_r[r], t_by_r[r]]
        mean_pi_r = pi_r[:, r_index[r][0]].mean(axis=1)
        pi_r[:, r_index[r][0]] = pi_r[:, r_index[r][0]][np.argsort(mean_pi_r)]

    # pi_c
    c_index = [np.where(self.test_data.country == c) for c in self.country_list]
    t_by_c = [[t_index[self.test_data.year[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
    a_by_c = [[a_index[self.test_data.age[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
    pi_c = np.zeros((num_iters, num_test_rows))
    for c in range(len(self.country_list)):
        for i in range(num_iters):
            interpolator = interpolate.bisplrep(x=x_samples, y=y_samples, z=self.approxs['pi_c_'+str(c)][i],
                                                xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
            pi_c[i, c_index[c][0]] = interpolate.bisplev(x=self.age_list, y=self.year_list,
                                                         tck=interpolator)[a_by_c[c], t_by_c[c]]
        mean_pi_c = pi_c[:, c_index[c][0]].mean(axis=1)
        # sort the pi_c draws by their mean, as for pi_s and pi_r
        pi_c[:, c_index[c][0]] = pi_c[:, c_index[c][0]][np.argsort(mean_pi_c)]

    # make predictions
    import os
    os.chdir('/home/j/Project/Causes of Death/CoDMod/codmod2/')
    import percentile
    predictions = np.exp(BX + np.log(E) + pi_s + pi_r + pi_c)
    mean = predictions.mean(axis=0)
    lower = percentile.percentile(predictions, 2.5, axis=0)
    upper = percentile.percentile(predictions, 97.5, axis=0)
    self.predictions = self.test_data[['country','region','super_region','year','age','pop']]
    self.predictions = recfunctions.append_fields(self.predictions, 'mean_deaths', mean)
    self.predictions = recfunctions.append_fields(self.predictions, 'lower_deaths', lower)
    self.predictions = recfunctions.append_fields(self.predictions, 'upper_deaths', upper)
    if self.training_type != 'make predictions':
        self.predictions = recfunctions.append_fields(self.predictions, 'actual_deaths',
                                                      self.test_data.cf*self.test_data.envelope)
    self.predictions = self.predictions.view(np.recarray)

    # save the predictions
    if save_csv == True:
        pl.rec2csv(self.predictions, '/home/j/Project/Causes of Death/CoDMod/tmp/' +
                   self.name + '_predictions_' + self.cause + '_' + self.sex + '.csv')
## find mean and standard error, drawing from M and C
draws = 1000
mort_draws = np.zeros((draws, len(predictionyears)))
gpr_seeds = [x + 123456 for x in range(1, 1001)]
for draw in range(draws):
    np.random.seed(gpr_seeds[draw])
    mort_draws[draw, :] = Realization(M, C)(predictionyears)

logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(mort_draws)

os.chdir('FILEPATH' + sex + '_' + age + '/')
all_est = []
for i in range(len(predictionyears)):
    all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                    unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
pl.rec2csv(all_est, 'gpr_' + cc + '_' + sex + '_' + age + '.txt')

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s, mort_draws[s][i]))
all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])
pl.rec2csv(all_sim, 'gpr_' + cc + '_' + sex + '_' + age + '_sim.txt')
        total_var = var + logit_est['std'][pred_index]**2
        coverage = int((logit_est['med'][pred_index] - 1.96*pl.sqrt(total_var)) < mort <
                       (logit_est['med'][pred_index] + 1.96*pl.sqrt(total_var)))
        all_err.append((rr, cc, ho, scale, amp2x, mse*amp2x, year, mort, re, coverage))

## write files
if os.name == 'posix':
    os.chdir('FILEPATH')
else:
    os.chdir('FILEPATH')
all_est = pl.array(all_est, [('gbd_region', '|S64'), ('iso3', '|S32'), ('ho', '<f8'),
                             ('scale', '<f8'), ('amp2x', '<f8'), ('amp2', '<f8'),
                             ('year', '<f8'), ('mort', '<f8'), ('std', '<f8')])
pl.rec2csv(all_est, 'gpr_%s_%i.txt' % (cc, ho))

if os.name == 'posix':
    os.chdir('FILEPATH')
else:
    os.chdir('FILEPATH')
all_err = pl.array(all_err, [('gbd_region', '|S64'), ('iso3', '|S32'), ('ho', '<f8'),
                             ('scale', '<f8'), ('amp2x', '<f8'), ('amp2', '<f8'),
                             ('year', '<f8'), ('mort', '<f8'), ('re', '<f8'), ('coverage', '<f8')])
pl.rec2csv(all_err, 'loss_%s_%i.txt' % (cc, ho))
coldict['lat'] = np.concatenate((duffy_data.lat, vivax_data.lat))
coldict['n'] = np.concatenate((duffy_data.n, vivax_data.pos + vivax_data.neg))
coldict['vivax_pos'] = np.concatenate((duffy_nan, vivax_data.pos))
coldict['vivax_neg'] = np.concatenate((duffy_nan, vivax_data.neg))
coldict['datatype'] = np.concatenate((duffy_data.datatype, np.repeat('vivax', n_vivax)))
for colname in vivaxcols:
    coldict[colname] = np.concatenate((duffy_nan, vivax_data[colname]))
for colname in duffycols:
    coldict[colname] = np.concatenate((duffy_data[colname], vivax_nan))
allcols = coldict.keys()
combined_data = np.rec.fromarrays([coldict[col] for col in allcols], names=allcols)

# FIXME: Do the Sahel instead.
def box_data(data, llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat):
    indicator = (data.lon > llcrnrlon) * (data.lon < urcrnrlon) * \
                (data.lat > llcrnrlat) * (data.lat < urcrnrlat)
    return data[np.where(indicator)]

# Write out
# warnings.warn('Boxing')
# combined_data = combined_data[np.where((combined_data.lon>-19)*(combined_data.lon<54)*(combined_data.lat>0))]
# combined_data = box_data(combined_data, 31.5, 11.5, 64, 32)
rec2csv(combined_data, combined_datafile)
for i in range(len(predictionyears)):
    all_est.append((cc, ss, predictionyears[i], unlog_est['med'][i],
                    unlog_est['lower'][i], unlog_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('sex', '|S32'), ('year', '<f8'),
                             ('mort_med', '<f8'), ('mort_lower', '<f8'), ('mort_upper', '<f8')])

## no need to save the summary if we're doing the HIV draws version
if (hiv_uncert == int(1)):
    pass
else:
    est_file = "FILEPATH"
    pl.rec2csv(all_est, est_file)

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(dr):
        if (transform == 'log10'):
            all_sim.append((cc, ss, predictionyears[i], s, 10**d[s][i]))  # log base 10 space
        elif (transform == 'ln'):
            all_sim.append((cc, ss, predictionyears[i], s, math.e**d[s][i]))  # natural log space
        elif (transform == 'logit'):
            all_sim.append((cc, ss, predictionyears[i], s,
                            (math.e**d[s][i]) / (1 + math.e**d[s][i])))  # logit space
# collapse across draws
# note: space transformations need to be performed at the draw level
# (not actually doing any transformations here, we'll do them after)
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(mort_draws)

# save the predictions
all_est = []
for i in range(len(predictionyears)):
    all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                    unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
output_file = "FILEPATH"
pl.rec2csv(all_est, output_file)

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s, mort_draws[s][i]))
all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])
output_file = "FILEPATH"
pl.rec2csv(all_sim, output_file)
    else:
        indexer[axis] = slice(i, i + 2)
        j = i + 1
        weights = np.array([(j - index), (index - i)], float)
        wshape = [1] * sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer] * weights, axis=axis, out=out) / sumval

# save basic estimates
model_estimates = model.trace("estimate")[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec=data,
    names=["mean", "lower", "upper"],
    arrs=[mean_estimate, lower_estimate, upper_estimate]
)
pl.rec2csv(
    output,
    proj_dir + "outputs/model results/spatial smoothing/" + mod_name + "_" + str(sex) + "_" + age + ".csv"
)

# save draws
draws = pl.rec_append_fields(
    rec=data,
    names=["draw_" + str(i + 1) for i in range(500)],
    arrs=[model.trace("estimate")[i] for i in range(500)]
)
pl.rec2csv(
    draws,
    proj_dir + "outputs/model results/spatial smoothing/" + mod_name + "_draws_" + str(sex) + "_" + age + ".csv"
)
# collapse across draws
# note: space transformations need to be performed at the draw level
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(gpr.inv_logit(mort_draws))

if hivsims == 0:
    os.chdir('FILEPATH')
    all_est = []
    for i in range(len(predictionyears)):
        all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                        unlogit_est['lower'][i], unlogit_est['upper'][i]))
    all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                                 ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
    pl.rec2csv(all_est, 'gpr_%s.txt' % cc)

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s, gpr.inv_logit(mort_draws[s][i])))
all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])

if hivsims == 1:
    os.chdir('FILEPATH')
    pl.rec2csv(all_sim, 'gpr_%s_%s_sim.txt' % (cc, rnum))
else:
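# A minimal sketch of the draw-level inverse-logit transform that
# gpr.inv_logit is assumed to perform above, mapping logit-space draws
# back to the (0, 1) scale.
import numpy as np

def inv_logit(x):
    # elementwise inverse logit: 1 / (1 + exp(-x))
    return 1.0 / (1.0 + np.exp(-x))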
full_dir = '%s/v02_prep_%s' % (indir, iso3)

# get cause list
causes = list(set([file.split('+')[1] for file in os.listdir(full_dir) if re.search(age, file)]))
causes.remove('HIV')  # temporary until Miriam fixes the HIV files

# gather data and fit model
cf = data.get_cod_data(full_dir, causes, age, iso3, sex)
m, pi = models.fit_latent_simplex(cf)

# calculate summary measures
N, T, J = pi.shape
mean = pi.mean(0)
lower = pl.array([[st.mquantiles(pi[:,t,j], 0.025)[0] for j in range(J)] for t in range(T)])
upper = pl.array([[st.mquantiles(pi[:,t,j], 0.975)[0] for j in range(J)] for t in range(T)])

# format summary and save
output = pl.np.core.records.fromarrays(mean.T, names=['%s_mean' % c for c in causes])
output = pl.rec_append_fields(output, ['%s_lower' % c for c in causes], lower.T)
output = pl.rec_append_fields(output, ['%s_upper' % c for c in causes], upper.T)
pl.rec2csv(output, '%s/%s+%s+%s+summary.csv' % (outdir, iso3, age, sex))

# format all sims and save
pi.shape = (N*T, J)
years = pl.array([t for s in range(N) for t in range(1980, 2012)])
sim = pl.array([s for s in range(N) for t in range(1980, 2012)])
output = pl.np.core.records.fromarrays(pi.T, names=causes)
output = pl.rec_append_fields(output, 'year', years)
output = pl.rec_append_fields(output, 'sim', sim)
pl.rec2csv(output, '%s/%s+%s+%s.csv' % (outdir, iso3, age, sex))
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, iters):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        all_data = np.delete(all_data, np.where(np.isnan(all_data['spacetime_' + str(m+1)]))[0], axis=0)

    # Investigate error thrown for HKG, MAC, and SGP... they don't have data,
    # but don't know why this is breaking line 62
    all_data = all_data[all_data['iso3'] != "HKG"]
    all_data = all_data[all_data['iso3'] != "MAC"]
    all_data = all_data[all_data['iso3'] != "SGP"]

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([all_data.iso3[i] for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [np.empty(len(country_age_list), 'float') for i in range(iters*number_submodels*2)]
    iso3 = np.empty(len(country_age_list), '|S3')
    # age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_prev'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_prev']) == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # loop through spacetime/linear
            for x, t in enumerate(['spacetime']):

                # make a list of the spacetime predictions
                ca_prior = np.array([np.mean(ca_data[t + '_' + str(m+1)][ca_data.year == y]) for y in year_list])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data[t + '_amplitude_' + str(m+1)])

                # make a linear interpolation of the spatio-temporal predictions
                # to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M, C=C, obs_mesh=ca_observed.year,
                               obs_V=ca_observed[t + '_data_variance_' + str(m+1)],
                               obs_vals=ca_observed['lt_prev'])

                # draw realizations from the data
                realizations = [gp.Realization(M, C) for i in range(iters)]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                # age_group[country_age_list==ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(iters):
                    draws[((2*m + x)*iters) + i][country_age_list == ca] = realizations[i](year_list)

    # save the results
    print('Saving GPR results')
    # age_group is commented out above, so only two key columns are written
    names = ['iso3', 'year']
    results = np.core.records.fromarrays([iso3, year], names=names)
    for m in range(number_submodels):
        for x, t in enumerate(['spacetime']):
            for i in range(iters):
                results = recfunctions.append_fields(results, 'gpr_' + str(m+1) + '_' + t + '_d' + str(i+1),
                                                     draws[((2*m + x)*iters) + i])
            results = recfunctions.append_fields(results, 'gpr_' + str(m+1) + '_' + t + '_mean',
                                                 np.mean(draws[((2*m + x)*iters):((2*m + x + 1)*iters)], axis=0))
    rec2csv(results, outfile)
def nan_callback(lon_old, lat_old, data, lon_new, lat_new):
    # fill masked pixels with the modal value of the nearest unmasked neighbourhood
    lon_ind = np.argmin(np.abs(np.subtract.outer(lon_old, lon_new)), axis=0)
    lat_ind = np.argmin(np.abs(np.subtract.outer(lat_old, lat_new)), axis=0)
    out = lat_new*0
    for i in xrange(len(lon_new)):
        lai, loi = lat_ind[i], lon_ind[i]
        if data.mask[lai, loi]:
            # expand a square window until it contains at least one unmasked pixel
            for d in xrange(10):
                window = data.mask[lai-d:lai+d, loi-d:loi+d]
                if not np.all(window):
                    out[i] = mode(data.data[lai-d:lai+d, loi-d:loi+d][np.where(~window)])
                    break
        else:
            out[i] = data[lai, loi]
    if np.any(np.isnan(out)):
        raise ValueError('NaNs remain after nearest-neighbour fill')
    return out

for fname in map(lambda n: n+'.hdf5', covariate_names):
    print 'Evaluating %s' % fname
    colname = os.path.splitext(fname)[0]
    hf = tb.openFile(os.path.join(covariate_path, fname))
    cols[colname] = map_utils.interp_geodata(hf.root.lon[:], hf.root.lat[:], hf.root.data[:],
                                             cols['lon'], cols['lat'], hf.root.mask[:],
                                             order=0, nan_handler=nan_callback)
    if np.any(np.isnan(cols[colname])):
        raise ValueError('NaNs in covariate column %s' % colname)
    hf.close()

keys = cols.keys()
data_out = np.rec.fromarrays([cols[k] for k in keys], names=keys)
rec2csv(data_out, os.path.splitext(os.path.basename(sys.argv[1]))[0] + '_with_covariates.csv')
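# The index lookup above in miniature: np.subtract.outer builds the full
# grid-vs-point difference matrix, and argmin along axis 0 picks the
# nearest grid cell for each new point. A toy check:
import numpy as np

grid = np.array([0., 1., 2., 3.])
pts = np.array([0.2, 2.9])
nearest = np.argmin(np.abs(np.subtract.outer(grid, pts)), axis=0)
# nearest == array([0, 3]); grid[nearest] are the closest grid values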
        weights = np.array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i+2)
        j = i + 1
        weights = np.array([(j - index), (index - i)], float)
        wshape = [1]*sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()

    # linearly interpolate between the two bracketing order statistics
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

# save basic estimates
model_estimates = model.trace('estimate')[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/spatial smoothing/spatial_intercept_' + str(sex) + '_' + age + '.csv')

# save draws
draws = pl.rec_append_fields(
    rec = data,
    names = ['draw_' + str(i+1) for i in range(100)],
    arrs = [model.trace('estimate')[i] for i in range(100)])
pl.rec2csv(draws, proj_dir + 'outputs/model results/spatial smoothing/spatial_intercept_draws_' + str(sex) + '_' + age + '.csv')
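# A quick sanity check for the percentile helper above (its opening
# lines are not shown in this excerpt): assuming it implements the
# usual linear interpolation between order statistics, it should agree
# with numpy's built-in percentile.
import numpy as np

x_demo = np.random.randn(1000, 4)
assert np.allclose(percentile(x_demo, 2.5, axis=0), np.percentile(x_demo, 2.5, axis=0))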
## find mean and standard error, drawing from M and C
draws = 1000
mort_draws = np.zeros((draws, len(predictionyears)))
gpr_seeds = [x+123456 for x in range(1, 1001)]
for draw in range(draws):
    np.random.seed(gpr_seeds[draw])
    mort_draws[draw,:] = Realization(M, C)(predictionyears)

# collapse across draws; transform out of logit space at the draw level
# (cf. the data-model branch below)
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(gpr.inv_logit(mort_draws))

os.chdir('FILEPATH')
all_est = []
for i in range(len(predictionyears)):
    all_est.append((cc, predictionyears[i], unlogit_est['med'][i], unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'), ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
pl.rec2csv(all_est, 'gpr_%s.txt' % cc)

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s, mort_draws[s][i]))
all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'), ('sim', '<f8'), ('mort', '<f8')])
pl.rec2csv(all_sim, 'gpr_%s_sim.txt' % cc)
C = pm.gp.FullRankCovariance(my_st, amp=1, scale=1, inc=np.pi/4, ecc=.3, st=.1, sd=.5, tlc=.2, sf=.1)

dm = np.vstack((lon, lat, t)).T
C_eval = C(dm, dm)
f = pm.rmv_normal_cov(np.sum([cv[name]*vals[name] for name in names], axis=0), C_eval) \
    + np.random.normal(size=n_data+n_pred)*np.sqrt(V)
p = pm.flib.invlogit(f)
ns = 100
pos = pm.rbinomial(ns, p)
neg = ns - pos

print p

ra_data = np.rec.fromarrays((pos[:n_data], neg[:n_data], lon[:n_data], lat[:n_data])
                            + tuple([cv[name][:n_data] for name in names]),
                            names=['pos', 'neg', 'lon', 'lat'] + names)
pl.rec2csv(ra_data, 'test_data.csv')

ra_pred = np.rec.fromarrays((pos[n_data:], neg[n_data:], lon[n_data:], lat[n_data:])
                            + tuple([cv[name][n_data:] for name in names]),
                            names=['pos', 'neg', 'lon', 'lat'] + names)
pl.rec2csv(ra_pred, 'test_pred.csv')

os.system('infer cov_test test_db test_data.csv -t 10 -n 8 -i 100000')
# os.system('cov-test-predict test test_pred.csv 1000 100')
# ra_data = pl.csv2rec('test_data.csv')
# ra_pred = pl.csv2rec('test_pred.csv')
# samps = np.fromfile('test_samps.csv', sep=',').reshape((n_pred, -1))
# pos_pred = pos[n_data:]
# neg_pred = neg[n_data:]
# p_pred = (pos_pred+1.)/(pos_pred+neg_pred+2.)
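# The simulation's observation model in miniature: a latent Gaussian
# field f is pushed through the inverse logit to get prevalence p, then
# binomial counts are drawn. Plain numpy stand-ins for pm.flib.invlogit
# and pm.rbinomial:
import numpy as np

f_demo = np.random.normal(size=5)
p_demo = 1.0 / (1.0 + np.exp(-f_demo))      # inverse logit
pos_demo = np.random.binomial(100, p_demo)  # ns = 100 trials per site
neg_demo = 100 - pos_demo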
coldict = {}
coldict["t"] = np.concatenate((duffy_nan, (tstart + tend) / 2.0))
coldict["lon"] = np.concatenate((duffy_data.lon, vivax_data.lon))
coldict["lat"] = np.concatenate((duffy_data.lat, vivax_data.lat))
coldict["n"] = np.concatenate((duffy_data.n, vivax_data.pos + vivax_data.neg))
coldict["vivax_pos"] = np.concatenate((duffy_nan, vivax_data.pos))
coldict["vivax_neg"] = np.concatenate((duffy_nan, vivax_data.neg))
coldict["datatype"] = np.concatenate((duffy_data.datatype, np.repeat("vivax", n_vivax)))
for colname in vivaxcols:
    coldict[colname] = np.concatenate((duffy_nan, vivax_data[colname]))
for colname in duffycols:
    coldict[colname] = np.concatenate((duffy_data[colname], vivax_nan))

allcols = coldict.keys()
combined_data = np.rec.fromarrays([coldict[col] for col in allcols], names=allcols)

# FIXME: Do the Sahel instead.
def box_data(data, llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat):
    indicator = (data.lon > llcrnrlon) * (data.lon < urcrnrlon) * (data.lat > llcrnrlat) * (data.lat < urcrnrlat)
    return data[np.where(indicator)]

# Write out
# warnings.warn('Boxing')
# combined_data = combined_data[np.where((combined_data.lon>-19)*(combined_data.lon<54)*(combined_data.lat>0))]
# combined_data = box_data(combined_data, 31.5, 11.5, 64, 32)
rec2csv(combined_data, combined_datafile)
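# Usage sketch for box_data above: clip a record array to a lon/lat
# bounding box. The toy coordinates are illustrative; the commented-out
# call above used the box (31.5, 11.5, 64, 32).
demo = np.rec.fromarrays([np.array([10., 40., 50.]), np.array([5., 20., 35.])], names=['lon', 'lat'])
demo_boxed = box_data(demo, 31.5, 11.5, 64, 32)  # keeps only the (40, 20) row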
def compile_all_results(scenarios, dir='../data'):
    """ Compiles the results across multiple scenarios produced by
    running run_on_cluster on each one into a single csv file. The
    specified directory must be where the results of running
    run_on_cluster for each scenario are stored (each is a
    sub-directory named v0, v1, etc.) and is also where the output
    from this function will be saved.
    """
    models = []
    causes = []
    time = []
    true_cf = []
    true_std = []
    std_bias = []
    mean_abs_err = []
    median_abs_err = []
    mean_rel_err = []
    median_rel_err = []
    mean_csmf_accuracy = []
    median_csmf_accuracy = []
    mean_coverage_bycause = []
    mean_coverage = []
    percent_total_coverage = []
    scenario = []

    for i in range(scenarios):
        for j in ['bad_model', 'latent_simplex']:
            read = csv.reader(open('%s/v%s/%s_summary.csv' % (dir, i, j)))
            read.next()  # skip the header row
            for row in read:
                models.append(row[0])
                causes.append(row[1])
                time.append(row[2])
                true_cf.append(row[3])
                true_std.append(row[4])
                std_bias.append(row[5])
                mean_abs_err.append(row[6])
                median_abs_err.append(row[7])
                mean_rel_err.append(row[8])
                median_rel_err.append(row[9])
                mean_csmf_accuracy.append(row[10])
                median_csmf_accuracy.append(row[11])
                mean_coverage_bycause.append(row[12])
                mean_coverage.append(row[13])
                percent_total_coverage.append(row[14])
                scenario.append(i)

    all = pl.np.core.records.fromarrays(
        [scenario, models, time, true_cf, true_std, causes,
         mean_abs_err, median_abs_err, mean_rel_err, median_rel_err,
         mean_csmf_accuracy, median_csmf_accuracy,
         mean_coverage_bycause, mean_coverage, percent_total_coverage],
        names=['scenario', 'model', 'time', 'true_cf', 'true_std', 'cause',
               'mean_abs_err', 'median_abs_err', 'mean_rel_err', 'median_rel_err',
               'mean_csmf_accuracy', 'median_csmf_accuracy',
               'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'])
    pl.rec2csv(all, fname='%s/all_summary_metrics.csv' % (dir))
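# Hypothetical usage of compile_all_results: with 16 scenario
# sub-directories v0..v15 under ../data, each holding the *_summary.csv
# files written by combine_output, this stacks them into
# ../data/all_summary_metrics.csv.
# compile_all_results(16, dir='../data')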
                          mse=mse)
else:
    # data model
    [M, C] = gpr.gpmodel(ihme_loc_id, region_name, data_year, data_mort, data_var, data_category,
                         prior_year, prior_mort, mse, best_scale, best_amp2x, predictionyears)

## find mean and standard error, drawing from M and C
draws = 1000
mort_draws = np.zeros((draws, len(predictionyears)))
gpr_seeds = [x + 123456 for x in range(1, 1001)]
for draw in range(draws):
    np.random.seed(gpr_seeds[draw])
    mort_draws[draw, :] = Realization(M, C)(predictionyears)

# collapse across draws
# note: space transformations need to be performed at the draw level
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(gpr.inv_logit(mort_draws))

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((ihme_loc_id, predictionyears[i], s, gpr.inv_logit(mort_draws[s][i])))
all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'), ('sim', '<f8'), ('mort', '<f8')])
pl.rec2csv(all_sim, "FILEPATH")
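# A minimal stand-in for the collapse step above, assuming
# gpr.collapse_sims returns the per-year median, 95% bounds, and
# standard deviation across draws, and gpr.inv_logit is the standard
# inverse logit; both names belong to this codebase, so the numpy
# versions here are only a sketch of the assumed behaviour.
import numpy as np

def collapse_sims_demo(draws):
    # draws: (n_draws, n_years) -> per-year summaries
    return {'med': np.median(draws, axis=0),
            'lower': np.percentile(draws, 2.5, axis=0),
            'upper': np.percentile(draws, 97.5, axis=0),
            'std': draws.std(axis=0)}

def inv_logit_demo(x):
    return 1.0 / (1.0 + np.exp(-x))

demo_draws = np.random.normal(-3, 0.1, size=(1000, 5))  # logit-space draws
est = collapse_sims_demo(inv_logit_demo(demo_draws))    # transform at the draw level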
        weights.shape = wshape
        sumval = weights.sum()

    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

import time
print 'Finished at %s' % time.ctime()

# save basic predictions
predictions = model.trace('predicted')[:]
mean_prediction = predictions.mean(axis=0)
lower_prediction = percentile(predictions, 2.5, axis=0)
upper_prediction = percentile(predictions, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_prediction, lower_prediction, upper_prediction])
pl.rec2csv(output, proj_dir + 'outputs/model results/epi transition by state/all_cause_males.csv')

# plot surfaces
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
pp = PdfPages(proj_dir + 'outputs/model results/epi transition by state/surfaces.pdf')
fig = plt.figure()
ax = fig.gca(projection='3d')
X, Y = np.meshgrid(years, ages)
Z = model.trace('alpha_surf')[:].mean(axis=0)
ax.plot_wireframe(X, Y, Z, color='#315B7E')
ax.set_title('National')
pp.savefig()
for g in g_list:
    fig = plt.figure()
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()

    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

# save basic estimates
model_estimates = model.trace('estimate')[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/random effects plus flex time/srw_plus_interaction_results.csv')

'''
### plot diagnostics
# setup plotting
#import matplotlib.pyplot as pp
#pp.switch_backend('agg')
plot_me = [mu_si, mu_ss, mu_ci, mu_cs,
           sigma_si, sigma_ss, sigma_ci, sigma_cs,
           state_intercepts, state_slopes, cause_intercepts, cause_slopes]

# plot traces
os.chdir(proj_dir + '/outputs/model results/simple random effects by state/mcmc plots/traces/')
for p in plot_me:
    mc.Matplot.plot(p, suffix='_trace')
    if len(p.shape) == 0:
        plt.close()
    wshape = [1] * sorted.ndim
    wshape[axis] = 2
    weights.shape = wshape
    sumval = weights.sum()

    return np.add.reduce(sorted[indexer] * weights, axis=axis, out=out) / sumval

# save basic estimates
model_estimates = model.trace("estimate")[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec=data, names=["mean", "lower", "upper"],
    arrs=[mean_estimate, lower_estimate, upper_estimate]
)
pl.rec2csv(output, proj_dir + "outputs/model results/simple random effects by state/pymc_results.csv")

### plot diagnostics
# setup plotting
# import matplotlib.pyplot as pp
# pp.switch_backend('agg')
plot_me = [
    mu_si,
    mu_ss,
    mu_ci,
    mu_cs,
    sigma_si,
    sigma_ss,
    sigma_ci,
    sigma_cs,
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m+1)].dtype == 'float64':
            all_data = np.delete(all_data, np.where(np.isnan(all_data['spacetime_' + str(m+1)]))[0], axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i]) for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [np.empty(len(country_age_list), 'float') for i in range(number_submodels)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data, excluding the holdout set for this test
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0) & (ca_data['test_' + test] == 0)]
            has_data = len(ca_observed) > 1
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # skip models with no spacetime results
            if all_data['spacetime_' + str(m+1)].dtype != 'float64':
                draws[m][country_age_list == ca] = np.NaN
                continue

            # identify the dependent variable for this model
            dv = dv_list[m]

            # make a list of the spacetime predictions
            ca_prior = np.array([np.mean(ca_data['spacetime_' + str(m+1)][ca_data.year == y]) for y in year_list])

            # find the amplitude for this country/age
            amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m+1)])

            # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
            def mean_function(x):
                return np.interp(x, year_list, ca_prior)

            # setup the covariance function
            M = gp.Mean(mean_function)
            C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale)

            # observe the data if there is any
            if has_data:
                gp.observe(M=M, C=C, obs_mesh=ca_observed.year,
                           obs_V=ca_observed['spacetime_data_variance_' + str(m+1)],
                           obs_vals=ca_observed[dv])

            # save the posterior mean for this country/age into the results array
            iso3[country_age_list == ca] = ca[0:3]
            age_group[country_age_list == ca] = ca[4:]
            year[country_age_list == ca] = year_list.T
            draws[m][country_age_list == ca] = M(year_list)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for m in range(number_submodels):
        results = recfunctions.append_fields(results, 'gpr_' + str(m+1) + '_spacetime_mean', draws[m])
    rec2csv(results, outfile)
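# Unlike the draw-based fit_GPR variant earlier, this version reads the
# posterior mean directly: after gp.observe, evaluating the gp.Mean
# object on a mesh returns the conditioned mean without sampling
# realizations, e.g. (illustrative mesh):
# post_mean = M(np.arange(1990, 2000))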
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()

    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

# save basic estimates
model_estimates = model.trace('estimate')[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec = data,
    names = ['mean', 'lower', 'upper'],
    arrs = [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/random effects plus flex time/pymc_results.csv')

'''
### plot diagnostics
# setup plotting
#import matplotlib.pyplot as pp
#pp.switch_backend('agg')
plot_me = [mu_si, mu_ss, mu_ci, mu_cs,
           sigma_si, sigma_ss, sigma_ci, sigma_cs,
           state_intercepts, state_slopes, cause_intercepts, cause_slopes]

# plot traces
os.chdir(proj_dir + '/outputs/model results/simple random effects by state/mcmc plots/traces/')
for p in plot_me:
    mc.Matplot.plot(p, suffix='_trace')
    if len(p.shape) == 0:
        plt.close()
# relative error (truth - estimate)/truth, computed in natural space
if (transform == 'log10'):
    re = (10**mort - unlog_est['med'][pred_index])/(10**mort)  # log base 10
elif (transform == 'ln'):
    re = (math.e**mort - unlog_est['med'][pred_index])/(math.e**mort)  # natural log
elif (transform == 'logit'):
    re = (((math.e**mort)/(1+(math.e**mort))) - unlog_est['med'][pred_index])/((math.e**mort)/(1+(math.e**mort)))  # logit
elif (transform == 'logit10'):
    re = (((10**mort)/(1+(10**mort))) - unlog_est['med'][pred_index])/((10**mort)/(1+(10**mort)))  # logit base 10

# This evaluates coverage as if any part of the uncertainty of the data and the estimate overlap
total_var = stderr**2 + log_est['std'][pred_index]**2
# total_var = log_est['std'][pred_index]**2 # This calculates coverage based only on the uncertainty of the estimate
coverage = int((log_est['med'][pred_index] - 1.96*pl.sqrt(total_var)) < mort < (log_est['med'][pred_index] + 1.96*pl.sqrt(total_var)))

all_err.append((rr, cc, ss, ho, scale, amp2x, lam, zeta, mse*amp2x, year, mort, re, coverage))

## write files
os.chdir('/strPath')
all_est = pl.array(all_est, [('region_name', '|S64'), ('ihme_loc_id', '|S32'), ('sex', '|S32'),
                             ('ho', '<f8'), ('scale', '<f8'), ('amp2x', '<f8'), ('lambda', '<f8'),
                             ('zeta', '<f8'), ('amp2', '<f8'), ('year', '<f8'), ('mort', '<f8'), ('std', '<f8')])
pl.rec2csv(all_est, 'gpr_%s_%s_%i_%s_%s.txt' % (cc, ss, ho, lam, zeta))

os.chdir('strPath')
all_err = pl.array(all_err, [('region_name', '|S64'), ('ihme_loc_id', '|S32'), ('sex', '|S32'),
                             ('ho', '<f8'), ('scale', '<f8'), ('amp2x', '<f8'), ('lambda', '<f8'),
                             ('zeta', '<f8'), ('amp2', '<f8'), ('year', '<f8'), ('mort', '<f8'),
                             ('re', '<f8'), ('coverage', '<f8')])
pl.rec2csv(all_err, 'loss_%s_%s_%i_%s_%s.txt' % (cc, ss, ho, lam, zeta))
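# Worked example of the coverage indicator above: with a held-out data
# point mort = -3.0 in transformed space, an estimate med = -2.9 with
# std = 0.05, and data stderr = 0.04, the 95% band is
# -2.9 +/- 1.96*sqrt(0.04**2 + 0.05**2), roughly (-3.026, -2.774),
# which contains -3.0, so coverage = 1.
import numpy as np

demo_total_var = 0.04**2 + 0.05**2
demo_coverage = int((-2.9 - 1.96*np.sqrt(demo_total_var)) < -3.0 < (-2.9 + 1.96*np.sqrt(demo_total_var)))
assert demo_coverage == 1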