def resample(data):
    if len(data) == 0:
        return data

    delta_true = .1
    p = data['mu_pred'] + 1.e-6

    # TODO: abstract this block of code into rate_model.py; it is also called in data_model.py
    ## ensure that all data has uncertainty quantified appropriately
    # first replace all missing se from ci
    missing_se = pl.isnan(data['standard_error']) | (data['standard_error'] <= 0)
    data['standard_error'][missing_se] = (data['upper_ci'][missing_se] - data['lower_ci'][missing_se]) / (2*1.96)

    # then replace all missing ess with se
    missing_ess = pl.isnan(data['effective_sample_size'])
    data['effective_sample_size'][missing_ess] = data['value'][missing_ess]*(1-data['value'][missing_ess])/data['standard_error'][missing_ess]**2

    # warn and drop data that doesn't have effective sample size quantified, or it is non-positive
    missing_ess = pl.isnan(data['effective_sample_size']) | (data['effective_sample_size'] < 0)
    if sum(missing_ess) > 0:
        print 'WARNING: %d rows of data have invalid quantification of uncertainty.' % sum(missing_ess)
        data['effective_sample_size'][missing_ess] = 1.0

    n = data['effective_sample_size']
    data['true'] = p
    data['value'] = (1.0 * mc.rnegative_binomial(n*p, delta_true*n*p)) / n

    # uncomment below to test the effect of having very wrong data
    #data['value'] = 0.
    #data['effective_sample_size'] = 1.e6

    return data
def process(self):
    """Rearranges the ping data into a matrix of max amplitude with
    dimensions corresponding to the power, gain and beam sections."""
    MINSAMPLES = 5
    datadim = self.pingdata.shape
    self.pingmax = pl.zeros((len(self.settings['power']), len(self.settings['gain']), datadim[2]))
    for i, power in enumerate(self.settings['power']):
        for j, gain in enumerate(self.settings['gain']):
            for k in xrange(datadim[2]):
                sampleindx = pl.find((self.pingdata[:, 1, k] == power) & (self.pingdata[:, 2, k] == gain))
                if len(sampleindx) > MINSAMPLES:
                    temp = self.pingdata[sampleindx[-MINSAMPLES:], 0, k]
                    tempmax = temp.max()
                    if tempmax == 0:
                        self.pingmax[i, j, k] = pl.NaN
                    else:
                        self.pingmax[i, j, k] = tempmax
                else:
                    self.pingmax[i, j, k] = pl.NaN

    # The following section removes settings that were collected erroneously.
    # power settings first
    powershortlist = []
    self.havedata = True  # this is an ugly workaround...
    for i, power in enumerate(self.settings['power']):
        test = pl.isnan(self.pingmax[i, :, :])
        if test.all():
            powershortlist.append(i)
            print 'removing ' + str(power) + ' power setting.'
    # pop from the end so earlier indices are not shifted by the removals
    for i in reversed(powershortlist):
        try:
            self.settings['power'].pop(i)
        except IndexError:
            self.havedata = False
    if self.havedata:
        self.pingmax = pl.delete(self.pingmax, powershortlist, 0)

    # then gain settings
    gainshortlist = []
    for i, gain in enumerate(self.settings['gain']):
        test = pl.isnan(self.pingmax[:, i, :])
        if test.all():
            gainshortlist.append(i)
            print 'removing ' + str(gain) + ' gain setting.'
    for i in reversed(gainshortlist):
        try:
            self.settings['gain'].pop(i)
        except IndexError:
            self.havedata = False
    if self.havedata:
        self.pingmax = pl.delete(self.pingmax, gainshortlist, 1)

    # remove the power and gain to normalize
    self.pingmax = 20 * pl.log10(self.pingmax)
    for i, power in enumerate(self.settings['power']):
        for j, gain in enumerate(self.settings['gain']):
            self.pingmax[i, j, :] = self.pingmax[i, j, :] - power - gain
def evaluate_model(mod, comment='', data_fname='missing_noisy_data.csv', truth_fname='data.csv'): """ Run specified model on existing data (data.csv / missing_noisy_data.csv) and save results in dev_log.csv Existing models: %s """ % data_run_models if mod not in data_run_models.split(' '): raise TypeError, 'Unrecognized model "%s"; must be one of %s' % (mod, data_run_models) import model reload(model) print 'loading data' data = pl.csv2rec(data_fname) truth = pl.csv2rec(truth_fname) t0 = time.time() print 'generating model' mod_mc = eval('model.%s(data)' % mod) print 'fitting model with mcmc' mod_mc.sample(10000, 5000, 50, verbose=1) t1 = time.time() print 'summarizing results' import graphics reload(graphics) pl.figure(figsize=(22, 17), dpi=300) pl.clf() graphics.plot_all_predictions_over_time(data, mod_mc.predicted, more_data=truth) data_stats = mod_mc.data_predicted.stats() i_out = [i for i in range(len(data)) if pl.isnan(data.y[i])] rmse_abs_out = pl.rms_flat(truth.y[i_out] - data_stats['mean'][i_out]) rmse_rel_out = 100*pl.rms_flat(1. - data_stats['mean'][i_out]/truth.y[i_out]) i_in = [i for i in range(len(data)) if not pl.isnan(data.y[i])] rmse_abs_in = pl.rms_flat(truth.y[i_in] - data_stats['mean'][i_in]) rmse_rel_in = 100*pl.rms_flat(1. - data_stats['mean'][i_in]/truth.y[i_in]) param_stats = mod_mc.param_predicted.stats() coverage = 100*pl.sum((truth.y[i_out] >= param_stats['95% HPD interval'][i_out, 0]) & (truth.y[i_out] <= param_stats['95% HPD interval'][i_out, 1])) / float(len(i_out)) import md5 data_hash = md5.md5(data).hexdigest() results = [mod, t1-t0, rmse_abs_out, rmse_rel_out, rmse_abs_in, rmse_rel_in, coverage, len(data), len(pl.unique(data.region)), len(pl.unique(data.country)), len(pl.unique(data.year)), len(pl.unique(data.age)), data_hash, t0, comment] print '%s: time: %.0fs out-of-samp rmse abs=%.1f rel=%.0f in-samp rmse abs=%.1f rel=%.0f coverage=%.0f\ndata: %d rows; %d regions, %d countries %d years %d ages [data hash: %s]\n(run conducted at %f)\n%s' % tuple(results) pl.savefig('/home/j/Project/Models/space-time-smoothing/images/%s.png' % t0) # FIXME: don't hardcode path for saving images import csv f = open('dev_log.csv', 'a') f_csv = csv.writer(f) f_csv.writerow(results) f.close() return mod_mc
def create_uncertainty(model, rate_type): '''data without valid uncertainty is given the 10% uncertainty of the data set Parameters ---------- model : data.ModelData dismod model rate_type : str a rate model 'neg_binom', 'binom', 'normal', 'log_norm', 'poisson', 'beta' Results ------- model : data.ModelData dismod model with measurements of uncertainty for all data ''' # fill any missing covariate data with 0s for cv in list(model.input_data.filter(like='x_').columns): model.input_data[cv] = model.input_data[cv].fillna([0]) # find indices that are negative for standard error and # calculate standard error from effective sample size missing_se = pl.isnan(model.input_data['standard_error']) | (model.input_data['standard_error'] < 0) if True in set(missing_se): model.input_data['standard_error'][missing_se] = (model.input_data['upper_ci'][missing_se] - model.input_data['lower_ci'][missing_se]) / (2*1.96) missing_se_still = pl.isnan(model.input_data['standard_error']) | (model.input_data['standard_error'] < 0) if True in set(missing_se_still): model.input_data['standard_error'][missing_se_still] = pl.sqrt(model.input_data['value'][missing_se_still]*(1-model.input_data['value'][missing_se_still])/model.input_data['effective_sample_size'][missing_se_still]) # find indices that contain nan for effective sample size missing_ess = pl.isnan(model.input_data['effective_sample_size'])==1 # calculate effective sample size from standard error model.input_data['effective_sample_size'][missing_ess] = model.input_data['value'][missing_ess]*(1-model.input_data['value'][missing_ess])/(model.input_data['standard_error'][missing_ess])**2 # find effective sample size of entire dataset non_missing_ess_still = pl.isnan(model.input_data['effective_sample_size'])==0 # finds all real numbers if False in non_missing_ess_still: percent = pl.percentile(model.input_data['effective_sample_size'][non_missing_ess_still], 10.) missing_ess_still = pl.isnan(model.input_data['effective_sample_size'])==1 # finds all nan # replace nan effective sample size with 10th percentile model.input_data['effective_sample_size'][missing_ess_still] = percent # change values of 0 in lognormal model to 1 observation if rate_type == 'log_normal': # find indices where values are 0 zero_val = (model.input_data['value'] == 0) # add 1 observation so no values are zero, also change effective sample size model.input_data['effective_sample_size'][zero_val] = model.input_data['effective_sample_size'][zero_val] + 1 model.input_data['value'][zero_val] = 1.0/model.input_data['effective_sample_size'][zero_val] # update standard error model.input_data['standard_error'][zero_val] = pl.sqrt(model.input_data['value'][zero_val]*(1-model.input_data['value'][zero_val])/model.input_data['effective_sample_size'][zero_val]) return model
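# A hedged, self-contained sketch of the two imputation formulas used in
# create_uncertainty() above (standard error from a 95% CI half-width, effective
# sample size from the binomial variance). The DataFrame and its values are
# made up for illustration; this is not part of the dismod API.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'value': [0.10, 0.25],
                    'standard_error': [np.nan, 0.02],
                    'lower_ci': [0.08, np.nan],
                    'upper_ci': [0.12, np.nan],
                    'effective_sample_size': [900.0, np.nan]})

# se = (upper_ci - lower_ci) / (2 * 1.96) for rows with a missing standard error
miss_se = toy['standard_error'].isnull()
toy.loc[miss_se, 'standard_error'] = (toy['upper_ci'] - toy['lower_ci'])[miss_se] / (2 * 1.96)

# ess = value * (1 - value) / standard_error**2 for rows with a missing effective sample size
miss_ess = toy['effective_sample_size'].isnull()
p, se = toy['value'][miss_ess], toy['standard_error'][miss_ess]
toy.loc[miss_ess, 'effective_sample_size'] = p * (1 - p) / se**2

print(toy)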
def likelihood(self, verbose=None): ''' Compute the log-likelihood of the current simulation based on the number of new diagnoses. ''' if verbose is None: verbose = self['verbose'] if not self.results['ready']: self.run(calc_likelihood=False, verbose=verbose) # To avoid an infinite loop loglike = 0 for d, datum in enumerate(self.data['new_positives']): if not pl.isnan(datum): # Skip days when no tests were performed estimate = self.results['diagnoses'][d] p = cv.poisson_test(datum, estimate) logp = pl.log(p) loglike += logp if verbose >= 2: print( f' {self.data["date"][d]}, data={datum:3.0f}, model={estimate:3.0f}, log(p)={logp:10.4f}, loglike={loglike:10.4f}' ) self.results['likelihood'] = loglike if verbose >= 1: print(f'Likelihood: {loglike}') return loglike
def sample(self, model, evidence): z = evidence['z'] T, g, h, sigma_g = [evidence[var] for var in ['T', 'g', 'h', 'sigma_g']] sigma_z_g = model.known_params['sigma_z_g'] sigma_z_h = model.known_params['sigma_z_h'] prior_mu_g, prior_cov_g = [model.hyper_params[var] for var in ['prior_mu_g', 'prior_cov_g']] n = len(g) # Must be a more concise way to deal with scalar vs vector g = g.copy().reshape((n,1)) h = h.copy().reshape((n,1)) z_g = ma.asarray(z.copy().reshape((n,1))) obs_cov = sigma_z_g**2*ones((n,1,1)) if sum(T == 0) > 0: z_g[T == 0] = nan if sum(T == 2) > 0: z_g[T == 2] -= h[T == 2] obs_cov[T == 2] = sigma_z_h**2 z_g[isnan(z_g)] = ma.masked kalman = self._kalman kalman.initial_state_mean = array([prior_mu_g[0],]) kalman.initial_state_covariance = array([prior_cov_g[0,0],]) kalman.transition_matrices = eye(1) kalman.transition_covariance = array([sigma_g**2,]) kalman.observation_matrices = eye(1) kalman.observation_covariance = obs_cov sampled_g = forward_filter_backward_sample(kalman, z_g) return sampled_g.reshape((n,))
def sample(self, model, evidence): z, T, g, h, sigma_h, phi = [evidence[var] for var in ['z', 'T', 'g', 'h', 'sigma_h', 'phi']] sigma_z_h = model.known_params['sigma_z_h'] mu_h = model.known_params['mu_h'] prior_mu_h = model.hyper_params['prior_mu_h'] prior_cov_h = model.hyper_params['prior_cov_h'] n = len(h) g = g.copy().reshape((n,1)) h = h.copy().reshape((n,1)) z_h = ma.asarray(z.copy().reshape((n,1))) if sum(T == 0) > 0: z_h[T == 0] = nan if sum(T == 1) > 0: z_h[T == 1] = nan if sum(T == 2) > 0: z_h[T == 2] -= g[T == 2] z_h[isnan(z_h)] = ma.masked kalman = self._kalman kalman.initial_state_mean = array([prior_mu_h[0],]) kalman.initial_state_covariance = array([prior_cov_h[0,0],]) kalman.transition_matrices = array([phi,]) kalman.transition_covariance = array([sigma_h**2,]) kalman.transition_offsets = mu_h*(1-phi)*ones((n, 1)) kalman.observation_matrices = eye(1) kalman.observation_covariance = array([sigma_z_h**2,]) sampled_h = forward_filter_backward_sample(kalman, z_h) return sampled_h.reshape((n,))
def add_noise_to_cube(data, beamfwhm_pix, fluxmap=None): import pylab as pl pl.seed() s = data.shape noise = pl.randn(s[0], s[1], s[2]) noisescale = 1. if type(fluxmap) != type(None): noisescale = 1.26 * fluxmap**2 z = pl.where(pl.isnan(noisescale)) if len(z[0]) > 0: noisescale[z] = 1. # from astropy.convolution import convolve_fft,Gaussian2DKernel # psf=Gaussian2DKernel(stddev=beamfwhm_pix/2.354) # for i in range(s[0]): # ASSUMES FIRST AXIS IS VEL # noise[i]=convolve_fft(noise[i]/noisescale,psf)#,interpolate_nan=True) from scipy.ndimage.filters import gaussian_filter for i in range(s[0]): # ASSUMES FIRST AXIS IS VEL noise[i] = gaussian_filter(noise[i], beamfwhm_pix / 2.354) / noisescale def mad(data, axis=None): return pl.nanmedian(pl.absolute(data - pl.nanmedian(data, axis)), axis) rms = mad(data) # rms of original cube current_rms = mad(noise) noise = rms * noise / current_rms # scale the noise to have the same rms as the data - there's a sqrt(2) problem I think return noise + data
def add_thermodynamic_constraints(cpl, dG0_f, c_range=(1e-6, 1e-2), T=default_T, bounds=None): """ For any compound that does not have an explicit bound set by the 'bounds' argument, create a bound using the 'margin' variables (the last to columns of A). """ Nc = dG0_f.shape[0] if bounds != None and len(bounds) != Nc: raise Exception("The concentration bounds list must be the same length as the number of compounds") if bounds == None: bounds = [(None, None)] * Nc for c in xrange(Nc): if pylab.isnan(dG0_f[c, 0]): continue # unknown dG0_f - cannot bound this compound's concentration at all b_low = bounds[c][0] or c_range[0] b_high = bounds[c][1] or c_range[1] # lower bound: dG0_f + R*T*ln(Cmin) <= x_i cpl.variables.set_lower_bounds('c%d' % c, dG0_f[c, 0] + R*T*pylab.log(b_low)) # upper bound: x_i <= dG0_f + R*T*ln(Cmax) cpl.variables.set_upper_bounds('c%d' % c, dG0_f[c, 0] + R*T*pylab.log(b_high))
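# A minimal sketch of the concentration-derived bounds built above, computed
# directly with numpy instead of through a cplex.Cplex object. The numeric
# values of R (kJ/mol/K) and T, and the toy dG0_f vector, are assumptions
# chosen only for illustration.
import numpy as np

R = 8.31e-3        # gas constant in kJ/(mol*K), assumed
T = 298.15         # default temperature in K, assumed
c_range = (1e-6, 1e-2)
dG0_f = np.array([-300.0, np.nan, -150.0])   # made-up formation energies, kJ/mol

for c, dg0 in enumerate(dG0_f):
    if np.isnan(dg0):
        continue   # unknown dG0_f - cannot bound this compound's concentration at all
    lower = dg0 + R * T * np.log(c_range[0])   # dG0_f + R*T*ln(Cmin) <= x_i
    upper = dg0 + R * T * np.log(c_range[1])   # x_i <= dG0_f + R*T*ln(Cmax)
    print('c%d bounds: [%.2f, %.2f] kJ/mol' % (c, lower, upper))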
def test_from_gbd_json(): d = data.ModelData.from_gbd_json('tests/dismoditis.json') assert len(d.input_data) > 17, 'dismoditis model has more than 17 data points' for field in 'data_type value area sex age_start age_end year_start year_end standard_error effective_sample_size lower_ci upper_ci age_weights'.split(): assert field in d.input_data.columns, 'Input data CSV should have field "%s"' % field #assert len(d.input_data.filter(regex='x_').columns) == 1, 'should have added country-level covariates to input data' #assert len(d.input_data['x_LDI_id_Updated_7July2011'].dropna().index) > 0 assert len(d.output_template) > 100 for field in 'area sex year pop'.split(): assert field in d.output_template.columns, 'Output template CSV should have field "%s"' % field #assert len(d.output_template.filter(regex='x_').columns) == 1, 'should have added country-level covariates to output template' #assert len(d.output_template['x_LDI_id_Updated_7July2011'].dropna().index) > 0 for data_type in 'i p r f rr X'.split(): for prior in 'smoothness heterogeneity level_value level_bounds increasing decreasing'.split(): assert prior in d.parameters[data_type], 'Parameters for %s should include prior on %s' % (data_type, prior) assert 'CHN' in d.hierarchy.successors('asia_east') assert pl.isnan(d.hierarchy['asia_east']['CHN'].get('weight')) #assert set(d.hierarchy.node['asia_east'].keys()) == set('area sex year_start year_end pop'.split()) #assert len(d.nodes_to_fit) == 21*3*2 + 1 import dismod3 import simplejson as json model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
def likelihood(self, weights=None, verbose=None): ''' Compute the log-likelihood of the current simulation based on the number of new diagnoses. ''' if verbose is None: verbose = self['verbose'] if weights is None: weights = {} loglike = 0 if self.data is not None: # Only perform likelihood calculation if data are available for key in self.reskeys: if key in self.data: if key in weights: weight = weights[key] else: weight = 1 for d, datum in enumerate(self.data[key]): if not pl.isnan(datum) and d < len( self.results[key].values): estimate = self.results[key][d] if datum and estimate: p = cvu.poisson_test(datum, estimate) logp = pl.log(p) loglike += weight * logp sc.printv( f' {self.data["date"][d]}, data={datum:3.0f}, model={estimate:3.0f}, log(p)={logp:10.4f}, loglike={loglike:10.4f}', 2, verbose) self.results['likelihood'] = loglike sc.printv(f'Likelihood: {loglike}', 1, verbose) return loglike
def metric_heat(group): if all(pylab.isnan(group[metric])): #print metric #describe_group(group) pass return groupfunc(group[metric])
def MutationPerClone(x):
    '''Calculates average number and STD of mutations from MutationRecords.dat
    file per clone. Mut_record[0] - number of cells, [1] - number of clones,
    [2] - mean number of point mutations, [3] - STD of point mutations,
    [4] - mean number of duplications, [5] - STD of duplications,
    [6] - mean number of deletions, [7] - STD of deletions,
    [8] - mean and [9] - STD of the mutation counts in the fifth data column.'''
    Mut_record = p.zeros(10)
    Mut_record[0] = x.shape[0]
    ZERO = p.zeros((x.shape[0], 1))
    x = p.concatenate((x, ZERO), axis=1)
    # flag rows that repeat an earlier value in column 0, then drop them
    # so that only one row per clone is kept
    for i in xrange(0, x.shape[0]):
        if (x[i, 5] == 0):
            for j in xrange(i+1, x.shape[0]):
                if (x[j, 0] == x[i, 0]):
                    x[j, 5] = p.nan
    x = x[~p.isnan(x).any(1)]
    Mut_record[1] = x.shape[0]
    Mut_record[2] = x[:, 1].mean()
    Mut_record[3] = x[:, 1].std()
    Mut_record[4] = x[:, 2].mean()
    Mut_record[5] = x[:, 2].std()
    Mut_record[6] = x[:, 3].mean()
    Mut_record[7] = x[:, 3].std()
    Mut_record[8] = x[:, 4].mean()
    Mut_record[9] = x[:, 4].std()
    return Mut_record
def write_angles(): prev_neurons = set() first_line = False if FNAME in os.listdir(OUTDIR): df = pd.read_csv('%s/%s' % (OUTDIR, FNAME), skipinitialspace=True) neuron_names = list(df['neuron name']) neuron_types = list(df['neuron type']) prev_neurons = set(zip(neuron_names, neuron_types)) else: first_line = True i = 0 with open('%s/%s' % (OUTDIR, FNAME), 'a') as f: if first_line: f.write( 'neuron name, neuron type, parent, child1, child2, angle\n') directory = DATASETS_DIR for cell_type in os.listdir(directory): for species in os.listdir(directory + '/' + cell_type): for region in os.listdir(directory + '/' + cell_type + '/' + species): for lab in os.listdir(directory + "/" + cell_type + '/' + species + '/' + region): for neuron in os.listdir(directory + "/" + cell_type + "/" + species + '/' + region + '/' + lab): filename = directory + "/" + cell_type + "/" + species + "/" + region + '/' + lab + '/' + neuron if neuron[-8:] != ".CNG.swc": continue neuron_name = neuron[:-8] try: graphs = get_neuron_points(filename) except AssertionError: continue for i, G in enumerate(graphs): neuron_type = NEURON_TYPES[i] if (neuron_name, neuron_type) in prev_neurons: continue prev_neurons.add((neuron_name, neuron_type)) if G == None: continue print neuron_name, neuron_type angles = get_angles(G) for (parent, child1, child2), angle in angles.iteritems(): if pylab.isnan(angle): continue parent = int(parent) child1 = int(child1) child2 = int(child2) f.write('%s, %s, %d, %d, %d, %f\n' %\ (neuron_name, neuron_type, parent, child1, child2, angle))
def sample(self, model, evidence): z = evidence['z'] T, g, sigma_g = [evidence[var] for var in ['T', 'g', 'sigma_g']] sigma_z_g = model.known_params['sigma_z_g'] prior_mu_g, prior_cov_g = [ model.hyper_params[var] for var in ['prior_mu_g', 'prior_cov_g'] ] n = len(g) z_g = z.copy() z_g[T == 0] = nan z_g = ma.asarray(z_g) z_g[isnan(z_g)] = ma.masked kalman = self._kalman kalman.initial_state_mean = prior_mu_g[0] kalman.initial_state_covariance = prior_cov_g[0, 0] kalman.transition_matrices = 1 kalman.transition_covariance = sigma_g**2 kalman.observation_matrices = 1 kalman.observation_covariance = sigma_z_g**2 pdb.set_trace() sampled_g = forward_filter_backward_sample(kalman, z_g) return sampled_g
def find_unfeasible_concentrations(S, dG0_f, c_range, c_mid=1e-4, T=default_T, bounds=None, log_stream=None):
    """
        Almost the same as find_pCr, but adds a global restriction on the concentrations
        (for compounds that don't have specific bounds in 'bounds').
        After the solution which optimizes the pCr is found, any concentration which does
        not conform to the limits of c_range will be truncated to the closest allowed
        concentration. If at least one concentration needs to be adjusted, then pCr loses
        its meaning and is therefore returned with the value None.
    """
    dG_f, concentrations, pCr = find_pCr(S, dG0_f, c_mid=c_mid, bounds=bounds, log_stream=log_stream)

    for c in xrange(dG0_f.shape[0]):
        if pylab.isnan(dG0_f[c, 0]):
            continue  # unknown dG0_f - therefore the concentration of this compound is meaningless

        if (bounds == None or bounds[c][0] == None) and concentrations[c, 0] < c_range[0]:
            concentrations[c, 0] = c_range[0]
            dG_f[c, 0] = dG0_f[c, 0] + R * T * pylab.log(c_range[0])  # dG = dG0 + R*T*ln(C)
            pCr = None
        elif (bounds == None or bounds[c][1] == None) and concentrations[c, 0] > c_range[1]:
            concentrations[c, 0] = c_range[1]
            dG_f[c, 0] = dG0_f[c, 0] + R * T * pylab.log(c_range[1])
            pCr = None

    return (dG_f, concentrations, pCr)
def timeseriesstd(times, x, xmean=pylab.nan): if pylab.isnan(xmean): xmean = timeseriesmean(times, x) return pylab.sqrt(1.0 * sum([ (t2 - t1) * ((x1 - xmean)**2 + (x2 - xmean)**2) / 2 for t1, t2, x1, x2 in zip(times[0:-1], times[1:], x[0:-1], x[1:]) ]) / (times[-1] - times[0]))
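# Quick usage check of the time-weighted standard deviation above on an
# irregularly sampled series. xmean is supplied explicitly here so the example
# does not depend on the companion timeseriesmean() helper referenced above.
times = [0.0, 1.0, 3.0, 6.0]
x = [2.0, 4.0, 4.0, 2.0]
print(timeseriesstd(times, x, xmean=3.0))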
def likelihood(self, verbose=None): ''' Compute the log-likelihood of the current simulation based on the number of new diagnoses. ''' if verbose is None: verbose = self['verbose'] loglike = 0 if self.data is not None and len( self.data ): # Only perform likelihood calculation if data are available for d, datum in enumerate(self.data['new_positives']): if not pl.isnan( datum): # Skip days when no tests were performed estimate = self.results['diagnoses'][d] p = cvu.poisson_test(datum, estimate) logp = pl.log(p) loglike += logp sc.printv( f' {self.data["date"][d]}, data={datum:3.0f}, model={estimate:3.0f}, log(p)={logp:10.4f}, loglike={loglike:10.4f}', 2, verbose) self.results['likelihood'] = loglike sc.printv(f'Likelihood: {loglike}', 1, verbose) return loglike
def setup(dm, key, data_list, rate_stoch): """ Generate the PyMC variables for a log-normal model of a function of age Parameters ---------- dm : dismod3.DiseaseModel the object containing all the data, priors, and additional information (like input and output age-mesh) key : str the name of the key for everything about this model (priors, initial values, estimations) data_list : list of data dicts the observed data to use in the beta-binomial liklihood function rate_stoch : pymc.Stochastic a PyMC stochastic (or deterministic) object, with len(rate_stoch.value) == len(dm.get_estimation_age_mesh()). Results ------- vars : dict Return a dictionary of all the relevant PyMC objects for the log-normal model. vars['rate_stoch'] is of particular relevance, for details see the beta_binomial_model """ vars = {} est_mesh = dm.get_estimate_age_mesh() vars['rate_stoch'] = rate_stoch # set up priors and observed data prior_str = dm.get_priors(key) dismod3.utils.generate_prior_potentials(vars, prior_str, est_mesh) vars['observed_rates'] = [] for d in data_list: age_indices = dismod3.utils.indices_for_range(est_mesh, d['age_start'], d['age_end']) age_weights = d.get('age_weights', pl.ones(len(age_indices)) / len(age_indices)) lb, ub = dm.bounds_per_1(d) se = (pl.log(ub) - pl.log(lb)) / (2. * 1.96) if pl.isnan(se) or se <= 0.: se = 1. print 'data %d: log(value) = %f, se = %f' % (d['id'], pl.log(dm.value_per_1(d)), se) @mc.observed @mc.stochastic(name='obs_%d' % d['id']) def obs(f=vars['rate_stoch'], age_indices=age_indices, age_weights=age_weights, value=pl.log(dm.value_per_1(d)), tau=se**-2, data=d): f_i = dismod3.utils.rate_for_range(f, age_indices, age_weights) return mc.normal_like(value, pl.log(f_i), tau) vars['observed_rates'].append(obs) return vars
def load_new_model(disease, country='all', sex=['total', 'male', 'female'], cov='no'):
    '''create disease model with relevant data
    cov : str
      method to handle covariates
      default is nothing ('no')
      options include,
      - 'drop' : drop all covariates
      - 'zero' : missing values replaced with 0
      - 'average' : missing values replaced with average of column
    '''
    model = dismod3.data.load('/home/j/Project/dismod/output/dm-%s'%disease)
    # keep relevant data
    if (type(sex)==str) & (sex != 'total'):
        model.keep(areas=[country], sexes=[sex, 'total'])
    else:
        model.keep(areas=[country], sexes=sex)
    if (True in pl.isnan(pl.array(model.output_template.filter(like='x_')))) | (True in pl.isnan(pl.array(model.input_data.filter(like='x_')))):
        print 'Covariates missing, %s method used'%(cov)
        col = model.input_data.filter(like='x_').columns
        for i in col:
            if cov == 'drop':
                model.input_data = model.input_data.drop(i,1)
                model.output_template = model.output_template.drop(i,1)
            elif cov == 'zero':
                model.input_data[i] = model.input_data[i].fillna([0])
                model.output_template[i] = model.output_template[i].fillna([0])
            elif cov == 'average':
                model.input_data[i] = model.input_data[i].fillna([model.input_data[i].mean()])
                model.output_template[i] = model.output_template[i].fillna(model.output_template[i].mean())
    return model
def get_cod_data_all_causes(iso3='USA', age_group='1_4', sex='F'): """ TODO: write doc string for this function""" print 'loading', iso3, age_group, sex import glob cause_list = [] fpath = '/home/j/Project/Causes of Death/Under Five Deaths/CoD Correct Input Data/v02_prep_%s/%s+*+%s+%s.csv' % ( iso3, iso3, age_group, sex) #fpath = '/home/j/Project/GBD/dalynator/data/cod_correct_input_pos/run_9_cause_*.csv' # use Mike's validation data fnames = glob.glob(fpath) # initialize input distribution array N = 990 # TODO: get this from the data files T = 32 # TODO: get this from the data files J = len(fnames) F = pl.zeros((N, T, J)) # fill input distribution array with data from files for j, fname in enumerate(sorted(fnames)): cause = fname.split('+')[1] # TODO: make this less brittle and clearer #cause = str(j) # use Mike's validation data causes print 'loading cause', cause F_j = pl.csv2rec(fname) for n in range(N): F[n, :, j] = F_j['ensemble_d%d' % (n + 1)] / F_j['envelope'] #F[n, :, j] = F_j['d%d'%(n+1)]/F_j['envelope'] # use Mike's validation data assert not pl.any( pl.isnan(F)), '%s should have no missing values' % fname cause_list.append(cause) print 'loading complete' return F, cause_list
def _get_angles(steps,track_length): angles = pl.zeros(track_length-2) polar = pl.zeros(pl.shape(steps)) for i in range(track_length-1): polar[i,0] = pl.norm(steps[i,:]) polar[i,1] = pl.arctan(steps[i,0]/steps[i,1]) if pl.isnan( polar[i,1]): polar[i,1] = 0 if (steps[i,0] >= 0): if (steps[i,1] >= 0): pass elif (steps[i,1] < 0): polar[i,1] += 2.*pl.pi elif (steps[i,0] < 0): if (steps[i,1] >= 0): polar[i,1] += pl.pi elif (steps[i,1] < 0): polar[i,1] += pl.pi for i in range(track_length-2): angles[i] = polar[i+1,1] - polar[i,1] return angles
def sample(self, model, evidence):
    z = evidence['z']
    T, surfaces, sigma_g, sigma_h = [evidence[var] for var in ['T', 'surfaces', 'sigma_g', 'sigma_h']]
    mu_h, phi, sigma_z_g, sigma_z_h = [model.known_params[var] for var in ['mu_h', 'phi', 'sigma_z_g', 'sigma_z_h']]
    prior_mu_g, prior_cov_g = [model.hyper_params[var] for var in ['prior_mu_g', 'prior_cov_g']]
    prior_mu_h, prior_cov_h = [model.hyper_params[var] for var in ['prior_mu_h', 'prior_cov_h']]
    n = len(z)  # was len(g), but g is not defined in this sampler; z has one entry per time step

    y = ma.asarray(ones((n, 2))*nan)
    if sum(T==1) > 0:
        y[T==1, 0] = z[T==1]
    if sum(T==2) > 0:
        y[T==2, 1] = z[T==2]
    y[isnan(y)] = ma.masked

    kalman = self._kalman
    kalman.initial_state_mean = [prior_mu_g[0], prior_mu_h[0]]
    kalman.initial_state_covariance = diag([prior_cov_g[0,0], prior_cov_h[0,0]])
    kalman.transition_matrices = [[1, 0], [0, phi]]
    kalman.transition_offsets = ones((n, 2))*[0, mu_h*(1-phi)]
    kalman.transition_covariance = [[sigma_g**2, 0], [0, sigma_h**2]]
    kalman.observation_matrices = [[1, 0], [1, 1]]
    kalman.observation_covariance = [[sigma_z_g**2, 0], [0, sigma_z_h**2]]

    sampled_surfaces = forward_filter_backward_sample(kalman, y)

    return sampled_surfaces
def getAngle(t1, c1, t2, c2):
    '''
    Get the angle (returned as its cosine) between two celestials at t1 and t2
    Verify if ignoring the k-coordinate makes any sense
    timeit 240 microseconds
    '''
    if type(t2) == numpy.ndarray:
        t2 = t2[0]
    elif isnan(t2):
        print "ERROR, t2 is nan!"
        return t2

    p1 = c1.eph(t1)[0]
    p1[2] = 0.0
    p1l = norm(p1)
    p1 /= p1l

    p2 = c2.eph(t2)[0]
    p2[2] = 0.0
    p2l = norm(p2)
    p2 /= p2l

    # the two commented branches below were identical, so the dot product
    # of the unit vectors is returned unconditionally
    #if p1l > p2l:
    return p1.dot(p2)
    #else:
    #    return p1.dot(p2)
def make_pCr_problem(S, dG0_f, c_mid=1e-3, ratio=3.0, T=default_T, bounds=None, log_stream=None): """Creates a Cplex problem for finding the pCr. Simply sets up all the constraints. Does not set the objective. Args: S: stoichiometric matrix. dG0_f: deltaG0'-formation values for all compounds (in kJ/mol) (1 x compounds) c_mid: the default concentration to center the pCr on. ratio: the ratio between the distance of the upper bound from c_mid and the lower bound from c_mid (in logarithmic scale) bounds: the concentration bounds for metabolites. log_stream: where to write Cplex logs to. Returns: A cplex.Cplex object for the problem. """ Nc = S.shape[1] if Nc != dG0_f.shape[0]: raise Exception("The S matrix has %d columns, while the dG0_f vector has %d" % (Nc, dG0_f.shape[0])) if bounds and len(bounds) != Nc: raise Exception("The concentration bounds list must be the same length as the number of compounds") cpl = create_cplex(S, dG0_f, log_stream) # Add pC variable. cpl.variables.add(names=['pC'], lb=[0], ub=[1e6]) # Add variables for concentration bounds for each metabolite. for c in xrange(Nc): if pylab.isnan(dG0_f[c, 0]): continue # unknown dG0_f - cannot bound this compound's concentration at all # dG at the center concentration. dG_f_mid = dG0_f[c, 0] + R*T*pylab.log(c_mid) if bounds == None or bounds[c][0] == None: # lower bound: x_i + r/(1+r) * R*T*ln(10)*pC >= dG0_f + R*T*ln(Cmid) cpl.linear_constraints.add(senses='G', names=['c%d_lower' % c], rhs=[dG_f_mid]) cpl.linear_constraints.set_coefficients('c%d_lower' % c, 'c%d' % c, 1) cpl.linear_constraints.set_coefficients('c%d_lower' % c, 'pC', R*T*pylab.log(10) * ratio / (ratio + 1.0)) else: # this compound has a specific lower bound on its activity cpl.variables.set_lower_bounds('c%d' % c, dG0_f[c, 0] + R*T*pylab.log(bounds[c][0])) if bounds == None or bounds[c][1] == None: # upper bound: x_i - 1/(1+r) * R*T*ln(10)*pC <= dG0_f + R*T*ln(Cmid) cpl.linear_constraints.add(senses='L', names=['c%d_upper' % c], rhs=[dG_f_mid]) cpl.linear_constraints.set_coefficients('c%d_upper' % c, 'c%d' % c, 1) cpl.linear_constraints.set_coefficients('c%d_upper' % c, 'pC', -R*T*pylab.log(10) / (ratio + 1.0)) else: # this compound has a specific upper bound on its activity cpl.variables.set_upper_bounds('c%d' % c, dG0_f[c, 0] + R*T*pylab.log(bounds[c][1])) return cpl
def fe(data):
    """ Fixed Effect model::

        Y_r,c,t = beta * X_r,c,t + e_r,c,t
        e_r,c,t ~ N(0, sigma^2)
    """
    # covariates
    K1 = count_covariates(data, 'x')
    X = pl.array([data['x%d'%i] for i in range(K1)])

    K2 = count_covariates(data, 'w')
    W = pl.array([data['w%d'%i] for i in range(K2)])  # was range(K1); the w covariates are counted by K2

    # priors
    beta = mc.Uninformative('beta', value=pl.zeros(K1))
    gamma = mc.Uninformative('gamma', value=pl.zeros(K2))
    sigma_e = mc.Uniform('sigma_e', lower=0, upper=1000, value=1)

    # predictions
    @mc.deterministic
    def mu(X=X, beta=beta):
        return pl.dot(beta, X)
    param_predicted = mu

    @mc.deterministic
    def sigma_explained(W=W, gamma=gamma):
        """ sigma_explained_i,r,c,t,a = gamma * W_i,r,c,t,a"""
        return pl.dot(gamma, W)

    @mc.deterministic
    def predicted(mu=mu, sigma_explained=sigma_explained, sigma_e=sigma_e):
        return mc.rnormal(mu, 1 / (sigma_explained**2. + sigma_e**2.))

    # likelihood
    i_obs = pl.find(1 - pl.isnan(data.y))
    @mc.observed
    def obs(value=data.y, i_obs=i_obs, mu=mu, sigma_explained=sigma_explained, sigma_e=sigma_e):
        # precision is 1 / total variance, matching the predicted node above (was sigma_e**-2.)
        return mc.normal_like(value[i_obs], mu[i_obs], 1. / (sigma_explained[i_obs]**2. + sigma_e**2.))

    # set up MCMC step methods
    mod_mc = mc.MCMC(vars())
    mod_mc.use_step_method(mc.AdaptiveMetropolis, mod_mc.beta)

    # find good initial conditions with MAP approx
    print 'attempting to maximize likelihood'
    var_list = [mod_mc.beta, mod_mc.obs, mod_mc.sigma_e]
    mc.MAP(var_list).fit(method='fmin_powell', verbose=1)

    return mod_mc
def clean_estpoints(self): """Removes NaN from the estpoints that result from cleaning by the user in the extractfitpoints method.""" temp = self.estpoints.tolist() indx = 0 while indx < len(temp): if pl.isnan(temp[indx][1]): temp.pop(indx) else: indx+=1 self.estpoints = pl.array(temp)
def apply_mask(x): """ Gets arrays with NaN from MAT files and applies python masked_where """ f = pl.find(pl.isnan(x) == 1) l1, l2 = x.shape x = pl.ravel(x) x[f] = 0 x.shape = (l1,l2) x = pl.ma.masked_where(x == 0, x) return x
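# Note: the zero round-trip above also masks any genuine zeros that were in x.
# A possible alternative (a sketch, not a drop-in replacement for existing
# callers that rely on the zero-masking behaviour) masks the NaNs directly:
def apply_mask_direct(x):
    """ Masks NaN (and inf) entries without overwriting them with zeros first """
    return pl.ma.masked_invalid(x)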
def maskOD(data): '''Mask too large/small values for plots''' for c, wDict in data.items(): for w, curve in wDict.items(): curve[(curve > 2) | (curve < 0.01)] = None # TODO: Report masks when they occur if py.isnan(py.sum(curve)): msg = ('Masking value with "nan" in ' '{} -- {}'.format(c, w)) print(msg, file=sys.stderr) return data
def loadFile(objectFileName): oimg = pyfits.open(objectFileName) # Load the IFU data -- Row-stacked spectra odata = oimg[1].data oError = oimg[2].data odata_dim = odata.shape wcs = astWCS.WCS(objectFileName, extensionName=1) owavelengthStartEnd = wcs.getImageMinMaxWCSCoords()[0:2] fiberNumber = wcs.getImageMinMaxWCSCoords()[2:4] owavelengthStep = oimg[1].header['CDELT1'] owavelengthRange = [owavelengthStartEnd[0] + i * owavelengthStep for i in range(odata_dim[1])] # Check to make sure we got it right if not owavelengthRange[-1] == owavelengthStartEnd[-1]: print 'The ending wavelenghts do not match... Exiting' sys.exit(1) else: # make median sky specs = pyl.array([flux for flux in odata]) skySpec = pyl.median(specs, axis=0) RSS = [] for i in range(int(fiberNumber[1])): #oflux = odata[i] - oskyflux oflux = odata[i] - skySpec oflux[pyl.isnan(oflux)] = 0.0 oErrorFlux = oError[i] #oflux = odata[i] # Mask out extreme values in spectrum # Just because edges dodgy in efosc med = pyl.median(oflux) oflux[pyl.greater(abs(oflux), 10.0 * med)] = 0.0001 objSED = astSED.SED(wavelength=owavelengthRange, flux=oflux) #skySED = astSED.SED(wavelength=owavelengthRange, flux=oskyflux) skySED = astSED.SED(wavelength=owavelengthRange, flux=skySpec) errSED = astSED.SED(wavelength=owavelengthRange, flux=oErrorFlux) # make it > 0 everywhere objSED.flux = objSED.flux - objSED.flux.min() objSED.flux = objSED.flux / objSED.flux.max() errSED.flux = errSED.flux - errSED.flux.min() errSED.flux = errSED.flux / errSED.flux.max() skySED.flux = skySED.flux - skySED.flux.min() skySED.flux = skySED.flux / skySED.flux.max() RSS.append({'object': objSED, 'sky': skySED, 'error': errSED}) return RSS
def renormalize(x_unpurt, x_puturb, epsilon):
    final_dist = distance(x_unpurt, x_puturb)
    xnew = pl.array([0.0, 0.0, 0.0, 1.0])
    # the new renormalized vx (see lab book #2 pg 61)
    xnew[0] = x_unpurt[0] + (epsilon/final_dist)*(x_puturb[0] - x_unpurt[0])
    xnew[2] = x_unpurt[2] + (epsilon/final_dist)*(x_puturb[2] - x_unpurt[2])
    if pl.isnan(xnew[0]):
        print('RENORMALIZED PARALLEL VECTORS!!! FIX THIS!!!!')
        sys.exit()
    return xnew
def CreateFromAliFile(self): self.LoadAligments(self.AliFile) printStr = '' self._lst_ignored_files = [] self.NumFrames = 0 created_means = False for index, (file_name, utterance_id) in \ enumerate(zip(self.RawFileList, self.UtteranceIds)): printStrNew = '\b' * (len(printStr)+1) printStr = "Loading data for utterance #: " + str(index+1) printString = printStrNew + printStr print printString, sys.stdout.flush() data = HTK.ReadHTKWithDeltas(file_name) if sum(isnan(data)) != 0 or sum(isinf(data)) != 0: self._lst_ignored_files.append(index) continue if not created_means: created_means = True self.data_dim = data.shape[0] self.__CreateMeansAndStdevs() self.DataSumSq += (data**2).sum(axis=1).reshape(-1,1) self.DataSum += data.sum(axis=1).reshape(-1,1) self.NumFrames += data.shape[1] if self.Utt2Speaker != None: speaker = self.Utt2Speaker[utterance_id] self.SpeakerMeans[speaker] += data.sum(axis=1).reshape(-1,1) self.SpeakerStds[speaker] += (data**2).sum(axis=1).reshape(-1,1) self.SpeakerNumFrames[speaker] += data.shape[1] sys.stdout.write("\n") for file_num in self._lst_ignored_files: sys.stdout.write("File # " + str(file_num) + " was ignored \ because of errors\n") if self.Utt2Speaker != None: for speaker in self.Speaker2Utt.keys(): self.SpeakerMeans[speaker] /= (1.0 *self.SpeakerNumFrames[speaker]) self.SpeakerStds[speaker] -= self.SpeakerNumFrames[speaker] * \ (self.SpeakerMeans[speaker]**2) self.SpeakerStds[speaker] /= (1.0 *self.SpeakerNumFrames[speaker]-1) self.SpeakerStds[speaker][self.SpeakerStds[speaker] < 1e-8] = 1e-8 self.SpeakerStds[speaker] = sqrt(self.SpeakerStds[speaker]) self.DataMeanVect = self.DataSum/self.NumFrames variances = (self.DataSumSq - self.NumFrames*(self.DataMeanVect**2))/(self.NumFrames-1) variances[variances < 1e-8] = 1e-8 self.DataStdVect = sqrt(variances)
def runtimes_stats(): df = pd.read_csv('test_runtimes.csv', skipinitialspace=True) print "total trials" print len(df['algorithm']) / len(df['algorithm'].unique()) ratios = [] labels = [] weights = [] hist_algorithms = ['prim', 'khuller'] algorithm_labels = {'prim': 'Karger', 'khuller': 'Khuller'} sns.set() pylab.figure() for algorithm, group in df.groupby('algorithm'): print algorithm comparisons = group['comparisons'].sum() dominated = group['dominated'].sum() print float(dominated) / float( comparisons), "(", dominated, "/", comparisons, ")" print binom_test(dominated, comparisons) group = group.groupby('points', as_index=False).agg(pylab.mean) pylab.plot(group['points'], group['runtime'], label=algorithm) ratio = group['cost ratio'] ratio = ratio[~pylab.isnan(ratio)] ratio = ratio - 1 print "cost comparisons", len(ratio) print "cost ratio", pylab.mean(ratio), "+/-", pylab.std(ratio, ddof=1) if algorithm in hist_algorithms: ratios.append(ratio) labels.append(algorithm_labels[algorithm]) weight = pylab.ones_like(ratio) / float(len(ratio)) weights.append(weight) pylab.legend(loc=2) pylab.xlabel('number of points') pylab.ylabel('rumtime (minutes)') pylab.savefig('test_runtimes/runtimes.pdf', format='pdf') pylab.close() pylab.figure() pylab.hist(ratios, label=labels, weights=weights) pylab.xlabel('percent better/worse than Steiner', size=20) pylab.ylabel('proportion', size=20) pylab.legend() ax = pylab.gca() pylab.setp(ax.get_legend().get_texts(), fontsize=20) # for legend text pylab.tight_layout() pylab.savefig('test_runtimes/cost_ratios_hist.pdf', format='pdf') pylab.close()
def crunchZfile(f, aCol, sCol, bCol, normFactor): ''' Takes a zAveraged... data file generated from the crunchData function of this library and produces the arithmetic mean as well as the standard error from all seeds. The error is done through the propagation of errors as: e = sqrt{ \sum_k (c_k e_k)^2 } where e_k are the individual seed's standard errors and c_k are the weighting coefficients obeying \sum_k c_k = 1. ''' avgs, stds, bins = pl.genfromtxt(f, usecols=(aCol, sCol, bCol), unpack=True, delimiter=',') # get rid of any items which are not numbers.. # this is some beautiful Python juju. bins = bins[pl.logical_not(pl.isnan(bins))] stds = stds[pl.logical_not(pl.isnan(stds))] avgs = avgs[pl.logical_not(pl.isnan(avgs))] # normalize data. stds *= normFactor avgs *= normFactor weights = bins / pl.sum(bins) avgs *= weights stds *= weights # over-estimates error bars stds *= stds avg = pl.sum(avgs) stdErr = pl.sum(stds) stdErr = stdErr**0.5 return avg, stdErr
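# A small numeric check of the weighting scheme described in the docstring of
# crunchZfile() above, with made-up per-seed averages, errors and bin counts.
import pylab as pl

avgs = pl.array([1.00, 1.10, 0.95])    # per-seed averages
stds = pl.array([0.05, 0.04, 0.06])    # per-seed standard errors e_k
bins = pl.array([100., 200., 100.])    # per-seed bin counts

weights = bins / pl.sum(bins)                   # c_k, with sum_k c_k = 1
avg = pl.sum(weights * avgs)                    # weighted arithmetic mean
stdErr = pl.sqrt(pl.sum((weights * stds)**2))   # e = sqrt( sum_k (c_k e_k)^2 )
print('avg=%g stdErr=%g' % (avg, stdErr))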
def identify_nans(self, data, fn): """ private method to identify rows and columns of all nans from grids. This happens when the data from multiple GIS databases don't quite align on whatever the desired grid is. """ good_x = ~all(isnan(data), axis=0) & self.good_x # good cols good_y = ~all(isnan(data), axis=1) & self.good_y # good rows if any(good_x != self.good_x): total_nan_x = sum(good_x == False) self.rem_nans = True s = "Warning: %d row(s) of \"%s\" are entirely NaN." % (total_nan_x, fn) print_text(s, self.color) if any(good_y != self.good_y): total_nan_y = sum(good_y == False) self.rem_nans = True s = "Warning: %d col(s) of \"%s\" are entirely NaN." % (total_nan_y, fn) print_text(s, self.color) self.good_x = good_x self.good_y = good_y
def identify_nans(self, data, fn): """ private method to identify rows and columns of all nans from grids. This happens when the data from multiple GIS databases don't quite align on whatever the desired grid is. """ #print "::: DataInput identifying NaNs for %s :::" % fn good_x = ~all(isnan(data), axis=0) & self.good_x # good cols good_y = ~all(isnan(data), axis=1) & self.good_y # good rows if any(good_x != self.good_x): total_nan_x = sum(good_x == False) self.rem_nans = True print "Warning: %d row(s) of \"%s\" are entirely NaN." % (total_nan_x, fn) if any(good_y != self.good_y): total_nan_y = sum(good_y == False) self.rem_nans = True print "Warning: %d col(s) of \"%s\" are entirely NaN." % (total_nan_y, fn) self.good_x = good_x self.good_y = good_y
def wrap_to_pi(angle): if type(angle) == list: angle = array(angle) angle %= (2 * pi) if type(angle) == ndarray: valid = ~pylab.isnan(angle) out_of_bounds = pylab.zeros(angle.size, dtype=bool) out_of_bounds[valid] = (angle[valid] > pi) angle[out_of_bounds] -= (2 * pi) return angle else: if angle > pi: angle -= (2 * pi) return angle
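# Example calls for wrap_to_pi() above, assuming pylab, array, ndarray and pi
# are already in scope as they are inside the function itself.
print(wrap_to_pi(3.5))                           # scalar: 3.5 - 2*pi
print(wrap_to_pi([0.1, 4.0, -7.0, pylab.nan]))   # list input; NaN entries are left untouched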
def fit_quality(time, parameters, noise, repetitions): """ Apply the fitting routine a number of times, as given by `repetitions`, and return informations about the fit performance. """ results = [] errors = [] from numpy.random import seed alpha_psp = AlphaPSP() for _ in range(repetitions): seed() value = noisy_psp(time=time, noise=noise, **parameters) fit_result = fit(alpha_psp, time, value, noise, fail_on_negative_cov=[True, True, True, False, False]) if fit_result is not None: result, error, chi2, success = fit_result if chi2 < 1.5 and success: print(chi2, result) results.append(result) errors.append(error) else: print("fit failed:", end=' ') print(fit_result) keys = alpha_psp.parameter_names() result_dict = dict(((key, []) for key in keys)) error_dict = dict(((key, []) for key in keys)) for result in results: for r, key in zip(result, keys): result_dict[key].append(r) for error in errors: for r, key in zip(p.diag(error), keys): error_dict[key].append(p.sqrt(r)) if p.isnan(p.sqrt(r)): print("+++++++", r) return ([p.mean(result_dict[key]) for key in keys], [p.std(result_dict[key]) for key in keys], len(results), keys, [result_dict[key] for key in keys], [error_dict[key] for key in keys])
def from_goal_msg(goal_msg): rpy = quat2rpy([ goal_msg.pos.rotation.w, goal_msg.pos.rotation.x, goal_msg.pos.rotation.y, goal_msg.pos.rotation.z ]) goal = FootGoal(pos=pl.hstack([ goal_msg.pos.translation.x, goal_msg.pos.translation.y, goal_msg.pos.translation.z, rpy ]), step_speed=goal_msg.step_speed, step_height=goal_msg.step_height, step_id=goal_msg.id, pos_fixed=[ goal_msg.fixed_x, goal_msg.fixed_y, goal_msg.fixed_z, goal_msg.fixed_roll, goal_msg.fixed_pitch, goal_msg.fixed_yaw ], is_right_foot=goal_msg.is_right_foot, is_in_contact=goal_msg.is_in_contact, bdi_step_duration=goal_msg.bdi_step_duration, bdi_sway_duration=goal_msg.bdi_sway_duration, bdi_lift_height=goal_msg.bdi_lift_height, bdi_toe_off=goal_msg.bdi_toe_off, bdi_knee_nominal=goal_msg.bdi_knee_nominal, bdi_max_body_accel=goal_msg.bdi_max_body_accel, bdi_max_foot_vel=goal_msg.bdi_max_foot_vel, bdi_sway_end_dist=goal_msg.bdi_sway_end_dist, bdi_step_end_dist=goal_msg.bdi_step_end_dist, support_contact_groups=goal_msg.support_contact_groups, terrain_pts=pl.vstack([ goal_msg.terrain_path_dist, goal_msg.terrain_height ])) if any(pl.isnan(goal.pos[[0, 1, 5]])): raise ValueError("I don't know how to handle NaN in x, y, or yaw") else: goal.pos[pl.find(pl.isnan(goal.pos))] = 0 return goal
def from_footstep_msg(goal_msg): rpy = quat2rpy( [goal_msg.pos.rotation.w, goal_msg.pos.rotation.x, goal_msg.pos.rotation.y, goal_msg.pos.rotation.z] ) goal = FootGoal( pos=pl.hstack([goal_msg.pos.translation.x, goal_msg.pos.translation.y, goal_msg.pos.translation.z, rpy]), step_speed=goal_msg.params.step_speed, step_height=goal_msg.params.step_height, step_id=goal_msg.id, pos_fixed=[ goal_msg.fixed_x, goal_msg.fixed_y, goal_msg.fixed_z, goal_msg.fixed_roll, goal_msg.fixed_pitch, goal_msg.fixed_yaw, ], is_right_foot=goal_msg.is_right_foot, is_in_contact=goal_msg.is_in_contact, bdi_step_duration=goal_msg.params.bdi_step_duration, bdi_sway_duration=goal_msg.params.bdi_sway_duration, bdi_lift_height=goal_msg.params.bdi_lift_height, bdi_toe_off=goal_msg.params.bdi_toe_off, bdi_knee_nominal=goal_msg.params.bdi_knee_nominal, bdi_max_body_accel=goal_msg.params.bdi_max_body_accel, bdi_max_foot_vel=goal_msg.params.bdi_max_foot_vel, bdi_sway_end_dist=goal_msg.params.bdi_sway_end_dist, bdi_step_end_dist=goal_msg.params.bdi_step_end_dist, support_contact_groups=goal_msg.params.support_contact_groups, terrain_pts=pl.vstack([goal_msg.terrain_path_dist, goal_msg.terrain_height]), ) if any(pl.isnan(goal.pos[[0, 1, 5]])): raise ValueError("I don't know how to handle NaN in x, y, or yaw") else: goal.pos[pl.find(pl.isnan(goal.pos))] = 0 return goal
def one_ci(v, ci, bootstraps): v = pylab.array(v) v = pylab.ma.masked_array(v,pylab.isnan(v)).compressed() if v.size == 0: return pylab.nan, 0, 0 #Nothing to compute r = pylab.randint(v.size, size=(v.size, bootstraps)) booted_samp = pylab.array([pylab.median(v[r[:,n]]) for n in xrange(bootstraps)]) booted_samp.sort() med = pylab.median(booted_samp) idx_lo = int(bootstraps * ci/2.0) idx_hi = int(bootstraps * (1.0-ci/2)) return med, med-booted_samp[idx_lo], booted_samp[idx_hi]-med
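# Example use of one_ci() above: a bootstrapped median with asymmetric error
# bars. The toy sample, its NaN entries and the bootstrap count are made up.
pylab.seed(0)
samples = pylab.randn(200) * 2.0 + 10.0
samples[:5] = pylab.nan                 # NaNs are masked out inside one_ci
med, err_lo, err_hi = one_ci(samples, 0.05, 1000)
print('median %.2f (+%.2f / -%.2f)' % (med, err_hi, err_lo))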
def wrap_to_2pi(angle): if type(angle) == float or type(angle) == int: if (angle < 0) | (angle >= 2 * pi): angle %= (2 * pi) return angle valid = ~pylab.isnan(angle) if type(angle) == list: angle = array(angle) if type(angle) == ndarray: out_of_bounds = pylab.zeros(angle.size, dtype=bool) out_of_bounds[valid] = (angle[valid] < 0) | (angle[valid] >= 2 * pi) angle[out_of_bounds] %= (2 * pi) else: if (angle[valid] < 0) | (angle[valid] >= 2 * pi): angle[valid] %= (2 * pi) return angle
def forward(self, xs): """Perform forward propagation of activations and update the internal state for a subsequent call to `backward`. Since this performs sequence classification, `xs` is a 2D array, with rows representing input vectors at each time step. Returns a 2D array whose rows represent output vectors for each input vector.""" ni, ns, na = self.dims assert len(xs[0]) == ni n = len(xs) # self.last_n = n N = len(self.gi) if n > N: raise ocrolib.RecognitionError("input too large for LSTM model") self.reset(n) # Both functions are a straightforward implementation of the # LSTM equations. It is possible to abstract this further and # represent gates and memory cells as individual data structures. # However, that is several times slower and the extra abstraction # isn't actually all that useful. """Perform forward propagation of activations for a simple LSTM layer.""" for t in range(n): prev = zeros(ns) if t == 0 else self.output[t - 1] self.source[t, 0] = 1 self.source[t, 1 : 1 + ni] = xs[t] self.source[t, 1 + ni :] = prev self.gix[t] = dot(self.WGI, self.source[t]) self.gfx[t] = dot(self.WGF, self.source[t]) self.gox[t] = dot(self.WGO, self.source[t]) self.cix[t] = dot(self.WCI, self.source[t]) if t > 0: self.gix[t] += self.WIP * self.state[t - 1] self.gfx[t] += self.WFP * self.state[t - 1] self.gi[t] = ffunc(self.gix[t]) self.gf[t] = ffunc(self.gfx[t]) self.ci[t] = gfunc(self.cix[t]) self.state[t] = self.ci[t] * self.gi[t] if t > 0: self.state[t] += self.gf[t] * self.state[t - 1] self.gox[t] += self.WOP * self.state[t] self.go[t] = ffunc(self.gox[t]) self.output[t] = hfunc(self.state[t]) * self.go[t] assert not isnan(self.output[:n]).any() return self.output[:n]
def fit_map(hit_map, signal_map, guess_center, guess_fwhm): radius = guess_fwhm*3.0 nside = healpy.npix2nside(hit_map.size) mask = hit_map == 0 mask |= pylab.isnan(signal_map) indices = _get_close_pixels(guess_center, radius, nside, mask)[0] p0 = guess_center + [1.0, guess_fwhm] def _model(indices, *params): center_lon, center_lat, scale, fwhm = params thetas, lons = healpy.pix2ang(nside, indices) lats = pylab.pi/2.0 - thetas dxs = (lons - center_lon) * pylab.cos(lats) dxs = (dxs + pylab.pi) % (2.0*pylab.pi) - pylab.pi dys = lats - center_lat return normal_2d(dxs, dys, scale, fwhm=fwhm) fit = curve_fit(_model, indices, signal_map[indices], p0=p0) return fit
def forward(self,xs): ni,ns,na = self.dims assert len(xs[0])==ni n = len(xs) self.last_n = n N = len(self.gi) if n>N: raise RecognitionError("[i] Input too large for model") self.reset(n) forward_py( n,N,ni,ns,na,xs, self.source, self.gix,self.gfx,self.gox,self.cix, self.gi,self.gf,self.go,self.ci, self.state,self.output, self.WGI,self.WGF,self.WGO,self.WCI, self.WIP,self.WFP,self.WOP) assert not isnan(self.output[:n]).any() return self.output[:n]
def mu_age_p(logit_C0=logit_C0, i=rate["i"]["mu_age"], r=rate["r"]["mu_age"], f=rate["f"]["mu_age"]): # for acute conditions, it is silly to use ODE solver to # derive prevalence, and it can be approximated with a simple # transformation of incidence if r.min() > 5.99: return i / (r + m_all + f) C0 = mc.invlogit(logit_C0) x = pl.hstack((i, r, f, 1 - C0, C0)) y = fun.forward(0, x) susceptible = y[:N] condition = y[N:] p = condition / (susceptible + condition) p[pl.isnan(p)] = 0.0 return p
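# A toy numeric illustration of the shortcut used in mu_age_p() above: when
# remission is fast (r.min() > 5.99), prevalence is approximated by
# p = i / (r + m_all + f). The rate values below are made up.
import pylab as pl

i = pl.array([0.02, 0.05, 0.05])       # incidence
r = pl.array([8.0, 9.0, 10.0])         # remission, fast enough for the shortcut
f = pl.array([0.10, 0.10, 0.20])       # excess mortality
m_all = pl.array([0.01, 0.02, 0.05])   # all-cause mortality

print(i / (r + m_all + f))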
def ctc_align_targets(outputs, targets, threshold=100.0, verbose=0, debug=0, lo=1e-5): outputs = maximum(lo, outputs) outputs = outputs * 1.0 / sum(outputs, axis=1)[:, newaxis] match = dot(outputs, targets.T) lmatch = log(match) assert not isnan(lmatch).any() both = forwardbackward(lmatch) epath = exp(both - amax(both)) l = sum(epath, axis=0)[newaxis, :] epath /= where(l == 0.0, 1e-9, l) aligned = maximum(lo, dot(epath, targets)) l = sum(aligned, axis=1)[:, newaxis] aligned /= where(l == 0.0, 1e-9, l) return aligned
def validate_complex_model(N_rep=20, simulation=good_complex_sim): q = pandas.DataFrame() for n in range(N_rep): # simulate data and fit model d, m = simulation() # tally posterior quantiles results = {} for var in 'eta_cross_eta eta delta_mu delta_beta beta gamma mu sigma'.split(): for j, var_j in enumerate(d[var]): stats = m[var].stats() results['%s_%d'%(var, j)] = [(var_j > m[var].trace()[:,j]).sum() / float(stats['n'])] # add y_mis k = 0 for j, n_j in enumerate(d['n']): for i in range(n_j): if pl.isnan(m['y'][j][i]): results['y_mis_%d'%k] = [(d['y'][j][i] > m['y_pred'][j].trace()[:,i]).sum() / float(stats['n'])] k += 1 q = q.append(pandas.DataFrame(results, index=['q_rep_%d'%n])) results = validation_transform(q) # display results graphics.scalar_validation_statistics( results, [[r'$y_{mis}$', results.filter(like='y_mis').columns], [r'$\eta\times\eta$', results.filter(like='eta_cross_eta').columns], [r'$\eta$', results.filter(regex='eta_\d').columns], [r'$\delta_\mu$', results.filter(like='delta_mu').columns], [r'$\delta_\beta$', results.filter(like='delta_beta').columns], [r'$\sigma$', results.filter(like='sigma').columns], [r'$\beta$', results.filter(regex='^beta').columns], [r'$\gamma$', results.filter(regex='gamma').columns], [r'$\mu$', results.filter(regex='^mu').columns], ]) return results