def __setitem__(self, wavelength, intensity):
    index, = pylab.where(self.wavelengths == wavelength)
    if pylab.any(index.shape):
        self.intensities[index] = intensity
    else:
        index, = pylab.where(self.wavelengths < wavelength)
        if pylab.any(index.shape):
            self.wavelengths = pylab.insert(self.wavelengths, index[-1] + 1, wavelength)
            self.intensities = pylab.insert(self.intensities, index[-1] + 1, intensity)
        else:
            self.wavelengths = pylab.insert(self.wavelengths, 0, wavelength)
            self.intensities = pylab.insert(self.intensities, 0, intensity)
def get_cod_data_all_causes(iso3='USA', age_group='1_4', sex='F'):
    """ TODO: write doc string for this function"""
    print 'loading', iso3, age_group, sex
    import glob

    cause_list = []

    fpath = '/home/j/Project/Causes of Death/Under Five Deaths/CoD Correct Input Data/v02_prep_%s/%s+*+%s+%s.csv' % (iso3, iso3, age_group, sex)
    #fpath = '/home/j/Project/GBD/dalynator/data/cod_correct_input_pos/run_9_cause_*.csv'  # use Mike's validation data
    fnames = glob.glob(fpath)

    # initialize input distribution array
    N = 990  # TODO: get this from the data files
    T = 32   # TODO: get this from the data files
    J = len(fnames)
    F = pl.zeros((N, T, J))

    # fill input distribution array with data from files
    for j, fname in enumerate(sorted(fnames)):
        cause = fname.split('+')[1]  # TODO: make this less brittle and clearer
        #cause = str(j)  # use Mike's validation data causes
        print 'loading cause', cause

        F_j = pl.csv2rec(fname)
        for n in range(N):
            F[n, :, j] = F_j['ensemble_d%d' % (n + 1)] / F_j['envelope']
            #F[n, :, j] = F_j['d%d' % (n + 1)] / F_j['envelope']  # use Mike's validation data

        assert not pl.any(pl.isnan(F)), '%s should have no missing values' % fname
        cause_list.append(cause)

    print 'loading complete'
    return F, cause_list
def dict_diff(dict1, dict2):
    """Return the difference between two dictionaries as a dictionary of
    key: [val1, val2] pairs.  Keys unique to either dictionary are included
    as key: [val1, '-'] or key: ['-', val2]."""
    diff_keys = []
    common_keys = pylab.intersect1d(dict1.keys(), dict2.keys())
    for key in common_keys:
        if pylab.iterable(dict1[key]):
            if pylab.any(dict1[key] != dict2[key]):
                diff_keys.append(key)
        else:
            if dict1[key] != dict2[key]:
                diff_keys.append(key)

    dict1_unique = [key for key in dict1.keys() if key not in common_keys]
    dict2_unique = [key for key in dict2.keys() if key not in common_keys]

    diff = {}
    for key in diff_keys:
        diff[key] = [dict1[key], dict2[key]]
    for key in dict1_unique:
        diff[key] = [dict1[key], '-']
    for key in dict2_unique:
        diff[key] = ['-', dict2[key]]

    return diff
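# A minimal usage sketch for dict_diff (the dictionaries below are invented
# for illustration; assumes pylab is imported at module level, as above):
import pylab

d1 = {'a': 1, 'b': pylab.array([1, 2, 3]), 'c': 'only in d1'}
d2 = {'a': 2, 'b': pylab.array([1, 2, 3]), 'd': 'only in d2'}

print dict_diff(d1, d2)
# 'a' differs, 'b' is identical, 'c'/'d' are unique to one dict:
# {'a': [1, 2], 'c': ['only in d1', '-'], 'd': ['-', 'only in d2']}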
def log_pdf_full_mvn(x, mu, cov=None, invcov=None, logdet=None):
    if cov is None:
        assert invcov is not None, "need cov or invcov"
    if invcov is None:
        invcov = sp.linalg.pinv2(cov)
        #invcov = np.linalg.pinv( cov )

    difx = x - mu
    if len(x.shape) > 1 or len(mu.shape) > 1:
        if len(x.shape) > 1:
            nVals = x.shape[0]
            dim = x.shape[1]
        else:
            nVals = mu.shape[0]
            dim = np.float(len(x))
        malhab = (np.dot(difx, invcov) * difx).sum(1)
    else:
        nVals = 1
        dim = np.float(len(x))
        malhab = np.dot(np.dot(difx, invcov), difx)

    if logdet is None:
        try:
            neglogdet = np.log(np.linalg.det(cov))
            logdet = -neglogdet
            #logdet = sum(numpy.log(numpy.linalg.svd(invcov)[1]))
        except:
            logdet = sum(np.log(np.diag(invcov)))

    #print str(-0.5*dim*numpy.log( 2.0 * numpy.pi ) )
    #print str(0.5*logdet)
    #print str(malhab)
    logpdf = -0.5 * dim * np.log(2.0 * np.pi) + 0.5 * logdet - 0.5 * malhab

    if pp.any(np.isnan(logpdf)) or pp.any(np.isinf(logpdf)):
        pdb.set_trace()
        print "********************************"
        print "********************************"
        print "log_pdf_full_mvn has inf"
        print logpdf
        print "********************************"
        print "********************************"
        return -np.inf
    return logpdf
def mu_interval(weighted_sum_mu=weighted_sum_mu, cum_sum_weights=cum_sum_weights, mu_age=mu_age,
                age_start=pl.array(age_start, dtype=int), age_end=pl.array(age_end, dtype=int)):
    mu = (weighted_sum_mu[age_end] - weighted_sum_mu[age_start]) / (cum_sum_weights[age_end] - cum_sum_weights[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        mu[i] = mu_age[age_start[i]]

    return mu
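# The closure-style default arguments above suggest this is the body of a
# PyMC 2 deterministic node; a sketch of how such a node might be wired up
# (the flat age pattern and the intervals below are invented for illustration):
import pylab as pl
import pymc as mc

ages = pl.arange(101)
age_weights = pl.ones_like(ages)
mu_age = 1e-2 * pl.ones(101)          # hypothetical age pattern (a PyMC node in the real model)
weighted_sum_mu = pl.cumsum(mu_age * age_weights)
cum_sum_weights = pl.cumsum(age_weights)

age_start = [0, 10, 50]
age_end = [9, 10, 90]

@mc.deterministic
def mu_interval(weighted_sum_mu=weighted_sum_mu, cum_sum_weights=cum_sum_weights, mu_age=mu_age,
                age_start=pl.array(age_start, dtype=int), age_end=pl.array(age_end, dtype=int)):
    mu = (weighted_sum_mu[age_end] - weighted_sum_mu[age_start]) / (cum_sum_weights[age_end] - cum_sum_weights[age_start])
    i = age_start == age_end
    if pl.any(i):
        mu[i] = mu_age[age_start[i]]    # zero-width interval: use the point value
    return mu

print mu_interval.value               # -> [ 0.01  0.01  0.01] for this flat pattern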
def main():
    df = get_df()
    for category in CATEGORIES:
        df = remove_small_counts(df, category)
    df.drop('count', axis=1, inplace=True)

    regression_df = pd.DataFrame()
    for category in CATEGORIES:
        dummies = pd.get_dummies(df[category])
        regression_df = pd.concat([regression_df, dummies], axis=1)
    regression_df['name'] = df['name']
    regression_df = regression_df.groupby('name').agg(lambda x: int(pylab.any(x)))
    print regression_df
def generate_data(N, delta_true, pi_true, heterogeneity, bias, sigma_prior):
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount='Moderately')
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(10000, 100000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n * pl.exp(bias)

    emp_priors = {}
    emp_priors['p', 'mu'] = pi_age_true
    emp_priors['p', 'sigma'] = sigma_prior * pi_age_true
    model.emp_priors = emp_priors

    model.a = a
    model.pi_age_true = pi_age_true
    model.delta_true = delta_true

    return model
def positive(f=sm.f_eval):
    if pl.any(f < 0.):
        return -pl.inf
    else:
        return 0.
def ellfit(x, y, wt=None):
    import pylab as pl
    # Calculate the best fit ellipse for an X and Y distribution, allowing
    # for weighting.
    # OUTPUTS:
    #   MAJOR - major axis in same units as x and y
    #   MINOR - minor axis in same units as x and y
    #   POSANG - the position angle CCW from the X=0 line of the coordinates
    #
    # Adam: The intensity weighted major and minor values are equal to the
    # second moment.  For equal weighting by pixel (of the sort that
    # might be done for blob analysis) the ellipse fit to the
    # half-maximum area will have semimajor axis equal to 1./1.69536 the
    # second moment.  For the quarter maximum surface this is 1./1.19755.
    #
    # i.e. if you run this with x,y down to zero intensity (like integrating
    # to infinity), and wt=intensity, you get the second moments sig_major,
    # sig_minor back
    # if you run this with x,y down to half-intensity, and wt=None, you get
    # sigx/1.6986 back (not sure why my integral differs from his slightly)
    #
    # but adam did not have the factor of 4 to turn eigenval into major axis
    #
    # translation: if we run this with intensity weight, we get
    # the second moment back (a sigma).  for flat weights i think he means
    # the halfmax contour semimajor axis

    if type(wt) == type(None):
        wt = x * 0.0 + 1.0

    tot_wt = wt.sum()

    # WEIGHTED X AND Y CENTERS
    x_ctr = (wt * x).sum() / tot_wt
    y_ctr = (wt * y).sum() / tot_wt

    # BUILD THE MATRIX
    i11 = (wt * (x - x_ctr)**2).sum() / tot_wt
    i22 = (wt * (y - y_ctr)**2).sum() / tot_wt
    i12 = (wt * (x - x_ctr) * (y - y_ctr)).sum() / tot_wt
    mat = [[i11, i12], [i12, i22]]

    # CATCH THE CASE OF ZERO DETERMINANT
    if pl.det(mat) == 0:
        return pl.nan, pl.nan, pl.nan

    if pl.any(pl.isnan(mat)):
        return pl.nan, pl.nan, pl.nan

    # WORK OUT THE EIGENVALUES
    evals, evec = pl.eig(mat)

    # PICK THE MAJOR AXIS
    absvals = pl.absolute(evals)
    major = absvals.max()
    maj_ind = pl.where(absvals == major)[0][0]
    major_vec = evec[maj_ind]
    min_ind = 1 - maj_ind

    # WORK OUT THE ORIENTATION OF THE MAJOR AXIS
    posang = pl.arctan2(major_vec[1], major_vec[0])

    # compared to the original idl code, this code is returning
    # pi-the desired angle, so:
    # posang = pl.pi - posang
    # if posang < 0:
    #     posang = posang + pl.pi

    # MAJOR AND MINOR AXIS SIZES
    # turn into real half-max major/minor axis
    major = pl.sqrt(evals[maj_ind]) * 4.
    minor = pl.sqrt(evals[min_ind]) * 4.

    return major, minor, posang
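# A small sketch calling ellfit on a synthetic, elongated point cloud with flat
# weights (relies on the same old pylab namespace the function itself uses for
# det/eig; the sigmas below are invented for illustration):
import pylab as pl

pl.seed(0)
x = pl.randn(10000) * 3.0      # sigma_x = 3
y = pl.randn(10000) * 1.0      # sigma_y = 1

major, minor, posang = ellfit(x, y)
# with flat weights, major ~ 4*sigma_x and minor ~ 4*sigma_y here
# (the factor of 4 applied to the eigenvalues, as noted in the comments above)
print major, minor, posang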
def positive(f=f):
    if pl.any(f < 0.):
        return -pl.inf
    else:
        return 0.
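# In context these positive() functions read like PyMC 2 potential bodies; a
# minimal sketch of wiring one up as a hard positivity constraint, assuming
# PyMC 2 and a toy vector-valued stochastic f (both hypothetical here):
import pylab as pl
import pymc as mc

f = mc.Normal('f', mu=1., tau=1., value=pl.ones(5))

@mc.potential
def positive(f=f):
    # contribute -inf to the joint log-probability whenever any element of f
    # is negative, so Metropolis proposals that go negative are rejected
    if pl.any(f < 0.):
        return -pl.inf
    else:
        return 0.

m = mc.MCMC([f, positive])
m.sample(1000)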
def linear_norm(x, y, msk, eps=0.003, deps=0.001, nmin=2, nwin=3):
    '''Linear normalization of a slice of a spectrum, assuming that the slice
    is centered on the line to be normalized.
    '''
    bla = False
    blabla = False

    x = x[msk]
    y = y[msk]

    n = int((len(y) / 2.))
    yl = y[:n]
    yr = y[n + 1:]

    # Criteria on the left of the central wavelength
    epsl, epsr = eps, eps
    while 1:
        critl = abs(max(yl) - yl) / max(yl)
        idx_yl = pl.where(critl <= epsl)[0]
        idx_yl = idx_yl.astype(int)
        if blabla:
            print " epsl:", epsl
            print " idx_yl, yl:", idx_yl, [y[i] for i in idx_yl]
        if pl.size(idx_yl) >= nmin:
            break
        else:
            epsl = epsl + deps

    # Criteria on the right of the central wavelength
    while 1:
        critr = abs(max(yr) - yr) / max(yr)
        idx_yr = pl.where(critr <= epsr)[0] + n
        idx_yr = idx_yr.astype(int)
        if blabla:
            print " epsr:", epsr
            print "idx_yr, yr:", idx_yr, [y[i] for i in idx_yr]
        if pl.size(idx_yr) >= nmin:
            break
        else:
            epsr = epsr + deps

    idx_y = pl.concatenate([idx_yl, idx_yr])

    if bla:
        print " nmin, nwin =", nmin, nwin
        print " Number of selected left continuum points: ", idx_yl.size, "/", n
        print " Number of selected right continuum points: ", idx_yr.size, "/", n
        print " Number of selected continuum points: ", idx_y.size, "/", y.size

    xs = [x[i] for i in idx_y]
    ys = [y[i] for i in idx_y]
    xs, ys = pl.asarray(xs), pl.asarray(ys)
    n_xs = xs.size

    # Mean value around selected points
    for ind, val in enumerate(ys):
        i = idx_y[ind] - nwin
        j = idx_y[ind] + nwin
        if i < 0:
            i = 0
        if j > len(y):
            j = len(y)
        ys[ind] = y[i:j].mean()

    if blabla:
        print "xs, ys", xs, ys

    A = pl.concatenate([xs, pl.ones(n_xs)])
    A = A.reshape((2, n_xs))
    w = pl.linalg.lstsq(A.T, ys)[0]

    # Test whether any value of w is a NaN
    if pl.any(w != w):
        print "Pb with linalg.lstsq. Try to reduce eps or nmin."
        quit(1)

    a, b = w[0], w[1]

    if blabla:
        print "a =", a, "b =", b

    return a, b, xs, ys
def __getitem__(self, wavelength):
    index, = pylab.where(self.wavelengths == wavelength)
    if pylab.any(index.shape):
        return self.intensities[index]
    else:
        return None
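# __getitem__ and __setitem__ (above) assume a class holding parallel, sorted
# wavelengths/intensities arrays.  A hypothetical minimal container (the real
# class name and constructor are not shown in these excerpts) to exercise them:
import pylab

class Spectrum(object):
    # hypothetical wrapper; only the two methods above are taken from the excerpts
    def __init__(self, wavelengths, intensities):
        self.wavelengths = pylab.asarray(wavelengths, dtype=float)
        self.intensities = pylab.asarray(intensities, dtype=float)

    def __getitem__(self, wavelength):
        index, = pylab.where(self.wavelengths == wavelength)
        if pylab.any(index.shape):
            return self.intensities[index]
        else:
            return None

    def __setitem__(self, wavelength, intensity):
        index, = pylab.where(self.wavelengths == wavelength)
        if pylab.any(index.shape):
            self.intensities[index] = intensity
        else:
            index, = pylab.where(self.wavelengths < wavelength)
            if pylab.any(index.shape):
                self.wavelengths = pylab.insert(self.wavelengths, index[-1] + 1, wavelength)
                self.intensities = pylab.insert(self.intensities, index[-1] + 1, intensity)
            else:
                self.wavelengths = pylab.insert(self.wavelengths, 0, wavelength)
                self.intensities = pylab.insert(self.intensities, 0, intensity)

s = Spectrum([400.0, 500.0, 600.0], [1.0, 2.0, 3.0])
s[500.0] = 2.5        # overwrite an existing sample
s[450.0] = 1.5        # insert a new sample, keeping wavelengths sorted
print s[450.0]        # -> [ 1.5]
print s[999.0]        # -> None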
print "\nLoading data from file "+F.CYAN+" "+filename+F.RESET+"\n" Data=pl.loadtxt(filename) N=Data.shape[0] from scipy.spatial.distance import pdist,squareform # compute distances dists=squareform(pdist(Data)) # exclude the case of self-distance pl.fill_diagonal(dists, pl.inf) test= (dists<cutoff) if(mode==1): picked=[] for p in range(N): if pl.any(test[p,:]): test[:,p]=False test[p,:]=False else: picked.append(p) No_overlaps=Data[picked] if(mode==2): print "- Cutting out particles with at least two overlaps at distance <", cutoff,"..." picked=[] for p in range(N): # removing only double overlaps if(pl.sum(test[p][p:])>1): pass else: picked.append(p)
def validate_age_integrating_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    #model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    #model.parameters['p']['smoothness'] = dict(amount='Very')

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())
    print model.results

    return model
def validate_consistent_model_sim(N=500, delta_true=.5, true=dict(i=quadratic, f=constant, r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t * age_weights)

        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type == t] = p_t[data_type == t]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)
    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i + 1)
        pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)
    #graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    #pl.legend(fancybox=True, shadow=True, loc='upper left')
    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value,
                                                         mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                                                         sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(),
                                                                         pl.median(pl.absolute(model.mu['rel_err'].dropna())),
                                                                         model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.finalize_results(model)

    print model.results

    return model
def evaluate_fit_quality(time, par, noise_values, trials, debug_plot=True):
    """ present the results obtained by fit_quality(...) in a graphical way """
    mean_fit = []
    std_fit = []
    success_count = []
    allvals = []
    errors = []
    for noise in noise_values:
        mean, std, success, keys, a, errs = fit_quality(time, par, noise, trials)
        mean_fit.append(mean)
        std_fit.append(std)
        success_count.append(success)
        allvals.append(a)
        errors.append(errs)
        print(p.any(p.isnan(errs)))

    mean_fit = p.array(mean_fit)
    std_fit = p.array(std_fit)

    if debug_plot:
        p.figure()
        num_subplots = mean_fit.shape[1] + 1
        plot = None
        p.title("fit quality evaluation")
        for i in range(num_subplots - 1):
            plot = p.subplot(num_subplots, 1, 1 + i, sharex=plot)
            p.axhline(par[keys[i]], c="r")
            p.errorbar(noise_values, mean_fit[:, i], yerr=std_fit[:, i])
            p.semilogx()
            p.xlim(min(noise_values) / 2., max(noise_values) * 2.)
            p.ylabel(keys[i])
            for n, a, e in zip(noise_values, allvals, errors):
                # p.plot([n] * len(a[i]), a[i], "rx", alpha=.4)
                p.errorbar([n] * len(a[i]), a[i], yerr=e[i], fmt="rx", alpha=.4)

        p.subplot(num_subplots, 1, num_subplots, sharex=plot)
        p.plot(noise_values, [trials - x for x in success_count])
        p.ylabel("failure count (of {0} total)".format(trials))
        p.xlabel("noise / [value]")
        p.savefig("plots/example_fit_precision.pdf")

        p.figure()
        for i in range(num_subplots - 1):
            plot = p.subplot(num_subplots - 1, 1, 1 + i, sharex=plot)
            if i == 0:
                p.title("RMS of (fit - param) / estimated_error")
            p.axhline(1)
            p.axhline(0)
            for n, a, e in zip(noise_values, allvals, errors):
                rmsvals = p.sqrt(p.mean(((p.array(a[i]) - par[keys[i]]) / p.array(e[i]))**2))
                p.plot([n], rmsvals, "go")
                print("rmsvals for noise={0}, param={1}:".format(n, keys[i]), rmsvals, p.array(e[i]))
            p.ylim(0, None)
            p.ylabel(keys[i])
        p.xlabel("noise / [value]")
        p.savefig("plots/example_fit_error_estimate.pdf")

    return mean_fit, std_fit, success_count
def age_specific_rate(model, data_type, reference_area='all', reference_sex='total', reference_year='all',
                      mu_age=None, mu_age_parent=None, sigma_age_parent=None,
                      rate_type='neg_binom', lower_bound=None, interpolation_method='linear',
                      include_covariates=True, zero_re=False):
    # TODO: expose (and document) interface for alternative rate_type as well as other options,
    # record reference values in the model
    """ Generate PyMC objects for model of epidemiological age-interval data

    :Parameters:
      - `model` : data.ModelData
      - `data_type` : str, one of 'i', 'r', 'f', 'p', or 'pf'
      - `reference_area, reference_sex, reference_year` : the node of the model to fit consistently
      - `mu_age` : pymc.Node, will be used as the age pattern, set to None if not needed
      - `mu_age_parent` : pymc.Node, will be used as the age pattern of the parent of the root area, set to None if not needed
      - `sigma_age_parent` : pymc.Node, will be used as the standard deviation of the age pattern, set to None if not needed
      - `rate_type` : str, optional. One of 'beta_binom', 'binom', 'log_normal_model', 'neg_binom',
        'neg_binom_lower_bound_model', 'neg_binom_model', 'normal_model', 'offset_log_normal', or 'poisson'
      - `lower_bound` :
      - `interpolation_method` : str, optional, one of 'linear', 'nearest', 'zero', 'slinear', 'quadratic', or 'cubic'
      - `include_covariates` : boolean
      - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic

    :Results:
      - Returns dict of PyMC objects, including 'pi', the covariate adjusted predicted values for each row of data

    """
    name = data_type
    import data
    result = data.ModelVars()

    if (mu_age_parent != None and pl.any(pl.isnan(mu_age_parent))) \
           or (sigma_age_parent != None and pl.any(pl.isnan(sigma_age_parent))):
        mu_age_parent = None
        sigma_age_parent = None
        print 'WARNING: nan found in parent mu/sigma.  Ignoring'

    ages = pl.array(model.parameters['ages'])
    data = model.get_data(data_type)
    if lower_bound:
        lb_data = model.get_data(lower_bound)
    parameters = model.parameters.get(data_type, {})
    area_hierarchy = model.hierarchy

    vars = dismod3.data.ModelVars()
    vars += dict(data=data)

    if 'parameter_age_mesh' in parameters:
        knots = pl.array(parameters['parameter_age_mesh'])
    else:
        knots = pl.arange(ages[0], ages[-1] + 1, 5)

    smoothing_dict = {'No Prior': pl.inf, 'Slightly': .5, 'Moderately': .05, 'Very': .005}
    if 'smoothness' in parameters:
        smoothing = smoothing_dict[parameters['smoothness']['amount']]
    else:
        smoothing = 0.

    if mu_age == None:
        vars.update(age_pattern.age_pattern(name, ages=ages, knots=knots, smoothing=smoothing,
                                            interpolation_method=interpolation_method))
    else:
        vars.update(dict(mu_age=mu_age, ages=ages))

    vars.update(expert_prior_model.level_constraints(name, parameters, vars['mu_age'], ages))
    vars.update(expert_prior_model.derivative_constraints(name, parameters, vars['mu_age'], ages))

    if mu_age_parent != None:
        # setup a hierarchical prior on the similarity between the
        # consistent estimate here and (inconsistent) estimate for its
        # parent in the areas hierarchy
        #weight_dict = {'Unusable': 10., 'Slightly': 10., 'Moderately': 1., 'Very': .1}
        #weight = weight_dict[parameters['heterogeneity']]
        vars.update(similarity_prior_model.similar('parent_similarity_%s' % name, vars['mu_age'],
                                                   mu_age_parent, sigma_age_parent, 0.))

        # also use this as the initial value for the age pattern, if it is not already specified
        if mu_age == None:
            if isinstance(mu_age_parent, mc.Node):  # TODO: test this code
                initial_mu = mu_age_parent.value
            else:
                initial_mu = mu_age_parent

            for i, k_i in enumerate(knots):
                vars['gamma'][i].value = (pl.log(initial_mu[k_i - ages[0]])).clip(-12, 6)

    age_weights = pl.ones_like(vars['mu_age'].value)  # TODO: use age pattern appropriate to the rate type
    if len(data) > 0:
        vars.update(age_integrating_model.age_standardize_approx(name, age_weights, vars['mu_age'],
                                                                 data['age_start'], data['age_end'], ages))

        # uncomment the following to effectively remove all effects
        #if 'random_effects' in parameters:
        #    for i in range(5):
        #        effect = 'sigma_alpha_%s_%d' % (name, i)
        #        parameters['random_effects'][effect] = dict(dist='TruncatedNormal', mu=.0001, sigma=.00001, lower=.00009, upper=.00011)
        #if 'fixed_effects' in parameters:
        #    for effect in ['x_sex', 'x_LDI_id_Updated_7July2011']:
        #        parameters['fixed_effects'][effect] = dict(dist='normal', mu=.0001, sigma=.00001)

        if include_covariates:
            vars.update(covariate_model.mean_covariate_model(name, vars['mu_interval'], data, parameters, model,
                                                             reference_area, reference_sex, reference_year,
                                                             zero_re=zero_re))
        else:
            vars.update({'pi': vars['mu_interval']})

        ## ensure that all data has uncertainty quantified appropriately
        # first replace all missing se from ci
        missing_se = pl.isnan(data['standard_error']) | (data['standard_error'] < 0)
        data['standard_error'][missing_se] = (data['upper_ci'][missing_se] - data['lower_ci'][missing_se]) / (2 * 1.96)

        # then replace all missing ess with se
        missing_ess = pl.isnan(data['effective_sample_size'])
        data['effective_sample_size'][missing_ess] = data['value'][missing_ess] * (1 - data['value'][missing_ess]) / data['standard_error'][missing_ess]**2

        if rate_type == 'neg_binom':

            # warn and drop data that doesn't have effective sample size quantified, or is non-positive
            missing_ess = pl.isnan(data['effective_sample_size']) | (data['effective_sample_size'] < 0)
            if sum(missing_ess) > 0:
                print 'WARNING: %d rows of %s data has invalid quantification of uncertainty.' % (sum(missing_ess), name)
                data['effective_sample_size'][missing_ess] = 0.0

            # warn and change data where ess is unreasonably huge
            large_ess = data['effective_sample_size'] >= 1.e10
            if sum(large_ess) > 0:
                print 'WARNING: %d rows of %s data have effective sample size exceeding 10 billion.' % (sum(large_ess), name)
                data['effective_sample_size'][large_ess] = 1.e10

            if 'heterogeneity' in parameters:
                lower_dict = {'Slightly': 9., 'Moderately': 3., 'Very': 1.}
                lower = lower_dict[parameters['heterogeneity']]
            else:
                lower = 1.

            # special case, treat pf data as poisson
            if data_type == 'pf':
                lower = 1.e12

            vars.update(covariate_model.dispersion_covariate_model(name, data, lower, lower * 9.))
            vars.update(rate_model.neg_binom_model(name, vars['pi'], vars['delta'], data['value'],
                                                   data['effective_sample_size']))
        elif rate_type == 'log_normal':

            # warn and drop data that doesn't have effective sample size quantified
            missing = pl.isnan(data['standard_error']) | (data['standard_error'] < 0)
            if sum(missing) > 0:
                print 'WARNING: %d rows of %s data has no quantification of uncertainty.' % (sum(missing), name)
                data['standard_error'][missing] = 1.e6

            # TODO: allow options for alternative priors for sigma
            vars['sigma'] = mc.Uniform('sigma_%s' % name, lower=.0001, upper=1., value=.01)
            #vars['sigma'] = mc.Exponential('sigma_%s'%name, beta=100., value=.01)
            vars.update(rate_model.log_normal_model(name, vars['pi'], vars['sigma'], data['value'], data['standard_error']))
        elif rate_type == 'normal':

            # warn and drop data that doesn't have standard error quantified
            missing = pl.isnan(data['standard_error']) | (data['standard_error'] < 0)
            if sum(missing) > 0:
                print 'WARNING: %d rows of %s data has no quantification of uncertainty.' % (sum(missing), name)
                data['standard_error'][missing] = 1.e6

            vars['sigma'] = mc.Uniform('sigma_%s' % name, lower=.0001, upper=.1, value=.01)
            vars.update(rate_model.normal_model(name, vars['pi'], vars['sigma'], data['value'], data['standard_error']))
        elif rate_type == 'binom':
            missing_ess = pl.isnan(data['effective_sample_size']) | (data['effective_sample_size'] < 0)
            if sum(missing_ess) > 0:
                print 'WARNING: %d rows of %s data has invalid quantification of uncertainty.' % (sum(missing_ess), name)
                data['effective_sample_size'][missing_ess] = 0.0
            vars += rate_model.binom(name, vars['pi'], data['value'], data['effective_sample_size'])
        elif rate_type == 'beta_binom':
            vars += rate_model.beta_binom(name, vars['pi'], data['value'], data['effective_sample_size'])
        elif rate_type == 'poisson':
            missing_ess = pl.isnan(data['effective_sample_size']) | (data['effective_sample_size'] < 0)
            if sum(missing_ess) > 0:
                print 'WARNING: %d rows of %s data has invalid quantification of uncertainty.' % (sum(missing_ess), name)
                data['effective_sample_size'][missing_ess] = 0.0
            vars += rate_model.poisson(name, vars['pi'], data['value'], data['effective_sample_size'])
        elif rate_type == 'offset_log_normal':
            vars['sigma'] = mc.Uniform('sigma_%s' % name, lower=.0001, upper=10., value=.01)
            vars += rate_model.offset_log_normal(name, vars['pi'], vars['sigma'], data['value'], data['standard_error'])
        else:
            raise Exception, 'rate_model "%s" not implemented' % rate_type
    else:
        if include_covariates:
            vars.update(covariate_model.mean_covariate_model(name, [], data, parameters, model,
                                                             reference_area, reference_sex, reference_year,
                                                             zero_re=zero_re))

    if include_covariates:
        vars.update(expert_prior_model.covariate_level_constraints(name, model, vars, ages))

    if lower_bound and len(lb_data) > 0:
        vars['lb'] = age_integrating_model.age_standardize_approx('lb_%s' % name, age_weights, vars['mu_age'],
                                                                  lb_data['age_start'], lb_data['age_end'], ages)

        if include_covariates:
            vars['lb'].update(covariate_model.mean_covariate_model('lb_%s' % name, vars['lb']['mu_interval'], lb_data,
                                                                   parameters, model, reference_area, reference_sex,
                                                                   reference_year, zero_re=zero_re))
        else:
            vars['lb'].update({'pi': vars['lb']['mu_interval']})

        vars['lb'].update(covariate_model.dispersion_covariate_model('lb_%s' % name, lb_data, 1e12, 1e13))  # treat like poisson

        ## ensure that all data has uncertainty quantified appropriately
        # first replace all missing se from ci
        missing_se = pl.isnan(lb_data['standard_error']) | (lb_data['standard_error'] <= 0)
        lb_data['standard_error'][missing_se] = (lb_data['upper_ci'][missing_se] - lb_data['lower_ci'][missing_se]) / (2 * 1.96)

        # then replace all missing ess with se
        missing_ess = pl.isnan(lb_data['effective_sample_size'])
        lb_data['effective_sample_size'][missing_ess] = lb_data['value'][missing_ess] * (1 - lb_data['value'][missing_ess]) / lb_data['standard_error'][missing_ess]**2

        # warn and drop lb_data that doesn't have effective sample size quantified
        missing_ess = pl.isnan(lb_data['effective_sample_size']) | (lb_data['effective_sample_size'] <= 0)
        if sum(missing_ess) > 0:
            print 'WARNING: %d rows of %s lower bound data has no quantification of uncertainty.' % (sum(missing_ess), name)
            lb_data['effective_sample_size'][missing_ess] = 1.0

        vars['lb'].update(rate_model.neg_binom_lower_bound_model('lb_%s' % name, vars['lb']['pi'], vars['lb']['delta'],
                                                                 lb_data['value'], lb_data['effective_sample_size']))

    result[data_type] = vars
    return result
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1],
                           true=dict(i=quadratic, f=constant, r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t * age_weights)

        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type == t] = p_t[data_type == t]

    # add covariate shifts
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.hierarchy = gbd_model.hierarchy

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = {}
    for t in types:
        alpha[t] = alpha_true_sim(model, area_list, sigma_true)
    print json.dumps(alpha, indent=2)

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        t = data_type[i]
        p[i] = p[i] * pl.exp(pl.sum([alpha[t][n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)
    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i + 1)
        pl.plot(range(101), sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)
    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame()
    model.sigma = pandas.DataFrame()
    for t in types:
        alpha_t = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
        alpha_t['true'] = pandas.Series(dict(alpha[t]))
        alpha_t['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars[t]['alpha']],
                                           index=model.vars[t]['U'].columns)
        alpha_t['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars[t]['alpha']],
                                              index=model.vars[t]['U'].columns)
        alpha_t['type'] = t
        model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True)

        sigma_t = pandas.DataFrame(dict(true=sigma_true))
        sigma_t['mu_pred'] = [n.stats()['mean'] for n in model.vars[t]['sigma_alpha']]
        sigma_t['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars[t]['sigma_alpha']]
        model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True)

    data_simulation.add_quality_metrics(model.alpha)
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value,
                                                         mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                                                         sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(),
                                                                         pl.median(pl.absolute(model.mu['rel_err'].dropna())),
                                                                         model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
#for ncase in range(1):
#for ncase in [6]:
KK = np.uint64(CaseIn[ncase][0])
CC = np.uint64(CaseIn[ncase][1])
SS = np.uint64(CaseIn[ncase][2])

if (CC * SS < KK):
    print("Case #{}: IMPOSSIBLE".format(ncase + 1))
else:
    remaining = np.array([np.uint64(jj) for jj in range(KK)])
    place = np.arange(min(KK, CC) - 1, -1, -1, dtype=np.uint64)
    #place = np.arange(0, CC, dtype=np.uint64)
    Kpow = np.array([KK**cj for cj in place])
    ## With limits at 1e+18, should be no problem with int64
    sample_pos = []

    if (len(Kpow) > 1):
        if (pl.any(np.diff(np.double(Kpow)) >= 0)):
        # if (pl.any(np.diff(np.double(Kpow)) <= 0)):
            print('unexpected overflow')
            sys.exit(1)

    while (len(remaining) >= CC):
        ## Break off the next word and convert to position
        pos_baseK = remaining[:CC]
        remaining = remaining[CC:]
        ## Generate a position
        ## (Kpow * (0*pos_baseK)).sum() + np.uint64(1)
        posj = (Kpow * pos_baseK).sum() + np.uint64(1)
        sample_pos.append(posj)

    if (len(remaining) > 0):
        pos_baseK = remaining
        posj = (Kpow[:len(pos_baseK)] * pos_baseK).sum() + np.uint64(1)
        sample_pos.append(posj)
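# A toy, hypothetical run of the packing step above, showing how each group of
# CC base-K digits becomes a single 1-indexed position (K=6, C=2 invented here):
import numpy as np

KK, CC = np.uint64(6), np.uint64(2)
remaining = np.arange(6, dtype=np.uint64)       # digits 0..5, all valid in base 6
Kpow = np.array([KK, np.uint64(1)])             # K**1, K**0 for the two places

sample_pos = []
while len(remaining) >= CC:
    pos_baseK = remaining[:CC]
    remaining = remaining[CC:]
    sample_pos.append((Kpow * pos_baseK).sum() + np.uint64(1))

print(sample_pos)    # positions 2, 16, 30: e.g. digits (2, 3) -> 2*6 + 3 + 1 = 16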
def mean_covariate_model(name, mu, input_data, parameters, model, root_area, root_sex, root_year, zero_re=True):
    """ Generate PyMC objects for a covariate adjusted version of mu

    :Parameters:
      - `name` : str
      - `mu` : the unadjusted mean parameter for this node
      - `model` : ModelData to use for covariates
      - `root_area, root_sex, root_year` : str, str, int
      - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic

    :Results:
      - Returns dict of PyMC objects, including 'pi', the covariate adjusted predicted values for the mu and X provided

    """
    n = len(input_data.index)

    # make U and alpha
    p_U = model.hierarchy.number_of_nodes()  # random effects for area
    U = pandas.DataFrame(pl.zeros((n, p_U)), columns=model.hierarchy.nodes(), index=input_data.index)
    for i, row in input_data.T.iteritems():
        if row['area'] not in model.hierarchy:
            print 'WARNING: "%s" not in model hierarchy, skipping random effects for this observation' % row['area']
            continue

        for level, node in enumerate(nx.shortest_path(model.hierarchy, 'all', input_data.ix[i, 'area'])):
            model.hierarchy.node[node]['level'] = level
            U.ix[i, node] = 1.

    for n2 in model.hierarchy.nodes():
        for level, node in enumerate(nx.shortest_path(model.hierarchy, 'all', n2)):
            model.hierarchy.node[node]['level'] = level

    #U = U.select(lambda col: U[col].std() > 1.e-5, axis=1)  # drop constant columns
    if len(U.index) == 0:
        U = pandas.DataFrame()
    else:
        U = U.select(lambda col: (U[col].max() > 0) and (model.hierarchy.node[col].get('level') > model.hierarchy.node[root_area]['level']), axis=1)  # drop columns with only zeros and which are for higher levels in hierarchy
        #U = U.select(lambda col: model.hierarchy.node[col].get('level') <= 2, axis=1)  # drop country-level REs
        #U = U.drop(['super-region_0', 'north_america_high_income', 'USA'], 1)
        #U = U.drop(['super-region_0', 'north_america_high_income'], 1)
        #U = U.drop(U.columns, 1)

        ## drop random effects with less than 1 observation or with all observations set to 1, unless they have an informative prior
        keep = []
        if 'random_effects' in parameters:
            for re in parameters['random_effects']:
                if parameters['random_effects'][re].get('dist') == 'Constant':
                    keep.append(re)
        U = U.select(lambda col: 1 <= U[col].sum() < len(U[col]) or col in keep, axis=1)

    U_shift = pandas.Series(0., index=U.columns)
    for level, node in enumerate(nx.shortest_path(model.hierarchy, 'all', root_area)):
        if node in U_shift:
            U_shift[node] = 1.
    U = U - U_shift

    sigma_alpha = []
    for i in range(5):  # max depth of hierarchy is 5
        effect = 'sigma_alpha_%s_%d' % (name, i)
        if 'random_effects' in parameters and effect in parameters['random_effects']:
            prior = parameters['random_effects'][effect]
            print 'using stored RE hyperprior for', effect, prior
            sigma_alpha.append(MyTruncatedNormal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2,
                                                 min(prior['mu'], prior['lower']),
                                                 max(prior['mu'], prior['upper']),
                                                 value=prior['mu']))
        else:
            sigma_alpha.append(MyTruncatedNormal(effect, .05, .03**-2, .05, .5, value=.1))

    alpha = pl.array([])
    const_alpha_sigma = pl.array([])
    alpha_potentials = []
    if len(U.columns) > 0:
        tau_alpha_index = []
        for alpha_name in U.columns:
            tau_alpha_index.append(model.hierarchy.node[alpha_name]['level'])
        tau_alpha_index = pl.array(tau_alpha_index, dtype=int)

        tau_alpha_for_alpha = [sigma_alpha[i]**-2 for i in tau_alpha_index]

        alpha = []
        for i, tau_alpha_i in enumerate(tau_alpha_for_alpha):
            effect = 'alpha_%s_%s' % (name, U.columns[i])
            if 'random_effects' in parameters and U.columns[i] in parameters['random_effects']:
                prior = parameters['random_effects'][U.columns[i]]
                print 'using stored RE for', effect, prior
                if prior['dist'] == 'Normal':
                    alpha.append(mc.Normal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2, value=0.))
                elif prior['dist'] == 'TruncatedNormal':
                    alpha.append(MyTruncatedNormal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2,
                                                   prior['lower'], prior['upper'], value=0.))
                elif prior['dist'] == 'Constant':
                    alpha.append(float(prior['mu']))
                else:
                    assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior['dist']
            else:
                alpha.append(mc.Normal(effect, 0, tau=tau_alpha_i, value=0))

        # sigma for "constant" alpha
        const_alpha_sigma = []
        for i, tau_alpha_i in enumerate(tau_alpha_for_alpha):
            effect = 'alpha_%s_%s' % (name, U.columns[i])
            if 'random_effects' in parameters and U.columns[i] in parameters['random_effects']:
                prior = parameters['random_effects'][U.columns[i]]
                if prior['dist'] == 'Constant':
                    const_alpha_sigma.append(float(prior['sigma']))
                else:
                    const_alpha_sigma.append(pl.nan)
            else:
                const_alpha_sigma.append(pl.nan)

        if zero_re:
            column_map = dict([(n, i) for i, n in enumerate(U.columns)])
            # change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic
            for parent in model.hierarchy:
                node_names = model.hierarchy.successors(parent)
                nodes = [column_map[n] for n in node_names if n in U]
                if len(nodes) > 0:
                    i = nodes[0]
                    old_alpha_i = alpha[i]

                    # do not change if prior for this node has dist='constant'
                    if parameters.get('random_effects', {}).get(U.columns[i], {}).get('dist') == 'Constant':
                        continue

                    alpha[i] = mc.Lambda('alpha_det_%s_%d' % (name, i),
                                         lambda other_alphas_at_this_level=[alpha[n] for n in nodes[1:]]: -sum(other_alphas_at_this_level))

                    if isinstance(old_alpha_i, mc.Stochastic):
                        @mc.potential(name='alpha_pot_%s_%s' % (name, U.columns[i]))
                        def alpha_potential(alpha=alpha[i], mu=old_alpha_i.parents['mu'], tau=old_alpha_i.parents['tau']):
                            return mc.normal_like(alpha, mu, tau)
                        alpha_potentials.append(alpha_potential)

    # make X and beta
    X = input_data.select(lambda col: col.startswith('x_'), axis=1)

    # add sex as a fixed effect (TODO: decide if this should be in data.py, when loading gbd model)
    X['x_sex'] = [sex_value[row['sex']] for i, row in input_data.T.iteritems()]

    beta = pl.array([])
    const_beta_sigma = pl.array([])
    X_shift = pandas.Series(0., index=X.columns)
    if len(X.columns) > 0:
        # shift columns to have zero for root covariate
        try:
            output_template = model.output_template.groupby(['area', 'sex', 'year']).mean()  # TODO: change to .first(), but that doesn't work with old pandas
        except pandas.core.groupby.DataError:
            output_template = model.output_template.groupby(['area', 'sex', 'year']).first()
        covs = output_template.filter(list(X.columns) + ['pop'])
        if len(covs.columns) > 1:
            leaves = [n for n in nx.traversal.bfs_tree(model.hierarchy, root_area) if model.hierarchy.successors(n) == []]
            if len(leaves) == 0:
                # networkx returns an empty list when the bfs tree is a single node
                leaves = [root_area]

            if root_sex == 'total' and root_year == 'all':  # special case for all years and sexes
                covs = covs.delevel().drop(['year', 'sex'], axis=1).groupby('area').mean()  # TODO: change to .reset_index(), but that doesn't work with old pandas
                leaf_covs = covs.ix[leaves]
            elif root_sex == 'total':
                raise Exception, 'root_sex == total, root_year != all is Not Yet Implemented'
            elif root_year == 'all':
                raise Exception, 'root_year == all, root_sex != total is Not Yet Implemented'
            else:
                leaf_covs = covs.ix[[(l, root_sex, root_year) for l in leaves]]

            for cov in covs:
                if cov != 'pop':
                    X_shift[cov] = (leaf_covs[cov] * leaf_covs['pop']).sum() / leaf_covs['pop'].sum()

        if 'x_sex' in X.columns:
            X_shift['x_sex'] = sex_value[root_sex]

        X = X - X_shift

        assert not pl.any(pl.isnan(X.__array__())), 'Covariate matrix should have no missing values'

        beta = []
        for i, effect in enumerate(X.columns):
            name_i = 'beta_%s_%s' % (name, effect)
            if 'fixed_effects' in parameters and effect in parameters['fixed_effects']:
                prior = parameters['fixed_effects'][effect]
                print 'using stored FE for', name_i, effect, prior
                if prior['dist'] == 'TruncatedNormal':
                    beta.append(MyTruncatedNormal(name_i, mu=float(prior['mu']), tau=pl.maximum(prior['sigma'], .001)**-2,
                                                  a=prior['lower'], b=prior['upper'],
                                                  value=.5 * (prior['lower'] + prior['upper'])))
                elif prior['dist'] == 'Normal':
                    beta.append(mc.Normal(name_i, mu=float(prior['mu']), tau=pl.maximum(prior['sigma'], .001)**-2,
                                          value=float(prior['mu'])))
                elif prior['dist'] == 'Constant':
                    beta.append(float(prior['mu']))
                else:
                    assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior['dist']
            else:
                beta.append(mc.Normal(name_i, mu=0., tau=1.**-2, value=0))

        # sigma for "constant" beta
        const_beta_sigma = []
        for i, effect in enumerate(X.columns):
            name_i = 'beta_%s_%s' % (name, effect)
            if 'fixed_effects' in parameters and effect in parameters['fixed_effects']:
                prior = parameters['fixed_effects'][effect]
                if prior['dist'] == 'Constant':
                    const_beta_sigma.append(float(prior.get('sigma', 1.e-6)))
                else:
                    const_beta_sigma.append(pl.nan)
            else:
                const_beta_sigma.append(pl.nan)

    @mc.deterministic(name='pi_%s' % name)
    def pi(mu=mu, U=pl.array(U, dtype=float), alpha=alpha, X=pl.array(X, dtype=float), beta=beta):
        return mu * pl.exp(pl.dot(U, [float(x) for x in alpha]) + pl.dot(X, [float(x) for x in beta]))

    return dict(pi=pi, U=U, U_shift=U_shift, sigma_alpha=sigma_alpha, alpha=alpha,
                alpha_potentials=alpha_potentials, X=X, X_shift=X_shift, beta=beta,
                hierarchy=model.hierarchy, const_alpha_sigma=const_alpha_sigma,
                const_beta_sigma=const_beta_sigma)
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1, .1, .1, .1, .1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'): ## generate simulated data a = pl.arange(0, 101, 1) pi_age_true = pi_true(a) import dismod3 import simplejson as json model = data.ModelData.from_gbd_jsons( json.loads(dismod3.disease_json.DiseaseJson().to_json())) gbd_hierarchy = model.hierarchy model = data_simulation.simple_model(N) model.hierarchy = gbd_hierarchy model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10) model.parameters['p']['smoothness'] = dict(amount=smoothness) model.parameters['p']['heterogeneity'] = heterogeneity age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) age_weights = pl.ones_like(a) sum_pi_wt = pl.cumsum(pi_age_true * age_weights) sum_wt = pl.cumsum(age_weights * 1.) p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p[i] = pi_age_true[age_start[i]] model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N) from validate_covariates import alpha_true_sim area_list = pl.array([ 'all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR' ]) alpha = alpha_true_sim(model, area_list, sigma_true) print alpha model.input_data['true'] = pl.nan model.input_data['area'] = area_list[mc.rcategorical( pl.ones(len(area_list)) / float(len(area_list)), N)] for i, a in model.input_data['area'].iteritems(): model.input_data['true'][i] = p[i] * pl.exp( pl.sum([ alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha ])) p = model.input_data['true'] n = model.input_data['effective_sample_size'] model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true * n * p) / n ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None) #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100) model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_one_ppc(model.vars['p'], 'p') graphics.plot_convergence_diag(model.vars) graphics.plot_one_type(model, model.vars['p'], {}, 'p') pl.plot(range(101), pi_age_true, 'r:', label='Truth') pl.legend(fancybox=True, shadow=True, loc='upper left') pl.show() model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean'] model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats( )['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame(dict(true=[delta_true])) model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean() model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std() data_simulation.add_quality_metrics(model.delta) model.alpha = pandas.DataFrame( index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)]) model.alpha['true'] = pandas.Series(dict(alpha)) model.alpha['mu_pred'] = pandas.Series( [n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns) model.alpha['sigma_pred'] = pandas.Series( [n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns) 
model.alpha = model.alpha.dropna() data_simulation.add_quality_metrics(model.alpha) model.sigma = pandas.DataFrame(dict(true=sigma_true)) model.sigma['mu_pred'] = [ n.stats()['mean'] for n in model.vars['p']['sigma_alpha'] ] model.sigma['sigma_pred'] = [ n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha'] ] data_simulation.add_quality_metrics(model.sigma) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % ( model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame( dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats() ['standard deviation'])) data_simulation.add_quality_metrics(model.mu) data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.add_to_results(model, 'alpha') data_simulation.add_to_results(model, 'sigma') data_simulation.finalize_results(model) print model.results return model
def linear_norm(x, y, msk, eps=0.003, deps=0.001, nmin=2, nwin=3): '''Linear normalization of a slice of a spectra, assuming that the slice is centered on the line to normalized. ''' bla = False blabla = False x = x[msk] y = y[msk] n = int((len(y)/2.)) yl = y[:n] yr = y[n+1:] # Criteria on the left of the central wavelength epsl, epsr = eps, eps while 1: critl = abs(max(yl)-yl) / max(yl) idx_yl = pl.where(critl <= epsl)[0] idx_yl = idx_yl.astype(int) if blabla: print " epsl:", epsl print " idx_yl, yl:", idx_yl, [y[i] for i in idx_yl] if pl.size(idx_yl) >= nmin: break else: epsl = epsl + deps # Criteria on the right of the central wavelength while 1: critr = abs(max(yr)-yr) / max(yr) idx_yr = pl.where(critr <= epsr)[0] + n idx_yr = idx_yr.astype(int) if blabla: print " epsr:", epsr print "idx_yr, yr:", idx_yr, [y[i] for i in idx_yr] if pl.size(idx_yr) >= nmin: break else: epsr = epsr + deps idx_y = pl.concatenate([idx_yl, idx_yr]) if bla: print " nmin, nwin =", nmin, nwin print " Number of selected left continuum points: ", idx_yl.size, "/", n print " Number of selected right continuum points: ", idx_yr.size, "/", n print " Number of selected continuum points: ", idx_y.size, "/", y.size xs = [x[i] for i in idx_y] ys = [y[i] for i in idx_y] xs, ys = pl.asarray(xs), pl.asarray(ys) n_xs = xs.size # Mean value around selected points for ind, val in enumerate(ys): i = idx_y[ind] - nwin j = idx_y[ind] + nwin if i < 0: i = 0 if j > len(y): j = len(y) ys[ind] = y[i:j].mean() if blabla: print "xs, ys", xs, ys A = pl.concatenate([xs, pl.ones(n_xs)]) A = A.reshape((2, n_xs)) w = pl.linalg.lstsq(A.T, ys)[0] # Test if one of the value of w is a nan if pl.any(w != w): print "Pb with linalg.lstsq. Try to reduce eps or nmin." quit(1) a, b = w[0], w[1] if blabla: print "a =", a, "b =", b return a, b, xs, ys
def fit(psp_shape, time, voltage, error_estimate, maxcall=1000, maximal_red_chi2=2.0, fail_on_negative_cov=None): """ psp_shape : object PSPShape instance time : numpy.ndarray of floats numpy array of data acquisition times voltage : numpy.ndarray numpy array of voltage values error_estimate : float estimate for the standard deviation of an individual data point. maxcall : int maximal number of calls to the fit routine fail_on_negative_cov : list of int returns : tuple (fit_results error_estimates chi2_per_dof success) """ assert len(time) == len(voltage) initial_values = psp_shape.initial_fit_values(time, voltage) result = optimize.leastsq( lambda param: (psp_shape(time, *param) - voltage), [initial_values[key] for key in psp_shape.parameter_names()], full_output=1, maxfev=maxcall) resultparams, cov_x, _, _, ier = result ndof = len(time) - len(psp_shape.parameter_names()) fit_voltage = psp_shape(time, *result[0]) red_chi2 = sum(((fit_voltage - voltage)) ** 2) \ / (error_estimate ** 2 * ndof) fail_neg = p.any(p.diag(cov_x) < 0) if fail_on_negative_cov is not None: fail_neg = p.any(p.logical_and( p.diag(cov_x) < 0, fail_on_negative_cov)) cov_x *= error_estimate**2 success = ((not fail_neg) and (ier in [1, 2, 3, 4]) and (red_chi2 <= maximal_red_chi2)) processed, processed_cov = psp_shape.process_fit_results( resultparams, cov_x) return processed, processed_cov, red_chi2, success
data_ecog_lp_ss[i,:] = signal.decimate(filters.low_pass_filter(data_ecog[i,:], Fsampling=f_sampling, Fcutoff=f_lp_cutoff), int(f_sampling/f_subsample)) data_ecog_lp_ss.flush() print(i) pl.save(os.path.join(memap_folder, 'data_ecog_lp_ss.npy'), data_ecog_lp_ss) spike_samples = tf.spikedetect(data_probe_hp, threshold_multiplier=6.5, bad_channels=probe_bad_channels) pl.save(os.path.join(memap_folder, 'spike_samples.npy'), spike_samples) spike_samples_clean = spike_samples for i in pl.arange(pl.size(spike_samples_clean)-1,-1,-1): data = data_probe_hp[:, spike_samples[i]-60:spike_samples[i]+60] stdevs = sp.std(data,1) if np.max(data) > 3000 or pl.any(stdevs>600): spike_samples_clean = pl.delete(spike_samples_clean, i) if i%100==0: print(i) spike_samples_clean = pl.delete(spike_samples_clean, 0) pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean) channels = np.empty(0) for i in pl.arange(0, pl.size(spike_samples_clean)): data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist()) channels = np.append(channels, np.argmax(data)) if i%100==0: print(i) channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)], columns=['Channels', 'Samples']) spike_times_shaftA = channels_spikes_df.Samples[0][channels_spikes_df.Channels[0]>7][channels_spikes_df.Channels[0]<16]
data_ecog_lp_ss[i,:] = signal.decimate( filters.low_pass_filter(data_ecog[i, :], Fsampling=f_sampling, Fcutoff=f_lp_cutoff), int(f_sampling / f_subsample)) data_ecog_lp_ss.flush() print(i) pl.save(os.path.join(memap_folder, 'data_ecog_lp_ss.npy'), data_ecog_lp_ss) spike_samples = tf.spikedetect(data_probe_hp, threshold_multiplier=6.5, bad_channels=probe_bad_channels) pl.save(os.path.join(memap_folder, 'spike_samples.npy'), spike_samples) spike_samples_clean = spike_samples for i in pl.arange(pl.size(spike_samples_clean)-1,-1,-1): data = data_probe_hp[:, spike_samples[i]-60:spike_samples[i]+60] stdevs = sp.std(data,1) if np.max(data) > 3000 or pl.any(stdevs>600): spike_samples_clean = pl.delete(spike_samples_clean, i) if i%100==0: print(i) spike_samples_clean = pl.delete(spike_samples_clean, 0) pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean) channels = np.empty(0) for i in pl.arange(0, pl.size(spike_samples_clean)): data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist()) channels = np.append(channels, np.argmax(data)) if i%100==0: print(i) channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)], columns=['Channels', 'Samples']) spike_times_shaftA = channels_spikes_df.Samples[0][channels_spikes_df.Channels[0]>7][channels_spikes_df.Channels[0]<16]
def win(board, letter): wins = logical_or(board == letter, board == 'T') return any(all(wins, 0)) or any(all(wins, 1)) or all(diag(wins)) or \ all(diag(rot90(wins)))
def setup(dm, key, data_list, rate_stoch): """ Generate the PyMC variables for a normal model of a function of age Parameters ---------- dm : dismod3.DiseaseModel the object containing all the data, priors, and additional information (like input and output age-mesh) key : str the name of the key for everything about this model (priors, initial values, estimations) data_list : list of data dicts the observed data to use in the beta-binomial liklihood function rate_stoch : pymc.Stochastic a PyMC stochastic (or deterministic) object, with len(rate_stoch.value) == len(dm.get_estimation_age_mesh()). Results ------- vars : dict Return a dictionary of all the relevant PyMC objects for the normal model. vars['rate_stoch'] is of particular relevance, for details see the beta_binomial_model """ vars = {} est_mesh = dm.get_estimate_age_mesh() if pl.any(pl.diff(est_mesh) != 1): raise ValueError, 'ERROR: Gaps in estimation age mesh must all equal 1' vars['rate_stoch'] = rate_stoch # set up priors and observed data prior_str = dm.get_priors(key) dismod3.utils.generate_prior_potentials(vars, prior_str, est_mesh) vars['observed_rates'] = [] for d in data_list: # set up observed stochs for all relevant data id = d['id'] if d['value'] == dismod3.settings.MISSING: print 'WARNING: data %d missing value' % id continue # ensure all rate data is valid d_val = dm.value_per_1(d) d_se = dm.se_per_1(d) if d['age_start'] < est_mesh[0] or d['age_end'] > est_mesh[-1]: raise ValueError, 'Data %d is outside of estimation range---([%d, %d] is not inside [%d, %d])' \ % (d['id'], d['age_start'], d['age_end'], est_mesh[0], est_mesh[-1]) age_indices = dismod3.utils.indices_for_range(est_mesh, d['age_start'], d['age_end']) age_weights = d.get('age_weights', pl.ones(len(age_indices)) / len(age_indices)) # data must have standard error to use normal model if d_se == 0: raise ValueError, 'Data %d has invalid standard error' % d['id'] print 'data %d: value = %f, se = %f' % (d['id'], d_val, d_se) @mc.observed @mc.stochastic(name='obs_%d' % id) def obs(f=rate_stoch, age_indices=age_indices, age_weights=age_weights, value=d_val, tau=1. / (d_se)**2): f_i = dismod3.utils.rate_for_range(f, age_indices, age_weights) return mc.normal_like(value, f_i, tau) vars['observed_rates'].append(obs) return vars
def setup(dm, key, data_list=[], rate_stoch=None, emp_prior={}, lower_bound_data=[]): """ Generate the PyMC variables for a negative-binomial model of a single rate function Parameters ---------- dm : dismod3.DiseaseModel the object containing all the data, priors, and additional information (like input and output age-mesh) key : str the name of the key for everything about this model (priors, initial values, estimations) data_list : list of data dicts the observed data to use in the negative binomial liklihood function rate_stoch : pymc.Stochastic, optional a PyMC stochastic (or deterministic) object, with len(rate_stoch.value) == len(dm.get_estimation_age_mesh()). This is used to link rate stochs into a larger model, for example. emp_prior : dict, optional the empirical prior dictionary, retrieved from the disease model if appropriate by:: >>> t, r, y, s = dismod3.utils.type_region_year_sex_from_key(key) >>> emp_prior = dm.get_empirical_prior(t) Results ------- vars : dict Return a dictionary of all the relevant PyMC objects for the rate model. vars['rate_stoch'] is of particular relevance; this is what is used to link the rate model into more complicated models, like the generic disease model. """ vars = {} est_mesh = dm.get_estimate_age_mesh() param_mesh = dm.get_param_age_mesh() if pl.any(pl.diff(est_mesh) != 1): raise ValueError, 'ERROR: Gaps in estimation age mesh must all equal 1' # calculate effective sample size for all data and lower bound data dm.calc_effective_sample_size(data_list) dm.calc_effective_sample_size(lower_bound_data) # generate regional covariates covariate_dict = dm.get_covariates() derived_covariate = dm.get_derived_covariate_values() X_region, X_study = regional_covariates(key, covariate_dict, derived_covariate) # use confidence prior from prior_str (only for posterior estimate, this is overridden below for empirical prior estimate) mu_delta = 1000. sigma_delta = 10. mu_log_delta = 3. 
sigma_log_delta = .25 from dismod3.settings import PRIOR_SEP_STR for line in dm.get_priors(key).split(PRIOR_SEP_STR): prior = line.strip().split() if len(prior) == 0: continue if prior[0] == 'heterogeneity': # originally designed for this: mu_delta = float(prior[1]) sigma_delta = float(prior[2]) # HACK: override design to set sigma_log_delta, # .25 = very, .025 = moderately, .0025 = slightly if float(prior[2]) > 0: sigma_log_delta = .025 / float(prior[2]) # use the empirical prior mean if it is available if len(set(emp_prior.keys()) & set(['alpha', 'beta', 'gamma'])) == 3: mu_alpha = pl.array(emp_prior['alpha']) sigma_alpha = pl.array(emp_prior['sigma_alpha']) alpha = pl.array(emp_prior['alpha']) # TODO: make this stochastic vars.update(region_coeffs=alpha) beta = pl.array(emp_prior['beta']) # TODO: make this stochastic sigma_beta = pl.array(emp_prior['sigma_beta']) vars.update(study_coeffs=beta) mu_gamma = pl.array(emp_prior['gamma']) sigma_gamma = pl.array(emp_prior['sigma_gamma']) # Do not inform dispersion parameter from empirical prior stage # if 'delta' in emp_prior: # mu_delta = emp_prior['delta'] # if 'sigma_delta' in emp_prior: # sigma_delta = emp_prior['sigma_delta'] else: import dismod3.regional_similarity_matrices as similarity_matrices n = len(X_region) mu_alpha = pl.zeros(n) sigma_alpha = .025 # TODO: make this a hyperparameter, with a traditional prior, like inverse gamma C_alpha = similarity_matrices.regions_nested_in_superregions(n, sigma_alpha) # use alternative region effect covariance structure if requested region_prior_key = 'region_effects' if region_prior_key in dm.params: if dm.params[region_prior_key] == 'uninformative': C_alpha = similarity_matrices.uninformative(n, sigma_alpha) region_prior_key = 'region_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] if region_prior_key in dm.params: if dm.params[region_prior_key] == 'uninformative': C_alpha = similarity_matrices.regions_nested_in_superregions(n, dm.params[region_prior_key]['std']) # add informative prior for sex effect if requested sex_prior_key = 'sex_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] if sex_prior_key in dm.params: print 'adjusting prior on sex effect coefficient for %s' % key mu_alpha[n-1] = pl.log(dm.params[sex_prior_key]['mean']) sigma_sex = (pl.log(dm.params[sex_prior_key]['upper_ci']) - pl.log(dm.params[sex_prior_key]['lower_ci'])) / (2*1.96) C_alpha[n-1, n-1]= sigma_sex**2. # add informative prior for time effect if requested time_prior_key = 'time_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex if time_prior_key in dm.params: print 'adjusting prior on time effect coefficient for %s' % key mu_alpha[n-2] = pl.log(dm.params[time_prior_key]['mean']) sigma_time = (pl.log(dm.params[time_prior_key]['upper_ci']) - pl.log(dm.params[time_prior_key]['lower_ci'])) / (2*1.96) C_alpha[n-2, n-2]= sigma_time**2. 
#C_alpha = similarity_matrices.all_related_equally(n, sigma_alpha) alpha = mc.MvNormalCov('region_coeffs_%s' % key, mu=mu_alpha, C=C_alpha, value=mu_alpha) vars.update(region_coeffs=alpha, region_coeffs_step_cov=.005*C_alpha) mu_beta = pl.zeros(len(X_study)) sigma_beta = .1 # add informative prior for beta effect if requested prior_key = 'beta_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex if prior_key in dm.params: print 'adjusting prior on beta effect coefficients for %s' % key mu_beta = pl.array(dm.params[prior_key]['mean']) sigma_beta = pl.array(dm.params[prior_key]['std']) beta = mc.Normal('study_coeffs_%s' % key, mu=mu_beta, tau=sigma_beta**-2., value=mu_beta) vars.update(study_coeffs=beta) mu_gamma = 0.*pl.ones(len(est_mesh)) sigma_gamma = 2.*pl.ones(len(est_mesh)) # add informative prior for gamma effect if requested prior_key = 'gamma_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex if prior_key in dm.params: print 'adjusting prior on gamma effect coefficients for %s' % key mu_gamma = pl.array(dm.params[prior_key]['mean']) sigma_gamma = pl.array(dm.params[prior_key]['std']) # always use dispersed prior on delta for empirical prior phase mu_log_delta = 3. sigma_log_delta = .25 # add informative prior for delta effect if requested prior_key = 'delta_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex if prior_key in dm.params: print 'adjusting prior on delta effect coefficients for %s' % key mu_log_delta = dm.params[prior_key]['mean'] sigma_log_delta = dm.params[prior_key]['std'] mu_zeta = 0. sigma_zeta = .25 # add informative prior for zeta effect if requested prior_key = 'zeta_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0] # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex if prior_key in dm.params: print 'adjusting prior on zeta effect coefficients for %s' % key mu_zeta = dm.params[prior_key]['mean'] sigma_zeta = dm.params[prior_key]['std'] if mu_delta != 0.: if sigma_delta != 0.: log_delta = mc.Normal('log_dispersion_%s' % key, mu=mu_log_delta, tau=sigma_log_delta**-2, value=3.) zeta = mc.Normal('zeta_%s'%key, mu=mu_zeta, tau=sigma_zeta**-2, value=mu_zeta) delta = mc.Lambda('dispersion_%s' % key, lambda x=log_delta: 50. 
+ 10.**x) vars.update(dispersion=delta, log_dispersion=log_delta, zeta=zeta, dispersion_step_sd=.1*log_delta.parents['tau']**-.5) else: delta = mc.Lambda('dispersion_%s' % key, lambda x=mu_delta: mu_delta) vars.update(dispersion=delta) else: delta = mc.Lambda('dispersion_%s' % key, lambda mu=mu_delta: 0) vars.update(dispersion=delta) if len(sigma_gamma) == 1: sigma_gamma = sigma_gamma[0]*pl.ones(len(est_mesh)) # create varible for interpolated rate; # also create variable for age-specific rate function, if it does not yet exist if rate_stoch: # if the rate_stoch already exists, for example prevalence in the generic model, # we use it to back-calculate mu and eventually gamma mu = rate_stoch @mc.deterministic(name='age_coeffs_%s' % key) def gamma(mu=mu, Xa=X_region, Xb=X_study, alpha=alpha, beta=beta): return pl.log(pl.maximum(dismod3.settings.NEARLY_ZERO, mu)) - pl.dot(alpha, Xa) - pl.dot(beta, Xb) @mc.potential(name='age_coeffs_potential_%s' % key) def gamma_potential(gamma=gamma, mu_gamma=mu_gamma, tau_gamma=1./sigma_gamma[param_mesh]**2, param_mesh=param_mesh): return mc.normal_like(gamma[param_mesh], mu_gamma[param_mesh], tau_gamma) vars.update(rate_stoch=mu, age_coeffs=gamma, age_coeffs_potential=gamma_potential) else: # if the rate_stoch does not yet exists, we make gamma a stoch, and use it to calculate mu # for computational efficiency, gamma is a linearly interpolated version of gamma_mesh initial_gamma = pl.log(dismod3.settings.NEARLY_ZERO + dm.get_initial_value(key)) gamma_mesh = mc.Normal('age_coeffs_mesh_%s' % key, mu=mu_gamma[param_mesh], tau=sigma_gamma[param_mesh]**-2, value=initial_gamma[param_mesh]) @mc.deterministic(name='age_coeffs_%s' % key) def gamma(gamma_mesh=gamma_mesh, param_mesh=param_mesh, est_mesh=est_mesh): return dismod3.utils.interpolate(param_mesh, gamma_mesh, est_mesh) @mc.deterministic(name=key) def mu(Xa=X_region, Xb=X_study, alpha=alpha, beta=beta, gamma=gamma): return predict_rate([Xa, Xb], alpha, beta, gamma, lambda f, age: f, est_mesh) # Create a guess at the covariance matrix for MCMC proposals to update gamma_mesh from pymc.gp.cov_funs import matern a = pl.atleast_2d(param_mesh).T C = matern.euclidean(a, a, diff_degree = 2, amp = 1.**2, scale = 10.) 
vars.update(age_coeffs_mesh=gamma_mesh, age_coeffs=gamma, rate_stoch=mu, age_coeffs_mesh_step_cov=.005*pl.array(C)) # adjust value of gamma_mesh based on priors, if necessary # TODO: implement more adjustments, currently only adjusted based on at_least priors for line in dm.get_priors(key).split(PRIOR_SEP_STR): prior = line.strip().split() if len(prior) == 0: continue if prior[0] == 'at_least': delta_gamma = pl.log(pl.maximum(mu.value, float(prior[1]))) - pl.log(mu.value) gamma_mesh.value = gamma_mesh.value + delta_gamma[param_mesh] # create potentials for priors dismod3.utils.generate_prior_potentials(vars, dm.get_priors(key), est_mesh) # create observed stochastics for data vars['data'] = [] if mu_delta != 0.: value = [] N = [] Xa = [] Xb = [] ai = [] aw = [] Xz = [] for d in data_list: try: age_indices, age_weights, Y_i, N_i = values_from(dm, d) except ValueError: debug('WARNING: could not calculate likelihood for data %d' % d['id']) continue value.append(Y_i*N_i) N.append(N_i) Xa.append(covariates(d, covariate_dict)[0]) Xb.append(covariates(d, covariate_dict)[1]) Xz.append(float(d.get('bias') or 0.)) ai.append(age_indices) aw.append(age_weights) vars['data'].append(d) N = pl.array(N) Xa = pl.array(Xa) Xb = pl.array(Xb) Xz = pl.array(Xz) value = pl.array(value) vars['effective_sample_size'] = list(N) if len(vars['data']) > 0: # TODO: consider using only a subset of the rates at each step of the fit to speed computation; say 100 of them k = 50000 if len(vars['data']) < k: data_sample = range(len(vars['data'])) else: import random @mc.deterministic(name='data_sample_%s' % key) def data_sample(n=len(vars['data']), k=k): return random.sample(range(n), k) @mc.deterministic(name='rate_%s' % key) def rates(S=data_sample, Xa=Xa, Xb=Xb, alpha=alpha, beta=beta, gamma=gamma, bounds_func=vars['bounds_func'], age_indices=ai, age_weights=aw): # calculate study-specific rate function shifts = pl.exp(pl.dot(Xa[S], alpha) + pl.dot(Xb[S], pl.atleast_1d(beta))) exp_gamma = pl.exp(gamma) mu = pl.zeros_like(shifts) for i,s in enumerate(S): mu[i] = pl.dot(age_weights[s], bounds_func(shifts[i] * exp_gamma[age_indices[s]], age_indices[s])) # TODO: evaluate speed increase and accuracy decrease of the following: #midpoint = age_indices[s][len(age_indices[s])/2] #mu[i] = bounds_func(shifts[i] * exp_gamma[midpoint], midpoint) # TODO: evaluate speed increase and accuracy decrease of the following: (to see speed increase, need to code this up using difference of running sums #mu[i] = pl.dot(pl.ones_like(age_weights[s]) / float(len(age_weights[s])), # bounds_func(shifts[i] * exp_gamma[age_indices[s]], age_indices[s])) return mu vars['expected_rates'] = rates @mc.observed @mc.stochastic(name='data_%s' % key) def obs(value=value, S=data_sample, N=N, mu_i=rates, Xz=Xz, zeta=zeta, delta=delta): #zeta_i = .001 #residual = pl.log(value[S] + zeta_i) - pl.log(mu_i*N[S] + zeta_i) #return mc.normal_like(residual, 0, 100. 
+ delta) logp = mc.negative_binomial_like(value[S], N[S]*mu_i, delta*pl.exp(Xz*zeta)) return logp vars['observed_counts'] = obs @mc.deterministic(name='predicted_data_%s' % key) def predictions(value=value, N=N, S=data_sample, mu=rates, delta=delta): r_S = mc.rnegative_binomial(N[S]*mu, delta)/N[S] r = pl.zeros(len(vars['data'])) r[S] = r_S return r vars['predicted_rates'] = predictions debug('likelihood of %s contains %d rates' % (key, len(vars['data']))) # now do the same thing for the lower bound data # TODO: refactor to remove duplicated code vars['lower_bound_data'] = [] value = [] N = [] Xa = [] Xb = [] ai = [] aw = [] for d in lower_bound_data: try: age_indices, age_weights, Y_i, N_i = values_from(dm, d) except ValueError: debug('WARNING: could not calculate likelihood for data %d' % d['id']) continue value.append(Y_i*N_i) N.append(N_i) Xa.append(covariates(d, covariate_dict)[0]) Xb.append(covariates(d, covariate_dict)[1]) ai.append(age_indices) aw.append(age_weights) vars['lower_bound_data'].append(d) N = pl.array(N) value = pl.array(value) if len(vars['lower_bound_data']) > 0: @mc.observed @mc.stochastic(name='lower_bound_data_%s' % key) def obs_lb(value=value, N=N, Xa=Xa, Xb=Xb, alpha=alpha, beta=beta, gamma=gamma, bounds_func=vars['bounds_func'], delta=delta, age_indices=ai, age_weights=aw): # calculate study-specific rate function shifts = pl.exp(pl.dot(Xa, alpha) + pl.dot(Xb, pl.atleast_1d(beta))) exp_gamma = pl.exp(gamma) mu_i = [pl.dot(weights, bounds_func(s_i * exp_gamma[ages], ages)) for s_i, ages, weights in zip(shifts, age_indices, age_weights)] # TODO: try vectorizing this loop to increase speed rate_param = mu_i*N violated_bounds = pl.nonzero(rate_param < value) logp = mc.negative_binomial_like(value[violated_bounds], rate_param[violated_bounds], delta) return logp vars['observed_lower_bounds'] = obs_lb debug('likelihood of %s contains %d lowerbounds' % (key, len(vars['lower_bound_data']))) return vars
def setup(dm, key, data_list, rate_stoch): """ Generate the PyMC variables for a normal model of a function of age Parameters ---------- dm : dismod3.DiseaseModel the object containing all the data, priors, and additional information (like input and output age-mesh) key : str the name of the key for everything about this model (priors, initial values, estimations) data_list : list of data dicts the observed data to use in the beta-binomial liklihood function rate_stoch : pymc.Stochastic a PyMC stochastic (or deterministic) object, with len(rate_stoch.value) == len(dm.get_estimation_age_mesh()). Results ------- vars : dict Return a dictionary of all the relevant PyMC objects for the normal model. vars['rate_stoch'] is of particular relevance, for details see the beta_binomial_model """ vars = {} est_mesh = dm.get_estimate_age_mesh() if pl.any(pl.diff(est_mesh) != 1): raise ValueError, 'ERROR: Gaps in estimation age mesh must all equal 1' vars['rate_stoch'] = rate_stoch # set up priors and observed data prior_str = dm.get_priors(key) dismod3.utils.generate_prior_potentials(vars, prior_str, est_mesh) vars['observed_rates'] = [] for d in data_list: # set up observed stochs for all relevant data id = d['id'] if d['value'] == dismod3.settings.MISSING: print 'WARNING: data %d missing value' % id continue # ensure all rate data is valid d_val = dm.value_per_1(d) d_se = dm.se_per_1(d) if d['age_start'] < est_mesh[0] or d['age_end'] > est_mesh[-1]: raise ValueError, 'Data %d is outside of estimation range---([%d, %d] is not inside [%d, %d])' \ % (d['id'], d['age_start'], d['age_end'], est_mesh[0], est_mesh[-1]) age_indices = dismod3.utils.indices_for_range(est_mesh, d['age_start'], d['age_end']) age_weights = d.get('age_weights', pl.ones(len(age_indices)) / len(age_indices)) # data must have standard error to use normal model if d_se == 0: raise ValueError, 'Data %d has invalid standard error' % d['id'] print 'data %d: value = %f, se = %f' % (d['id'], d_val, d_se) @mc.observed @mc.stochastic(name='obs_%d' % id) def obs(f=rate_stoch, age_indices=age_indices, age_weights=age_weights, value=d_val, tau=1./(d_se)**2): f_i = dismod3.utils.rate_for_range(f, age_indices, age_weights) return mc.normal_like(value, f_i, tau) vars['observed_rates'].append(obs) return vars
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'): ## generate simulated data a = pl.arange(0, 101, 1) pi_age_true = pi_true(a) import dismod3 import simplejson as json model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json())) gbd_hierarchy = model.hierarchy model = data_simulation.simple_model(N) model.hierarchy = gbd_hierarchy model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10) model.parameters['p']['smoothness'] = dict(amount=smoothness) model.parameters['p']['heterogeneity'] = heterogeneity age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int) age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int) age_weights = pl.ones_like(a) sum_pi_wt = pl.cumsum(pi_age_true*age_weights) sum_wt = pl.cumsum(age_weights*1.) p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start]) # correct cases where age_start == age_end i = age_start == age_end if pl.any(i): p[i] = pi_age_true[age_start[i]] model.input_data['age_start'] = age_start model.input_data['age_end'] = age_end model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N) from validate_covariates import alpha_true_sim area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR']) alpha = alpha_true_sim(model, area_list, sigma_true) print alpha model.input_data['true'] = pl.nan model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)] for i, a in model.input_data['area'].iteritems(): model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha])) p = model.input_data['true'] n = model.input_data['effective_sample_size'] model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n ## Then fit the model and compare the estimates to the truth model.vars = {} model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None) #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100) model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100) graphics.plot_one_ppc(model.vars['p'], 'p') graphics.plot_convergence_diag(model.vars) graphics.plot_one_type(model, model.vars['p'], {}, 'p') pl.plot(range(101), pi_age_true, 'r:', label='Truth') pl.legend(fancybox=True, shadow=True, loc='upper left') pl.show() model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean'] model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation'] data_simulation.add_quality_metrics(model.input_data) model.delta = pandas.DataFrame(dict(true=[delta_true])) model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean() model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std() data_simulation.add_quality_metrics(model.delta) model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)]) model.alpha['true'] = pandas.Series(dict(alpha)) model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns) model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns) model.alpha = 
model.alpha.dropna() data_simulation.add_quality_metrics(model.alpha) model.sigma = pandas.DataFrame(dict(true=sigma_true)) model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']] model.sigma['sigma_pred']=[n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']] data_simulation.add_quality_metrics(model.sigma) print 'delta' print model.delta print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(), pl.median(pl.absolute(model.input_data['rel_err'].dropna())), model.input_data['covered?'].mean()) model.mu = pandas.DataFrame(dict(true=pi_age_true, mu_pred=model.vars['p']['mu_age'].stats()['mean'], sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation'])) data_simulation.add_quality_metrics(model.mu) data_simulation.initialize_results(model) data_simulation.add_to_results(model, 'delta') data_simulation.add_to_results(model, 'mu') data_simulation.add_to_results(model, 'input_data') data_simulation.add_to_results(model, 'alpha') data_simulation.add_to_results(model, 'sigma') data_simulation.finalize_results(model) print model.results return model
Positions=Positions[1:] # =================== REMOVE OVERLAPS =================== from scipy.spatial.distance import pdist,squareform N=Positions.shape[0] dists=squareform(pdist(Positions)) # exclude the case of self-distance pl.fill_diagonal(dists, pl.inf) # first in, first served test= (dists<cutoff) print "- Cutting overlaps" picked=[] for p in range(N): if pl.any(test[p,:]): test[:,p]=False test[p,:]=False else: picked.append(p) No_overlaps=Positions[picked] print "\n====> Detected"+F.GREEN,No_overlaps.shape[0],F.RESET+"particles.\n" # ======================== SAVING THE RESULT ===================== # reorder the columns z,y,x=No_overlaps[:,0],No_overlaps[:,1],No_overlaps[:,2] outfile="Detected_"+filename.split('_')[0][-3:]+".txt" pl.savetxt(outfile,zip(x,y,z), fmt="%g")
def mean_covariate_model(name, mu, input_data, parameters, model, root_area, root_sex, root_year, zero_re=True): """ Generate PyMC objects covariate adjusted version of mu :Parameters: - `name` : str - `mu` : the unadjusted mean parameter for this node - `model` : ModelData to use for covariates - `root_area, root_sex, root_year` : str, str, int - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic :Results: - Returns dict of PyMC objects, including 'pi', the covariate adjusted predicted values for the mu and X provided """ n = len(input_data.index) # make U and alpha p_U = model.hierarchy.number_of_nodes() # random effects for area U = pandas.DataFrame(pl.zeros((n, p_U)), columns=model.hierarchy.nodes(), index=input_data.index) for i, row in input_data.T.iteritems(): if row['area'] not in model.hierarchy: print 'WARNING: "%s" not in model hierarchy, skipping random effects for this observation' % row[ 'area'] continue for level, node in enumerate( nx.shortest_path(model.hierarchy, 'all', input_data.ix[i, 'area'])): model.hierarchy.node[node]['level'] = level U.ix[i, node] = 1. for n2 in model.hierarchy.nodes(): for level, node in enumerate( nx.shortest_path(model.hierarchy, 'all', n2)): model.hierarchy.node[node]['level'] = level #U = U.select(lambda col: U[col].std() > 1.e-5, axis=1) # drop constant columns if len(U.index) == 0: U = pandas.DataFrame() else: U = U.select( lambda col: (U[col].max() > 0) and (model.hierarchy.node[col].get( 'level') > model.hierarchy.node[root_area]['level']), axis=1 ) # drop columns with only zeros and which are for higher levels in hierarchy #U = U.select(lambda col: model.hierarchy.node[col].get('level') <= 2, axis=1) # drop country-level REs #U = U.drop(['super-region_0', 'north_america_high_income', 'USA'], 1) #U = U.drop(['super-region_0', 'north_america_high_income'], 1) #U = U.drop(U.columns, 1) ## drop random effects with less than 1 observation or with all observations set to 1, unless they have an informative prior keep = [] if 'random_effects' in parameters: for re in parameters['random_effects']: if parameters['random_effects'][re].get('dist') == 'Constant': keep.append(re) U = U.select( lambda col: 1 <= U[col].sum() < len(U[col]) or col in keep, axis=1) U_shift = pandas.Series(0., index=U.columns) for level, node in enumerate( nx.shortest_path(model.hierarchy, 'all', root_area)): if node in U_shift: U_shift[node] = 1. 
U = U - U_shift sigma_alpha = [] for i in range(5): # max depth of hierarchy is 5 effect = 'sigma_alpha_%s_%d' % (name, i) if 'random_effects' in parameters and effect in parameters[ 'random_effects']: prior = parameters['random_effects'][effect] print 'using stored RE hyperprior for', effect, prior sigma_alpha.append( MyTruncatedNormal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2, min(prior['mu'], prior['lower']), max(prior['mu'], prior['upper']), value=prior['mu'])) else: sigma_alpha.append( MyTruncatedNormal(effect, .05, .03**-2, .05, .5, value=.1)) alpha = pl.array([]) const_alpha_sigma = pl.array([]) alpha_potentials = [] if len(U.columns) > 0: tau_alpha_index = [] for alpha_name in U.columns: tau_alpha_index.append(model.hierarchy.node[alpha_name]['level']) tau_alpha_index = pl.array(tau_alpha_index, dtype=int) tau_alpha_for_alpha = [sigma_alpha[i]**-2 for i in tau_alpha_index] alpha = [] for i, tau_alpha_i in enumerate(tau_alpha_for_alpha): effect = 'alpha_%s_%s' % (name, U.columns[i]) if 'random_effects' in parameters and U.columns[i] in parameters[ 'random_effects']: prior = parameters['random_effects'][U.columns[i]] print 'using stored RE for', effect, prior if prior['dist'] == 'Normal': alpha.append( mc.Normal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2, value=0.)) elif prior['dist'] == 'TruncatedNormal': alpha.append( MyTruncatedNormal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2, prior['lower'], prior['upper'], value=0.)) elif prior['dist'] == 'Constant': alpha.append(float(prior['mu'])) else: assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior[ 'dist'] else: alpha.append(mc.Normal(effect, 0, tau=tau_alpha_i, value=0)) # sigma for "constant" alpha const_alpha_sigma = [] for i, tau_alpha_i in enumerate(tau_alpha_for_alpha): effect = 'alpha_%s_%s' % (name, U.columns[i]) if 'random_effects' in parameters and U.columns[i] in parameters[ 'random_effects']: prior = parameters['random_effects'][U.columns[i]] if prior['dist'] == 'Constant': const_alpha_sigma.append(float(prior['sigma'])) else: const_alpha_sigma.append(pl.nan) else: const_alpha_sigma.append(pl.nan) if zero_re: column_map = dict([(n, i) for i, n in enumerate(U.columns)]) # change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic for parent in model.hierarchy: node_names = model.hierarchy.successors(parent) nodes = [column_map[n] for n in node_names if n in U] if len(nodes) > 0: i = nodes[0] old_alpha_i = alpha[i] # do not change if prior for this node has dist='constant' if parameters.get('random_effects', {}).get(U.columns[i], {}).get('dist') == 'Constant': continue alpha[i] = mc.Lambda( 'alpha_det_%s_%d' % (name, i), lambda other_alphas_at_this_level= [alpha[n] for n in nodes[1:]]: -sum(other_alphas_at_this_level)) if isinstance(old_alpha_i, mc.Stochastic): @mc.potential(name='alpha_pot_%s_%s' % (name, U.columns[i])) def alpha_potential(alpha=alpha[i], mu=old_alpha_i.parents['mu'], tau=old_alpha_i.parents['tau']): return mc.normal_like(alpha, mu, tau) alpha_potentials.append(alpha_potential) # make X and beta X = input_data.select(lambda col: col.startswith('x_'), axis=1) # add sex as a fixed effect (TODO: decide if this should be in data.py, when loading gbd model) X['x_sex'] = [sex_value[row['sex']] for i, row in input_data.T.iteritems()] beta = pl.array([]) const_beta_sigma = pl.array([]) X_shift = pandas.Series(0., index=X.columns) if len(X.columns) > 0: # shift columns to have zero for root covariate try: 
output_template = model.output_template.groupby([ 'area', 'sex', 'year' ]).mean( ) # TODO: change to .first(), but that doesn't work with old pandas except pandas.core.groupby.DataError: output_template = model.output_template.groupby( ['area', 'sex', 'year']).first() covs = output_template.filter(list(X.columns) + ['pop']) if len(covs.columns) > 1: leaves = [ n for n in nx.traversal.bfs_tree(model.hierarchy, root_area) if model.hierarchy.successors(n) == [] ] if len(leaves) == 0: # networkx returns an empty list when the bfs tree is a single node leaves = [root_area] if root_sex == 'total' and root_year == 'all': # special case for all years and sexes covs = covs.delevel().drop([ 'year', 'sex' ], axis=1).groupby('area').mean( ) # TODO: change to .reset_index(), but that doesn't work with old pandas leaf_covs = covs.ix[leaves] elif root_sex == 'total': raise Exception, 'root_sex == total, root_year != all is Not Yet Implemented' elif root_year == 'all': raise Exception, 'root_year == all, root_sex != total is Not Yet Implemented' else: leaf_covs = covs.ix[[(l, root_sex, root_year) for l in leaves]] for cov in covs: if cov != 'pop': X_shift[cov] = (leaf_covs[cov] * leaf_covs['pop'] ).sum() / leaf_covs['pop'].sum() if 'x_sex' in X.columns: X_shift['x_sex'] = sex_value[root_sex] X = X - X_shift assert not pl.any(pl.isnan( X.__array__())), 'Covariate matrix should have no missing values' beta = [] for i, effect in enumerate(X.columns): name_i = 'beta_%s_%s' % (name, effect) if 'fixed_effects' in parameters and effect in parameters[ 'fixed_effects']: prior = parameters['fixed_effects'][effect] print 'using stored FE for', name_i, effect, prior if prior['dist'] == 'TruncatedNormal': beta.append( MyTruncatedNormal( name_i, mu=float(prior['mu']), tau=pl.maximum(prior['sigma'], .001)**-2, a=prior['lower'], b=prior['upper'], value=.5 * (prior['lower'] + prior['upper']))) elif prior['dist'] == 'Normal': beta.append( mc.Normal(name_i, mu=float(prior['mu']), tau=pl.maximum(prior['sigma'], .001)**-2, value=float(prior['mu']))) elif prior['dist'] == 'Constant': beta.append(float(prior['mu'])) else: assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior[ 'dist'] else: beta.append(mc.Normal(name_i, mu=0., tau=1.**-2, value=0)) # sigma for "constant" beta const_beta_sigma = [] for i, effect in enumerate(X.columns): name_i = 'beta_%s_%s' % (name, effect) if 'fixed_effects' in parameters and effect in parameters[ 'fixed_effects']: prior = parameters['fixed_effects'][effect] if prior['dist'] == 'Constant': const_beta_sigma.append(float(prior.get('sigma', 1.e-6))) else: const_beta_sigma.append(pl.nan) else: const_beta_sigma.append(pl.nan) @mc.deterministic(name='pi_%s' % name) def pi(mu=mu, U=pl.array(U, dtype=float), alpha=alpha, X=pl.array(X, dtype=float), beta=beta): return mu * pl.exp( pl.dot(U, [float(x) for x in alpha]) + pl.dot(X, [float(x) for x in beta])) return dict(pi=pi, U=U, U_shift=U_shift, sigma_alpha=sigma_alpha, alpha=alpha, alpha_potentials=alpha_potentials, X=X, X_shift=X_shift, beta=beta, hierarchy=model.hierarchy, const_alpha_sigma=const_alpha_sigma, const_beta_sigma=const_beta_sigma)
def process_summary(summary_filename): if ('fake' in summary_filename) or \ ('H3' in summary_filename) or \ ('H4' in summary_filename) or \ ('H7' in summary_filename) or \ ('H8' in summary_filename): logging.debug("Skipping %s" % summary_filename) return summary = physio.summary.Summary(summary_filename) logging.debug("Processing %s" % summary._filename) # cull trials by success trials = summary.get_trials() if len(trials) == 0: logging.error("No trails for %s" % summary._filename) return trials = trials[trials['outcome'] == 0] # and gaze gaze = clean_gaze(summary.get_gaze()) if len(gaze) > 0: logging.debug("N Trials before gaze culling: %i" % len(trials)) trials = cull_trials_by_gaze(trials, gaze) logging.debug("N Trials after gaze culling: %i" % len(trials)) for ch in xrange(1, 33): for cl in summary.get_cluster_indices(ch): outdir = '%s/%s_%i_%i' % \ (resultsdir, os.path.basename(summary._filename), ch, cl) info_dict = {} logging.debug("ch: %i, cl: %i" % (ch, cl)) # rate spike_times = summary.get_spike_times(ch, cl) # find start of isolation isolation_start = physio.spikes.times.\ find_isolation_start_by_isi(spike_times) spike_times = spike_times[spike_times >= isolation_start] nspikes = len(spike_times) info_dict['nspikes'] = nspikes if nspikes < min_spikes: logging.warning("\t%i < min_spikes[%i]" % \ (nspikes, min_spikes)) continue trange = (spike_times.min(), spike_times.max()) # trange = summary.get_epoch_range() rate = nspikes / (trange[1] - trange[0]) info_dict['rate'] = rate if rate < min_rate: logging.warning("\t%i < min_rate[%i]" % \ (rate, min_rate)) continue # filter trials dtrials = summary.filter_trials(trials, \ {'name': {'value': 'BlueSquare', 'op': '!='}}, \ timeRange=trange) if len(dtrials) == 0: logging.error("Zero trials for %i %i %s" % \ (ch, cl, summary._filename)) continue # snr TODO # location try: location = summary.get_location(ch) except Exception as E: location = (0, 0, 0) print "Attempt to get location failed: %s" % str(E) info_dict['location'] = list(location) # significant bins #bins = summary.get_significant_bins(ch, cl, attr="name", \ # blacklist="BlueSquare", spike_times=spike_times, \ # timeRange=trange) if default_bins is None: bins = summary.get_significant_bins(ch, cl, trials=dtrials, \ spike_times=spike_times) else: bins = default_bins info_dict['bins'] = bins baseline = summary.get_baseline(ch, cl, prew, trials=trials, \ spike_times=spike_times) info_dict['baseline'] = baseline # selectivity #resps, means, stds, ns = summary.get_binned_response( \ # ch, cl, 'name', bins=bins, spike_times=spike_times, \ # blacklist="BlueSquare", timeRange=trange) resps, means, stds, ns = summary.get_binned_response( \ ch, cl, 'name', bins=bins, spike_times=spike_times, \ trials=dtrials, timeRange=trange) if len(resps) == 0: logging.warning("No responses") continue sel_index = physio.spikes.selectivity.selectivity(resps.values()) #if numpy.isnan(sel_index): # raise Exception("Selectivity is nan") sorted_names = sorted(resps, key=lambda k: resps[k]) info_dict['selectivity'] = sel_index info_dict['sorted_names'] = sorted_names if not os.path.exists(outdir): os.makedirs(outdir) with open(outdir + '/info_dict.p', 'w') as f: pickle.dump(info_dict, f, 2) with open(outdir + '/sel_info.p', 'w') as f: pickle.dump({'resps': resps, 'means': means, 'stds': stds, \ 'ns': ns}, f, 2) x = pylab.arange(len(resps)) y = pylab.zeros(len(resps)) err = pylab.zeros(len(resps)) pylab.figure(1) for (i, name) in enumerate(sorted_names): y[i] = resps[name] # TODO fix this to be something 
reasonable #err[i] = (pylab.sum(stds[name][bins]) / float(len(bins))) / \ # pylab.sqrt(ns[name]) err[i] = 0 pylab.errorbar(x, y, err) xl = pylab.xlim() pylab.xticks(x, sorted_names) pylab.xlim(xl) pylab.ylabel('average binned response') pylab.title('Selectivity: %.2f' % sel_index) pylab.savefig(outdir + '/by_name.png') pylab.close(1) # separability # get stims without bluesquare stims = summary.get_stimuli({'name': \ {'value': 'BlueSquare', 'op': '!='}}) attr_combinations = {} sep_info = {} for (ai, attr1) in enumerate(attrs[:-1]): uniques1 = numpy.unique(stims[attr1]) for attr2 in attrs[ai + 1:]: uniques2 = numpy.unique(stims[attr2]) if attr1 == attr2: continue M = summary.get_response_matrix(ch, cl, attr1, attr2, \ bins=bins, spike_times=spike_times, stims=stims, \ uniques1=uniques1, uniques2=uniques2, \ timeRange=trange, trials=dtrials) if M.shape[0] == 1 or M.shape[1] == 1: logging.warning("M.shape %s, skipping" % \ str(M.shape)) continue sep, spi, ps = physio.spikes.separability.\ separability_permutation(M) if not pylab.any(pylab.isnan(M)): pylab.figure(1) pylab.imshow(M, interpolation='nearest') pylab.colorbar() pylab.xlabel(attr2) xl = pylab.xlim() yl = pylab.ylim() pylab.xticks(range(len(uniques2)), uniques2) pylab.ylabel(attr1) pylab.yticks(range(len(uniques1)), uniques1) pylab.xlim(xl) pylab.ylim(yl) pylab.title('Sep: %s, %.4f, (%.3f, %.3f)' % \ (str(sep), spi, ps[0], ps[1])) pylab.savefig(outdir + '/%s_%s.png' % \ (attr1, attr2)) pylab.close(1) sep_info['_'.join((attr1, attr2))] = { \ 'sep': sep, 'spi': spi, 'ps': ps} with open(outdir + '/sep_info.p', 'w') as f: pickle.dump(sep_info, f, 2) # compute separability at each name name_sep_info = {} for name in sorted_names: stims = summary.get_stimuli({'name': name}) for (ai, attr1) in enumerate(attrs[:-1]): uniques1 = numpy.unique(stims[attr1]) for attr2 in attrs[ai + 1:]: uniques2 = numpy.unique(stims[attr2]) if attr1 == attr2 or \ attr1 == 'name' or attr2 == 'name': continue M = summary.get_response_matrix(ch, cl, attr1, \ attr2, bins=bins, spike_times=spike_times,\ stims=stims, uniques1=uniques1, \ uniques2=uniques2, timeRange=trange, \ trials=dtrials) if M.shape[0] == 1 or M.shape[1] == 1: logging.debug("M.shape incompatible" \ " with separability: %s" % \ str(M.shape)) continue else: sep, spi, ps = physio.spikes.separability.\ separability_permutation(M) if not pylab.any(pylab.isnan(M)): pylab.figure(1) pylab.imshow(M, interpolation='nearest') pylab.colorbar() pylab.xlabel(attr2) xl = pylab.xlim() yl = pylab.ylim() pylab.xticks(range(len(uniques2)), uniques2) pylab.ylabel(attr1) pylab.yticks(range(len(uniques1)), uniques1) pylab.xlim(xl) pylab.ylim(yl) pylab.title('Sep: %s, %.4f, (%.3f, %.3f)' \ % (str(sep), spi, ps[0], ps[1])) pylab.savefig(outdir + '/%s_%s_%s.png' % \ (name, attr1, attr2)) pylab.close(1) name_sep_info['_'.join((name, attr1, attr2))] \ = {'sep': sep, 'spi': spi, 'ps': ps} with open(outdir + '/name_sep_info.p', 'w') as f: pickle.dump(name_sep_info, f, 2)