Example #1
def __setitem__(self, wavelength, intensity):
    index, = pylab.where(self.wavelengths == wavelength)
    if pylab.any(index.shape):
        self.intensities[index] = intensity
    else:
        index, = pylab.where(self.wavelengths < wavelength)
        if pylab.any(index.shape):
            self.wavelengths = pylab.insert(self.wavelengths, index[-1] + 1,
                                            wavelength)
            self.intensities = pylab.insert(self.intensities, index[-1] + 1,
                                            intensity)
        else:
            self.wavelengths = pylab.insert(self.wavelengths, 0, wavelength)
            self.intensities = pylab.insert(self.intensities, 0, intensity)
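A minimal, self-contained sketch of how this sorted-insert __setitem__ behaves, assuming a hypothetical Spectrum-like container whose wavelengths and intensities are parallel pylab arrays kept in ascending wavelength order (the class name and driver values are illustrative, not from the original project):

import pylab

class Spectrum(object):
    """Hypothetical container keeping wavelengths sorted ascending."""
    def __init__(self):
        self.wavelengths = pylab.array([])
        self.intensities = pylab.array([])

    def __setitem__(self, wavelength, intensity):
        index, = pylab.where(self.wavelengths == wavelength)
        if pylab.any(index.shape):
            self.intensities[index] = intensity  # overwrite in place
        else:
            index, = pylab.where(self.wavelengths < wavelength)
            if pylab.any(index.shape):
                # insert just after the largest smaller wavelength
                self.wavelengths = pylab.insert(self.wavelengths, index[-1] + 1, wavelength)
                self.intensities = pylab.insert(self.intensities, index[-1] + 1, intensity)
            else:
                self.wavelengths = pylab.insert(self.wavelengths, 0, wavelength)
                self.intensities = pylab.insert(self.intensities, 0, intensity)

s = Spectrum()
s[650.0] = 0.8   # first point goes in at index 0
s[500.0] = 1.2   # smaller wavelength: inserted at the front
s[500.0] = 2.0   # existing wavelength: intensity overwritten in place
print(s.wavelengths)  # [ 500.  650.]
print(s.intensities)  # [ 2.   0.8]

The index.shape test works because pylab.where returns an empty index array (shape (0,)) when the wavelength is absent, and pylab.any over that shape tuple is then False.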
Example #2
def get_cod_data_all_causes(iso3='USA', age_group='1_4', sex='F'):
    """ TODO: write doc string for this function"""
    print 'loading', iso3, age_group, sex
    import glob

    cause_list = []
    fpath = '/home/j/Project/Causes of Death/Under Five Deaths/CoD Correct Input Data/v02_prep_%s/%s+*+%s+%s.csv' % (
        iso3, iso3, age_group, sex)
    #fpath = '/home/j/Project/GBD/dalynator/data/cod_correct_input_pos/run_9_cause_*.csv'  # use Mike's validation data
    fnames = glob.glob(fpath)

    # initialize input distribution array
    N = 990  # TODO: get this from the data files
    T = 32  # TODO: get this from the data files
    J = len(fnames)
    F = pl.zeros((N, T, J))

    # fill input distribution array with data from files
    for j, fname in enumerate(sorted(fnames)):
        cause = fname.split('+')[1]  # TODO: make this less brittle and clearer
        #cause = str(j) # use Mike's validation data causes
        print 'loading cause', cause
        F_j = pl.csv2rec(fname)

        for n in range(N):
            F[n, :, j] = F_j['ensemble_d%d' % (n + 1)] / F_j['envelope']
            #F[n, :, j] = F_j['d%d'%(n+1)]/F_j['envelope'] # use Mike's validation data

        assert not pl.any(
            pl.isnan(F)), '%s should have no missing values' % fname
        cause_list.append(cause)

    print 'loading complete'
    return F, cause_list
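The TODO above flags the cause parsing as brittle: fname.split('+')[1] grabs whatever sits between the first two '+' characters of the full path, so a '+' in any directory name would break it. A hypothetical helper that splits only the basename (not part of the original code):

import os

def cause_from_fname(fname):
    """Extract the cause field from a path like '.../USA+<cause>+1_4+F.csv'."""
    return os.path.basename(fname).split('+')[1]

print(cause_from_fname('/tmp/v02_prep_USA/USA+malaria+1_4+F.csv'))  # malaria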
Example #3
def dict_diff(dict1, dict2):
    """Return the difference between two dictionaries as a dictionary of key: [val1, val2] pairs.
    Keys unique to either dictionary are included as key: [val1, '-'] or key: ['-', val2]."""
    diff_keys = []
    common_keys = pylab.intersect1d(dict1.keys(), dict2.keys())
    for key in common_keys:
        if pylab.iterable(dict1[key]):
            if pylab.any(dict1[key] != dict2[key]):
                diff_keys.append(key)
        else:
            if dict1[key] != dict2[key]:
                diff_keys.append(key)

    dict1_unique = [key for key in dict1.keys() if key not in common_keys]
    dict2_unique = [key for key in dict2.keys() if key not in common_keys]

    diff = {}
    for key in diff_keys:
        diff[key] = [dict1[key], dict2[key]]

    for key in dict1_unique:
        diff[key] = [dict1[key], '-']

    for key in dict2_unique:
        diff[key] = ['-', dict2[key]]

    return diff
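A quick illustration of dict_diff's return format, with made-up inputs:

d1 = {'a': 1, 'b': [1, 2], 'c': 3}
d2 = {'a': 1, 'b': [1, 9], 'd': 4}
print(dict_diff(d1, d2))
# -> {'b': [[1, 2], [1, 9]], 'c': [3, '-'], 'd': ['-', 4]}  (key order may vary)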
Example #6
def log_pdf_full_mvn(x, mu, cov=None, invcov=None, logdet=None):
    """Log-density of a full-covariance multivariate normal; pass cov or invcov."""
    if cov is None:
        assert invcov is not None, "need cov or invcov"
    if invcov is None:
        invcov = sp.linalg.pinv2(cov)
        #invcov = np.linalg.pinv( cov )

    difx = x - mu
    if len(x.shape) > 1 or len(mu.shape) > 1:
        if len(x.shape) > 1:
            nVals = x.shape[0]
            dim = x.shape[1]
        else:
            nVals = mu.shape[0]
            dim = np.float(len(x))
        malhab = (np.dot(difx, invcov) * difx).sum(1)
    else:
        nVals = 1
        dim = np.float(len(x))
        malhab = np.dot(np.dot(difx, invcov), difx)

    if logdet is None:
        try:
            neglogdet = np.log(np.linalg.det(cov))  #
            logdet = -neglogdet
            #logdet = sum(numpy.log(numpy.linalg.svd(invcov)[1]))
        except:
            logdet = sum(np.log(np.diag(invcov)))
    #print str(-0.5*dim*numpy.log( 2.0 * numpy.pi ) )
    #print str(0.5*logdet)
    #print str(malhab)
    logpdf = -0.5 * dim * np.log(2.0 * np.pi) + 0.5 * logdet - 0.5 * malhab

    if pp.any(np.isnan(logpdf)) or pp.any(np.isinf(logpdf)):
        pdb.set_trace()
        print "********************************"
        print "********************************"
        print "log_pdf_full_mvn has inf"
        print logpdf
        print "********************************"
        print "********************************"
        return -np.inf
    return logpdf
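A sanity check one might run against scipy's reference density, assuming the function above sits in a script that defines its module-level aliases (np = numpy, sp = scipy with scipy.linalg imported, pp = pylab, plus pdb); note sp.linalg.pinv2 was removed in SciPy 1.8, where plain pinv is the stand-in:

import pdb
import numpy as np
import pylab as pp
import scipy as sp
import scipy.linalg
from scipy.stats import multivariate_normal

mu = np.zeros(3)
cov = 2.0 * np.eye(3)
x = np.random.randn(5, 3)

ref = multivariate_normal(mean=mu, cov=cov).logpdf(x)
print(np.allclose(log_pdf_full_mvn(x, mu, cov=cov), ref))  # True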
Example #7
    def mu_interval(weighted_sum_mu=weighted_sum_mu, cum_sum_weights=cum_sum_weights,
                    mu_age=mu_age,
                    age_start=pl.array(age_start, dtype=int),
                    age_end=pl.array(age_end, dtype=int)):
        mu = (weighted_sum_mu[age_end] - weighted_sum_mu[age_start]) / (cum_sum_weights[age_end] - cum_sum_weights[age_start])
        
        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            mu[i] = mu_age[age_start[i]]

        return mu
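The cumulative-sum trick behind mu_interval, in isolation: the weighted mean of mu_age over the half-open age interval (a, b] is a difference of two running sums, so every data row can be evaluated with array indexing. A toy check (the a == b case divides by zero, which is what the correction above handles):

import pylab as pl

mu_age = pl.arange(101, dtype=float)  # toy age pattern: mu(age) = age
weights = pl.ones_like(mu_age)
weighted_sum_mu = pl.cumsum(mu_age * weights)
cum_sum_weights = pl.cumsum(weights)

a, b = 10, 20
mu = (weighted_sum_mu[b] - weighted_sum_mu[a]) / (cum_sum_weights[b] - cum_sum_weights[a])
print(mu)  # 15.5, the mean of mu_age over ages 11..20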
Example #9
def main():
    df = get_df()
    for category in CATEGORIES:
        df = remove_small_counts(df, category)
        df.drop('count', axis=1, inplace=True)

    regression_df = pd.DataFrame()
    for category in CATEGORIES:
        dummies = pd.get_dummies(df[category])
        regression_df = pd.concat([regression_df, dummies], axis=1)

    regression_df['name'] = df['name']

    regression_df = regression_df.groupby('name').agg(lambda x: int(pylab.any(x)))
    print regression_df
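The groupby/any step collapses the dummy rows to a single presence row per name; a small illustration with made-up data:

import pandas as pd
import pylab

df = pd.DataFrame({'name': ['a', 'a', 'b'], 'color': ['red', 'blue', 'red']})
dummies = pd.get_dummies(df['color'])
dummies['name'] = df['name']
print(dummies.groupby('name').agg(lambda x: int(pylab.any(x))))
#       blue  red
# name
# a        1    1
# b        0    1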
Example #10
0
def generate_data(N, delta_true, pi_true, heterogeneity, bias, sigma_prior):
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount='Moderately')
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(10000, 100000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n * pl.exp(bias)

    emp_priors = {}
    emp_priors['p', 'mu'] = pi_age_true
    emp_priors['p', 'sigma'] = sigma_prior*pi_age_true
    model.emp_priors = emp_priors

    model.a = a
    model.pi_age_true = pi_age_true
    model.delta_true = delta_true

    return model
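The observation model in generate_data draws counts from PyMC2's rnegative_binomial(mu, alpha) with mean n*p and dispersion delta_true*n*p, then divides back out to a rate; a quick check with toy numbers that the simulated rates center on p:

import pymc as mc

p, n, delta_true = 0.1, 10000., 0.15
draws = mc.rnegative_binomial(n * p, delta_true * n * p, size=10000) / n
print(draws.mean())  # ~0.1 (and pl.exp(bias) above simply scales this mean)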
Example #11
0
def positive(f=sm.f_eval):
    if pl.any(f < 0.):
        return -pl.inf
    else:
        return 0.
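This reads like the body of a PyMC2 potential that rejects any state where f has a negative component (sm.f_eval is presumably such a node in the original model). A minimal sketch of the idiom with a made-up stochastic:

import pymc as mc
import pylab as pl

f = mc.Normal('f', mu=pl.ones(5), tau=100., value=pl.ones(5))

@mc.potential
def positive(f=f):
    """Add -inf to the joint log-probability whenever any component of f is negative."""
    if pl.any(f < 0.):
        return -pl.inf
    else:
        return 0.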
Example #12
def ellfit(x, y, wt=None):
    import pylab as pl
    # Calculate the best fit ellipse for an X and Y distribution, allowing
    # for weighting.
    # OUTPUTS:
    #   MAJOR - major axis in same units as x and y
    #   MINOR - minor axis in same units as x and y
    #   POSANG - the position angle CCW from the X=0 line of the coordinates
    #
    #   Adam: The intensity weighted major and minor values are equal to the
    #   second moment.
    #   For equal weighting by pixel (of the sort that
    #   might be done for blob analysis) the ellipse fit to the
    #   half-maximum area will have semimajor axis equal to 1./1.69536 the
    #   second moment. For the quarter maximum surface this is 1./1.19755.
    #
    #   i.e. if you run this with x,y down to zero intensity (like integrating
    #   to infinity), and wt=intensity, you get the second moments sig_major,
    #   sig_minor back
    #   if you run this with x,y down to half-intensity, and wt=None, you get
    #   sigx/1.6986 back  (not sure why my integral differs from his slightly)
    #
    #   but adam did not have the factor of 4 to turn eigenval into major axis
    #
    #   translation: if we run this with intensity weight, we get
    #   the second moment back (a sigma).  for flat weights i think he means
    #   the halfmax contour semimajor axis

    if wt is None:
        wt = x * 0.0 + 1.0

    tot_wt = wt.sum()

    # WEIGHTED X AND Y CENTERS
    x_ctr = (wt * x).sum() / tot_wt
    y_ctr = (wt * y).sum() / tot_wt

    # BUILD THE MATRIX
    i11 = (wt * (x - x_ctr)**2).sum() / tot_wt
    i22 = (wt * (y - y_ctr)**2).sum() / tot_wt
    i12 = (wt * (x - x_ctr) * (y - y_ctr)).sum() / tot_wt
    mat = [[i11, i12], [i12, i22]]

    # CATCH THE CASE OF ZERO DETERMINANT
    if pl.det(mat) == 0:
        return pl.nan, pl.nan, pl.nan

    if pl.any(pl.isnan(mat)):
        return pl.nan, pl.nan, pl.nan

    # WORK OUT THE EIGENVALUES
    evals, evec = pl.eig(mat)

    # PICK THE MAJOR AXIS
    absvals = pl.absolute(evals)
    major = absvals.max()
    maj_ind = pl.where(absvals == major)[0][0]
    major_vec = evec[:, maj_ind]  # eigenvectors are the columns of evec
    min_ind = 1 - maj_ind

    # WORK OUT THE ORIENTATION OF THE MAJOR AXIS
    posang = pl.arctan2(major_vec[1], major_vec[0])

    # compared to the original idl code, this code is returning
    # pi-the desired angle, so:
    #    posang=pl.pi-posang

    #    if posang<0: posang = posang+pl.pi

    # MAJOR AND MINOR AXIS SIZES
    # turn into real half-max major/minor axis
    major = pl.sqrt(evals[maj_ind]) * 4.
    minor = pl.sqrt(evals[min_ind]) * 4.

    return major, minor, posang
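A hypothetical check on an elongated Gaussian point cloud: with uniform weights the eigenvalues are the second moments of the cloud, so after the factor of 4 noted above, major/4 and minor/4 should recover the input sigmas:

import pylab as pl

x = 3.0 * pl.randn(100000)  # sigma = 3 along x
y = 1.0 * pl.randn(100000)  # sigma = 1 along y
major, minor, posang = ellfit(x, y)
print(major / 4.)  # ~3
print(minor / 4.)  # ~1
print(posang)      # ~0 (mod pi): the major axis lies along x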
Example #13
def positive(f=f):
    if pl.any(f < 0.):
        return -pl.inf
    else:
        return 0.
Example #14
def linear_norm(x, y, msk, eps=0.003, deps=0.001, nmin=2, nwin=3):
    '''Linear normalization of a slice of a spectra,
       assuming that the slice is centered on the line to normalized.
    '''

    bla = False
    blabla = False

    x = x[msk]
    y = y[msk]

    n = int(len(y) / 2.)
    yl = y[:n]
    yr = y[n:]  # right half; index i in yr maps back to y[i + n]

    # Criteria on the left of the central wavelength
    epsl, epsr = eps, eps
    while 1:
        critl = abs(max(yl) - yl) / max(yl)
        idx_yl = pl.where(critl <= epsl)[0]
        idx_yl = idx_yl.astype(int)
        if blabla:
            print " epsl:", epsl
            print " idx_yl, yl:", idx_yl, [y[i] for i in idx_yl]
        if pl.size(idx_yl) >= nmin:
            break
        else:
            epsl = epsl + deps

    # Criteria on the right of the central wavelength
    while 1:
        critr = abs(max(yr) - yr) / max(yr)
        idx_yr = pl.where(critr <= epsr)[0] + n
        idx_yr = idx_yr.astype(int)
        if blabla:
            print " epsr:", epsr
            print "idx_yr, yr:", idx_yr, [y[i] for i in idx_yr]
        if pl.size(idx_yr) >= nmin:
            break
        else:
            epsr = epsr + deps

    idx_y = pl.concatenate([idx_yl, idx_yr])

    if bla:
        print " nmin, nwin =", nmin, nwin
        print " Number of selected left continuum points:  ", idx_yl.size, "/", n
        print " Number of selected right continuum points: ", idx_yr.size, "/", n
        print " Number of selected continuum points:       ", idx_y.size, "/", y.size

    xs = [x[i] for i in idx_y]
    ys = [y[i] for i in idx_y]

    xs, ys = pl.asarray(xs), pl.asarray(ys)
    n_xs = xs.size

    # Mean value around selected points
    for ind, val in enumerate(ys):
        i = idx_y[ind] - nwin
        j = idx_y[ind] + nwin
        if i < 0:
            i = 0
        if j > len(y):
            j = len(y)
        ys[ind] = y[i:j].mean()

    if blabla:
        print "xs, ys", xs, ys

    A = pl.concatenate([xs, pl.ones(n_xs)])
    A = A.reshape((2, n_xs))
    w = pl.linalg.lstsq(A.T, ys)[0]

    # Test whether any value of w is NaN
    if pl.any(w != w):
        print "Problem with linalg.lstsq. Try to reduce eps or nmin."
        quit(1)

    a, b = w[0], w[1]

    if blabla:
        print "a =", a, "b =", b

    return a, b, xs, ys
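A hypothetical end-to-end run on a synthetic slice: a gently sloped continuum with a single absorption line at the center. The returned a, b describe the continuum, so dividing by a*x + b normalizes the spectrum:

import pylab as pl

x = pl.linspace(6560., 6570., 200)
y = 1.0 + 0.001 * (x - 6565.)                          # sloped continuum
y = y - 0.5 * pl.exp(-0.5 * ((x - 6565.) / 0.3) ** 2)  # absorption line
msk = pl.ones(len(x), dtype=bool)

a, b, xs, ys = linear_norm(x, y, msk)
y_norm = y / (a * x + b)  # continuum-normalized slice
print(a)  # slope, ~0.001
print(b)  # intercept; a * 6565. + b should be ~1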
Example #15
def __getitem__(self, wavelength):
    index, = pylab.where(self.wavelengths == wavelength)
    if pylab.any(index.shape):
        return self.intensities[index]
    else:
        return None
Example #17
print "\nLoading data from file "+F.CYAN+" "+filename+F.RESET+"\n"
Data=pl.loadtxt(filename)
N=Data.shape[0]

from scipy.spatial.distance import pdist,squareform
# compute distances
dists=squareform(pdist(Data))
# exclude the case of self-distance
pl.fill_diagonal(dists, pl.inf)
test= (dists<cutoff)

if(mode==1):
    picked=[]
    for p in range(N):
        if pl.any(test[p,:]):
            test[:,p]=False
            test[p,:]=False
        else:
            picked.append(p)
        No_overlaps=Data[picked]

if(mode==2):
    print "- Cutting out particles with at least two overlaps at distance <", cutoff,"..."
    picked=[]
    for p in range(N):
        # removing only double overlaps
        if(pl.sum(test[p][p:])>1):
            pass
        else:
            picked.append(p)
Example #18
def validate_age_integrating_model_sim(N=500,
                                       delta_true=.15,
                                       pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    #model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    #model.parameters['p']['smoothness'] = dict(amount='Very')

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] -
                                                       sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=25,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=model.vars['p']['mu_age'].stats()['mean'],
             sigma_pred=model.vars['p']['mu_age'].stats()
             ['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    print model.results

    return model
Example #19
def validate_consistent_model_sim(N=500,
                                  delta_true=.5,
                                  true=dict(i=quadratic,
                                            f=constant,
                                            r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.

    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) /
                                      float(len(types)),
                                      size=N)]

    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t * age_weights)

        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] -
                                                             sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type == t] = p_t[data_type == t]
    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total',
                                                   'all', {})
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars,
                                                           iter=10000,
                                                           burn=5000,
                                                           thin=25,
                                                           tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i + 1)
        pl.plot(a, sim[t]['mu_age'].value, 'w-', label='Truth', linewidth=2)
        pl.plot(a, sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)

    #graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    #pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type == t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(
        dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [
        pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr'
    ]
    model.delta['sigma_pred'] = [
        pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr'
    ]
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(
            dict(true=sim[t]['mu_age'].value,
                 mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                 sigma_pred=model.vars[t]['mu_age'].stats()
                 ['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.mu['abs_err'].mean(),
        pl.median(pl.absolute(
            model.mu['rel_err'].dropna())), model.mu['covered?'].mean())
    print

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.finalize_results(model)

    print model.results

    return model
Example #20
0
def evaluate_fit_quality(time, par, noise_values, trials, debug_plot=True):
    """
    present the results obtained by fit_quality(...) in a graphical way
    """
    mean_fit = []
    std_fit = []
    success_count = []
    allvals = []
    errors = []
    for noise in noise_values:
        mean, std, success, keys, a, errs = fit_quality(
            time, par, noise, trials)
        mean_fit.append(mean)
        std_fit.append(std)
        success_count.append(success)
        allvals.append(a)
        errors.append(errs)
        print(p.any(p.isnan(errs)))

    mean_fit = p.array(mean_fit)
    std_fit = p.array(std_fit)

    if debug_plot:
        p.figure()

        num_subplots = mean_fit.shape[1] + 1
        plot = None

        p.title("fit quality evaluation")
        for i in range(num_subplots - 1):
            plot = p.subplot(num_subplots, 1, 1 + i, sharex=plot)
            p.axhline(par[keys[i]], c="r")
            p.errorbar(noise_values, mean_fit[:, i], yerr=std_fit[:, i])
            p.semilogx()
            p.xlim(min(noise_values) / 2., max(noise_values) * 2.)
            p.ylabel(keys[i])

            for n, a, e in zip(noise_values, allvals, errors):
                # p.plot([n] * len(a[i]), a[i], "rx", alpha=.4)
                p.errorbar([n] * len(a[i]),
                           a[i],
                           yerr=e[i],
                           fmt="rx",
                           alpha=.4)

        p.subplot(num_subplots, 1, num_subplots, sharex=plot)
        p.plot(noise_values, [trials - x for x in success_count])
        p.ylabel("failure count (of {0} total)".format(trials))
        p.xlabel("noise / [value]")
        p.savefig("plots/example_fit_precision.pdf")

        p.figure()
        for i in range(num_subplots - 1):
            plot = p.subplot(num_subplots - 1, 1, 1 + i, sharex=plot)
            if i == 0:
                p.title("RMS of (fit - param) / estimated_error")
            p.axhline(1)
            p.axhline(0)
            for n, a, e in zip(noise_values, allvals, errors):
                rmsvals = p.sqrt(
                    p.mean(
                        ((p.array(a[i]) - par[keys[i]]) / p.array(e[i]))**2))
                p.plot([n], rmsvals, "go")
                print("rmsvals for noise={0}, param={1}:".format(n, keys[i]),
                      rmsvals, p.array(e[i]))
            p.ylim(0, None)
            p.ylabel(keys[i])

        p.xlabel("noise / [value]")
        p.savefig("plots/example_fit_error_estimate.pdf")

    return mean_fit, std_fit, success_count
Example #21
def age_specific_rate(model,
                      data_type,
                      reference_area='all',
                      reference_sex='total',
                      reference_year='all',
                      mu_age=None,
                      mu_age_parent=None,
                      sigma_age_parent=None,
                      rate_type='neg_binom',
                      lower_bound=None,
                      interpolation_method='linear',
                      include_covariates=True,
                      zero_re=False):
    # TODO: expose (and document) interface for alternative rate_type as well as other options,
    # record reference values in the model
    """ Generate PyMC objects for model of epidemological age-interval data

    :Parameters:
      - `model` : data.ModelData
      - `data_type` : str, one of 'i', 'r', 'f', 'p', or 'pf'
      - `reference_area, reference_sex, reference_year` : the node of the model to fit consistently
      - `mu_age` : pymc.Node, will be used as the age pattern, set to None if not needed
      - `mu_age_parent` : pymc.Node, will be used as the age pattern of the parent of the root area, set to None if not needed
      - `sigma_age_parent` : pymc.Node, will be used as the standard deviation of the age pattern, set to None if not needed
      - `rate_type` : str, optional. One of 'beta_binom', 'binom', 'log_normal', 'neg_binom', 'normal', 'offset_log_normal', or 'poisson'
      - `lower_bound` : str, optional, the data type to use as a lower bound, set to None if not needed
      - `interpolation_method` : str, optional, one of 'linear', 'nearest', 'zero', 'slinear', 'quadratic', or 'cubic'
      - `include_covariates` : boolean
      - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic

    :Results:
      - Returns dict of PyMC objects, including 'pi', the covariate adjusted predicted values for each row of data

    """
    name = data_type
    import data
    result = data.ModelVars()

    if (mu_age_parent != None and pl.any(pl.isnan(mu_age_parent))) \
           or (sigma_age_parent != None and pl.any(pl.isnan(sigma_age_parent))):
        mu_age_parent = None
        sigma_age_parent = None
        print 'WARNING: nan found in parent mu/sigma.  Ignoring'

    ages = pl.array(model.parameters['ages'])
    data = model.get_data(data_type)
    if lower_bound:
        lb_data = model.get_data(lower_bound)
    parameters = model.parameters.get(data_type, {})
    area_hierarchy = model.hierarchy

    vars = dismod3.data.ModelVars()
    vars += dict(data=data)

    if 'parameter_age_mesh' in parameters:
        knots = pl.array(parameters['parameter_age_mesh'])
    else:
        knots = pl.arange(ages[0], ages[-1] + 1, 5)

    smoothing_dict = {
        'No Prior': pl.inf,
        'Slightly': .5,
        'Moderately': .05,
        'Very': .005
    }
    if 'smoothness' in parameters:
        smoothing = smoothing_dict[parameters['smoothness']['amount']]
    else:
        smoothing = 0.

    if mu_age == None:
        vars.update(
            age_pattern.age_pattern(name,
                                    ages=ages,
                                    knots=knots,
                                    smoothing=smoothing,
                                    interpolation_method=interpolation_method))
    else:
        vars.update(dict(mu_age=mu_age, ages=ages))

    vars.update(
        expert_prior_model.level_constraints(name, parameters, vars['mu_age'],
                                             ages))
    vars.update(
        expert_prior_model.derivative_constraints(name, parameters,
                                                  vars['mu_age'], ages))

    if mu_age_parent != None:
        # setup a hierarchical prior on the similarity between the
        # consistent estimate here and (inconsistent) estimate for its
        # parent in the areas hierarchy
        #weight_dict = {'Unusable': 10., 'Slightly': 10., 'Moderately': 1., 'Very': .1}
        #weight = weight_dict[parameters['heterogeneity']]
        vars.update(
            similarity_prior_model.similar('parent_similarity_%s' % name,
                                           vars['mu_age'], mu_age_parent,
                                           sigma_age_parent, 0.))

        # also use this as the initial value for the age pattern, if it is not already specified
        if mu_age == None:
            if isinstance(mu_age_parent, mc.Node):  # TODO: test this code
                initial_mu = mu_age_parent.value
            else:
                initial_mu = mu_age_parent

            for i, k_i in enumerate(knots):
                vars['gamma'][i].value = (pl.log(
                    initial_mu[k_i - ages[0]])).clip(-12, 6)

    age_weights = pl.ones_like(vars['mu_age'].value)  # TODO: use age pattern appropriate to the rate type
    if len(data) > 0:
        vars.update(
            age_integrating_model.age_standardize_approx(
                name, age_weights, vars['mu_age'], data['age_start'],
                data['age_end'], ages))

        # uncomment the following to effectively remove all effects
        #if 'random_effects' in parameters:
        #    for i in range(5):
        #        effect = 'sigma_alpha_%s_%d' % (name, i)
        #        parameters['random_effects'][effect] = dict(dist='TruncatedNormal', mu=.0001, sigma=.00001, lower=.00009, upper=.00011)
        #if 'fixed_effects' in parameters:
        #    for effect in ['x_sex', 'x_LDI_id_Updated_7July2011']:
        #        parameters['fixed_effects'][effect] = dict(dist='normal', mu=.0001, sigma=.00001)

        if include_covariates:
            vars.update(
                covariate_model.mean_covariate_model(name,
                                                     vars['mu_interval'],
                                                     data,
                                                     parameters,
                                                     model,
                                                     reference_area,
                                                     reference_sex,
                                                     reference_year,
                                                     zero_re=zero_re))
        else:
            vars.update({'pi': vars['mu_interval']})

        ## ensure that all data has uncertainty quantified appropriately
        # first replace all missing se from ci
        missing_se = pl.isnan(
            data['standard_error']) | (data['standard_error'] < 0)
        data['standard_error'][missing_se] = (data['upper_ci'][missing_se] -
                                              data['lower_ci'][missing_se]) / (
                                                  2 * 1.96)

        # then replace all missing ess with se
        missing_ess = pl.isnan(data['effective_sample_size'])
        data['effective_sample_size'][missing_ess] = data['value'][
            missing_ess] * (1 - data['value'][missing_ess]
                            ) / data['standard_error'][missing_ess]**2

        if rate_type == 'neg_binom':

            # warn and drop data that doesn't have effective sample size quantified, or is non-positive
            missing_ess = pl.isnan(data['effective_sample_size']) | (
                data['effective_sample_size'] < 0)
            if sum(missing_ess) > 0:
                print 'WARNING: %d rows of %s data have invalid quantification of uncertainty.' % (
                    sum(missing_ess), name)
                data['effective_sample_size'][missing_ess] = 0.0

            # warn and change data where ess is unreasonably huge
            large_ess = data['effective_sample_size'] >= 1.e10
            if sum(large_ess) > 0:
                print 'WARNING: %d rows of %s data have effective sample size exceeding 10 billion.' % (
                    sum(large_ess), name)
                data['effective_sample_size'][large_ess] = 1.e10

            if 'heterogeneity' in parameters:
                lower_dict = {'Slightly': 9., 'Moderately': 3., 'Very': 1.}
                lower = lower_dict[parameters['heterogeneity']]
            else:
                lower = 1.

            # special case, treat pf data as poisson
            if data_type == 'pf':
                lower = 1.e12

            vars.update(
                covariate_model.dispersion_covariate_model(
                    name, data, lower, lower * 9.))

            vars.update(
                rate_model.neg_binom_model(name, vars['pi'], vars['delta'],
                                           data['value'],
                                           data['effective_sample_size']))
        elif rate_type == 'log_normal':

            # warn and drop data that doesn't have standard error quantified
            missing = pl.isnan(
                data['standard_error']) | (data['standard_error'] < 0)
            if sum(missing) > 0:
                print 'WARNING: %d rows of %s data have no quantification of uncertainty.' % (
                    sum(missing), name)
                data['standard_error'][missing] = 1.e6

            # TODO: allow options for alternative priors for sigma
            vars['sigma'] = mc.Uniform('sigma_%s' % name,
                                       lower=.0001,
                                       upper=1.,
                                       value=.01)
            #vars['sigma'] = mc.Exponential('sigma_%s'%name, beta=100., value=.01)
            vars.update(
                rate_model.log_normal_model(name, vars['pi'], vars['sigma'],
                                            data['value'],
                                            data['standard_error']))
        elif rate_type == 'normal':

            # warn and drop data that doesn't have standard error quantified
            missing = pl.isnan(
                data['standard_error']) | (data['standard_error'] < 0)
            if sum(missing) > 0:
                print 'WARNING: %d rows of %s data have no quantification of uncertainty.' % (
                    sum(missing), name)
                data['standard_error'][missing] = 1.e6

            vars['sigma'] = mc.Uniform('sigma_%s' % name,
                                       lower=.0001,
                                       upper=.1,
                                       value=.01)
            vars.update(
                rate_model.normal_model(name, vars['pi'], vars['sigma'],
                                        data['value'], data['standard_error']))
        elif rate_type == 'binom':
            missing_ess = pl.isnan(data['effective_sample_size']) | (
                data['effective_sample_size'] < 0)
            if sum(missing_ess) > 0:
                print 'WARNING: %d rows of %s data have invalid quantification of uncertainty.' % (
                    sum(missing_ess), name)
                data['effective_sample_size'][missing_ess] = 0.0
            vars += rate_model.binom(name, vars['pi'], data['value'],
                                     data['effective_sample_size'])
        elif rate_type == 'beta_binom':
            vars += rate_model.beta_binom(name, vars['pi'], data['value'],
                                          data['effective_sample_size'])
        elif rate_type == 'poisson':
            missing_ess = pl.isnan(data['effective_sample_size']) | (
                data['effective_sample_size'] < 0)
            if sum(missing_ess) > 0:
                print 'WARNING: %d rows of %s data have invalid quantification of uncertainty.' % (
                    sum(missing_ess), name)
                data['effective_sample_size'][missing_ess] = 0.0

            vars += rate_model.poisson(name, vars['pi'], data['value'],
                                       data['effective_sample_size'])
        elif rate_type == 'offset_log_normal':
            vars['sigma'] = mc.Uniform('sigma_%s' % name,
                                       lower=.0001,
                                       upper=10.,
                                       value=.01)
            vars += rate_model.offset_log_normal(name, vars['pi'],
                                                 vars['sigma'], data['value'],
                                                 data['standard_error'])
        else:
            raise Exception, 'rate_model "%s" not implemented' % rate_type
    else:
        if include_covariates:
            vars.update(
                covariate_model.mean_covariate_model(name, [],
                                                     data,
                                                     parameters,
                                                     model,
                                                     reference_area,
                                                     reference_sex,
                                                     reference_year,
                                                     zero_re=zero_re))
    if include_covariates:
        vars.update(
            expert_prior_model.covariate_level_constraints(
                name, model, vars, ages))

    if lower_bound and len(lb_data) > 0:
        vars['lb'] = age_integrating_model.age_standardize_approx(
            'lb_%s' % name, age_weights, vars['mu_age'], lb_data['age_start'],
            lb_data['age_end'], ages)

        if include_covariates:

            vars['lb'].update(
                covariate_model.mean_covariate_model('lb_%s' % name,
                                                     vars['lb']['mu_interval'],
                                                     lb_data,
                                                     parameters,
                                                     model,
                                                     reference_area,
                                                     reference_sex,
                                                     reference_year,
                                                     zero_re=zero_re))
        else:
            vars['lb'].update({'pi': vars['lb']['mu_interval']})

        vars['lb'].update(
            covariate_model.dispersion_covariate_model(
                'lb_%s' % name, lb_data, 1e12, 1e13)  # treat like poisson
        )

        ## ensure that all data has uncertainty quantified appropriately
        # first replace all missing se from ci
        missing_se = pl.isnan(
            lb_data['standard_error']) | (lb_data['standard_error'] <= 0)
        lb_data['standard_error'][missing_se] = (
            lb_data['upper_ci'][missing_se] -
            lb_data['lower_ci'][missing_se]) / (2 * 1.96)

        # then replace all missing ess with se
        missing_ess = pl.isnan(lb_data['effective_sample_size'])
        lb_data['effective_sample_size'][missing_ess] = lb_data['value'][
            missing_ess] * (1 - lb_data['value'][missing_ess]
                            ) / lb_data['standard_error'][missing_ess]**2

        # warn and drop lb_data that doesn't have effective sample size quantified
        missing_ess = pl.isnan(lb_data['effective_sample_size']) | (
            lb_data['effective_sample_size'] <= 0)
        if sum(missing_ess) > 0:
            print 'WARNING: %d rows of %s lower bound data have no quantification of uncertainty.' % (
                sum(missing_ess), name)
            lb_data['effective_sample_size'][missing_ess] = 1.0

        vars['lb'].update(
            rate_model.neg_binom_lower_bound_model(
                'lb_%s' % name, vars['lb']['pi'], vars['lb']['delta'],
                lb_data['value'], lb_data['effective_sample_size']))

    result[data_type] = vars
    return result
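The uncertainty imputation used in both the main-data and lower-bound branches above, pulled out as a worked example: a missing standard error is backed out of a 95% confidence interval as (upper - lower) / (2 * 1.96), and a missing effective sample size from the binomial relation value * (1 - value) / se**2 (numbers made up):

value, lower_ci, upper_ci = 0.10, 0.08, 0.12

standard_error = (upper_ci - lower_ci) / (2 * 1.96)
effective_sample_size = value * (1 - value) / standard_error**2

print(standard_error)          # 0.0102...
print(effective_sample_size)   # ~864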
Example #22
0
File: ism.py Project: peterhm/gbd
def age_specific_rate(
    model,
    data_type,
    reference_area="all",
    reference_sex="total",
    reference_year="all",
    mu_age=None,
    mu_age_parent=None,
    sigma_age_parent=None,
    rate_type="neg_binom",
    lower_bound=None,
    interpolation_method="linear",
    include_covariates=True,
    zero_re=False,
):
    # TODO: expose (and document) interface for alternative rate_type as well as other options,
    # record reference values in the model
    """ Generate PyMC objects for model of epidemological age-interval data

    :Parameters:
      - `model` : data.ModelData
      - `data_type` : str, one of 'i', 'r', 'f', 'p', or 'pf'
      - `reference_area, reference_sex, reference_year` : the node of the model to fit consistently
      - `mu_age` : pymc.Node, will be used as the age pattern, set to None if not needed
      - `mu_age_parent` : pymc.Node, will be used as the age pattern of the parent of the root area, set to None if not needed
      - `sigma_age_parent` : pymc.Node, will be used as the standard deviation of the age pattern, set to None if not needed
      - `rate_type` : str, optional. One of 'beta_binom', 'binom', 'log_normal_model', 'neg_binom', 'neg_binom_lower_bound_model', 'neg_binom_model', 'normal_model', 'offest_log_normal', or 'poisson'
      - `lower_bound` : 
      - `interpolation_method` : str, optional, one of 'linear', 'nearest', 'zero', 'slinear', 'quadratic, or 'cubic'
      - `include_covariates` : boolean
      - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic

    :Results:
      - Returns dict of PyMC objects, including 'pi', the covariate adjusted predicted values for each row of data

    """
    name = data_type
    import data

    result = data.ModelVars()

    if (mu_age_parent != None and pl.any(pl.isnan(mu_age_parent))) or (
        sigma_age_parent != None and pl.any(pl.isnan(sigma_age_parent))
    ):
        mu_age_parent = None
        sigma_age_parent = None
        print "WARNING: nan found in parent mu/sigma.  Ignoring"

    ages = pl.array(model.parameters["ages"])
    data = model.get_data(data_type)
    if lower_bound:
        lb_data = model.get_data(lower_bound)
    parameters = model.parameters.get(data_type, {})
    area_hierarchy = model.hierarchy

    vars = dismod3.data.ModelVars()
    vars += dict(data=data)

    if "parameter_age_mesh" in parameters:
        knots = pl.array(parameters["parameter_age_mesh"])
    else:
        knots = pl.arange(ages[0], ages[-1] + 1, 5)

    smoothing_dict = {"No Prior": pl.inf, "Slightly": 0.5, "Moderately": 0.05, "Very": 0.005}
    if "smoothness" in parameters:
        smoothing = smoothing_dict[parameters["smoothness"]["amount"]]
    else:
        smoothing = 0.0

    if mu_age == None:
        vars.update(
            age_pattern.age_pattern(
                name, ages=ages, knots=knots, smoothing=smoothing, interpolation_method=interpolation_method
            )
        )
    else:
        vars.update(dict(mu_age=mu_age, ages=ages))

    vars.update(expert_prior_model.level_constraints(name, parameters, vars["mu_age"], ages))
    vars.update(expert_prior_model.derivative_constraints(name, parameters, vars["mu_age"], ages))

    if mu_age_parent != None:
        # setup a hierarchical prior on the simliarity between the
        # consistent estimate here and (inconsistent) estimate for its
        # parent in the areas hierarchy
        # weight_dict = {'Unusable': 10., 'Slightly': 10., 'Moderately': 1., 'Very': .1}
        # weight = weight_dict[parameters['heterogeneity']]
        vars.update(
            similarity_prior_model.similar(
                "parent_similarity_%s" % name, vars["mu_age"], mu_age_parent, sigma_age_parent, 0.0
            )
        )

        # also use this as the initial value for the age pattern, if it is not already specified
        if mu_age == None:
            if isinstance(mu_age_parent, mc.Node):  # TODO: test this code
                initial_mu = mu_age_parent.value
            else:
                initial_mu = mu_age_parent

            for i, k_i in enumerate(knots):
                vars["gamma"][i].value = (pl.log(initial_mu[k_i - ages[0]])).clip(-12, 6)

    age_weights = pl.ones_like(vars["mu_age"].value)  # TODO: use age pattern appropriate to the rate type
    if len(data) > 0:
        vars.update(
            age_integrating_model.age_standardize_approx(
                name, age_weights, vars["mu_age"], data["age_start"], data["age_end"], ages
            )
        )

        # uncomment the following to effectively remove alleffects
        # if 'random_effects' in parameters:
        #    for i in range(5):
        #        effect = 'sigma_alpha_%s_%d' % (name, i)
        #        parameters['random_effects'][effect] = dict(dist='TruncatedNormal', mu=.0001, sigma=.00001, lower=.00009, upper=.00011)
        # if 'fixed_effects' in parameters:
        #    for effect in ['x_sex', 'x_LDI_id_Updated_7July2011']:
        #        parameters['fixed_effects'][effect] = dict(dist='normal', mu=.0001, sigma=.00001)

        if include_covariates:
            vars.update(
                covariate_model.mean_covariate_model(
                    name,
                    vars["mu_interval"],
                    data,
                    parameters,
                    model,
                    reference_area,
                    reference_sex,
                    reference_year,
                    zero_re=zero_re,
                )
            )
        else:
            vars.update({"pi": vars["mu_interval"]})

        ## ensure that all data has uncertainty quantified appropriately
        # first replace all missing se from ci
        missing_se = pl.isnan(data["standard_error"]) | (data["standard_error"] < 0)
        data["standard_error"][missing_se] = (data["upper_ci"][missing_se] - data["lower_ci"][missing_se]) / (2 * 1.96)

        # then replace all missing ess with se
        missing_ess = pl.isnan(data["effective_sample_size"])
        data["effective_sample_size"][missing_ess] = (
            data["value"][missing_ess] * (1 - data["value"][missing_ess]) / data["standard_error"][missing_ess] ** 2
        )

        if rate_type == "neg_binom":

            # warn and drop data that doesn't have effective sample size quantified, or is is non-positive
            missing_ess = pl.isnan(data["effective_sample_size"]) | (data["effective_sample_size"] < 0)
            if sum(missing_ess) > 0:
                print "WARNING: %d rows of %s data has invalid quantification of uncertainty." % (
                    sum(missing_ess),
                    name,
                )
                data["effective_sample_size"][missing_ess] = 0.0

            # warn and change data where ess is unreasonably huge
            large_ess = data["effective_sample_size"] >= 1.0e10
            if sum(large_ess) > 0:
                print "WARNING: %d rows of %s data have effective sample size exceeding 10 billion." % (
                    sum(large_ess),
                    name,
                )
                data["effective_sample_size"][large_ess] = 1.0e10

            if "heterogeneity" in parameters:
                lower_dict = {"Slightly": 9.0, "Moderately": 3.0, "Very": 1.0}
                lower = lower_dict[parameters["heterogeneity"]]
            else:
                lower = 1.0

            # special case, treat pf data as poisson
            if data_type == "pf":
                lower = 1.0e12

            vars.update(covariate_model.dispersion_covariate_model(name, data, lower, lower * 9.0))

            vars.update(
                rate_model.neg_binom_model(
                    name, vars["pi"], vars["delta"], data["value"], data["effective_sample_size"]
                )
            )
        elif rate_type == "log_normal":

            # warn and drop data that doesn't have effective sample size quantified
            missing = pl.isnan(data["standard_error"]) | (data["standard_error"] < 0)
            if sum(missing) > 0:
                print "WARNING: %d rows of %s data has no quantification of uncertainty." % (sum(missing), name)
                data["standard_error"][missing] = 1.0e6

            # TODO: allow options for alternative priors for sigma
            vars["sigma"] = mc.Uniform("sigma_%s" % name, lower=0.0001, upper=1.0, value=0.01)
            # vars['sigma'] = mc.Exponential('sigma_%s'%name, beta=100., value=.01)
            vars.update(
                rate_model.log_normal_model(name, vars["pi"], vars["sigma"], data["value"], data["standard_error"])
            )
        elif rate_type == "normal":

            # warn and drop data that doesn't have standard error quantified
            missing = pl.isnan(data["standard_error"]) | (data["standard_error"] < 0)
            if sum(missing) > 0:
                print "WARNING: %d rows of %s data has no quantification of uncertainty." % (sum(missing), name)
                data["standard_error"][missing] = 1.0e6

            vars["sigma"] = mc.Uniform("sigma_%s" % name, lower=0.0001, upper=0.1, value=0.01)
            vars.update(rate_model.normal_model(name, vars["pi"], vars["sigma"], data["value"], data["standard_error"]))
        elif rate_type == "binom":
            missing_ess = pl.isnan(data["effective_sample_size"]) | (data["effective_sample_size"] < 0)
            if sum(missing_ess) > 0:
                print "WARNING: %d rows of %s data has invalid quantification of uncertainty." % (
                    sum(missing_ess),
                    name,
                )
                data["effective_sample_size"][missing_ess] = 0.0
            vars += rate_model.binom(name, vars["pi"], data["value"], data["effective_sample_size"])
        elif rate_type == "beta_binom":
            vars += rate_model.beta_binom(name, vars["pi"], data["value"], data["effective_sample_size"])
        elif rate_type == "poisson":
            missing_ess = pl.isnan(data["effective_sample_size"]) | (data["effective_sample_size"] < 0)
            if sum(missing_ess) > 0:
                print "WARNING: %d rows of %s data has invalid quantification of uncertainty." % (
                    sum(missing_ess),
                    name,
                )
                data["effective_sample_size"][missing_ess] = 0.0

            vars += rate_model.poisson(name, vars["pi"], data["value"], data["effective_sample_size"])
        elif rate_type == "offset_log_normal":
            vars["sigma"] = mc.Uniform("sigma_%s" % name, lower=0.0001, upper=10.0, value=0.01)
            vars += rate_model.offset_log_normal(name, vars["pi"], vars["sigma"], data["value"], data["standard_error"])
        else:
            raise Exception, 'rate_model "%s" not implemented' % rate_type
    else:
        if include_covariates:
            vars.update(
                covariate_model.mean_covariate_model(
                    name, [], data, parameters, model, reference_area, reference_sex, reference_year, zero_re=zero_re
                )
            )
    if include_covariates:
        vars.update(expert_prior_model.covariate_level_constraints(name, model, vars, ages))

    if lower_bound and len(lb_data) > 0:
        vars["lb"] = age_integrating_model.age_standardize_approx(
            "lb_%s" % name, age_weights, vars["mu_age"], lb_data["age_start"], lb_data["age_end"], ages
        )

        if include_covariates:

            vars["lb"].update(
                covariate_model.mean_covariate_model(
                    "lb_%s" % name,
                    vars["lb"]["mu_interval"],
                    lb_data,
                    parameters,
                    model,
                    reference_area,
                    reference_sex,
                    reference_year,
                    zero_re=zero_re,
                )
            )
        else:
            vars["lb"].update({"pi": vars["lb"]["mu_interval"]})

        vars["lb"].update(
            covariate_model.dispersion_covariate_model("lb_%s" % name, lb_data, 1e12, 1e13)  # treat like poisson
        )

        ## ensure that all data has uncertainty quantified appropriately
        # first replace all missing se from ci
        missing_se = pl.isnan(lb_data["standard_error"]) | (lb_data["standard_error"] <= 0)
        lb_data["standard_error"][missing_se] = (lb_data["upper_ci"][missing_se] - lb_data["lower_ci"][missing_se]) / (
            2 * 1.96
        )

        # then replace all missing ess with se
        missing_ess = pl.isnan(lb_data["effective_sample_size"])
        lb_data["effective_sample_size"][missing_ess] = (
            lb_data["value"][missing_ess]
            * (1 - lb_data["value"][missing_ess])
            / lb_data["standard_error"][missing_ess] ** 2
        )

        # warn and drop lb_data that doesn't have effective sample size quantified
        missing_ess = pl.isnan(lb_data["effective_sample_size"]) | (lb_data["effective_sample_size"] <= 0)
        if sum(missing_ess) > 0:
            print "WARNING: %d rows of %s lower bound data has no quantification of uncertainty." % (
                sum(missing_ess),
                name,
            )
            lb_data["effective_sample_size"][missing_ess] = 1.0

        vars["lb"].update(
            rate_model.neg_binom_lower_bound_model(
                "lb_%s" % name,
                vars["lb"]["pi"],
                vars["lb"]["delta"],
                lb_data["value"],
                lb_data["effective_sample_size"],
            )
        )

    result[data_type] = vars
    return result
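The missing-uncertainty back-fill used above (and in the lower-bound branch) follows a fixed recipe: recover a standard error from the 95% CI half-width, then an effective sample size from the binomial relation ess = p * (1 - p) / se**2. A minimal standalone sketch with hypothetical toy rows, using numpy directly:

import numpy as np

value = np.array([0.10, 0.20, 0.05])
se = np.array([0.01, np.nan, np.nan])
lower_ci = np.array([np.nan, 0.15, 0.03])
upper_ci = np.array([np.nan, 0.25, 0.07])
ess = np.array([np.nan, np.nan, 400.])

# first replace all missing se with the CI half-width over 1.96
missing_se = np.isnan(se) | (se <= 0)
se[missing_se] = (upper_ci[missing_se] - lower_ci[missing_se]) / (2 * 1.96)

# then replace all missing ess with the binomial approximation p*(1-p)/se**2
missing_ess = np.isnan(ess)
ess[missing_ess] = value[missing_ess] * (1 - value[missing_ess]) / se[missing_ess]**2

print(ess)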
Example #23
0
def validate_consistent_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], 
                           true=dict(i=quadratic, f=constant, r=constant)):
    types = pl.array(['i', 'r', 'f', 'p'])

    ## generate simulated data
    model = data_simulation.simple_model(N)
    model.input_data['effective_sample_size'] = 1.
    model.input_data['value'] = 0.
    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    sim = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    for t in 'irf':
        for i, k_i in enumerate(sim[t]['knots']):
            sim[t]['gamma'][i].value = pl.log(true[t](k_i))

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    data_type = types[mc.rcategorical(pl.ones(len(types), dtype=float) / float(len(types)), size=N)]


    a = pl.arange(101)
    age_weights = pl.ones_like(a)
    sum_wt = pl.cumsum(age_weights)

    p = pl.zeros(N)
    for t in types:
        mu_t = sim[t]['mu_age'].value
        sum_mu_wt = pl.cumsum(mu_t*age_weights)
    
        p_t = (sum_mu_wt[age_end] - sum_mu_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

        # correct cases where age_start == age_end
        i = age_start == age_end
        if pl.any(i):
            p_t[i] = mu_t[age_start[i]]

        # copy part into p
        p[data_type==t] = p_t[data_type==t]


    # add covariate shifts
    import dismod3
    import simplejson as json
    gbd_model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.hierarchy = gbd_model.hierarchy

    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = {}
    for t in types:
        alpha[t] = alpha_true_sim(model, area_list, sigma_true)
    print json.dumps(alpha, indent=2)

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]
    
    for i, a in model.input_data['area'].iteritems():
        t = data_type[i]
        p[i] = p[i] * pl.exp(pl.sum([alpha[t][n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    n = mc.runiform(100, 10000, size=N)

    model.input_data['data_type'] = data_type
    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n

    # coarse knot spacing for fast testing
    for t in types:
        model.parameters[t]['parameter_age_mesh'] = range(0, 101, 20)

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars = consistent_model.consistent_model(model, 'all', 'total', 'all', {})
    #model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=101, burn=0, thin=1, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_consistent_model(model.vars, iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_convergence_diag(model.vars)

    graphics.plot_fit(model, model.vars, {}, {})
    for i, t in enumerate('i r f p rr pf'.split()):
        pl.subplot(2, 3, i+1)
        pl.plot(range(101), sim[t]['mu_age'].value, 'w-', linewidth=2)  # white underlay for visibility
        pl.plot(range(101), sim[t]['mu_age'].value, 'r-', label='Truth', linewidth=1)

    pl.show()

    model.input_data['mu_pred'] = 0.
    model.input_data['sigma_pred'] = 0.
    for t in types:
        model.input_data['mu_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['mean']
        model.input_data['sigma_pred'][data_type==t] = model.vars[t]['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true for t in types if t != 'rr']))
    model.delta['mu_pred'] = [pl.exp(model.vars[t]['eta'].trace()).mean() for t in types if t != 'rr']
    model.delta['sigma_pred'] = [pl.exp(model.vars[t]['eta'].trace()).std() for t in types if t != 'rr']
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame()
    model.sigma = pandas.DataFrame()
    for t in types:
        alpha_t = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
        alpha_t['true'] = pandas.Series(dict(alpha[t]))
        alpha_t['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars[t]['alpha']], index=model.vars[t]['U'].columns)
        alpha_t['type'] = t
        model.alpha = model.alpha.append(alpha_t.dropna(), ignore_index=True)

        sigma_t = pandas.DataFrame(dict(true=sigma_true))
        sigma_t['mu_pred'] = [n.stats()['mean'] for n in model.vars[t]['sigma_alpha']]
        sigma_t['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars[t]['sigma_alpha']]
        model.sigma = model.sigma.append(sigma_t.dropna(), ignore_index=True)

    data_simulation.add_quality_metrics(model.alpha)
    data_simulation.add_quality_metrics(model.sigma)


    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame()
    for t in types:
        model.mu = model.mu.append(pandas.DataFrame(dict(true=sim[t]['mu_age'].value,
                                                         mu_pred=model.vars[t]['mu_age'].stats()['mean'],
                                                         sigma_pred=model.vars[t]['mu_age'].stats()['standard deviation'])),
                                   ignore_index=True)
    data_simulation.add_quality_metrics(model.mu)
    print '\nparam prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.mu['abs_err'].mean(),
                                                                         pl.median(pl.absolute(model.mu['rel_err'].dropna())),
                                                                         model.mu['covered?'].mean())
    print


    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
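Each simulated row above is shifted by the random effects along its path through the area hierarchy: p is multiplied by exp(sum of the alphas from 'all' down to the row's area). A toy illustration with a hypothetical three-node path:

import numpy as np

alpha = {'all': 0.0, 'super-region_3': 0.1, 'EGY': -0.2}  # hypothetical effects
path = ['all', 'super-region_3', 'EGY']  # as returned by nx.shortest_path(hierarchy, 'all', 'EGY')

p_base = 0.05
p_shifted = p_base * np.exp(sum(alpha[n] for n in path if n in alpha))
print(p_shifted)  # 0.05 * exp(-0.1)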
Example #24
0
 import sys
 import numpy as np
 import pylab as pl

 # fragment: assumes CaseIn (a list of [K, C, S] rows) and ncase are defined by the surrounding script
 #for ncase in range(1):
 #for ncase in [6]:
 KK = np.uint64(CaseIn[ncase][0])
 CC = np.uint64(CaseIn[ncase][1])
 SS = np.uint64(CaseIn[ncase][2])
 if (CC * SS < KK):
     print("Case #{}: IMPOSSIBLE".format(ncase + 1))
 else:
     remaining = np.array([np.uint64(jj) for jj in range(KK)])
     place = np.arange(min(KK, CC) - 1, -1, -1, dtype=np.uint64)
     #place = np.arange(0, CC, dtype = np.uint64)
     Kpow = np.array([KK**cj for cj in place])
     ##With limits at 1e+18, should be no problem with int64
     sample_pos = []
     if (len(Kpow) > 1):
         if (pl.any(np.diff(np.double(Kpow)) >= 0)):
             # Kpow must be strictly decreasing; equality or growth means K**C overflowed uint64
             print('unexpected overflow')
             sys.exit(1)
     while (len(remaining) >= CC):
         ##Break off the next word and convert to position
         pos_baseK = remaining[:CC]
         remaining = remaining[CC:]
         ##Generate a position
         ## (Kpow * (0*pos_baseK)).sum() + np.uint64(1)
         posj = (Kpow * pos_baseK).sum() + np.uint64(1)
         sample_pos.append(posj)
     if (len(remaining) > 0):
         pos_baseK = remaining
         posj = (Kpow[:len(pos_baseK)] * pos_baseK).sum() + np.uint64(1)
         sample_pos.append(posj)
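The fragment packs consecutive blocks of CC base-K digits into a single 1-based position, most significant digit first. A standalone sketch of the encoding, with hypothetical K=10 and C=3 so the digits [1, 2, 3] map to 124:

import numpy as np

K, C = np.uint64(10), 3
place = np.arange(C - 1, -1, -1, dtype=np.uint64)
Kpow = K ** place  # [100, 10, 1]

digits = np.array([1, 2, 3], dtype=np.uint64)
pos = (Kpow * digits).sum() + np.uint64(1)
print(pos)  # 123 + 1 = 124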
Example #25
0
def validate_age_integrating_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    #model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    #model.parameters['p']['smoothness'] = dict(amount='Very')

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                                        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    print model.results

    return model
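The interval average computed above with cumulative sums is the mean of the age pattern over (age_start, age_end]; a quick check against a direct mean, with a hypothetical linear age pattern:

import numpy as np

a = np.arange(0, 101)
pi = 0.001 * a  # hypothetical age pattern
wt = np.ones_like(a, dtype=float)

sum_pi_wt = np.cumsum(pi * wt)
sum_wt = np.cumsum(wt)

start, end = 20, 40
approx = (sum_pi_wt[end] - sum_pi_wt[start]) / (sum_wt[end] - sum_wt[start])
exact = pi[start + 1:end + 1].mean()  # the cumsum form averages ages start+1..end
print(approx, exact)  # identical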
Example #26
0
def mean_covariate_model(name, mu, input_data, parameters, model, root_area, root_sex, root_year, zero_re=True):
    """ Generate PyMC objects covariate adjusted version of mu

    :Parameters:
      - `name` : str
      - `mu` : the unadjusted mean parameter for this node
      - `model` : ModelData to use for covariates
      - `root_area, root_sex, root_year` : str, str, int
      - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic

    :Results:
      - Returns dict of PyMC objects, including 'pi', the covariate adjusted predicted values for the mu and X provided

    """
    n = len(input_data.index)

    # make U and alpha
    p_U = model.hierarchy.number_of_nodes()  # random effects for area
    U = pandas.DataFrame(pl.zeros((n, p_U)), columns=model.hierarchy.nodes(), index=input_data.index)
    for i, row in input_data.T.iteritems():
        if row['area'] not in model.hierarchy:
            print 'WARNING: "%s" not in model hierarchy, skipping random effects for this observation' % row['area']
            continue
        
        for level, node in enumerate(nx.shortest_path(model.hierarchy, 'all', input_data.ix[i, 'area'])):
            model.hierarchy.node[node]['level'] = level
            U.ix[i, node] = 1.
            
    for n2 in model.hierarchy.nodes():
        for level, node in enumerate(nx.shortest_path(model.hierarchy, 'all', n2)):
            model.hierarchy.node[node]['level'] = level
                        
    #U = U.select(lambda col: U[col].std() > 1.e-5, axis=1)  # drop constant columns
    if len(U.index) == 0:
        U = pandas.DataFrame()
    else:
        U = U.select(lambda col: (U[col].max() > 0) and (model.hierarchy.node[col].get('level') > model.hierarchy.node[root_area]['level']), axis=1)  # drop columns with only zeros and which are for higher levels in hierarchy
        #U = U.select(lambda col: model.hierarchy.node[col].get('level') <= 2, axis=1)  # drop country-level REs
        #U = U.drop(['super-region_0', 'north_america_high_income', 'USA'], 1)

        #U = U.drop(['super-region_0', 'north_america_high_income'], 1)
        #U = U.drop(U.columns, 1)


        ## drop random effects with less than 1 observation or with all observations set to 1, unless they have an informative prior
        keep = []
        if 'random_effects' in parameters:
            for re in parameters['random_effects']:
                if parameters['random_effects'][re].get('dist') == 'Constant':
                    keep.append(re)
        U = U.select(lambda col: 1 <= U[col].sum() < len(U[col]) or col in keep, axis=1)


    U_shift = pandas.Series(0., index=U.columns)
    for level, node in enumerate(nx.shortest_path(model.hierarchy, 'all', root_area)):
        if node in U_shift:
            U_shift[node] = 1.
    U = U - U_shift

    sigma_alpha = []
    for i in range(5):  # max depth of hierarchy is 5
        effect = 'sigma_alpha_%s_%d'%(name,i)
        if 'random_effects' in parameters and effect in parameters['random_effects']:
            prior = parameters['random_effects'][effect]
            print 'using stored RE hyperprior for', effect, prior 
            sigma_alpha.append(MyTruncatedNormal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2,
                                                  min(prior['mu'], prior['lower']),
                                                  max(prior['mu'], prior['upper']),
                                                  value=prior['mu']))
        else:
            sigma_alpha.append(MyTruncatedNormal(effect, .05, .03**-2, .05, .5, value=.1))
    
    alpha = pl.array([])
    const_alpha_sigma = pl.array([])
    alpha_potentials = []
    if len(U.columns) > 0:
        tau_alpha_index = []
        for alpha_name in U.columns:
            tau_alpha_index.append(model.hierarchy.node[alpha_name]['level'])
        tau_alpha_index=pl.array(tau_alpha_index, dtype=int)

        tau_alpha_for_alpha = [sigma_alpha[i]**-2 for i in tau_alpha_index]

        alpha = []
        for i, tau_alpha_i in enumerate(tau_alpha_for_alpha):
            effect = 'alpha_%s_%s'%(name, U.columns[i])
            if 'random_effects' in parameters and U.columns[i] in parameters['random_effects']:
                prior = parameters['random_effects'][U.columns[i]]
                print 'using stored RE for', effect, prior
                if prior['dist'] == 'Normal':
                    alpha.append(mc.Normal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2,
                                           value=0.))
                elif prior['dist'] == 'TruncatedNormal':
                    alpha.append(MyTruncatedNormal(effect, prior['mu'], pl.maximum(prior['sigma'], .001)**-2,
                                                   prior['lower'], prior['upper'], value=0.))
                elif prior['dist'] == 'Constant':
                    alpha.append(float(prior['mu']))
                else:
                    assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior['dist']
            else:
                alpha.append(mc.Normal(effect, 0, tau=tau_alpha_i, value=0))

        # sigma for "constant" alpha
        const_alpha_sigma = []
        for i, tau_alpha_i in enumerate(tau_alpha_for_alpha):
            effect = 'alpha_%s_%s'%(name, U.columns[i])
            if 'random_effects' in parameters and U.columns[i] in parameters['random_effects']:
                prior = parameters['random_effects'][U.columns[i]]
                if prior['dist'] == 'Constant':
                    const_alpha_sigma.append(float(prior['sigma']))
                else:
                    const_alpha_sigma.append(pl.nan)
            else:
                const_alpha_sigma.append(pl.nan)
                
        if zero_re:
            column_map = dict([(n,i) for i,n in enumerate(U.columns)])
            # change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic
            for parent in model.hierarchy:
                node_names = model.hierarchy.successors(parent)
                nodes = [column_map[n] for n in node_names if n in U]
                if len(nodes) > 0:
                    i = nodes[0]
                    old_alpha_i = alpha[i]

                    # do not change if prior for this node has dist='constant'
                    if parameters.get('random_effects', {}).get(U.columns[i], {}).get('dist') == 'Constant':
                        continue

                    alpha[i] = mc.Lambda('alpha_det_%s_%d'%(name, i),
                                                lambda other_alphas_at_this_level=[alpha[n] for n in nodes[1:]]: -sum(other_alphas_at_this_level))

                    if isinstance(old_alpha_i, mc.Stochastic):
                        @mc.potential(name='alpha_pot_%s_%s'%(name, U.columns[i]))
                        def alpha_potential(alpha=alpha[i], mu=old_alpha_i.parents['mu'], tau=old_alpha_i.parents['tau']):
                            return mc.normal_like(alpha, mu, tau)
                        alpha_potentials.append(alpha_potential)

    # make X and beta
    X = input_data.select(lambda col: col.startswith('x_'), axis=1)

    # add sex as a fixed effect (TODO: decide if this should be in data.py, when loading gbd model)
    X['x_sex'] = [sex_value[row['sex']] for i, row in input_data.T.iteritems()]

    beta = pl.array([])
    const_beta_sigma = pl.array([])
    X_shift = pandas.Series(0., index=X.columns)
    if len(X.columns) > 0:
        # shift columns to have zero for root covariate
        try:
            output_template = model.output_template.groupby(['area', 'sex', 'year']).mean()  # TODO: change to .first(), but that doesn't work with old pandas
        except pandas.core.groupby.DataError:
            output_template = model.output_template.groupby(['area', 'sex', 'year']).first()
        covs = output_template.filter(list(X.columns) + ['pop'])
        if len(covs.columns) > 1:
            leaves = [n for n in nx.traversal.bfs_tree(model.hierarchy, root_area) if model.hierarchy.successors(n) == []]
            if len(leaves) == 0:
                # networkx returns an empty list when the bfs tree is a single node
                leaves = [root_area]

            if root_sex == 'total' and root_year == 'all':  # special case for all years and sexes
                covs = covs.delevel().drop(['year', 'sex'], axis=1).groupby('area').mean()  # TODO: change to .reset_index(), but that doesn't work with old pandas
                leaf_covs = covs.ix[leaves]
            elif root_sex == 'total':
                raise Exception, 'root_sex == total, root_year != all is Not Yet Implemented'
            elif root_year == 'all':
                raise Exception, 'root_year == all, root_sex != total is Not Yet Implemented'
            else:
                leaf_covs = covs.ix[[(l, root_sex, root_year) for l in leaves]]

            for cov in covs:
                if cov != 'pop':
                    X_shift[cov] = (leaf_covs[cov] * leaf_covs['pop']).sum() / leaf_covs['pop'].sum()

        if 'x_sex' in X.columns:
            X_shift['x_sex'] = sex_value[root_sex]

        X = X - X_shift

        assert not pl.any(pl.isnan(X.__array__())), 'Covariate matrix should have no missing values'

        beta = []
        for i, effect in enumerate(X.columns):
            name_i = 'beta_%s_%s'%(name, effect)
            if 'fixed_effects' in parameters and effect in parameters['fixed_effects']:
                prior = parameters['fixed_effects'][effect]
                print 'using stored FE for', name_i, effect, prior
                if prior['dist'] == 'TruncatedNormal':
                    beta.append(MyTruncatedNormal(name_i, mu=float(prior['mu']), tau=pl.maximum(prior['sigma'], .001)**-2, a=prior['lower'], b=prior['upper'], value=.5*(prior['lower']+prior['upper'])))
                elif prior['dist'] == 'Normal':
                    beta.append(mc.Normal(name_i, mu=float(prior['mu']), tau=pl.maximum(prior['sigma'], .001)**-2, value=float(prior['mu'])))
                elif prior['dist'] == 'Constant':
                    beta.append(float(prior['mu']))
                else:
                    assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior['dist']
            else:
                beta.append(mc.Normal(name_i, mu=0., tau=1.**-2, value=0))

        # sigma for "constant" beta
        const_beta_sigma = []
        for i, effect in enumerate(X.columns):
            name_i = 'beta_%s_%s'%(name, effect)
            if 'fixed_effects' in parameters and effect in parameters['fixed_effects']:
                prior = parameters['fixed_effects'][effect]
                if prior['dist'] == 'Constant':
                    const_beta_sigma.append(float(prior.get('sigma', 1.e-6)))
                else:
                    const_beta_sigma.append(pl.nan)
            else:
                const_beta_sigma.append(pl.nan)
                
    @mc.deterministic(name='pi_%s'%name)
    def pi(mu=mu, U=pl.array(U, dtype=float), alpha=alpha, X=pl.array(X, dtype=float), beta=beta):
        return mu * pl.exp(pl.dot(U, [float(x) for x in alpha]) + pl.dot(X, [float(x) for x in beta]))

    return dict(pi=pi, U=U, U_shift=U_shift, sigma_alpha=sigma_alpha, alpha=alpha,
                alpha_potentials=alpha_potentials, X=X, X_shift=X_shift, beta=beta,
                hierarchy=model.hierarchy, const_alpha_sigma=const_alpha_sigma,
                const_beta_sigma=const_beta_sigma)
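Numerically, the pi deterministic at the end of mean_covariate_model is just mu * exp(U.alpha + X.beta). A toy check with hypothetical design matrices:

import numpy as np

mu = np.array([0.1, 0.1, 0.1])
U = np.array([[1., 0.], [0., 1.], [0., 0.]])  # area indicator columns
X = np.array([[0.5], [-0.5], [0.0]])  # one centered covariate, e.g. x_sex
alpha = np.array([0.2, -0.1])
beta = np.array([0.3])

pi = mu * np.exp(np.dot(U, alpha) + np.dot(X, beta))
print(pi)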
Example #27
0
def validate_ai_re(N=500,
                   delta_true=.15,
                   sigma_true=[.1, .1, .1, .1, .1],
                   pi_true=quadratic,
                   smoothness='Moderately',
                   heterogeneity='Slightly'):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights * 1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] -
                                                       sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array([
        'all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT',
        'IRN', 'IRQ', 'JOR', 'SYR'
    ])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan

    model.input_data['area'] = area_list[mc.rcategorical(
        pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(
            pl.sum([
                alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a)
                if n in alpha
            ]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p',
                                            'north_africa_middle_east',
                                            'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=25,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame(
        index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series(
        [n.stats()['mean'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series(
        [n.stats()['standard deviation'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [
        n.stats()['mean'] for n in model.vars['p']['sigma_alpha']
    ]
    model.sigma['sigma_pred'] = [
        n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']
    ]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=model.vars['p']['mu_age'].stats()['mean'],
             sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
Example #28
0
def linear_norm(x, y, msk, eps=0.003, deps=0.001, nmin=2, nwin=3):
    '''Linear normalization of a slice of a spectrum,
       assuming that the slice is centered on the line to be normalized.
    '''

    bla = False
    blabla = False

    x = x[msk]
    y = y[msk]

    n = int((len(y)/2.))
    yl = y[:n]
    yr = y[n+1:]

    # Criteria on the left of the central wavelength
    epsl, epsr = eps, eps
    while 1:
        critl = abs(max(yl)-yl) / max(yl)
        idx_yl = pl.where(critl <= epsl)[0]
        idx_yl = idx_yl.astype(int)
        if blabla:
            print " epsl:", epsl
            print " idx_yl, yl:", idx_yl, [y[i] for i in idx_yl]
        if pl.size(idx_yl) >= nmin:
            break
        else:
            epsl = epsl + deps

    # Criteria on the right of the central wavelength
    while 1:
        critr = abs(max(yr)-yr) / max(yr)
        idx_yr = pl.where(critr <= epsr)[0] + n + 1  # yr starts at index n+1 of y
        idx_yr = idx_yr.astype(int)
        if blabla:
            print " epsr:", epsr
            print "idx_yr, yr:", idx_yr, [y[i] for i in idx_yr]
        if pl.size(idx_yr) >= nmin:
            break
        else:
            epsr = epsr + deps

    idx_y = pl.concatenate([idx_yl, idx_yr])

    if bla:
        print " nmin, nwin =", nmin, nwin
        print " Number of selected left continuum points:  ", idx_yl.size, "/", n
        print " Number of selected right continuum points: ", idx_yr.size, "/", n
        print " Number of selected continuum points:       ", idx_y.size, "/", y.size

    xs = [x[i] for i in idx_y]
    ys = [y[i] for i in idx_y]

    xs, ys = pl.asarray(xs), pl.asarray(ys)
    n_xs = xs.size

    # Mean value around selected points
    for ind, val in enumerate(ys):
        i = idx_y[ind] - nwin
        j = idx_y[ind] + nwin
        if i < 0:
            i = 0
        if j > len(y):
            j = len(y)
        ys[ind] = y[i:j].mean()

    if blabla:
        print "xs, ys", xs, ys

    A = pl.concatenate([xs, pl.ones(n_xs)])
    A = A.reshape((2, n_xs))
    w = pl.linalg.lstsq(A.T, ys)[0]

    # Test whether any value of w is NaN
    if pl.any(w != w):
        print "Problem with linalg.lstsq. Try to reduce eps or nmin."
        sys.exit(1)  # assumes `import sys` at module level

    a, b = w[0], w[1]

    if blabla:
        print "a =", a, "b =", b

    return a, b, xs, ys
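The continuum fit at the end of linear_norm is an ordinary least-squares line ys ≈ a*xs + b, with a design matrix whose columns are xs and ones. A standalone check on noiseless points (recovers a=0.5, b=2.0):

import numpy as np

xs = np.array([1., 2., 3., 4.])
ys = 0.5 * xs + 2.0

A = np.vstack([xs, np.ones(xs.size)]).T  # columns: x, 1
a, b = np.linalg.lstsq(A, ys, rcond=None)[0]
print(a, b)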
Example #29
0
def fit(psp_shape,
        time,
        voltage,
        error_estimate,
        maxcall=1000,
        maximal_red_chi2=2.0,
        fail_on_negative_cov=None):
    """
    psp_shape : object
        PSPShape instance

    time : numpy.ndarray of floats
        numpy array of data acquisition times

    voltage : numpy.ndarray
        numpy array of voltage values

    error_estimate : float
        estimate for the standard deviation of an individual data point.

    maxcall : int
        maximal number of calls to the fit routine

    fail_on_negative_cov : list of bool, optional
        per-parameter mask: the fit is marked failed only when a negative
        entry on the covariance diagonal coincides with a True entry here

    returns : tuple
        (fit_results
         error_estimates
         chi2_per_dof
         success)
    """
    assert len(time) == len(voltage)

    initial_values = psp_shape.initial_fit_values(time, voltage)

    result = optimize.leastsq(
        lambda param: (psp_shape(time, *param) - voltage),
        [initial_values[key] for key in psp_shape.parameter_names()],
        full_output=1,
        maxfev=maxcall)

    resultparams, cov_x, _, _, ier = result

    ndof = len(time) - len(psp_shape.parameter_names())
    fit_voltage = psp_shape(time, *result[0])
    red_chi2 = sum((fit_voltage - voltage) ** 2) \
               / (error_estimate ** 2 * ndof)

    fail_neg = p.any(p.diag(cov_x) < 0)
    if fail_on_negative_cov is not None:
        fail_neg = p.any(p.logical_and(
            p.diag(cov_x) < 0, fail_on_negative_cov))

    cov_x *= error_estimate**2

    success = ((not fail_neg) and (ier in [1, 2, 3, 4])
               and (red_chi2 <= maximal_red_chi2))

    processed, processed_cov = psp_shape.process_fit_results(
        resultparams, cov_x)

    return processed, processed_cov, red_chi2, success
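The success flag above requires a non-negative covariance diagonal, a scipy ier code in {1, 2, 3, 4}, and a reduced chi-square under maximal_red_chi2. A toy version of the chi-square criterion alone, with hypothetical data:

import numpy as np

voltage = np.array([0.0, 1.0, 2.0, 3.0])
fit_voltage = np.array([0.1, 0.9, 2.1, 2.9])
error_estimate = 0.1
ndof = len(voltage) - 2  # e.g. two fitted parameters

red_chi2 = np.sum((fit_voltage - voltage) ** 2) / (error_estimate ** 2 * ndof)
print(red_chi2, red_chi2 <= 2.0)  # 2.0, accepted at the default threshold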
Example #30
0
    data_ecog_lp_ss[i,:] = signal.decimate(filters.low_pass_filter(data_ecog[i,:], Fsampling=f_sampling, Fcutoff=f_lp_cutoff), int(f_sampling/f_subsample))
    data_ecog_lp_ss.flush()
    print(i)
pl.save(os.path.join(memap_folder, 'data_ecog_lp_ss.npy'), data_ecog_lp_ss)



spike_samples = tf.spikedetect(data_probe_hp, threshold_multiplier=6.5, bad_channels=probe_bad_channels)
pl.save(os.path.join(memap_folder, 'spike_samples.npy'), spike_samples)


spike_samples_clean = spike_samples
for i in pl.arange(pl.size(spike_samples_clean)-1,-1,-1):
    data = data_probe_hp[:, spike_samples[i]-60:spike_samples[i]+60]
    stdevs = sp.std(data,1)
    if np.max(data) > 3000 or pl.any(stdevs>600):
        spike_samples_clean = pl.delete(spike_samples_clean, i)
    if i%100==0:
        print(i)
spike_samples_clean = pl.delete(spike_samples_clean, 0)
pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean)

channels = np.empty(0)
for i in pl.arange(0, pl.size(spike_samples_clean)):
    data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist())
    channels = np.append(channels, np.argmax(data))
    if i%100==0:
        print(i)
channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)], columns=['Channels', 'Samples'])

spike_times_shaftA = channels_spikes_df.Samples[0][(channels_spikes_df.Channels[0] > 7) & (channels_spikes_df.Channels[0] < 16)]
Example #31
0
    data_ecog_lp_ss[i,:] = signal.decimate(
        filters.low_pass_filter(data_ecog[i, :], Fsampling=f_sampling, Fcutoff=f_lp_cutoff), int(f_sampling / f_subsample))
    data_ecog_lp_ss.flush()
    print(i)
pl.save(os.path.join(memap_folder, 'data_ecog_lp_ss.npy'), data_ecog_lp_ss)


spike_samples = tf.spikedetect(data_probe_hp, threshold_multiplier=6.5, bad_channels=probe_bad_channels)
pl.save(os.path.join(memap_folder, 'spike_samples.npy'), spike_samples)


spike_samples_clean = spike_samples
for i in pl.arange(pl.size(spike_samples_clean)-1,-1,-1):
    data = data_probe_hp[:, spike_samples[i]-60:spike_samples[i]+60]
    stdevs = sp.std(data,1)
    if np.max(data) > 3000 or pl.any(stdevs>600):
        spike_samples_clean = pl.delete(spike_samples_clean, i)
    if i%100==0:
        print(i)
spike_samples_clean = pl.delete(spike_samples_clean, 0)
pl.save(os.path.join(memap_folder, 'spike_samples_clean.npy'), spike_samples_clean)

channels = np.empty(0)
for i in pl.arange(0, pl.size(spike_samples_clean)):
    data = np.array(data_probe_hp[:, spike_samples_clean[i]].tolist())
    channels = np.append(channels, np.argmax(data))
    if i%100==0:
        print(i)
channels_spikes_df = pd.DataFrame([(channels, spike_samples_clean)], columns=['Channels', 'Samples'])

spike_times_shaftA = channels_spikes_df.Samples[0][(channels_spikes_df.Channels[0] > 7) & (channels_spikes_df.Channels[0] < 16)]
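The final selection keeps spikes whose peak channel lies strictly between 7 and 16; combining both comparisons with & into one boolean mask avoids pandas' chained boolean indexing. A minimal illustration with hypothetical series:

import pandas as pd

channels = pd.Series([3, 9, 12, 20])
samples = pd.Series([100, 200, 300, 400])

mask = (channels > 7) & (channels < 16)
print(samples[mask])  # keeps 200 and 300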
Example #32
0
from pylab import logical_or, any, all, diag, rot90  # numpy-style any/all, as in the other examples

def win(board, letter):
    wins = logical_or(board == letter, board == 'T')
    return any(all(wins, 0)) or any(all(wins, 1)) or all(diag(wins)) or \
      all(diag(rot90(wins)))
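A hypothetical call to win on a 4x4 board, where 'T' counts as a wild square for every player; here the main diagonal is all 'X', so 'X' wins:

import numpy as np

board = np.array([['X', 'O', 'X', 'O'],
                  ['O', 'X', 'O', 'X'],
                  ['O', 'O', 'X', 'O'],
                  ['X', 'O', 'T', 'X']])
print(win(board, 'X'))  # True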
Example #33
0
def setup(dm, key, data_list, rate_stoch):
    """ Generate the PyMC variables for a normal model of
    a function of age

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)
      
    key : str
      the name of the key for everything about this model (priors,
      initial values, estimations)

    data_list : list of data dicts
      the observed data to use in the beta-binomial likelihood function

    rate_stoch : pymc.Stochastic
      a PyMC stochastic (or deterministic) object, with
      len(rate_stoch.value) == len(dm.get_estimation_age_mesh()).

    Results
    -------
    vars : dict
      Return a dictionary of all the relevant PyMC objects for the
      normal model.  vars['rate_stoch'] is of particular
      relevance, for details see the beta_binomial_model
    """
    vars = {}
    est_mesh = dm.get_estimate_age_mesh()
    if pl.any(pl.diff(est_mesh) != 1):
        raise ValueError, 'ERROR: Gaps in estimation age mesh must all equal 1'

    vars['rate_stoch'] = rate_stoch

    # set up priors and observed data
    prior_str = dm.get_priors(key)
    dismod3.utils.generate_prior_potentials(vars, prior_str, est_mesh)

    vars['observed_rates'] = []
    for d in data_list:
        # set up observed stochs for all relevant data
        id = d['id']

        if d['value'] == dismod3.settings.MISSING:
            print 'WARNING: data %d missing value' % id
            continue

        # ensure all rate data is valid
        d_val = dm.value_per_1(d)
        d_se = dm.se_per_1(d)

        if d['age_start'] < est_mesh[0] or d['age_end'] > est_mesh[-1]:
            raise ValueError, 'Data %d is outside of estimation range---([%d, %d] is not inside [%d, %d])' \
                % (d['id'], d['age_start'], d['age_end'], est_mesh[0], est_mesh[-1])

        age_indices = dismod3.utils.indices_for_range(est_mesh, d['age_start'],
                                                      d['age_end'])
        age_weights = d.get('age_weights',
                            pl.ones(len(age_indices)) / len(age_indices))

        # data must have standard error to use normal model
        if d_se == 0:
            raise ValueError, 'Data %d has invalid standard error' % d['id']

        print 'data %d: value = %f, se = %f' % (d['id'], d_val, d_se)

        @mc.observed
        @mc.stochastic(name='obs_%d' % id)
        def obs(f=rate_stoch,
                age_indices=age_indices,
                age_weights=age_weights,
                value=d_val,
                tau=1. / (d_se)**2):
            f_i = dismod3.utils.rate_for_range(f, age_indices, age_weights)
            return mc.normal_like(value, f_i, tau)

        vars['observed_rates'].append(obs)

    return vars
Example #34
0
def setup(dm, key, data_list=[], rate_stoch=None, emp_prior={}, lower_bound_data=[]):
    """ Generate the PyMC variables for a negative-binomial model of
    a single rate function

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)
      
    key : str
      the name of the key for everything about this model (priors,
      initial values, estimations)

    data_list : list of data dicts
      the observed data to use in the negative binomial likelihood function

    rate_stoch : pymc.Stochastic, optional
      a PyMC stochastic (or deterministic) object, with
      len(rate_stoch.value) == len(dm.get_estimation_age_mesh()).
      This is used to link rate stochs into a larger model,
      for example.

    emp_prior : dict, optional
      the empirical prior dictionary, retrieved from the disease model
      if appropriate by::

          >>> t, r, y, s = dismod3.utils.type_region_year_sex_from_key(key)
          >>> emp_prior = dm.get_empirical_prior(t)

    Results
    -------
    vars : dict
      Return a dictionary of all the relevant PyMC objects for the
      rate model.  vars['rate_stoch'] is of particular
      relevance; this is what is used to link the rate model
      into more complicated models, like the generic disease model.
    """
    vars = {}
    est_mesh = dm.get_estimate_age_mesh()
    param_mesh = dm.get_param_age_mesh()

    if pl.any(pl.diff(est_mesh) != 1):
        raise ValueError, 'ERROR: Gaps in estimation age mesh must all equal 1'

    # calculate effective sample size for all data and lower bound data
    dm.calc_effective_sample_size(data_list)
    dm.calc_effective_sample_size(lower_bound_data)

    # generate regional covariates
    covariate_dict = dm.get_covariates()
    derived_covariate = dm.get_derived_covariate_values()
    X_region, X_study = regional_covariates(key, covariate_dict, derived_covariate)

    # use confidence prior from prior_str  (only for posterior estimate, this is overridden below for empirical prior estimate)
    mu_delta = 1000.
    sigma_delta = 10.
    mu_log_delta = 3.
    sigma_log_delta = .25
    from dismod3.settings import PRIOR_SEP_STR
    for line in dm.get_priors(key).split(PRIOR_SEP_STR):
        prior = line.strip().split()
        if len(prior) == 0:
            continue
        if prior[0] == 'heterogeneity':
            # originally designed for this:
            mu_delta = float(prior[1])
            sigma_delta = float(prior[2])

            # HACK: override design to set sigma_log_delta,
            # .25 = very, .025 = moderately, .0025 = slightly
            if float(prior[2]) > 0:
                sigma_log_delta = .025 / float(prior[2])


    # use the empirical prior mean if it is available
    if len(set(emp_prior.keys()) & set(['alpha', 'beta', 'gamma'])) == 3:
        mu_alpha = pl.array(emp_prior['alpha'])
        sigma_alpha = pl.array(emp_prior['sigma_alpha'])
        alpha = pl.array(emp_prior['alpha']) # TODO: make this stochastic
        vars.update(region_coeffs=alpha)

        beta = pl.array(emp_prior['beta']) # TODO: make this stochastic
        sigma_beta = pl.array(emp_prior['sigma_beta'])
        vars.update(study_coeffs=beta)

        mu_gamma = pl.array(emp_prior['gamma'])
        sigma_gamma = pl.array(emp_prior['sigma_gamma'])

        # Do not inform dispersion parameter from empirical prior stage
        # if 'delta' in emp_prior:
        #    mu_delta = emp_prior['delta']
        #    if 'sigma_delta' in emp_prior:
        #        sigma_delta = emp_prior['sigma_delta']
    else:
        import dismod3.regional_similarity_matrices as similarity_matrices
        n = len(X_region)
        mu_alpha = pl.zeros(n)
        sigma_alpha = .025  # TODO: make this a hyperparameter, with a traditional prior, like inverse gamma
        C_alpha = similarity_matrices.regions_nested_in_superregions(n, sigma_alpha)

        # use alternative region effect covariance structure if requested
        region_prior_key = 'region_effects'
        if region_prior_key in dm.params:
            if dm.params[region_prior_key] == 'uninformative':
                C_alpha = similarity_matrices.uninformative(n, sigma_alpha)

        region_prior_key = 'region_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]
        if region_prior_key in dm.params:
            if dm.params[region_prior_key] == 'uninformative':
                C_alpha = similarity_matrices.regions_nested_in_superregions(n, dm.params[region_prior_key]['std'])

        # add informative prior for sex effect if requested
        sex_prior_key = 'sex_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]
        if sex_prior_key in dm.params:
            print 'adjusting prior on sex effect coefficient for %s' % key
            mu_alpha[n-1] = pl.log(dm.params[sex_prior_key]['mean'])
            sigma_sex = (pl.log(dm.params[sex_prior_key]['upper_ci']) - pl.log(dm.params[sex_prior_key]['lower_ci'])) / (2*1.96)
            C_alpha[n-1, n-1]= sigma_sex**2.

        # add informative prior for time effect if requested
        time_prior_key = 'time_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]  # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex
        if time_prior_key in dm.params:
            print 'adjusting prior on time effect coefficient for %s' % key
            mu_alpha[n-2] = pl.log(dm.params[time_prior_key]['mean'])
            sigma_time = (pl.log(dm.params[time_prior_key]['upper_ci']) - pl.log(dm.params[time_prior_key]['lower_ci'])) / (2*1.96)
            C_alpha[n-2, n-2]= sigma_time**2.
        
        #C_alpha = similarity_matrices.all_related_equally(n, sigma_alpha)
        alpha = mc.MvNormalCov('region_coeffs_%s' % key, mu=mu_alpha,
                            C=C_alpha,
                            value=mu_alpha)
        vars.update(region_coeffs=alpha, region_coeffs_step_cov=.005*C_alpha)

        mu_beta = pl.zeros(len(X_study))
        sigma_beta = .1

        # add informative prior for beta effect if requested
        prior_key = 'beta_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]  # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex
        if prior_key in dm.params:
            print 'adjusting prior on beta effect coefficients for %s' % key
            mu_beta = pl.array(dm.params[prior_key]['mean'])
            sigma_beta = pl.array(dm.params[prior_key]['std'])

        beta = mc.Normal('study_coeffs_%s' % key, mu=mu_beta, tau=sigma_beta**-2., value=mu_beta)
        vars.update(study_coeffs=beta)

        mu_gamma = 0.*pl.ones(len(est_mesh))
        sigma_gamma = 2.*pl.ones(len(est_mesh))

        # add informative prior for gamma effect if requested
        prior_key = 'gamma_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]  # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex
        if prior_key in dm.params:
            print 'adjusting prior on gamma effect coefficients for %s' % key
            mu_gamma = pl.array(dm.params[prior_key]['mean'])
            sigma_gamma = pl.array(dm.params[prior_key]['std'])

        # always use dispersed prior on delta for empirical prior phase
        mu_log_delta = 3.
        sigma_log_delta = .25
        # add informative prior for delta effect if requested
        prior_key = 'delta_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]  # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex
        if prior_key in dm.params:
            print 'adjusting prior on delta effect coefficients for %s' % key
            mu_log_delta = dm.params[prior_key]['mean']
            sigma_log_delta = dm.params[prior_key]['std']

    mu_zeta = 0.
    sigma_zeta = .25
    # add informative prior for zeta effect if requested
    prior_key = 'zeta_effect_%s'%key.split(dismod3.settings.KEY_DELIM_CHAR)[0]  # HACK: sometimes key is just parameter type, sometimes it is type+region+year+sex
    if prior_key in dm.params:
        print 'adjusting prior on zeta effect coefficients for %s' % key
        mu_zeta = dm.params[prior_key]['mean']
        sigma_zeta = dm.params[prior_key]['std']
    
    if mu_delta != 0.:
        if sigma_delta != 0.:
            log_delta = mc.Normal('log_dispersion_%s' % key, mu=mu_log_delta, tau=sigma_log_delta**-2, value=3.)
            zeta = mc.Normal('zeta_%s'%key, mu=mu_zeta, tau=sigma_zeta**-2, value=mu_zeta)
            delta = mc.Lambda('dispersion_%s' % key, lambda x=log_delta: 50. + 10.**x)
            vars.update(dispersion=delta, log_dispersion=log_delta, zeta=zeta, dispersion_step_sd=.1*log_delta.parents['tau']**-.5)
        else:
            delta = mc.Lambda('dispersion_%s' % key, lambda x=mu_delta: mu_delta)
            vars.update(dispersion=delta)
        
    else:
        delta = mc.Lambda('dispersion_%s' % key, lambda mu=mu_delta: 0)
        vars.update(dispersion=delta)

    if len(sigma_gamma) == 1:
        sigma_gamma = sigma_gamma[0]*pl.ones(len(est_mesh))

    # create variable for interpolated rate;
    # also create variable for age-specific rate function, if it does not yet exist
    if rate_stoch:
        # if the rate_stoch already exists, for example prevalence in the generic model,
        # we use it to back-calculate mu and eventually gamma
        mu = rate_stoch

        @mc.deterministic(name='age_coeffs_%s' % key)
        def gamma(mu=mu, Xa=X_region, Xb=X_study, alpha=alpha, beta=beta):
            return pl.log(pl.maximum(dismod3.settings.NEARLY_ZERO, mu)) - pl.dot(alpha, Xa) - pl.dot(beta, Xb)

        @mc.potential(name='age_coeffs_potential_%s' % key)
        def gamma_potential(gamma=gamma, mu_gamma=mu_gamma, tau_gamma=1./sigma_gamma[param_mesh]**2, param_mesh=param_mesh):
            return mc.normal_like(gamma[param_mesh], mu_gamma[param_mesh], tau_gamma)

        vars.update(rate_stoch=mu, age_coeffs=gamma, age_coeffs_potential=gamma_potential)
    else:
        # if the rate_stoch does not yet exist, we make gamma a stoch and use it to calculate mu
        # for computational efficiency, gamma is a linearly interpolated version of gamma_mesh
        initial_gamma = pl.log(dismod3.settings.NEARLY_ZERO + dm.get_initial_value(key))

        gamma_mesh = mc.Normal('age_coeffs_mesh_%s' % key, mu=mu_gamma[param_mesh], tau=sigma_gamma[param_mesh]**-2, value=initial_gamma[param_mesh])

        @mc.deterministic(name='age_coeffs_%s' % key)
        def gamma(gamma_mesh=gamma_mesh, param_mesh=param_mesh, est_mesh=est_mesh):
            return dismod3.utils.interpolate(param_mesh, gamma_mesh, est_mesh)

        @mc.deterministic(name=key)
        def mu(Xa=X_region, Xb=X_study, alpha=alpha, beta=beta, gamma=gamma):
            return predict_rate([Xa, Xb], alpha, beta, gamma, lambda f, age: f, est_mesh)

        # Create a guess at the covariance matrix for MCMC proposals to update gamma_mesh
        from pymc.gp.cov_funs import matern
        a = pl.atleast_2d(param_mesh).T
        C = matern.euclidean(a, a, diff_degree = 2, amp = 1.**2, scale = 10.)

        vars.update(age_coeffs_mesh=gamma_mesh, age_coeffs=gamma, rate_stoch=mu, age_coeffs_mesh_step_cov=.005*pl.array(C))

        # adjust value of gamma_mesh based on priors, if necessary
        # TODO: implement more adjustments, currently only adjusted based on at_least priors
        for line in dm.get_priors(key).split(PRIOR_SEP_STR):
            prior = line.strip().split()
            if len(prior) == 0:
                continue
            if prior[0] == 'at_least':
                delta_gamma = pl.log(pl.maximum(mu.value, float(prior[1]))) - pl.log(mu.value)
                gamma_mesh.value = gamma_mesh.value + delta_gamma[param_mesh]

    # create potentials for priors
    dismod3.utils.generate_prior_potentials(vars, dm.get_priors(key), est_mesh)

    # create observed stochastics for data
    vars['data'] = []

    if mu_delta != 0.:  
        value = []
        N = []
        Xa = []
        Xb = []
        ai = []
        aw = []
        Xz = []

        for d in data_list:
            try:
                age_indices, age_weights, Y_i, N_i = values_from(dm, d)
            except ValueError:
                debug('WARNING: could not calculate likelihood for data %d' % d['id'])
                continue

            value.append(Y_i*N_i)
            N.append(N_i)
            Xa.append(covariates(d, covariate_dict)[0])
            Xb.append(covariates(d, covariate_dict)[1])
            Xz.append(float(d.get('bias') or 0.))
            ai.append(age_indices)
            aw.append(age_weights)

            vars['data'].append(d)

        N = pl.array(N)
        Xa = pl.array(Xa)
        Xb = pl.array(Xb)
        Xz = pl.array(Xz)
        value = pl.array(value)
        
        vars['effective_sample_size'] = list(N)
        
    if len(vars['data']) > 0:
        # TODO: consider using only a subset of the rates at each step of the fit to speed computation; say 100 of them
        k = 50000
        if len(vars['data']) < k:
            data_sample = range(len(vars['data']))
        else:
            import random
            @mc.deterministic(name='data_sample_%s' % key)
            def data_sample(n=len(vars['data']), k=k):
                return random.sample(range(n), k)

        @mc.deterministic(name='rate_%s' % key)
        def rates(S=data_sample,
                Xa=Xa, Xb=Xb,
                alpha=alpha, beta=beta, gamma=gamma,
                bounds_func=vars['bounds_func'],
                age_indices=ai,
                age_weights=aw):

            # calculate study-specific rate function
            shifts = pl.exp(pl.dot(Xa[S], alpha) + pl.dot(Xb[S], pl.atleast_1d(beta)))
            exp_gamma = pl.exp(gamma)
            mu = pl.zeros_like(shifts)
            for i,s in enumerate(S):
                mu[i] = pl.dot(age_weights[s], bounds_func(shifts[i] * exp_gamma[age_indices[s]], age_indices[s]))
                # TODO: evaluate speed increase and accuracy decrease of the following:
                #midpoint = age_indices[s][len(age_indices[s])/2]
                #mu[i] = bounds_func(shifts[i] * exp_gamma[midpoint], midpoint)
                # TODO: evaluate speed increase and accuracy decrease of the following: (to see speed increase, need to code this up using difference of running sums
                #mu[i] = pl.dot(pl.ones_like(age_weights[s]) / float(len(age_weights[s])),
                #               bounds_func(shifts[i] * exp_gamma[age_indices[s]], age_indices[s]))
            return mu
        vars['expected_rates'] = rates
        
        @mc.observed
        @mc.stochastic(name='data_%s' % key)
        def obs(value=value,
                S=data_sample,
                N=N,
                mu_i=rates,
                Xz=Xz,
                zeta=zeta,
                delta=delta):
            #zeta_i = .001
            #residual = pl.log(value[S] + zeta_i) - pl.log(mu_i*N[S] + zeta_i)
            #return mc.normal_like(residual, 0, 100. + delta)
            logp = mc.negative_binomial_like(value[S], N[S]*mu_i, delta*pl.exp(Xz*zeta))
            return logp

        vars['observed_counts'] = obs

        @mc.deterministic(name='predicted_data_%s' % key)
        def predictions(value=value,
                        N=N,
                        S=data_sample,
                        mu=rates,
                        delta=delta):
            r_S = mc.rnegative_binomial(N[S]*mu, delta)/N[S]
            r = pl.zeros(len(vars['data']))
            r[S] = r_S
            return r

        vars['predicted_rates'] = predictions
        debug('likelihood of %s contains %d rates' % (key, len(vars['data'])))

    # now do the same thing for the lower bound data
    # TODO: refactor to remove duplicated code
    vars['lower_bound_data'] = []
    value = []
    N = []
    Xa = []
    Xb = []
    ai = []
    aw = []
    for d in lower_bound_data:
        try:
            age_indices, age_weights, Y_i, N_i = values_from(dm, d)
        except ValueError:
            debug('WARNING: could not calculate likelihood for data %d' % d['id'])
            continue

        value.append(Y_i*N_i)
        N.append(N_i)
        Xa.append(covariates(d, covariate_dict)[0])
        Xb.append(covariates(d, covariate_dict)[1])
        ai.append(age_indices)
        aw.append(age_weights)

        vars['lower_bound_data'].append(d)

    N = pl.array(N)
    value = pl.array(value)

    if len(vars['lower_bound_data']) > 0:
        @mc.observed
        @mc.stochastic(name='lower_bound_data_%s' % key)
        def obs_lb(value=value, N=N,
                   Xa=Xa, Xb=Xb,
                   alpha=alpha, beta=beta, gamma=gamma,
                   bounds_func=vars['bounds_func'],
                   delta=delta,
                   age_indices=ai,
                   age_weights=aw):

            # calculate study-specific rate function
            shifts = pl.exp(pl.dot(Xa, alpha) + pl.dot(Xb, pl.atleast_1d(beta)))
            exp_gamma = pl.exp(gamma)
            mu_i = [pl.dot(weights, bounds_func(s_i * exp_gamma[ages], ages)) for s_i, ages, weights in zip(shifts, age_indices, age_weights)]  # TODO: try vectorizing this loop to increase speed
            rate_param = mu_i*N
            violated_bounds = pl.nonzero(rate_param < value)
            logp = mc.negative_binomial_like(value[violated_bounds], rate_param[violated_bounds], delta)
            return logp

        vars['observed_lower_bounds'] = obs_lb
        debug('likelihood of %s contains %d lower bounds' % (key, len(vars['lower_bound_data'])))

    return vars
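# A minimal sketch (not part of the original example) of the negative binomial
# rate likelihood used in obs() above: counts with mean N*mu and
# over-dispersion delta, matching PyMC 2's (mu, alpha) parameterization of
# negative_binomial_like.  Written with numpy/scipy; all values are invented.
import numpy as np
from scipy.special import gammaln

def nb_loglike(x, mu, delta):
    """log p(x | mu, delta), with mean mu and variance mu + mu**2/delta."""
    x, mu = np.asarray(x, dtype=float), np.asarray(mu, dtype=float)
    return np.sum(gammaln(x + delta) - gammaln(delta) - gammaln(x + 1.)
                  + delta * np.log(delta / (delta + mu))
                  + x * np.log(mu / (delta + mu)))

# e.g., three observed counts around a rate of .02 with N = 1000
print(nb_loglike([18., 25., 22.], 1000 * .02, 10.))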
Example #35
0
def setup(dm, key, data_list, rate_stoch):
    """ Generate the PyMC variables for a normal model of
    a function of age

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)
      
    key : str
      the name of the key for everything about this model (priors,
      initial values, estimations)

    data_list : list of data dicts
      the observed data to use in the normal likelihood function

    rate_stoch : pymc.Stochastic
      a PyMC stochastic (or deterministic) object, with
      len(rate_stoch.value) == len(dm.get_estimate_age_mesh()).

    Results
    -------
    vars : dict
      Return a dictionary of all the relevant PyMC objects for the
      normal model.  vars['rate_stoch'] is of particular
      relevance; for details, see the beta_binomial_model
    """
    vars = {}
    est_mesh = dm.get_estimate_age_mesh()
    if pl.any(pl.diff(est_mesh) != 1):
        raise ValueError, 'ERROR: Gaps in estimation age mesh must all equal 1'

    vars['rate_stoch'] = rate_stoch

    # set up priors and observed data
    prior_str = dm.get_priors(key)
    dismod3.utils.generate_prior_potentials(vars, prior_str, est_mesh)

    vars['observed_rates'] = []
    for d in data_list:
        # set up observed stochs for all relevant data
        id = d['id']
        
        if d['value'] == dismod3.settings.MISSING:
            print 'WARNING: data %d missing value' % id
            continue

        # ensure all rate data is valid
        d_val = dm.value_per_1(d)
        d_se = dm.se_per_1(d)

        if d['age_start'] < est_mesh[0] or d['age_end'] > est_mesh[-1]:
            raise ValueError, 'Data %d is outside of estimation range---([%d, %d] is not inside [%d, %d])' \
                % (d['id'], d['age_start'], d['age_end'], est_mesh[0], est_mesh[-1])

        age_indices = dismod3.utils.indices_for_range(est_mesh, d['age_start'], d['age_end'])
        age_weights = d.get('age_weights', pl.ones(len(age_indices)) / len(age_indices))

        # data must have standard error to use normal model
        if d_se == 0:
            raise ValueError, 'Data %d has invalid standard error' % d['id']

        print 'data %d: value = %f, se = %f' % (d['id'], d_val, d_se)

        @mc.observed
        @mc.stochastic(name='obs_%d' % id)
        def obs(f=rate_stoch,
                age_indices=age_indices,
                age_weights=age_weights,
                value=d_val,
                tau=1./(d_se)**2):
            f_i = dismod3.utils.rate_for_range(f, age_indices, age_weights)
            return mc.normal_like(value, f_i, tau)
        vars['observed_rates'].append(obs)
        
    return vars
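# A hedged sketch of the per-datum normal likelihood that setup() assembles:
# the age-specific rate function is collapsed to a single number by an
# age-weighted average, then compared to the observed value with precision
# tau = 1/se**2.  rate_for_range is paraphrased here; the real one lives in
# dismod3.utils.
import numpy as np

def rate_for_range(f, age_indices, age_weights):
    # weighted average of the rate function over the datum's age interval
    return np.dot(age_weights, f[age_indices])

f = .01 * np.ones(101)                    # flat rate function on ages 0..100
ages = np.arange(20, 41)                  # datum covers ages 20-40
weights = np.ones(len(ages)) / len(ages)
f_i = rate_for_range(f, ages, weights)

value, se = .012, .002
tau = 1. / se**2
# normal log-density, as mc.normal_like(value, f_i, tau) computes it
print(.5 * np.log(tau / (2 * np.pi)) - .5 * tau * (value - f_i)**2)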
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)


    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights*1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)


    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]
    
    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred']=[n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % \
        (model.input_data['abs_err'].mean(),
         pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
         model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
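# The interval average p computed in validate_ai_re() uses a running-sum
# trick (the same idea as the difference-of-running-sums TODO earlier): with
# cumulative sums of pi*w and of w, the weighted mean of pi over
# (age_start, age_end] is a difference of two lookups instead of a loop.  A
# small self-contained check of that identity; the curve and ages are arbitrary.
import numpy as np

a = np.arange(0, 101)
pi_age = 1e-4 * (a * (100. - a) + 100.)   # an arbitrary smooth curve
w = np.ones_like(a, dtype=float)

sum_pi_wt = np.cumsum(pi_age * w)
sum_wt = np.cumsum(w)

start, end = 20, 40
fast = (sum_pi_wt[end] - sum_pi_wt[start]) / (sum_wt[end] - sum_wt[start])
slow = np.dot(pi_age[start + 1:end + 1], w[start + 1:end + 1]) / w[start + 1:end + 1].sum()
assert np.allclose(fast, slow)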
Positions=Positions[1:]

# =================== REMOVE OVERLAPS ===================
from scipy.spatial.distance import pdist,squareform
N=Positions.shape[0]

dists=squareform(pdist(Positions))
# exclude the case of self-distance
pl.fill_diagonal(dists, pl.inf)
# first in, first served
test= (dists<cutoff)
print "- Cutting overlaps"

picked=[]
for p in range(N):
    if pl.any(test[p,:]):
        test[:,p]=False
        test[p,:]=False
    else:
        picked.append(p)

No_overlaps=Positions[picked]

print "\n====> Detected"+F.GREEN,No_overlaps.shape[0],F.RESET+"particles.\n"

# ======================== SAVING THE RESULT ===================== 
# reorder the columns
z,y,x=No_overlaps[:,0],No_overlaps[:,1],No_overlaps[:,2]

outfile="Detected_"+filename.split('_')[0][-3:]+".txt"
pl.savetxt(outfile,zip(x,y,z), fmt="%g")
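# A toy run (not from the original script) of the first-in-first-served
# overlap cull above: the first point sits within `cutoff` of the second, so
# it is dropped and its row/column cleared, which lets the second survive.
import numpy as np
from scipy.spatial.distance import pdist, squareform

pts = np.array([[0., 0., 0.], [0., 0., .5], [5., 5., 5.]])
cutoff = 1.

d = squareform(pdist(pts))
np.fill_diagonal(d, np.inf)        # ignore self-distances
test = d < cutoff

picked = []
for p in range(len(pts)):
    if test[p, :].any():
        test[:, p] = False
        test[p, :] = False
    else:
        picked.append(p)

print(pts[picked])                 # the second and third points remain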
Example #38
0
def mean_covariate_model(name,
                         mu,
                         input_data,
                         parameters,
                         model,
                         root_area,
                         root_sex,
                         root_year,
                         zero_re=True):
    """ Generate PyMC objects covariate adjusted version of mu

    :Parameters:
      - `name` : str
      - `mu` : the unadjusted mean parameter for this node
      - `model` : ModelData to use for covariates
      - `root_area, root_sex, root_year` : str, str, int
      - `zero_re` : boolean, change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic

    :Results:
      - Returns dict of PyMC objects, including 'pi', the covariate-adjusted predicted values for the mu and X provided

    """
    n = len(input_data.index)

    # make U and alpha
    p_U = model.hierarchy.number_of_nodes()  # random effects for area
    U = pandas.DataFrame(pl.zeros((n, p_U)),
                         columns=model.hierarchy.nodes(),
                         index=input_data.index)
    for i, row in input_data.T.iteritems():
        if row['area'] not in model.hierarchy:
            print 'WARNING: "%s" not in model hierarchy, skipping random effects for this observation' % row[
                'area']
            continue

        for level, node in enumerate(
                nx.shortest_path(model.hierarchy, 'all',
                                 input_data.ix[i, 'area'])):
            model.hierarchy.node[node]['level'] = level
            U.ix[i, node] = 1.

    for n2 in model.hierarchy.nodes():
        for level, node in enumerate(
                nx.shortest_path(model.hierarchy, 'all', n2)):
            model.hierarchy.node[node]['level'] = level

    #U = U.select(lambda col: U[col].std() > 1.e-5, axis=1)  # drop constant columns
    if len(U.index) == 0:
        U = pandas.DataFrame()
    else:
        U = U.select(
            lambda col: (U[col].max() > 0) and (model.hierarchy.node[col].get(
                'level') > model.hierarchy.node[root_area]['level']),
            axis=1
        )  # drop columns with only zeros and which are for higher levels in hierarchy
        #U = U.select(lambda col: model.hierarchy.node[col].get('level') <= 2, axis=1)  # drop country-level REs
        #U = U.drop(['super-region_0', 'north_america_high_income', 'USA'], 1)

        #U = U.drop(['super-region_0', 'north_america_high_income'], 1)
        #U = U.drop(U.columns, 1)

        ## drop random effects with fewer than one observation, or with every observation set to 1, unless they have an informative prior
        keep = []
        if 'random_effects' in parameters:
            for re in parameters['random_effects']:
                if parameters['random_effects'][re].get('dist') == 'Constant':
                    keep.append(re)
        U = U.select(
            lambda col: 1 <= U[col].sum() < len(U[col]) or col in keep, axis=1)

    U_shift = pandas.Series(0., index=U.columns)
    for level, node in enumerate(
            nx.shortest_path(model.hierarchy, 'all', root_area)):
        if node in U_shift:
            U_shift[node] = 1.
    U = U - U_shift

    sigma_alpha = []
    for i in range(5):  # max depth of hierarchy is 5
        effect = 'sigma_alpha_%s_%d' % (name, i)
        if 'random_effects' in parameters and effect in parameters[
                'random_effects']:
            prior = parameters['random_effects'][effect]
            print 'using stored RE hyperprior for', effect, prior
            sigma_alpha.append(
                MyTruncatedNormal(effect,
                                  prior['mu'],
                                  pl.maximum(prior['sigma'], .001)**-2,
                                  min(prior['mu'], prior['lower']),
                                  max(prior['mu'], prior['upper']),
                                  value=prior['mu']))
        else:
            sigma_alpha.append(
                MyTruncatedNormal(effect, .05, .03**-2, .05, .5, value=.1))

    alpha = pl.array([])
    const_alpha_sigma = pl.array([])
    alpha_potentials = []
    if len(U.columns) > 0:
        tau_alpha_index = []
        for alpha_name in U.columns:
            tau_alpha_index.append(model.hierarchy.node[alpha_name]['level'])
        tau_alpha_index = pl.array(tau_alpha_index, dtype=int)

        tau_alpha_for_alpha = [sigma_alpha[i]**-2 for i in tau_alpha_index]

        alpha = []
        for i, tau_alpha_i in enumerate(tau_alpha_for_alpha):
            effect = 'alpha_%s_%s' % (name, U.columns[i])
            if 'random_effects' in parameters and U.columns[i] in parameters[
                    'random_effects']:
                prior = parameters['random_effects'][U.columns[i]]
                print 'using stored RE for', effect, prior
                if prior['dist'] == 'Normal':
                    alpha.append(
                        mc.Normal(effect,
                                  prior['mu'],
                                  pl.maximum(prior['sigma'], .001)**-2,
                                  value=0.))
                elif prior['dist'] == 'TruncatedNormal':
                    alpha.append(
                        MyTruncatedNormal(effect,
                                          prior['mu'],
                                          pl.maximum(prior['sigma'], .001)**-2,
                                          prior['lower'],
                                          prior['upper'],
                                          value=0.))
                elif prior['dist'] == 'Constant':
                    alpha.append(float(prior['mu']))
                else:
                    assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior[
                        'dist']
            else:
                alpha.append(mc.Normal(effect, 0, tau=tau_alpha_i, value=0))

        # sigma for "constant" alpha
        const_alpha_sigma = []
        for i, tau_alpha_i in enumerate(tau_alpha_for_alpha):
            effect = 'alpha_%s_%s' % (name, U.columns[i])
            if 'random_effects' in parameters and U.columns[i] in parameters[
                    'random_effects']:
                prior = parameters['random_effects'][U.columns[i]]
                if prior['dist'] == 'Constant':
                    const_alpha_sigma.append(float(prior['sigma']))
                else:
                    const_alpha_sigma.append(pl.nan)
            else:
                const_alpha_sigma.append(pl.nan)

        if zero_re:
            column_map = dict([(n, i) for i, n in enumerate(U.columns)])
            # change one stoch from each set of siblings in area hierarchy to a 'sum to zero' deterministic
            for parent in model.hierarchy:
                node_names = model.hierarchy.successors(parent)
                nodes = [column_map[n] for n in node_names if n in U]
                if len(nodes) > 0:
                    i = nodes[0]
                    old_alpha_i = alpha[i]

                    # do not change if prior for this node has dist='constant'
                    if parameters.get('random_effects',
                                      {}).get(U.columns[i],
                                              {}).get('dist') == 'Constant':
                        continue

                    alpha[i] = mc.Lambda(
                        'alpha_det_%s_%d' % (name, i),
                        lambda other_alphas_at_this_level=[alpha[n] for n in nodes[1:]]:
                            -sum(other_alphas_at_this_level))

                    if isinstance(old_alpha_i, mc.Stochastic):

                        @mc.potential(name='alpha_pot_%s_%s' %
                                      (name, U.columns[i]))
                        def alpha_potential(alpha=alpha[i],
                                            mu=old_alpha_i.parents['mu'],
                                            tau=old_alpha_i.parents['tau']):
                            return mc.normal_like(alpha, mu, tau)

                        alpha_potentials.append(alpha_potential)

    # make X and beta
    X = input_data.select(lambda col: col.startswith('x_'), axis=1)

    # add sex as a fixed effect (TODO: decide if this should be in data.py, when loading gbd model)
    X['x_sex'] = [sex_value[row['sex']] for i, row in input_data.T.iteritems()]

    beta = pl.array([])
    const_beta_sigma = pl.array([])
    X_shift = pandas.Series(0., index=X.columns)
    if len(X.columns) > 0:
        # shift columns to have zero for root covariate
        try:
            output_template = model.output_template.groupby([
                'area', 'sex', 'year'
            ]).mean(
            )  # TODO: change to .first(), but that doesn't work with old pandas
        except pandas.core.groupby.DataError:
            output_template = model.output_template.groupby(
                ['area', 'sex', 'year']).first()
        covs = output_template.filter(list(X.columns) + ['pop'])
        if len(covs.columns) > 1:
            leaves = [
                n for n in nx.traversal.bfs_tree(model.hierarchy, root_area)
                if model.hierarchy.successors(n) == []
            ]
            if len(leaves) == 0:
                # networkx returns an empty list when the bfs tree is a single node
                leaves = [root_area]

            if root_sex == 'total' and root_year == 'all':  # special case for all years and sexes
                covs = covs.delevel().drop([
                    'year', 'sex'
                ], axis=1).groupby('area').mean(
                )  # TODO: change to .reset_index(), but that doesn't work with old pandas
                leaf_covs = covs.ix[leaves]
            elif root_sex == 'total':
                raise Exception, 'root_sex == total, root_year != all is Not Yet Implemented'
            elif root_year == 'all':
                raise Exception, 'root_year == all, root_sex != total is Not Yet Implemented'
            else:
                leaf_covs = covs.ix[[(l, root_sex, root_year) for l in leaves]]

            for cov in covs:
                if cov != 'pop':
                    X_shift[cov] = (leaf_covs[cov] * leaf_covs['pop']
                                    ).sum() / leaf_covs['pop'].sum()

        if 'x_sex' in X.columns:
            X_shift['x_sex'] = sex_value[root_sex]

        X = X - X_shift

        assert not pl.any(pl.isnan(
            X.__array__())), 'Covariate matrix should have no missing values'

        beta = []
        for i, effect in enumerate(X.columns):
            name_i = 'beta_%s_%s' % (name, effect)
            if 'fixed_effects' in parameters and effect in parameters[
                    'fixed_effects']:
                prior = parameters['fixed_effects'][effect]
                print 'using stored FE for', name_i, effect, prior
                if prior['dist'] == 'TruncatedNormal':
                    beta.append(
                        MyTruncatedNormal(
                            name_i,
                            mu=float(prior['mu']),
                            tau=pl.maximum(prior['sigma'], .001)**-2,
                            a=prior['lower'],
                            b=prior['upper'],
                            value=.5 * (prior['lower'] + prior['upper'])))
                elif prior['dist'] == 'Normal':
                    beta.append(
                        mc.Normal(name_i,
                                  mu=float(prior['mu']),
                                  tau=pl.maximum(prior['sigma'], .001)**-2,
                                  value=float(prior['mu'])))
                elif prior['dist'] == 'Constant':
                    beta.append(float(prior['mu']))
                else:
                    assert 0, 'ERROR: prior distribution "%s" is not implemented' % prior[
                        'dist']
            else:
                beta.append(mc.Normal(name_i, mu=0., tau=1.**-2, value=0))

        # sigma for "constant" beta
        const_beta_sigma = []
        for i, effect in enumerate(X.columns):
            name_i = 'beta_%s_%s' % (name, effect)
            if 'fixed_effects' in parameters and effect in parameters[
                    'fixed_effects']:
                prior = parameters['fixed_effects'][effect]
                if prior['dist'] == 'Constant':
                    const_beta_sigma.append(float(prior.get('sigma', 1.e-6)))
                else:
                    const_beta_sigma.append(pl.nan)
            else:
                const_beta_sigma.append(pl.nan)

    @mc.deterministic(name='pi_%s' % name)
    def pi(mu=mu,
           U=pl.array(U, dtype=float),
           alpha=alpha,
           X=pl.array(X, dtype=float),
           beta=beta):
        return mu * pl.exp(
            pl.dot(U, [float(x)
                       for x in alpha]) + pl.dot(X, [float(x) for x in beta]))

    return dict(pi=pi,
                U=U,
                U_shift=U_shift,
                sigma_alpha=sigma_alpha,
                alpha=alpha,
                alpha_potentials=alpha_potentials,
                X=X,
                X_shift=X_shift,
                beta=beta,
                hierarchy=model.hierarchy,
                const_alpha_sigma=const_alpha_sigma,
                const_beta_sigma=const_beta_sigma)
def process_summary(summary_filename):
    if ('fake' in summary_filename) or \
            ('H3' in summary_filename) or \
            ('H4' in summary_filename) or \
            ('H7' in summary_filename) or \
            ('H8' in summary_filename):
        logging.debug("Skipping %s" % summary_filename)
        return
    summary = physio.summary.Summary(summary_filename)
    logging.debug("Processing %s" % summary._filename)

    # cull trials by success
    trials = summary.get_trials()
    if len(trials) == 0:
        logging.error("No trails for %s" % summary._filename)
        return
    trials = trials[trials['outcome'] == 0]
    # and gaze
    gaze = clean_gaze(summary.get_gaze())

    if len(gaze) > 0:
        logging.debug("N Trials before gaze culling: %i" % len(trials))
        trials = cull_trials_by_gaze(trials, gaze)
        logging.debug("N Trials after gaze culling: %i" % len(trials))

    for ch in xrange(1, 33):
        for cl in summary.get_cluster_indices(ch):
            outdir = '%s/%s_%i_%i' % \
                    (resultsdir, os.path.basename(summary._filename), ch, cl)

            info_dict = {}

            logging.debug("ch: %i, cl: %i" % (ch, cl))
            # rate
            spike_times = summary.get_spike_times(ch, cl)

            # find start of isolation
            isolation_start = physio.spikes.times.\
                    find_isolation_start_by_isi(spike_times)
            spike_times = spike_times[spike_times >= isolation_start]

            nspikes = len(spike_times)
            info_dict['nspikes'] = nspikes
            if nspikes < min_spikes:
                logging.warning("\t%i < min_spikes[%i]" % \
                        (nspikes, min_spikes))
                continue
            trange = (spike_times.min(), spike_times.max())
            # trange = summary.get_epoch_range()
            rate = nspikes / (trange[1] - trange[0])
            info_dict['rate'] = rate
            if rate < min_rate:
                logging.warning("\t%i < min_rate[%i]" % \
                        (rate, min_rate))
                continue

            # filter trials
            dtrials = summary.filter_trials(trials, \
                    {'name': {'value': 'BlueSquare', 'op': '!='}}, \
                    timeRange=trange)
            if len(dtrials) == 0:
                logging.error("Zero trials for %i %i %s" % \
                        (ch, cl, summary._filename))
                continue

            # snr TODO

            # location
            try:
                location = summary.get_location(ch)
            except Exception as E:
                location = (0, 0, 0)
                print "Attempt to get location failed: %s" % str(E)
            info_dict['location'] = list(location)

            # significant bins
            #bins = summary.get_significant_bins(ch, cl, attr="name", \
            #        blacklist="BlueSquare", spike_times=spike_times, \
            #        timeRange=trange)
            if default_bins is None:
                bins = summary.get_significant_bins(ch, cl, trials=dtrials, \
                        spike_times=spike_times)
            else:
                bins = default_bins
            info_dict['bins'] = bins

            baseline = summary.get_baseline(ch, cl, prew, trials=trials, \
                    spike_times=spike_times)
            info_dict['baseline'] = baseline

            # selectivity
            #resps, means, stds, ns = summary.get_binned_response( \
            #        ch, cl, 'name', bins=bins, spike_times=spike_times, \
            #        blacklist="BlueSquare", timeRange=trange)
            resps, means, stds, ns = summary.get_binned_response( \
                    ch, cl, 'name', bins=bins, spike_times=spike_times, \
                    trials=dtrials, timeRange=trange)
            if len(resps) == 0:
                logging.warning("No responses")
                continue
            sel_index = physio.spikes.selectivity.selectivity(resps.values())
            #if numpy.isnan(sel_index):
            #    raise Exception("Selectivity is nan")
            sorted_names = sorted(resps, key=lambda k: resps[k])
            info_dict['selectivity'] = sel_index
            info_dict['sorted_names'] = sorted_names

            if not os.path.exists(outdir):
                os.makedirs(outdir)
            with open(outdir + '/info_dict.p', 'wb') as f:
                pickle.dump(info_dict, f, 2)

            with open(outdir + '/sel_info.p', 'wb') as f:
                pickle.dump({'resps': resps, 'means': means, 'stds': stds, \
                        'ns': ns}, f, 2)

            x = pylab.arange(len(resps))
            y = pylab.zeros(len(resps))
            err = pylab.zeros(len(resps))
            pylab.figure(1)
            for (i, name) in enumerate(sorted_names):
                y[i] = resps[name]
                # TODO fix this to be something reasonable
                #err[i] = (pylab.sum(stds[name][bins]) / float(len(bins))) / \
                #        pylab.sqrt(ns[name])
                err[i] = 0
            pylab.errorbar(x, y, err)
            xl = pylab.xlim()
            pylab.xticks(x, sorted_names)
            pylab.xlim(xl)
            pylab.ylabel('average binned response')
            pylab.title('Selectivity: %.2f' % sel_index)
            pylab.savefig(outdir + '/by_name.png')
            pylab.close(1)

            # separability
            # get stims without bluesquare
            stims = summary.get_stimuli({'name': \
                    {'value': 'BlueSquare', 'op': '!='}})
            attr_combinations = {}
            sep_info = {}
            for (ai, attr1) in enumerate(attrs[:-1]):
                uniques1 = numpy.unique(stims[attr1])
                for attr2 in attrs[ai + 1:]:
                    uniques2 = numpy.unique(stims[attr2])
                    if attr1 == attr2:
                        continue
                    M = summary.get_response_matrix(ch, cl, attr1, attr2, \
                            bins=bins, spike_times=spike_times, stims=stims, \
                            uniques1=uniques1, uniques2=uniques2, \
                            timeRange=trange, trials=dtrials)
                    if M.shape[0] == 1 or M.shape[1] == 1:
                        logging.warning("M.shape %s, skipping" % \
                                str(M.shape))
                        continue
                    sep, spi, ps = physio.spikes.separability.\
                            separability_permutation(M)
                    if not pylab.any(pylab.isnan(M)):
                        pylab.figure(1)
                        pylab.imshow(M, interpolation='nearest')
                        pylab.colorbar()
                        pylab.xlabel(attr2)
                        xl = pylab.xlim()
                        yl = pylab.ylim()
                        pylab.xticks(range(len(uniques2)), uniques2)
                        pylab.ylabel(attr1)
                        pylab.yticks(range(len(uniques1)), uniques1)
                        pylab.xlim(xl)
                        pylab.ylim(yl)
                        pylab.title('Sep: %s, %.4f, (%.3f, %.3f)' % \
                                (str(sep), spi, ps[0], ps[1]))
                        pylab.savefig(outdir + '/%s_%s.png' % \
                                (attr1, attr2))
                        pylab.close(1)
                    sep_info['_'.join((attr1, attr2))] = { \
                            'sep': sep, 'spi': spi, 'ps': ps}

            with open(outdir + '/sep_info.p', 'wb') as f:
                pickle.dump(sep_info, f, 2)

            # compute separability at each name
            name_sep_info = {}
            for name in sorted_names:
                stims = summary.get_stimuli({'name': name})
                for (ai, attr1) in enumerate(attrs[:-1]):
                    uniques1 = numpy.unique(stims[attr1])
                    for attr2 in attrs[ai + 1:]:
                        uniques2 = numpy.unique(stims[attr2])
                        if attr1 == attr2 or \
                                attr1 == 'name' or attr2 == 'name':
                            continue
                        M = summary.get_response_matrix(ch, cl, attr1, \
                                attr2, bins=bins, spike_times=spike_times,\
                                stims=stims, uniques1=uniques1, \
                                uniques2=uniques2, timeRange=trange, \
                                trials=dtrials)
                        if M.shape[0] == 1 or M.shape[1] == 1:
                            logging.debug("M.shape incompatible" \
                                    " with separability: %s" % \
                                    str(M.shape))
                            continue
                        else:
                            sep, spi, ps = physio.spikes.separability.\
                                    separability_permutation(M)
                            if not pylab.any(pylab.isnan(M)):
                                pylab.figure(1)
                                pylab.imshow(M, interpolation='nearest')
                                pylab.colorbar()
                                pylab.xlabel(attr2)
                                xl = pylab.xlim()
                                yl = pylab.ylim()
                                pylab.xticks(range(len(uniques2)), uniques2)
                                pylab.ylabel(attr1)
                                pylab.yticks(range(len(uniques1)), uniques1)
                                pylab.xlim(xl)
                                pylab.ylim(yl)
                                pylab.title('Sep: %s, %.4f, (%.3f, %.3f)' \
                                        % (str(sep), spi, ps[0], ps[1]))
                                pylab.savefig(outdir + '/%s_%s_%s.png' % \
                                        (name, attr1, attr2))
                                pylab.close(1)
                            name_sep_info['_'.join((name, attr1, attr2))] \
                                    = {'sep': sep, 'spi': spi, 'ps': ps}

            with open(outdir + '/name_sep_info.p', 'wb') as f:
                pickle.dump(name_sep_info, f, 2)
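# A stripped-down version (invented data) of the by-name response plot that
# process_summary() saves: sorted responses on categorical x ticks, drawn
# with pylab.errorbar.  In the real script, resps comes from
# summary.get_binned_response.
import pylab

resps = {'stimA': 4.2, 'stimB': 1.1, 'stimC': 2.7}
sorted_names = sorted(resps, key=lambda k: resps[k])

x = pylab.arange(len(resps))
y = pylab.array([resps[n] for n in sorted_names])
err = pylab.zeros(len(resps))      # placeholder errors, as in the original

pylab.errorbar(x, y, err)
xl = pylab.xlim()
pylab.xticks(x, sorted_names)
pylab.xlim(xl)
pylab.ylabel('average binned response')
pylab.savefig('by_name_demo.png')
pylab.close()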