Example No. 1
 def use_cache(self, dir):
     ''' Use cached data from disk instead of querying mysql for the latest version '''
     try:
         self.prediction_matrix = pl.csv2rec(dir + 'prediction_matrix_' + self.cause + '_' + self.sex + '.csv')
         self.observation_matrix = pl.csv2rec(dir + 'observation_matrix_' + self.cause + '_' + self.sex + '.csv')
     except IOError:
         raise IOError('No cached data found.')
     self.data_rows = self.observation_matrix.shape[0]
     self.country_list = np.unique(self.prediction_matrix.country)
     self.region_list = np.unique(self.prediction_matrix.region)
     self.super_region_list = np.unique(self.prediction_matrix.super_region)
     self.age_list = np.unique(self.prediction_matrix.age)
     self.year_list = np.unique(self.prediction_matrix.year)
     self.covariate_dict = {'x0': 'constant'}
     for i in range(len(self.covariate_list)):
         self.covariate_dict['x' + str(i+1)] = self.covariate_list[i]
     if self.age_dummies == True:
         pre_ref = 1
         for i,j in enumerate(self.age_list):
             if j == self.age_ref:
                 pre_ref = 0
             elif pre_ref == 1:
                 self.covariate_dict['x' + str(len(self.covariate_list)+i+1)] = 'Age ' + str(j)
             else:
                 self.covariate_dict['x' + str(len(self.covariate_list)+i)] = 'Age ' + str(j)
     self.training_split()
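Note that pl.csv2rec comes from matplotlib.mlab, which has since been removed from matplotlib. A minimal replacement sketch, assuming pandas is available (the helper name csv_to_recarray is made up for illustration):

import pandas as pd

def csv_to_recarray(path):
    # Read a CSV into a numpy.recarray so that field access such as rec.country,
    # rec.age and rec.year keeps working the way it does with csv2rec above.
    return pd.read_csv(path).to_records(index=False)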
Example No. 2
def evaluate_model(mod, comment='', data_fname='missing_noisy_data.csv', truth_fname='data.csv'):
    """ Run specified model on existing data (data.csv / missing_noisy_data.csv) and save results in dev_log.csv
    Existing models: %s """ % data_run_models
    if mod not in data_run_models.split(' '):
        raise TypeError, 'Unrecognized model "%s"; must be one of %s' % (mod, data_run_models)

    import model
    reload(model)

    print 'loading data'
    data = pl.csv2rec(data_fname)
    truth = pl.csv2rec(truth_fname)
    
    t0 = time.time()
    print 'generating model'
    mod_mc = eval('model.%s(data)' % mod)

    print 'fitting model with mcmc'
    mod_mc.sample(10000, 5000, 50, verbose=1)
    t1 = time.time()

    print 'summarizing results'

    import graphics
    reload(graphics)
    pl.figure(figsize=(22, 17), dpi=300)
    pl.clf()
    graphics.plot_all_predictions_over_time(data, mod_mc.predicted, more_data=truth)

    data_stats = mod_mc.data_predicted.stats()
    i_out = [i for i in range(len(data)) if pl.isnan(data.y[i])]
    rmse_abs_out = pl.rms_flat(truth.y[i_out] - data_stats['mean'][i_out])
    rmse_rel_out = 100*pl.rms_flat(1. - data_stats['mean'][i_out]/truth.y[i_out])

    i_in = [i for i in range(len(data)) if not pl.isnan(data.y[i])]
    rmse_abs_in = pl.rms_flat(truth.y[i_in] - data_stats['mean'][i_in])
    rmse_rel_in = 100*pl.rms_flat(1. - data_stats['mean'][i_in]/truth.y[i_in])

    param_stats = mod_mc.param_predicted.stats()
    coverage = 100*pl.sum((truth.y[i_out] >= param_stats['95% HPD interval'][i_out, 0]) & (truth.y[i_out] <= param_stats['95% HPD interval'][i_out, 1])) / float(len(i_out))

    import md5
    data_hash = md5.md5(data).hexdigest()
    results = [mod, t1-t0, rmse_abs_out, rmse_rel_out, rmse_abs_in, rmse_rel_in, coverage,
               len(data), len(pl.unique(data.region)), len(pl.unique(data.country)), len(pl.unique(data.year)), len(pl.unique(data.age)), data_hash,
               t0, comment]
    print '%s: time: %.0fs out-of-samp rmse abs=%.1f rel=%.0f in-samp rmse abs=%.1f rel=%.0f coverage=%.0f\ndata: %d rows; %d regions, %d countries %d years %d ages [data hash: %s]\n(run conducted at %f)\n%s' % tuple(results)

    pl.savefig('/home/j/Project/Models/space-time-smoothing/images/%s.png' % t0)  # FIXME: don't hardcode path for saving images

    import csv
    f = open('dev_log.csv', 'a')
    f_csv = csv.writer(f)
    f_csv.writerow(results)
    f.close()

    return mod_mc
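A minimal call sketch, assuming the default data files exist and that 'gp_re_a' is one of the names listed in data_run_models:

# fits the model, appends one results row to dev_log.csv, and returns the PyMC object
mod_mc = evaluate_model('gp_re_a', comment='baseline run on the default knockout data')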
Example No. 3
def get_cod_data_all_causes(iso3='USA', age_group='1_4', sex='F'):
    """ TODO: write doc string for this function"""
    print 'loading', iso3, age_group, sex
    import glob

    cause_list = []
    fpath = '/home/j/Project/Causes of Death/Under Five Deaths/CoD Correct Input Data/v02_prep_%s/%s+*+%s+%s.csv' % (
        iso3, iso3, age_group, sex)
    #fpath = '/home/j/Project/GBD/dalynator/data/cod_correct_input_pos/run_9_cause_*.csv'  # use Mike's validation data
    fnames = glob.glob(fpath)

    # initialize input distribution array
    N = 990  # TODO: get this from the data files
    T = 32  # TODO: get this from the data files
    J = len(fnames)
    F = pl.zeros((N, T, J))

    # fill input distribution array with data from files
    for j, fname in enumerate(sorted(fnames)):
        cause = fname.split('+')[1]  # TODO: make this less brittle and clearer
        #cause = str(j) # use Mike's validation data causes
        print 'loading cause', cause
        F_j = pl.csv2rec(fname)

        for n in range(N):
            F[n, :, j] = F_j['ensemble_d%d' % (n + 1)] / F_j['envelope']
            #F[n, :, j] = F_j['d%d'%(n+1)]/F_j['envelope'] # use Mike's validation data

        assert not pl.any(
            pl.isnan(F)), '%s should have no missing values' % fname
        cause_list.append(cause)

    print 'loading complete'
    return F, cause_list
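A usage sketch; F is a (draws, time points, causes) array of cause fractions and cause_list records the cause order:

F, cause_list = get_cod_data_all_causes(iso3='USA', age_group='1_4', sex='F')
print F.shape        # (990, 32, number of cause files found)
print cause_list[:5] # first few cause names parsed from the file names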
Example No. 4
def make_model(lon,lat,t,input_data,covariate_keys,n,datatype,
                genaa,genab,genbb,gen00,gena0,genb0,gena1,genb1,gen01,gen11,
                pheab,phea,pheb,
                phe0,prom0,promab,
                aphea,aphe0,
                bpheb,bphe0,
                vivax_pos,vivax_neg,
                lo_age, up_age,
                cpus=1):
    """
    This function is required by the generic MBG code.
    """
    
    ra = csv2rec(input_data)
    
    # Step method granularity    
    grainsize = 20
    
    where_vivax = np.where(datatype=='vivax')
    from dufvax import disttol, ttol
    
    # Duffy needs to be modelled everywhere Duffy or Vivax is observed.
    # Vivax only needs to be modelled where Vivax is observed.
    # Complication: Vivax can have multiple co-located observations at different times,
    # all corresponding to the same Duffy observation.
    duffy_data_mesh, duffy_logp_mesh, duffy_fi, duffy_ui, duffy_ti = uniquify_tol(disttol,ttol,lon,lat)
    duffy_data_mesh = np.hstack((duffy_data_mesh, np.atleast_2d(t).T))
    duffy_logp_mesh = np.hstack((duffy_logp_mesh, np.atleast_2d(t[duffy_ui]).T))
    vivax_data_mesh, vivax_logp_mesh, vivax_fi, vivax_ui, vivax_ti = uniquify_tol(disttol,ttol,lon[where_vivax],lat[where_vivax],t[where_vivax])
    
    # Create the mean & its evaluation at the data locations.
    init_OK = False
    
    # Probability of mutation in the promoter region, given that the other thing is a.
    p1 = pm.Uniform('p1', 0, .04, value=.01)
    
    covariate_key_dict = {'v': set(covariate_keys), 'b': ['africa'], '0':[]}
    ui_dict = {'v': vivax_ui, 'b': duffy_ui, '0': duffy_ui}
    
    
    logp_mesh_dict = {'b': duffy_logp_mesh, '0': duffy_logp_mesh, 'v': vivax_logp_mesh}
    temporal_dict = {'b': False, '0': False, 'v': True}
    
    init_OK = False
    while not init_OK:
        try:
            spatial_vars = zipmap(lambda k: covariance_submodel(k, ra, logp_mesh_dict[k], covariate_key_dict[k], ui_dict[k], temporal_dict[k]), ['b','0','v'])
            sp_sub = zipmap(lambda k: spatial_vars[k]['sp_sub'], ['b','0','v'])
            sp_sub_b, sp_sub_0, sp_sub_v = [sp_sub[k] for k in ['b','0','v']]
            V = zipmap(lambda k: spatial_vars[k]['V'], ['b','0','v'])
            V_b, V_0, V_v = [V[k] for k in ['b','0','v']]
            tau = zipmap(lambda k: 1./spatial_vars[k]['V'], ['b','0','v'])
        
            # Loop over data clusters, adding nugget and applying link function.
            f = zipmap(lambda k: spatial_vars[k]['sp_sub'].f_eval, ['b','0','v'])
            init_OK = True
        except pm.ZeroProbability, msg:
            print 'Trying again: %s'%msg
            init_OK = False
            gc.collect()        
Example No. 5
def get_cod_data_all_causes(iso3='USA', age_group='1_4', sex='F'):
    """ TODO: write doc string for this function"""
    print 'loading', iso3, age_group, sex
    import glob
    
    cause_list = []
    fpath = '/home/j/Project/Causes of Death/Under Five Deaths/CoD Correct Input Data/v02_prep_%s/%s+*+%s+%s.csv' % (iso3, iso3, age_group, sex)
    #fpath = '/home/j/Project/GBD/dalynator/data/cod_correct_input_pos/run_9_cause_*.csv'  # use Mike's validation data
    fnames = glob.glob(fpath)

    # initialize input distribution array
    N = 990  # TODO: get this from the data files
    T = 32  # TODO: get this from the data files
    J = len(fnames)
    F = pl.zeros((N, T, J))

    # fill input distribution array with data from files
    for j, fname in enumerate(sorted(fnames)):
        cause = fname.split('+')[1]  # TODO: make this less brittle and clearer
        #cause = str(j) # use Mike's validation data causes
        print 'loading cause', cause
        F_j = pl.csv2rec(fname)

        for n in range(N):
            F[n, :, j] = F_j['ensemble_d%d'%(n+1)]/F_j['envelope']
            #F[n, :, j] = F_j['d%d'%(n+1)]/F_j['envelope'] # use Mike's validation data

        assert not pl.any(pl.isnan(F)), '%s should have no missing values' % fname
        cause_list.append(cause)
    
    print 'loading complete'
    return F, cause_list
Example No. 6
def combine_output(J, T, model, dir, reps, save=False):
    """
    Combine output on absolute error, relative error, csmf_accuracy, and coverage from
    multiple runs of validate_once. Either saves the output to disk, or returns arrays
    for each.
    """

    cause = pl.zeros(J*T, dtype='f').view(pl.recarray)
    time = pl.zeros(J*T, dtype='f').view(pl.recarray)
    abs_err = pl.zeros(J*T, dtype='f').view(pl.recarray) 
    rel_err = pl.zeros(J*T, dtype='f').view(pl.recarray)
    coverage = pl.zeros(J*T, dtype='f').view(pl.recarray)
    csmf_accuracy = pl.zeros(J*T, dtype='f').view(pl.recarray)

    for i in range(reps): 
        metrics = pl.csv2rec('%s/metrics_%s_%i.csv' % (dir, model, i))
        cause = pl.vstack((cause, metrics.cause))
        time = pl.vstack((time, metrics.time))
        abs_err = pl.vstack((abs_err, metrics.abs_err))
        rel_err = pl.vstack((rel_err, metrics.rel_err))
        coverage = pl.vstack((coverage, metrics.coverage))
        csmf_accuracy = pl.vstack((csmf_accuracy, metrics.csmf_accuracy))

    cause = cause[1:,]
    time = time[1:,]    
    abs_err = abs_err[1:,]
    rel_err = rel_err[1:,]
    coverage = coverage[1:,]
    csmf_accuracy = csmf_accuracy[1:,]

    mean_abs_err = abs_err.mean(0)
    median_abs_err =  pl.median(abs_err, 0)
    mean_rel_err = rel_err.mean(0)
    median_rel_err = pl.median(rel_err, 0)
    mean_csmf_accuracy = csmf_accuracy.mean(0)
    median_csmf_accuracy = pl.median(csmf_accuracy, 0)
    mean_coverage_bycause = coverage.mean(0)
    mean_coverage = coverage.reshape(reps, T, J).mean(0).mean(1)
    percent_total_coverage = (coverage.reshape(reps, T, J).sum(2)==3).mean(0)
    mean_coverage = pl.array([[i for j in range(J)] for i in mean_coverage]).ravel()
    percent_total_coverage = pl.array([[i for j in range(J)] for i in percent_total_coverage]).ravel()

    models = pl.array([[model for j in range(J)] for i in range(T)]).ravel()
    true_cf = metrics.true_cf
    true_std = metrics.true_std
    std_bias = metrics.std_bias

    all = pl.np.core.records.fromarrays([models, cause[0], time[0], true_cf, true_std, std_bias, mean_abs_err, median_abs_err, mean_rel_err, median_rel_err, 
                                         mean_csmf_accuracy, median_csmf_accuracy, mean_coverage_bycause, mean_coverage, percent_total_coverage], 
                                        names=['model', 'cause', 'time', 'true_cf', 'true_std', 'std_bias', 'mean_abs_err', 'median_abs_err', 
                                         'mean_rel_err', 'median_rel_err', 'mean_csmf_accuracy', 'median_csmf_accuracy', 
                                         'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'])
    
    if save: 
        pl.rec2csv(all, '%s/%s_summary.csv' % (dir, model)) 
    else: 
        return all
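A usage sketch, assuming ten replicates of a hypothetical model named 'rf' were saved as metrics_rf_0.csv ... metrics_rf_9.csv under ./validation:

# J causes by T time points per replicate; save=True writes ./validation/rf_summary.csv
combine_output(J=3, T=10, model='rf', dir='./validation', reps=10, save=True)

# or keep the summary record array in memory instead of writing it out
summary = combine_output(J=3, T=10, model='rf', dir='./validation', reps=10)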
Example No. 7
def fit_and_plot(
        mod,
        data_fname='irq_5q0.csv',
        image_fname='/home/j/Project/Models/space-time-smoothing/irq_test/5q0.%s.png',
        comment='',
        iter=40000):
    import model
    reload(model)

    data = pl.csv2rec(data_fname)

    # FIXME: this makes a big difference, but I don't understand why it would (could be prior on gp amp)
    data.x1 = (data.x1 - 1990.) / 10.  # crude normalization of year data

    print 'generating model'
    mod_mc = eval('model.%s(data)' % mod)

    print 'fitting model with mcmc'
    mod_mc.sample(iter, iter / 2, iter / 2000, verbose=1)

    print 'summarizing results'

    import graphics
    reload(graphics)
    pl.figure()
    pl.clf()
    graphics.plot_prediction_over_time('IRQ',
                                       data,
                                       mod_mc.predicted,
                                       age=-1,
                                       cmap=pl.cm.RdYlBu,
                                       connected=False,
                                       jittered_posterior=False)
    graphics.plot_prediction_over_time('IRQ',
                                       data[:40],
                                       mod_mc.param_predicted,
                                       age=-1)

    #pl.plot(data.year, data.y, zorder=0,
    #        linestyle='', marker='x', mew=3, color='r', ms=8, alpha=.5)
    pl.title('IRQ')
    pl.xlabel('Time (Years)')
    pl.ylabel('$\log(_5q_0)$')
    pl.axis([1945, 2030, -1.8, -.5])
    pl.figtext(0, 1, '\n %s' % comment, va='top', ha='left')
    t1 = time.time()
    pl.savefig(image_fname % t1)

    try:
        for stoch in 'beta gamma sigma_f tau_f'.split(' '):
            print '%s =\n    %s\n' % (
                stoch, mean_w_ui(mod_mc.__getattribute__(stoch)))
    except AttributeError:
        pass

    return mod_mc
Example No. 8
def brt_doublecheck(fname, brt_evaluator, brt_results):
    """
    Computes the 'fit' element of a gbm.object and compares it
    with that stored in the gbm.object.
    """
    ures = unpack_gbm_object(brt_results, 'fit')
    data = csv2rec(os.path.join('anopheles-caches', fname))
    ddict = dict([(k, data[k]) for k in data.dtype.names[1:]])
    out = brt_evaluator(ddict)

    print np.abs(out-ures).max()
Example No. 9
def load_stimuli_raw():
    folder = os.path.join(wd_dir, "stimuli")
    stim_file_re = r'lexique_nletters_4_block_(\d)\.txt$'
    stim_file_fil = re_filter(stim_file_re)

    stim_file_names = filter(stim_file_fil,
                             glob.glob(os.path.join(folder, "*")))

    stim_file_names = sorted(stim_file_names)

    return np.concatenate(
        [pl.csv2rec(filename, delimiter="\t") for filename in stim_file_names])
Example No. 10
def knockout_uniformly_at_random(in_fname='noisy_data.csv', out_fname='missing_noisy_data.csv', pct=20.):
    """ replace data.csv y column with uniformly random missing entries

    Parameters
    ----------
    pct : float, percent to knockout
    """
    data = pl.csv2rec(in_fname)
    for i, row in enumerate(data):
        if pl.rand() < pct/100.:
            data[i].y = pl.nan
    pl.rec2csv(data, out_fname)
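A usage sketch with the default file names from the signature; the input file is assumed to have a y column and pylab to be imported as pl, as above:

knockout_uniformly_at_random(in_fname='noisy_data.csv',
                             out_fname='missing_noisy_data.csv', pct=20.)
check = pl.csv2rec('missing_noisy_data.csv')
print 'fraction knocked out:', pl.isnan(check.y).mean()  # should be near 0.20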
Example No. 11
File: data.py Project: flaxter/gbd
def knockout_uniformly_at_random(in_fname='noisy_data.csv', out_fname='missing_noisy_data.csv', pct=20.):
    """ replace data.csv y column with uniformly random missing entries

    Parameters
    ----------
    pct : float, percent to knockout
    """
    data = pl.csv2rec(in_fname)
    for i, row in enumerate(data):
        if pl.rand() < pct/100.:
            data[i].y = pl.nan
    pl.rec2csv(data, out_fname)
Example No. 12
def load_stimuli_raw():
    folder = os.path.join(wd_dir, "stimuli")
    stim_file_re = r'lexique_nletters_4_block_(\d)\.txt$'
    stim_file_fil = re_filter(stim_file_re)

    stim_file_names = filter(stim_file_fil, glob.glob(
        os.path.join(folder, "*")))

    stim_file_names = sorted(stim_file_names)

    return np.concatenate([pl.csv2rec(filename, delimiter="\t")
                           for filename in stim_file_names])
Example No. 13
def get_data(concurrent):

    observations = csv2rec("kq1-2.csv", missing='NA')

    # Filter for historical or concurrent
    observations = observations[observations['concurrent']==concurrent]

    # Unique paper ID values
    unique_papers = set(observations['paper_id'])
    # Re-number papers
    paper_id = zeros(len(observations), int)
    for i,p in enumerate(unique_papers):
        paper_id[observations['paper_id']==p] = i
    
    # Unique grouped ID values
    unique_groups = set(observations['group_id'])
    # Re-number groups
    group_id = zeros(len(observations), int)
    for i,g in enumerate(unique_groups):
        group_id[observations['group_id']==g] = i
    
    # unique_units = set(observations['unit_id'])
    # unit_id = observations['unit_id']-1

    # Index to individual-level data
    indiv = observations['n']==1
    # Individual-level data
    obs_indiv = observations[indiv]
    # Summarized data
    obs_summ = observations[indiv-True]

    # Group IDs for individual data
    group_id_indiv = group_id[indiv]
    # Paper IDs for individual data
    paper_id_indiv = paper_id[indiv]
    # Unit IDs for individual data
    # unit_id_indiv = unit_id[indiv]
    # Unique paper ID's for individual studies
    unique_papers_indiv = set(paper_id_indiv)

    # Group IDs for summarized data
    group_id_summ = group_id[~indiv]
    # Paper IDs for summarized data
    paper_id_summ = paper_id[~indiv]
    # Unit IDs for summarized data
    # unit_id_summ = unit_id[indiv-True]
    # Unique paper IDs for group data
    unique_papers_summ = set(paper_id_summ)
    
    return locals()
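Because get_data returns locals(), callers receive a dict of everything built inside the function (the file kq1-2.csv must be in the working directory). A minimal sketch:

d = get_data(concurrent=1)  # or concurrent=0 for the historical comparisons
print '%d individual-level rows, %d summarized rows' % (len(d['obs_indiv']), len(d['obs_summ']))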
Example No. 14
def load_spikes_csv(spikefname, ch, unit):
  """
  Load the spike timestamps from a csv file. The file is expected to have three columns: channel, unit and timestamp
  (This is the default format in which data is exported from Plexon's Offline Sorter)

  Inputs:
    spikefname     - name of csv file
    ch             - channel number
    unit           - unit number
  Outputs:
    array          - pylab array of timestamps
  """
  sdata = pylab.csv2rec(spikefname, names=['channel','unit','timestamp'])
  idx = pylab.find((sdata['channel'] == ch) & (sdata['unit'] == unit))
  return sdata['timestamp'][idx]
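A call sketch, assuming a Plexon-style export named spikes.csv (hypothetical file name) with channel, unit and timestamp columns:

ts = load_spikes_csv('spikes.csv', ch=3, unit=1)
print '%d spikes on channel 3, unit 1; first at t = %.3f s' % (len(ts), ts[0])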
Example No. 15
def load_spikes_csv(spikefname, ch, unit):
    """
  Load the spike timestamps from a csv file. The file is expected to have three columns: channel, unit and timestamp
  (This is the default format in which data is exported from Plexon's Offline Sorter)

  Inputs:
    spikefname     - name of csv file
    ch             - channel number
    unit           - unit number
  Outputs:
    array          - pylab array of timestamps
  """
    sdata = pylab.csv2rec(spikefname, names=['channel', 'unit', 'timestamp'])
    idx = pylab.find((sdata['channel'] == ch) & (sdata['unit'] == unit))
    return sdata['timestamp'][idx]
Example No. 16
File: data.py Project: flaxter/gbd
def add_sampling_error(in_fname='data.csv', out_fname='noisy_data.csv', std=1.):
    """ add normally distributed noise to data.csv y column

    Parameters
    ----------
    std : float, or array of floats
      standard deviation of noise
    """
    data = pl.csv2rec(in_fname)
    if type(std) == float:
        std = std * pl.ones(len(data))
    for i, row in enumerate(data):
        data[i].y += std[i] * pl.randn(1)
        data[i].se += std[i]
    pl.rec2csv(data, out_fname)
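Two call sketches with the default file names; std can be a single float or an array with one entry per row:

# homoscedastic noise: the same standard deviation for every row
add_sampling_error('data.csv', 'noisy_data.csv', std=.5)

# heteroscedastic noise: a per-row standard deviation array of matching length
data = pl.csv2rec('data.csv')
add_sampling_error('data.csv', 'noisy_data.csv', std=.1 + .01 * pl.arange(len(data)))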
Example No. 17
def add_sampling_error(in_fname='data.csv', out_fname='noisy_data.csv', std=1.):
    """ add normally distributed noise to data.csv y column

    Parameters
    ----------
    std : float, or array of floats
      standard deviation of noise
    """
    data = pl.csv2rec(in_fname)
    if type(std) == float:
        std = std * pl.ones(len(data))
    for i, row in enumerate(data):
        data[i].y += std[i] * pl.randn(1)
        data[i].se += std[i]
    pl.rec2csv(data, out_fname)
Example No. 18
 def use_cache(self, dir):
     ''' Use cached data from disk instead of querying mysql for the latest version '''
     try:
         self.prediction_matrix = pl.csv2rec(dir + 'prediction_matrix_' + self.cause + '_' + self.sex + '.csv')
         self.observation_matrix = pl.csv2rec(dir + 'observation_matrix_' + self.cause + '_' + self.sex + '.csv')
     except IOError:
         raise IOError('No cached data found.')
     if self.just_testing == True:
         self.country_list = np.array(['USA','RUS','CAN','UKR','IND','BGD','THA','GBR'])
         obs_keeper = np.zeros(self.observation_matrix.shape[0])
         pred_keeper = np.zeros(self.prediction_matrix.shape[0])
         for i in self.country_list:
             obs_keeper[np.where(self.observation_matrix.country==i)[0]] = 1
             pred_keeper[np.where(self.prediction_matrix.country==i)[0]] = 1
         self.observation_matrix = np.delete(self.observation_matrix, np.where(obs_keeper==0)[0], axis=0)
         self.prediction_matrix = np.delete(self.prediction_matrix, np.where(pred_keeper==0)[0], axis=0)
     else:
         self.country_list = np.unique(self.prediction_matrix.country)
     self.data_rows = self.observation_matrix.shape[0]
     self.region_list = np.unique(self.prediction_matrix.region)
     self.super_region_list = np.unique(self.prediction_matrix.super_region)
     self.age_list = np.unique(self.prediction_matrix.age)
     self.year_list = np.unique(self.prediction_matrix.year)
     self.covariate_dict = {'x0': 'constant'}
     for i in range(len(self.covariate_list)):
         self.covariate_dict['x' + str(i+1)] = self.covariate_list[i]
     if self.age_dummies == True:
         pre_ref = 1
         for i,j in enumerate(self.age_list):
             if j == self.age_ref:
                 pre_ref = 0
             elif pre_ref == 1:
                 self.covariate_dict['x' + str(len(self.covariate_list)+i+1)] = 'Age ' + str(j)
             else:
                 self.covariate_dict['x' + str(len(self.covariate_list)+i)] = 'Age ' + str(j)
     self.training_split()
Example No. 19
def make_subplot(file, title_):
    data = pylab.csv2rec(file, delimiter=' ', names=('re', 'sim', 'th'))
    figW = 5.5
    figH = 4.5
    fig = plt.figure(subplotpars=mpl.figure.SubplotParams(left=0.125, bottom=0.130))
    ax = fig.add_subplot(111)

    pl_sim, = ax.loglog(data['re'], data['sim'], 'bo-')
    pl_th, = ax.loglog(data['re'], data['th'], 'ro-')
    ax.legend((pl_sim, pl_th), ('simulation results', 'theoretical approximation'), loc='best')
    ax.grid('on')
    ax.grid(which='minor')
    ax.set_xlim(data[0][0], data[-1][0])
    ax.set_ylabel('drag coefficient')
    ax.set_xlabel('Reynolds number')
    ax.set_title(title_)
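A usage sketch (the results file name is hypothetical); make_subplot builds its own figure, so the current figure can be saved right after the call, assuming the usual pyplot import:

import matplotlib.pyplot as plt

make_subplot('drag_results.txt', 'Drag coefficient: simulation vs. theory')
plt.savefig('drag_vs_re.png', dpi=150)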
Example No. 20
    def load_data(self, csv):
        '''
        load the data from csv
        ### Note: for now, only takes in data as a csv
            TODO: enable MySQL download
        '''
        
        # load the csv file
        self.data =     pl.csv2rec(csv)
        
        # keep just the specified age and sex
        self.data =     self.data[(self.data.sex == self.sex) & (self.data.age_group == self.age)]

        # remove any instances of population zero, which might blow things up due to having offsets of negative infinity
        self.data =     self.data[self.data.pop > 0.]
        
        # report how many rows were loaded
        print '%g rows of data loaded.' % len(self.data)
Example No. 21
def fit_and_plot(mod, data_fname='irq_5q0.csv', image_fname='/home/j/Project/Models/space-time-smoothing/irq_test/5q0.%s.png',
                 comment='', iter=40000):
    import model
    reload(model)
    
    data = pl.csv2rec(data_fname)

    # FIXME: this makes a big difference, but I don't understand why it would (could be prior on gp amp)
    data.x1 = (data.x1-1990.)/10.  # crude normalization of year data

    print 'generating model'
    mod_mc = eval('model.%s(data)' % mod)

    print 'fitting model with mcmc'
    mod_mc.sample(iter, iter/2, iter/2000, verbose=1)
            
    print 'summarizing results'

    import graphics
    reload(graphics)
    pl.figure()
    pl.clf()
    graphics.plot_prediction_over_time('IRQ', data, mod_mc.predicted, age=-1, cmap=pl.cm.RdYlBu, connected=False, jittered_posterior=False)
    graphics.plot_prediction_over_time('IRQ', data[:40], mod_mc.param_predicted, age=-1)

    #pl.plot(data.year, data.y, zorder=0,
    #        linestyle='', marker='x', mew=3, color='r', ms=8, alpha=.5)
    pl.title('IRQ')
    pl.xlabel('Time (Years)')
    pl.ylabel('$\log(_5q_0)$')
    pl.axis([1945, 2030, -1.8, -.5])
    pl.figtext(0, 1, '\n %s' % comment, va='top', ha='left')
    t1 = time.time()
    pl.savefig(image_fname%t1)

    try:
        for stoch in 'beta gamma sigma_f tau_f'.split(' '):
            print '%s =\n    %s\n' % (stoch, mean_w_ui(mod_mc.__getattribute__(stoch)))
    except AttributeError:
        pass
    
    return mod_mc
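A call sketch for the function above, assuming gp_re_a is one of the constructors defined in the model module (it appears elsewhere in these examples):

mod_mc = fit_and_plot('gp_re_a', comment='short test chain', iter=4000)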
Example No. 22
def draw_eels(subplot, file_basename, smooth_window):
    """Generate a plot from EELS data saved by CSI.
    Currently highlights just Co, Mn and O... generic code to be added.
    
    subplot:        Subplot object to draw in.
    file_basename:  Filename (no suffix) to read from (adds .txt)
    smooth_window:  Window size for Hanning type smoothing
    """
    data = pl.csv2rec(file_basename+".txt", delimiter="\t", names=["eV","raw","ev2","ref","ev3","bg","ev4","sign"])

    
    #eels_plot.set_title("EELS spectrum of %s" % (sys.argv[2]))
    subplot.set_ylabel("Counts / $A.U.$")
    subplot.set_xlabel("Electron energy loss / $\Delta eV$ ")
    subplot.set_ylim(data["raw"].min()*0.3,data["raw"].max()*1.15)
    subplot.plot(data["eV"],data["raw"],label="raw data",color="grey")

    smoothed = smooth(data["raw"], window_len=smooth_window, window="hanning")

    subplot.plot(data["eV"],smoothed,color="k",label="smoothed data")

    # find peaks
    peak_index = argrelextrema(smoothed,np.greater,order=30)[0].tolist()[1:]

    ranges = [(535,580), (630,670), (770,810)]  

    # annotate each peak

    for peak in peak_index:
        if any(lower <= data["eV"][peak] <= upper for (lower, upper) in ranges):
            subplot.annotate('%.1f' % (data["eV"][peak]),style='italic',size=11, xy=(data["eV"][peak], data["raw"][peak]), xycoords="data", textcoords="offset points", xytext=(0, 25), horizontalalignment='left', verticalalignment='top', arrowprops=dict(facecolor='black',arrowstyle="->",shrinkB=7,connectionstyle="arc3"))

    # mark regions
    subplot.axvspan(535,580,color="b",alpha=0.2) # O
    subplot.annotate("O\n$K$-edge", xy=(554, subplot.get_ylim()[0]), xycoords="data", va="bottom", ha="center", size=8)
    subplot.axvspan(630,670,color="r",alpha=0.2) # Mn
    subplot.annotate("Mn\n $L_{2,3}$-edge", xy=(647, subplot.get_ylim()[0]), xycoords="data", va="bottom", ha="center", size=8)
    subplot.axvspan(770,810,color="g",alpha=0.2) # Co
    subplot.annotate("Co\n$L_{2,3}$-edge", xy=(790, subplot.get_ylim()[0]), xycoords="data", va="bottom", ha="center", size=8)
    subplot.set_title("Integral EELS spectrum")
Example No. 23
def trees_to_diagnostics(brt_evaluator, fname, species_name, n_pseudopresences, n_pseudoabsences, config_filename):
    """
    Takes the BRT evaluator and sees how well it does at predicting the training dataset.
    """

    from diagnostics import simple_assessments, roc, plot_roc_

    din = csv2rec(os.path.join('anopheles-caches',fname))
    found = din.found
    din = dict([(k,din[k]) for k in brt_evaluator.nice_tree_dict.iterkeys()])
    probs = pm.flib.invlogit(brt_evaluator(din))

    print 'Species %s: fraction %f correctly classified.'%(species_name, ((probs>.5)*found+(probs<.5)*(1-found)).sum()/float(len(probs)))

    result_dirname = get_result_dir(config_filename)
    
    resdict = {}
    for f in simple_assessments:
        resdict[f.__name__] = f(probs>.5, found)

    pstack = np.array([pm.rbernoulli(probs) for i in xrange(10000)])
    fp, tp, AUC = roc(pstack, found)
    resdict['AUC'] = AUC
    
    fout=file(os.path.join(result_dirname,'simple-diagnostics.txt'),'w')
    fout.write('presences: %i\n'%(found.sum()-n_pseudopresences))
    fout.write('pseudopresences: %i\n'%n_pseudopresences)
    fout.write('pseudoabsences: %i\n'%n_pseudoabsences)
    for k in resdict.iteritems():
        fout.write('%s: %s\n'%k)
    
    import pylab as pl
    pl.clf()
    plot_roc_(fp,tp,AUC)
    pl.savefig(os.path.join(result_dirname,'roc.pdf'))
    
    r = np.rec.fromarrays([fp,tp],names='false,true')
    rec2csv(r,os.path.join(result_dirname,'roc.csv'))
Example No. 24
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# FIXME: Need to extract urban, rural from asciis, Otherwise they'll be
# FIXME: NaN at some points.

import sys
from pylab import csv2rec, rec2csv
import numpy as np
import warnings

duffy_datafile, vivax_datafile = sys.argv[1:]
combined_datafile = duffy_datafile.split(
    '.')[0] + '_+_' + vivax_datafile.split('.')[0] + '.csv'

duffy_data = csv2rec(duffy_datafile)
vivax_data = csv2rec(vivax_datafile)
n_duffy = len(duffy_data)
n_vivax = len(vivax_data)

duffy_nan = np.repeat(np.nan, n_duffy)
vivax_nan = np.repeat(np.nan, n_vivax)

tstart = vivax_data.yestart + (vivax_data.mostart - 1) / 12.
tend = vivax_data.yeend + (vivax_data.moend - 1) / 12.

weirdcols = ['lon', 'lat', 't', 'vivax_pos', 'vivax_neg', 'n', 'datatype']
vivaxcols = [
    'lo_age',
    'up_age',
    'urban',
Example No. 25
### setup Python
# import necessary libraries
import  pymc    as mc
import  numpy   as np
import  pylab   as pl
import  os

# setup directory info
project =   'USCOD'
proj_dir =  'D:/Projects/' + project +'/' if (os.environ['OS'] == 'Windows_NT') else '/shared/projects/' + project + '/'



### setup the data
# load in the csv
data =      pl.csv2rec(proj_dir + 'data/model inputs/state_random_effects_input.csv')

# keep just males aged 60-74 for now
data =      data[(data.sex == 1) & (data.age_group == '60to74')]

# remove any instances of population zero, which might blow things up due to having offsets of negative infinity
data =      data[data.pop > 0.]



### setup temporal indexing
# set year to start at 0
data =          pl.rec_append_fields(
                    rec =   data, 
                    names = 'year0', 
                    arrs =  np.array(data.year - np.min(data.year))
Example No. 26
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m+1)].dtype == 'float64':
            all_data = np.delete(all_data, np.where(np.isnan(all_data['spacetime_' + str(m+1)]))[0], axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i]) for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [np.empty(len(country_age_list), 'float') for i in range(number_submodels)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age==ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf'])==0) & (ca_data['test_' + test]==0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # skip models with no spacetime results
            if all_data['spacetime_' + str(m+1)].dtype != 'float64':
                draws[m][country_age_list==ca] = np.NaN
                continue

            # identify the dependent variable for this model
            dv = dv_list[m]

            # make a list of the spacetime predictions
            ca_prior = np.array([np.mean(ca_data['spacetime_' + str(m+1)][ca_data.year==y]) for y in year_list])

            # find the amplitude for this country/age
            amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m+1)])

            # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
            def mean_function(x) :
                return np.interp(x, year_list, ca_prior)

            # setup the covariance function
            M = gp.Mean(mean_function)
            C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale)

            # observe the data if there is any
            if has_data:
                gp.observe(M=M, C=C, obs_mesh=ca_observed.year, obs_V=ca_observed['spacetime_data_variance_' + str(m+1)], obs_vals=ca_observed[dv])

            # save the data for this country/age into the results array
            iso3[country_age_list==ca] = ca[0:3]
            age_group[country_age_list==ca] = ca[4:]
            year[country_age_list==ca] = year_list.T
            draws[m][country_age_list==ca] = M(year_list)

    # save the results
    print('Saving GPR results')
    names = ['iso3','age_group','year']
    results = np.core.records.fromarrays([iso3,age_group,year], names=names)
    for m in range(number_submodels):
        results = recfunctions.append_fields(results, 'gpr_' + str(m+1) + '_spacetime_mean', draws[m])
    rec2csv(results, outfile)
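The core of the loop above is PyMC 2's gp module: wrap the interpolated spacetime prior in a Mean, build a Matérn Covariance, condition on the observed points with gp.observe, then evaluate the posterior mean (or draw realizations) on the prediction years. A self-contained sketch with illustrative numbers, not values from any data file:

import numpy as np
import pymc.gp as gp

years = np.arange(1980., 2011.)
prior = np.linspace(-3., -2., len(years))   # spacetime prior on the prediction years
obs_years = np.array([1990., 2000., 2005.])
obs_vals = np.array([-2.8, -2.4, -2.2])     # observed values (illustrative)
obs_var = np.array([.01, .01, .02])         # data variance at each observation

M = gp.Mean(lambda x: np.interp(x, years, prior))
C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=.5, scale=10.)
gp.observe(M=M, C=C, obs_mesh=obs_years, obs_V=obs_var, obs_vals=obs_vals)

smoothed = M(years)                     # posterior mean, as stored in gpr_*_spacetime_mean
one_draw = gp.Realization(M, C)(years)  # a single realization, as used for ensemble draws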
Example No. 27
def make_model(lon,lat,t,input_data,covariate_keys,n,datatype,
                genaa,genab,genbb,gen00,gena0,genb0,gena1,genb1,gen01,gen11,
                pheab,phea,pheb,
                phe0,prom0,promab,
                aphea,aphe0,
                bpheb,bphe0,
                vivax_pos,vivax_neg,
                lo_age, up_age,
                cpus=1):
    """
    This function is required by the generic MBG code.
    """
    
    ra = csv2rec(input_data)
    
    # Step method granularity    
    grainsize = 20
    
    where_vivax = np.where(datatype=='vivax')
    from dufvax import disttol, ttol
    
    # Duffy needs to be modelled everywhere Duffy or Vivax is observed.
    # Vivax only needs to be modelled where Vivax is observed.
    # Complication: Vivax can have multiple co-located observations at different times,
    # all corresponding to the same Duffy observation.
    print 'Uniquifying.'
    duffy_data_mesh, duffy_logp_mesh, duffy_fi, duffy_ui, duffy_ti = uniquify_tol(disttol,ttol,lon,lat)
    duffy_data_mesh = np.hstack((duffy_data_mesh, np.atleast_2d(t).T))
    duffy_logp_mesh = np.hstack((duffy_logp_mesh, np.atleast_2d(t[duffy_ui]).T))
    vivax_data_mesh, vivax_logp_mesh, vivax_fi, vivax_ui, vivax_ti = uniquify_tol(disttol,ttol,lon[where_vivax],lat[where_vivax],t[where_vivax])
    
    print 'Done uniquifying.'
    
    duffy_data_locs = map(tuple,duffy_data_mesh[:,:2])
    vivax_data_locs = map(tuple,vivax_data_mesh[:,:2])
    
    full_vivax_ui = np.arange(len(lon))[where_vivax][vivax_ui]

    # Create the mean & its evaluation at the data locations.
    init_OK = False
    
    # Probability of mutation in the promoter region, given that the other thing is a.
    p1 = pm.Uniform('p1', 0, .04, value=.01)
    
    covariate_key_dict = {'v': set(covariate_keys), 'b': ['africa'], '0':[]}
    ui_dict = {'v': full_vivax_ui, 'b': duffy_ui, '0': duffy_ui}
        
    logp_mesh_dict = {'b': duffy_logp_mesh, '0': duffy_logp_mesh, 'v': vivax_logp_mesh}
    temporal_dict = {'b': False, '0': False, 'v': True}
    
    init_OK = False
    while not init_OK:
        try:
            spatial_vars = zipmap(lambda k: covariance_submodel(k, ra, logp_mesh_dict[k], covariate_key_dict[k], ui_dict[k], input_data, temporal_dict[k]), ['b','0','v'])
            tau = zipmap(lambda k: 1./spatial_vars[k]['V'], ['b','0','v'])
        
            # Loop over data clusters, adding nugget and applying link function.
            init_OK = True
        except pm.ZeroProbability, msg:
            print 'Trying again: %s'%msg
            init_OK = False
            gc.collect()        
Example No. 28
import pylab as pl
from mpl_toolkits import basemap
import numpy as np
import colors
import matplotlib
reload(colors)

data = pl.csv2rec('Fy_input_091126_+_qryPvPR_MBG_with_covariates.csv')

vivax_data = data[np.where(data.datatype == 'vivax')]
duffy_data = data[np.where(data.datatype != 'vivax')]

# b = basemap.Basemap(-19,5,52,20, resolution='i')
b = basemap.Basemap(-19, 5, 52, 40, resolution='i')

pl.close('all')
pl.figure(figsize=(12, 3))

colors.map_axis_format(pl.subplot(1, 2, 1))
b.drawcoastlines(color=colors.map_outline)
b.drawcountries(color=colors.map_outline)
b.plot(vivax_data.lon,
       vivax_data.lat,
       linestyle='None',
       marker='.',
       color=colors.vivax_point,
       markersize=4,
       alpha=.2,
       label='vivax')
colors.map_legend_format(pl.legend(loc=0))
Example No. 29
        print 'Warning: could not create data csv.  Maybe it exists already?\n%s' % e


    ## fit the model
    dir = dismod3.settings.JOB_WORKING_DIR % id
    # rm %s/sfun_in.csv # if you want to regenerate default mesh parameters
    call_str = '/tmp/dismod4.abie/build/src/dismod4_csv /tmp/dismod4.abie/test/parameter.csv %s/measure_in.csv %s/sfun_in.csv' % (dir, dir)
    subprocess.call(call_str, shell=True)

    call_str = '/tmp/dismod4.abie/build/src/dismod4_csv /tmp/dismod4.abie/test/parameter.csv %s/measure_in.csv %s/sfun_in.csv %s/sfun_out.csv %s/measure_out.csv' % (dir, dir, dir, dir)
    subprocess.call(call_str, shell=True)
    

    # generate plots of results
    print 'summarizing results'
    measure_out = pl.csv2rec('%s/measure_out.csv' % dir)

    r = 'asia_southeast'
    for year in [1990]:
        for sex in ['male']:
            for dm3_type, dm4_type in [['remission', 'remission'],
                                       ['excess-mortality', 'excess'],
                                       ['incidence', 'incidence'],
                                       ['mrr', 'risk'],
                                       ['prevalence', 'prevalence'],
                                       ]:
                x = [0]
                y = [0]
                for age in age_mesh:
                    x.append(age)
                    y.append(measure_out.model[index_dict[(dm4_type, year, age)]])
Example No. 30
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test,
            spacetime_iters, top_submodel):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m + 1)].dtype == 'float64':
            all_data = np.delete(
                all_data,
                np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
                axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([
        str(all_data.iso3[i]) + '_' + str(all_data.age_group[i])
        for i in range(len(all_data))
    ])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    total_iters = np.sum(spacetime_iters)
    draws = [
        np.empty(len(country_age_list), 'float') for i in range(total_iters)
    ]
    if (top_submodel > 0):
        top_submodel_draws = [
            np.empty(len(country_age_list), 'float') for i in range(100)
        ]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0)
                                  & (ca_data['test_' + test] == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # keep track of how many iterations have been added for this model
        iter_counter = 0

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # continue making predictions if we actually need draws for this model
            if (spacetime_iters[m] > 0) or (m + 1 == top_submodel):

                # skip models with no spacetime results
                if all_data['spacetime_' + str(m + 1)].dtype != 'float64':
                    for i in range(spacetime_iters[m]):
                        draws[iter_counter][country_age_list == ca] = np.NaN
                        iter_counter += 1
                    if (m + 1 == top_submodel):
                        for i in range(100):
                            top_submodel_draws[i][country_age_list ==
                                                  ca] = np.NaN
                    continue

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data['spacetime_' +
                                    str(m + 1)][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data['spacetime_amplitude_' +
                                            str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M,
                               C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed['spacetime_data_variance_' +
                                                 str(m + 1)],
                               obs_vals=ca_observed[dv])

                # draw realizations from the data
                realizations = [
                    gp.Realization(M, C) for i in range(spacetime_iters[m])
                ]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                age_group[country_age_list == ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(spacetime_iters[m]):
                    try:
                        draws[iter_counter][country_age_list ==
                                            ca] = realizations[i](year_list)
                    except:
                        print('Failure in ' + ca)
                    iter_counter += 1

                # if it's the top submodel, do 100 additional draws
                if (m + 1 == top_submodel):
                    realizations = [gp.Realization(M, C) for i in range(100)]
                    for i in range(100):
                        try:
                            top_submodel_draws[i][country_age_list ==
                                                  ca] = realizations[i](
                                                      year_list)
                        except:
                            print('Failure in ' + ca)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for i in range(total_iters):
        results = recfunctions.append_fields(results,
                                             'ensemble_d' + str(i + 1),
                                             draws[i])
    if (top_submodel > 0):
        for i in range(100):
            results = recfunctions.append_fields(results,
                                                 'top_submodel_d' + str(i + 1),
                                                 top_submodel_draws[i])
    rec2csv(results, outfile)
Example No. 31
                    help='The submodel_id for GPR')
args = parser.parse_args()

# region_name = args.region_name
location_id = args.location_id
ihme_loc_id = args.ihme_loc_id
version_id = args.version_id
submodel_id = args.submodel_id

version_dir = "FILEPATH"
input_dir = "FILEPATH"
output_dir = "FILEPATH"

# Get data
input_file = "FILEPATH"
data = pl.csv2rec(input_file, missing='NA')

# Create vectors of data
all_location_index = (data['ihme_loc_id'] == ihme_loc_id)
index = (data['ihme_loc_id'] == ihme_loc_id) & (data['data'] == 1)
region_name = pl.array(data['region_name'][all_location_index])[0]
print region_name
data_year = pl.array(data['year'][index])
data_mort = pl.array(data['logit_mort'][index])
data_var = pl.array(data['logit_var'][index])
data_category = pl.array(data['category'][index])

# Prior
index = (data['ihme_loc_id'] == ihme_loc_id)
prior_year = pl.array(data['year'][index])
prior_mort = gpr.logit(
Example No. 32
# zeta = .7

# transformation for the GPR step, choose from: ('log10','ln','logit','logit10')
transform = 'logit' 

## Set seed
np.random.RandomState(intSeed)
 

'''
Import data
'''

os.chdir('strPath')

data = pl.csv2rec('prediction_model_results_all_stages_%s_%i_%s_%s.txt' % (rr, ho, lam, zeta), missing='NA')

for ss in sexes:
	# training data
	index = (data['ihme_loc_id'] == cc) & (data['sex'] == ss) & (data['data'] == 1) & (data['include'] == 'TRUE')
	train_year = pl.array(data['year'][index])
	train_mort = pl.array(data['log_mort'][index])
	train_stderr = pl.array(data['log_stderr'][index])
	train_category = pl.array(data['category'][index])

	# testing data
	index = (data['ihme_loc_id'] == cc) & (data['sex'] == ss) & (data['data'] == 1) & (data['include'] == 'FALSE')
	test_year = pl.array(data['year'][index])
	test_mort = pl.array(data['log_mort'][index])
	test_stderr = pl.array(data['log_stderr'][index])
Example No. 33
    # save the csv file
    import csv
    fname = dismod3.settings.JOB_WORKING_DIR % id + '/data.csv'

    try:
        f = open(fname, 'w')
        csv.writer(f).writerow(column_names)
        csv.DictWriter(f, column_names).writerows(data_list)
        f.close()
    except IOError, e:
        print 'Warning: could not create data csv.  Maybe it exists already?\n%s' % e


    ## fit the model
    data = pl.csv2rec(fname)
    
    print 'generating model'
    from space_time_model import model
    reload(model)  # for development, automatically reload in case model.py has changed
    mod_mc = model.gp_re_a(data)

    print 'fitting model with mcmc'
    iter = 10000
    #iter = 100 # for testing
    mod_mc.sample(iter, iter/2, 1+iter/2000, verbose=1)
    

    # generate plots of results
    print 'summarizing results'
    param_predicted_stats = mod_mc.param_predicted.stats()
Example No. 34
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test, spacetime_iters, top_submodel):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m+1)].dtype == 'float64':
            all_data = np.delete(all_data, np.where(np.isnan(all_data['spacetime_' + str(m+1)]))[0], axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i]) for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    total_iters = np.sum(spacetime_iters)
    draws = [np.empty(len(country_age_list), 'float') for i in range(total_iters)]
    if (top_submodel > 0):
        top_submodel_draws = [np.empty(len(country_age_list), 'float') for i in range(100)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age==ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf'])==0) & (ca_data['test_' + test]==0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # keep track of how many iterations have been added for this model
        iter_counter = 0

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # continue making predictions if we actually need draws for this model
            if (spacetime_iters[m] > 0) or (m+1 == top_submodel):

                # skip models with no spacetime results
                if all_data['spacetime_' + str(m+1)].dtype != 'float64':
                    for i in range(spacetime_iters[m]):
                        draws[iter_counter][country_age_list==ca] = np.NaN
                        iter_counter += 1
                    if (m+1 == top_submodel):
                        for i in range(100):
                            top_submodel_draws[i][country_age_list==ca] = np.NaN
                    continue

                # make a list of the spacetime predictions
                ca_prior = np.array([np.mean(ca_data['spacetime_' + str(m+1)][ca_data.year==y]) for y in year_list])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m+1)])

                # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
                def mean_function(x) :
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M, C=C, obs_mesh=ca_observed.year, obs_V=ca_observed['spacetime_data_variance_' + str(m+1)], obs_vals=ca_observed[dv])

                # draw realizations from the data
                realizations = [gp.Realization(M, C) for i in range(spacetime_iters[m])]

                # save the data for this country/age into the results array
                iso3[country_age_list==ca] = ca[0:3]
                age_group[country_age_list==ca] = ca[4:]
                year[country_age_list==ca] = year_list.T
                for i in range(spacetime_iters[m]):
                    try:
                        draws[iter_counter][country_age_list==ca] = realizations[i](year_list)
                    except:
                        print('Failure in ' + ca)
                    iter_counter += 1

                # if it's the top submodel, do 100 additional draws
                if (m+1 == top_submodel):
                    realizations = [gp.Realization(M, C) for i in range(100)]
                    for i in range(100):
                        try:
                            top_submodel_draws[i][country_age_list==ca] = realizations[i](year_list)
                        except:
                            print('Failure in ' + ca)

    # save the results
    print('Saving GPR results')
    names = ['iso3','age_group','year']
    results = np.core.records.fromarrays([iso3,age_group,year], names=names)
    for i in range(total_iters):
        results = recfunctions.append_fields(results, 'ensemble_d' + str(i+1), draws[i])
    if (top_submodel > 0):
        for i in range(100):
            results = recfunctions.append_fields(results, 'top_submodel_d' + str(i+1), top_submodel_draws[i])
    rec2csv(results, outfile)
Example No. 35
# import necessary libraries
import pymc as mc
import numpy as np
import pylab as pl
import os
from scipy import sparse
from scipy.interpolate import splrep, splev

# setup directory info
project = "USCOD"
proj_dir = "D:/Projects/" + project + "/" if (os.environ["OS"] == "Windows_NT") else "/shared/projects/" + project + "/"


### setup the data
# load in the csv
data = pl.csv2rec(proj_dir + "data/model inputs/downsampled.csv")
print "Data loaded"

# keep just the specified age and sex
data = data[(data.sex == sex) & (data.age_group == age)]

# remove any instances of population zero, which might blow things up due to having offsets of negative infinity
data = data[data.pop > 0.0]


### setup temporal indexing
# set year to start at 0
data = pl.rec_append_fields(rec=data, names="year0", arrs=np.array(data.year - np.min(data.year)))

# set years to go from 0 to (num years - 1)
for i, y in enumerate(np.sort(np.unique(data.year0))):
Example No. 36
def make_model(
    lon,
    lat,
    t,
    input_data,
    covariate_keys,
    pos,
    neg,
    lo_age=None,
    up_age=None,
    cpus=1,
    with_stukel=with_stukel,
    chunk=chunk,
    disttol=disttol,
    ttol=ttol,
):

    ra = csv2rec(input_data)

    if np.any(pos + neg == 0):
        where_zero = np.where(pos + neg == 0)[0]
        raise ValueError, "Pos+neg = 0 in the rows (starting from zero):\n %s" % where_zero

    C_time = [0.0]
    f_time = [0.0]
    M_time = [0.0]

    # =============================
    # = Preprocess data, uniquify =
    # =============================

    data_mesh = combine_st_inputs(lon, lat, t)
    if lo_age is None:
        lo_age = 2.0 * np.ones(data_mesh.shape[0])
    if up_age is None:
        up_age = 10.0 * np.ones(data_mesh.shape[0])

    # Find near spatiotemporal duplicates.
    ui = []
    fi = []
    ti = []
    dx = np.empty(1)
    for i in xrange(data_mesh.shape[0]):
        match = False
        for j in xrange(len(ui)):
            pm.gp.geo_rad(dx, data_mesh[i, :2].reshape((1, 2)), data_mesh[ui[j], :2].reshape((1, 2)))
            dt = abs(t[ui[j]] - t[i])

            if dx[0] < disttol and dt < ttol:
                match = True
                fi.append(j)
                ti[j].append(i)
                break

        if not match:
            fi.append(len(ui))
            ui.append(i)
            ti.append([i])
    ui = np.array(ui)
    ti = [np.array(tii) for tii in ti]
    fi = np.array(fi)
    logp_mesh = data_mesh[ui, :]

    # covariate_values_on_logp = dict([(k,covariate_values[k][ui]) for k in covariate_values.keys()])

    # =====================
    # = Create PyMC model =
    # =====================

    init_OK = False
    while not init_OK:

        @pm.deterministic()
        def M():
            return pm.gp.Mean(pm.gp.zero_fn)

        # Inverse-gamma prior on nugget variance V.
        tau = pm.Gamma("tau", alpha=3, beta=3 / 0.25, value=5)
        V = pm.Lambda("V", lambda tau=tau: 1.0 / tau)
        # V = pm.Exponential('V', .1, value=1.)

        vars_to_writeout = ["V", "m_const", "t_coef"]

        # Lock down parameters of Stukel's link function to obtain standard logit.
        # These can be freed by removing 'observed' flags, but mixing gets much worse.
        if with_stukel:
            a1 = pm.Uninformative("a1", 0.5)
            a2 = pm.Uninformative("a2", 0.8)
        else:
            a1 = pm.Uninformative("a1", 0, observed=True)
            a2 = pm.Uninformative("a2", 0, observed=True)

        inc = pm.CircVonMises("inc", 0, 0)

        # Use a uniform prior on sqrt(ecc). Using a uniform prior on ecc itself put too little
        # probability mass on appreciable levels of anisotropy.
        sqrt_ecc = pm.Uniform("sqrt_ecc", value=0.4, lower=0.0, upper=1.0)
        ecc = pm.Lambda("ecc", lambda s=sqrt_ecc: s ** 2)

        # Subjective skew-normal prior on amp (the partial sill, tau) in log-space.
        # Parameters are passed in in manual_MCMC_supervisor.
        log_amp = pm.SkewNormal("log_amp", value=amp_params["mu"], **amp_params)
        amp = pm.Lambda("amp", lambda log_amp=log_amp: np.exp(log_amp))

        # Subjective skew-normal prior on scale (the range, phi_x) in log-space.
        log_scale = pm.SkewNormal("log_scale", value=-1, **scale_params)
        scale = pm.Lambda("scale", lambda log_scale=log_scale: np.exp(log_scale))

        # Exponential prior on the temporal scale/range, phi_t. Standard one-over-x
        # doesn't work bc data aren't strong enough to prevent collapse to zero.
        scale_t = pm.Exponential("scale_t", 0.1, value=1.5)

        # Uniform prior on limiting correlation far in the future or past.
        t_lim_corr = pm.Uniform("t_lim_corr", 0, 1, value=0.5)

        # Uniform prior on sinusoidal fraction in temporal variogram.
        sin_frac = pm.Uniform("sin_frac", 0, 1, value=0.3)

        vars_to_writeout.extend(["inc", "ecc", "amp", "scale", "scale_t", "t_lim_corr", "sin_frac"])

        # Create covariance and MV-normal F if model is spatial.
        try:
            # A constraint on the space-time covariance parameters that ensures temporal correlations are
            # always between -1 and 1.
            @pm.potential
            def st_constraint(sd=0.5, sf=sin_frac, tlc=t_lim_corr):
                if -sd >= 1.0 / (-sf * (1 - tlc) + tlc):
                    return -np.Inf
                else:
                    return 0.0

            # A Deterministic valued as a Covariance object. Uses covariance my_st, defined above.
            @pm.deterministic
            def C(
                amp=amp, scale=scale, inc=inc, ecc=ecc, scale_t=scale_t, t_lim_corr=t_lim_corr, sin_frac=sin_frac, ra=ra
            ):
                eval_fun = CovarianceWithCovariates(my_st, input_data, covariate_keys, ui, fac=1.0e4, ra=ra)
                return pm.gp.FullRankCovariance(
                    eval_fun, amp=amp, scale=scale, inc=inc, ecc=ecc, st=scale_t, sd=0.5, tlc=t_lim_corr, sf=sin_frac
                )

            sp_sub = pm.gp.GPSubmodel("sp_sub", M, C, logp_mesh)

            init_OK = True
        except pm.ZeroProbability, msg:
            print "Trying again: %s" % msg
            init_OK = False
            gc.collect()
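# A hedged, self-contained sketch of the near-duplicate grouping performed in
# make_model above, using plain Euclidean distance in place of pm.gp.geo_rad;
# the points and tolerances below are illustrative only.
import numpy as np

pts = np.array([[0.0,   0.0, 2000.0],
                [0.001, 0.0, 2000.1],
                [5.0,   5.0, 2001.0]])      # lon, lat, t
disttol, ttol = 0.01, 0.5
ui, fi = [], []                             # unique row indices; map from full rows to unique rows
for i in range(len(pts)):
    for j in range(len(ui)):
        dx = np.hypot(*(pts[i, :2] - pts[ui[j], :2]))
        dt = abs(pts[i, 2] - pts[ui[j], 2])
        if dx < disttol and dt < ttol:
            fi.append(j)
            break
    else:
        fi.append(len(ui))
        ui.append(i)
print(ui, fi)                               # unique rows: [0, 2]; map: [0, 0, 1]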
Ejemplo n.º 37
0
#cc = 'IND_43872'
#ss = 'male'

# transformation for the GPR step, choose from: ('log10','ln','logit','logit10')
transform = 'logit'
'''
Import data
'''
hitercount = 0
for iter in iters:
    for ss in sexes:
        if (huncert == int(1)):

            os.chdir('strPath')
            data = pl.csv2rec('prediction_model_results_all_stages%s.txt' %
                              (iter),
                              missing='NA')
        else:
            os.chdir('%s/strPath' %
                     ('/home/j' if os.name == 'posix' else 'J:'))
            data = pl.csv2rec('prediction_model_results_all_stages.txt',
                              missing='NA')

        # data
        index = (data['ihme_loc_id'] == cc) & (data['sex']
                                               == ss) & (data['data'] == 1)
        data_year = pl.array(data['year'][index])
        data_mort = pl.array(data['log_mort'][index])
        data_stderr = pl.array(data['log_stderr'][index])
        data_category = pl.array(data['category'][index])
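# A hedged sketch of the transforms named in the comment at the top of this
# example; the exact definitions used downstream may differ slightly.
import numpy as np

def transform_data(x, transform):
    if transform == 'log10':
        return np.log10(x)
    elif transform == 'ln':
        return np.log(x)
    elif transform == 'logit':
        return np.log(x / (1.0 - x))
    elif transform == 'logit10':
        return np.log10(x / (1.0 - x))
    raise ValueError('unknown transform: %s' % transform)

print(transform_data(np.array([0.1, 0.5]), 'logit'))  # [-2.19722458  0.        ]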
Ejemplo n.º 38
0
import numpy as np
from csv import reader
import pylab as pl

# =================
# = Emelda's data =
# =================
R = pl.csv2rec('datafiles/all_data.csv')
# R = pl.csv2rec('all_data.csv')
missing_fields = np.zeros(len(R))
for n in ['surv_int', 'pyor', 'cases', 'region', 'lat', 'lon']:
    missing_fields += R[n].mask
missing_fields = np.array(missing_fields, dtype=bool)
R_mis = R[np.where(missing_fields)]

R = R[np.where(1 - missing_fields)].data

R.pr /= 100.
R.mbg_pr /= 100.
R.mix_pr /= 100.
R.pfpr /= 100.

R_af = R[np.where(R.region == 'Africa+')]
R_am = R[np.where(R.region == 'America')]
R_as = R[np.where(R.region == 'CSE Asia')]
R_am_as = R[np.where((R.region == 'America') + (R.region == 'CSE Asia'))]
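# A hedged sketch of the masked-field split used above: rows missing any of the
# listed fields are separated out; field names and values here are toy data.
import numpy as np
import numpy.ma as ma

raw = np.array([(1.0, 2.0), (3.0, 4.0)], dtype=[('cases', float), ('pyor', float)])
msk = np.array([(False, False), (False, True)], dtype=[('cases', bool), ('pyor', bool)])
toy = ma.array(raw, mask=msk)
missing = np.zeros(len(toy), dtype=bool)
for n in ['cases', 'pyor']:
    missing |= ma.getmaskarray(toy[n])
print(toy[~missing])                        # only the fully observed first row survives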


def time_scaling(pcd, surv_int):
    out = np.ones(len(pcd))
    where_rescale = np.where((pcd != 'Y') * (surv_int > 7) + (surv_int < 7))
Ejemplo n.º 39
0
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, iters):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        all_data = np.delete(
            all_data,
            np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
            axis=0)

    # Investigate error thrown for HKG, MAC, and SGP... they don't have data, but don't know why this is breaking line 62
    all_data = all_data[all_data['iso3'] != "HKG"]
    all_data = all_data[all_data['iso3'] != "MAC"]
    all_data = all_data[all_data['iso3'] != "SGP"]

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([all_data.iso3[i] for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [
        np.empty(len(country_age_list), 'float')
        for i in range(iters * number_submodels * 2)
    ]
    iso3 = np.empty(len(country_age_list), '|S3')
    # age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):

        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_prev'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_prev']) == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # loop through spacetime/linear
            for x, t in enumerate(['spacetime']):

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data[t + '_' + str(m + 1)][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data[t + '_amplitude_' + str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M,
                               C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed[t + '_data_variance_' +
                                                 str(m + 1)],
                               obs_vals=ca_observed['lt_prev'])

                # draw realizations from the data
                realizations = [gp.Realization(M, C) for i in range(iters)]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                # age_group[country_age_list==ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(iters):
                    draws[((2 * m + x) * iters) + i][
                        country_age_list == ca] = realizations[i](year_list)

    # save the results
    print('Saving GPR results')
    # age_group is commented out above, so only iso3 and year are saved here
    names = ['iso3', 'year']
    results = np.core.records.fromarrays([iso3, year], names=names)
    for m in range(number_submodels):
        for x, t in enumerate(['spacetime']):
            for i in range(iters):
                results = recfunctions.append_fields(
                    results, 'gpr_' + str(m + 1) + '_' + t + '_d' + str(i + 1),
                    draws[((2 * m + x) * iters) + i])
            results = recfunctions.append_fields(
                results, 'gpr_' + str(m + 1) + '_' + t + '_mean',
                np.mean(draws[((2 * m + x) * iters):((2 * m + x + 1) * iters)],
                        axis=0))
    # write the file once, after all submodels have been appended
    rec2csv(results, outfile)
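# A small hedged illustration (toy numbers) of the linear-interpolation mean
# function used in fit_GPR above: np.interp evaluates the spacetime prior at
# arbitrary prediction years.
import numpy as np

year_list = np.array([1990.0, 1995.0, 2000.0])
ca_prior = np.array([0.10, 0.15, 0.12])
print(np.interp(np.array([1992.5, 1997.0]), year_list, ca_prior))  # [0.125 0.138]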
Ejemplo n.º 40
0
import tables as tb
import numpy as np
import map_utils
from pylab import csv2rec, rec2csv
import os
import sys
from dufvax import covariate_names

# TODO: draw these straight from /Volumes/data

data_in = csv2rec(sys.argv[1])
covariate_path = sys.argv[2]
data_box = data_in

cols = dict([(key,data_box[key]) for key in data_box.dtype.names])
for k in ['urban','rural','africa']:
    cols.pop(k)

def mode(a):
    # return the most frequent value in a
    vals = list(set(a))
    counts = [(a==v).sum() for v in vals]
    return vals[np.argmax(counts)]

def nan_callback(lon_old, lat_old, data, lon_new, lat_new, order):
    lon_ind = np.argmin(np.abs(np.subtract.outer(lon_old, lon_new)), axis=0)
    lat_ind = np.argmin(np.abs(np.subtract.outer(lat_old, lat_new)), axis=0)
    out = lat_new*0
    for i in xrange(len(lon_new)):
        lai, loi = lat_ind[i], lon_ind[i]
        if data.mask[lai, loi]:
            for d in xrange(10):
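# nan_callback above is truncated; a hedged sketch of its nearest-grid-cell
# lookup, where np.subtract.outer gives all pairwise offsets and argmin picks
# the closest old coordinate for each new one (toy coordinates below).
import numpy as np

lon_old = np.array([0.0, 1.0, 2.0])
lon_new = np.array([0.2, 1.7])
lon_ind = np.argmin(np.abs(np.subtract.outer(lon_old, lon_new)), axis=0)
print(lon_ind)                              # [0 2]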
Ejemplo n.º 41
0
    iters = range((hsim - 1) * 25 + 1, (hsim - 1) * 25 + 1 + 25)
else:
    dr = int(1000)
    iters = range(1, 2)

# transformation for the GPR step, choose from: ('log10','ln','logit','logit10')
transform = 'logit'
'''
Import data
'''
hitercount = 0
for iter in iters:
    for ss in sexes:
        if (huncert == int(1)):
            os.chdir('filepath')
            data = pl.csv2rec('filepath' % (iter), missing='NA')

        else:
            os.chdir('filepath' %
                     ('filepath' if os.name == 'posix' else 'filepath'))
            data = pl.csv2rec('filepath', missing='NA')

        # data
        index = (data['ihme_loc_id'] == cc) & (data['sex']
                                               == ss) & (data['data'] == 1)
        data_year = pl.array(data['year'][index])
        data_mort = pl.array(data['log_mort'][index])
        data_stderr = pl.array(data['log_stderr'][index])
        data_category = pl.array(data['category'][index])

        # prior
Ejemplo n.º 42
0

os.chdir('FILEPATH')
data = pl.csv2rec('FILEPATH')

# data
index = (data['ihme_loc_id'] == cc)

data_year = pl.array(data['year'][index])

data_fert = pl.array(data['logit_bound_tfr'][index])
data_var = pl.array(data['logit_bound_var'][index])
data_category = pl.array(data['category'][index])

# prior
data = pl.csv2rec("FILEPATH")


index = (data['ihme_loc_id'] == cc)
prior_year = pl.array(data['year'][index])
prior_fert = pl.array(data['logit_bound_tfr_pred_smooth'][index])


# prediction years 
predictionyears = pl.array(range(int(min(data['year'])),int(max(data['year']))+1)) + 0.5
mse = pl.array(data['mse'][index])
print(mse)
mse = float(mse[0]) 

'''
Fit model with best parameters
Ejemplo n.º 43
0
    def import_as_recarray(self, csv):
        return pylab.csv2rec(csv)
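# Hedged note: csv2rec is gone from recent matplotlib releases, so a roughly
# equivalent helper could read the file into a structured array with numpy
# instead (a sketch, not the original implementation):
import numpy as np

def import_as_structured_array(path):
    # names=True takes field names from the header row; dtype=None infers types
    return np.genfromtxt(path, delimiter=',', names=True, dtype=None, encoding='utf-8')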
Ejemplo n.º 44
0
    dr = int(1000)
    iters = range(1, 2)

# transformation for the GPR step, choose from: ('log10','ln','logit','logit10')
transform = 'logit'
'''
Import data
'''
hitercount = 0
for iter in iters:
    for ss in sexes:
        if (hiv_uncert == int(1)):
            input_file = "FILEPATH"
        else:
            input_file = "FILEPATH"
        data = pl.csv2rec(input_file, missing='NA')

        # data
        index = (data['ihme_loc_id'] == cc) & (data['sex']
                                               == ss) & (data['data'] == 1)
        data_year = pl.array(data['year'][index])
        data_mort = pl.array(data['log_mort'][index])
        data_stderr = pl.array(data['log_stderr'][index])
        data_category = pl.array(data['category'][index])

        # prior
        index = (data['ihme_loc_id'] == cc) & (data['sex'] == ss)
        prior_year = pl.array(data['year'][index])
        if (transform == 'log10'):
            prior_mort = pl.log(pl.array(data['pred2final'][index])) / pl.log(
                10)  # this is to convert the prior to log base-10 space
Ejemplo n.º 45
0
#!/usr/bin/python
"""
Read csv station data and create an xml file from it
"""
from pylab import csv2rec
import xml.etree.ElementTree as ET

lines = ["Bakerloo", "Central", "Circle", "District", "HamAndCity", "Jubilee",
         "Metropolitan", "Northern", "Piccadilly", "Victoria",
         "WaterlooAndCity"]

for line in lines:
    stationdata = csv2rec(line + ".csv", delimiter=',', converterd={5:str})
    tree = ET.parse('Tube.xml')
    root = tree.getroot()
    for i in range(0, stationdata.size):
        newstation = ET.Element('station', {'name': stationdata[i][0]})
        root.append(newstation)
    tree.write(line + ".xml")
Ejemplo n.º 46
0
import pylab as pl

orig = pl.csv2rec('nature08230-s2.csv')

country = []
year = []
hdi = []
tfr = []
for row in orig:
    for y in range(1975, 2006):
        if pl.isnan(row['hdi%d'%y]) \
                or pl.isnan(row['tfr%d'%y]):
            continue
        country.append(row['country'])
        year.append(y)
        hdi.append(row['hdi%d' % y])
        tfr.append(row['tfr%d' % y])

all = pl.np.core.rec.fromarrays([country, year, hdi, tfr],
                                names='country year hdi tfr'.split())

hdi = all.hdi[(all.year == 1975) | (all.year == 2005)]
tfr = all.tfr[(all.year == 1975) | (all.year == 2005)]
hdi2005 = all.hdi[all.year == 2005]
tfr2005 = all.tfr[all.year == 2005]
Ejemplo n.º 47
0
os.chdir('FILEPATH')
'''
Get GPR settings
'''

rr = sys.argv[1]
cc = sys.argv[2]
rnum = sys.argv[3]
hivsims = int(sys.argv[4])
'''
Import data
'''

if hivsims == 1:
    os.chdir('FILEPATH')
    data = pl.csv2rec('gpr_5q0_input_' + rnum + '.txt', missing='NA')
else:
    os.chdir('FILEPATH')
    data = pl.csv2rec('gpr_5q0_input_GBD2015.txt', missing='NA')

# data
index = (data['ihme_loc_id'] == cc) & (data['data'] == 1)
data_year = pl.array(data['year'][index])
data_mort = pl.array(data['logit_mort'][index])
data_var = pl.array(data['logit_var'][index])
data_category = pl.array(data['category'][index])

# prior
index = (data['ihme_loc_id'] == cc)
prior_year = pl.array(data['year'][index])
prior_mort = gpr.logit(
Ejemplo n.º 48
0
if __name__ == '__main__':
    import pylab as pl
    import data

    data.age_range = pl.arange(0, 81, 20)
    data.time_range = pl.arange(1980, 2005, 5)
    data.regions = pl.randint(5, 15)

    time.sleep(pl.rand() * 5.)
    t0 = time.time()
    data.generate_fe('test_data/%s.csv' %
                     t0)  # included just to get good test coverage
    data.generate_smooth_gp_re_a('test_data/%s.csv' % t0,
                                 country_variation=True)

    std = 5. * pl.rand(len(pl.csv2rec('test_data/%s.csv' % t0)))
    pct = 90.

    print data.age_range, data.time_range, data.regions, pl.mean(std), pct

    data.add_sampling_error('test_data/%s.csv' % t0,
                            'test_data/noisy_%s.csv' % t0,
                            std=std)
    data.knockout_uniformly_at_random('test_data/noisy_%s.csv' % t0,
                                      'test_data/missing_noisy_%s.csv' % t0,
                                      pct=pct)

    mod_mc = evaluate_model(
        'gp_re_a',
        'knockout pct=%d, model matches data, has laplace priors, sigma_e = Exp(1)'
        % pct, 'test_data/missing_noisy_%s.csv' % t0, 'test_data/%s.csv' % t0)
Ejemplo n.º 49
0

'''
Get GPR settings
'''

rr = sys.argv[1]
cc = sys.argv[2]
rnum = int(1)
hivsims = int(0)

'''
Import data
'''
os.chdir('PATH')
data = pl.csv2rec('gpr_input_file.csv', missing='NA')

# data
index = (data['ihme_loc_id'] == cc) & (data['data'] == 1)
data_year = pl.array(data['year_id'][index])
data_mort = pl.array(data['logit_q5_sexratio'][index]) 
data_var = pl.array(data['data_var'][index])
data_category = pl.array(data['category'][index])

# prior
index = (data['ihme_loc_id'] == cc)
prior_year = pl.array(data['year_id'][index])
prior_mort = data['pred_logitratio_s2'][index]

# prediction years 
predictionyears = pl.array(range(int(min(data['year_id'])),int(max(data['year_id']))+1)) + 0.5
output_dir = "FILEPATH"

try:
    os.makedirs(output_dir)
except OSError:
    # output_dir may already exist
    pass
'''
Get GPR settings
'''
rnum = int(1)
hivsims = int(0)
'''
Import data
'''
input_file = "FILEPATH"
data = pl.csv2rec(input_file, missing='NA')

# data
index = (data['ihme_loc_id'] == cc) & (data['data'] == 1)
data_year = pl.array(data['year_id'][index])
data_mort = pl.array(data['logit_q5_sexratio'][index])
data_var = pl.array(data['data_var'][index])
data_category = pl.array(data['category'][index])

# prior
index = (data['ihme_loc_id'] == cc)
prior_year = pl.array(data['year_id'][index])
prior_mort = data['pred_logitratio_s2'][index]

# prediction years
predictionyears = pl.array(
Ejemplo n.º 51
0
def GFFthreshold(infn, outbed):
    """
    Thresholds the values in the GFF file *infn* and exports
    the results to the BED file *outbed*.
    """
    converterd = {'probe': nodate, 'a': nodate, 'b': nodate}
    logging.debug('reading GFF into record array')
    a = csv2rec(infn,
                delimiter='\t',
                names=('chr', 'prog', 'id', 'start', 'stop', 'ratio', 'a', 'b',
                       'probe'),
                converterd=converterd)
    logging.debug('sorting record array')
    a.sort(order=('chr', 'start'))
    fout = open(outbed, 'w')
    m = a.ratio.mean()
    std = a.ratio.std()
    thresh = m + 2.5 * std
    allregions = []
    region = []
    lastchr = a.chr[0]
    lastpos = None
    count = 0

    for data in a:
        if data.ratio < thresh:
            continue

        if lastpos is None:
            dist = 0
        else:
            dist = data.start - lastpos

        logging.debug('region is currently')
        for i in region:
            logging.debug('\t%s' % i)
        logging.debug('this data: %s' % data)
        logging.debug('dist from last: %s' % dist)

        if dist > 500 or data.chr != lastchr:

            logging.debug('\ndist > 500; checking region len')
            logging.debug('regionlen: %s' % len(region))
            for i in region:
                logging.debug('\t%s' % i)
            if len(region) < 4:
                logging.debug('region not long enough, erasing')
            else:
                logging.debug('region is long enough!!!!')
                logging.debug('region to be exported is')
                for i in region:
                    logging.debug('\t%s' % i)
                chr = region[0].chr
                start = region[0].start
                stop = region[-1].stop
                fout.write('%s\t%s\t%s\n' % (chr, start, stop))
                count += 1
            region = []

        lastpos = data.stop
        lastchr = data.chr
        logging.debug('adding %s to region' % data)
        region.append(data)

    if len(region) >= 4:
        logging.debug('last region will be exported')
        logging.debug('region to be exported is')
        for i in region:
            logging.debug('\t%s' % i)

        chr = region[0].chr
        start = region[0].start
        stop = region[-1].stop
        fout.write('%s\t%s\t%s\n' % (chr, start, stop))
        count += 1

    else:
        logging.debug('last region not long enough')

    fout.close()
    logging.debug('Number of enriched regions: %s' % count)
    logging.debug('using threshold: %s' % thresh)
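# Hypothetical usage of GFFthreshold; the filenames are placeholders, and the
# nodate converter referenced above is assumed to be defined elsewhere.
import logging
logging.basicConfig(level=logging.DEBUG)
GFFthreshold('ratios.gff', 'enriched_regions.bed')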
Ejemplo n.º 52
0
import  pymc    as mc
import  numpy   as np
import  pylab   as pl
import  os
from    scipy               import sparse
from    scipy.interpolate   import splrep, splev

# setup directory info
project =   'USCOD'
proj_dir =  'D:/Projects/' + project +'/' if (os.environ['OS'] == 'Windows_NT') else '/shared/projects/' + project + '/'



### setup the data
# load in the csv
data =      pl.csv2rec(proj_dir + 'data/model inputs/state_random_effects_input.csv')
print 'Data loaded'

# keep just the specified age and sex
data =      data[(data.sex == sex) & (data.age_group == age)]

# remove any instances of population zero, which might blow things up due to having offsets of negative infinity
data =      data[data.pop > 0.]



### setup temporal indexing
# set year to start at 0
data =          pl.rec_append_fields(
                    rec =   data, 
                    names = 'year0', 
Ejemplo n.º 53
0
def evaluate_model(mod,
                   comment='',
                   data_fname='missing_noisy_data.csv',
                   truth_fname='data.csv'):
    """ Run specified model on existing data (data.csv / missing_noisy_data.csv) and save results in dev_log.csv
    Existing models: %s """ % data_run_models
    if mod not in data_run_models.split(' '):
        raise TypeError, 'Unrecognized model "%s"; must be one of %s' % (
            mod, data_run_models)

    import model
    reload(model)

    print 'loading data'
    data = pl.csv2rec(data_fname)
    truth = pl.csv2rec(truth_fname)

    t0 = time.time()
    print 'generating model'
    mod_mc = eval('model.%s(data)' % mod)

    print 'fitting model with mcmc'
    mod_mc.sample(10000, 5000, 50, verbose=1)
    t1 = time.time()

    print 'summarizing results'

    import graphics
    reload(graphics)
    pl.figure(figsize=(22, 17), dpi=300)
    pl.clf()
    graphics.plot_all_predictions_over_time(data,
                                            mod_mc.predicted,
                                            more_data=truth)

    data_stats = mod_mc.data_predicted.stats()
    i_out = [i for i in range(len(data)) if pl.isnan(data.y[i])]
    rmse_abs_out = pl.rms_flat(truth.y[i_out] - data_stats['mean'][i_out])
    rmse_rel_out = 100 * pl.rms_flat(1. - data_stats['mean'][i_out] /
                                     truth.y[i_out])

    i_in = [i for i in range(len(data)) if not pl.isnan(data.y[i])]
    rmse_abs_in = pl.rms_flat(truth.y[i_in] - data_stats['mean'][i_in])
    rmse_rel_in = 100 * pl.rms_flat(1. -
                                    data_stats['mean'][i_in] / truth.y[i_in])

    param_stats = mod_mc.param_predicted.stats()
    coverage = 100 * pl.sum(
        (truth.y[i_out] >= param_stats['95% HPD interval'][i_out, 0]) &
        (truth.y[i_out] <= param_stats['95% HPD interval'][i_out, 1])) / float(
            len(i_out))

    import md5
    data_hash = md5.md5(data).hexdigest()
    results = [
        mod, t1 - t0, rmse_abs_out, rmse_rel_out, rmse_abs_in, rmse_rel_in,
        coverage,
        len(data),
        len(pl.unique(data.region)),
        len(pl.unique(data.country)),
        len(pl.unique(data.year)),
        len(pl.unique(data.age)), data_hash, t0, comment
    ]
    print '%s: time: %.0fs out-of-samp rmse abs=%.1f rel=%.0f in-samp rmse abs=%.1f rel=%.0f coverage=%.0f\ndata: %d rows; %d regions, %d countries %d years %d ages [data hash: %s]\n(run conducted at %f)\n%s' % tuple(
        results)

    pl.savefig('/home/j/Project/Models/space-time-smoothing/images/%s.png' %
               t0)  # FIXME: don't hardcode path for saving images

    import csv
    f = open('dev_log.csv', 'a')
    f_csv = csv.writer(f)
    f_csv.writerow(results)
    f.close()

    return mod_mc
Ejemplo n.º 54
0
        return (x1, delta)
    
    def calc_spines_pos(self, cursor_list, x1_list):
        """Calculate the spines position, returning the mid point 
        of the interval from the two list."""
        
        mid_points = []
        for i, el in enumerate(cursor_list):
            mid_point = cursor_list[i] + (x1_list[i] - cursor_list[i])/2
            mid_points.append(mid_point)
        return mid_points


if __name__ == "__main__":
    from scipy.optimize import leastsq
    data = pylab.csv2rec('spines_distribution_Wilson_1992.csv')
    pfh = FitHandler()
    pfh.plot_data(data)
    
    order = 17
    pfit = pfh.fit_and_plot(data, order)
    plt.title("Fitting the data")
    plt.legend()
    plt.savefig("Fitted_data.png")
    
    
    # Integrating
    pInteg = pfit.integ()
    
    plt.figure()
    pfh.plot_poly(pInteg)
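# A hedged sketch of the fit-then-integrate step driven above, using numpy's
# polynomial API directly; FitHandler presumably wraps something similar, and
# the toy profile below is not the Wilson (1992) data.
import numpy as np

x = np.linspace(0.0, 1.0, 50)
y = np.exp(-x) + 0.01 * np.random.randn(50)      # toy density profile
pfit = np.polynomial.Polynomial.fit(x, y, deg=17)
pinteg = pfit.integ()                            # antiderivative, cf. pfit.integ() above
print(pinteg(1.0) - pinteg(0.0))                 # roughly 0.63 for this profile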
Ejemplo n.º 55
0
def _Pressure(field, data):
    return (data.pf['Gamma'] - 1.0) * data['Density'] * data['InternalEnergy']

add_field('Pressure', function=_Pressure, units=r'\rm{dyne}/\rm{cm}^{2}')



### extract an ortho_ray (1D solution vector)
ray = pf.h.ortho_ray(0, [0.5, 0.5])


### define fields vector
fields = ('Density', 'x-velocity', 'InternalEnergy', 'Pressure' )

### read exact solution
exact = pylab.csv2rec( exact_solution_filename, delimiter=' ', names=('x', 'Density', 'x-velocity', 'Pressure', 'InternalEnergy') )


### calculate difference norm

# first interpolate the exact solution onto the ray
ray_exact = {'x': ray['x'], 
             'Density': pylab.stineman_interp(ray['x'],exact['x'],exact['Density']),
             'x-velocity': pylab.stineman_interp(ray['x'],exact['x'],exact['x-velocity']),
             'Pressure': pylab.stineman_interp(ray['x'],exact['x'],exact['Pressure']),
             'InternalEnergy': pylab.stineman_interp(ray['x'],exact['x'],exact['InternalEnergy'])}


# now calculate the norm (first order, since we're dealing with
# conservation laws)
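# The listing stops before the norm is computed; a hedged, self-contained
# sketch of a first-order (L1) difference norm on toy arrays:
import numpy as np

exact_density = np.array([1.0, 0.8, 0.5, 0.125])
ray_density = np.array([1.0, 0.79, 0.52, 0.13])
l1_norm = np.mean(np.abs(ray_density - exact_density))
print(l1_norm)                                   # 0.00875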
Ejemplo n.º 56
0
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# FIXME: Need to extract urban, rural from asciis, Otherwise they'll be
# FIXME: NaN at some points.

import sys
from pylab import csv2rec, rec2csv
import numpy as np
import warnings

duffy_datafile, vivax_datafile = sys.argv[1:]
combined_datafile = duffy_datafile.split(".")[0] + "_+_" + vivax_datafile.split(".")[0] + ".csv"

duffy_data = csv2rec(duffy_datafile)
vivax_data = csv2rec(vivax_datafile)
n_duffy = len(duffy_data)
n_vivax = len(vivax_data)

duffy_nan = np.repeat(np.nan, n_duffy)
vivax_nan = np.repeat(np.nan, n_vivax)

tstart = vivax_data.yestart + (vivax_data.mostart - 1) / 12.0
tend = vivax_data.yeend + (vivax_data.moend - 1) / 12.0

weirdcols = ["lon", "lat", "t", "vivax_pos", "vivax_neg", "n", "datatype"]
vivaxcols = ["lo_age", "up_age", "urban", "rural"]
duffycols = [
    "genaa",
    "genab",