Beispiel #1
0
def new_gpr_draws(df2, response, amplitude, prior, scale, has_data, draws):
    '''
    (data frame, str, float, array-like, number, bool, int) -> array

    Run Gaussian process smoothing for one location-age data frame (df2) so
    that years without data still receive estimates.

    df2       -- rows for a single location-age; must contain a "year" column
                 and, when has_data is true, the columns <response>,
                 <response>_sd and <response>_nsv.
    response  -- name of the column holding the observed values.
    amplitude -- amplitude passed to the Matern covariance.
    prior     -- prior value for every row of df2 (same length as df2); it is
                 linearly interpolated over years to form the GP mean.
    scale     -- scale passed to the Matern covariance.
    has_data  -- when true, the GP is conditioned on the rows flagged by the
                 fourth-from-last column of df2.
    draws     -- number of realizations to draw.

    Returns an array with one row per row of df2 and one column per draw.
    '''
    years = df2["year"].values

    # One (year, prior) pair per distinct combination, ordered by year because
    # np.interp requires increasing x-coordinates.
    prior_by_year = (pd.DataFrame({"year": years, "prior": prior})
                     .sort_values("year")
                     .drop_duplicates())
    prior_years = prior_by_year["year"].values
    prior_values = prior_by_year["prior"].values

    def mean_function(x):
        return np.interp(x, prior_years, prior_values)

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                      amp=amplitude, scale=scale)

    if has_data:
        # NOTE(review): the fourth-from-last column is used as a boolean row
        # mask -- presumably a "has observed data" flag; confirm with caller.
        # .iloc replaces the removed DataFrame.ix accessor.
        observed = df2[df2.iloc[:, -4]]
        # Observation variance is the squared standard deviation plus the
        # <response>_nsv column.
        gp.observe(M=M, C=C, obs_mesh=observed["year"].values,
                   obs_vals=observed[response].values,
                   obs_V=(observed[response + "_sd"].values)**2 +
                   observed[response + "_nsv"].values)

    # Each realization is evaluated at every row's year; transpose so that
    # rows correspond to df2 rows and columns to draws.
    ca_draws = np.array([gp.Realization(M, C)(years)
                         for _ in range(draws)]).T
    return ca_draws
Beispiel #2
0
def ca_realizations(gpr_dict, years, draws):
    '''
    (dict, array, int) -> matrix

    Sample the Gaussian process stored in gpr_dict (mean under "M",
    covariance under "C") `draws` times at the given years for one country
    age group. The result has the draws along its last axis.
    '''
    samples = []
    for _ in range(draws):
        realization = gp.Realization(gpr_dict["M"], gpr_dict["C"])
        samples.append(realization(years))
    return np.array(samples).T
        # NOTE(review): this is a fragment; N, options, output_dir and
        # validation_name are defined outside the visible excerpt.

        # Convert the command-line option strings to numeric settings.
        delta_true = float(options.delta)
        # sigma is replicated into a length-5 vector.
        sigma_true = float(options.sigma) * pl.ones(5)
        replicate = int(options.replicate)

        print 'Running random effects validation for:'
        print 'N', N
        print 'delta_true', delta_true
        print 'sigma_true', sigma_true
        print 'replicate', replicate

        # Seed the random number generator from the replicate number.
        mc.np.random.seed(1234567 + replicate)

        # Gaussian process with a quadratic mean and a Matern covariance,
        # observed at three points (presumably mesh [0, 25, 100] with values
        # [-5, -3, -5] -- confirm against gp.observe's argument order).
        M = gp.Mean(validate_consistent_re_model.quadratic)
        C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
        gp.observe(M, C, [0, 25, 100], [-5, -3, -5])

        # Three separate realizations, stored under the keys 'i', 'r' and 'f';
        # each entry returns the exponential of its realization at x.
        true = {}
        li = gp.Realization(M, C)
        true['i'] = lambda x: pl.exp(li(x))
        lr = gp.Realization(M, C)
        true['r'] = lambda x: pl.exp(lr(x))
        lf = gp.Realization(M, C)
        true['f'] = lambda x: pl.exp(lf(x))

        # Run the validation and write its results to
        # <output_dir>/<validation_name>/<rows>-<delta>-<sigma>-<replicate>.csv
        model = validate_consistent_re_model.validate_consistent_re(
            N, delta_true, sigma_true, true)
        model.results.to_csv(
            '%s/%s/%s-%s-%s-%s.csv' %
            (output_dir, validation_name, options.numberofrows, options.delta,
             options.sigma, options.replicate))
Beispiel #4
0
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, iters):
    '''
    Fit a Gaussian process per country and write the draws to a CSV file.

    For every distinct iso3 value and every submodel, the submodel's spacetime
    prediction (linearly interpolated over years) is used as the GP mean and a
    Matern covariance is built from the submodel's mean amplitude and `scale`.
    The GP is conditioned on the observed 'lt_prev' values when the country
    has more than one, and `iters` realizations are evaluated at every year
    present in the data.

    infile           -- CSV read with csv2rec. Columns used: iso3, year,
                        lt_prev and, for each submodel k (1-based),
                        spacetime_k, spacetime_amplitude_k and
                        spacetime_data_variance_k.
    outfile          -- path handed to rec2csv for the results.
    dv_list          -- kept for interface compatibility; not used here
                        (the observed values are always 'lt_prev').
    scale            -- scale of the Matern covariance.
    number_submodels -- number of submodels present in the input.
    iters            -- number of draws per submodel.

    The output has the columns iso3 and year, then for each submodel k the
    draws gpr_k_spacetime_d1..d<iters> and their mean gpr_k_spacetime_mean.
    '''
    # only one kind of prior is currently fitted; the draw slots below are
    # still laid out for two kinds per submodel
    prior_kinds = ['spacetime']

    # load in the data, dropping rows with a missing spacetime prediction
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        all_data = np.delete(
            all_data,
            np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
            axis=0)

    # HKG, MAC and SGP have no data and raised an error further down that was
    # never tracked to its cause, so they are excluded
    for excluded in ('HKG', 'MAC', 'SGP'):
        all_data = all_data[all_data['iso3'] != excluded]

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of countries, repeated once per prediction year
    country_age = np.array([all_data.iso3[i] for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))
    n_rows = len(country_age_list)

    # make empty arrays in which to store the results
    draws = [
        np.empty(n_rows, 'float')
        for _ in range(iters * number_submodels * 2)
    ]
    iso3 = np.empty(n_rows, '|S3')
    year = np.empty(n_rows, 'int')

    # loop through countries
    for ca in np.unique(country_age_list):

        print('GPRing ' + ca)

        # output rows belonging to this country (one per prediction year)
        ca_rows = country_age_list == ca

        # the identifier columns do not depend on the submodel
        iso3[ca_rows] = ca[0:3]
        year[ca_rows] = year_list

        # subset the data for this particular country
        ca_data = all_data[country_age == ca]

        # subset just the observed data; an object-typed column cannot be
        # tested with isnan and is treated as having no observations
        has_data = False
        if ca_data['lt_prev'].dtype.kind != 'O':
            ca_observed = ca_data[~np.isnan(ca_data['lt_prev'])]
            has_data = len(ca_observed) > 1

        # loop through each submodel
        for m in range(number_submodels):
            suffix = '_' + str(m + 1)

            for x, t in enumerate(prior_kinds):

                # mean prediction for every year in year_list
                ca_prior = np.array([
                    np.mean(ca_data[t + suffix][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country
                amplitude = np.mean(ca_data[t + '_amplitude' + suffix])

                # linear interpolation of the predictions, used as the mean
                # function for GPR
                def mean_function(xs):
                    return np.interp(xs, year_list, ca_prior)

                # setup the mean and covariance functions
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M,
                               C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed[t + '_data_variance' +
                                                 suffix],
                               obs_vals=ca_observed['lt_prev'])

                # draw realizations and store them in this submodel's slots
                realizations = [gp.Realization(M, C) for _ in range(iters)]
                first_slot = (2 * m + x) * iters
                for i in range(iters):
                    draws[first_slot + i][ca_rows] = realizations[i](
                        year_list)

    # save the results
    print('Saving GPR results')
    # exactly one name per array: with an extra 'age_group' entry in the
    # name list the year column ended up labelled 'age_group'
    results = np.rec.fromarrays([iso3, year], names=['iso3', 'year'])
    for m in range(number_submodels):
        for x, t in enumerate(prior_kinds):
            first_slot = (2 * m + x) * iters
            prefix = 'gpr_' + str(m + 1) + '_' + t
            for i in range(iters):
                results = recfunctions.append_fields(
                    results, prefix + '_d' + str(i + 1),
                    draws[first_slot + i])
            results = recfunctions.append_fields(
                results, prefix + '_mean',
                np.mean(draws[first_slot:first_slot + iters], axis=0))

    # write the file once, after every submodel has been appended
    rec2csv(results, outfile)
 def draw(self):
     '''Sample a new random realization of the spline from its GP.

     Runs the one-time setup first if it has not happened yet, then stores
     the sample on ``self.realization``.
     '''
     already_set_up = self.setup
     if not already_set_up:
         self._setup()
     mean, covariance = self.M, self.C
     self.realization = GP.Realization(mean, covariance)
Beispiel #6
0
def fit_gpr(df,
            amp,
            obs_variable='observed_data',
            obs_var_variable='obs_data_variance',
            mean_variable='st_prediction',
            year_variable='year',
            scale=40,
            diff_degree=2,
            draws=0):
    '''
    Fit a Gaussian process to one time series and attach the results to df.

    The GP mean is the linear interpolation of `mean_variable` over
    `year_variable`; the covariance is a Matern function with the given
    amplitude, scale and degree of differentiability. Rows where both the
    observation and its variance are present are used to condition the GP.

    df               -- data frame with the columns named by the
                        *_variable arguments.
    amp              -- amplitude of the covariance function.
    obs_variable     -- column holding the observed values.
    obs_var_variable -- column holding the observation variances.
    mean_variable    -- column holding the prior mean.
    year_variable    -- column holding the year.
    scale            -- scale of the covariance function.
    diff_degree      -- diff_degree of the Matern covariance.
    draws            -- number of realizations to add as columns draw0,
                        draw1, ... (none when 0).

    Returns df left-merged on `year_variable` with the new columns gpr_mean,
    gpr_var, gpr_lower, gpr_upper (mean -/+ 1.96 standard deviations) and
    the requested draw columns.
    '''
    # rows usable as observations: both the value and its variance are known
    # (.loc replaces the removed DataFrame.ix accessor)
    has_obs = pd.notnull(df[obs_variable]) & pd.notnull(df[obs_var_variable])
    data = df.loc[has_obs]

    # one prior value per year, ordered by year because np.interp requires
    # increasing x-coordinates
    mean_prior = (df[[year_variable, mean_variable]]
                  .drop_duplicates()
                  .sort_values(year_variable))
    years = mean_prior[year_variable].values
    prior_values = mean_prior[mean_variable].values

    def mean_function(x):
        return np.interp(x, years, prior_values)

    M = gp.Mean(mean_function)
    C = gp.Covariance(eval_fun=gp.matern.euclidean,
                      diff_degree=diff_degree,
                      amp=amp,
                      scale=scale)

    if not data.empty:
        gp.observe(M=M,
                   C=C,
                   obs_mesh=data[year_variable].values,
                   obs_V=data[obs_var_variable].values,
                   obs_vals=data[obs_variable].values)

    model_mean = M(years).T
    model_variance = C(years)
    half_width = np.sqrt(model_variance) * 1.96

    gpr = pd.DataFrame({
        year_variable: years,
        'gpr_mean': model_mean,
        'gpr_var': model_variance,
        'gpr_lower': model_mean - half_width,
        'gpr_upper': model_mean + half_width
    })

    # Evaluate every draw at exactly the years of the summary rows so the
    # values line up with them, whatever the year column is called and even
    # when the years have gaps.
    for i in range(draws):
        gpr["draw" + str(i)] = gp.Realization(M, C)(years)

    # original columns first, followed by the GPR columns
    return pd.merge(df, gpr, on=year_variable, how='left')
Beispiel #7
0
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test,
            spacetime_iters, top_submodel):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m + 1)].dtype == 'float64':
            all_data = np.delete(
                all_data,
                np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
                axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([
        str(all_data.iso3[i]) + '_' + str(all_data.age_group[i])
        for i in range(len(all_data))
    ])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    total_iters = np.sum(spacetime_iters)
    draws = [
        np.empty(len(country_age_list), 'float') for i in range(total_iters)
    ]
    if (top_submodel > 0):
        top_submodel_draws = [
            np.empty(len(country_age_list), 'float') for i in range(100)
        ]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0)
                                  & (ca_data['test_' + test] == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # keep track of how many iterations have been added for this model
        iter_counter = 0

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # continue making predictions if we actually need draws for this model
            if (spacetime_iters[m] > 0) or (m + 1 == top_submodel):

                # skip models with no spacetime results
                if all_data['spacetime_' + str(m + 1)].dtype != 'float64':
                    for i in range(spacetime_iters[m]):
                        draws[iter_counter][country_age_list == ca] = np.NaN
                        iter_counter += 1
                    if (m + 1 == top_submodel):
                        for i in range(100):
                            top_submodel_draws[i][country_age_list ==
                                                  ca] = np.NaN
                    continue

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data['spacetime_' +
                                    str(m + 1)][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data['spacetime_amplitude_' +
                                            str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M,
                               C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed['spacetime_data_variance_' +
                                                 str(m + 1)],
                               obs_vals=ca_observed[dv])

                # draw realizations from the data
                realizations = [
                    gp.Realization(M, C) for i in range(spacetime_iters[m])
                ]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                age_group[country_age_list == ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(spacetime_iters[m]):
                    try:
                        draws[iter_counter][country_age_list ==
                                            ca] = realizations[i](year_list)
                    except:
                        print('Failure in ' + ca)
                    iter_counter += 1

                # if it's the top submodel, do 100 additional draws
                if (m + 1 == top_submodel):
                    realizations = [gp.Realization(M, C) for i in range(100)]
                    for i in range(100):
                        try:
                            top_submodel_draws[i][country_age_list ==
                                                  ca] = realizations[i](
                                                      year_list)
                        except:
                            print('Failure in ' + ca)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for i in range(total_iters):
        results = recfunctions.append_fields(results,
                                             'ensemble_d' + str(i + 1),
                                             draws[i])
    if (top_submodel > 0):
        for i in range(100):
            results = recfunctions.append_fields(results,
                                                 'top_submodel_d' + str(i + 1),
                                                 top_submodel_draws[i])
    rec2csv(results, outfile)
Beispiel #8
0
        # Usage hints printed at the end of the preceding branch (its start is
        # outside the visible excerpt).
        print "\ninspect with:\nresults.unstack()['mare', '50%'].unstack() # for example"
        print "or: results.unstack()['mare', '50%'].unstack(2).reindex(columns='Very Moderately Slightly'.split())"

    else:
        # NOTE(review): the matching `if`, and the definitions of options,
        # output_dir and validation_name, are outside the visible excerpt.

        # Convert the command-line option strings to the settings used below.
        N = int(options.numberofrows)
        delta_true = float(options.delta)
        # sigma is replicated into a length-5 vector.
        sigma_true = float(options.sigma) * pl.ones(5)
        replicate = int(options.replicate)
        smoothness = options.smoothing

        print 'Running random effects validation for:'
        print 'N', N
        print 'delta_true', delta_true
        print 'sigma_true', sigma_true
        print 'replicate', replicate
        print 'smoothness', smoothness

        # Gaussian process with a quadratic mean and a Matern covariance,
        # observed at three points (presumably mesh [0, 25, 100] with values
        # [-5, -3, -5] -- confirm against gp.observe's argument order).
        M = gp.Mean(validate_age_integrating_re.quadratic)
        C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
        gp.observe(M, C, [0, 25, 100], [-5, -3, -5])

        # true_p returns the exponential of one GP realization at x.
        log_p = gp.Realization(M, C)
        true_p = lambda x: pl.exp(log_p(x))

        # Run the validation and write its results to
        # <output_dir>/<validation_name>/<rows>-<delta>-<sigma>-<replicate>.csv
        model = validate_age_integrating_re.validate_ai_re(
            N, delta_true, sigma_true, true_p, smoothness)
        model.results.to_csv(
            '%s/%s/%s-%s-%s-%s.csv' %
            (output_dir, validation_name, options.numberofrows, options.delta,
             options.sigma, options.replicate))
        # NOTE(review): this is a fragment; N, options, output_dir and
        # validation_name are defined outside the visible excerpt.

        # Convert the command-line option strings to numeric settings.
        delta_true = float(options.delta)
        replicate = int(options.replicate)
        bias = float(options.bias)
        sigma_prior = float(options.sigma)

        print 'Running random effects validation for:'
        print 'N', N
        print 'delta_true', delta_true
        print 'bias', bias
        print 'sigma_prior', sigma_prior
        print 'replicate', replicate

        # Gaussian process with a quadratic mean and a Matern covariance,
        # observed at three points (presumably mesh [0, 30, 100] with values
        # [-5, -3, -5] -- confirm against gp.observe's argument order).
        M = gp.Mean(validate_similarity.quadratic)
        C = gp.Covariance(gp.matern.euclidean, amp=1., diff_degree=2, scale=50)
        gp.observe(M, C, [0, 30, 100], [-5, -3, -5])

        # `true` is not used again in the visible code.
        true = {}
        # true_p returns the exponential of one GP realization at x.
        lp = gp.Realization(M, C)
        true_p = lambda x: pl.exp(lp(x))

        # Generate the data once, then refit the same model for each
        # heterogeneity setting and write one CSV per setting.
        model = validate_similarity.generate_data(N, delta_true, true_p,
                                                  'Unusable', bias,
                                                  sigma_prior)
        for het in 'Very Moderately Slightly'.split():
            model.parameters['p']['heterogeneity'] = het
            validate_similarity.fit(model)
            model.results.to_csv(
                '%s/%s/%s-%s-%s-%s-%s-%s.csv' %
                (output_dir, validation_name, options.numberofrows,
                 options.delta, het, bias, sigma_prior, options.replicate))
Beispiel #10
0
 def draw(self):
     '''Sample a new random realization of the spline from its GP.

     The sample is built from ``self.M`` and ``self.C`` and stored on
     ``self.realization``.
     '''
     mean, covariance = self.M, self.C
     self.realization = GP.Realization(mean, covariance)