Beispiel #1
0
    def pull_data(self):
        """Load the stored model objects this step needs and attach them to self.

        Sets ``amplitude`` ('amp_nsv'), ``adj_data`` ('adj_data' after
        ``hlp.drop_ko_values``), ``spacetime`` ('st') and ``locs`` (the
        location hierarchy restricted to the space variable, ``level`` and the
        ``level_<i>`` columns).
        """
        self.amplitude = hlp.model_load(
            self.run_id, 'amp_nsv', self.holdout_num, self.param_set,
            output_path=self.output_path)

        # adjusted data, passed through drop_ko_values for this holdout
        loaded = hlp.model_load(
            self.run_id, 'adj_data', self.holdout_num, self.param_set,
            output_path=self.output_path)
        self.adj_data = hlp.drop_ko_values(
            loaded, self.holdout_num, datavar=self.datavar)

        # spacetime estimates
        self.spacetime = hlp.model_load(
            self.run_id, 'st', self.holdout_num, self.param_set,
            output_path=self.output_path)

        # location hierarchy: one level_<i> column per level 0..max(level)
        hierarchy = hlp.model_load(
            self.run_id, 'location_hierarchy', output_path=self.output_path)
        n_levels = max(hierarchy.level.unique() + 1)
        level_columns = list(map('level_{}'.format, range(n_levels)))
        self.locs = hierarchy[[self.spacevar, 'level'] + level_columns]
Beispiel #2
0
def save_rake_summaries(run_id, output_path, holdout_num=0, param_set=0):
    """Combine un-raked and raked GPR summaries and save them as 'raked'.

    GPR rows whose national-level ancestor has no locations below
    ``NATIONAL_LEVEL`` are taken from the stored 'gpr' object; all other rows
    come from the CSV files found in ``<output_path>/rake_means_temp_<holdout_num>/``.

    Args:
        run_id: model run identifier passed through to ``hlp``.
        output_path: root directory of the run's outputs.
        holdout_num: holdout index used to locate inputs and label the output.
        param_set: parameter-set index used to locate inputs and label the output.

    Returns:
        None. The combined frame is written with ``hlp.model_save``.
    """
    # column of the hierarchy that holds each location's national-level ancestor
    national_col = 'level_{}'.format(NATIONAL_LEVEL)

    # national ancestors that have at least one location below national level
    locs = hlp.model_load(run_id,
                          'location_hierarchy',
                          output_path=output_path)
    subnat_locations = locs.loc[locs.level > NATIONAL_LEVEL,
                                national_col].unique()

    # keep gpr rows only for locations outside those countries
    df = hlp.model_load(run_id,
                        'gpr',
                        param_set=param_set,
                        holdout=holdout_num,
                        output_path=output_path)
    df = df.merge(locs[[SPACEVAR, national_col]], on=SPACEVAR, how='left')
    df = df.loc[~df[national_col].isin(subnat_locations)]

    # read the per-file raked results; sorted so the saved row order does not
    # depend on the arbitrary order os.listdir returns
    inpath = '{}/rake_means_temp_{}/'.format(output_path, holdout_num)
    raked = pd.concat(
        [pd.read_csv(os.path.join(inpath, f))
         for f in sorted(os.listdir(inpath))],
        sort=True)

    # The explicit column selection discards the helper level column, so no
    # separate drop is needed (the former drop hard-coded 'level_3' and failed
    # whenever that column was absent).
    df = pd.concat([df, raked], sort=True)
    df = df[IDS + ['gpr_mean', 'gpr_lower', 'gpr_upper']]

    hlp.model_save(df,
                   run_id,
                   'raked',
                   holdout=holdout_num,
                   param_set=param_set,
                   output_path=output_path)
Beispiel #3
0
    def prep_indata(self):
        """Bring in prepped data, location hierarchy and stage1 linear model
        estimates and merge together for input into spacetime stage.

        Sets ``self.data``, ``self.stage1``, ``self.all_locations`` (location
        ids at or below ``NATIONAL_LEVEL`` in the hierarchy) and ``self.df``
        (the merged frame, sorted by ``self.ids``).
        """

        # Retrieve data
        self.data = hlp.model_load(
            self.run_id, 'prepped', holdout=self.holdout_num,
            output_path=self.output_path)
        self.stage1 = hlp.model_load(
            self.run_id, 'stage1', holdout=self.holdout_num,
            output_path=self.output_path)
        all_locations = hlp.model_load(
            self.run_id, 'location_hierarchy', output_path=self.output_path
        ).query('level >= {}'.format(NATIONAL_LEVEL))
        self.all_locations = all_locations['location_id'].tolist()

        # Merge
        df = pd.merge(self.stage1, self.data, how='outer', on=self.ids)

        # force every column whose name contains "level_" to float
        lvlcols = [s for s in df.columns if "level_" in s]
        df[lvlcols] = df[lvlcols].astype(float)

        # Drop any data specified as 1 by ko_{holdout_num}
        df = hlp.drop_ko_values(df, self.holdout_num, datavar=self.datavar)

        # Create temporary age index: keep the original values, then replace
        # the age column with 0-based codes in sorted order of the distinct
        # ages. pd.factorize(sort=True) is the public equivalent of the
        # groupby-internal `grouper.group_info[0]` codes used previously.
        df[self.orig_agevar] = df[self.agevar]
        df[self.agevar] = pd.factorize(df[self.agevar], sort=True)[0]

        # Sort
        self.df = df.sort_values(by=self.ids)
Beispiel #4
0
def calculate_fit_stats(run_id, output_path, run_type, holdout_num, holdouts,
                        param_groups, csv=True, inv_variance_weight=False):
    """Build a table of RMSE fit statistics for each parameter set and stage.

    For every entry of ``param_groups`` the model data are pulled with
    ``get_data`` and ``calculate_rmse`` is run for 'stage1', 'st' and
    'gpr_mean'. Each result is labelled with its parameter set and the
    zeta / lambda / omega / scale hyperparameters.

    Args:
        run_id: model run identifier.
        output_path: root directory of the run's outputs.
        run_type: run type string; 'in_sample_selection' and 'oos_selection'
            load hyperparameters per holdout and parameter set.
        holdout_num: holdout index being evaluated.
        holdouts: number of holdouts in the run (only compared against 0).
        param_groups: iterable of parameter-set indices to evaluate.
        csv: when true, write the table to disk.
        inv_variance_weight: forwarded to ``calculate_rmse``.

    Returns:
        DataFrame with columns var, ko, parameter_set, zeta, lambdaa, omega,
        scale, in_sample_rmse, oos_rmse; ordered by ascending oos_rmse for
        'oos_selection' runs.
    """
    selection_run = run_type in ['in_sample_selection', 'oos_selection']
    rmse_list = []

    for param in param_groups:

        # pull data
        df = get_data(run_id, output_path, holdout_num, param)

        # selection runs have multiple hyperparameter sets; other runs read
        # the single run-level table. Loaded once per parameter set because
        # it does not depend on the stage being scored.
        if selection_run:
            hyperparams = hlp.model_load(
                run_id, 'parameters', holdout=holdout_num, param_set=param,
                output_path=output_path)
        else:
            hyperparams = hlp.model_load(
                run_id, 'parameters', param_set=None, output_path=output_path)

        for var in ['stage1', 'st', 'gpr_mean']:

            # calculate in-sample and (where relevant) out-of-sample RMSE
            rmse_table = calculate_rmse(
                df, holdout_num, var, inv_variance_weight)
            rmse_table['parameter_set'] = param

            rmse_table['zeta'] = hyperparams[parameters.ST_ZETA].iat[0]
            rmse_table['lambdaa'] = hyperparams[parameters.ST_LAMBDA].iat[0]
            rmse_table['omega'] = hyperparams[parameters.ST_OMEGA].iat[0]
            rmse_table['scale'] = hyperparams[parameters.GPR_SCALE].iat[0]
            if not selection_run:
                rmse_table['density_cutoffs'] = \
                    hyperparams.density_cutoffs.iat[0]

            rmse_list.append(rmse_table)

    rmses = pd.concat(rmse_list)
    rmses = rmses[['var', 'ko', 'parameter_set', 'zeta', 'lambdaa', 'omega',
                   'scale', 'in_sample_rmse', 'oos_rmse']]

    # set in order of 'best' ie lowest oos rmse if ko run
    # (sort_values returns a new frame, so the result must be kept)
    if run_type == 'oos_selection':
        rmses = rmses.sort_values(by=['oos_rmse'], ascending=True)

    if selection_run or holdouts > 0:
        # `param` is the last parameter set iterated above
        outpath = '{}/fit_stats_{}_{}.csv'.format(
            output_path, holdout_num, param)
    else:
        outpath = '{}/fit_stats.csv'.format(output_path)

    if csv:
        rmses.to_csv(outpath, index=False)
        print('Saved fit statistics for holdout {} to {}'.format(
            holdout_num, outpath))

    return rmses
Beispiel #5
0
    def get_hyperparameters(self, param_set=None):
        """Set up hyperparameters for ko-runs (ko), different-hyperparams-by-data-density-runs (dd).

        Reads the ``parameters_<param_set>`` table from the run's 'parameters'
        HDF store and stores it on ``self.hyperparams``.

        Args:
            param_set: parameter-set index used to build the store key.
        """
        # NOTE(review): the return value of this load is discarded, and
        # `holdout_num` is a bare name that must exist at module level; no
        # `self.holdout_num` is visible for this class, so it is left as-is.
        # `run_id` now uses the instance attribute that the store path below
        # already relies on.
        hlp.model_load(self.run_id, 'parameters',
                       holdout=holdout_num, param_set=param_set, output_path=self.output_path)

        # context manager closes the store even if the key lookup raises
        with pd.HDFStore(hlp.model_path(self.run_id, 'parameters'), 'r') as store:
            self.hyperparams = store.get('parameters_{}'.format(param_set))
Beispiel #6
0
    def pull_data(self):
        """Load the location hierarchy, prepped data and stage1 output onto self.

        Sets ``self.locs``, ``self.data`` and ``self.stage1_df``; the latter
        two are loaded for ``self.holdout_num``.
        """
        run = self.run_id
        out = self.output_path

        self.locs = hlp.model_load(run, 'location_hierarchy', output_path=out)

        # holdout-specific objects: (attribute to set, stored object name)
        for attr, obj in (('data', 'prepped'), ('stage1_df', 'stage1')):
            loaded = hlp.model_load(
                run, obj, holdout=self.holdout_num, output_path=out)
            setattr(self, attr, loaded)
Beispiel #7
0
def get_data(run_id, output_path, holdout_num=0, param_set=0):
    """Return prepped data joined with the stage1, st and gpr outputs.

    The 'stage1' and 'st' columns are passed through
    ``transform_helpers.transform_data(..., reverse=True)`` using the run's
    ``data_transform`` setting. The prepped data contributes 'data' and
    'variance' alongside 'original_data' and 'original_variance', plus the
    ``ko_<holdout_num>`` column.
    """
    # prepped data and the run-level transform setting
    prepped = hlp.model_load(run_id, 'prepped', output_path=output_path)
    run_params = hlp.model_load(
        run_id, 'parameters', param_set=None, output_path=output_path)
    data_transform = run_params['data_transform'].iat[0]

    # subset prepped data to ids, data/variance columns and this holdout's flag
    wanted = (IDS
              + ['data', 'variance', 'original_data', 'original_variance']
              + ['ko_{}'.format(holdout_num)])
    prepped = prepped[wanted]

    # linear (stage1) estimates, without the count column when present
    linear = hlp.model_load(
        run_id, 'stage1', holdout=holdout_num, output_path=output_path)
    if 'location_id_count' in linear.columns:
        linear.drop(columns='location_id_count', inplace=True)

    # spacetime and gpr outputs for this holdout / parameter set
    per_set = dict(holdout=holdout_num, param_set=param_set,
                   output_path=output_path)
    smoothed = hlp.model_load(run_id, 'st', **per_set)
    smoothed.drop(columns='scale', inplace=True)
    gpr_out = hlp.model_load(run_id, 'gpr', **per_set)

    # stage1 <-outer-> data, then left-join st and gpr
    merged = (linear.merge(prepped, on=IDS, how='outer')
              .merge(smoothed, on=IDS, how='left')
              .merge(gpr_out, on=IDS, how='left'))

    # reverse the modelling transform on the stage1 and st columns
    for col in ('stage1', 'st'):
        merged[col] = transform_helpers.transform_data(
            merged[col], data_transform, reverse=True
        )

    return merged
Beispiel #8
0
    def enumerate_parameters(self):
        """Read run-level settings and resolve this job's prediction locations.

        Takes the first row of the run's 'parameters' table for
        ``data_transform``, ``location_set_id``, ``gbd_round_id`` and
        ``decomp_step``, then asks ``hlp.get_parallelization_location_group``
        for ``self.prediction_location_ids``.
        """
        params = hlp.model_load(
            self.run_id, 'parameters', param_set=None,
            output_path=self.output_path)

        def first(column):
            # value of `column` in the first row of the parameters table
            return params[column].iat[0]

        self.data_transform = first('data_transform')
        self.location_set_id = int(first('location_set_id'))
        self.gbd_round_id = int(first('gbd_round_id'))
        self.decomp_step = first('decomp_step')

        # determine parallel_group
        group_args = (self.location_set_id, self.nparallel, self.loc_group,
                      self.gbd_round_id, self.decomp_step)
        self.prediction_location_ids = \
            hlp.get_parallelization_location_group(*group_args)
Beispiel #9
0
    def enumerate_parameters(self):
        """Initialize all model parameters from the run's 'parameters' table.

        Sets the location set id, data transform, amplitude settings
        (``nsv_on``, ``amp_factor``, ``amp_method``, ``amp_cutoff``), the
        prediction year / age group / sex id lists, and ``n_square`` (the
        product of the sizes of those three lists and the number of
        locations with level >= 3).
        """
        params = hlp.model_load(
            self.run_id, 'parameters', param_set=None,
            output_path=self.output_path)

        def first(column):
            # value of `column` in the first row of the parameters table
            return params[column].iat[0]

        # st necessities
        self.lsid = int(first('location_set_id'))
        self.data_transform = first('data_transform')

        # amplitude necessities
        self.nsv_on = int(first('add_nsv'))
        self.amp_factor = float(first('gpr_amp_factor'))
        self.amp_method = first('gpr_amp_method')

        # amplitude cutoff: explicit value when given, otherwise a percentile
        # of max_country_years taken once per national-level location
        cutoff = first('gpr_amp_cutoff')
        if not pd.isna(cutoff):
            self.amp_cutoff = float(cutoff)
        else:
            lvl = f'level_{loc_constants.NATIONAL_LEVEL}'
            per_country = self.country_years[
                [lvl, 'max_country_years']].drop_duplicates([lvl])
            counts = per_country['max_country_years'].tolist()
            self.amp_cutoff = float(
                np.percentile(counts, amp_constants.DEFAULT_CUTOFF_PERCENTILE))

        # prediction demographics are stored as delimited strings
        for name in ('prediction_year_ids',
                     'prediction_age_group_ids',
                     'prediction_sex_ids'):
            parsed = hlp.separate_string_to_list(str(first(name)), int)
            setattr(self, name, parsed)

        # NOTE(review): 3 presumably mirrors loc_constants.NATIONAL_LEVEL
        # used above — confirm before replacing the literal.
        modeled_locs = self.locs.loc[self.locs.level >= 3, columns.LOCATION_ID]
        n_demographics = (len(self.prediction_sex_ids) *
                          len(self.prediction_age_group_ids) *
                          len(self.prediction_year_ids))
        self.n_square = n_demographics * len(modeled_locs)
Beispiel #10
0
    def assign_hyperparameters(self, param_set):
        """Attach hyperparameters to a copy of ``self.df`` via ``density_cat``.

        For run type 'dd' the hyperparameter rows are ordered by
        ``density_cutoffs`` and numbered 0..n-1, and the data rows are
        categorised with ``hlp.assign_density_cats``; for every other run type
        both sides get ``density_cat = 0``. The merged result is stored on
        ``self.in_df``.
        """
        # NOTE(review): `run_id` is read as a bare name here, not from self —
        # it must exist at module level; confirm against the enclosing file.
        hype = hlp.model_load(run_id, 'parameters',
                              holdout=self.holdout_num, param_set=param_set,
                              output_path=self.output_path)
        in_df = self.df.copy()

        # selection runs: rebuild the loaded object as a one-row frame
        if self.run_type in ('in_sample_selection', 'oos_selection'):
            as_columns = {k: v for k, v in zip(hype.keys(), hype.values)}
            hype = pd.DataFrame(as_columns, index=[0])

        if self.run_type == 'dd':
            hype.sort_values(by='density_cutoffs', inplace=True)
            cutoffs = hype.density_cutoffs.tolist()
            hype['density_cat'] = list(range(len(cutoffs)))

            in_df = hlp.assign_density_cats(
                df=in_df,
                country_year_count_var='location_id_count',
                cutoffs=cutoffs)
        else:
            hype['density_cat'] = 0
            in_df['density_cat'] = 0

        self.in_df = in_df.merge(hype, on='density_cat', how='left')
Beispiel #11
0
    def enumerate_parameters(self):
        """Initialize all model parameters.

        Reads the first row of the run's 'parameters' table (with nulls
        replaced by None), resolves this job's prediction locations, derives
        prediction ages and sexes from ``self.df``, parses the optional
        comma-separated custom age vector, and builds the ``level_<i>``
        column names for levels 0..max(level).
        """
        params = hlp.model_load(
            self.run_id, 'parameters', param_set=None,
            output_path=self.output_path)
        params = params.where(pd.notnull(params), None)

        def first(column):
            # value of `column` in the first row of the parameters table
            return params[column].iat[0]

        # st necessities
        self.st_version = first('st_version')
        self.location_set_id = int(first('location_set_id'))
        self.gbd_round_id = int(first('gbd_round_id'))
        self.decomp_step = first('decomp_step')

        # prediction vars
        self.prediction_location_ids = hlp.get_parallelization_location_group(
            self.location_set_id,
            self.nparallel,
            self.loc_group,
            self.gbd_round_id,
            self.decomp_step)
        self.prediction_year_ids = first('prediction_year_ids')
        ages_seen = self.df[self.orig_agevar].unique()
        sexes_seen = self.df[self.sexvar].unique()
        self.prediction_age_group_ids = ages_seen.tolist()
        self.prediction_sex_ids = sexes_seen.tolist()

        # custom age vector: comma-separated floats, or None when not set
        raw_vector = first('st_custom_age_vector')
        self.custom_age_vector = (
            [float(x) for x in raw_vector.split(',')] if raw_vector else None)

        # hierarchy level columns
        self.n_levels = int(max(self.df.level))
        self.level_cols = [
            'level_{i}'.format(i=idx) for idx in range(self.n_levels + 1)]
Beispiel #12
0
    rake_logit = int(sys.argv[6])
    loc = int(sys.argv[7])

    # echo the job arguments; eval() only ever sees the hard-coded names below
    for i in [
            'run_id', 'output_path', 'holdout_num', 'draws', 'run_type',
            'rake_logit', 'loc'
    ]:
        print('{} : {}'.format(i, eval(i)))

    # set ids
    ids = ['location_id', 'year_id', 'age_group_id', 'sex_id']

    # pull in params
    print('Getting model parameters.')
    params = hlp.model_load(run_id,
                            'parameters',
                            param_set=None,
                            output_path=output_path)

    # find best param_set if run with holdouts - else it's automatically zero
    if run_type in ['in_sample_selection', 'oos_selection']:
        fit = pd.read_csv('{}/fit_stats.csv'.format(output_path))
        # Series.unique() returns a NumPy array, which has no .iat accessor;
        # index it positionally instead
        param_set = fit.loc[fit.best == 1, 'parameter_set'].unique()[0]
    else:
        param_set = 0

    # pull in locs and prep demographics table
    print('Getting locations and populations.')
    locs = hlp.model_load(run_id,
                          'location_hierarchy',
                          output_path=output_path)
    pops = hlp.model_load(run_id, 'populations', output_path=output_path)