Example 1
    def copy_and_backfill(self):
        prof_id_cret_old = self.me_map["cretinism"]["srcs"]
        old = self.me_dict[prof_id_cret_old].reset_index()

        # Handle year differences between gbd2016 and gbd2017
        old.loc[old.year_id == 2016, 'year_id'] = 2017
        # Handle Saudi Arabia
        loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=4)
        saudi_id = 152
        saudi_sub_nats = loc_meta.loc[loc_meta.parent_id == saudi_id,
                                      'location_id'].tolist()
        # Copy the subnational rows, relabel them to the national parent, and
        # dedupe; .copy() avoids pandas' SettingWithCopyWarning on the slice.
        saudi_arabia = old.loc[old.location_id.isin(saudi_sub_nats), :].copy()
        saudi_arabia.loc[:, 'location_id'] = saudi_id
        saudi_arabia = saudi_arabia.drop_duplicates(keep='first')
        old = pd.concat([old, saudi_arabia], axis=0)

        # Handle other location differences between gbd2016 and gbd2017
        data_cols = self.draw_cols
        data_dct = {'data_cols': data_cols}
        index_cols = list(set(old.columns) - set(data_cols))
        index_cols.remove('location_id')
        demo = get_demographics(gbd_team='epi', gbd_round_id=5)
        index_dct = {
            tuple(index_cols):
            list(set(tuple(x) for x in old[index_cols].values)),
            'location_id': demo['location_id']
        }
        gbdizer = gbdize.GBDizeDataFrame(
            dimensionality.DataFrameDimensions(index_dct, data_dct))
        new = gbdizer.fill_location_from_nearest_parent(old,
                                                        location_set_id=35,
                                                        gbd_round_id=5)
        prof_id_cret_new = self.me_map["cretinism"]["trgs"]
        self.me_dict[prof_id_cret_new] = new
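The Saudi Arabia block above is an instance of a general pattern: copy the subnational rows, relabel them to the national parent, and de-duplicate before concatenating. A minimal pandas-only sketch of that pattern, using toy location ids (100 as a hypothetical parent of 101 and 102) and no GBD helpers:

import pandas as pd

# Toy draws for two subnationals of parent location 100.
old = pd.DataFrame({
    'location_id': [101, 102],
    'year_id': [2017, 2017],
    'draw_0': [0.1, 0.1],
})
parent_id = 100
sub_nats = [101, 102]

# Relabel copies of the subnational rows to the parent, then dedupe so
# the parent appears once per unique index/draw combination.
parent_rows = old.loc[old.location_id.isin(sub_nats)].copy()
parent_rows['location_id'] = parent_id
parent_rows = parent_rows.drop_duplicates(keep='first')
old = pd.concat([old, parent_rows], axis=0, ignore_index=True)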
Example 2
def fill_square(df, col, gbd_round_id):
    """Make data square across a column for a set of index columns."""
    demo = get_demographics(gbd_team='epi', gbd_round_id=gbd_round_id)
    draw_cols = list(df.filter(like='draw_').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    index_cols.remove(col)
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        col: demo[col]
    }
    data_dct = {'draw_cols': draw_cols}
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    return gbdizer.fill_empty_indices(df, 0)
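A hedged usage sketch: assuming df already holds draw_* columns plus the usual id columns, this squares it across every epi age group for GBD round 5.

# Fill missing age_group_id rows with zero draws.
df = fill_square(df, col='age_group_id', gbd_round_id=5)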
Example 3
def backfill(df, norway_id, code_dir, loc_meta):
    # Backfill Norway subnational locations from the national parent.
    data_cols = ['cases', 'effective_sample_size', 'sample_size']
    data_dct = {'data_cols': data_cols}
    index_cols = list(set(df.columns) - set(data_cols))
    index_cols.remove('location_id')
    norway_subs = loc_meta.loc[loc_meta.parent_id == norway_id,
                               'location_id'].tolist()
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        'location_id': norway_subs
    }
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    backfilled = gbdizer.fill_location_from_nearest_parent(df,
                                                           location_set_id=35,
                                                           gbd_round_id=5)
    return backfilled
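A usage sketch under stated assumptions: Norway's location_id is 90 in the GBD hierarchy, loc_meta comes from the same get_location_metadata call shown in Example 1, and code_dir is accepted by the signature but unused in the body.

loc_meta = get_location_metadata(location_set_id=35, gbd_round_id=5)
backfilled = backfill(df, norway_id=90, code_dir=None, loc_meta=loc_meta)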
Example 4
def fill_square(df, index_cols, square_col, square_col_vals, fill_val=0):
    """make data square across a column for a set of index columns"""
    # get index dimensions
    index_cols = [col for col in index_cols if col != square_col]
    index_dct = {
        tuple(index_cols): list(set(tuple(x) for x in df[index_cols].values)),
        square_col: square_col_vals
    }

    # get data dimensions
    data_dct = {
        "non_draw_cols":
        [col for col in df.columns if col not in index_cols + [square_col]]
    }

    # make it square
    gbdizer = gbdize.GBDizeDataFrame(
        dimensionality.DataFrameDimensions(index_dct, data_dct))
    df = gbdizer.fill_empty_indices(df, fill_val)
    return df
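Because this variant takes the square column's values explicitly, it can be exercised with toy data. A self-contained sketch (toy ids; assumes only pandas and the gbdize/dimensionality modules used above):

import pandas as pd

df = pd.DataFrame({
    'location_id': [6, 6],
    'year_id': [2017, 2017],
    'age_group_id': [2, 3],
    'cases': [10.0, 12.0],
})
# Rows for the missing age groups 4 and 5 are created with cases == 0.
square = fill_square(
    df,
    index_cols=['location_id', 'year_id', 'age_group_id'],
    square_col='age_group_id',
    square_col_vals=[2, 3, 4, 5],
    fill_val=0)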
Example 5
    def __init__(self,
                 split_version_id,
                 output_dir,
                 decomp_step,
                 location_id=None,
                 year_id=None,
                 age_group_id=None,
                 sex_id=None,
                 measure_id=None,
                 location_set_id=35,
                 gbd_round_id=gbd.GBD_ROUND_ID,
                 n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # The read func is derived from static values; calling it here
        # initializes the internal caching.
        self._read_func = split_prop_read_func()
        cached_props = self._read_func(params={}, filters=self.ss_filters)

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
            if -1 in cached_props["age_start"].unique().tolist():
                age_group_id.append(164)
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        sp_formula = SevPropFormula()
        sp_formula.build_custom_draw_source(params={},
                                            read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        mvid, dstep = get_best_model_version_and_decomp_step(
            output_dir, int(self.parent_meid))
        src = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=int(self.parent_meid),
            model_version_id=mvid,
            gbd_round_id=gbd_round_id,
            decomp_step=dstep)
        src.remove_transform(automagic_age_sex_agg)

        if n_draws < 1000:
            src.add_transform(group_and_downsample, n_draws)

        self._epi_draw_source = src

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example 6
    def __init__(
            self, split_version_id, output_dir, location_id=[], year_id=[],
            age_group_id=[], sex_id=[], measure_id=[], location_set_id=35,
            gbd_round_id=5, n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in dbtrees.loctree(
                    location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id).leaves()]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2017]
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        if not sex_id:
            sex_id = [1, 2]
        if not measure_id:
            measure_id = [5, 6]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                             data_dict)

        # The read func is derived from static values; calling it here
        # initializes the internal caching.
        self._read_func = split_prop_read_func()
        self._read_func(params={}, filters=self.ss_filters)

        # ss draw source is derived from static values
        sp_formula = SevPropFormula(
            location_set_id=location_set_id, n_draws=n_draws)
        sp_formula.build_custom_draw_source(
            params={}, read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        self._epi_draw_source = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=self.parent_meid,
            gbd_round_id=gbd_round_id)

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example 7
    def __init__(self,
                 me_map,
                 output_dir,
                 location_id=[],
                 year_id=[],
                 age_group_id=[],
                 sex_id=[],
                 measure_id=[],
                 location_set_id=35,
                 gbd_round_id=5,
                 n_draws=1000,
                 copy_env_inc=False):
        # set static values
        self.me_map = me_map
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.copy_env_inc = copy_env_inc

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2017]
        if not age_group_id:
            age_group_id = list(range(2, 21)) + [30, 31, 32, 235]
        if not sex_id:
            sex_id = [1, 2]
        if not measure_id:
            measure_id = [5, 6]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(me_map["sub"].keys()) + [me_map["env"]]:
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                gbd_round_id=gbd_round_id)
            me_source.remove_transform(automagic_age_sex_agg)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example 8
    def __init__(
            self, process_name, output_dir, decomp_step, location_id=None,
            year_id=None, age_group_id=None, sex_id=None, measure_id=None,
            location_set_id=35, gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):

        # validate decomp_step
        validate_decomp_step("ExAdjust", decomp_step, gbd_round_id)

        this_file = os.path.realpath(__file__)
        this_dir = os.path.dirname(this_file)
        filepath = os.path.join(this_dir,"..","maps","final.json")
        with open(filepath, 'r') as f:
            emap = json.load(f)
        me_map = emap[process_name]["kwargs"]["me_map"]

        # set static values
        self.me_map = json_parser(json.dumps(me_map))
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        try:
            self.copy_env_inc = emap[
                process_name]["kwargs"].pop("copy_env_inc")
        except KeyError:
            self.copy_env_inc = False

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in dbtrees.loctree(
                    location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id).leaves()]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                             data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(self.me_map["sub"].keys()) + [self.me_map["env"]]:
            mvid, dstep = (
                get_best_model_version_and_decomp_step(output_dir, me_id)
            )
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                model_version_id=mvid,
                gbd_round_id=gbd_round_id,
                decomp_step=dstep
            )
            me_source.remove_transform(automagic_age_sex_agg)
            if n_draws < 1000:
                me_source.add_transform(group_and_downsample, n_draws)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
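Example 8's __init__ appears to belong to the ExAdjust class named in its validate_decomp_step call. A hypothetical instantiation, with placeholder process name and paths:

adjuster = ExAdjust(
    process_name='example_process',  # placeholder key into maps/final.json
    output_dir='/path/to/output',    # placeholder
    decomp_step='iterative',
    n_draws=500)  # n_draws < 1000 adds the downsampling transform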