Example #1
def write_squeezed(sqzd, location_id, year_id, sex_id, map_file):

    tmap = pd.read_csv(map_file)
    for me_id, df in sqzd.groupby('me_id'):

        # look up the target modelable entity for this source ME; skip any
        # source that has no integer target mapping
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid['modelable_entity_id_target'].squeeze()
        try:
            t_meid = int(t_meid)
        except Exception:
            pass
        if not isinstance(t_meid, int):
            continue
        print('Writing squeezed %s to file' % t_meid)
        df['location_id'] = int(float(location_id))
        df['year_id'] = int(float(year_id))
        df['sex_id'] = int(float(sex_id))
        df['measure_id'] = 5
        df['age_group_id'] = df.age_group_id.astype(float).astype(int)
        df["modelable_entity_id"] = t_meid

        pusher = SuperPusher(
            spec={'file_pattern': ("{modelable_entity_id}/{location_id}/"
                                   "{measure_id}_{year_id}_{sex_id}.h5"),
                  'h5_tablename': 'draws'},
            directory=output_dir)
        pusher.push(df, append=False)
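write_squeezed is invoked from run_squeeze in Example #11; a minimal standalone sketch is shown below (the id values are placeholders, and squeezed / MAP_FILE are the names used in Example #11):

# hypothetical call; the ids are placeholders and MAP_FILE must point at the
# modelable_entity_id_source -> modelable_entity_id_target map
write_squeezed(squeezed, location_id=101, year_id=2017, sex_id=2,
               map_file=MAP_FILE)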
Example #2
    def export_summary(self, component, year_type, df):
        if year_type == "single_year":
            pattern = "{measure_id}/single_year/{location_id}/{year_id}.csv"
            index_cols = {
                "cause": [
                    "measure_id", "year_id", "location_id", "sex_id",
                    "age_group_id", "cause_id", "metric_id"
                ],
                "impairment": [
                    "measure_id", "year_id", "location_id", "sex_id",
                    "age_group_id", "cause_id", "rei_id", "metric_id"
                ],
                "injuries": [
                    "measure_id", "year_id", "location_id", "sex_id",
                    "age_group_id", "cause_id", "rei_id", "metric_id"
                ],
                "sequela": [
                    "measure_id", "year_id", "location_id", "sex_id",
                    "age_group_id", "sequela_id", "metric_id"
                ]
            }
        if year_type == "multi_year":
            pattern = "{measure_id}/multi_year/{location_id}.csv"
            index_cols = {
                "cause": [
                    "measure_id", "year_start_id", "year_end_id",
                    "location_id", "sex_id", "age_group_id", "cause_id",
                    "metric_id"
                ],
                "impairment": [
                    "measure_id", "year_start_id", "year_end_id",
                    "location_id", "sex_id", "age_group_id", "cause_id",
                    "rei_id", "metric_id"
                ],
                "injuries": [
                    "measure_id", "year_start_id", "year_end_id",
                    "location_id", "sex_id", "age_group_id", "cause_id",
                    "rei_id", "metric_id"
                ],
                "sequela": [
                    "measure_id", "year_start_id", "year_end_id",
                    "location_id", "sex_id", "age_group_id", "sequela_id",
                    "metric_id"
                ]
            }
        df = sort_index_columns(df, index_cols[component])
        df = df[index_cols[component] + ["val", "upper", "lower"]]
        # path to use in summaries
        directory = os.path.join(self.como_version.como_dir, "summaries",
                                 component)
        pusher = SuperPusher(directory=directory,
                             spec={"file_pattern": pattern})
        pusher.push(df, index=False)
Example #3
    def export_summary(self, component, year_type, df):
        if year_type == "single_year":
            pattern = "{measure_id}/single_year/{location_id}/{year_id}.csv"
        if year_type == "multi_year":
            pattern = "{measure_id}/multi_year/{location_id}.csv"

        # path to use in summaries
        directory = os.path.join(self.como_version.como_dir, "summaries",
                                 component)
        pusher = SuperPusher(directory=directory,
                             spec={"file_pattern": pattern})
        pusher.push(df)
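A hedged usage sketch for either export_summary variant follows; exporter and summ_df are hypothetical names, where summ_df would hold the summary rows with val, upper, and lower columns:

# hypothetical call on an instance whose class defines export_summary
exporter.export_summary(component="cause", year_type="single_year",
                        df=summ_df)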
Example #4
    def __init__(self,
                 split_version_id,
                 output_dir,
                 decomp_step,
                 location_id=None,
                 year_id=None,
                 age_group_id=None,
                 sex_id=None,
                 measure_id=None,
                 location_set_id=35,
                 gbd_round_id=gbd.GBD_ROUND_ID,
                 n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # read func is derived from static values. we call it to initialize the
        # internal caching
        self._read_func = split_prop_read_func()
        cached_props = self._read_func(params={}, filters=self.ss_filters)

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
            if -1 in cached_props["age_start"].unique().tolist():
                age_group_id.append(164)
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        sp_formula = SevPropFormula()
        sp_formula.build_custom_draw_source(params={},
                                            read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        mvid, dstep = get_best_model_version_and_decomp_step(
            output_dir, int(self.parent_meid))
        src = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=int(self.parent_meid),
            model_version_id=mvid,
            gbd_round_id=gbd_round_id,
            decomp_step=dstep)
        src.remove_transform(automagic_age_sex_agg)

        if n_draws < 1000:
            src.add_transform(group_and_downsample, n_draws)

        self._epi_draw_source = src

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example #5
class SevSplitter(object):
    def __init__(self,
                 split_version_id,
                 output_dir,
                 decomp_step,
                 location_id=None,
                 year_id=None,
                 age_group_id=None,
                 sex_id=None,
                 measure_id=None,
                 location_set_id=35,
                 gbd_round_id=gbd.GBD_ROUND_ID,
                 n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # read func is derived from static values. we call it to initialize the
        # internal caching
        self._read_func = split_prop_read_func()
        cached_props = self._read_func(params={}, filters=self.ss_filters)

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
            if -1 in cached_props["age_start"].unique().tolist():
                age_group_id.append(164)
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        sp_formula = SevPropFormula()
        sp_formula.build_custom_draw_source(params={},
                                            read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        mvid, dstep = get_best_model_version_and_decomp_step(
            output_dir, int(self.parent_meid))
        src = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=int(self.parent_meid),
            model_version_id=mvid,
            gbd_round_id=gbd_round_id,
            decomp_step=dstep)
        src.remove_transform(automagic_age_sex_agg)

        if n_draws < 1000:
            src.add_transform(group_and_downsample, n_draws)

        self._epi_draw_source = src

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)

    @property
    def ss_filters(self):
        return {"split_version_id": self.split_version_id}

    @property
    def demo_filters(self):
        return {
            "location_id": self.dimensions.index_dim.get_level("location_id"),
            "age_group_id":
            self.dimensions.index_dim.get_level("age_group_id"),
            "year_id": self.dimensions.index_dim.get_level("year_id"),
            "sex_id": self.dimensions.index_dim.get_level("sex_id"),
            "measure_id": self.dimensions.index_dim.get_level("measure_id")
        }

    @property
    def parent_meid(self):
        df = self._read_func(params={}, filters=self.ss_filters)
        return df.parent_meid.unique()[0]

    @property
    def child_meid(self):
        df = self._read_func(params={}, filters=self.ss_filters)
        return df.child_meid.unique().tolist()

    def split(self):
        # get input draws
        draws = self._epi_draw_source.content(filters=self.demo_filters.copy())
        # get split props
        filters = self.ss_filters
        filters.update(self.demo_filters)
        gprops = self._ss_draw_source.content(filters=filters)
        splits = merge_split(draws,
                             gprops,
                             group_cols=self.dimensions.index_names,
                             value_cols=self.dimensions.data_list())
        splits = splits.assign(modelable_entity_id=splits['child_meid'])
        splits = splits[self.dimensions.index_names + ["modelable_entity_id"] +
                        self.dimensions.data_list()]
        splits = splits.fillna(0)
        self.pusher.push(splits, append=False)

    def _q_split(self, inq, outq):
        for location_id in iter(inq.get, sentinel):
            print(location_id)
            try:
                self.dimensions.index_dim.replace_level(
                    "location_id", location_id)
                self.split()
                outq.put((False, location_id))
            except Exception as e:
                outq.put((ExceptionWrapper(e), location_id))

    def run_all_splits_mp(self, n_processes=23):
        inq = Queue()
        outq = Queue()

        # create and start the worker processes
        split_procs = []
        min_procs = min([
            n_processes,
            self.dimensions.index_dim.cardinality("location_id")
        ])
        for i in range(min_procs):
            p = Process(target=self._q_split, args=(inq, outq))
            split_procs.append(p)
            p.start()

        # queue up the locations to split
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            inq.put(location_id)

        # make the workers die after
        for _ in split_procs:
            inq.put(sentinel)

        # get results
        results = []
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            proc_result = outq.get()
            results.append(proc_result)

        # close up the queue
        for p in split_procs:
            p.join()

        for exc, location_id in results:
            if exc:
                exc.re_raise()
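A driver for SevSplitter might look like the sketch below; every argument value is a placeholder, and the constructor defaults fill in the demographic dimensions:

# hypothetical driver; split_version_id, output_dir and decomp_step are placeholders
splitter = SevSplitter(split_version_id=123,
                       output_dir="/path/to/draws",
                       decomp_step="step4",
                       n_draws=1000)
splitter.run_all_splits_mp(n_processes=10)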
Example #6
    def __init__(
            self, split_version_id, output_dir, location_id=[], year_id=[],
            age_group_id=[], sex_id=[], measure_id=[], location_set_id=35,
            gbd_round_id=5, n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in dbtrees.loctree(
                    location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id).leaves()]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2017]
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        if not sex_id:
            sex_id = [1, 2]
        if not measure_id:
            measure_id = [5, 6]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                             data_dict)

        # read func is derived from static values. we call it to initialize the
        # internal caching
        self._read_func = split_prop_read_func()
        self._read_func(params={}, filters=self.ss_filters)

        # ss draw source is derived from static values
        sp_formula = SevPropFormula(
            location_set_id=location_set_id, n_draws=n_draws)
        sp_formula.build_custom_draw_source(
            params={}, read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        self._epi_draw_source = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=self.parent_meid,
            gbd_round_id=gbd_round_id)

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example #7
    def __init__(self,
                 me_map,
                 output_dir,
                 location_id=[],
                 year_id=[],
                 age_group_id=[],
                 sex_id=[],
                 measure_id=[],
                 location_set_id=35,
                 gbd_round_id=5,
                 n_draws=1000,
                 copy_env_inc=False):
        # set static values
        self.me_map = me_map
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.copy_env_inc = copy_env_inc

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2017]
        if not age_group_id:
            age_group_id = list(range(2, 21)) + [30, 31, 32, 235]
        if not sex_id:
            sex_id = [1, 2]
        if not measure_id:
            measure_id = [5, 6]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(me_map["sub"].keys()) + [me_map["env"]]:
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                gbd_round_id=gbd_round_id)
            me_source.remove_transform(automagic_age_sex_agg)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example #8
class ExAdjust(object):
    def __init__(self,
                 me_map,
                 output_dir,
                 location_id=[],
                 year_id=[],
                 age_group_id=[],
                 sex_id=[],
                 measure_id=[],
                 location_set_id=35,
                 gbd_round_id=5,
                 n_draws=1000,
                 copy_env_inc=False):
        # set static values
        self.me_map = me_map
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        self.copy_env_inc = copy_env_inc

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2017]
        if not age_group_id:
            age_group_id = list(range(2, 21)) + [30, 31, 32, 235]
        if not sex_id:
            sex_id = [1, 2]
        if not measure_id:
            measure_id = [5, 6]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(me_map["sub"].keys()) + [me_map["env"]]:
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                gbd_round_id=gbd_round_id)
            me_source.remove_transform(automagic_age_sex_agg)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)

    @property
    def filters(self):
        return {
            "location_id": self.dimensions.index_dim.get_level("location_id"),
            "age_group_id":
            self.dimensions.index_dim.get_level("age_group_id"),
            "year_id": self.dimensions.index_dim.get_level("year_id"),
            "sex_id": self.dimensions.index_dim.get_level("sex_id"),
            "measure_id": self.dimensions.index_dim.get_level("measure_id")
        }

    def _import_draws(self):
        gbdizer = gbdize.GBDizeDataFrame(self.dimensions)

        # import draws
        for me_id in self._importers.keys():
            draw_source = self._importers[me_id]
            draws = draw_source.content(filters=self.filters)
            draws = gbdizer.fill_empty_indices(draws, 0)
            self.draws[me_id] = draws.set_index(self.dimensions.index_names)

    def _calc_sigma_sub(self):
        """calculate the sum of the sub sequela"""
        # concatenate all required frames
        sub_dfs = []
        for me_id in self.me_map["sub"].keys():
            sub_dfs.append(self.draws[me_id])
        sub_df = pd.concat(sub_dfs)

        # return the sum
        sub_df.reset_index(inplace=True)
        if self.copy_env_inc:
            draw_cols = self.dimensions.data_dim.get_level("data")
            sub_df.loc[sub_df['measure_id'] == 6, draw_cols] = 0
        return sub_df.groupby(self.dimensions.index_names).sum()

    def _resid(self):
        """calculate the residual numbers"""
        # get the needed data
        sigma_sub_df = self.draws["sigma_sub"]
        env_df = self.draws[self.me_map["env"]]

        # residual is the envelope minus the subcause sum, kept only where the
        # subcause sum does not exceed the envelope (zero otherwise)
        resid_df = (env_df - sigma_sub_df)[(sigma_sub_df <= env_df)].fillna(0)
        return resid_df

    def _excess(self, sub_me):
        """calculate the excess proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # now calculate the excess values
        excess_df = ((sigma_sub_df[more] - env_df[more]) * sub_me_df[more] /
                     sigma_sub_df[more]).fillna(value=0)
        return excess_df

    def _squeeze(self, sub_me):
        """calculate the squeezed proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # squeeze proportionally where the subcause sum exceeds the envelope;
        # otherwise keep the original values
        squeeze_more = env_df[more] * sub_me_df[more] / sigma_sub_df[more]
        squeeze_less = sub_me_df[~more]
        squeeze_df = squeeze_more.fillna(squeeze_less)
        return squeeze_df

    def _export(self):
        """export all data"""
        # export residual
        me_id = self.me_map["resid"]
        resid_df = self.draws[me_id].reset_index()
        resid_df["modelable_entity_id"] = me_id
        self._pusher.push(resid_df, append=False)

        # export any subcause adjustments
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in self.me_map["sub"][sub_me].keys():
                me_id = self.me_map["sub"][sub_me]["squeeze"]
                squeeze_df = self.draws[me_id].reset_index()
                squeeze_df["modelable_entity_id"] = me_id
                self._pusher.push(squeeze_df, append=False)

            if "excess" in self.me_map["sub"][sub_me].keys():
                me_id = self.me_map["sub"][sub_me]["excess"]
                excess_df = self.draws[me_id].reset_index()
                excess_df["modelable_entity_id"] = me_id
                self._pusher.push(excess_df, append=False)

    def adjust(self):
        """run exclusivity adjustment on all MEs"""
        self._import_draws()
        self.draws["sigma_sub"] = self._calc_sigma_sub()
        self.draws[self.me_map["resid"]] = self._resid()
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in self.me_map["sub"][sub_me].keys():
                self.draws[self.me_map["sub"][sub_me]["squeeze"]] = (
                    self._squeeze(sub_me))
            if "excess" in self.me_map["sub"][sub_me].keys():
                self.draws[self.me_map["sub"][sub_me]["excess"]] = (
                    self._excess(sub_me))
        self._export()

    def _q_adjust(self, inq, outq):
        for location_id in iter(inq.get, sentinel):
            try:
                self.dimensions.index_dim.replace_level(
                    "location_id", location_id)
                self.adjust()
                outq.put((False, location_id))
            except Exception as e:
                outq.put((ExceptionWrapper(e), location_id))

    def run_all_adjustments_mp(self, n_processes=23):
        inq = Queue()
        outq = Queue()

        # create and start the worker processes
        adjust_procs = []
        min_procs = min([
            n_processes,
            self.dimensions.index_dim.cardinality("location_id")
        ])
        for i in range(min_procs):
            p = Process(target=self._q_adjust, args=(inq, outq))
            adjust_procs.append(p)
            p.start()

        # queue up the locations to adjust
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            inq.put(location_id)

        # make the workers die after
        for _ in adjust_procs:
            inq.put(sentinel)

        # get results
        results = []
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            proc_result = outq.get()
            results.append(proc_result)

        # close up the queue
        for p in adjust_procs:
            p.join()

        for exc, location_id in results:
            if exc:
                exc.re_raise()
Example #9
    def __init__(
            self, process_name, output_dir, decomp_step, location_id=None,
            year_id=None, age_group_id=None, sex_id=None, measure_id=None,
            location_set_id=35, gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):

        # validate decomp_step
        validate_decomp_step("ExAdjust", decomp_step, gbd_round_id)

        this_file = os.path.realpath(__file__)
        this_dir = os.path.dirname(this_file)
        filepath = os.path.join(this_dir, "..", "maps", "final.json")
        with open(filepath, 'r') as f:
            emap = json.load(f)
        me_map = emap[process_name]["kwargs"]["me_map"]

        # set static values
        self.me_map = json_parser(json.dumps(me_map))
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        try:
            self.copy_env_inc = emap[
                process_name]["kwargs"].pop("copy_env_inc")
        except KeyError:
            self.copy_env_inc = False

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in dbtrees.loctree(
                    location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id).leaves()]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                             data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(self.me_map["sub"].keys()) + [self.me_map["env"]]:
            mvid, dstep = (
                get_best_model_version_and_decomp_step(output_dir, me_id)
            )
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                model_version_id=mvid,
                gbd_round_id=gbd_round_id,
                decomp_step=dstep
            )
            me_source.remove_transform(automagic_age_sex_agg)
            if n_draws < 1000:
                me_source.add_transform(group_and_downsample, n_draws)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example #10
class ExAdjust(object):

    def __init__(
            self, process_name, output_dir, decomp_step, location_id=None,
            year_id=None, age_group_id=None, sex_id=None, measure_id=None,
            location_set_id=35, gbd_round_id=gbd.GBD_ROUND_ID, n_draws=1000):

        # validate decomp_step
        validate_decomp_step("ExAdjust", decomp_step, gbd_round_id)

        this_file = os.path.realpath(__file__)
        this_dir = os.path.dirname(this_file)
        filepath = os.path.join(this_dir, "..", "maps", "final.json")
        with open(filepath, 'r') as f:
            emap = json.load(f)
        me_map = emap[process_name]["kwargs"]["me_map"]

        # set static values
        self.me_map = json_parser(json.dumps(me_map))
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id
        try:
            self.copy_env_inc = emap[
                process_name]["kwargs"].pop("copy_env_inc")
        except KeyError:
            self.copy_env_inc = False

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in dbtrees.loctree(
                    location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id).leaves()]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(index_dict,
                                                             data_dict)

        # draws that are imported or computed are stored here
        self.draws = {}

        # objects for reading data
        self._importers = {}
        for me_id in list(self.me_map["sub"].keys()) + [self.me_map["env"]]:
            mvid, dstep = (
                get_best_model_version_and_decomp_step(output_dir, me_id)
            )
            me_source = Epi.create_modelable_entity_draw_source(
                n_workers=1,
                modelable_entity_id=me_id,
                model_version_id=mvid,
                gbd_round_id=gbd_round_id,
                decomp_step=dstep
            )
            me_source.remove_transform(automagic_age_sex_agg)
            if n_draws < 1000:
                me_source.add_transform(group_and_downsample, n_draws)
            self._importers[me_id] = me_source

        # object for pushing results to disk
        self._pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)

    @property
    def filters(self):
        return {
            "location_id": self.dimensions.index_dim.get_level("location_id"),
            "age_group_id": self.dimensions.index_dim.get_level("age_group_id"
                                                                ),
            "year_id": self.dimensions.index_dim.get_level("year_id"),
            "sex_id": self.dimensions.index_dim.get_level("sex_id"),
            "measure_id": self.dimensions.index_dim.get_level("measure_id")
        }

    def _import_draws(self):
        gbdizer = gbdize.GBDizeDataFrame(self.dimensions)

        # import draws
        for me_id in self._importers.keys():
            draw_source = self._importers[me_id]
            draws = draw_source.content(filters=self.filters)
            draws = gbdizer.fill_empty_indices(draws, 0)
            self.draws[me_id] = draws.set_index(self.dimensions.index_names)

    def _calc_sigma_sub(self):
        """calculate the sum of the sub sequela"""
        # concatenate all required frames
        sub_dfs = []
        for me_id in self.me_map["sub"].keys():
            sub_dfs.append(self.draws[me_id])
        sub_df = pd.concat(sub_dfs)

        # return the sum
        sub_df.reset_index(inplace=True)
        if self.copy_env_inc:
            draw_cols = self.dimensions.data_dim.get_level("data")
            sub_df.loc[sub_df['measure_id'] == 6, draw_cols] = 0
        return sub_df.groupby(self.dimensions.index_names).sum()

    def _resid(self):
        """calculate the residual numbers"""
        # get the needed data
        sigma_sub_df = self.draws["sigma_sub"]
        env_df = self.draws[self.me_map["env"]]

        # residual is the envelope minus the subcause sum, kept only where the
        # subcause sum does not exceed the envelope (zero otherwise)
        resid_df = (env_df - sigma_sub_df)[(sigma_sub_df <= env_df)].fillna(0)
        return resid_df

    def _excess(self, sub_me):
        """calculate the excess proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # now calculate the excess values
        excess_df = (
            (sigma_sub_df[more] - env_df[more]) * sub_me_df[more] /
            sigma_sub_df[more]
        ).fillna(value=0)
        return excess_df

    def _squeeze(self, sub_me):
        """calculate the squeezed proportions"""
        # get the needed data
        sub_me_df = self.draws[sub_me]
        env_df = self.draws[self.me_map["env"]]
        sigma_sub_df = self.draws["sigma_sub"]

        # create a boolean dataframe for our 2 cases
        more = (sigma_sub_df > env_df)

        # squeeze proportionally where the subcause sum exceeds the envelope;
        # otherwise keep the original values
        squeeze_more = env_df[more] * sub_me_df[more] / sigma_sub_df[more]
        squeeze_less = sub_me_df[~more]
        squeeze_df = squeeze_more.fillna(squeeze_less)
        return squeeze_df

    def _export(self):
        """export all data"""
        # export residual
        me_id = self.me_map["resid"]
        resid_df = self.draws[me_id].reset_index()
        resid_df["modelable_entity_id"] = me_id
        self._pusher.push(resid_df, append=False)

        # export any subcause adjustments
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in list(self.me_map["sub"][sub_me].keys()):
                me_id = self.me_map["sub"][sub_me]["squeeze"]
                squeeze_df = self.draws[me_id].reset_index()
                squeeze_df["modelable_entity_id"] = me_id
                self._pusher.push(squeeze_df, append=False)

            if "excess" in list(self.me_map["sub"][sub_me].keys()):
                me_id = self.me_map["sub"][sub_me]["excess"]
                excess_df = self.draws[me_id].reset_index()
                excess_df["modelable_entity_id"] = me_id
                self._pusher.push(excess_df, append=False)

    def adjust(self):
        """run exclusivity adjustment on all MEs"""
        self._import_draws()
        self.draws["sigma_sub"] = self._calc_sigma_sub()
        self.draws[self.me_map["resid"]] = self._resid()
        for sub_me in self.me_map["sub"].keys():
            if "squeeze" in list(self.me_map["sub"][sub_me].keys()):
                self.draws[self.me_map["sub"][sub_me]["squeeze"]] = (
                    self._squeeze(sub_me))
            if "excess" in list(self.me_map["sub"][sub_me].keys()):
                self.draws[self.me_map["sub"][sub_me]["excess"]] = (
                    self._excess(sub_me))
        self._export()

    def _q_adjust(self, inq, outq):
        for location_id in iter(inq.get, sentinel):
            try:
                self.dimensions.index_dim.replace_level("location_id",
                                                        location_id)
                self.adjust()
                outq.put((False, location_id))
            except Exception as e:
                outq.put((ExceptionWrapper(e), location_id))

    def run_all_adjustments_mp(self, n_processes=23):
        inq = Queue()
        outq = Queue()

        # create and start the worker processes
        adjust_procs = []
        min_procs = min(
            [n_processes, self.dimensions.index_dim.cardinality("location_id")]
        )
        for i in range(min_procs):
            p = Process(target=self._q_adjust, args=(inq, outq))
            adjust_procs.append(p)
            p.start()

        # queue up the locations to adjust
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            inq.put(location_id)

        # make the workers die after
        for _ in adjust_procs:
            inq.put(sentinel)

        # get results
        results = []
        for location_id in self.dimensions.index_dim.get_level("location_id"):
            proc_result = outq.get()
            results.append(proc_result)

        # close up the queue
        for p in adjust_procs:
            p.join()

        for exc, location_id in results:
            if exc:
                exc.re_raise()
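ExAdjust is driven per process name; a hedged sketch follows, in which every argument value is a placeholder and process_name must be a key in maps/final.json:

# hypothetical driver; the values below are placeholders
adjuster = ExAdjust(process_name="some_exclusivity_process",
                    output_dir="/path/to/draws",
                    decomp_step="step4",
                    n_draws=1000)
adjuster.run_all_adjustments_mp(n_processes=10)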
Example #11
def run_squeeze(location_id, year_id, sex_id):

    ###################################
    # Prepare envelopes
    ###################################
    sequelae_map = pd.read_csv(SOURCE_TARGET_FILE)
    envelope_dict = create_env(location_id, year_id, sex_id)

    ###################################
    # Prepare unsqueezed prevalence
    ###################################
    # Load map of sequelae and their targets
    unsqueezed = get_unsqueezed(sequelae_map, location_id, year_id, sex_id)
    unsqueezed.loc[:, drawcols] = unsqueezed.loc[:, drawcols].clip(lower=0)

    ###################################
    # SQUEEZE
    ###################################
    # Parallelize the squeezing
    pool = Pool(20)
    ages = list(pd.unique(unsqueezed['age_group_id']))
    partial_squeeze = partial(squeeze_age_group,
                              unsqueezed=unsqueezed,
                              env_dict=envelope_dict)
    squeezed = pool.map(partial_squeeze, ages, chunksize=1)
    pool.close()
    pool.join()
    squeezed = pd.concat(squeezed)
    squeezed = squeezed.groupby(
        ['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id']).sum()
    squeezed = squeezed.reset_index()

    ##################################
    # Write to files
    ##################################
    write_squeezed(squeezed, location_id, year_id, sex_id, MAP_FILE)

    ##################################
    # Allocate residuals
    ##################################
    allocate_residuals(unsqueezed, squeezed, location_id, year_id, sex_id,
                       MAP_FILE)

    ###########################################
    # Determine the remainder of the envelopes
    ###########################################
    remains = calc_env_remainders(envelope_dict, squeezed)

    remain_map = {
        'id_bord': 2000,
        'id_mild': 1999,
        'id_mod': 2001,
        'id_sev': 2002,
        'id_prof': 2003
    }
    for key, meid in remain_map.items():
        print('Writing remainder %s to file' % meid)
        try:
            meid = int(meid)
        except Exception:
            pass
        df = remains[key]
        df['location_id'] = int(float(location_id))
        df['year_id'] = int(float(year_id))
        df['sex_id'] = int(float(sex_id))
        df['measure_id'] = 5
        df['age_group_id'] = df.age_group_id.astype(float).astype(int)
        df["modelable_entity_id"] = meid
        pusher = SuperPusher(
            spec={'file_pattern': ("{modelable_entity_id}/{location_id}/"
                                   "{measure_id}_{year_id}_{sex_id}.h5"),
                  'h5_tablename': 'draws'},
            directory=output_dir)
        pusher.push(
            df[['location_id', 'year_id', 'age_group_id', "sex_id",
                "modelable_entity_id", "measure_id"] + drawcols],
            append=False)
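run_squeeze itself is a per-demographic entry point; a sketch of a single call (the id values are placeholders):

# hypothetical call; one location/year/sex combination per invocation
run_squeeze(location_id=101, year_id=2017, sex_id=2)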
Example #12
def allocate_residuals(usqzd, sqzd, location_id, year_id, sex_id, map_file):
    tmap = pd.read_csv(map_file)

    resids = usqzd.merge(
        sqzd,
        on=['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id'],
        suffixes=('.usqzd', '.sqzd'))
    resids = resids[resids['resid_target_me.usqzd'].notnull()]

    dscols = ['draw_%s.sqzd' % d for d in range(1000)]
    ducols = ['draw_%s.usqzd' % d for d in range(1000)]
    toalloc = resids[ducols].values - resids[dscols].values
    toalloc = toalloc.clip(min=0)
    resids = resids.join(
        pd.DataFrame(data=toalloc, index=resids.index, columns=drawcols))
    resids = resids[[
        'location_id', 'year_id', 'age_group_id', 'sex_id',
        'resid_target_me.usqzd'
    ] + drawcols]
    resids.rename(columns={'resid_target_me.usqzd': 'resid_target_me'},
                  inplace=True)
    resids = resids.groupby(['resid_target_me', 'age_group_id']).sum()
    resids = resids.reset_index()
    resids = resids[['resid_target_me', 'age_group_id'] + drawcols]

    for me_id, resid_df in resids.groupby('resid_target_me'):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid.modelable_entity_id_target.squeeze()
        try:
            t_meid = int(t_meid)
        except Exception:
            pass
        present = True
        try:
            draw_source = Epi.create_modelable_entity_draw_source(
                n_workers=1, modelable_entity_id=me_id, gbd_round_id=5)
            draw_source.remove_transform(automagic_age_sex_agg)
            t_df = draw_source.content(
                filters={
                    "location_id": location_id,
                    "year_id": year_id,
                    "sex_id": sex_id,
                    "measure_id": 5
                })
        except NoBestVersionError:
            present = False
        if present:
            t_df = t_df.merge(resid_df,
                              on='age_group_id',
                              suffixes=('#base', '#resid'))
            newvals = (t_df.filter(like="#base").values +
                       t_df.filter(like="#resid").values)
            t_df = t_df.join(
                pd.DataFrame(data=newvals, index=t_df.index, columns=drawcols))

            print('Writing residual %s to file' % t_meid)
            t_df['location_id'] = int(float(location_id))
            t_df['year_id'] = int(float(year_id))
            t_df['sex_id'] = int(float(sex_id))
            t_df['measure_id'] = 5
            t_df['age_group_id'] = t_df.age_group_id.astype(float).astype(int)
            t_df["modelable_entity_id"] = t_meid
            t_df = t_df[[
                'location_id', 'year_id', 'age_group_id', "sex_id",
                "modelable_entity_id", "measure_id"
            ] + drawcols]
            pusher = SuperPusher(
                spec={'file_pattern': ("{modelable_entity_id}/{location_id}/"
                                       "{measure_id}_{year_id}_{sex_id}.h5"),
                      'h5_tablename': 'draws'},
                directory=output_dir)
            pusher.push(t_df, append=False)
        else:
            print('ME ID %s missing' % me_id)

    return resids
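allocate_residuals is invoked from run_squeeze in Example #11, roughly as below; unsqueezed and squeezed are the frames built earlier in run_squeeze, and the ids and MAP_FILE come from the caller:

# mirrors the call in Example #11
resids = allocate_residuals(unsqueezed, squeezed, location_id, year_id,
                            sex_id, MAP_FILE)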