def expand_location_arguments(args):
    """Expand the various location arguments into a set of location ids"""
    logger = logging.getLogger("dalynator")
    if args.location_id is not None:
        location_id_list = to_list(args.location_id)
        if args.subtree_id is not None:
            raise ValueError(
                "Cannot specify both a location_id and a subtree_id")
    else:
        if args.location_set_id:
            tree = loctree(None, args.location_set_id)
            location_id_list = [n.id for n in tree.nodes]
            logger.debug("  location_set_id original")
        elif args.location_set_version_id:
            tree = loctree(args.location_set_version_id)
            location_id_list = [n.id for n in tree.nodes]
            logger.debug("  location_set_version_id original")
        if args.subtree_id is not None:
            node_list = tree.get_node_by_id(args.subtree_id).all_descendants()
            location_id_list = [n.id for n in node_list]
            location_id_list.append(args.subtree_id)

    logger.debug("  location_id_list as list {}:{}".format(
        len(location_id_list), location_id_list))
    args.location_id_list = location_id_list

    return args
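
A minimal usage sketch (hypothetical argument values; `expand_location_arguments` and its `to_list`/`loctree` helpers are assumed to be importable from the surrounding dalynator module):

import argparse

# Hypothetical namespace mirroring the attributes the function reads
args = argparse.Namespace(
    location_id=None,
    location_set_id=35,             # expand an entire location set
    location_set_version_id=None,
    subtree_id=None)
args = expand_location_arguments(args)
print(len(args.location_id_list))   # number of expanded location ids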
Example #2
def main():
    popdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    popfile = 'single_age_u5_pops.dta'

    popdf = pd.read_stata(os.path.join(popdir, popfile))

    age_ranges = ['pys_{}'.format(x + 1) for x in range(4)]
    age_map = {val: idx + 1 for idx, val in enumerate(age_ranges)}

    popdf = melt_age_cols(popdf)
    popdf = popdf[popdf.age_group.isin(age_ranges)]

    popdf['age_group'] = popdf.age_group.map(age_map)
    popdf['sex_id'] = popdf.sex.map({'male': 1, 'female': 2})

    popdf = aggregate_ages(popdf)
    popdf.rename(columns={'year': 'year_id'}, inplace=True)

    locs = get_location_metadata(location_set_id=35, gbd_round_id=4)
    locs = locs[['location_id', 'ihme_loc_id']]

    popdf = pd.merge(popdf, locs, on='ihme_loc_id', how='left')
    popdf.drop(labels=['ihme_loc_id'], inplace=True, axis=1)
    popdf.rename(columns={'population': 'pop_scaled'}, inplace=True)

    lsvid = dbtrees.get_location_set_version_id(35, gbd_round=2016)
    lt = dbtrees.loctree(lsvid)

    index_cols = ['age_group_id', 'sex_id', 'year_id', 'location_id']
    data_cols = ['pop_scaled']

    aggpop = agg_hierarchy(lt, popdf, index_cols, data_cols, 'location_id')

    sdi_lsvid = dbtrees.get_location_set_version_id(40, gbd_round=2016)
    sdi_lts = dbtrees.loctree(sdi_lsvid, return_many=True)

    for tree in sdi_lts:
        sdi_agg_df = agg_hierarchy(tree, popdf, index_cols, data_cols,
                                   'location_id')
        sdi_agg_df = sdi_agg_df[sdi_agg_df.location_id.isin(
            [44634, 44635, 44636, 44637, 44639])]
        aggpop = aggpop.append(sdi_agg_df)

    outdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    outfile = os.path.join(outdir, 'age_pop.h5')

    aggpop.to_hdf(outfile,
                  'data',
                  mode='w',
                  format='table',
                  data_columns=index_cols)
    print('fin')
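
`melt_age_cols` and `aggregate_ages` are helpers defined elsewhere in this script. A minimal sketch of what the `melt_age_cols` reshape could look like, assuming the Stata file stores one `pys_*` column per single-year age (hypothetical column layout):

import pandas as pd

def melt_age_cols(df):
    # Sketch: reshape wide per-age population columns ('pys_1', 'pys_2', ...)
    # into long format with 'age_group' and 'population' columns
    id_cols = [c for c in df.columns if not c.startswith('pys_')]
    return df.melt(id_vars=id_cols, var_name='age_group',
                   value_name='population')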
Example #3
    def new_dimensions(
            self, location_id=[], year_id=[], sex_id=[], age_group_id=[],
            measure_id=[], n_draws=1000):
        if not location_id:
            lt = dbtrees.loctree(self.location_set_version_id)
            location_id = [loc.id for loc in lt.leaves()]
        if not year_id:
            year_id = [1990, 1995, 2000, 2005, 2010, 2016]
        if not sex_id:
            sex_id = [1, 2]
        if not age_group_id:
            age_group_id = [
                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 30, 31, 32, 235]
        if not measure_id:
            measure_id = [3, 5, 6]

        index_dict = {
            "measure_id": measure_id,
            "year_id": year_id,
            "location_id": location_id,
            "sex_id": sex_id,
            "age_group_id": age_group_id
        }
        data_dict = {
            "draws": ['draw_{}'.format(d) for d in range(n_draws)]
        }
        dimensions = DataFrameDimensions(index_dict, data_dict)
        self.dimensions = dimensions
Example #4
    def calculate_mad(self):
        """ Merge the ST results back onto the input dataset and calculate
        MAD estimates at every location level """
        flatmap = dbtrees.loctree(self.lvid).flatten()
        melted = self.long_result()
        merged = pd.merge(self.dataset,
                          melted,
                          on=[self.agevar, self.timevar, self.spacevar],
                          how='right')
        merged = pd.merge(merged,
                          flatmap,
                          left_on=self.spacevar,
                          right_on='leaf_node',
                          how='left')

        # Calculate residuals
        merged['st_resid'] = merged[self.datavar] - merged['st_prediction']

        # Calculate MAD estimates at various geographical levels
        for lvlcol in merged.filter(like='level').columns:
            if merged[lvlcol].notnull().any():
                mad_lvl = merged.groupby(lvlcol).agg({
                    'st_resid': mad
                }).reset_index().rename(
                    columns={'st_resid': 'mad_%s' % lvlcol})
                merged = pd.merge(merged, mad_lvl, on=lvlcol, how="left")
        return merged
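
The `.agg({'st_resid': mad})` call above relies on a `mad` callable defined elsewhere; a minimal sketch, assuming it is a plain median-absolute-deviation helper:

import numpy as np

def mad(series):
    # Median absolute deviation of the ST residuals within a group
    return np.median(np.abs(series - np.median(series)))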
Example #5
def mix_locations(args):
    lid, yid, sample_size = args
    lt = dbtrees.loctree(None, 35)
    lids = [l.id for l in lt.get_node_by_id(lid).children]
    pops = get_pop({
        'location_id': lids,
        'year_id': yid,
        'sex_id': 3,
        'age_group_id': 22
    })
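    # Split the requested sample size across the child locations in
    # proportion to their share of the combined population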
    pops['prop'] = pops.pop_scaled / pops.pop_scaled.sum()
    pops['nsamples'] = pops.prop.apply(lambda x: int(round(x * sample_size)))

    subsample = []
    for i, row in pops.iterrows():
        l = int(row['location_id'])
        nsims = row['nsamples']
        try:
            ss = pd.read_hdf('{pd}/{l}_{y}.h5'.format(pd=pooldir, l=l, y=yid))
            ss = ss.sample(nsims, replace=True)
            subsample.append(ss)
        except Exception:
            print('issue with %s' % l)
    subsample = pd.concat(subsample)
    subsample.reset_index(drop=True, inplace=True)
    subsample.to_hdf("{od}/{l}_{y}.h5".format(od=pooldir, l=lid, y=yid),
                     'sims',
                     mode='w')
Example #6
def calc_spatial_distances(location_set_version_id, o_locs):
    lt = dbtrees.loctree(location_set_version_id)
    o_locs = np.atleast_1d(o_locs).ravel()
    nlvls = lt.max_depth()

    leaf_lvls = []
    for lvl in reversed(range(nlvls + 1)):
        if len(set(lt.leaves()) & set(lt.level_n_descendants(lvl))) > 0:
            leaf_lvls.append(lvl)

    lflat = lt.flatten()
    o_locdf = pd.DataFrame({'leaf_node': o_locs})
    o_locdf = o_locdf.merge(lflat, on='leaf_node')
    d_df = []
    for lvl in leaf_lvls:
        leaf_df = lflat[lflat['level_%s' % lvl].notnull()]
        lflat = lflat[lflat['level_%s' % lvl].isnull()]
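        # Proximity score pieces: 1 point each for matching a target location
        # at this level, at the parent level, and at the grandparent level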
        d0_locs = (np.atleast_2d(
            leaf_df['level_%s' % lvl].values).T == np.atleast_2d(
                o_locdf['level_%s' % lvl].values)).astype(int)
        d1_locs = (np.atleast_2d(
            leaf_df['level_%s' % (lvl - 1)].values).T == np.atleast_2d(
                o_locdf['level_%s' % (lvl - 1)].values)).astype(int)
        d2_locs = (np.atleast_2d(
            leaf_df['level_%s' % (lvl - 2)].values).T == np.atleast_2d(
                o_locdf['level_%s' % (lvl - 2)].values)).astype(int)

        d_locs = d0_locs + d1_locs + d2_locs
        d_df.append(
            pd.DataFrame(d_locs.T,
                         columns=leaf_df['level_%s' % lvl].values,
                         index=o_locs))
    d_df = pd.concat(d_df, axis=1)
    d_df = d_df.fillna(0)
    return d_df
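
A usage sketch (the location set version id and origin locations are hypothetical): rows of the returned frame are the requested `o_locs`, columns are leaf locations, and values are 0-3 proximity scores.

d = calc_spatial_distances(location_set_version_id=319, o_locs=[101, 102])
print(d.loc[101].sort_values(ascending=False).head())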
Example #7
def expand_and_validate_location_lists(tool_name, location_set_ids,
                                       gbd_round_id):
    """
    Create lists of most-detailed and aggregate location ids by expanding the
    location_set_ids.


    Returns:
     most_detailed_locs - the leaf locations with not children;
     aggregate_locs - the internal locations with children

    """
    most_detailed_locs = set()
    aggregate_locs = set()

    for loc_set in location_set_ids:
        tree_list = hdb.loctree(location_set_id=loc_set,
                                gbd_round_id=gbd_round_id,
                                return_many=True)
        for lt in tree_list:
            most_detailed_locs.update(set([l.id for l in lt.leaves()]))
            aggregate_locs.update(
                set([l.id for l in lt.nodes
                     if l.id not in most_detailed_locs]))

    # Remove 44620 (non-standard Global location)
    aggregate_locs = set(aggregate_locs - {44620})

    return most_detailed_locs, aggregate_locs
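
A hedged usage sketch (the location set ids and round are illustrative):

most_detailed, aggregates = expand_and_validate_location_lists(
    tool_name='dalynator', location_set_ids=[35, 40], gbd_round_id=6)
print(len(most_detailed), len(aggregates))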
Example #8
def squeezer(q, log_dir):
    locations = dbtrees.loctree(None, 35)
    root_path = os.path.dirname(os.path.abspath(__file__))
    runfile = "%s/squeeze_em_all.py" % root_path

    for location_id in [l.id for l in locations.leaves()]:
        for year_id in [1990, 1995, 2000, 2005, 2010, 2016]:
            for sex_id in [1, 2]:
                params = [
                    '--location_id',
                    str(location_id), '--year_id',
                    str(year_id), '--sex_id',
                    str(sex_id)
                ]
                remote_job = job.Job(mon_dir=log_dir,
                                     runfile=runfile,
                                     name='squeeze_%s_%s_%s' %
                                     (location_id, year_id, sex_id),
                                     job_args=params)
                q.queue_job(remote_job,
                            slots=30,
                            memory=60,
                            project='proj_epic',
                            stderr='/FILEPATH',
                            stdout='/FILEPATH')

    q.block_till_done(poll_interval=60)
Example #9
def epilepsy_any(cv):
    standard_dws = pd.read_csv("filepath/02_standard/dw.csv")
    healthstates = cv.sequela_list[["modelable_entity_id", "healthstate_id"]]

    # Get country-year specific prevalences for back-calculation
    lt = dbtrees.loctree(None, 35)
    locations = [l.id for l in lt.leaves()]
    years = [1990, 1995, 2000, 2005, 2010, 2016]
    prop_dfs = []
    args = [(l, y) for l in locations for y in years]

    pool = Pool(20)
    prop_dfs = pool.map(get_props, args)
    pool.close()
    pool.join()
    prop_dfs = pd.concat(prop_dfs)
    prop_dfs = prop_dfs.merge(healthstates)
    renames = {'draw_%s' % i: 'dw_prop_%s' % i for i in range(1000)}
    prop_dfs.rename(columns=renames, inplace=True)

    # Combine DWs
    dws_to_weight = prop_dfs.merge(standard_dws,
                                   on='healthstate_id',
                                   how='left')
    dws_to_weight = dws_to_weight.join(
        pd.DataFrame(data=(dws_to_weight.filter(like='draw').values *
                           dws_to_weight.filter(like='dw_prop_').values),
                     index=dws_to_weight.index,
                     columns=drawcols))

    def combine_dws(df):
        draws_to_combine = df.filter(like='draw')
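        # Combine independent disability weights multiplicatively:
        # combined = 1 - prod(1 - dw_i) across the contributing sequelae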
        combined_draws = 1 - (1 - draws_to_combine).prod()
        return combined_draws

    combined_dws = dws_to_weight.groupby(
        ['location_id', 'year_id']).apply(combine_dws).reset_index()
    combined_dws['healthstate_id'] = 772

    col_order = ['location_id', 'year_id', 'healthstate_id'] + drawcols
    combined_dws = combined_dws[col_order]

    # fill in missing years
    dims = deepcopy(cv.dimensions)
    dims.index_dim.add_level('healthstate_id', 772)
    dims.index_dim.replace_level("year_id", list(range(1990, 2017)))
    for index_dim in dims.index_groups:
        if index_dim not in ['location_id', 'year_id', 'healthstate_id']:
            dims.index_dim.drop_level(index_dim)
    gbdizer = GBDizeDataFrame(dims)
    combined_dws = gbdizer.fill_year_by_interpolating(
        df=combined_dws,
        rank_df=combined_dws[combined_dws.year_id == 2005].reset_index())

    combined_dws.to_hdf("{}/info/epilepsy_any_dws.h5".format(cv.como_dir),
                        'draws',
                        mode='w',
                        format='table',
                        data_columns=['location_id', 'year_id'])
Example #10
    def run_task(self, location_set_version_id, component):
        source = self.get_source(component)
        sink = self.get_sink(component)
        dimensions = self.dimensions.get_dimension_by_component(
            component, self.measure_id)

        # get the tree we are aggregating
        loc_trees = dbtrees.loctree(
            location_set_version_id=location_set_version_id, return_many=True)
        for loc_tree in loc_trees:

            # get the weight vals
            pop = get_population(
                self.como_version,
                age_group_id=dimensions.index_dim.get_level("age_group_id"),
                location_id=[node.id for node in loc_tree.nodes],
                year_id=dimensions.index_dim.get_level("year_id"),
                sex_id=dimensions.index_dim.get_level("sex_id"))
            pop = pop[[
                "age_group_id", "location_id", "year_id", "sex_id",
                "population"
            ]]

            # set up our aggregation operator
            operator = WtdSum(index_cols=[
                col for col in dimensions.index_names if col != "location_id"
            ],
                              value_cols=dimensions.data_list(),
                              weight_df=pop,
                              weight_name="population",
                              merge_cols=[
                                  "location_id", "year_id", "age_group_id",
                                  "sex_id"
                              ])

            # run our aggregation
            aggregator = AggMemEff(draw_source=source,
                                   draw_sink=sink,
                                   index_cols=[
                                       col for col in dimensions.index_names
                                       if col != "location_id"
                                   ],
                                   aggregate_col="location_id",
                                   operator=operator,
                                   chunksize=self.chunksize[component])

            # run the tree
            aggregator.run(loc_tree,
                           draw_filters={
                               "measure_id": [self.measure_id],
                               "year_id":
                               dimensions.index_dim.get_level("year_id"),
                               "sex_id":
                               dimensions.index_dim.get_level("sex_id")
                           },
                           n_processes=self.chunksize[component])
Example #11
 def _cache_location_hierarchy(self, location_set_id: int) -> None:
     logger.debug("Starting to load location_hierarchy cache")
     location_hierarchy = dbtrees.loctree(location_set_id=location_set_id,
                                          gbd_round_id=self.gbd_round_id,
                                          return_many=True)
     cache_file = "FILEPATH".format(self.cache_dir,
                                    location_set_id)
     pickle.dump(location_hierarchy, open(cache_file, "wb"))
     logger.debug("Cached location_hierarchy {} in {}".format(
         cache_file, location_set_id))
Example #12
 def __init__(self, parent_meid, prop_drawfile=None):
     self.parent_meid = parent_meid
     self.lt = dbtrees.loctree(None, location_set_id=35)
     self.ags = self.age_groups()
     self.child_meids = self.get_children()
     if prop_drawfile is None:
         self.props = self.get_split_proportions()
         self.props = self.gen_proportion_draws()
     else:
         self.props = pd.read_csv(prop_drawfile)
Example #13
    def _add_pct_change_tasks(self, agg_loc_set_versions):
        all_locs = []
        for location_set_version_id in agg_loc_set_versions:
            loc_tree = loctree(location_set_version_id=location_set_version_id)
            all_locs.extend(loc_tree.node_ids)
        all_locs = list(set(all_locs))

        for measure_id in self.como_version.measure_id:
            for location_id in all_locs:
                pct_change_task = self._pct_change_task_fac.get_task(
                    measure_id=measure_id,
                    location_id=location_id)
                self.dag.add_task(pct_change_task)
Example #14
    def build_jobmon_workflow(self, identifier=None, extra_arguments=None):
        """
        Returns jobmon workflow that represents cascade job dag.

        Args:
            identifier (str): A unique string to identify this workflow
                for JobMon. Running twice with the same string will restart
                a workflow.
            extra_arguments (List[str]): Command-line arguments to add to
                every UGE Job specified in Jobmon.
        Returns:
            jobmon.Workflow: With all Jobmon tasks created.
        """
        extra_arguments = extra_arguments if extra_arguments else list()
        cv_iters = None if not self.run_cv else list(range(11))

        demo = Demographics(self.mvid)
        lsvid = self.mvm.location_set_version_id.values[0]
        lt = loctree(
            location_set_id=demo.LOCATION_SET_ID,
            location_set_version_id=lsvid,
            gbd_round_id=demo.gbd_round_id)

        desc = self.mvm.description.values[0]

        jobdag = make_dag(
            mvid=self.mvid, loctree=lt, cv_iter=cv_iters,
            add_arguments=extra_arguments
        )

        env = settings['env_variables']['ENVIRONMENT_NAME']
        identifier = identifier if identifier else f"dismod_{self.mvid}_{env}"
        wf = Workflow(
            workflow_args=identifier,
            name=f"dismod_{self.mvid}_{env}",
            resume=True,
            description=desc,
            project=self.project,
            stderr=self.logdir,
            stdout=self.logdir,
            seconds_until_timeout=1210000)

        # since we're looping through the dict and mutating each JobNode
        # to contain a reference to a PythonTask, we require the jobdag dict
        # to be sorted such that we've already visited all upstream tasks of
        # any given node.
        for jobname, dagnode in jobdag.items():
            dagnode.add_job(wf, jobdag, self.mvm)

        return wf
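
A hedged usage sketch (the driver object, model version, environment name, and extra argument are hypothetical; how the returned Workflow is launched depends on the installed Jobmon release):

wf = driver.build_jobmon_workflow(identifier='dismod_12345_dev',
                                  extra_arguments=['--quiet'])
# launch with the entry point your Jobmon version exposes (run/execute)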
Example #15
def launch_summaries(
        model_version_id,
        env='dev',
        years=[1990, 1995, 2000, 2005, 2010, 2015],
        file_pattern='all_draws.h5',
        h5_tablename='draws'):

    global pop, aw, ags
    pop = get_pop()
    aw = get_age_weights()
    ags = get_age_spans()
    drawdir = '/ihme/epi/panda_cascade/%s/%s/full/draws' % (
            env, model_version_id)
    outdir = '/ihme/epi/panda_cascade/%s/%s/full/summaries' % (
            env, model_version_id)
    try:
        os.makedirs(outdir)
        os.chmod(outdir, 0o775)
        os.chmod(os.path.join(outdir, '..'), 0o775)
        os.chmod(os.path.join(outdir, '..', '..'), 0o775)
    except OSError:
        pass
    lt = dbtrees.loctree(None, location_set_id=35)
    locs = [l.id for l in lt.nodes]
    sg = super_gopher.SuperGopher({
            'file_pattern': file_pattern,
            'h5_tablename': h5_tablename},
            drawdir)
    pool = Pool(10)
    res = pool.map(slw, [(
        (l, drawdir, sg, years), {}) for l in locs])
    pool.close()
    pool.join()
    res = [r for r in res if isinstance(r, tuple)]
    res = list(zip(*res))
    summ = pd.concat([r for r in res[0] if r is not None])
    summ = summ[[
        'location_id', 'year_id', 'age_group_id', 'sex_id',
        'measure_id', 'mean', 'lower', 'upper']]
    summfile = "%s/model_estimate_final.csv" % outdir
    summ.to_csv(summfile, index=False)
    os.chmod(summfile, 0o775)
    csumm = pd.concat(res[1])
    if len(csumm) > 0:
        csumm = csumm[[
            'location_id', 'year_start', 'year_end', 'age_group_id', 'sex_id',
            'measure_id', 'median', 'lower', 'upper']]
        csummfile = "%s/change_summaries.csv" % outdir
        csumm.to_csv(csummfile, index=False)
        os.chmod(csummfile, 0o775)
Example #16
def epilepsy_any(cvid):
    standard_dws = pd.read_csv(
        "/home/j/WORK/04_epi/03_outputs/01_code/02_dw/02_standard/dw.csv")
    eng = sqlalchemy.create_engine("strDir")
    healthstates = pd.read_sql(
        "SELECT modelable_entity_id, healthstate_id FROM epi.sequela", eng)

    # Get country-year specific prevalences for back-calculation
    lt = dbtrees.loctree(None, 35)
    locations = [l.id for l in lt.leaves()]
    years = range(1990, 2016, 5)
    prop_dfs = []
    args = [(l, y) for l in locations for y in years]

    pool = Pool(20)
    prop_dfs = pool.map(get_props, args)
    pool.close()
    pool.join()
    prop_dfs = pd.concat(prop_dfs)
    prop_dfs = prop_dfs.merge(healthstates)
    renames = {'draw_%s' % i: 'dw_prop_%s' % i for i in range(1000)}
    prop_dfs.rename(columns=renames, inplace=True)

    # Combine DWs
    dws_to_weight = prop_dfs.merge(standard_dws,
                                   on='healthstate_id',
                                   how='left')
    dws_to_weight = dws_to_weight.join(
        pd.DataFrame(data=(dws_to_weight.filter(like='draw').values *
                           dws_to_weight.filter(like='dw_prop_').values),
                     index=dws_to_weight.index,
                     columns=drawcols))

    def combine_dws(df):
        draws_to_combine = df.filter(like='draw')
        combined_draws = 1 - (1 - draws_to_combine).prod()
        return combined_draws

    combined_dws = dws_to_weight.groupby(
        ['location_id', 'year_id']).apply(combine_dws).reset_index()
    combined_dws['healthstate_id'] = 772

    col_order = ['location_id', 'year_id', 'healthstate_id'] + drawcols
    combined_dws = combined_dws[col_order]
    combined_dws.to_hdf("/ihme/centralcomp/como/%s/info/epilepsy_any_dws.h5" %
                        cvid,
                        'draws',
                        mode='w',
                        format='table',
                        data_columns=['location_id', 'year_id'])
Example #17
def to_como(como_dir, location_set_id, gbd_round_id):
    df = pd.read_csv("FILEPATH/urolith_symp_dws.csv")

    # fill for new locs
    lt = dbtrees.loctree(location_set_id=location_set_id,
                         gbd_round_id=gbd_round_id)
    locmap = lt.flatten()
    reg_avgs = df.merge(locmap[['leaf_node', 'level_2']],
                        left_on='location_id',
                        right_on='leaf_node')
    reg_avgs = reg_avgs[['level_2', 'year_id', 'healthstate_id'] +
                        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    df = df.append(reg_avgs)

    filllen = 0
    for ln in list(locmap.leaf_node.unique()):
        if ln not in list(df.location_id):
            for i in reversed(range(6)):
                fill_loc = locmap.loc[locmap.leaf_node == ln,
                                      'level_%s' % i].squeeze()
                filldf = df[df.location_id == fill_loc]
                if len(filldf) > 0:
                    filldf['location_id'] = ln
                    df = df.append(filldf)
                    filllen = filllen + 1
                    break
    df = df[df.location_id.isin([l.id for l in lt.leaves()])]

    # fill in missing years
    extra = df.query("year_id == 2013")
    extra['year_id'] = 2019
    df = df.append(extra)
    df = df.filter(regex='(.*_id|draw_)')
    interp = pchip_interpolate(df=df,
                               id_cols=['location_id', 'healthstate_id'],
                               value_cols=['draw_%s' % d for d in range(1000)],
                               time_col="year_id",
                               time_vals=list(range(1990, 2020)))
    df = df.append(interp)
    df = df[df.year_id.isin(list(range(1990, 2020)))]

    # save for como run
    df.to_hdf(f"{como_dir}/info/urolith_dws.h5",
              'draws',
              mode='w',
              format='table',
              data_columns=['location_id', 'year_id'])
Example #18
 def new_simulation_index(self, year_id):
     lt = dbtrees.loctree(
         location_set_version_id=self.location_set_version_id)
     location_id = [loc.id for loc in lt.leaves()]
     demo = get_demographics(gbd_team="epi", gbd_round_id=self.gbd_round_id)
     if not year_id:
         year_id = demo['year_id']
     simulation_index = {
         "year_id": year_id,
         "location_id": location_id,
         "sex_id": demo['sex_id'],
         "age_group_id": demo['age_group_id']
     }
     self.simulation_index = simulation_index
Example #19
    def get_task(self, component, year_id, sex_id, measure_id,
                 location_set_version_id):
        loc_trees = dbtrees.loctree(
            location_set_version_id=location_set_version_id, return_many=True)
        # put all aggregate locations in a mapping dict for summary dependency
        agg_locs = []
        for loc_tree in loc_trees:
            agg_locs.extend([
                node for node in loc_tree.node_ids
                if node not in [x.id for x in loc_tree.leaves()]
            ])
        for location_id in agg_locs:
            self.agg_loc_set_map[location_id] = location_set_version_id

        # hunt down upstream task list
        upstream_tasks = []
        for location_id in [node.id for node in loc_tree.leaves()]:
            if measure_id == measures.INCIDENCE:
                upstream_name = IncidenceTaskFactory.get_task_name(
                    location_id, sex_id)
            elif measure_id == measures.PREVALENCE and component in [
                    "sequela", "injuries", "impairment"
            ]:
                upstream_name = SimulationInputTaskFactory.get_task_name(
                    location_id, sex_id)
            else:
                upstream_name = SimulationTaskFactory.get_task_name(
                    location_id, sex_id, year_id)
            upstream_tasks.append(self.task_registry[upstream_name])

        name = self.get_task_name(component, year_id, sex_id, measure_id,
                                  location_set_version_id)
        task = PythonTask(script=THIS_FILE,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--component", component, "--year_id", year_id,
                              "--sex_id", sex_id, "--measure_id", measure_id,
                              "--location_set_version_id",
                              location_set_version_id
                          ],
                          name=name,
                          upstream_tasks=upstream_tasks,
                          num_cores=4,
                          m_mem_free="300G",
                          max_attempts=5,
                          max_runtime_seconds=(60 * 60 * 20),
                          tag="loc_agg")
        self.task_registry[name] = task
        return task
Example #20
def to_como(como_dir):
    df = pd.read_csv(
        "filepath/03_custom/"
        "urolith_symp_dws.csv")

    iso3map = ezfuncs.query("""
        SELECT location_id, ihme_loc_id as iso3
        FROM shared.location_hierarchy_history
        WHERE location_set_version_id = 4""", conn_def="shared")
    df = df.rename(columns={'year': 'year_id'})

    df = df.merge(iso3map, on='iso3')
    lt = dbtrees.loctree(None, 35)
    locmap = lt.flatten()

    reg_avgs = df.merge(
        locmap[['leaf_node', 'level_2']],
        left_on='location_id', right_on='leaf_node')
    reg_avgs = reg_avgs[
        ['level_2', 'year_id', 'healthstate_id'] +
        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    df = df.append(reg_avgs)

    filllen = 0
    for ln in list(locmap.leaf_node.unique()):
        if ln not in list(df.location_id):
            for i in reversed(range(6)):
                fill_loc = locmap.loc[
                    locmap.leaf_node == ln, 'level_%s' % i].squeeze()
                filldf = df[df.location_id == fill_loc]
                if len(filldf) > 0:
                    filldf['location_id'] = ln
                    df = df.append(filldf)
                    filllen = filllen + 1
                    break
    df = df[df.location_id.isin([l.id for l in lt.leaves()])]
    df['year_id'] = df.year_id.replace({2013: 2016})
    df.rename(columns={
        'draw%s' % d: 'draw_%s' % d for d in range(1000)}, inplace=True)
    df = df.filter(regex='(.*_id|draw_)')
    df.to_hdf(
        "{}/info/urolith_dws.h5".format(como_dir),
        'draws',
        mode='w',
        format='table',
        data_columns=['location_id', 'year_id'])
Example #21
def to_como(cvid):
    df = pd.read_csv("/home/j/WORK/04_epi/03_outputs/01_code/02_dw/03_custom/"
                     "urolith_symp_dws.csv")

    eng = sqlalchemy.create_engine("stDir")
    iso3map = pd.read_sql(
        """
        SELECT location_id, ihme_loc_id as iso3
        FROM shared.location_hierarchy_history
        WHERE location_set_version_id = 4""", eng)
    df = df.rename(columns={'year': 'year_id'})

    df = df.merge(iso3map, on='iso3')
    lt = dbtrees.loctree(None, 35)
    locmap = lt.flatten()

    reg_avgs = df.merge(locmap[['leaf_node', 'level_2']],
                        left_on='location_id',
                        right_on='leaf_node')
    reg_avgs = reg_avgs[['level_2', 'year_id', 'healthstate_id'] +
                        list(df.filter(like='draw').columns)]
    reg_avgs = reg_avgs.groupby(['level_2', 'year_id'])
    reg_avgs = reg_avgs.mean().reset_index()
    reg_avgs.rename(columns={'level_2': 'location_id'}, inplace=True)
    df = df.append(reg_avgs)

    filllen = 0
    for ln in list(locmap.leaf_node.unique()):
        if ln not in list(df.location_id):
            for i in reversed(range(6)):
                fill_loc = locmap.loc[locmap.leaf_node == ln,
                                      'level_%s' % i].squeeze()
                filldf = df[df.location_id == fill_loc]
                if len(filldf) > 0:
                    filldf['location_id'] = ln
                    df = df.append(filldf)
                    filllen = filllen + 1
                    break
    df = df[df.location_id.isin([l.id for l in lt.leaves()])]
    df['year_id'] = df.year_id.replace({2013: 2015})
    df.rename(columns={'draw%s' % d: 'draw_%s' % d
                       for d in range(1000)},
              inplace=True)
    df = df.filter(regex='(.*_id|draw_)')
    df.to_hdf("/ihme/centralcomp/como/%s/info/urolith_dws.h5" % cvid,
              'draws',
              mode='w',
              format='table',
              data_columns=['location_id', 'year_id'])
Example #22
    def get_task(self, component, year_id, sex_id, measure_id,
                 location_set_version_id, redis_host):
        loc_tree = dbtrees.loctree(
            location_set_version_id=location_set_version_id)
        # put all aggregate locations in a mapping dict for summary dependency
        agg_locs = [
            node for node in loc_tree.node_ids
            if node not in [x.id for x in loc_tree.leaves()]
        ]
        for location_id in agg_locs:
            self.agg_loc_set_map[location_id] = location_set_version_id

        # hunt down upstream task list
        upstream_tasks = []
        for location_id in [node.id for node in loc_tree.leaves()]:
            if measure_id == 6:
                upstream_name = IncidenceTaskFactory.get_task_name(
                    location_id, sex_id)
            elif measure_id == 5 and component in [
                    "sequela", "injuries", "impairment"
            ]:
                upstream_name = SimulationInputTaskFactory.get_task_name(
                    location_id, sex_id)
            else:
                upstream_name = SimulationTaskFactory.get_task_name(
                    location_id, sex_id, year_id)
            upstream_tasks.append(self.task_registry[upstream_name])

        name = self.get_task_name(component, year_id, sex_id, measure_id,
                                  location_set_version_id)
        task = PythonTask(script=this_file,
                          args=[
                              "--como_dir", self.como_version.como_dir,
                              "--component", component, "--year_id", year_id,
                              "--sex_id", sex_id, "--measure_id", measure_id,
                              "--location_set_version_id",
                              location_set_version_id, "--redis_host",
                              redis_host
                          ],
                          name=name,
                          upstream_tasks=upstream_tasks,
                          slots=25,
                          mem_free=300,
                          max_attempts=5,
                          max_runtime=(60 * 60 * 10),
                          tag="loc_agg")
        self.task_registry[name] = task
        return task
Example #23
 def new_simulation_index(self, year_id=[]):
     lt = dbtrees.loctree(
         location_set_version_id=self.location_set_version_id)
     location_id = [loc.id for loc in lt.leaves()]
     age_group_id = [
         2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 30, 31, 32, 235]
     if not year_id:
         year_id = [1990, 1995, 2000, 2005, 2010, 2017]
     simulation_index = {
         "year_id": year_id,
         "location_id": location_id,
         "sex_id": [1, 2],
         "age_group_id": age_group_id
     }
     self.simulation_index = simulation_index
Example #24
    def _add_summarization_tasks(self, agg_loc_set_versions):
        all_locs = []
        for location_set_version_id in agg_loc_set_versions:
            loc_tree = loctree(location_set_version_id=location_set_version_id)
            all_locs.extend(loc_tree.node_ids)
        all_locs = list(set(all_locs))

        parallelism = ["measure_id", "year_id"]
        d = self.como_version.nonfatal_dimensions.get_simulation_dimensions(
            self.como_version.measure_id)
        for slices in d.index_slices(parallelism):
            for location_id in all_locs:
                summ_task = self._summarize_task_fac.get_task(
                    measure_id=slices[0],
                    year_id=slices[1],
                    location_id=location_id)
                self.dag.add_task(summ_task)
Example #25
def launch_cod_splits(source_cause_id, target_cause_ids, target_meids,
                      output_dir):

    from multiprocessing import Pool

    cme_map = dict(zip(target_cause_ids, target_meids))
    lt = dbtrees.loctree(None, location_set_id=35)
    leaf_ids = [l.id for l in lt.leaves()]

    params = []
    for lid in leaf_ids:
        params.append((source_cause_id, cme_map, lid, output_dir))

    pool = Pool(30)
    res = pool.map(split_n_write, params)
    pool.close()
    return res
Example #26
def launch_squeeze(autism_value=0.29, idiopathic=95.0, autism_resid=0.05):
    locations = dbtrees.loctree(None, 35)

    runfile = "strCodeDir/squeeze_em_all.py"
    for location_id in [l.id for l in locations.leaves()]:
        for year_id in [1990, 1995, 2000, 2005, 2010, 2015]:
            for sex_id in [1, 2]:
                params = [
                    '--location_id', location_id, '--year_id', year_id,
                    '--sex_id', sex_id
                ]
                sge.qsub(runfile,
                         'sqz_%s_%s_%s' % (location_id, year_id, sex_id),
                         parameters=params,
                         slots=30,
                         memory=60,
                         project='proj_como',
                         conda_env="isqueeze")
Example #27
def run_como(
        como_dir=None,
        root_dir="FILEPATH",
        gbd_round_id=5,
        location_set_id=35,
        year_id=list(range(1990, 2018)),
        measure_id=[3, 5, 6],
        n_draws=1000,
        n_simulants=20000,
        components=["cause", "sequela", "injuries", "impairment"],
        change_years=[(1990, 2007), (2007, 2017), (1990, 2017)],
        agg_loc_sets=[35, 83],
        project="proj_como"):

    special_sets = set(agg_loc_sets) - set([location_set_id])
    all_sets = set(agg_loc_sets) | set([location_set_id])

    if como_dir is not None:
        cv = ComoVersion(como_dir)
        cv.load_cache()
    else:
        cv = ComoVersion.new(
            root_dir, gbd_round_id, location_set_id, year_id, measure_id,
            n_draws, components, change_years, special_sets)

    cwf = ComoWorkFlow(cv)
    cwf.add_tasks_to_dag(n_simulants=n_simulants, agg_loc_sets=all_sets)
    if cwf.run_workflow(project=project):
        all_locs = []
        for location_set_id in all_sets:
            loc_tree = loctree(location_set_id=location_set_id,
                               gbd_round_id=cv.gbd_round_id)
            all_locs.extend(loc_tree.node_ids)
        all_locs = list(set(all_locs))
        run_upload(cv, all_locs)
    else:
        raise RuntimeError("como unsuccessful")
Example #28
def _get_population(
    version: MachineParameters,
    location_set_id: int = constants.LocationSetId.OUTPUTS,
    agg_loc_sets: Optional[List[int]] = (
        constants.LocationAggregation.Ids.SPECIAL_LOCATIONS +
        [constants.LocationSetId.OUTPUTS])
) -> pd.DataFrame:
    """
    Unpacks arguments from version object to use with get_population
    function. Requests most detailed ages and most detailed sexes because
    age-sex population aggregates are created in the summarize module.
    Dependant on demographics team to upload population for majority of
    aggregate locations but currently uses AggSynchronous to create population
    information for select Norway locations in LocationSetId.OUTPUTS.

    Arguments:
        version (MachineParameters): object containing all the demographic
            and configuration data needed to query population
            estimates.
        location_set_id (int): The id for hierarchy to aggregate up
        agg_loc_sets (list): Additional location sets to create special
                aggregates

    Return:
        pd.DataFrame
    """
    pop = get_population(age_group_id=version.most_detailed_age_group_ids,
                         location_id=version.location_ids,
                         year_id=version.year_ids,
                         sex_id=version.sex_ids,
                         run_id=version.population_version_id,
                         decomp_step=version.decomp_step,
                         gbd_round_id=version.gbd_round_id)
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    index_cols = constants.Columns.DEMOGRAPHIC_INDEX
    data_cols = [constants.Columns.POPULATION]
    sink.push(pop[index_cols + data_cols])
    # Aggregate population up each requested location hierarchy
    if agg_loc_sets:
        assert len(agg_loc_sets) == len(set(agg_loc_sets))
        assert agg_loc_sets[-1] == constants.LocationSetId.OUTPUTS

        for set_id in agg_loc_sets:
            loc_tree = dbtrees.loctree(location_set_id=set_id,
                                       gbd_round_id=version.gbd_round_id)
            operator = Sum(index_cols=([
                col for col in index_cols
                if col != constants.Columns.LOCATION_ID
            ]),
                           value_cols=data_cols)
            aggregator = AggSynchronous(
                draw_source=source,
                draw_sink=sink,
                index_cols=([
                    col for col in index_cols
                    if col != constants.Columns.LOCATION_ID
                ]),
                aggregate_col=constants.Columns.LOCATION_ID,
                operator=operator)
            aggregator.run(loc_tree)
        special_locations = source.content()
    else:
        special_locations = pd.DataFrame()

    return pd.concat(
        [pop,
         special_locations.loc[
             ~special_locations.location_id.isin(pop.location_id.unique())]],
        ignore_index=True)
Example #29
    def __init__(self,
                 split_version_id,
                 output_dir,
                 decomp_step,
                 location_id=None,
                 year_id=None,
                 age_group_id=None,
                 sex_id=None,
                 measure_id=None,
                 location_set_id=35,
                 gbd_round_id=gbd.GBD_ROUND_ID,
                 n_draws=1000):

        # static ids
        self.split_version_id = split_version_id
        self.decomp_step = decomp_step
        self.location_set_id = location_set_id
        self.gbd_round_id = gbd_round_id

        # The read func is derived from static values; we call it to
        # initialize the internal caching
        self._read_func = split_prop_read_func()
        cached_props = self._read_func(params={}, filters=self.ss_filters)

        # dimensions are derived unless explicit
        if not location_id:
            location_id = [
                node.id for node in
                dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id).leaves()
            ]
        if not year_id:
            year_id = estimation_years_from_gbd_round_id(gbd_round_id)
        if not age_group_id:
            # this has the advantage of instantiating the lru cache in the main
            # process before multiprocessing
            age_group_id = get_age_group_set(12)["age_group_id"].tolist()
            if -1 in cached_props["age_start"].unique().tolist():
                age_group_id.append(164)
        if not sex_id:
            sex_id = [gbd.sex.MALE, gbd.sex.FEMALE]
        if not measure_id:
            measure_id = [gbd.measures.PREVALENCE, gbd.measures.INCIDENCE]

        index_dict = {
            "location_id": location_id,
            "year_id": year_id,
            "age_group_id": age_group_id,
            "sex_id": sex_id,
            "measure_id": measure_id
        }
        data_dict = {"data": ['draw_{}'.format(i) for i in range(n_draws)]}
        self.dimensions = dimensionality.DataFrameDimensions(
            index_dict, data_dict)

        sp_formula = SevPropFormula()
        sp_formula.build_custom_draw_source(params={},
                                            read_func=self._read_func)
        sp_formula.add_transforms()
        self._ss_draw_source = sp_formula.draw_source

        # epi draws source
        mvid, dstep = get_best_model_version_and_decomp_step(
            output_dir, int(self.parent_meid))
        src = Epi.create_modelable_entity_draw_source(
            n_workers=1,
            modelable_entity_id=int(self.parent_meid),
            model_version_id=mvid,
            gbd_round_id=gbd_round_id,
            decomp_step=dstep)
        src.remove_transform(automagic_age_sex_agg)

        if n_draws < 1000:
            src.add_transform(group_and_downsample, n_draws)

        self._epi_draw_source = src

        self.pusher = SuperPusher(
            spec={'file_pattern': "{modelable_entity_id}/{location_id}.h5",
                  'h5_tablename': 'draws'},
            directory=output_dir)
Example #30
def aggregate_locations(aggregation_type: str, parent_dir: str,
                        measure_id: int, gbd_round_id: int,
                        location_set_id: int, year_id: int) -> None:
    """
    Uses a AggMemEff aggregator to aggregate locations for deaths and
    YLLs.

    Arguments:
        aggregation_type (str): the type of data to be aggregated up a
            location hierarchy. One of 'aggregated/rescaled',
            'aggregated/shocks', 'aggregated/unscaled'
            'scaled', or 'unaggregated/shocks'.
        parent_dir (str): parent fauxcorrect directory
            e.g. PATH/{version}
        measure_id (int): measure ID for deaths or YLLs
        gbd_round_id (int): GBD round ID for this fauxcorrect run
        location_set_id (int): location set ID with which to aggregate
        year_id (int): draws year ID

    Raises:
        ValueError: if measure_id is not deaths (1) or YLLs (4)
    """
    # Set up DrawSource and DrawSink.
    source_dir, sink_dir = _get_draw_source_sink_dirs(parent_dir,
                                                      aggregation_type,
                                                      measure_id)
    source, draw_filters = _get_draw_source_and_filters(
        aggregation_type, source_dir, year_id, measure_id)

    sink = DrawSink({
        'draw_dir': sink_dir,
        'file_pattern': FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(
            year_id=year_id),
        'h5_tablename': Keys.DRAWS
    })
    sink.add_transform(_apply_regional_scalars, parent_dir, gbd_round_id,
                       location_set_id)
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)

    # clean up old files we plan on writing
    clean_aggregation_directory(root_dir=sink.params['draw_dir'],
                                file_pattern=sink.params['file_pattern'],
                                location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id)

    # Set up aggregator and location tree.
    index_cols = ([col for col in Columns.INDEX if col != Columns.LOCATION_ID])
    operator = Sum(index_cols, Columns.DRAWS)

    agg = AggMemEff(source,
                    sink,
                    index_cols,
                    Columns.LOCATION_ID,
                    operator,
                    chunksize=2)
    is_sdi_set = location_set_id == LocationSetId.SDI
    trees = loctree(location_set_id=location_set_id,
                    gbd_round_id=gbd_round_id,
                    return_many=is_sdi_set)

    logging.info(f"Aggregating locations, location_set_id: {location_set_id}")
    for tree in np.atleast_1d(trees):
        agg.run(tree, draw_filters=draw_filters, n_processes=10)
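
A hedged usage sketch for a single year of deaths (the parent directory is illustrative):

aggregate_locations(aggregation_type='aggregated/rescaled',
                    parent_dir='/PATH/{version}',
                    measure_id=1,        # deaths
                    gbd_round_id=6,
                    location_set_id=35,
                    year_id=2019)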