def get_subtype_draws(self): meid_measures = self.identify_subtypes() df = [] for mm in meid_measures: meid, msid = mm has_draws = False try: thisdf = gopher.draws( {'modelable_entity_ids': [meid]}, location_ids=self.location_id, year_ids=self.year_id, sex_ids=self.sex_id, age_group_ids=self.age_group_id, measure_ids=msid, source='dismod', gbd_round_id={GBD ROUND ID}) if len(thisdf) > 0: has_draws = True print 'Retrieved measure_id %s for meid %s' % (msid, meid) except: try: thisdf = gopher.draws( {'modelable_entity_ids': [meid]}, location_ids=self.location_id, year_ids=self.year_id, sex_ids=self.sex_id, age_group_ids=self.age_group_id, measure_ids=msid, source='dismod', status='latest') if len(thisdf) > 0: has_draws = True print 'Retrieved measure_id %s for meid %s' % ( msid, meid) except: pass if has_draws: df.append(thisdf) else: print 'meid %s draws not found. filling with zeros.' % meid dummy_draws = { 'modelable_entity_id': meid, 'model_version_id': 0, 'location_id': self.location_id, 'year_id': self.year_id, 'age_group_id': self.age_group_id, 'sex_id': self.sex_id} dummy_draws.update({'draw_%s' % d: 0 for d in range(1000)}) df.append(pd.DataFrame([dummy_draws])) df = pd.concat(df) reqd_cols = ['modelable_entity_id'] reqd_cols.extend(draw_cols) self.model_version_map = df[[ 'modelable_entity_id', 'model_version_id']] self.prevalence = df[reqd_cols].merge( self.subin, on='modelable_entity_id', how='left') return self.prevalence
def get_unsqueezed(sequelae_map, drawcols, location_id, year_id, sex_id):
    """Pull measure 5 'dismod' draws for every ME in ``sequelae_map``.

    Args:
        sequelae_map: DataFrame with at least ``me_id`` and ``acause``
            columns; one draws request is made per row.
        drawcols: list of draw column names to keep (and to zero-fill).
        location_id, year_id, sex_id: passed through to ``gopher.draws``.

    Returns:
        DataFrame of id columns plus ``drawcols``, merged with
        ``sequelae_map`` on ``me_id`` and restricted to age groups 2-20,
        30, 31, 32 and 235.  MEs whose retrieval failed are present with
        all draws set to zero.

    Raises:
        RuntimeError: if retrieval failed for every ME, so no frame exists
            to use as the zero-fill template.
    """
    # Get all causes with epilepsy, ID, and blindness.
    # One slot per sequela, in map order; None marks a failed retrieval.
    pulled = []
    for _, seqrow in sequelae_map.iterrows():
        me_id = int(seqrow['me_id'])
        a = seqrow['acause']
        try:
            gbd_ids = {'modelable_entity_ids': [me_id]}
            df = gopher.draws(gbd_ids, 'dismod', location_ids=location_id,
                              year_ids=year_id, sex_ids=sex_id,
                              measure_ids=5)
            df['me_id'] = me_id
        except Exception:
            print('Failed retrieving %s. Filling with zeros' % (a))
            df = None
        pulled.append((me_id, df))

    # The zero-fill template is the first frame that was actually
    # retrieved.  Choosing it after the loop means a failure on the very
    # first ME no longer dies with IndexError inside the handler.
    template = next((df for _, df in pulled if df is not None), None)
    if pulled and template is None:
        raise RuntimeError(
            'No draws could be retrieved for any me_id; nothing to use as '
            'a zero-fill template')

    unsqueezed = []
    for me_id, df in pulled:
        if df is None:
            df = template.copy()
            df['me_id'] = me_id
            df.loc[:, drawcols] = 0
        unsqueezed.append(df)

    unsqueezed = pd.concat(unsqueezed)
    unsqueezed = unsqueezed[
        ['me_id', 'location_id', 'year_id', 'age_group_id', 'sex_id'] +
        drawcols]
    unsqueezed = unsqueezed.merge(sequelae_map, on='me_id')
    # list(...) keeps the concatenation valid on Python 3 as well.
    age_range = list(range(2, 21)) + [30, 31, 32, 235]
    unsqueezed = unsqueezed[unsqueezed['age_group_id'].isin(age_range)]
    return unsqueezed
def import_cod_model_draws(model_version_id, location_id, cause_id, sex_id,
                           required_columns, filter_years=None):
    """Import model draws from CODEm/custom models.

    Pulls 'codem' draws for one cause, location and sex (the model version
    is passed as ``status``), keeps age groups 2-21, 30, 31, 32 and 235,
    and validates the result with ``check_data_format``.

    Args:
        model_version_id: model version to read.
        location_id: location to read; coerced to ``int``.
        cause_id: cause to read.
        sex_id: sex to read; coerced to ``int``.
        required_columns: columns the result must contain; also the
            columns returned.
        filter_years: year ids forwarded to ``draws`` (``None`` is
            forwarded unchanged).

    Returns:
        DataFrame restricted to ``required_columns``, or ``None`` when
        ``check_data_format`` returns a falsy value.

    Raises:
        SystemExit: with status 1 if reading the draws fails; the
            traceback is logged first.
    """
    logger = logging.getLogger('io.import_cod_model_draws')
    try:
        data = draws(gbd_ids={'cause_ids': [cause_id]}, source='codem',
                     location_ids=[int(location_id)],
                     sex_ids=[int(sex_id)], year_ids=filter_years,
                     status=model_version_id)
        data = data.loc[data.age_group_id.isin(
            list(range(2, 22)) + [30, 31, 32, 235])]
    except Exception:
        # filter_years defaults to None; joining it unguarded would raise
        # inside this handler and hide the real error.
        if filter_years is None:
            years_desc = 'None'
        else:
            years_desc = ','.join(str(y) for y in filter_years)
        logger.exception(
            "Failed to read" + '\n' +
            'Problem demographics were mvid {} cause {}, '
            'location {}, sex {}, and years {}'
            .format(model_version_id, cause_id, location_id, sex_id,
                    years_desc))
        # Non-zero status so the failure is visible to the calling process.
        sys.exit(1)
    r = check_data_format(data, required_columns)
    if not r:
        print('%s %s' % (model_version_id, r))
        return None
    data = data.loc[:, required_columns]
    return data
def process_location_daly_draws(location_id, test=False):
    """Pull 'dalynator' draws for one location, limiting ages by cause.

    Two pulls are made with ``gopher.draws`` (both sexes combined,
    ``sex_ids=[3]``): ``dw.DALY_ALL_AGE_CAUSE_IDS`` for age groups 2-21 and
    ``dw.DALY_THIRTY_SEVENTY_CAUSE_IDS`` for age groups 11-18.  Rows are
    kept for years >= 1990, or >= 1985 for causes in
    ``dw.PRE_1990_CAUSES``.  The result is sanity-checked,
    age-standardized and written with ``write_output``.

    Args:
        location_id: the location to process.
        test: when true only year 2015 is requested and the year-coverage
            assertions are skipped.

    Returns:
        The age-standardized DataFrame that was written.
    """
    dfs = []
    # list(...) so real lists (not lazy range objects) reach gopher on
    # Python 3.
    cause_age_sets = [[dw.DALY_ALL_AGE_CAUSE_IDS, list(range(2, 22))],
                      [dw.DALY_THIRTY_SEVENTY_CAUSE_IDS,
                       list(range(11, 19))]]
    years = [2015] if test else []
    for causes, ages in cause_age_sets:
        gbd_ids = {'cause_ids': causes}
        df = gopher.draws(gbd_ids, 'dalynator',
                          location_ids=[location_id], year_ids=years,
                          age_group_ids=ages, sex_ids=[3], verbose=True,
                          num_workers=5, version=113)
        # without this here, it can give a too many inputs error
        df = df.query('metric_id == 1 & measure_id == 1')
        dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)

    pre_1990 = df['cause_id'].isin(dw.PRE_1990_CAUSES)
    df = df.loc[(df['year_id'] >= 1990) |
                (pre_1990 & (df['year_id'] >= 1985))]

    # make sure it looks like we expect
    assert set(df.age_group_id) == set(range(2, 22)), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([3]), \
        'unexpected sex ids found'
    if not test:
        is_pre_1990 = df['cause_id'].isin(dw.PRE_1990_CAUSES)
        assert set(df.loc[is_pre_1990].year_id) == \
            set(range(1985, 2016, 1)), \
            'unexpected year ids found'
        assert set(df.loc[~is_pre_1990].year_id) == \
            set(range(1990, 2016, 1)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'

    # age standardize
    df = age_standardize(df, 'dalynator')
    # write the output
    write_output(df, 'dalynator', location_id)
    return df
def pull_codcorrect_draws(self):
    """Return measure 1 'codcorrect' draws for this instance's demographics.

    The pull uses ``self.cause_id``, ``self.year_id`` and ``self.sex_id``.
    Rows are then limited to the age groups in ``self.age_group_ids`` plus
    the keys of ``self.aggregated_age_group_ids``.

    Returns:
        DataFrame with ``self.index_cols``, ``cause_id`` and
        ``self.draw_cols``, in that order.
    """
    codcorrect_df = draws(gbd_ids={'cause_ids': self.cause_id},
                          year_ids=self.year_id, source='codcorrect',
                          sex_ids=self.sex_id, measure_ids=[1])
    # list(...) because list + dict.keys() is a TypeError on Python 3.
    wanted_ages = (self.age_group_ids +
                   list(self.aggregated_age_group_ids.keys()))
    codcorrect_df = codcorrect_df.loc[
        codcorrect_df.age_group_id.isin(wanted_ages)]
    return codcorrect_df[self.index_cols + ['cause_id'] + self.draw_cols]
def get_draws(self, measure_id=6):
    '''Pull 'epi' draws for ``self.input_me`` and attach location metadata.

    Draws are requested for ``self.year_id``, sex 2 and age groups 7-15,
    with an empty ``location_ids`` list.  The result is inner-merged with
    ``self.get_locations(35)`` on ``location_id`` and the
    ``most_detailed`` column is dropped before returning.
    '''
    me_draws = gopher.draws(
        gbd_ids={'modelable_entity_ids': [self.input_me]},
        source='epi',
        measure_ids=[measure_id],
        location_ids=[],
        year_ids=[self.year_id],
        age_group_ids=list(range(7, 16)),
        sex_ids=[2])
    locations = self.get_locations(35)
    merged = me_draws.merge(locations, on='location_id', how='inner')
    return merged.drop('most_detailed', axis=1)
def grab_prevalence_draws(me_id, year, locations):
    """Return 'best'-status draws from the 'epi' source for one ME.

    Args:
        me_id: modelable entity id to pull.
        year: forwarded unchanged as ``year_ids``.
        locations: forwarded unchanged as ``location_ids``.

    Returns:
        Whatever ``draws`` returns for the request.
    """
    # grabs prevalence draws for the given year, me_id, and locations
    # NOTE(review): the four {...} values below are unfilled template
    # placeholders, not Python literals; they must be replaced with real
    # ids before this function can be imported or run.
    gbd_round = {GBD ROUND ID}
    measure = {MEASURE ID}
    sexes = [{SEX ID}]
    ages = [{AGE GROUP IDS}]
    df = draws(source='epi', gbd_ids={"modelable_entity_ids": [me_id]},
               location_ids=locations, year_ids=year, age_group_ids=ages,
               sex_ids=sexes, status='best', measure_ids=[measure],
               gbd_round_id=gbd_round)
    return df
def create_env(location_id, year, sex):
    """Pull the measure 5 'dismod' envelope draws for each envelope ME.

    Args:
        location_id, year, sex: forwarded to ``gopher.draws`` as
            ``location_ids``, ``year_ids`` and ``sex_ids``.

    Returns:
        dict mapping envelope label ('epi', 'blind', 'id_bord', ...) to a
        copy of the DataFrame returned for that label's ME id.
    """
    env_ids = {
        'epi': 2403,
        'blind': 9805,
        'id_bord': 9423,
        'id_mild': 9424,
        'id_mod': 9425,
        'id_sev': 9426,
        'id_prof': 9427}
    envelope_dict = {}
    # .items() works on Python 2 and 3; the loop variable no longer
    # shadows the builtin ``id``.
    for envlab, meid in env_ids.items():
        env = gopher.draws(
            {'modelable_entity_ids': [meid]},
            'dismod',
            location_ids=location_id,
            year_ids=year,
            sex_ids=sex,
            measure_ids=5)
        envelope_dict[envlab] = env.copy()
    return envelope_dict
def interp_loc(modelable_entity_id, measure_id, location_id, outpath):
    """Build an annual 1980-2015 draw series for one ME/location as CSV.

    Steps:
        1. Pull 'dismod' draws for every fifth year from 1990 to 2015.
        2. Interpolate each five-year interval with ``maths.interpolate``,
           passing the 2005 draws as ``rank_df``.
        3. Fill 1980-1989 with copies of the 1990 rows.
        4. Write the combined frame to ``outpath`` with ``to_csv``.

    Args:
        modelable_entity_id: ME whose draws are pulled.
        measure_id: measure to pull.
        location_id: location to pull.
        outpath: destination CSV path.

    Raises:
        AssertionError: if any requested year returns no rows.
    """
    start_year = 1980
    epi_start_year = 1990
    end_year = 2015
    rank_year = 2005

    # Retrieve epi draws and interpolate
    epi_draws = []
    for y in range(epi_start_year, end_year + 1, 5):
        d = gopher.draws({'modelable_entity_ids': [modelable_entity_id]},
                         year_ids=[y],
                         location_ids=[location_id],
                         measure_ids=[measure_id],
                         verbose=False,
                         source="dismod",
                         age_group_ids=list(range(2, 22)))
        assert len(d) > 0, (
            "Uh oh, couldn't find epi draws. Make sure you have "
            "proportion estimates for the supplied meids")
        epi_draws.append(d)
    epi_draws = pd.concat(epi_draws)

    ip_epi_draws = []
    for sy in range(epi_start_year, end_year, 5):
        ey = sy + 5
        ip_draws = maths.interpolate(
            epi_draws.query('year_id==%s' % sy),
            epi_draws.query('year_id==%s' % ey),
            ['age_group_id', 'model_version_id', 'sex_id'],
            'year_id',
            ['draw_%s' % i for i in range(1000)],
            sy,
            ey,
            rank_df=epi_draws.query('year_id==%s' % rank_year))
        # Each interval includes both endpoints; drop the end year (except
        # for the final interval) so it is not duplicated by the next one.
        if ey != end_year:
            ip_draws = ip_draws[ip_draws.year_id != ey]
        ip_epi_draws.append(ip_draws)
    ip_epi_draws = pd.concat(ip_epi_draws)

    # The first-epi-year rows do not change across the loop, so select
    # them once.  Each back-filled year gets its own copy rather than
    # assigning into a query() result.
    first_year_rows = ip_epi_draws.query('year_id==%s' % epi_start_year)
    extrap_draws = []
    for y in range(start_year, epi_start_year):
        esy_draws = first_year_rows.copy()
        esy_draws['year_id'] = y
        extrap_draws.append(esy_draws)

    epi_draws = pd.concat([ip_epi_draws] + extrap_draws)
    epi_draws.to_csv(outpath)
def get_props(args):
    """Return mean proportions for MEs 1951-1953 at one location/year.

    Args:
        args: ``(location, year)`` pair, packed into one argument.

    Returns:
        DataFrame with ``location_id``, ``year_id``,
        ``modelable_entity_id`` and the draw columns: the output of
        ``maths.scale`` averaged within each location/year/ME group.
    """
    location, year = args
    # Single-argument print prints the same text on Python 2 and 3.
    print('%s %s' % (location, year))
    prev_dfs = gopher.draws({'modelable_entity_ids': [1951, 1952, 1953]},
                            source='dismod',
                            location_ids=location,
                            year_ids=year)
    prev_dfs['location_id'] = prev_dfs.location_id.astype(int)
    prev_dfs['year_id'] = prev_dfs.year_id.astype(int)

    # Extract proportions
    # drawcols is not defined here; it is expected to be a module-level
    # list of draw column names.
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    props = maths.scale(prev_dfs, drawcols, index_cols, scalar=1)
    props = props.groupby(['location_id', 'year_id', 'modelable_entity_id'])
    props = props.mean().reset_index()
    props = props[['location_id', 'year_id', 'modelable_entity_id'] +
                  drawcols]
    return props
def collect_risk_attrib_burden(rei_ids, measure_id, locs=None):
    '''
    Pull risk-attributable 'dalynator' draws and save one HDF file per rei.

    Draws are requested for ``rei_ids`` with cause 294, age group 27,
    sex 3, metric 1 and years 1990-2015 in five-year steps, then written
    to ``dw.RISK_BURDEN_OUTDIR`` as ``<rei_id>.h5`` without further
    processing.

    Args:
        rei_ids: rei ids to pull.
        measure_id: measure to pull.
        locs: location ids; when falsy, level 3 locations are read from
            the database.

    Returns:
        The full DataFrame that was pulled (all rei_ids together).

    Note: run this with a big qlogin because I use extra cores to read more
    files in parallel
    '''
    # note -- untested since I don't have permission to create new
    # directories
    if not locs:
        # Only 188 countries
        query = "select location_id from locations where level = 3"
        engine = sql.create_engine('strConnection')
        locs = set(pd.read_sql_query(query, engine).location_id.values)

    df = gopher.draws(gbd_ids={
        "rei_ids": rei_ids,
        "cause_ids": [294]
    },
        source='dalynator',
        version=dw.RISK_BURDEN_DALY_VERS,
        location_ids=locs,
        age_group_ids=[27],
        sex_ids=[3],
        year_ids=[1990, 1995, 2000, 2005, 2010, 2015],
        measure_ids=[measure_id],
        metric_ids=[1],
        verbose=True,
        num_workers=10)

    out_dir = dw.RISK_BURDEN_OUTDIR
    # everything is already formatted perfectly so it can just be saved.
    # makedirs (not mkdir) so missing parent directories are created too.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for rei_id in df.rei_id.unique():
        print(rei_id)
        odf = df.query("rei_id == @rei_id")
        odf.to_hdf(os.path.join(out_dir, "{}.h5".format(int(rei_id))),
                   key="data",
                   format="table",
                   data_columns=["location_id", "year_id"])
    return df
def import_square(self, gopher_what, source, filler=None, **kwargs):
    """Get draws for the requested entity, squared to ``self.index_df``.

    Args:
        gopher_what: gbd id dict passed as the first argument to
            ``gopher.draws``.
        source: draws source name.
        filler: value used for cells missing after alignment with
            ``self.index_df``; ``None`` means 0.
        **kwargs: demographic filters forwarded to ``gopher.draws``.  When
            none are given, ``self.idx_dmnsns`` is used with each key
            pluralised (``location_id`` -> ``location_ids``).

    Returns:
        DataFrame of ``self.index_df`` concatenated column-wise with
        ``self.draw_cols``, with missing values replaced by ``filler``.
    """
    if not kwargs:
        # replace keys with their plural form for gopher.draws.
        # Building a new dict avoids mutating a dict while iterating its
        # keys, which is a RuntimeError on Python 3.
        # NOTE(review): pluralisation is applied only to the default
        # dimensions, not to caller-supplied kwargs -- confirm this
        # matches the original indentation.
        kwargs = {k + "s": v for k, v in self.idx_dmnsns.items()}

    if filler is None:
        filler = 0

    df = gopher.draws(gopher_what, source=source, verbose=False, **kwargs)
    # Materialise the keys once; set_index needs a real list on Python 3.
    idx_names = list(self.idx_dmnsns.keys())
    for c in idx_names:
        df[c] = pd.to_numeric(df[c])
    df = df.set_index(idx_names)
    df = df[self.draw_cols]
    df = pd.concat([self.index_df, df], axis=1)
    df.fillna(value=filler, inplace=True)
    return df
def split_location(location_id, gbd_round):
    """Split the parent ME's draws for one location by ``ss`` proportions.

    Measure 5 and 6 'dismod' draws for ``ss.parent_meid`` are pulled for
    the round ``year_map[gbd_round]``.  The proportions from
    ``ss.gbdize_proportions`` are limited to the measures, age groups and
    sexes present in those draws, and ``maths.merge_split`` applies them.

    Returns:
        The split frame with ``modelable_entity_id`` set from
        ``child_meid``.
    """
    parent_draws = gopher.draws(
        {'modelable_entity_ids': [ss.parent_meid]},
        source='dismod',
        location_ids=location_id,
        measure_ids=[5, 6],
        gbd_round_id=year_map[gbd_round])
    parent_draws['measure_id'] = parent_draws.measure_id.astype(int)

    # Keep only proportions whose demographics exist in the parent draws.
    props = ss.gbdize_proportions(location_id)
    for col in ('measure_id', 'age_group_id', 'sex_id'):
        props = props[props[col].isin(parent_draws[col].unique())]

    value_cols = ['draw_%s' % i for i in range(1000)]
    group_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id']
    splits = maths.merge_split(parent_draws, props,
                               group_cols=group_cols,
                               value_cols=value_cols)
    return splits.assign(modelable_entity_id=splits['child_meid'])
def get_unsqueezed(sequelae_map, drawcols, location_id, year, sex):
    """Pull measure 5 'dismod' draws for every ME in ``sequelae_map``.

    Args:
        sequelae_map: DataFrame with at least ``me_id``, ``acause``,
            ``grouping`` and ``healthstate`` columns; one draws request is
            made per row.
        drawcols: list of draw column names to keep (and to zero-fill).
        location_id, year, sex: forwarded to ``gopher.draws`` as
            ``location_ids``, ``year_ids`` and ``sex_ids``.

    Returns:
        DataFrame of id columns plus ``drawcols``, merged with
        ``sequelae_map`` on ``me_id`` and restricted to age groups 2-21.
        MEs whose retrieval failed are present with all draws set to zero.

    Raises:
        RuntimeError: if retrieval failed for every ME, so no frame exists
            to use as the zero-fill template.
    """
    # Get all causes with epilepsy, ID, and blindness.
    # One slot per sequela, in map order; None marks a failed retrieval.
    pulled = []
    for _, seqrow in sequelae_map.iterrows():
        me_id = int(seqrow['me_id'])
        a = seqrow['acause']
        g = seqrow['grouping']
        h = seqrow['healthstate']
        try:
            gbd_ids = {'modelable_entity_ids': [me_id]}
            df = gopher.draws(
                gbd_ids,
                'dismod',
                location_ids=location_id,
                year_ids=year,
                sex_ids=sex,
                measure_ids=5)
            df['me_id'] = me_id
        except Exception:
            print('Failed retrieving %s %s %s' % (a, g, h))
            df = None
        pulled.append((me_id, df))

    # The zero-fill template is the first frame that was actually
    # retrieved.  Choosing it after the loop means a failure on the very
    # first ME no longer dies with IndexError inside the handler.
    template = next((df for _, df in pulled if df is not None), None)
    if pulled and template is None:
        raise RuntimeError(
            'No draws could be retrieved for any me_id; nothing to use as '
            'a zero-fill template')

    unsqueezed = []
    for me_id, df in pulled:
        if df is None:
            df = template.copy()
            df['me_id'] = me_id
            df.loc[:, drawcols] = 0
        unsqueezed.append(df)

    unsqueezed = pd.concat(unsqueezed)
    unsqueezed = unsqueezed[[
        'me_id', 'location_id', 'year_id', 'age_group_id', 'sex_id'] +
        drawcols]
    unsqueezed = unsqueezed.merge(sequelae_map, on='me_id')
    unsqueezed = unsqueezed[unsqueezed.age_group_id < 22]
    unsqueezed = unsqueezed[unsqueezed.age_group_id > 1]
    return unsqueezed
def test_diff_input(self, gbd_id_dict, measure_ids, location_ids, start_year,
                    end_year, sex_ids, status, source, version, metric_ids,
                    age_group_ids, change_type):
    '''Pull draws for the given inputs and run ``pct_change`` on them.

    The draws for ``start_year`` and ``end_year`` are tagged with
    ``define_metric``; metric 1 rows are converted to metric 3.  For
    ``'pct_change_num'`` only the metric 3 rows are kept and converted
    back to metric 1.  Both ``'pct_change_rate'`` and ``'pct_change_num'``
    are passed to ``pct_change`` as ``'pct_change'``.

    Returns the frame produced by ``pct_change``; no validation of that
    result is performed here yet.
    '''
    # Get draws
    raw = draws(
        gbd_id_dict,
        measure_ids=measure_ids,
        location_ids=location_ids,
        year_ids=[start_year, end_year],
        age_group_ids=age_group_ids,
        sex_ids=sex_ids,
        status=status,
        source=source,
        include_risks=True,
        version=version)
    df = define_metric(raw.reset_index(drop=True), source)

    # Bring everything into rate space (metric 3).
    in_number_space = df.metric_id == 1
    if in_number_space.any():
        df.loc[in_number_space] = transform_metric(
            df.loc[in_number_space], to_id=3, from_id=1)

    # For a change in numbers, drop anything that is not a rate and convert
    # the rates back to numbers.
    if change_type == 'pct_change_num':
        rates_only = df[df.metric_id == 3]
        df = transform_metric(rates_only, to_id=1, from_id=3)

    if change_type in ('pct_change_rate', 'pct_change_num'):
        change_type = 'pct_change'

    # validate change_df result here...
    return pct_change(df, start_year, end_year, change_type)
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
# NOTE(review): 'FILEPATH.log' is a redacted path with no format
# specifiers, so this %-format raises TypeError at runtime; restore the
# real '%s...%s' pattern before running.
rlog.open('FILEPATH.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
codcorrect_df = draws(gbd_ids={'cause_ids': [env_id, late_id]},
                      source='codcorrect', year_ids=[year], sex_ids=[2],
                      measure_ids=[1])
codcorrect_df['measure_id'] = 1
codcorrect_df = codcorrect_df[
    codcorrect_df.age_group_id.isin(list(range(7, 16)))]
envelope_df = codcorrect_df[codcorrect_df.cause_id == env_id]
late_df = codcorrect_df[codcorrect_df.cause_id == late_id]

# we only want index_cols and draws as columns.
# sort_index() replaces the removed DataFrame.sort(); both frames end up
# ordered by the same index.
envelope_df = envelope_df[columns].set_index(index_cols).sort_index()
late_df = late_df[columns].set_index(index_cols).sort_index()

# calculate late cause fractions
rlog.log('Calculating late cfs for year %s' % year)
late_cfs = late_df / envelope_df
def _pull_codem_draws(gbd_ids, model_version_id, years, ages, locations):
    """Pull measure 1 'codem' draws for one model version, minus metadata."""
    df = draws(gbd_ids, source='codem', measure_ids=[1], year_ids=years,
               age_group_ids=ages, location_ids=locations,
               status=model_version_id)
    df.drop('model_version_id', axis=1, inplace=True)
    # These columns are optional.  A missing label makes drop() raise
    # KeyError (ValueError on older pandas); in that case nothing is
    # dropped, exactly as before.
    for optional in ('measure_id', ['envelope', 'pop']):
        try:
            df.drop(optional, axis=1, inplace=True)
        except (KeyError, ValueError):
            pass
    return df


def get_models(cause_set, ages, years, locations):
    """Pull in active oldCorrect models for a cause set, ages and years.

    ``get_cause_ids(cause_set)`` supplies the source and target cause ids.
    For each source cause, model versions of type 8 (hybrid scale input)
    and type 4 (custom) are looked up with ``pull_mvid``; for each target
    cause, model versions of type 3 (hybrid).  Every model version is
    pulled from the 'codem' source.

    Args:
        cause_set: passed to ``get_cause_ids``.
        ages, years, locations: demographic filters for every pull.

    Returns:
        Tuple ``(source_codem, source_dismod, target_df)`` of concatenated
        DataFrames.

    Raises:
        AssertionError: if any pulled frame contains a null value.
    """
    sources, targets = get_cause_ids(cause_set)
    hybrid_scale_input_mvt = 8
    custom_mvt = 4
    hybrid_mvt = 3

    source_model_codem_dfs = []
    source_model_dismod_dfs = []
    for cause_id in sources:
        source_models_codem = pull_mvid(cause_id, hybrid_scale_input_mvt)
        source_models_dismod = pull_mvid(cause_id, custom_mvt)
        source_ids = {'cause_ids': [cause_id]}
        # Pull in codem models
        for model_version in source_models_codem:
            source_model_codem_dfs.append(_pull_codem_draws(
                source_ids, model_version, years, ages, locations))
        # Pull in custom/dismod models (also read from the 'codem' source)
        for model_version in source_models_dismod:
            source_model_dismod_dfs.append(_pull_codem_draws(
                source_ids, model_version, years, ages, locations))
    source_codem = pd.concat(source_model_codem_dfs)
    source_dismod = pd.concat(source_model_dismod_dfs)

    target_dfs = []
    target_nulls = []
    for cause_id in targets:
        target_models = pull_mvid(cause_id, hybrid_mvt)
        target_ids = {'cause_ids': [cause_id]}
        # Pull in target models
        for model_version in target_models:
            target = _pull_codem_draws(
                target_ids, model_version, years, ages, locations)
            if target.isnull().any(axis=1).any():
                target_nulls.append(cause_id)
            target_dfs.append(target)
    assert len(target_nulls) == 0, "Nulls target: %s" % target_nulls
    target_df = pd.concat(target_dfs)

    assert not source_dismod.isnull().any(axis=1).any(), "Nulls dismod"
    assert not source_codem.isnull().any(axis=1).any(), "Nulls codem"
    return source_codem, source_dismod, target_df
# set demographic data me_id = 8691 location_set = 35 gbd_round = 4 locations = get_most_detailed(location_set, gbd_round) years = [1990, 1995, 2000, 2005, 2010, 2016] ages = [164] sexes = [1, 2] measure = 5 upload_me = 15803 # grab the u_2500 birth prevalence prev_df = draws(source='dismod', gbd_ids={"modelable_entity_ids": [me_id]}, location_ids=locations, year_ids=years, age_group_ids=ages, sex_ids=sexes, status='best', measure_ids=[measure], gbd_round_id=gbd_round) prev_df = index_draws_by_demographics(prev_df) def mapping(x): y = predict_for_simple_ols(x, mean_parameters, cov_matrix) return y mean_weight_df = prev_df.applymap(mapping) mean_weight_df['age_group_id'] = 2 save_to_hdf(mean_weight_df, savefile) description = ('Estimate of mean birth weight from simple linear' 'regression. Units in grams') al.save_custom_results(meid=upload_me, description=description, input_dir=output_dir,
# logging rlog.open('%s/%s.log' % (log_dir, jobname)) rlog.log('out_dir is %s' % out_dir) # set up columns we want to subset columns = maternal_fns.filter_cols() columns.remove('measure_id') index_cols = [col for col in columns if not col.startswith('draw_')] # read maternal disorders envelope # CAUSES get multiplied by the Late corrected env from codem # TIMINGS get multiplied by the CoDcorrect env rlog.log("reading in envelope draws") if 'timing' in jobname: env = draws(gbd_ids={'cause_ids': [366]}, source='codcorrect', measure_ids=[1], sex_ids=[2], location_ids=locations) else: env = draws(gbd_ids={'cause_ids': [366]}, source='codem', sex_ids=[2], status=int(env_model_vers)) env = env[env.location_id.isin(locations)] # we only want maternal age groups env = env[env.age_group_id.isin(range(7, 16))] # we only want index cols & draws as columns, w multiindex env = env[columns].set_index(index_cols).sort_index() # read cfs rlog.log("reading in cfs") cfs = draws(gbd_ids={'modelable_entity_ids': [source_id]}, source='dismod', measure_ids=[18], sex_ids=[2]) cfs = cfs[cfs.location_id.isin(locations)] # we only want maternal age groups
held_constant_me = 9015

#######################################################################
# STEP 1: FOR EACH CAUSE, EXTRACT FILES, GET SUM BY GROUP + TOTAL SUM
#######################################################################
print('getting data')
rlog.log('getting data')
all_data = {}
summed_idx = 0
for index, row in step_df.iterrows():
    target_id = row['target_id']
    try:
        subtype_df = draws(
            gbd_ids={'modelable_entity_ids': [row['source_id']]},
            source='dismod', measure_ids=[18], sex_ids=[2],
            year_ids=[year])
    except (ValueError, OSError):
        # pull data from where interp saves it
        subtype_df = pd.read_hdf(
            '%s/%s/%s_2.h5' % (cluster_dir, row['source_id'], year),
            'draws')
    # Restrict to the requested locations and age groups 7-15.
    subtype_df = subtype_df.loc[
        (subtype_df.location_id.isin(locs)) &
        (subtype_df.age_group_id.isin(list(range(7, 16))))]
    subtype_df = subtype_df[columns].set_index(index_cols).sort_index()
    # source_id is compared as a string against the held-constant ME id.
    if row['source_id'] == str(held_constant_me):
        held_constant_df = subtype_df.copy(deep=True)
    else:
        # save this dataframe, and also sum it to all other subtypes
        all_data[target_id] = subtype_df
def process_location_como_draws(location_id, measure_id, test=False):
    """Pull COMO rate draws for one location and convert them to cases.

    The 'como' draws (incidence for ``measure_id`` 6, prevalence for 5)
    are pulled for all ages and sexes, sanity-checked, multiplied by
    ``mean_pop`` from ``qry.get_pops()``, summed over sex into
    ``sex_id`` 3, age-standardized and written with ``write_output``.

    Args:
        location_id: the location to process.
        measure_id: 6 or 5; anything else raises ``ValueError``.
        test: when true only year 2015 is requested and the year check is
            skipped.

    Returns:
        The age-standardized DataFrame that was written.
    """
    db_pops = qry.get_pops()

    if measure_id not in (5, 6):
        raise ValueError("bad measure_id: {}".format(measure_id))
    if measure_id == 6:
        cause_ids = dw.COMO_INC_CAUSE_IDS
    else:
        cause_ids = dw.COMO_PREV_CAUSE_IDS

    years = [2015] if test else []
    df = gopher.draws({'cause_ids': cause_ids},
                      'como',
                      measure_ids=[measure_id],
                      location_ids=[location_id],
                      year_ids=years,
                      age_group_ids=[],
                      sex_ids=[],
                      verbose=True,
                      num_workers=5,
                      version=dw.COMO_VERS)

    # Check the pull has exactly the demographics we expect.
    assert set(df.age_group_id) == set(range(2, 22)), \
        'unexpected age group ids found'
    assert set(df.sex_id) == set([1, 2]), \
        'unexpected sex ids found'
    if not test:
        assert set(df.year_id) == set(range(1990, 2016, 5)), \
            'unexpected year ids found'
    assert set(df.location_id) == set([location_id]), \
        'unexpected location ids found'

    # The draws arrive as rates.
    df['metric_id'] = 3

    # Attach populations so the rates can be turned into cases.
    df = df.merge(db_pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with populations failed'

    # Multiply every draw column row-wise by mean_pop and keep the metadata.
    cases = df[dw.DRAW_COLS].mul(df['mean_pop'], axis=0)
    df = pd.concat([df[dw.COMO_GROUP_COLS], cases], axis=1)

    # Now in number space; relabel every row as both sexes and collapse.
    df['metric_id'] = 1
    df['sex_id'] = 3
    df = df.groupby(dw.COMO_GROUP_COLS, as_index=False)[dw.DRAW_COLS].sum()

    df = age_standardize(df, 'como')
    write_output(df, 'como', location_id)
    return df
assert (end_year > start_year), "Yr end must be more recent than yr start"
assert (source != 'risk'), "Risk as a source is not supported."

# convert kwargs from a list of single key dicts to one dict with
# multiple keys, if any specified from get_pct_change.ado
for d in args.pop('kwargs'):
    args.update(d)

# Get draws
try:
    df = draws(gbd_id_dict,
               measure_ids=args.pop('measure_ids'),
               location_ids=args.pop('location_ids'),
               year_ids=[start_year, end_year],
               age_group_ids=args.pop('age_group_ids'),
               sex_ids=args.pop('sex_ids'),
               status=status,
               source=args.pop('source'),
               include_risks=args.pop('include_risks'),
               **args).reset_index(drop=True)
except Exception as e:
    # catch all exceptions, because we need to write something to stdout
    # no matter what error. Get_pct_change.ado creates a pipe and reads
    # from it -- if nothing is written to the pipe, stata hangs
    print("Encountered error while reading draws: {}".format(e))
    raise

# If they want age-std, make sure that's possible
if 27 in age_group_ids:
    assert change_type != 'pct_change_num', ('Cant calc pct_change_num '
                                             'for age-std')
def allocate_residuals(usqzd, sqzd, location_id, year_id, sex_id, map_file,
                       drawcols):
    """Add squeeze residuals to their target MEs and write them to HDF.

    The residual is ``unsqueezed - squeezed`` per draw, floored at zero,
    summed by ``resid_target_me`` and age group.  For each residual target
    the measure 5 'dismod' draws are pulled, the residual is added, and
    the result is written to a per-location/year/sex HDF file.

    Args:
        usqzd: unsqueezed draws, including a ``resid_target_me`` column.
        sqzd: squeezed draws with the same id columns.
        location_id, year_id, sex_id: demographics of this run; used for
            the pull, the output file name and the id columns written.
        map_file: CSV with ``modelable_entity_id_source`` and
            ``modelable_entity_id_target`` columns.
        drawcols: names given to the residual/output draw columns; must
            have 1000 entries to match the hard-coded draw range.

    Returns:
        The aggregated residuals (``resid_target_me``, ``age_group_id``
        and ``drawcols``).
    """
    tmap = pd.read_csv(map_file)
    resids = usqzd.merge(
        sqzd,
        on=['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id'],
        suffixes=('.usqzd', '.sqzd'))
    resids = resids[resids['resid_target_me.usqzd'].notnull()]

    dscols = ['draw_%s.sqzd' % d for d in range(1000)]
    ducols = ['draw_%s.usqzd' % d for d in range(1000)]
    toalloc = resids[ducols].values - resids[dscols].values
    # Negative differences are not allocated.
    toalloc = toalloc.clip(min=0)
    resids = resids.join(
        pd.DataFrame(data=toalloc, index=resids.index, columns=drawcols))
    resids = resids[[
        'location_id', 'year_id', 'age_group_id', 'sex_id',
        'resid_target_me.usqzd'
    ] + drawcols]
    resids.rename(columns={'resid_target_me.usqzd': 'resid_target_me'},
                  inplace=True)
    resids = resids.groupby(['resid_target_me', 'age_group_id']).sum()
    resids = resids.reset_index()
    resids = resids[['resid_target_me', 'age_group_id'] + drawcols]

    for me_id, resid_df in resids.groupby('resid_target_me'):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid.modelable_entity_id_target.squeeze()
        try:
            t_meid = int(t_meid)
        except (TypeError, ValueError):
            # No single integer target in the map; keep whatever squeeze()
            # returned, as before (it is only used in the message below).
            pass

        try:
            gbd_ids = {'modelable_entity_ids': [me_id]}
            t_df = gopher.draws(gbd_ids,
                                'dismod',
                                location_ids=location_id,
                                year_ids=year_id,
                                sex_ids=sex_id,
                                measure_ids=5)
        except ValueError:
            print('ME ID %s missing' % me_id)
            continue

        t_df = t_df.merge(resid_df,
                          on='age_group_id',
                          suffixes=('#base', '#resid'))
        newvals = (t_df.filter(like="#base").values +
                   t_df.filter(like="#resid").values)
        t_df = t_df.join(
            pd.DataFrame(data=newvals, index=t_df.index, columns=drawcols))

        print('Writing residual %s to file' % t_meid)
        drawsdir = "/FILEPATH"
        fn = "%s/%s_%s_%s.h5" % (drawsdir, location_id, year_id, sex_id)
        try:
            os.makedirs(drawsdir)
        except OSError as e:
            # An already-existing directory is fine; anything else is not.
            if e.errno != errno.EEXIST:
                raise
        t_df['location_id'] = int(float(location_id))
        t_df['year_id'] = int(float(year_id))
        t_df['sex_id'] = int(float(sex_id))
        t_df['measure_id'] = 5
        t_df['age_group_id'] = t_df.age_group_id.astype(float).astype(int)
        datacols = [
            'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id'
        ]
        t_df[datacols + drawcols].to_hdf(fn,
                                         'draws',
                                         mode='w',
                                         format='table',
                                         data_columns=datacols)
    return resids
def process_risk_exposure_draws(location_id, test=False):
    """Return yearly age standardized estimates of each rei_id.

    1. Use gopher to pull exposure draws for each rei_id for the
        location_id and all years (only 2015 when ``test`` is true).
    2. Keep the appropriate categories for the given rei_id.
    3. Draws only come with male/female in rates - change to cases and
        make the both-sexes aggregate.
    4. Revert back to rates and age standardize using custom weights.

    A CSV recording the model versions used is written as a side effect,
    and the final frame is written with ``write_output``.

    Arguments:
        location_id: the location_id to process
        test: restrict the pull to year 2015

    Returns:
        pandas dataframe like so:
        [ID_COLS] : [dw.DRAW_COLS]
    """
    dfs = []
    version_frames = []

    all_ids = set(dw.RISK_EXPOSURE_REI_IDS).union(
        set(dw.RISK_EXPOSURE_REI_IDS_MALN))
    years = [2015] if test else []
    for rei_id in all_ids:
        print("pulling {r}".format(r=rei_id))
        df = gopher.draws({"rei_ids": [rei_id]},
                          source='risk',
                          draw_type='exposure',
                          location_ids=[location_id],
                          year_ids=years,
                          age_group_ids=[],
                          sex_ids=[1, 2],
                          num_workers=5)
        # remove any other ages besides gbd ages
        df = df.query('age_group_id >= 2 & age_group_id <= 21')
        # only reporting since 1990
        df = df.query('year_id>=1990')

        if rei_id == 167:
            # change IPV to just women
            df = df.query('sex_id == 2')

        if rei_id in dw.RISK_EXPOSURE_REI_IDS_MALN:
            # these are childhood stunting - cat1 + cat2 equals <-2 std dev
            df = df.query('parameter=="cat1" | parameter=="cat2"')
        else:
            # cat1 represents the prevalence in these cases (can't test
            # this?)
            df = df.query('parameter=="cat1"')

        # set the rei_id because it isnt in the gopher pull
        df['rei_id'] = rei_id

        # keep track of what model versions were used.  Frames are
        # collected and concatenated once after the loop, because
        # DataFrame.append is removed in pandas 2.0 and re-copies the
        # whole frame on every call.
        version_frames.append(
            df[['rei_id', 'modelable_entity_id',
                'model_version_id']].drop_duplicates())

        # these are prevalence rates
        df['metric_id'] = 3
        df['measure_id'] = 5

        dfs.append(df[dw.RISK_EXPOSURE_GROUP_COLS + dw.DRAW_COLS])

    df = pd.concat(dfs, ignore_index=True)

    # note the versions used by risk exposure vers (manufactured by me)
    version_df = pd.concat(version_frames, ignore_index=True)
    version_df.to_csv(
        "/home/j/WORK/10_gbd/04_journals/"
        "gbd2015_capstone_lancet_SDG/02_inputs/"
        "risk_exposure_versions_{v}.csv".format(v=dw.RISK_EXPOSURE_VERS),
        index=False)

    # Populations are needed twice below; fetch them once.
    pops = qry.get_pops()

    # COLLAPSE SEX
    print("collapsing sex")
    df = df.merge(pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    # overriding the sex variable for collapsing
    df['sex_id'] = df.rei_id.apply(lambda x: 2 if x == 167 else 3)
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['mean_pop'])
    ], axis=1)
    # so unnecessary programmatically but good for documentation -
    # these are now prev cases
    df['metric_id'] = 1
    # now that its in cases it is possible to collapse sex
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS, as_index=False).sum()

    # RETURN TO RATES
    print("returning to rates")
    df = df.merge(pops, how='left')
    assert df.mean_pop.notnull().values.all(), 'merge with pops fail'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x / df['mean_pop'])
    ], axis=1)
    df['metric_id'] = 3

    # AGE STANDARDIZE
    print("age standardizing")
    # The MALN rei_ids use the weights for age groups 2-5; every other
    # rei_id uses the weights for 2-21.
    df['is_0_5'] = df.rei_id.apply(
        lambda x: 1 if x in dw.RISK_EXPOSURE_REI_IDS_MALN else 0)
    wgts = custom_age_weights(2, 21)
    wgts['is_0_5'] = 0
    wgts_2 = custom_age_weights(2, 5)
    wgts_2['is_0_5'] = 1
    wgts = pd.concat([wgts, wgts_2], ignore_index=True)
    df = df.merge(wgts, on=['is_0_5', 'age_group_id'], how='left')
    assert df.age_group_weight_value.notnull().values.all(), \
        'merge w wgts failed'
    df = pd.concat([
        df[dw.RISK_EXPOSURE_GROUP_COLS],
        df[dw.DRAW_COLS].apply(lambda x: x * df['age_group_weight_value'])
    ], axis=1)
    df['age_group_id'] = 27
    df = df.groupby(dw.RISK_EXPOSURE_GROUP_COLS,
                    as_index=False)[dw.DRAW_COLS].sum()

    write_output(df, 'risk_exposure', location_id)
    return df
def allocate_residuals(usqzd, sqzd):
    """Add squeeze residuals to their target MEs and write them to HDF.

    The residual is ``unsqueezed - squeezed`` per draw, floored at zero,
    summed by ``resid_target_me`` and age group.  For each residual target
    the measure 5 'dismod' draws are pulled, the residual is added, and
    the result is written under the mapped target ME's directory.

    ``location_id``, ``year`` and ``sex`` are not parameters; they are
    read from module scope.

    Args:
        usqzd: unsqueezed draws, including a ``resid_target_me`` column.
        sqzd: squeezed draws with the same id columns.

    Returns:
        The aggregated residuals (``resid_target_me``, ``age_group_id``
        and the 1000 draw columns).
    """
    import errno  # local import: only needed for the makedirs check below

    tmap = pd.read_excel(
        "strCodeDir/map_pre_pos_mes.xlsx")
    resids = usqzd.merge(
        sqzd,
        on=['location_id', 'year_id', 'age_group_id', 'sex_id', 'me_id'],
        suffixes=('.usqzd', '.sqzd'))
    resids = resids[resids['resid_target_me.usqzd'].notnull()]

    dcols = ['draw_%s' % d for d in range(1000)]
    dscols = ['draw_%s.sqzd' % d for d in range(1000)]
    ducols = ['draw_%s.usqzd' % d for d in range(1000)]
    toalloc = resids[ducols].values - resids[dscols].values
    # Negative differences are not allocated.
    toalloc = toalloc.clip(min=0)
    resids = resids.join(pd.DataFrame(
        data=toalloc, index=resids.index, columns=dcols))
    resids = resids[[
        'location_id', 'year_id', 'age_group_id', 'sex_id',
        'resid_target_me.usqzd'] + dcols]
    resids.rename(
        columns={'resid_target_me.usqzd': 'resid_target_me'},
        inplace=True)
    resids = resids.groupby(['resid_target_me', 'age_group_id']).sum()
    resids = resids.reset_index()
    resids = resids[['resid_target_me', 'age_group_id'] + dcols]

    for me_id, resid_df in resids.groupby('resid_target_me'):
        t_meid = tmap.query('modelable_entity_id_source == %s' % me_id)
        t_meid = t_meid.modelable_entity_id_target.squeeze()
        try:
            t_meid = int(t_meid)
        except (TypeError, ValueError):
            # No single integer target in the map; keep whatever squeeze()
            # returned, as before.
            pass

        gbd_ids = {'modelable_entity_ids': [me_id]}
        t_df = gopher.draws(
            gbd_ids,
            'dismod',
            location_ids=location_id,
            year_ids=year,
            sex_ids=sex,
            measure_ids=5)
        t_df = t_df.merge(
            resid_df, on='age_group_id', suffixes=('#base', '#resid'))
        newvals = (
            t_df.filter(like="#base").values +
            t_df.filter(like="#resid").values)
        t_df = t_df.join(pd.DataFrame(
            data=newvals, index=t_df.index, columns=dcols))

        print('Writing residual %s to file' % t_meid)
        drawsdir = "strOutDir/%s" % t_meid
        fn = "%s/%s_%s_%s.h5" % (drawsdir, location_id, year, sex)
        try:
            os.makedirs(drawsdir)
        except OSError as e:
            # Only "already exists" is expected; permission or path
            # errors should surface here, not later in to_hdf.
            if e.errno != errno.EEXIST:
                raise
        t_df['location_id'] = int(float(location_id))
        t_df['year_id'] = int(float(year))
        t_df['sex_id'] = int(float(sex))
        t_df['measure_id'] = 5
        t_df['age_group_id'] = t_df.age_group_id.astype(float).astype(int)
        datacols = [
            'location_id', 'year_id', 'age_group_id', 'sex_id',
            'measure_id']
        t_df[datacols + dcols].to_hdf(
            fn,
            'draws',
            mode='w',
            format='table',
            data_columns=datacols)
    return resids