(sev_version_id, rei_id, gbd_round_id, n_draws,
 location_set_id) = parse_arguments()

drawdir = 'FILEPATH/{}/draws'.format(sev_version_id)

# set up source and sink
source_params = {
    'draw_dir': drawdir,
    'file_pattern': '{rei_id}/{location_id}.csv'
}
source = DrawSource(source_params)
sink_params = {
    'draw_dir': drawdir,
    'file_pattern': '{rei_id}/{location_id}.csv'
}
sink = DrawSink(sink_params,
                write_func=partial(standard_write_func, index=False))

index_cols = [
    'rei_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id', 'metric_id'
]
draw_cols = ['draw_{}'.format(i) for i in range(n_draws)]

for lsid in location_set_id:
    popfile = os.path.join(drawdir, 'population_{}.csv'.format(lsid))
    population = pd.read_csv(popfile)

    # aggregation operator (the source snippet breaks off mid-call here)
    operator = WtdSum(
        index_cols=index_cols,
        value_cols=draw_cols,
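The snippet above is truncated mid-call in the source. A plausible sketch of how the loop body continues, assuming the aggregator package's WtdSum(weight_df=..., weight_name=..., merge_cols=...) signature and a 'population' weight column; everything past value_cols is an assumption, not recovered code:

    # hypothetical completion: population-weighted sum up the location tree
    operator = WtdSum(
        index_cols=index_cols,
        value_cols=draw_cols,
        weight_df=population,      # assumed: per-demographic weights
        weight_name='population',  # assumed: name of the weight column
        merge_cols=['location_id', 'year_id', 'age_group_id', 'sex_id'])
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=index_cols,
        aggregate_col='location_id',
        operator=operator)
    aggregator.run(dbtrees.loctree(location_set_id=lsid,
                                   gbd_round_id=gbd_round_id))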
def sequela_result_sink(self):
    return DrawSink(self.get_params_by_component("sequela", "draws"))
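get_params_by_component is not shown in the source; presumably it returns a params dict of the same shape used throughout these examples. A hypothetical illustration (directory, pattern, and tablename values are invented):

# hypothetical return value of get_params_by_component("sequela", "draws")
params = {
    'draw_dir': '/path/to/como_dir/draws/sequela',              # assumed layout
    'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',  # assumed
    'h5_tablename': 'draws'
}
sink = DrawSink(params)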
sex_id = config['eligible_sex_ids']

# Create draw source/sink
logging.info("Creating draw source and sink.")
draw_dir = os.path.join(parent_dir, 'aggregated/{}'.format(df_type))
input_pattern = '{measure_id}_{location_id}_{year_id}.h5'
source_config = {'draw_dir': draw_dir, 'file_pattern': input_pattern}
draw_source = DrawSource(source_config)
output_pattern = '{measure_id}_{location_id}_{year_id}.h5'
sink_config = {
    'draw_dir': draw_dir,
    'file_pattern': output_pattern,
    'h5_tablename': 'draws'
}
draw_sink = DrawSink(sink_config)

# Apply regional scalar transform
region_locs = get_location_metadata(gbd_round_id=GBD.GBD_ROUND_ID,
                                    location_set_id=35)
region_locs = region_locs[region_locs.level == 2].location_id.tolist()
draw_sink.add_transform(apply_regional_scalars,
                        region_locs=region_locs,
                        parent_dir=parent_dir)
draw_sink.add_transform(transform_add_measure, measure_id=measure_id)

# create operator
logging.info("Reading regional scalars from flatfiles.")
index_cols = [col for col in index_cols if col != 'location_id']
operator = Sum(index_cols, draw_cols)
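A DrawSink transform is a function that receives the DataFrame about to be written and returns a (possibly modified) DataFrame; keyword arguments passed to add_transform are forwarded to it. A minimal sketch of what transform_add_measure presumably does (the body is an assumption based on its name and usage above):

def transform_add_measure(df, measure_id):
    # stamp the measure onto each row so the sink's {measure_id}
    # file_pattern token can be resolved at write time
    df['measure_id'] = measure_id
    return df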
def save_all_draws(parent_dir, index_columns, rescaled_data, shock_data,
                   unscaled_data, measure_id=1):
    # cast index columns to int for consistent HDF5 keys
    for data in [rescaled_data, shock_data, unscaled_data]:
        for i in index_columns:
            data[i] = data[i].astype(int)

    rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    rescaled_sink = DrawSink(rescaled_params)
    rescaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    rescaled_sink.push(rescaled_data, append=False)

    unscaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/unscaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    unscaled_sink = DrawSink(unscaled_params)
    unscaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    unscaled_sink.push(unscaled_data, append=False)

    shocks_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    shocks_sink = DrawSink(shocks_params)
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(shock_data, append=False)
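The three sinks above differ only in their output subdirectory and the data they push. A hypothetical helper in the same style would remove the triplication (not part of the source):

def _make_agg_sink(parent_dir, subdir, measure_id):
    # one aggregated output stream: shared file pattern plus the
    # measure_id transform, varying only the subdirectory
    sink = DrawSink({
        'draw_dir': os.path.join(parent_dir, 'aggregated', subdir),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    })
    sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    return sink

# usage: _make_agg_sink(parent_dir, 'rescaled', measure_id).push(
#            rescaled_data, append=False)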
def new_population(self, location_set_id, agg_loc_sets=[]):
    dim = self.nonfatal_dimensions.get_simulation_dimensions(self.measure_id)
    df = get_population(
        age_group_id=(dim.index_dim.get_level("age_group_id") + [164]),
        location_id=dbtrees.loctree(location_set_id=location_set_id,
                                    gbd_round_id=self.gbd_round_id).node_ids,
        sex_id=dim.index_dim.get_level("sex_id"),
        year_id=dim.index_dim.get_level("year_id"))
    index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
    data_cols = ["population"]

    # in-memory source/sink pair backed by the same dict, so each
    # aggregation pass reads back what the previous pass wrote
    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    sink.push(df[index_cols + data_cols])

    # location
    for set_id in agg_loc_sets:
        loc_tree = dbtrees.loctree(
            location_set_id=set_id, gbd_round_id=self.gbd_round_id)
        operator = Sum(
            index_cols=[col for col in index_cols if col != "location_id"],
            value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=[col for col in index_cols if col != "location_id"],
            aggregate_col="location_id",
            operator=operator)
        aggregator.run(loc_tree)

    # age
    for age_group_id in ComoSummaries._gbd_compare_age_group_list:
        age_tree = dbtrees.agetree(age_group_id)
        operator = Sum(
            index_cols=[col for col in index_cols if col != "age_group_id"],
            value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=[col for col in index_cols if col != "age_group_id"],
            aggregate_col="age_group_id",
            operator=operator)
        aggregator.run(age_tree)

    # sex
    sex_tree = dbtrees.sextree()
    operator = Sum(
        index_cols=[col for col in index_cols if col != "sex_id"],
        value_cols=data_cols)
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=[col for col in index_cols if col != "sex_id"],
        aggregate_col="sex_id",
        operator=operator)
    aggregator.run(sex_tree)

    df = source.content()
    df.to_hdf(
        "{}/info/population.h5".format(self.como_dir),
        'draws',
        mode='w',
        format='table',
        data_columns=["location_id", "year_id", "age_group_id", "sex_id"])
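Because the population file is written in table format with data_columns, downstream readers can filter on those columns at read time instead of loading the whole file. A usage sketch (path and filter values are illustrative):

import pandas as pd

# pull one location/year slice of the aggregated population file
pop = pd.read_hdf('/path/to/como_dir/info/population.h5', 'draws',
                  where='(location_id == 102) & (year_id == 2019)')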
def save_all_draws(parent_dir, ylls, yll_shocks, location_id, index_columns,
                   measure_id=4):
    # Save yll data
    agg_rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    rescaled_sink = DrawSink(agg_rescaled_params)
    rescaled_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    rescaled_sink.push(ylls, append=False)

    agg_shocks_params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '{measure_id}_{location_id}_{year_id}.h5',
        'h5_tablename': 'draws'
    }
    shocks_sink = DrawSink(agg_shocks_params)
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(yll_shocks, append=False)
def _save_all_ylls_fauxcorrect(
        ylls: pd.DataFrame,
        yll_shocks: pd.DataFrame,
        parent_dir: str,
        location_id: int,
        sex_id: int,
        measure_id: int = 4
) -> None:
    """Save YLLs for a given location and sex."""
    ylls_sink = DrawSink({
        'draw_dir': join(
            parent_dir, FilePaths.DRAWS_SCALED_DIR, FilePaths.YLLS_DIR),
        'file_pattern': FilePaths.YLL_DRAWS_FILE_PATTERN.format(
            sex_id=sex_id, location_id=location_id),
        'h5_tablename': Keys.DRAWS
    })
    ylls_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    ylls_sink.push(ylls, append=False)

    shocks_sink = DrawSink({
        'draw_dir': join(
            parent_dir, FilePaths.UNAGGREGATED_DIR, FilePaths.SHOCKS_DIR,
            FilePaths.YLLS_DIR),
        'file_pattern': FilePaths.YLL_DRAWS_FILE_PATTERN.format(
            sex_id=sex_id, location_id=location_id),
        'h5_tablename': Keys.DRAWS
    })
    shocks_sink.add_transform(add_measure_id_to_sink, measure_id=measure_id)
    shocks_sink.push(yll_shocks, append=False)
def gen_draw_sink(self):
    sink = DrawSink(self._mem_io_params, mem_write_func)
    sink.add_transform(sort_index_columns, self.dimensions.index_names)
    return sink
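As in new_population above, a sink built with mem_write_func writes into the shared in-memory dict, so a DrawSource over the same params reads back whatever was pushed. A minimal round-trip sketch, assuming the draw_sources package layout for the imports:

import pandas as pd
from draw_sources.draw_sources import DrawSink, DrawSource  # assumed path
from draw_sources.io import mem_read_func, mem_write_func   # assumed path

io_mock = {}
params = {'draw_dict': io_mock, 'name': 'tmp'}
sink = DrawSink(params, mem_write_func)
source = DrawSource(params, mem_read_func)

df = pd.DataFrame({'location_id': [1], 'draw_0': [0.5]})
sink.push(df)                   # lands in io_mock
round_trip = source.content()   # reads the same rows back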
from typing import List

def location_aggregate_birth_counts(gbd_round_id: int, decomp_step: str,
                                    constants_path: pathlib.PosixPath,
                                    location_set_id: int) -> List[str]:
    """For the given gbd_round, decomp_step, and location_set_id, build a
    complete set of location-aggregated live births and return the output
    filenames."""
    logger.info(f'aggregating for location_set_id {location_set_id}')
    multiple_tree_flag = (
        location_set_id in mmr_constants.MULTIPLE_ROOT_LOCATION_SET_IDS)
    scalars = get_regional_scalars(gbd_round_id, decomp_step)
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    cov_estimate_filename = (
        mmr_constants.COV_ESTIMATES_FORMAT_FILENAME.format(location_set_id))

    region_locs, most_detailed_locs = get_location_level_sets(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        location_set_id=location_set_id)

    save_birth_count_estimates(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cov_estimate_filepath=constants_path / cov_estimate_filename,
        location_set_id=location_set_id,
        most_detailed_locs=most_detailed_locs)

    loc_trees = dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id,
                                decomp_step=decomp_step,
                                return_many=multiple_tree_flag)
    if not multiple_tree_flag:
        loc_trees = [loc_trees]

    draw_source = DrawSource(params={
        'draw_dir': str(constants_path),
        'file_pattern': cov_estimate_filename
    })

    output_filenames = []
    for i, loc_tree in enumerate(loc_trees, start=1):
        output_filename = f'{location_set_id}_{i}.h5'
        draw_sink = DrawSink(params={
            'draw_dir': str(constants_path),
            'file_pattern': output_filename
        })
        draw_sink.add_transform(
            _apply_regional_scalars,
            regional_scalars_df=scalars.query('location_id in @region_locs'),
            gbd_round_id=gbd_round_id,
            decomp_step=decomp_step)
        op = Sum(index_cols=[s for s in index_cols if s != 'location_id'],
                 value_cols=[mmr_constants.Columns.LIVE_BIRTH_VALUE_COL])
        AggSynchronous(
            draw_source=draw_source,
            draw_sink=draw_sink,
            index_cols=[s for s in index_cols if s != 'location_id'],
            aggregate_col='location_id',
            operator=op).run(loc_tree, include_leaves=True)
        output_filenames.append(output_filename)
    return output_filenames
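A hypothetical invocation, reading one returned aggregate back for inspection (all argument values are illustrative, and the read assumes each sink wrote a single default table):

import pathlib
import pandas as pd

constants_path = pathlib.Path('/path/to/constants')
filenames = location_aggregate_birth_counts(
    gbd_round_id=6,
    decomp_step='step4',
    constants_path=constants_path,
    location_set_id=35)
births = pd.read_hdf(constants_path / filenames[0])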