def compute_percent(self):
    mem_source = DrawSource(
        {"draw_dict": self._io_mock, "name": "cause"}, mem_read_func)
    denom = mem_source.content(filters={"metric_id": metrics.RATE})
    self._compute_percent(denom_df=denom)
def read_shocks_draw_files(parent_dir, location_id):
    """Read in location-aggregated shock draw files."""
    params = {
        'draw_dir': os.path.join(parent_dir, 'aggregated/shocks'),
        'file_pattern': '1_{location_id}_{year_id}.h5'
    }
    source = DrawSource(params)
    return source.content(filters={'location_id': location_id})
def estimate_single_component(self, component):
    # data to summarize
    draw_source = DrawSource(
        {"draw_dict": self.io_mock, "name": component}, mem_read_func)
    df = draw_source.content()
    df = compute_estimates(df, point_estimate="mean")
    df.rename(columns={"mean": "val"}, inplace=True)
    return df
def read_aggregated_rescaled(parent_dir, location_id, diag_years):
    """Read in location aggregates of rescaled draws for deaths only."""
    rescaled_params = {
        'draw_dir': os.path.join(parent_dir, 'draws'),
        'file_pattern': '{measure_id}_{location_id}.h5'
    }
    ds = DrawSource(rescaled_params)
    rescaled_draws = ds.content(filters={
        'location_id': location_id,
        'year_id': diag_years,
        'measure_id': 1
    })
    return rescaled_draws
def _load_data_frame(self):
    """If 'turn_off_null_check' is true then the null check is skipped.
    Yuk. GBD 2015 como files have nulls caused by "other maternal" issues
    for males. It is generally much safer to validate the data; skipping
    the check is dangerous but historically necessary.

    Passes through the NoDrawsError raised by the underlying SuperGopher
    implementation if it cannot find any files. Raises ValueError if no
    files exist, for consistency with other DataSource methods.
    """
    logger.debug('Super gopher _load_data_frame, kwargs:')
    for key, value in self.kwargs.items():
        value = list(np.atleast_1d(value))
        self.kwargs[key] = value
        logger.debug("    {} == {}".format(key, value))
    self.kwargs.update({'strict_filter_checking': True})

    try:
        pattern = self.file_naming_conventions['file_pattern']
        draw_dir = self.dir_path
        h5_tablename = self.file_naming_conventions.get('h5_tablename', None)
        params = {'file_pattern': pattern, 'draw_dir': draw_dir}
        if h5_tablename:
            params.update({'h5_tablename': h5_tablename})
        if not self.ds:
            self.ds = DrawSource(params=params)
        df = self.ds.content(filters=self.kwargs)
        df = self._add_n_draws(df)
    except ex.InvalidFilter:
        logger.info(
            "Super gopher '{}' found no files with file_pattern: {}, "
            "draw_dir: {}, and filters {}. Stopping pipeline.".format(
                self.name, pattern, draw_dir, self.kwargs))
        raise

    logger.info('Super gopher "{}" got content, shape {}'.format(
        self.name, df.shape))
    logger.debug(
        'SuperGopher "{}" got and validated data, dir={}, filter={}'.format(
            self.name, self.dir_path, self.file_naming_conventions))
    return df
def read_aggregated_rescaled(parent_dir, location_id, diag_years):
    """Read in location aggregates of rescaled draws for deaths only."""
    rescaled_params = {
        'draw_dir': os.path.join(parent_dir, FilePaths.DRAWS_DIR),
        'file_pattern': FilePaths.DRAWS_FILE_PATTERN
    }
    ds = DrawSource(rescaled_params)
    rescaled_draws = ds.content(filters={
        'location_id': location_id,
        'year_id': diag_years,
        'measure_id': Measures.Ids.DEATHS
    })
    return rescaled_draws
def read_gbd_draw_files(parent_dir, location_id, years, measure_id):
    """Pull in all data to be summarized for GBD, by location and measure,
    filtering by years."""
    logger = logging.getLogger('summary.read_gbd_draw_files')
    try:
        params = {
            'draw_dir': os.path.join(parent_dir, 'draws'),
            'file_pattern': '{measure_id}_{location_id}.h5'
        }
        ds = DrawSource(params)
        return ds.content(filters={'measure_id': measure_id,
                                   'location_id': location_id,
                                   'year_id': years})
    except Exception as e:
        logger.exception('Failed to read location: {}'.format(e))
def _read_unscaled_draws(
        parent_dir: str,
        location_id: int,
        sex_id: int
) -> pd.DataFrame:
    """Read unscaled draws for the given location and sex."""
    draw_dir = join(
        parent_dir,
        constants.FilePaths.UNAGGREGATED_DIR,
        constants.FilePaths.UNSCALED_DIR,
        constants.FilePaths.DEATHS_DIR
    )
    file_pattern = constants.FilePaths.UNSCALED_DRAWS_FILE_PATTERN
    try:
        draws = DrawSource({
            'draw_dir': draw_dir,
            'file_pattern': file_pattern,
            'h5_tablename': constants.Keys.DRAWS,
            'num_workers': constants.DAG.Tasks.Cores.APPLY_CORRECTION
        }).content(filters={
            constants.Columns.LOCATION_ID: location_id,
            constants.Columns.SEX_ID: sex_id
        })
    except InvalidSpec:
        raise FileNotFoundError(
            f"Draw files were not found for location: {location_id} and sex: "
            f"{sex_id}."
        )
    return draws
def _read_scaled_draws(
        tool_name: str,
        parent_dir: str,
        location_id: int,
        year_id: int,
        measure_id: int
) -> pd.DataFrame:
    if tool_name == GBD.Process.Name.FAUXCORRECT:
        draw_dir = os.path.join(
            parent_dir, FilePaths.DRAWS_SCALED_DIR, str(measure_id))
        file_pattern = FilePaths.SUMMARY_INPUT_FILE_PATTERN.format(
            location_id=location_id, year_id=year_id)
    else:
        draw_dir = os.path.join(parent_dir, FilePaths.DRAWS_DIR)
        file_pattern = FilePaths.SUMMARY_AGGREGATE_READ_PATTERN.format(
            measure_id=measure_id, location_id=location_id, year_id=year_id)
    return DrawSource({
        'draw_dir': draw_dir,
        'file_pattern': file_pattern,
        'h5_tablename': Keys.DRAWS
    }).content()
def _get_draw_source_and_filters(
        aggregation_type: str,
        source_dir: str,
        year_id: int,
        measure_id: int
) -> Tuple[DrawSource, Dict[str, int]]:
    if aggregation_type == LocationAggregation.Type.UNAGGREGATED_SHOCKS:
        source = DrawSource(params={
            'draw_dir': source_dir,
            'file_pattern': FilePaths.UNAGGREGATED_SHOCKS_FILE_PATTERN
        })
        draw_filters = {
            Columns.MEASURE_ID: measure_id,
            Columns.YEAR_ID: year_id
        }
    else:
        source = DrawSource({
            'draw_dir': source_dir,
            'file_pattern': FilePaths.LOCATION_AGGREGATE_FILE_PATTERN.format(
                year_id=year_id)
        })
        draw_filters = {}
    return source, draw_filters
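# A minimal usage sketch for the helper above: the caller unpacks the
# (source, filters) pair and passes the filters back into content(). The
# aggregation_type value, directory, year, and measure below are
# hypothetical example values, not taken from the original code.
source, draw_filters = _get_draw_source_and_filters(
    aggregation_type=LocationAggregation.Type.UNAGGREGATED_SHOCKS,
    source_dir='/path/to/draws',   # hypothetical directory
    year_id=2019,                  # hypothetical year
    measure_id=1)                  # hypothetical measure
df = source.content(filters=draw_filters)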
def read_cod_draw_files(pool, parent_dir, location_id, years):
    """Pull in all data to be summarized for CoD, by location,
    filtering by years."""
    logger = logging.getLogger('summary.read_cod_draw_files')
    try:
        agg_rescaled_params = {
            'draw_dir': os.path.join(parent_dir, 'aggregated/rescaled'),
            'file_pattern': '{measure_id}_{location_id}_{year_id}.h5'
        }
        ds = DrawSource(agg_rescaled_params)
        rescaled_draws = ds.content(filters={'location_id': location_id,
                                             'year_id': years,
                                             'measure_id': 1})
        daly_draw_params = {
            'draw_dir': os.path.join(parent_dir, 'draws'),
            'file_pattern': '{measure_id}_{location_id}.h5',
        }
        ds = DrawSource(daly_draw_params)
        dalynator_draws = ds.content(filters={'location_id': location_id,
                                              'year_id': years,
                                              'measure_id': 1})
        return rescaled_draws, dalynator_draws
    except Exception as e:
        logger.exception('Failed to read location: {}'.format(e))
def _import_static_draws_by_component(self, component):
    dim = self.dimensions.get_dimension_by_component(
        component, self.measure_id)
    draw_dir = os.path.join(
        self.como_version.como_dir, "draws", component,
        str(dim.index_dim.get_level("location_id")[0]))
    real_source = DrawSource({
        "draw_dir": draw_dir,
        "file_pattern": "{measure_id}_{year_id}_{sex_id}.h5"
    })
    real_source.add_transform(add_metric)

    fake_sink = DrawSink({
        "draw_dict": self.io_mock,
        "name": component
    }, mem_write_func)

    # get the filters and data
    sim_dim = self.dimensions.get_simulation_dimensions(self.measure_id)
    filters = sim_dim.index_dim.levels.copy()
    filters["age_group_id"] = dim.index_dim.get_level("age_group_id")
    df = real_source.content(filters=filters)
    fake_sink.push(df)
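# Hedged sketch of what a transform registered via add_transform is assumed
# to look like: judging from the calls in these snippets (add_metric above,
# apply_regional_scalars below), a transform takes the draws DataFrame, plus
# any keyword arguments bound at registration, and returns a modified
# DataFrame. The body of add_metric here is an illustrative guess, not the
# original implementation.
def add_metric(df):
    # assumption: tag the draws as number-space (metric_id 1)
    df["metric_id"] = 1
    return df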
try:
    # Read in helper files
    logging.info("Reading in helper files.")
    config = read_helper_files(parent_dir)

    # Read in config variables
    index_cols = config['index_columns']
    draw_cols = config['data_columns']
    sex_id = config['eligible_sex_ids']

    # Create draw source/sink
    logging.info("Creating draw source and sink.")
    draw_dir = os.path.join(parent_dir, 'aggregated/{}'.format(df_type))
    input_pattern = '{measure_id}_{location_id}_{year_id}.h5'
    source_config = {'draw_dir': draw_dir, 'file_pattern': input_pattern}
    draw_source = DrawSource(source_config)
    output_pattern = '{measure_id}_{location_id}_{year_id}.h5'
    sink_config = {
        'draw_dir': draw_dir,
        'file_pattern': output_pattern,
        'h5_tablename': 'draws'
    }
    draw_sink = DrawSink(sink_config)

    # Apply regional scalar transform
    region_locs = get_location_metadata(gbd_round_id=GBD.GBD_ROUND_ID,
                                        location_set_id=35)
    region_locs = region_locs[region_locs.level == 2].location_id.tolist()
    draw_sink.add_transform(apply_regional_scalars,
                            region_locs=region_locs,
    multi_file = os.path.join(outdir,
                              'multi_year_{}.csv'.format(location_id))
    multi_year.to_csv(multi_file, index=False)
    os.chmod(multi_file, 0o775)


if __name__ == '__main__':
    (sev_version_id, location_id, gbd_round_id, year_id,
     change_intervals) = parse_arguments()
    drawdir = 'FILEPATH/{}/draws/'.format(sev_version_id)
    outdir = 'FILEPATH/sev/{}/summaries/'.format(sev_version_id)

    # identify rei_ids from the csvs in the draw_dir
    files = glob(os.path.join(drawdir, '*'))
    rei_ids = [int(os.path.basename(file)) for file in files
               if 'population' not in file and 'params' not in file]

    # Instantiate draw source
    source = DrawSource(
        params={'draw_dir': drawdir,
                'file_pattern': '{rei_id}/{location_id}.csv'})

    summarize_loc(source, drawdir, outdir, location_id, year_id, rei_ids,
                  change_intervals=change_intervals,
                  gbd_round_id=gbd_round_id)
def sexual_violence_input_source(self):
    return DrawSource(self.get_params_by_component("sexual", "inputs"))

def injuries_input_source(self):
    return DrawSource(self.get_params_by_component("injuries", "inputs"))

def sequela_input_source(self):
    return DrawSource(self.get_params_by_component("sequela", "inputs"))

def injuries_result_source(self):
    return DrawSource(self.get_params_by_component("injuries", "draws"))

def impairment_result_source(self):
    return DrawSource(self.get_params_by_component("impairment", "draws"))

def cause_result_source(self):
    return DrawSource(self.get_params_by_component("cause", "draws"))

def sequela_result_source(self):
    return DrawSource(self.get_params_by_component("sequela", "draws"))
    return (sev_version_id, rei_id, gbd_round_id, n_draws, location_set_id)


if __name__ == '__main__':
    (sev_version_id, rei_id, gbd_round_id, n_draws,
     location_set_id) = parse_arguments()
    drawdir = 'FILEPATH/{}/draws'.format(sev_version_id)

    # set up source and sink
    source_params = {
        'draw_dir': drawdir,
        'file_pattern': '{rei_id}/{location_id}.csv'
    }
    source = DrawSource(source_params)
    sink_params = {
        'draw_dir': drawdir,
        'file_pattern': '{rei_id}/{location_id}.csv'
    }
    sink = DrawSink(sink_params,
                    write_func=partial(standard_write_func, index=False))

    index_cols = [
        'rei_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'metric_id'
    ]
    draw_cols = ['draw_{}'.format(i) for i in range(n_draws)]

    for lsid in location_set_id:
        popfile = os.path.join(drawdir, 'population_{}.csv'.format(lsid))
def append_shocks(
        parent_dir: str,
        machine_process: str,
        measure_ids: List[int],
        location_id: int,
        most_detailed_location: bool,
        sex_id: int
) -> None:
    """
    Add yll and death shocks (location-aggregated) to re/scaled ylls and
    re/scaled deaths (also location-aggregated). Draws are stored broken
    down by location and sex for parallel execution.

    Arguments:
        parent_dir (str): parent directory for this run's draw files
        machine_process (str): GBD.Process.Name.CODCORRECT or
            GBD.Process.Name.FAUXCORRECT
        measure_ids (list): the measure_ids included in this run
        location_id (int): draws location_id
        most_detailed_location (bool): whether location_id is a
            most-detailed location
        sex_id (int): draws sex_id
    """
    scaled_dir, shocks_dir = _get_input_filepaths(
        parent_dir, machine_process, most_detailed_location)
    input_file_pattern = FilePaths.APPEND_SHOCKS_FILE_PATTERN.format(
        sex_id=sex_id, location_id=location_id)

    # Deaths
    if Measures.Ids.DEATHS in measure_ids:
        scaled_params = {
            'draw_dir': os.path.join(scaled_dir, FilePaths.DEATHS_DIR),
            'file_pattern': input_file_pattern
        }
        scaled_ds = DrawSource(scaled_params)
        scaled = scaled_ds.content(filters={
            Columns.LOCATION_ID: location_id,
            Columns.SEX_ID: sex_id,
            Columns.MEASURE_ID: Measures.Ids.DEATHS
        })
        shock_params = {
            'draw_dir': os.path.join(shocks_dir, FilePaths.DEATHS_DIR),
            'file_pattern': input_file_pattern
        }
        shock_ds = DrawSource(shock_params)
        shocks = shock_ds.content(filters={
            Columns.LOCATION_ID: location_id,
            Columns.SEX_ID: sex_id,
            Columns.MEASURE_ID: Measures.Ids.DEATHS
        })
        new_scaled = _append_shocks(scaled, shocks)
    else:
        new_scaled = None

    # YLLs
    if Measures.Ids.YLLS in measure_ids:
        scaled_yll_params = {
            'draw_dir': os.path.join(scaled_dir, FilePaths.YLLS_DIR),
            'file_pattern': input_file_pattern
        }
        scaled_yll_ds = DrawSource(scaled_yll_params)
        scaled_ylls = scaled_yll_ds.content(filters={
            Columns.LOCATION_ID: location_id,
            Columns.SEX_ID: sex_id,
            Columns.MEASURE_ID: Measures.Ids.YLLS
        })
        shock_yll_params = {
            'draw_dir': os.path.join(shocks_dir, FilePaths.YLLS_DIR),
            'file_pattern': input_file_pattern
        }
        shock_yll_ds = DrawSource(shock_yll_params)
        shock_ylls = shock_yll_ds.content(filters={
            Columns.LOCATION_ID: location_id,
            Columns.SEX_ID: sex_id,
            Columns.MEASURE_ID: Measures.Ids.YLLS
        })
        new_scaled_ylls = _append_shocks(scaled_ylls, shock_ylls)
    else:
        new_scaled_ylls = None

    save_map = {
        GBD.Process.Name.CODCORRECT: _save_all_codcorrect_outputs,
        GBD.Process.Name.FAUXCORRECT: _save_all_fauxcorrect_outputs
    }
    save_map[machine_process](
        parent_dir, new_scaled, new_scaled_ylls, location_id, sex_id)
def gen_draw_source(self):
    source = DrawSource(self._mem_io_params, mem_read_func)
    return source
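# Minimal sketch of the in-memory pattern used throughout these snippets:
# a DrawSink built with mem_write_func pushes a DataFrame into a shared
# dict, and a DrawSource built with mem_read_func over the same dict reads
# it back. The dict key name "tmp" and the example DataFrame are
# illustrative assumptions, not values from the original code.
import pandas as pd

io_mock = {}
sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)

df = pd.DataFrame({"location_id": [6], "year_id": [2019], "draw_0": [0.1]})
sink.push(df)
round_tripped = source.content()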
def new_population(self, location_set_id, agg_loc_sets=[]):
    dim = self.nonfatal_dimensions.get_simulation_dimensions(self.measure_id)
    df = get_population(
        age_group_id=(dim.index_dim.get_level("age_group_id") + [164]),
        location_id=dbtrees.loctree(
            location_set_id=location_set_id,
            gbd_round_id=self.gbd_round_id).node_ids,
        sex_id=dim.index_dim.get_level("sex_id"),
        year_id=dim.index_dim.get_level("year_id"))
    index_cols = ["location_id", "year_id", "age_group_id", "sex_id"]
    data_cols = ["population"]

    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    sink.push(df[index_cols + data_cols])

    # location
    for set_id in agg_loc_sets:
        loc_tree = dbtrees.loctree(
            location_set_id=set_id, gbd_round_id=self.gbd_round_id)
        operator = Sum(
            index_cols=[col for col in index_cols if col != "location_id"],
            value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=[col for col in index_cols if col != "location_id"],
            aggregate_col="location_id",
            operator=operator)
        aggregator.run(loc_tree)

    # age
    for age_group_id in ComoSummaries._gbd_compare_age_group_list:
        age_tree = dbtrees.agetree(age_group_id)
        operator = Sum(
            index_cols=[col for col in index_cols if col != "age_group_id"],
            value_cols=data_cols)
        aggregator = AggSynchronous(
            draw_source=source,
            draw_sink=sink,
            index_cols=[col for col in index_cols if col != "age_group_id"],
            aggregate_col="age_group_id",
            operator=operator)
        aggregator.run(age_tree)

    # sex
    sex_tree = dbtrees.sextree()
    operator = Sum(
        index_cols=[col for col in index_cols if col != "sex_id"],
        value_cols=data_cols)
    aggregator = AggSynchronous(
        draw_source=source,
        draw_sink=sink,
        index_cols=[col for col in index_cols if col != "sex_id"],
        aggregate_col="sex_id",
        operator=operator)
    aggregator.run(sex_tree)

    df = source.content()
    df.to_hdf(
        "{}/info/population.h5".format(self.como_dir),
        'draws',
        mode='w',
        format='table',
        data_columns=["location_id", "year_id", "age_group_id", "sex_id"])
years = config['eligible_year_ids']
sexes = config['eligible_sex_ids']

# Read in all inputs
logging.info("Reading in all inputs for {}".format(location))
rescaled_dir = os.path.join(parent_dir, 'aggregated/rescaled')
shock_dir = os.path.join(parent_dir, 'aggregated/shocks')
input_file_pattern = '{measure_id}_{location_id}_{year_id}.h5'

logging.info("Rescaled draws...")
rescaled_params = {
    'draw_dir': rescaled_dir,
    'file_pattern': input_file_pattern
}
rescaled_ds = DrawSource(rescaled_params)
rescaled = rescaled_ds.content(filters={
    'location_id': location,
    'measure_id': 1
})

logging.info("Shock draws...")
shock_params = {
    'draw_dir': shock_dir,
    'file_pattern': input_file_pattern
}
shock_ds = DrawSource(shock_params)
shocks = shock_ds.content(filters={
    'location_id': location,
    'measure_id': 1
})
def _get_population(
        version: MachineParameters,
        location_set_id: int = constants.LocationSetId.OUTPUTS,
        agg_loc_sets: Optional[List[int]] = (
            constants.LocationAggregation.Ids.SPECIAL_LOCATIONS +
            [constants.LocationSetId.OUTPUTS])
) -> pd.DataFrame:
    """
    Unpacks arguments from the version object to use with the
    get_population function. Requests most detailed ages and most detailed
    sexes because age-sex population aggregates are created in the
    summarize module. Dependent on the demographics team to upload
    population for the majority of aggregate locations, but currently uses
    AggSynchronous to create population information for select Norway
    locations in LocationSetId.OUTPUTS.

    Arguments:
        version (MachineParameters): object containing all the demographic
            and configuration data needed to query population estimates.
        location_set_id (int): The id of the hierarchy to aggregate up.
        agg_loc_sets (list): Additional location sets to create special
            aggregates.

    Return:
        pd.DataFrame
    """
    pop = get_population(
        age_group_id=version.most_detailed_age_group_ids,
        location_id=version.location_ids,
        year_id=version.year_ids,
        sex_id=version.sex_ids,
        run_id=version.population_version_id,
        decomp_step=version.decomp_step,
        gbd_round_id=version.gbd_round_id)

    io_mock = {}
    source = DrawSource({"draw_dict": io_mock, "name": "tmp"}, mem_read_func)
    sink = DrawSink({"draw_dict": io_mock, "name": "tmp"}, mem_write_func)
    index_cols = constants.Columns.DEMOGRAPHIC_INDEX
    data_cols = [constants.Columns.POPULATION]
    sink.push(pop[index_cols + data_cols])

    # location
    if agg_loc_sets:
        assert len(agg_loc_sets) == len(set(agg_loc_sets))
        assert agg_loc_sets[-1] == constants.LocationSetId.OUTPUTS

        for set_id in agg_loc_sets:
            loc_tree = dbtrees.loctree(
                location_set_id=set_id,
                gbd_round_id=version.gbd_round_id)
            operator = Sum(
                index_cols=[col for col in index_cols
                            if col != constants.Columns.LOCATION_ID],
                value_cols=data_cols)
            aggregator = AggSynchronous(
                draw_source=source,
                draw_sink=sink,
                index_cols=[col for col in index_cols
                            if col != constants.Columns.LOCATION_ID],
                aggregate_col=constants.Columns.LOCATION_ID,
                operator=operator)
            aggregator.run(loc_tree)
        special_locations = source.content()
    else:
        special_locations = pd.DataFrame()

    return pd.concat(
        [
            pop,
            special_locations.loc[
                ~special_locations.location_id.isin(pop.location_id.unique())]
        ],
        ignore_index=True)
def location_aggregate_birth_counts(gbd_round_id: int,
                                    decomp_step: str,
                                    constants_path: pathlib.PosixPath,
                                    location_set_id: int) -> List[str]:
    """
    For a given gbd_round, decomp_step, and location_set_id, get a complete
    set of location-aggregated live births.
    """
    logger.info(f'aggregating for location_set_id {location_set_id}')
    multiple_tree_flag = (
        location_set_id in mmr_constants.MULTIPLE_ROOT_LOCATION_SET_IDS)

    scalars = get_regional_scalars(gbd_round_id, decomp_step)
    index_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    cov_estimate_filename = (
        mmr_constants.COV_ESTIMATES_FORMAT_FILENAME.format(location_set_id))

    region_locs, most_detailed_locs = get_location_level_sets(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        location_set_id=location_set_id)

    save_birth_count_estimates(
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cov_estimate_filepath=constants_path / cov_estimate_filename,
        location_set_id=location_set_id,
        most_detailed_locs=most_detailed_locs)

    loc_trees = dbtrees.loctree(location_set_id=location_set_id,
                                gbd_round_id=gbd_round_id,
                                decomp_step=decomp_step,
                                return_many=multiple_tree_flag)
    if not multiple_tree_flag:
        loc_trees = [loc_trees]

    draw_source = DrawSource(params={
        'draw_dir': str(constants_path),
        'file_pattern': cov_estimate_filename
    })

    i = 1
    output_filenames = []
    for loc_tree in loc_trees:
        output_filename = f'{location_set_id}_{i}.h5'
        i += 1
        draw_sink = DrawSink(params={
            'draw_dir': str(constants_path),
            'file_pattern': output_filename
        })
        draw_sink.add_transform(
            _apply_regional_scalars,
            regional_scalars_df=scalars.query('location_id in @region_locs'),
            gbd_round_id=gbd_round_id,
            decomp_step=decomp_step)

        op = Sum(index_cols=[s for s in index_cols if s != 'location_id'],
                 value_cols=[mmr_constants.Columns.LIVE_BIRTH_VALUE_COL])
        AggSynchronous(
            draw_source=draw_source,
            draw_sink=draw_sink,
            index_cols=[s for s in index_cols if s != 'location_id'],
            aggregate_col='location_id',
            operator=op).run(loc_tree, include_leaves=True)

        output_filenames.append(output_filename)

    return output_filenames