def aggregate_sexes(df, draw_cols, index_cols):
    """Take a DataFrame of sex-specific draws and aggregate them to both
    sexes combined. The input frame is not modified; only the both-sex
    draws are returned."""
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols,
                                data_container=MPGlobals.data_container,
                                include_pre_df=False)
    df = my_sex_aggr.get_data_frame()
    MPGlobals.logger.debug("sex_aggregation complete, df shape "
                           "{}".format(df.shape))
    return df
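# Hypothetical usage sketch (not called anywhere in the pipeline): it shows
# the draw/index column split this module uses before calling
# aggregate_sexes, and assumes the caller has already set
# MPGlobals.data_container, as the real burdenator jobs do.
def _example_aggregate_sexes_usage(df):
    """Illustrative only; derives the column lists the same way the
    pipeline derives them, then returns the both-sex draws."""
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    return aggregate_sexes(df, draw_cols, index_cols)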
def _compute_most_detailed_df(self):
    """Computations only, does not write files. Makes testing easier."""
    start_time = time.time()
    logger.info("START location-year pipeline at {}".format(start_time))

    # Create a DataContainer
    data_container = DataContainer(
        {'location_id': self.location_id,
         'year_id': self.year_id},
        n_draws=self.n_draws,
        gbd_round_id=self.gbd_round_id,
        epi_dir=self.epi_dir,
        cod_dir=self.cod_dir,
        cache_dir=self.cache_dir,
        turn_off_null_and_nan_check=self.turn_off_null_and_nan_check)

    yll_df = data_container['yll']
    yld_df = data_container['yld']

    # Compute DALYs
    draw_cols = list(yll_df.filter(like='draw').columns)
    index_cols = list(set(yll_df.columns) - set(draw_cols))
    computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
    df = computer.get_data_frame()
    logger.info("DALY computation complete, df shape {}".format(df.shape))
    logger.info(" input DF age_group_id {}".format(
        df['age_group_id'].unique()))

    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    existing_age_groups = df['age_group_id'].unique()

    logger.info("Preparing for sex aggregation")

    # Do sex aggregation
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("Sex aggregation complete")

    # Do age aggregation
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("Age aggregation complete")

    # Convert to rate and % space
    df = MetricConverter(df, to_rate=True, to_percent=True,
                         data_container=data_container).get_data_frame()
    logger.debug("new DF age_group_id {}".format(
        df['age_group_id'].unique()))
    logger.info(" FINAL dalynator result shape {}".format(df.shape))

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info(
        "DONE location-year pipeline at {}, elapsed seconds= {}".format(
            end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return df, existing_age_groups
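# Minimal sketch of the draw-level arithmetic behind ComputeDalys; this is an
# assumption for illustration, not the actual class. It relies on the GBD
# definition DALYs = YLLs + YLDs, summed draw-by-draw on the shared index
# columns, with measure_id excluded from the group keys because it differs
# between the two inputs.
def _example_daly_arithmetic(yll_df, yld_df, draw_cols, index_cols):
    """Illustrative only: add YLL and YLD draws to approximate DALY draws.
    The real ComputeDalys class also validates inputs and column sets."""
    group_cols = [c for c in index_cols if c != 'measure_id']
    combined = pd.concat([yll_df, yld_df], sort=True)
    daly_df = combined.groupby(group_cols, as_index=False)[draw_cols].sum()
    daly_df['measure_id'] = gbd.measures.DALY
    return daly_df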
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws,
                           measure_id, cod_dir, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           write_out_star_ids, cache_dir):
    """Take a set of aggregated results and reformat them into draws
    consistent with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for cod
            (CoDCorrect) files are stored
        epi_dir (str): directory where the cause-level envelope for epi
            (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output draw
            files and CSV upload files
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    df = []
    for metric in ['burden']:
        input_file_pattern = ('{root}/{metric}/'
                              '{location_id}/{measure_id}/'
                              '{measure_id}_{year_id}_{location_id}_*.h5')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(input_file_pattern.format(
            root=aggregated_draw_dir, metric=metric,
            location_id=location_id, year_id=year_id,
            measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(like='_id').columns
                           ].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {'location_id': location_id, 'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        cod_dir=cod_dir,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df has all-cause, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df])
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating sexes complete, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df])
    logger.info("back-calculating PAFs complete, time = {}"
                .format(time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "{}/{}/upload/".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids)

    # Save draws
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'].isin([gbd.metrics.NUMBER,
                                        gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('{location_id}/'
                           '{measure_id}_{location_id}_{year_id}.h5')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "{}/{}".format(draw_dir, output_file_path)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
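# Illustrative sketch only: an assumption about what "back-calculating PAFs"
# means in this context (attributable burden divided by the all-risk cause
# envelope, draw by draw), not the actual back_calc_pafs implementation.
# All column handling is simplified relative to the real pipeline.
def _example_back_calc_paf(attrib_df, envelope_df, draw_cols, index_cols):
    """Divide risk-attributable number-space draws by the matching
    rei_id == TOTAL_ATTRIBUTABLE envelope draws, yielding percent-space
    values (PAFs)."""
    merge_cols = [c for c in index_cols
                  if c not in ('rei_id', 'star_id', 'metric_id')]
    merged = attrib_df.merge(envelope_df[merge_cols + draw_cols],
                             on=merge_cols, suffixes=('', '_env'))
    env_cols = [c + '_env' for c in draw_cols]
    merged[draw_cols] = merged[draw_cols].values / merged[env_cols].values
    merged['metric_id'] = gbd.metrics.PERCENT
    return merged.drop(columns=env_cols)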
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws,
                           measure_id, cod_dir, cod_pattern, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           decomp_step, write_out_star_ids, cache_dir,
                           dual_upload):
    """Take a set of aggregated results and reformat them into draws
    consistent with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for cod
            (CoDCorrect) files are stored
        cod_pattern (str): file pattern for accessing CoD-or-FauxCorrect
            draws. Example: '{measure_id}_{location_id}.h5'
        epi_dir (str): directory where the cause-level envelope for epi
            (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output draw
            files and CSV upload files
        dual_upload (bool): If True, upload to the column store as well as
            the gbd database. Currently not implemented.
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warning('is when this event was logged.')

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    # df contains Attributable Burden, which is in Number space.
    # It is a subset of the total count for the parent metric,
    # i.e. the AB of YLLs for a cause attributable to a risk
    # (or to all known & unknown risks, i.e. rei_id == 0).
    # df starts out as a list of data frames.
    df = []
    for metric in ['burden']:
        input_file_pattern = ('FILEPATH')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(
                like='_id').columns].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {'location_id': location_id,
         'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cod_dir=cod_dir,
        cod_pattern=cod_pattern,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df holds the all-cause envelope for this measure, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df], sort=True)
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating sexes complete, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))
    # df no longer contains ABs, because they are now rates

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)
    # back_calc_pafs is part of the most-detailed pipeline, reused from here.
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df], sort=True)
    logger.info("back-calculating PAFs complete, time = {}".format(
        time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "FILEPATH".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids,
                              dual_upload=dual_upload)

    # Save draws
    df = df.loc[(
        (df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
        (df['age_group_id'].isin(most_detailed_age_groups)) &
        (df['metric_id'].isin([gbd.metrics.NUMBER, gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('FILEPATH')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "FILEPATH".format(draw_dir, output_file_path)
    remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
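# Hypothetical invocation sketch for the signature above. Every path, id, and
# round/step value is illustrative only; in practice this job is launched by
# the burdenator's job-orchestration layer rather than called by hand.
def _example_run_cleanup():
    run_burdenator_cleanup(out_dir='/path/to/burdenator_run',
                           location_id=101, year_id=2019, n_draws=1000,
                           measure_id=gbd.measures.DALY,
                           cod_dir='/path/to/codcorrect/draws',
                           cod_pattern='{measure_id}_{location_id}.h5',
                           epi_dir='/path/to/como/draws',
                           turn_off_null_and_nan_check=False,
                           gbd_round_id=6, decomp_step='step4',
                           write_out_star_ids=True,
                           cache_dir='/path/to/cache',
                           dual_upload=False)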
def run_pipeline(args):
    """Run the entire dalynator pipeline. Typically called from
    run_all -> qsub -> run_remote_pipeline -> here.

    Will throw ValueError if input files are not present.

    TBD: Refactor as a ComputationElement followed by a DataSink at the end.

    :param args: parsed arguments for this location-year job
    :return: the shape of the final DataFrame
    """
    logger = logging.getLogger(__name__)

    start_time = time.time()
    logger.info("START location-year pipeline at {}".format(start_time))

    # Create a DataContainer
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        cache_dir=args.cache_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check)

    yll_df = data_container['yll']
    yld_df = data_container['yld']

    # Compute DALYs
    draw_cols = list(yll_df.filter(like='draw').columns)
    index_cols = list(set(yll_df.columns) - set(draw_cols))
    computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
    df = computer.get_data_frame()
    logger.info("DALY computation complete, df shape {}".format(df.shape))
    logger.info(" input DF age_group_id {}".format(
        df['age_group_id'].unique()))

    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    existing_age_groups = df['age_group_id'].unique()

    logger.info("Preparing for sex aggregation")

    # Do sex aggregation
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("Sex aggregation complete")

    # Do age aggregation
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("Age aggregation complete")

    # Convert to rate and % space
    df = MetricConverter(df, to_rate=True, to_percent=True,
                         data_container=data_container).get_data_frame()
    logger.debug("new DF age_group_id {}".format(
        df['age_group_id'].unique()))
    logger.info(" FINAL dalynator result shape {}".format(df.shape))

    # Calculate and write out the year summaries as CSV files
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    csv_dir = args.out_dir + '/upload/'
    write_sum.write_summaries(args.location_id, args.year_id, csv_dir, df,
                              index_cols, False, args.gbd_round_id)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= {}"
                .format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    # Adding any index-like column to the HDF index for later random access
    filename = get_input_args.calculate_output_filename(args.out_dir,
                                                        gbd.measures.DALY,
                                                        args.location_id,
                                                        args.year_id)
    if args.no_sex_aggr:
        df = df[df['sex_id'] != gbd.sex.BOTH]
    if args.no_age_aggr:
        df = df[df['age_group_id'].isin(existing_age_groups)]
    sink = HDFDataSink(filename,
                       data_columns=[col for col in df
                                     if col.endswith("_id")],
                       complib="zlib", complevel=1)
    sink.write(df)
    logger.info("DONE write DF {}".format(time.time()))

    return df.shape
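# Hypothetical sketch of driving run_pipeline directly; normally the args
# object comes from run_all -> qsub -> run_remote_pipeline. Every value below
# is illustrative, and argparse.Namespace simply mimics the parsed-argument
# object the function expects.
def _example_run_pipeline_call():
    from argparse import Namespace
    args = Namespace(location_id=101, year_id=2019, n_draws=1000,
                     gbd_round_id=6,
                     epi_dir='/path/to/como/draws',
                     cod_dir='/path/to/codcorrect/draws',
                     cache_dir='/path/to/cache',
                     out_dir='/path/to/output',
                     turn_off_null_and_nan_check=False,
                     no_sex_aggr=False, no_age_aggr=False)
    return run_pipeline(args)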