def write_summaries_multi(location_id, start_year, end_year, csv_dir, df,
                          index_cols, write_out_star_ids):
    """Write multi-year summary CSVs, one directory per measure.

    If *df* carries a ``rei_id`` column the frame is merged with REI-type
    metadata and each measure is split into separate risk and etiology
    files; otherwise a single plain summary file is written per measure.
    """
    logger.debug("Entering write summaries multi_year")
    year_dir = "DIRECTORY"
    write_columns_order = [
        'measure_id', 'year_start_id', 'year_end_id', 'location_id',
        'sex_id', 'age_group_id', 'cause_id', 'metric_id', 'mean',
        'upper', 'lower'
    ]
    has_rei = 'rei_id' in df.columns
    if has_rei:
        # rei_id goes directly after cause_id in the upload column layout.
        write_columns_order.insert(
            write_columns_order.index('cause_id') + 1, 'rei_id')
        # Attach REI types so risks and etiologies can be split below.
        df = pd.merge(df, get_rei_type_id_df(), on='rei_id')

    remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)

    for measure in (gbd.measures.DEATH, gbd.measures.DALY,
                    gbd.measures.YLD, gbd.measures.YLL):
        measure_df = df[df['measure_id'] == measure]
        if measure_df.empty:
            continue
        out_dir = '{d}/{m}/{y}'.format(d=csv_dir, m=measure, y=year_dir)
        if has_rei:
            # One file per REI type: risks, then etiologies.
            for fn_template, rei_type in (
                    ("upload_risk_{}_{}_{}.csv", RISK_REI_TYPE),
                    ("upload_eti_{}_{}_{}.csv", ETI_REI_TYPE)):
                out_file_name = fn_template.format(
                    location_id, start_year, end_year)
                df_to_csv(measure_df[measure_df['rei_type_id'] == rei_type],
                          index_cols, out_dir, out_file_name,
                          write_columns_order)
        else:
            out_file_name = "upload_summary_{}_{}_{}.csv".format(
                location_id, start_year, end_year)
            df_to_csv(measure_df, index_cols, out_dir, out_file_name,
                      write_columns_order)
def _run_pool(func, measures):
    """Run *func* across *measures* in a worker pool sized to the task list,
    always releasing the pool's worker processes when done."""
    pool = Pool(len(measures))
    try:
        return map_and_raise(pool, func, measures)
    finally:
        # The original code leaked worker processes; close/join releases them.
        pool.close()
        pool.join()


def run_pipeline_burdenator(args):
    """
    Run the entire dalynator pipeline.
    Typically called from run_all->qsub->run_remote_pipeline->here

    Will raise ValueError if input files are not present.

    :param args: parsed pipeline arguments (location, years, draw counts,
        input directories, and the write_out_* measure flags)
    :return: shape tuple of the final summary DataFrame
    """
    # Start logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START pipeline burdenator at {}".format(start_time))
    logger.info("START pipeline burdenator n_draws {}".format(args.n_draws))

    # Validate args before doing any heavy-lifting
    if not any([
            args.write_out_ylls_paf, args.write_out_ylds_paf,
            args.write_out_deaths_paf, args.write_out_dalys_paf
    ]):
        raise ValueError("must choose at least one of --ylls_paf, --ylds_paf,"
                         " --deaths_paf, or --dalys_paf ")
    # Fail fast: DALYs need both inputs. Previously this was only checked
    # after the (expensive) burdenation step.
    if args.write_out_dalys_paf and not (args.write_out_ylls_paf and
                                         args.write_out_ylds_paf):
        raise ValueError("Can't compute risk-attributable DALYs unless "
                         "both ylls and ylds are also provided")

    # Share args across processes
    MPGlobals.args = args
    MPGlobals.logger = logger

    # Get detailed ages
    MPGlobals.most_detailed_age_groups = MetricConverter.get_detailed_ages()

    logger.info("START pipeline burdenator before data_container ")
    # Create a DataContainer, cache data to be shared across processes
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        daly_dir=args.daly_dir,
        paf_dir=args.paf_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check,
        cache_dir=args.cache_dir)

    # Fetch PAF input from RF team
    logger.info("start apply PAFs, time = {}".format(time.time()))
    # range (not the Python-2-only xrange) keeps this portable to Python 3.
    yll_columns = ['paf_yll_{}'.format(x) for x in range(args.n_draws)]
    yld_columns = ['paf_yld_{}'.format(x) for x in range(args.n_draws)]
    draw_columns = ['draw_{}'.format(x) for x in range(args.n_draws)]
    pafs_filter = PAFInputFilter(yll_columns=yll_columns,
                                 yld_columns=yld_columns,
                                 draw_columns=draw_columns)
    paf_df = data_container['paf']
    pafs_filter.set_input_data_frame(paf_df)
    MPGlobals.pafs_filter = pafs_filter

    # Cache data and burdenate. Indexing the container primes its cache so
    # the worker processes find the data ready.
    measures = []
    if args.write_out_ylls_paf:
        measures.append('yll')
        data_container['yll']
    if args.write_out_ylds_paf:
        measures.append('yld')
        data_container['yld']
    if args.write_out_deaths_paf:
        measures.append('death')
        data_container['death']
    MPGlobals.data_container = data_container
    results = _run_pool(burdenate_caught, measures)

    # Compute DALYs and associated summaries, if requested
    if args.write_out_dalys_paf:
        measures.append('daly')
        yld_df = [i['draws'] for i in results if i['key'] == 'yld'][0]
        yll_df = [i['draws'] for i in results if i['key'] == 'yll'][0]
        daly_df = compute_dalys(yld_df[yld_df.measure_id == gbd.measures.YLD],
                                yll_df)
        results.append({'key': 'daly', 'draws': daly_df})

    # Write out meta-information for downstream aggregation step
    meta_df = pd.concat([get_dimensions(r['draws']) for r in results])
    meta_df = aggregate_dimensions(meta_df)
    meta_dict = generate_meta(meta_df)
    write_meta(args.out_dir, meta_dict)

    # Set the results as a Global, for use in summarization Pool
    MPGlobals.results = results

    # Summarize
    summ_df = _run_pool(summarize_caught, measures)
    summ_df = pd.concat(summ_df)
    summ_df = match_with_dimensions(summ_df, meta_df)
    summ_df.reset_index(drop=True, inplace=True)
    logger.info(
        "Risk attribution & daly computation complete, df shape {}".format(
            (summ_df.shape)))
    logger.info(" FINAL burdenator result shape {}".format(summ_df.shape))

    # Write out the year summaries as CSV files
    rei_types = get_rei_type_id_df()
    summ_df = summ_df.loc[summ_df['rei_id'] != 0]
    for measure_id in summ_df.measure_id.unique():
        for risk_type in [RISK_REI_TYPE, ETI_REI_TYPE]:
            # Get list of rei_ids of this type. Keep the Series as-is:
            # .squeeze() would collapse a single-row result to a scalar,
            # which breaks .isin() below.
            risks_of_type = rei_types[rei_types.rei_type_id == risk_type]
            risks_of_type = risks_of_type.rei_id

            # Compute filename
            summ_fn = get_summ_filename(args.out_dir, risk_type,
                                        args.location_id, args.year_id,
                                        measure_id)
            logger.info("Writing {}".format(summ_fn))

            # Write appropriate subset to file
            write_csv(
                summ_df[((summ_df.measure_id == measure_id) &
                         (summ_df.rei_id.isin(risks_of_type)))], summ_fn)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return summ_df.shape
def write_summaries(location_id, year_id, csv_dir, df, index_cols,
                    do_risk_aggr=False, write_out_star_ids=False):
    """Write single-year summary CSVs for each measure present in *df*.

    With ``do_risk_aggr`` the frame (which must carry ``rei_id``) is merged
    with REI-type metadata and each measure is split into risk and etiology
    upload files.  Without it, mortality measures (deaths, YLLs) are written
    as risk files and DALYs as a plain summary file.
    """
    # Upload column layout; rei_id slots in right after cause_id when the
    # risk-aggregation path is taken.
    write_columns_order = [
        'measure_id', 'year_id', 'location_id', 'sex_id', 'age_group_id',
        'cause_id', 'metric_id', 'mean', 'upper', 'lower'
    ]
    if do_risk_aggr:
        write_columns_order.insert(
            write_columns_order.index('cause_id') + 1, 'rei_id')
    write_columns_order = remove_unwanted_star_id_column(
        write_columns_order, write_out_star_ids)
    logger.debug("Entering write summaries")

    if do_risk_aggr:
        # Attach REI types so each measure can be split into risks vs
        # etiologies.
        df = pd.merge(df, get_rei_type_id_df(), on='rei_id')
        for measure_id in (gbd.measures.DEATH, gbd.measures.YLL,
                           gbd.measures.YLD, gbd.measures.DALY):
            measure_df = df[df['measure_id'] == measure_id]
            if measure_df.empty:
                continue
            logger.debug("rei non-zero {}".format(measure_id))
            this_out_dir = "{}/{}/single_year/".format(csv_dir, measure_id)
            logger.debug("this_out_dir={}".format(this_out_dir))
            out_file_name = ("upload_risk_" + str(location_id) + "_" +
                             str(year_id) + ".csv")
            df_to_csv(measure_df[measure_df['rei_type_id'] == RISK_REI_TYPE],
                      index_cols, this_out_dir, out_file_name,
                      write_columns_order)
            out_file_name = ("upload_eti_" + str(location_id) + "_" +
                             str(year_id) + ".csv")
            df_to_csv(measure_df[measure_df['rei_type_id'] == ETI_REI_TYPE],
                      index_cols, this_out_dir, out_file_name,
                      write_columns_order)
    else:
        # Mortality measures (deaths and YLLs) go out as "risk" files.
        for measure_id in (gbd.measures.DEATH, gbd.measures.YLL):
            measure_df = df[df['measure_id'] == measure_id]
            if measure_df.empty:
                continue
            logger.debug("rei_id is 0, measures 1 & 4, " +
                         "measure == {}".format(measure_id))
            this_out_dir = "{}/{}/single_year/".format(csv_dir, measure_id)
            out_file_name = ("upload_risk_" + str(location_id) + "_" +
                             str(year_id) + ".csv")
            df_to_csv(measure_df, index_cols, this_out_dir, out_file_name,
                      write_columns_order)
        # DALY results are written as a plain summary file.
        for measure_id in (gbd.measures.DALY,):
            measure_df = df[df['measure_id'] == measure_id]
            if measure_df.empty:
                continue
            logger.debug("measure 2, measure is {}".format(measure_id))
            this_out_dir = "{}/{}/single_year/".format(csv_dir, measure_id)
            logger.debug("this_out_dir={}".format(this_out_dir))
            out_file_name = ("upload_summary_" + str(location_id) + "_" +
                             str(year_id) + ".csv")
            df_to_csv(measure_df, index_cols, this_out_dir, out_file_name,
                      write_columns_order)