Example #1
def convert_to_rates(df):
    """Convert the values in df from number to rate space"""
    MPGlobals.logger.info("start converting to rates, time = "
                          "{}".format(time.time()))
    mc = MetricConverter(df,
                         to_rate=True,
                         data_container=MPGlobals.data_container,
                         include_pre_df=False)
    newdf = mc.get_data_frame()
    MPGlobals.logger.info("converting to rates complete, time = "
                          "{}".format(time.time()))
    return newdf
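
The conversion itself is delegated to MetricConverter, but conceptually a number-to-rate conversion just divides each draw column by the matching population. A minimal pandas sketch of that idea, using hypothetical column names and an assumed metric_id value rather than the project's real MetricConverter logic (pop_df is assumed to carry the demographic id columns plus a 'population' column):

import pandas as pd

def to_rate_space_sketch(number_df, pop_df, draw_cols):
    """Divide count-space draws by population to get rate-space draws."""
    demo_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id']
    merged = number_df.merge(pop_df, on=demo_cols, how='left')
    merged[draw_cols] = merged[draw_cols].div(merged['population'], axis=0)
    merged['metric_id'] = 3  # assumed to correspond to gbd.metrics.RATE
    return merged.drop('population', axis=1)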
Example #2
def compute_aggregates(df, n_draws, tool_name):
    """Aggregate sexes and ages in number space, then add rate (and
    percent/PAF) versions of the results.

    WARNING: THIS ALTERS DF!
    """
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))

    agga = partial(aggregate_ages, index_cols=index_cols,
                   draw_cols=draw_cols)
    aggs = partial(aggregate_sexes, index_cols=index_cols,
                   draw_cols=draw_cols)

    # Aggregate sex/age in numbers
    df = pd.concat([df, aggs(df)], sort=True)
    df = pd.concat([df, agga(df)], sort=True)

    if "burdenator" in tool_name:
        # Calculate rate space
        rate_df = convert_to_rates(df)
        dc.add_star_id(df)
        # Convert risk attr burden in count space to pct space (%)
        pct_df = back_calc_pafs(df, n_draws)
        df = pd.concat([df, rate_df, pct_df], sort=True)
    else:
        df = MetricConverter(df, to_rate=True, to_percent=True,
                             data_container=MPGlobals.data_container).get_data_frame()
    return df
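
The functools.partial calls above bind the index and draw columns once so each aggregator can then be applied with a single argument. A trimmed-down illustration of the same idiom, with a stand-in aggregate_sexes rather than the project's real helper:

from functools import partial

import pandas as pd

def aggregate_sexes_sketch(df, index_cols, draw_cols):
    # Stand-in: sum male/female rows into a both-sex row
    group_cols = [c for c in index_cols if c != 'sex_id']
    both = df.groupby(group_cols, as_index=False)[draw_cols].sum()
    both['sex_id'] = 3  # assumed to correspond to gbd.sex.BOTH
    return both

aggs = partial(aggregate_sexes_sketch,
               index_cols=['location_id', 'sex_id'],
               draw_cols=['draw_0', 'draw_1'])

df = pd.DataFrame({'location_id': [1, 1], 'sex_id': [1, 2],
                   'draw_0': [1.0, 2.0], 'draw_1': [3.0, 4.0]})
df = pd.concat([df, aggs(df)], sort=True)  # mirrors the pattern above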
Example #3
    def _cache_pop(self) -> None:
        """Caches the call to the database for population AND
        also adds in a fake population for location 44620 (Global) so that SDI
        aggregation does not fail due to a missing population value
        """
        logger.debug(
            "Starting to load population cache")
        core_index = [
            'location_id', 'year_id', 'age_group_id', 'sex_id']
        pop_ds = GetPopulationDataSource(
            "population", year_id=self.all_year_ids,
            location_id=self.full_location_ids,
            gbd_round_id=self.gbd_round_id,
            decomp_step=self.decomp_step,
            desired_index=core_index)

        pop_df = pop_ds.get_data_frame()
        try:
            pop_df = MetricConverter.aggregate_population(
                pop_df)
        except ValueError as e:
            if str(e) != "No objects to concatenate":
                raise

        if len(pop_df[pop_df.location_id == 44620]) == 0:
            # Copy the location_id 1 population and relabel it as 44620 so
            # SDI aggregation has a population row for that location
            pop_df = pop_df.append(pop_df[pop_df.location_id == 1].replace(
                {'location_id': {1: 44620}}))
        cache_file = "{}/pop.h5".format(
            self.cache_dir)
        pop_df.to_hdf(cache_file, "pop", data_columns=core_index,
                      format="table")
        logger.debug(
            "Cached population in {}".format(cache_file))
Example #4
def run_pipeline_burdenator(args):
    """
    Run the entire dalynator pipeline. Typically called from
    run_all->qsub->run_remote_pipeline->here

    Will raise ValueError if input files are not present.

    :param args:
    :return:
    """
    # Start logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START pipeline burdenator at {}".format(start_time))
    logger.info("START pipeline burdenator n_draws {}".format(args.n_draws))
    # Validate args before doing any heavy-lifting
    if not any([
            args.write_out_ylls_paf, args.write_out_ylds_paf,
            args.write_out_deaths_paf, args.write_out_dalys_paf
    ]):
        raise ValueError("must choose at least one of --ylls_paf, --ylds_paf,"
                         " --deaths_paf, or --dalys_paf ")

    # Share args across processes
    MPGlobals.args = args
    MPGlobals.logger = logger

    # Get detailed ages
    MPGlobals.most_detailed_age_groups = MetricConverter.get_detailed_ages()

    logger.info("START pipeline burdenator before data_container ")
    # Create a DataContainer, cache data to be shared across processes
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        daly_dir=args.daly_dir,
        paf_dir=args.paf_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check,
        cache_dir=args.cache_dir)

    # Fetch PAF input from RF team
    logger.info("start apply PAFs, time = {}".format(time.time()))
    yll_columns = ['paf_yll_{}'.format(x) for x in xrange(args.n_draws)]
    yld_columns = ['paf_yld_{}'.format(x) for x in xrange(args.n_draws)]
    draw_columns = ['draw_{}'.format(x) for x in xrange(args.n_draws)]
    pafs_filter = PAFInputFilter(yll_columns=yll_columns,
                                 yld_columns=yld_columns,
                                 draw_columns=draw_columns)
    paf_df = data_container['paf']
    pafs_filter.set_input_data_frame(paf_df)
    MPGlobals.pafs_filter = pafs_filter

    # Cache data and burdenate. Accessing each measure key below loads that
    # measure into the shared DataContainer cache before the worker pool forks
    measures = []
    if args.write_out_ylls_paf:
        measures.append('yll')
        data_container['yll']
    if args.write_out_ylds_paf:
        measures.append('yld')
        data_container['yld']
    if args.write_out_deaths_paf:
        measures.append('death')
        data_container['death']

    MPGlobals.data_container = data_container
    pool_size = len(measures)
    pool = Pool(pool_size)
    results = map_and_raise(pool, burdenate_caught, measures)

    # Compute DALYs and associated summaries, if requested
    if args.write_out_dalys_paf:
        if not (args.write_out_ylls_paf and args.write_out_ylds_paf):
            raise ValueError("Can't compute risk-attributable DALYs unless "
                             "both ylls and ylds are also provided")
        measures.append('daly')
        yld_df = [i['draws'] for i in results if i['key'] == 'yld'][0]
        yll_df = [i['draws'] for i in results if i['key'] == 'yll'][0]
        daly_df = compute_dalys(yld_df[yld_df.measure_id == gbd.measures.YLD],
                                yll_df)
        results.append({'key': 'daly', 'draws': daly_df})

    # Write out meta-information for downstream aggregation step
    meta_df = pd.concat([get_dimensions(r['draws']) for r in results])
    meta_df = aggregate_dimensions(meta_df)
    meta_dict = generate_meta(meta_df)
    write_meta(args.out_dir, meta_dict)

    # Set the results as a Global, for use in summarization Pool
    MPGlobals.results = results

    # Summarize
    pool_size = len(measures)
    pool = Pool(pool_size)
    summ_df = map_and_raise(pool, summarize_caught, measures)

    summ_df = pd.concat(summ_df)
    summ_df = match_with_dimensions(summ_df, meta_df)
    summ_df.reset_index(drop=True, inplace=True)

    logger.info(
        "Risk attribution & daly computation complete, df shape {}".format(
            (summ_df.shape)))

    logger.info("  FINAL burdenator result shape {}".format(summ_df.shape))

    # Write out the year summaries as CSV files
    rei_types = get_rei_type_id_df()
    summ_df = summ_df.loc[summ_df['rei_id'] != 0]
    for measure_id in summ_df.measure_id.unique():
        for risk_type in [RISK_REI_TYPE, ETI_REI_TYPE]:

            # Get list of rei_ids of this type
            risks_of_type = rei_types[rei_types.rei_type_id == risk_type]
            risks_of_type = risks_of_type.rei_id.squeeze()

            # Compute filename
            summ_fn = get_summ_filename(args.out_dir, risk_type,
                                        args.location_id, args.year_id,
                                        measure_id)
            logger.info("Writing {}".format(summ_fn))

            # Write appropriate subset to file
            write_csv(
                summ_df[((summ_df.measure_id == measure_id) &
                         (summ_df.rei_id.isin(risks_of_type)))], summ_fn)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= "
                "{}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    return summ_df.shape
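
burdenate_caught and map_and_raise are not shown on this page; the naming suggests the usual pattern of trapping exceptions inside pool workers and re-raising them in the parent, since an exception that escapes a multiprocessing worker is otherwise easy to lose. A generic sketch of that pattern (the names, return shape, and behavior here are assumptions, not the project's actual helpers):

from multiprocessing import Pool

def burdenate_caught_sketch(measure):
    """Worker wrapper: return the exception instead of letting it kill the pool."""
    try:
        return {'key': measure, 'draws': None}  # placeholder for the real work
    except Exception as e:
        return e

def map_and_raise_sketch(pool, func, items):
    """Map func over items, then re-raise any exception a worker returned."""
    results = pool.map(func, items)
    errors = [r for r in results if isinstance(r, Exception)]
    if errors:
        raise errors[0]
    return results

if __name__ == '__main__':
    pool = Pool(3)
    results = map_and_raise_sketch(pool, burdenate_caught_sketch,
                                   ['yll', 'yld', 'death'])
    pool.close()
    pool.join()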
Example #5
    def _compute_most_detailed_df(self):
        """
        Computations only, does not write files. Makes testing easier.
        """

        start_time = time.time()
        logger.info("START location-year pipeline at {}".format(start_time))

        # Create a DataContainer
        data_container = DataContainer(
            {
                'location_id': self.location_id,
                'year_id': self.year_id
            },
            n_draws=self.n_draws,
            gbd_round_id=self.gbd_round_id,
            epi_dir=self.epi_dir,
            cod_dir=self.cod_dir,
            cache_dir=self.cache_dir,
            turn_off_null_and_nan_check=self.turn_off_null_and_nan_check)
        yll_df = data_container['yll']
        yld_df = data_container['yld']

        # Compute DALYs
        draw_cols = list(yll_df.filter(like='draw').columns)
        index_cols = list(set(yll_df.columns) - set(draw_cols))
        computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        df = computer.get_data_frame()

        logger.info("DALY computation complete, df shape {}".format(
            (df.shape)))
        logger.info(" input DF age_group_id {}".format(
            df['age_group_id'].unique()))

        draw_cols = list(df.filter(like='draw').columns)
        index_cols = list(set(df.columns) - set(draw_cols))
        existing_age_groups = df['age_group_id'].unique()

        logger.info("Preparing for sex aggregation")

        # Do sex aggregation
        my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
        df = my_sex_aggr.get_data_frame()
        logger.info("Sex aggregation complete")

        # Do age aggregation
        my_age_aggr = AgeAggregator(df,
                                    draw_cols,
                                    index_cols,
                                    data_container=data_container)
        df = my_age_aggr.get_data_frame()
        logger.info("Age aggregation complete")

        # Convert to rate and % space
        df = MetricConverter(df,
                             to_rate=True,
                             to_percent=True,
                             data_container=data_container).get_data_frame()

        logger.debug("new  DF age_group_id {}".format(
            df['age_group_id'].unique()))
        logger.info("  FINAL dalynator result shape {}".format(df.shape))
        end_time = time.time()
        elapsed = end_time - start_time
        logger.info(
            "DONE location-year pipeline at {}, elapsed seconds= {}".format(
                end_time, elapsed))
        logger.info("{}".format(SUCCESS_LOG_MESSAGE))

        return df, existing_age_groups
Example #6
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, cod_pattern, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           decomp_step, write_out_star_ids, cache_dir,
                           dual_upload):
    """Take a set of aggregated results and reformat them into draws consistent
    with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for
            cod (CoDCorrect) files are stored
        cod_pattern (str): file pattern for accessing CoD-or-FauxCorrect
            draws.  Example: '{measure_id}_{location_id}.h5'
        epi_dir (str): directory where the cause-level envelope for
            epi (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output
            draw files and CSV upload files
        dual_upload (bool): If True upload to column store as well
            as the gbd database.  Currently not implemented.
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')
    # df contains Attributable Burden, which is in Number space.
    # It is a subset of the total count for the parent metric,
    # ie AB of YLLs for a cause attributable to a risk
    # (or to all known & unknown risks, ie rei_id == 0)

    # df is a list of data frames
    df = []
    for metric in ['burden']:
        input_file_pattern = ('FILEPATH')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(
            input_file_pattern.format(root=aggregated_draw_dir,
                                      metric=metric,
                                      location_id=location_id,
                                      year_id=year_id,
                                      measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(
                like='_id').columns].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {
            'location_id': location_id,
            'year_id': year_id
        },
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        decomp_step=decomp_step,
        cod_dir=cod_dir,
        cod_pattern=cod_pattern,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df has all-cause mortality/whatever, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()

    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data

    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df], sort=True)
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                 (df['age_group_id'].isin(most_detailed_age_groups)) &
                 (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating sexes complete, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df,
                                draw_cols,
                                index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # df does not contain AB's any more, because they are RATES

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)

    # back_calc_pafs is part of the most detailed pipeline, reused from here.
    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df], sort=True)
    logger.info("back-calculating PAFs complete, time = {}".format(
        time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "FILEPATH".format(draw_dir, location_id)
    write_sum.write_summaries(location_id,
                              year_id,
                              csv_dir,
                              df,
                              index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids,
                              dual_upload=dual_upload)

    # Save draws
    df = df.loc[(
        (df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
        (df['age_group_id'].isin(most_detailed_age_groups)) &
        (df['metric_id'].isin([gbd.metrics.NUMBER, gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('FILEPATH')
    output_file_path = output_file_pattern.format(location_id=location_id,
                                                  year_id=year_id,
                                                  measure_id=measure_id)
    filename = "FILEPATH".format(draw_dir, output_file_path)
    remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename, complib="zlib", complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
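
back_calc_pafs is likewise not shown, but back-calculating a PAF from count space amounts to dividing each risk-attributable draw by the matching all-risk envelope draw for the same cause and demographic (the rows tagged rei_id == 0 above). A conceptual pandas sketch with assumed column names, ignoring the edge cases the real function presumably handles:

import pandas as pd

def back_calc_pafs_sketch(df, draw_cols):
    """PAF draws = attributable-number draws / all-risk envelope draws."""
    demo_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id', 'cause_id']
    envelope = df.loc[df.rei_id == 0, demo_cols + draw_cols]
    pafs = df.loc[df.rei_id != 0].merge(envelope, on=demo_cols,
                                        suffixes=('', '_env'))
    env_cols = [c + '_env' for c in draw_cols]
    pafs[draw_cols] = pafs[draw_cols].values / pafs[env_cols].values
    pafs['metric_id'] = 2  # assumed to correspond to gbd.metrics.PERCENT
    return pafs.drop(env_cols, axis=1)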
Example #7
def run_burdenator_cleanup(out_dir, location_id, year_id, n_draws, measure_id,
                           cod_dir, epi_dir,
                           turn_off_null_and_nan_check, gbd_round_id,
                           write_out_star_ids,
                           cache_dir):
    """Take a set of aggregated results and reformat them into draws consistent
    with the most-detailed location draws.

    Args:
        out_dir (str): the root directory for this burdenator run
        location_id (int): location_id of the aggregate location
        year_id (int): year of the aggregate location
        n_draws (int): the number of draw columns in the H5 data frames,
            greater than zero
        measure_id (int): measure_id of the aggregate location
        cod_dir (str): directory where the cause-level envelope for
            cod (CoDCorrect) files are stored
        epi_dir (str): directory where the cause-level envelope for
            epi (COMO) files are stored
        turn_off_null_and_nan_check (bool): Disable checks for NaNs and Nulls
        write_out_star_ids (bool): If true, include star_ids in output
            draw files and CSV upload files
    """
    MPGlobals.logger = logger
    start_time = time.time()
    logger.info("START pipeline burdenator cleanup at {}".format(start_time))

    # Get aggregated draws
    logger.info("start append files, time = {}".format(time.time()))
    draw_dir = os.path.join(out_dir, 'draws')
    aggregated_draw_dir = os.path.join(out_dir, 'loc_agg_draws')

    df = []
    for metric in ['burden']:
        input_file_pattern = ('{root}/{metric}/'
                              '{location_id}/{measure_id}/'
                              '{measure_id}_{year_id}_{location_id}_*.h5')
        logger.debug("Cleanup file pattern {}".format(
            input_file_pattern.format(root=aggregated_draw_dir, metric=metric,
                                      location_id=location_id, year_id=year_id,
                                      measure_id=measure_id)))
        draw_files = glob.glob(input_file_pattern.format(
            root=aggregated_draw_dir, metric=metric, location_id=location_id,
            year_id=year_id, measure_id=measure_id))
        for f in draw_files:
            logger.info("appending {}".format(f))
            this_df = pd.read_hdf('{}'.format(f))
            dups = this_df[this_df.filter(like='_id').columns
                           ].duplicated().any()
            if dups:
                msg = ("Duplicates found in location aggregate output "
                       "file {}. Failing this cleanup job".format(f))
                logger.error(msg)
                raise RuntimeError(msg)
            df.append(this_df)
    df = pd.concat(df)
    logger.info("append files complete, time = {}".format(time.time()))
    logger.info("columns appended df {}".format(get_index_columns(df)))
    add_star_id(df)

    # Get cause envelope
    data_container = DataContainer(
        {'location_id': location_id,
         'year_id': year_id},
        n_draws=n_draws,
        gbd_round_id=gbd_round_id,
        cod_dir=cod_dir,
        epi_dir=epi_dir,
        turn_off_null_and_nan_check=turn_off_null_and_nan_check,
        cache_dir=cache_dir)
    MPGlobals.data_container = data_container

    # cause_env_df has all-cause, without risks
    if measure_id == gbd.measures.DEATH:
        cause_env_df = data_container['death']
    elif measure_id == gbd.measures.YLL:
        cause_env_df = data_container['yll']
    elif measure_id == gbd.measures.YLD:
        cause_env_df = data_container['yld']
    elif measure_id == gbd.measures.DALY:
        # Get YLLs and YLDs
        yll_df = data_container['yll']
        yld_df = data_container['yld']
        yld_df = yld_df.loc[yld_df.measure_id == gbd.measures.YLD]
        # Compute DALYs
        draw_cols = list(yld_df.filter(like='draw').columns)
        index_cols = list(set(yld_df.columns) - set(draw_cols))
        daly_ce = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
        cause_env_df = daly_ce.get_data_frame()
    cause_env_df['rei_id'] = gbd.risk.TOTAL_ATTRIBUTABLE
    cause_env_df['star_id'] = gbd.star.ANY_EVIDENCE_LEVEL

    # Concatenate cause envelope with data
    most_detailed_age_groups = MetricConverter.get_detailed_ages()
    df = pd.concat([df, cause_env_df])
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                (df['age_group_id'].isin(most_detailed_age_groups)) &
                (df['metric_id'] == gbd.metrics.NUMBER))]

    # Do sex aggregation
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    logger.info("start aggregating sexes, time = {}".format(time.time()))
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("aggregating sexes complete, time = {}".format(time.time()))

    # Do age aggregation
    logger.info("start aggregating ages, time = {}".format(time.time()))
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("aggregating ages complete, time = {}".format(time.time()))

    # Convert to rate space
    logger.info("start converting to rates, time = {}".format(time.time()))
    df = MetricConverter(df, to_rate=True,
                         data_container=data_container).get_data_frame()
    logger.info("converting to rates complete, time = {}".format(time.time()))

    # Back-calculate PAFs
    logger.info("start back-calculating PAFs, time = {}".format(time.time()))
    to_calc_pafs = ((df['metric_id'] == gbd.metrics.NUMBER) |
                    (df['age_group_id'] == gbd.age.AGE_STANDARDIZED))
    pafs_df = df.loc[to_calc_pafs].copy(deep=True)

    pafs_df = back_calc_pafs(pafs_df, n_draws)
    df = pd.concat([df, pafs_df])
    logger.info("back-calculating PAFs complete, time = {}"
                .format(time.time()))

    # Calculate and write out summaries as CSV files
    csv_dir = "{}/{}/upload/".format(draw_dir, location_id)
    write_sum.write_summaries(location_id, year_id, csv_dir, df, index_cols,
                              do_risk_aggr=True,
                              write_out_star_ids=write_out_star_ids)

    # Save draws
    df = df.loc[((df['sex_id'].isin([gbd.sex.MALE, gbd.sex.FEMALE])) &
                (df['age_group_id'].isin(most_detailed_age_groups)) &
                (df['metric_id'].isin([gbd.metrics.NUMBER,
                                      gbd.metrics.PERCENT])))]
    logger.info("start saving draws, time = {}".format(time.time()))
    output_file_pattern = ('{location_id}/'
                           '{measure_id}_{location_id}_{year_id}.h5')
    output_file_path = output_file_pattern.format(
        location_id=location_id, year_id=year_id, measure_id=measure_id)
    filename = "{}/{}".format(draw_dir, output_file_path)
    df = remove_unwanted_stars(df, write_out_star_ids=write_out_star_ids)
    sink = HDFDataSink(filename,
                       complib="zlib",
                       complevel=1)
    sink.write(df)
    logger.info("saving output draws complete, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
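
HDFDataSink is project code, but judging from the arguments it receives here (complib, complevel, and elsewhere data_columns), it presumably wraps pandas' own HDF writer. A rough equivalent using DataFrame.to_hdf, shown only to illustrate what the compression and data-column options buy:

import pandas as pd

def write_draws_sketch(df, filename, key='draws'):
    """Write draws to HDF5 with light zlib compression, indexing *_id columns."""
    id_cols = [col for col in df.columns if col.endswith('_id')]
    df.to_hdf(filename, key,
              format='table',        # queryable table layout
              data_columns=id_cols,  # allows 'where' filters on the id columns
              complib='zlib',
              complevel=1)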
Example #8
def run_pipeline(args):
    """
    Run the entire dalynator pipeline. Typically called from run_all->qsub->run_remote_pipeline->here

    Will raise ValueError if input files are not present.

    TBD Refactor as a ComputationElement followed by a DataSink at the end
    :param args:
    :return:
    """

    logger = logging.getLogger(__name__)
    start_time = time.time()
    logger.info("START location-year pipeline at {}".format(start_time))

    # Create a DataContainer
    data_container = DataContainer(
        location_id=args.location_id,
        year_id=args.year_id,
        n_draws=args.n_draws,
        gbd_round_id=args.gbd_round_id,
        epi_dir=args.epi_dir,
        cod_dir=args.cod_dir,
        cache_dir=args.cache_dir,
        turn_off_null_and_nan_check=args.turn_off_null_and_nan_check)
    yll_df = data_container['yll']
    yld_df = data_container['yld']

    # Compute DALYs
    draw_cols = list(yll_df.filter(like='draw').columns)
    index_cols = list(set(yll_df.columns) - set(draw_cols))
    computer = ComputeDalys(yll_df, yld_df, draw_cols, index_cols)
    df = computer.get_data_frame()

    logger.info("DALY computation complete, df shape {}".format((df.shape)))
    logger.info(" input DF age_group_id {}".format(df['age_group_id'].unique()))

    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))
    existing_age_groups = df['age_group_id'].unique()

    logger.info("Preparing for sex aggregation")

    # Do sex aggregation
    my_sex_aggr = SexAggregator(df, draw_cols, index_cols)
    df = my_sex_aggr.get_data_frame()
    logger.info("Sex aggregation complete")

    # Do age aggregation
    my_age_aggr = AgeAggregator(df, draw_cols, index_cols,
                                data_container=data_container)
    df = my_age_aggr.get_data_frame()
    logger.info("Age aggregation complete")

    # Convert to rate and % space
    df = MetricConverter(df, to_rate=True, to_percent=True,
                         data_container=data_container).get_data_frame()

    logger.debug("new  DF age_group_id {}".format(df['age_group_id'].unique()))
    logger.info("  FINAL dalynator result shape {}".format(df.shape))

    # Calculate and write out the year summaries as CSV files
    draw_cols = list(df.filter(like='draw').columns)
    index_cols = list(set(df.columns) - set(draw_cols))

    csv_dir = args.out_dir + '/upload/'
    write_sum.write_summaries(args.location_id, args.year_id, csv_dir, df, index_cols, False, args.gbd_round_id)

    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE location-year pipeline at {}, elapsed seconds= {}".format(end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))

    # Adding any index-like column to the HDF index for later random access
    filename = get_input_args.calculate_output_filename(args.out_dir, gbd.measures.DALY, args.location_id, args.year_id)
    if args.no_sex_aggr:
        df = df[df['sex_id'] != gbd.sex.BOTH]

    if args.no_age_aggr:
        df = df[df['age_group_id'].isin(existing_age_groups)]

    sink = HDFDataSink(filename,
                       data_columns=[col for col in df if col.endswith("_id")],
                       complib="zlib", complevel=1)
    sink.write(df)
    logger.info("DONE write DF {}".format(time.time()))

    return df.shape
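
ComputeDalys is not shown on this page, but DALYs are by definition YLLs plus YLDs for the same cause and demographic, so the heart of the computation is an aligned, draw-wise sum. A conceptual sketch with assumed alignment columns, ignoring the validation the real class presumably performs:

import pandas as pd

def compute_dalys_sketch(yll_df, yld_df, draw_cols):
    """DALY draws = YLL draws + YLD draws, aligned on cause and demographics."""
    align_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id', 'cause_id']
    yll = yll_df.set_index(align_cols)[draw_cols]
    yld = yld_df.set_index(align_cols)[draw_cols]
    daly = yll.add(yld, fill_value=0).reset_index()
    daly['measure_id'] = 2  # assumed to correspond to gbd.measures.DALY
    return daly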