Exemple #1
0
 def im_draw(df, draw_num, surv_uids):
     ''' Returns the survival uid columns of df together with an incremental
             mortality column derived from the absolute survival column

         Args:
             df: dataframe holding the surv_uids columns (which must include
                 'survival_month') and the absolute survival column
             draw_num: index of the draw being processed; only used to label
                 the failure message
             surv_uids: list of columns identifying one survival row

         Returns:
             dataframe with columns surv_uids + [incremental mortality column]

         Raises:
             AssertionError: if any value in the merged frame is null
     '''
     max_surv = nd.nonfatalDataset().max_survival_months
     draw_uids = nd.nonfatalDataset().uid_cols
     abs_surv_col = nd.get_columns("absolute_survival")
     increm_mort_col = nd.get_columns("incremental_mortality")
     # Work on a copy so the caller's frame (often a .loc slice) is not modified
     df = df.copy()
     # Incremental mortality is the drop in absolute survival between a period
     #   and the next one within each draw_uids group; the last period of a
     #   group has no successor (NaN -> 0) and increases are clipped to 0
     df[increm_mort_col] = df.sort_values(surv_uids).groupby(
         draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
     # Sum the incremental mortality of every period before the maximum one
     before_max = ~(df['survival_month'] == max_surv)
     mort_total = df[before_max].groupby(
         draw_uids, as_index=False)[increm_mort_col].sum().rename(
             columns={increm_mort_col: 'total_mort'})
     df = df.merge(mort_total)
     # merge() returns a frame with a fresh index, so the mask has to be
     #   rebuilt from the merged frame rather than reused from before the merge
     at_max_surv_months = (df['survival_month'] == max_surv)
     # Whatever proportion has not been assigned to an earlier period is
     #   assigned to the maximum survival month
     df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
     # test and return
     assert not df.isnull().any().any(), \
         "Error in im_draw {}".format(draw_num)
     return(df.loc[:, surv_uids+[increm_mort_col]])
def save_model_results(df, metric_name, acause):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df: dataframe with the uid columns and data columns of the metric.
                For "incidence" the 'me_tag' column of df is overwritten in
                place with 'primary_phase'
            metric_name: "incidence" or "prevalence"
            acause: cause used to look up uid columns and modelable entities

        Raises:
            ValueError: if metric_name is neither "incidence" nor "prevalence"
    '''
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns(metric_name)
    draw_cols = nd.get_columns("draw_cols")
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Fail up front instead of hitting an unbound measure_id at save time
        raise ValueError(
            "Unsupported metric_name for save_model_results: {}".format(
                metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        # Tags without a modelable entity are not saved
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        # Copy the subset so renaming/adding columns never touches df
        output_data = df.loc[df['me_tag'].eq(this_tag),
                             uid_cols + data_cols].copy()
        # Data columns are saved under the generic draw column names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
Exemple #3
0
def calc_abs_surv(df, acause):
    ''' Adds the absolute survival column to df and returns df.
        Absolute survival is relative survival multiplied by
        exp(lambda_years), bounded to the range [0, 1]. The acause argument
        is not used.
    '''
    print("    estimating absolute survival")
    rel_surv_col = nd.get_columns("relative_survival")
    abs_surv_col = nd.get_columns("absolute_survival")
    # Scale relative survival, then bound the result to a valid proportion
    scaled_surv = df[rel_surv_col] * np.exp(df['lambda_years'])
    df.loc[:, abs_surv_col] = scaled_surv.clip(lower=0, upper=1)
    pe.validate_proportions(df[abs_surv_col])
    return df
def load_estimates(metric_name, acause, location_id):
    ''' Reads the saved output file of the metric_name step for the given
        cause and location, returning only the uid columns and the value
        columns of that metric
    '''
    dataset = nd.nonfatalDataset(metric_name, acause)
    id_cols = dataset.uid_cols
    # Survival output is loaded by its single absolute survival column
    value_cols = ([nd.get_columns("absolute_survival")]
                  if metric_name == "survival"
                  else nd.get_columns(metric_name))
    estimates = pd.read_csv(dataset.get_output_file(location_id))
    keep_cols = id_cols + value_cols
    return estimates[keep_cols]
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database

        Args:
            df: dataframe with the uid columns, 'modelable_entity_id' and the
                draw columns
            acause: cause passed through to nd.save_outputs
            location_id: location used for the rate conversion and written to
                the 'location_id' column of every output

        One output is saved per modelable_entity_id, under the bundle_id that
        the me table lists for that entity.
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting: age_group_id 33, 44 and 301 are recoded to 235 and the
    #   draws are then collapsed over the uid columns
    #   NOTE(review): presumably these are the oldest age groups being pooled
    #   into one terminal group - confirm against the age group definitions
    data.loc[data['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export
    # The me table does not depend on the entity, so load it once
    me_table = nd.load_me_table()
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        # .item() requires exactly one matching row in the me table
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data per
        #   epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs",
                        this_output,
                        acause,
                        bundle_id,
                        skip_testing=True)
Exemple #6
0
def load_incidence(acause, location_id):
    ''' Returns incidence estimation subset required for prevalence estimation

        Reads the output file of the "incidence" step for the cause and
        location and returns only the uid columns and the incidence columns.
    '''
    # NOTE(review): uid columns come from the default nonfatalDataset() rather
    #   than the "incidence" dataset used for the file path - confirm intended
    uid_cols = nd.nonfatalDataset().uid_cols
    inc_cols = nd.get_columns("incidence")
    input_file = nd.nonfatalDataset(
        "incidence", acause).get_output_file(location_id)
    # A single column selection is enough; it already returns a new frame
    return(pd.read_csv(input_file)[uid_cols + inc_cols])
Exemple #7
0
def load_survival(acause, location_id):
    ''' Returns survival estimation subset required for prevalence estimation

        Reads the output file of the "survival" step for the cause and
        location and returns only the uid columns and the absolute survival
        column.
    '''
    # Build the dataset once and reuse it for both the uid columns and the path
    this_dataset = nd.nonfatalDataset("survival", acause)
    uid_cols = this_dataset.uid_cols
    abs_surv_col = nd.get_columns("absolute_survival")
    input_file = this_dataset.get_output_file(location_id)
    surv_data = pd.read_csv(input_file)
    return(surv_data[uid_cols + [abs_surv_col]])
Exemple #8
0
def calc_increm_mort(surv_df, acause, location_id):
    ''' Returns a dataframe of incremental mortality estimates by uid

        Args:
            surv_df: dataframe with the survival uid columns and the absolute
                survival column
            acause: cause used to look up the survival uid columns
            location_id: not used by this function

        Returns:
            dataframe with the survival uid columns plus the incremental
                mortality column
    '''
    def im_draw(df, draw_num, surv_uids):
        ''' Returns the survival uid columns of df together with an
                incremental mortality column derived from the absolute
                survival column. draw_num only labels the failure message
        '''
        max_surv = nd.nonfatalDataset().max_survival_months
        draw_uids = nd.nonfatalDataset().uid_cols
        abs_surv_col = nd.get_columns("absolute_survival")
        increm_mort_col = nd.get_columns("incremental_mortality")
        # Work on a copy: df arrives as a .loc slice of the caller's frame
        df = df.copy()
        # Incremental mortality is the drop in absolute survival between a
        #   period and the next one within each draw_uids group; the last
        #   period has no successor (NaN -> 0) and increases are clipped to 0
        df[increm_mort_col] = df.sort_values(surv_uids).groupby(
            draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
        # Sum the incremental mortality of every period before the maximum
        before_max = ~(df['survival_month'] == max_surv)
        mort_total = df[before_max].groupby(
            draw_uids, as_index=False)[increm_mort_col].sum().rename(
                columns={increm_mort_col: 'total_mort'})
        df = df.merge(mort_total)
        # merge() returns a frame with a fresh index, so the mask has to be
        #   rebuilt from the merged frame rather than reused from before
        at_max_surv_months = (df['survival_month'] == max_surv)
        # Whatever proportion has not been assigned to an earlier period is
        #   assigned to the maximum survival month
        df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
        # test and return
        assert not df.isnull().any().any(), \
            "Error in im_draw {}".format(draw_num)
        return(df.loc[:, surv_uids+[increm_mort_col]])

    # Generate incremental mortality draws
    output_uids = nd.nonfatalDataset("survival", acause).uid_cols
    abs_surv_cols = [nd.get_columns("absolute_survival")]
    incr_mort_cols = [nd.get_columns("incremental_mortality")]
    output_df = surv_df.loc[:, output_uids]
    print("    estimating incremental mortality proportion...")
    # Note: this section remains written with a loop to facilitate future
    #   processing of absolute survival draws
    for i, as_col in enumerate(abs_surv_cols):
        this_draw = im_draw(df=surv_df.loc[:, output_uids + [as_col]],
                            draw_num=i,
                            surv_uids=output_uids)
        output_df = output_df.merge(this_draw, on=output_uids)
    return(output_df[output_uids + incr_mort_cols])
Exemple #9
0
def calc_mortality(surv_df, acause, location_id):
    ''' Calculate mortality, the number of people who die of the
        cause during the interval (year), where
        mort= incremental_mortality_proportion*incidence.
        Returns a dataframe of mortality by uid, which also carries the
        incremental mortality column merged back in

        Args:
            surv_df: survival dataframe handed to calc_increm_mort
            acause: cause used to look up uid columns and load incidence
            location_id: location whose incidence output is loaded
    '''
    print("    estimating absolute mortality...")
    uid_cols = nd.nonfatalDataset("survival", acause).uid_cols
    inc_cols = nd.get_columns("incidence")
    incr_mort_cols = [nd.get_columns('incremental_mortality')]
    mort_cols = nd.get_columns('mortality')
    incr_mort_df = calc_increm_mort(surv_df, acause, location_id)
    inc_df = load_incidence(acause, location_id)
    mrg_df = incr_mort_df.merge(inc_df)
    # Copy so the new columns are added to an independent frame, not a slice
    df = mrg_df[uid_cols].copy()
    # The single incremental mortality column broadcasts across the incidence
    #   columns; the product keeps mrg_df's index so rows line up explicitly
    df[mort_cols] = pd.DataFrame(
        mrg_df[inc_cols].values * mrg_df[incr_mort_cols].values,
        index=mrg_df.index)
    df = df.merge(incr_mort_df)
    return(df)
Exemple #10
0
def calc_prevalence(sequela_framework, mort_df, acause):
    ''' Returns prevalence by prevalence uid, calculated from the mortality
            estimates and the duration of each sequela

        Args:
            sequela_framework: dataframe with 'me_tag' and 'sequela_duration',
                merged with mort_df on their shared columns
            mort_df: dataframe with the survival uid columns and the mortality
                columns
            acause: cause used to look up the survival and prevalence uids

        Raises:
            AssertionError: if the collapsed result contains null values
    '''
    print("    calculating prevalence...")
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality
    #       frames
    mrg_df = pd.merge(sequela_framework, mort_df)
    # Copy so the new columns are added to an independent frame, not a slice
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    #     by the number of people surviving for only that duration
    df[prev_cols] = mrg_df[mort_cols].mul(mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols,
                      by_cols=prev_uids, func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    assert not df.isnull().any().any(), "Error in calc_prevalence"
    return(df)
def calc_procedure_tenplus(inc_df, proportions, acause, location_id):
    ''' Multiplies incidence draws by the procedure proportion and the absolute
            survival proportion at the maximum survival month to estimate the
            number of cases surviving for at least 10 years

        Args:
            inc_df: dataframe with the uid columns and the incidence columns
            proportions: dataframe with the procedure proportions in the draw
                columns, merged with inc_df on their shared columns
            acause: cause used to load survival and look up sequelae
            location_id: location whose survival output is loaded

        Returns:
            dataframe with the uid columns (year_id advanced by 10 and
                age_group_id passed through add_decade_to_age), the draw
                columns and 'modelable_entity_id'; when the cause has sequelae
                fractions the result is whatever split_sequelae returns
    '''
    # Load known values
    print(
        "    calculating the incidence of procedures with surv > ten years...")
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('incidence')
    draw_cols = nd.get_columns("draw_cols")
    abs_surv = [nd.get_columns("absolute_survival")]
    max_estimation_year = utils.get_gbd_parameter('max_year')
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Estimate incidence of procedure
    mrg_df = inc_df.merge(proportions)
    # Copies are taken wherever columns are added to a column/row subset, so
    #   the assignments act on independent frames rather than on slices
    adj_df = mrg_df[uid_cols].copy()
    num_procedures = (mrg_df[type_cols].values * mrg_df[draw_cols].values)
    adj_df[type_cols] = pd.DataFrame(
        num_procedures, index=mrg_df.index).fillna(0)
    # Estimate number of procedures resulting in survival beyond ten years
    surv_df = load_estimates('survival', acause, location_id)
    surv_df = surv_df.loc[surv_df['survival_month'].eq(max_survival_months),
                          uid_cols + abs_surv]
    adj_df = adj_df.merge(surv_df)
    pbt_df = adj_df[uid_cols].copy()
    # The single absolute survival column broadcasts across the draws
    num_procedures_10ys = adj_df[type_cols].values * \
                                        adj_df[abs_surv].values
    pbt_df[draw_cols] = pd.DataFrame(
        num_procedures_10ys, index=adj_df.index).fillna(0)
    # Update years and age categories
    pbt_df.loc[:, 'age_group_id'] = pbt_df['age_group_id'].apply(
        add_decade_to_age)
    pbt_df.loc[:, 'year_id'] += 10
    # drop data that are now out of scope
    pbt_df = pbt_df.loc[pbt_df['year_id'] <= max_estimation_year, :].copy()
    # For procedures whose sequelae are fractional, split the estimates by
    #   sequela; otherwise tag everything with the procedure_sequelae entity
    if sequelae_fractions(acause):
        pbt_df = split_sequelae(pbt_df, acause, location_id)
    else:
        pbt_df.loc[:, 'modelable_entity_id'] = \
            nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return (pbt_df)
Exemple #12
0
def load_rel_surv_values(acause, location_id, cnf_model_run_id):
    ''' Reads the relative survival file of one cause and location from the
            survival folder of the given model run and returns it reshaped to
            one row per uid and survival year
    '''
    print("       loading survival...")
    uid_cols = nd.nonfatalDataset().uid_cols
    rel_surv_col = nd.get_columns("relative_survival")
    # Causes for which only one sex_id is kept
    single_sex_causes = {
        'neo_prostate': 1,
        'neo_testicular': 1,
        'neo_cervical': 2,
        'neo_ovarian': 2,
        'neo_uterine': 2
    }
    # The input location depends on the run_id
    source_dir = load_surv_folder(cnf_model_run_id)
    surv_path = "{}/{}/{}.csv".format(source_dir, acause, location_id)
    # Read the file and switch to the id-style column names
    surv = pd.read_csv(surv_path).rename(
        columns={'year': 'year_id', 'sex': 'sex_id'})
    # Survival at 'year 0' is 1: no time has passed through which to survive
    surv['scaled_0year'] = 1
    # Keep only the relevant sex for sex-restricted causes
    if acause in single_sex_causes:
        matches_sex = surv['sex_id'] == single_sex_causes[acause]
        surv = surv.loc[matches_sex, :]
    # One row per survival year instead of one 'scaled_*' column per year
    surv = dft.wide_to_long(surv,
                            stubnames='scaled_',
                            i=uid_cols,
                            j=['survival_year'],
                            drop_others=True)
    not_restricted = surv['survival_year'] != '10year_restrict'
    surv = surv.loc[not_restricted, :]
    # Strip the 'year' suffix so the survival year becomes a number
    year_labels = surv['survival_year'].str.replace('year', '')
    surv.loc[:, 'survival_year'] = year_labels.astype(int)
    surv = surv.rename(columns={'scaled_': rel_surv_col})
    # extend age groups if not present
    surv = _fix_survival_ages(surv)
    # Validate before handing the data back
    assert not surv.isnull().any().any(), \
        "Null values found in relative survival input after formatting"
    pe.validate_proportions(surv[rel_surv_col])
    return surv
def split_sequelae(df, acause, location_id):
    ''' Splits estimates into sequela based on proportions from literature

        Args:
            df: dataframe with the draw columns; it is not modified
            acause: cause whose sequelae fractions are applied
            location_id: not used by this function

        Returns:
            df merged with the "procedure_sequelae" fraction rows (adding the
                'acause', 'modelable_entity_id', 'me_tag' and 'fraction'
                columns among whatever else the fractions carry), with the
                draw columns multiplied by 'fraction'

        Raises:
            AssertionError: if any split draw value is null
    '''
    print("    splitting sequelae...")
    draw_cols = nd.get_columns("draw_cols")
    # Generate dataframe containing the procedure_sequelae fractions; the
    #   keys of the fractions dict become the modelable_entity_id column
    fracs = pd.DataFrame.from_dict(
        sequelae_fractions(acause), orient='index').reset_index().rename(
            columns={'index': 'modelable_entity_id'})
    fracs = fracs[fracs['me_tag'].eq("procedure_sequelae")].assign(
        acause=acause)
    # Merge dataframe with data; assign() adds the merge key on a new frame so
    #   the caller's df does not gain an 'acause' column as a side effect
    split_df = df.assign(acause=acause).merge(fracs)
    split_df[draw_cols] = split_df[draw_cols].multiply(split_df['fraction'],
                                                       axis='index')
    assert split_df[draw_cols].notnull().all().all(), "Nulls in split sequelae"
    return (split_df)
def apply_procdedure_proportions(df, proportions, acause, metric_name):
    ''' Multiplies estimates by procedure proportions, adding to the dataframe
            a set of estimates for the number of cancer events that do not
            receive the given procedure

        Args:
            df: estimates with the uid columns and the metric's data columns
            proportions: procedure proportions in the draw columns, merged on
                the uid columns other than 'me_tag'; it is not modified
            acause: cause used to look up uid columns and sequelae fractions
            metric_name: 'prevalence' or 'incidence'

        Returns:
            dataframe with the uid columns and the metric's data columns. For
                prevalence this is df with the adjusted rows appended; for
                incidence it is only the adjusted rows

        Raises:
            ValueError: if metric_name is neither 'prevalence' nor 'incidence'
            AssertionError: if the adjusted estimates contain null values
    '''
    print("    adjusting to avoid double-counting procedures...")
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("draw_cols")
    type_cols = nd.get_columns(metric_name)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    else:
        # Fail up front instead of hitting an unbound mrg_df further down
        raise ValueError(
            "Unsupported metric_name for procedure adjustment: {}".format(
                metric_name))
    # For data where sequela are a fraction of the number of procedures, multiply
    #       the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe containing the fractions
        fracs = pd.DataFrame.from_dict(sequelae_fractions(acause),
                                       orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand; assign() adds the merge
        #   key on a new frame so the caller's proportions are left untouched
        props = proportions.assign(acause=acause).merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # Without fractions, a single tag is attached to the proportions
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    #   Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    # Copy so the new columns are added to an independent frame, not a slice
    adj_df = mrg_df[uid_cols].copy()
    evnt_wo_proc = pd.DataFrame(mrg_df[type_cols].values *
                                mrg_df[draw_cols].values,
                                index=mrg_df.index,
                                columns=type_cols).fillna(0)
    adj_df[type_cols] = evnt_wo_proc
    assert not adj_df.isnull().any().any(
    ), "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols].copy()
        # The subtraction is positional over the two sorted frames, so the
        #   result must carry cntrl_df's (sort-permuted) index; a default
        #   0..n-1 index would be re-aligned by label and shuffle the rows
        no_proc = pd.DataFrame(cntrl_df[type_cols].values -
                               sq_df[type_cols].values,
                               index=cntrl_df.index,
                               columns=type_cols)
        nosq_df[type_cols] = no_proc
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed
        adj_df = pd.concat([adj_df, nosq_df])
        output_data = pd.concat([df, adj_df])
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    #      is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])