예제 #1
0
 def im_draw(df, draw_num, surv_uids):
     ''' Returns the incremental mortality estimate for the requested draw

         Args:
             df: dataframe containing the surv_uids columns (including
                 'survival_month') and the absolute survival column
             draw_num: index of the draw being processed. Used only to
                 label the error message
             surv_uids: list of columns that uniquely identify a row of df

         Returns:
             dataframe with the surv_uids columns plus the incremental
             mortality column

         Raises:
             AssertionError: if any null value remains after the calculation
     '''
     max_surv = nd.nonfatalDataset().max_survival_months
     draw_uids = nd.nonfatalDataset().uid_cols
     abs_surv_col = nd.get_columns("absolute_survival")
     increm_mort_col = nd.get_columns("incremental_mortality")
     # Work on a copy so that the caller's dataframe is never modified
     df = df.copy()
     # Calculate incremental mortality: within each draw_uids group, the drop
     #   in absolute survival between one period and the next. The last
     #   period of a group has no successor (filled with 0) and increases in
     #   survival are clipped to 0
     df[increm_mort_col] = df.sort_values(surv_uids).groupby(
         draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
     # Total the incremental mortality of every period before the maximum
     before_max_surv = ~(df['survival_month'] == max_surv)
     mort_total = df[before_max_surv].groupby(
         draw_uids, as_index=False)[increm_mort_col].sum().rename(
             columns={increm_mort_col: 'total_mort'})
     df = df.merge(mort_total)
     # merge() returns a re-indexed frame, so the mask is rebuilt here rather
     #   than reusing one that is aligned to the pre-merge index
     at_max_surv_months = (df['survival_month'] == max_surv)
     # Those surviving at and beyond the maximum period are the remainder
     df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
     # test and return
     assert not df.isnull().any().any(), \
         "Error in im_draw {}".format(draw_num)
     return(df.loc[:, surv_uids + [increm_mort_col]])
예제 #2
0
def generate_estimates(acause, location_id):
    ''' Finalizes incidence and prevalence estimates for one cause-location

        Loads both sets of estimates, totals prevalence, and, when the cause
        has a procedure modelable entity, applies the procedure proportions
        and saves the procedure inputs. Both measures are then saved and an
        empty "finalized_<location_id>" marker file is touched.

        Returns True once the marker file exists
    '''
    print("Begin final adjustments...")
    inc_df = load_estimates('incidence', acause, location_id)
    raw_prev = load_estimates('prevalence', acause, location_id)
    prev_uids = nd.nonfatalDataset('prevalence', acause).uid_cols
    prev_df = calc_total_prevalence(raw_prev, uid_cols=prev_uids)
    # Procedure adjustment applies only to causes with a procedure me_id
    pr_id = procedure_me_id(acause)
    if pr_id is not None:
        prop_df = load_procedure_proportions(pr_id, location_id)
        prev_df = apply_procdedure_proportions(
            prev_df, prop_df, acause, 'prevalence')
        proc_data = calc_procedure_tenplus(
            inc_df, prop_df, acause, location_id)
        save_procedure_inputs(proc_data, acause, location_id)
    # Incidence and prevalence are saved whether or not they were adjusted
    save_model_results(inc_df, 'incidence', acause)
    save_model_results(prev_df, 'prevalence', acause)
    # Touch the marker file that signals completion for this location
    results_step = nd.nonfatalDataset('final_results', acause)
    success_file = results_step.get_output_file(
        "finalized_" + str(location_id))
    open(success_file, 'a').close()
    print(str(success_file) + " saved.")
    return True
예제 #3
0
def load_incidence(acause, location_id):
    ''' Reads the incidence output for the cause-location and returns only the
            uid and incidence columns, which are what prevalence estimation
            needs
    '''
    keep_cols = nd.nonfatalDataset().uid_cols + nd.get_columns("incidence")
    inc_step = nd.nonfatalDataset("incidence", acause)
    all_inc = pd.read_csv(inc_step.get_output_file(location_id))
    return all_inc[keep_cols]
예제 #4
0
def load_survival(acause, location_id):
    ''' Returns survival estimation subset required for prevalence estimation

        Reads the survival output file for the cause-location and keeps the
        survival uid columns plus the absolute survival column
    '''
    # Build the dataset object once and reuse it for both the uid columns
    #   and the file path
    this_dataset = nd.nonfatalDataset("survival", acause)
    uid_cols = this_dataset.uid_cols
    abs_surv_col = nd.get_columns("absolute_survival")
    input_file = this_dataset.get_output_file(location_id)
    surv_data = pd.read_csv(input_file)
    return(surv_data[uid_cols + [abs_surv_col]])
예제 #5
0
def save_procedure_inputs(df, acause, location_id):
    ''' Formats and saves procedure data for upload into the epi database

        Args:
            df: dataframe with the default uid columns,
                'modelable_entity_id' and the draw columns
            acause: cause name passed through to nd.save_outputs
            location_id: location written to the output and used for the
                rate conversion

        One output is saved per modelable_entity_id present in the data.
        Returns None
    '''
    uid_cols = nd.nonfatalDataset().uid_cols + ['modelable_entity_id']
    draw_cols = nd.get_columns("draw_cols")
    epi_estimate_cols = ['mean', 'lower', 'upper']
    data = df.loc[:, uid_cols + draw_cols].copy()
    # apply formatting: fold age groups 33, 44 and 301 into age group 235,
    #   then collapse so that each uid appears only once
    data.loc[df['age_group_id'].isin([33, 44, 301]), 'age_group_id'] = 235
    data = dft.collapse(data, by_cols=uid_cols, stub='draw')
    epi_df = epi_upload.format_draws_data(data)
    epi_df = epi_upload.convert_to_rate(epi_df, epi_estimate_cols, location_id)
    # Add metadata
    epi_df['measure'] = 'incidence'
    epi_df['unit_type'] = "Person*year"
    epi_df['extractor'] = getuser()
    epi_df['location_id'] = location_id
    # Finalize and export
    # The me table does not depend on the me_id, so it is loaded only once
    me_table = nd.load_me_table()
    for me_id in epi_df['modelable_entity_id'].unique():
        print("me_id " + str(me_id) + " sequela split")
        bundle_id = int(me_table.loc[me_table['modelable_entity_id'].eq(me_id),
                                     'bundle_id'].item())
        this_output = epi_df.loc[epi_df['modelable_entity_id'].eq(me_id), :]
        this_output = epi_upload.EpiUploadDataframe(this_output).data
        # Save output without testing (epi formatter has already tested data per
        #   epi specs)
        # add location_id to enable save_outputs
        this_output['location_id'] = location_id
        nd.save_outputs("dismod_inputs",
                        this_output,
                        acause,
                        bundle_id,
                        skip_testing=True)
예제 #6
0
def save_model_results(df, metric_name, acause):
    ''' Saves a separate output file for each me_tag in the dataframe

        Args:
            df: estimates containing the uid and data columns of the metric.
                For prevalence it must already contain 'me_tag'. For
                incidence the 'me_tag' column is set to 'primary_phase'
                in place on the passed dataframe
            metric_name: "incidence" or "prevalence"
            acause: cause used to look up the modelable entity ids

        Tags without a modelable entity id are skipped.

        Raises:
            ValueError: if metric_name is not "incidence" or "prevalence"
    '''
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    data_cols = nd.get_columns(metric_name)
    draw_cols = nd.get_columns("draw_cols")
    if metric_name == "incidence":
        measure_id = utils.get_gbd_parameter('incidence_measure_id')
        df.loc[:, 'me_tag'] = 'primary_phase'
    elif metric_name == "prevalence":
        measure_id = utils.get_gbd_parameter('prevalence_measure_id')
    else:
        # Fail early instead of hitting an unbound measure_id inside the loop
        raise ValueError(
            "Unsupported metric_name: {}".format(metric_name))
    for this_tag in df['me_tag'].unique():
        me_id = nd.get_modelable_entity_id(acause, this_tag)
        if me_id is None:
            continue
        print("me_id " + str(me_id) + " " + this_tag)
        # Copy the subset so that renaming and adding columns acts on an
        #   independent frame rather than on a slice of df
        output_data = df.loc[df['me_tag'].eq(this_tag),
                             uid_cols + data_cols].copy()
        # Outputs are stored under the generic draw column names
        output_data.columns = uid_cols + draw_cols
        output_data['modelable_entity_id'] = me_id
        nd.save_outputs(
            "final_results",
            output_data,
            acause,
            me_id,
            measure_id,
        )
예제 #7
0
def generate_estimates(acause, location_id):
    ''' Runs the prevalence estimation pipeline for one cause-location:
            survival -> mortality -> sequela framework -> prevalence, then
            saves the prevalence output
    '''
    # NOTE(review): output_file is never read below; the lookup is kept in
    #   case get_output_file has side effects -- confirm before removing
    prev_step = nd.nonfatalDataset("prevalence", acause)
    output_file = prev_step.get_output_file(location_id)
    print("Begin prevalence estimation...")
    survival = load_survival(acause, location_id)
    mortality = calc_mortality(survival, acause, location_id)
    framework = load_sequela_framework(survival, acause)
    prevalence = calc_prevalence(framework, mortality, acause)
    nd.save_outputs("prevalence", prevalence, acause)
예제 #8
0
def calc_prevalence(sequela_framework, mort_df, acause):
    ''' Returns prevalence by prevalence uid

        Args:
            sequela_framework: dataframe with 'me_tag' and
                'sequela_duration' for each survival uid
            mort_df: dataframe with the mortality columns for each
                survival uid
            acause: cause used to look up the survival and prevalence uids

        Raises:
            AssertionError: if the result contains any null value
    '''
    print("    calculating prevalence...")
    prev_cols = nd.get_columns('prevalence')
    mort_cols = nd.get_columns('mortality')
    surv_uids = nd.nonfatalDataset("survival", acause).uid_cols
    prev_uids = nd.nonfatalDataset("prevalence", acause).uid_cols
    # Create the prevalence estimation frame from the survival and mortality
    #       frames
    mrg_df = pd.merge(sequela_framework, mort_df)
    # Copy so that the new columns are added to an independent frame rather
    #       than to a slice of mrg_df
    df = mrg_df[surv_uids + ['me_tag']].copy()
    # Calculate prevalence of each sequela by multiplying sequela duration
    #     by the number of people surviving for only that duration
    df[prev_cols] = mrg_df[mort_cols].mul(mrg_df['sequela_duration'], axis=0)
    df = dft.collapse(df, combine_cols=prev_cols,
                      by_cols=prev_uids, func='sum')
    df.loc[:, prev_cols] = df[prev_cols] / 12  # convert to years
    assert not df.isnull().any().any(), \
        "Null values found in calc_prevalence output for {}".format(acause)
    return(df)
예제 #9
0
def create_estimation_frame(acause, location_id, cnf_model_run_id):
    ''' Builds the frame used to estimate survival and incremental mortality:
            relative survival (up to the maximum survival month) joined with
            lambda values and restricted to the estimation age groups
    '''
    print("    creating estimation frame...")
    max_surv = nd.nonfatalDataset().max_survival_months
    uid_cols = nd.nonfatalDataset().uid_cols
    keep_ages = list(range(1, 21)) + list(range(30, 34)) + [235]
    # Survival curve, expressed in months and cut at the maximum duration
    rel_surv = load_rel_surv_values(acause, location_id, cnf_model_run_id)
    rel_surv['survival_month'] = rel_surv['survival_year'] * 12
    within_window = rel_surv['survival_month'] <= max_surv
    rel_surv = rel_surv.loc[within_window]
    # Attach lambda values, then keep only the estimation ages
    lambdas = load_lambda_values(location_id, cnf_model_run_id)
    frame = rel_surv.merge(lambdas[uid_cols + ['lambda']])
    in_keep_ages = frame['age_group_id'].isin(keep_ages)
    frame = frame.loc[in_keep_ages, :]
    frame['lambda_years'] = frame['lambda'] * frame['survival_year']
    return frame
예제 #10
0
def calc_procedure_tenplus(inc_df, proportions, acause, location_id):
    ''' Estimates procedures whose recipients survive to the maximum survival
            duration: incidence draws x procedure proportion x absolute
            survival at the maximum survival month. Results are shifted
            forward ten years in year_id and by add_decade_to_age in
            age_group_id, then tagged with (or split into) the procedure
            sequela modelable entity ids
    '''
    print(
        "    calculating the incidence of procedures with surv > ten years...")
    # Columns and parameters
    uid_cols = nd.nonfatalDataset().uid_cols
    type_cols = nd.get_columns('incidence')
    draw_cols = nd.get_columns("draw_cols")
    abs_surv = [nd.get_columns("absolute_survival")]
    max_estimation_year = utils.get_gbd_parameter('max_year')
    max_survival_months = nd.nonfatalDataset().max_survival_months
    # Incidence of the procedure = incidence x procedure proportion
    with_props = inc_df.merge(proportions)
    procedures = with_props[uid_cols]
    procedures[type_cols] = pd.DataFrame(
        with_props[type_cols].values * with_props[draw_cols].values
    ).fillna(0)
    # Keep only absolute survival at the maximum survival month
    survival = load_estimates('survival', acause, location_id)
    at_max = survival['survival_month'].eq(max_survival_months)
    survival = survival.loc[at_max, uid_cols + abs_surv]
    # Procedures surviving that long = procedures x absolute survival
    procedures = procedures.merge(survival)
    tenplus = procedures[uid_cols]
    tenplus[draw_cols] = pd.DataFrame(
        procedures[type_cols].values * procedures[abs_surv].values
    ).fillna(0)
    # Move the estimates a decade forward in age and time
    tenplus.loc[:, 'age_group_id'] = tenplus['age_group_id'].apply(
        add_decade_to_age)
    tenplus.loc[:, 'year_id'] += 10
    # Discard years beyond the estimation window
    tenplus = tenplus.loc[tenplus['year_id'] <= max_estimation_year, :]
    # Causes with fractional sequelae are split; all others get a single me_id
    if sequelae_fractions(acause):
        return split_sequelae(tenplus, acause, location_id)
    tenplus.loc[:, 'modelable_entity_id'] = \
        nd.get_modelable_entity_id(acause, 'procedure_sequelae')
    return tenplus
예제 #11
0
def load_estimates(metric_name, acause, location_id):
    ''' Reads the saved output of the metric_name step for the cause-location
            and returns its uid columns plus its value columns. For
            "survival" the value column is absolute survival
    '''
    this_step = nd.nonfatalDataset(metric_name, acause)
    uid_cols = this_step.uid_cols
    # get_columns returns a single name for absolute survival, so it is
    #   wrapped in a list to be concatenated with the uid columns
    type_cols = ([nd.get_columns("absolute_survival")]
                 if metric_name == "survival"
                 else nd.get_columns(metric_name))
    estimates = pd.read_csv(this_step.get_output_file(location_id))
    return estimates[uid_cols + type_cols]
예제 #12
0
def load_rel_surv_values(acause, location_id, cnf_model_run_id):
    ''' Loads and returns relative survival estimates for the given acause
            and location in long format (one row per uid and survival_year)

        Args:
            acause: cause whose survival file is read. Causes listed in
                sex_restrictions are subset to the single listed sex_id
            location_id: location whose csv file is read
            cnf_model_run_id: run id used to locate the survival folder

        Raises:
            AssertionError: if null values remain after formatting
    '''
    print("       loading survival...")
    uid_cols = nd.nonfatalDataset().uid_cols
    rel_surv_col = nd.get_columns("relative_survival")
    # acause -> the only sex_id kept for that cause
    sex_restrictions = {
        'neo_prostate': 1,
        'neo_testicular': 1,
        'neo_cervical': 2,
        'neo_ovarian': 2,
        'neo_uterine': 2
    }
    # Load specific input based on run_id
    surv_folder = load_surv_folder(cnf_model_run_id)
    input_file = "{}/{}/{}.csv".format(surv_folder, acause, location_id)
    # import and update names
    this_surv = pd.read_csv(input_file)
    this_surv.rename(columns={
        'year': 'year_id',
        'sex': 'sex_id'
    },
                     inplace=True)
    # Add 'year 0' survival equal to 1 (no time has passed through which to survive)
    this_surv['scaled_0year'] = 1
    # Subset by sex
    if acause in sex_restrictions:
        this_surv = this_surv.loc[this_surv['sex_id'] ==
                                  sex_restrictions[acause], :]
    # Reshape and rename columns
    this_surv = dft.wide_to_long(this_surv,
                                 stubnames='scaled_',
                                 i=uid_cols,
                                 j=['survival_year'],
                                 drop_others=True)
    # Copy the filtered rows so the column replacement below acts on an
    #   independent frame
    this_surv = this_surv.loc[
        this_surv['survival_year'] != '10year_restrict', :].copy()
    # Replace the whole column (rather than setting through .loc[:, col]) so
    #   that the integer dtype is kept on every pandas version
    this_surv['survival_year'] = this_surv['survival_year'].str.replace(
        'year', '').astype(int)
    this_surv.rename(columns={'scaled_': rel_surv_col}, inplace=True)
    # extend age groups if not present
    this_surv = _fix_survival_ages(this_surv)
    # Test and return
    assert not this_surv.isnull().any().any(), \
        "Null values found in relative survival input after formatting"
    pe.validate_proportions(this_surv[rel_surv_col])
    return (this_surv)
예제 #13
0
def calc_increm_mort(surv_df, acause, location_id):
    ''' Returns a dataframe of incremental mortality estimates by uid

        Args:
            surv_df: dataframe with the survival uid columns and the
                absolute survival column
            acause: cause used to look up the survival uid columns
            location_id: not used by this function

        Returns:
            dataframe with the survival uid columns plus the incremental
            mortality column
    '''
    def im_draw(df, draw_num, surv_uids):
        ''' Returns the incremental mortality estimate for the requested
                draw. draw_num is used only to label the error message
        '''
        max_surv = nd.nonfatalDataset().max_survival_months
        draw_uids = nd.nonfatalDataset().uid_cols
        abs_surv_col = nd.get_columns("absolute_survival")
        increm_mort_col = nd.get_columns("incremental_mortality")
        # Work on a copy so that the caller's dataframe is never modified
        df = df.copy()
        # Calculate incremental mortality: within each draw_uids group, the
        #   drop in absolute survival between one period and the next. The
        #   last period of a group has no successor (filled with 0) and
        #   increases in survival are clipped to 0
        df[increm_mort_col] = df.sort_values(surv_uids).groupby(
            draw_uids)[abs_surv_col].diff(-1).fillna(0).clip(lower=0)
        # Total the incremental mortality of every period before the maximum
        before_max_surv = ~(df['survival_month'] == max_surv)
        mort_total = df[before_max_surv].groupby(
            draw_uids, as_index=False)[increm_mort_col].sum().rename(
                columns={increm_mort_col: 'total_mort'})
        df = df.merge(mort_total)
        # merge() returns a re-indexed frame, so the mask is rebuilt here
        #   rather than reusing one aligned to the pre-merge index
        at_max_surv_months = (df['survival_month'] == max_surv)
        # Those surviving at and beyond the maximum period are the remainder
        df.loc[at_max_surv_months, increm_mort_col] = 1 - df['total_mort']
        # test and return
        assert not df.isnull().any().any(), \
            "Error in im_draw {}".format(draw_num)
        return(df.loc[:, surv_uids + [increm_mort_col]])

    # Generate incremental mortality draws
    output_uids = nd.nonfatalDataset("survival", acause).uid_cols
    abs_surv_cols = [nd.get_columns("absolute_survival")]
    incr_mort_cols = [nd.get_columns("incremental_mortality")]
    output_df = surv_df.loc[:, output_uids]
    print("    estimating incremental mortality proportion...")
    # Note: this section remains written with a loop to facilitate future
    #   processing of absolute survival draws
    for i, as_col in enumerate(abs_surv_cols):
        this_draw = im_draw(df=surv_df.loc[:, output_uids + [as_col]],
                            draw_num=i,
                            surv_uids=output_uids)
        output_df = output_df.merge(this_draw, on=output_uids)
    return(output_df[output_uids + incr_mort_cols])
예제 #14
0
def calc_mortality(surv_df, acause, location_id):
    ''' Returns a dataframe of mortality by uid, where
            mortality = incremental mortality proportion * incidence
        The incremental mortality column is included in the result
    '''
    print("    estimating absolute mortality...")
    uid_cols = nd.nonfatalDataset("survival", acause).uid_cols
    inc_cols = nd.get_columns("incidence")
    incr_mort_cols = [nd.get_columns('incremental_mortality')]
    mort_cols = nd.get_columns('mortality')
    # Pair each incremental mortality proportion with its incidence
    incr_mort_df = calc_increm_mort(surv_df, acause, location_id)
    inc_df = load_incidence(acause, location_id)
    mrg_df = incr_mort_df.merge(inc_df)
    # The single proportion column broadcasts across the incidence columns
    deaths = mrg_df[inc_cols].values * mrg_df[incr_mort_cols].values
    mortality = mrg_df[uid_cols]
    mortality[mort_cols] = pd.DataFrame(deaths)
    return mortality.merge(incr_mort_df)
예제 #15
0
def split_sequelae(df, acause, location_id):
    ''' Splits estimates into sequela based on proportions from literature

        Args:
            df: dataframe containing the draw columns. It is not modified
            acause: cause whose sequelae fractions are applied
            location_id: not used by this function

        Returns:
            dataframe with one row per input row and "procedure_sequelae"
            fraction, with the draw columns multiplied by 'fraction'. The
            merge also adds the 'acause' column and the columns of the
            fractions table

        Raises:
            AssertionError: if any draw is null after the split
    '''
    print("    splitting sequelae...")
    draw_cols = nd.get_columns("draw_cols")
    # Generate dataframe containing the procedure_sequelae fractions. The
    #   keys of the fractions dictionary become 'modelable_entity_id'
    fracs = pd.DataFrame.from_dict(
        sequelae_fractions(acause), orient='index').reset_index().rename(
            columns={'index': 'modelable_entity_id'})
    fracs = fracs[fracs['me_tag'].eq("procedure_sequelae")].copy()
    fracs['acause'] = acause
    # Merge dataframe with data. assign() returns a new frame, so the
    #   caller's dataframe does not gain an 'acause' column
    split_df = df.assign(acause=acause).merge(fracs)
    split_df[draw_cols] = split_df[draw_cols].multiply(split_df['fraction'],
                                                       axis='index')
    assert split_df[draw_cols].notnull().all().all(), "Nulls in split sequelae"
    return (split_df)
예제 #16
0
def main(meid, desc, indir, run_id, meas_id):
    ''' Runs save_worker for the modelable entity and, when it succeeds,
            updates the upload record and writes the result to the step's
            'upload' output file

        Returns:
            True if save_worker returned a non-empty dataframe containing
            'model_version_id', otherwise False
    '''
    this_step = nd.nonfatalDataset("split", meid)
    success_file = this_step.get_output_file('upload')
    print("Working on {} ({}) in {} (run_id {})".format(
        meid, desc, indir, run_id))
    success_df = save_worker(meid=meid, meas_ids=meas_id,
                             description=desc, input_dir=indir,
                             cnf_run_id=run_id)
    # Validate save. The type is checked first so that a non-dataframe
    #   result is reported as a failure instead of raising inside len()
    saved = (isinstance(success_df, pd.DataFrame)
             and len(success_df) > 0
             and 'model_version_id' in success_df.columns)
    if not saved:
        print("Error during split")
        return(False)
    # Preserve record of the successful save. The first row is read by
    #   position, so the result does not depend on the index labels
    model_id = success_df['model_version_id'].iloc[0]
    epi_upload.update_upload_record(meid, run_id, model_id,
                                    cancer_model_type="split_custom_epi")
    success_df.to_csv(success_file, index=False)
    return(True)
예제 #17
0
def load_sequela_framework(surv_df, acause):
    ''' Adjust sequela duration based on survival
            First set the incremental duration (including controlled) to
            equal total months from diagnosis (at midyear) to death (at end
            year). This is the total amount of time someone may experience
            any of the sequela (from diagnosis to death, separated out by the
            amount of time they are living with the cancer)

        Then iteratively adjust duration of each sequela so time lived with
            cancer is equal to the sum of all sequela durations
            First zero-out metastatic_phase and terminal_phase for events that
            occur at the maximum survival duration.
            - The terminal phase is set and not adjusted
            - The most flexible phase is controlled: Adjust controlled time to
            equal the difference between incremental_duration and the
            duration of each of the other sequela
            - The next most flexible time is primary diagnosis and treatment
            - Finally we can adjust the metastatic time if the totals still do
            not add up

        Args:
            surv_df: dataframe containing the survival uid columns. It is
                not modified
            acause: cause whose sequela durations are loaded

        Returns:
            dataframe with one row per survival uid and me_tag, holding the
            adjusted 'sequela_duration' alongside 'raw_sequela_duration' and
            'incremental_duration'
    '''
    def adjust_duration(df, stage, uid_cols):
        ''' Sets the duration of `stage` to incremental_duration minus the
                summed duration of all other phases of the same uid, with a
                floor of 0. Returns df with its original columns
        '''
        sd_col = 'sequela_duration'
        input_cols = df.columns.tolist()
        # Total duration of every phase other than the one being adjusted
        tot_dur = df[df['me_tag'] != stage].groupby(
            uid_cols, as_index=False)[sd_col].sum().rename(
                columns={sd_col: 'tot_dur'})
        df = df.merge(tot_dur)
        # merge() returns a re-indexed frame, so the phase mask is built
        #   afterwards to stay aligned with the rows it selects
        this_phase = (df['me_tag'] == stage)
        df.loc[this_phase, sd_col] = df['incremental_duration'] - df['tot_dur']
        df.loc[this_phase & (df[sd_col] <= 0), sd_col] = 0
        assert not df.duplicated(uid_cols+['me_tag']).any(), \
            "ERROR: error when calculating sequela_durations for {} stage".format(
                stage)
        return(df[input_cols])
    #
    print("    creating sequela framework...")
    nf_ds = nd.nonfatalDataset("survival", acause)
    uid_cols = nf_ds.uid_cols
    max_survival_months = nf_ds.max_survival_months
    seq_dur = load_durations(acause)
    # Add sequela durations. assign() returns a new frame, so the caller's
    #   surv_df does not gain an 'acause' column
    surv_keys = surv_df[uid_cols].assign(acause=acause)
    df = pd.merge(surv_keys, seq_dur, on='acause')
    df.loc[:, 'raw_sequela_duration'] = df['sequela_duration']
    df.loc[:, 'incremental_duration'] = df['survival_month'] + 6
    # Set the 'beyond maximum' survival years to the duration of survival
    #   for the final period
    end_of_period = df['survival_month'].eq(max_survival_months)
    df.loc[end_of_period, 'incremental_duration'] = max_survival_months - 6
    # Set late-phase duration to 0 at the end of the survival period
    #   (if someone survives beyond the maximum duration, they are treated
    #   as 'survivors', so there are no terminal or metastatic phases)
    late_phase = (df['me_tag'].isin(["terminal_phase", "metastatic_phase"]))
    df.loc[late_phase & end_of_period, 'sequela_duration'] = 0
    # Iteratively adjust sequela duration (see docstring for explanation)
    for stage in ['controlled_phase', "primary_phase", "metastatic_phase"]:
        df = adjust_duration(df, stage, uid_cols)
    assert df['incremental_duration'].notnull().all(), \
        "error calculating sequela durations"
    return(df)
예제 #18
0
def apply_procdedure_proportions(df, proportions, acause, metric_name):
    ''' Multiplies estimates by procedure proportions, adding to the dataframe
            a set of estimates for the number of cancer events that do not
            receive the given procedure

        Args:
            df: estimates for metric_name. Prevalence must contain 'me_tag'
            proportions: procedure proportions in the draw columns, keyed by
                the uid columns other than 'me_tag'. It is not modified
            acause: cause used to look up uid columns and sequelae fractions
            metric_name: 'prevalence' or 'incidence'

        Returns:
            For prevalence: the input estimates with the adjusted rows
            appended. For incidence: only the adjusted estimates. In both
            cases only the uid and metric columns are returned

        Raises:
            ValueError: if metric_name is not 'prevalence' or 'incidence'
            AssertionError: if the adjusted estimates contain null values
    '''
    print("    adjusting to avoid double-counting procedures...")
    uid_cols = nd.nonfatalDataset(metric_name, acause).uid_cols
    draw_cols = nd.get_columns("draw_cols")
    type_cols = nd.get_columns(metric_name)
    mrg_cols = [c for c in uid_cols if c != 'me_tag']
    # Subset estimates to the phase wherein procedures occur
    if metric_name == 'prevalence':
        mrg_df = df.loc[df['me_tag'] == "controlled_phase", :].copy()
        del mrg_df['me_tag']
    elif metric_name == 'incidence':
        mrg_df = df.copy()
    else:
        raise ValueError(
            "Unsupported metric_name: {}".format(metric_name))
    # For data where sequela are a fraction of the number of procedures, multiply
    #       the procedure proportion by those fractions
    if metric_name == 'prevalence' and bool(sequelae_fractions(acause)):
        # Generate dataframe containing the fractions
        fracs = pd.DataFrame.from_dict(sequelae_fractions(acause),
                                       orient='index')
        fracs['acause'] = acause
        fracs = fracs[~fracs['me_tag'].eq("procedure_sequelae")]
        # Merge dataframe with proportions to expand. assign() returns a new
        #   frame, so the caller's proportions do not gain an 'acause' column
        props = proportions.assign(acause=acause).merge(fracs)
        # Adjust proportions by me
        props[draw_cols] = props[draw_cols].multiply(props['fraction'],
                                                     axis='index')
        del props['acause']
    else:
        # All procedures are assigned to a single adjusted phase
        props = proportions.copy()
        props['me_tag'] = "adjusted_controlled_phase_a"
    # Apply proportions to estimates
    #   Note: may drop some data if proportions are only for estimation years
    mrg_df = mrg_df.merge(props, on=mrg_cols, how='inner')
    adj_df = mrg_df[uid_cols].copy()
    # The product is indexed like mrg_df so each row lands on its own uid
    adj_df[type_cols] = pd.DataFrame(
        mrg_df[type_cols].values * mrg_df[draw_cols].values,
        index=mrg_df.index, columns=type_cols).fillna(0)
    assert not adj_df.isnull().any().any(
    ), "Error calculating procedure proportions"
    # For prevalence, append the adjusted data to the rest of the estimates
    if metric_name == 'prevalence':
        sq_df = dft.collapse(adj_df, mrg_cols,
                             combine_cols=type_cols).sort_values(mrg_cols)
        cntrl_df = df.loc[df['me_tag'].eq("controlled_phase"), :].merge(
            mrg_df[mrg_cols].drop_duplicates(), on=mrg_cols,
            how='inner').sort_values(mrg_cols)
        nosq_df = cntrl_df[mrg_cols].copy()
        # Both frames are sorted by mrg_cols, so the subtraction pairs rows by
        #   position (assumes one row per uid in each -- TODO confirm).
        #   The result is given cntrl_df's index because sort_values leaves
        #   that index permuted; a default 0..n-1 index would be realigned
        #   by label on assignment and attach values to the wrong uids
        nosq_df[type_cols] = pd.DataFrame(
            cntrl_df[type_cols].values - sq_df[type_cols].values,
            index=cntrl_df.index, columns=type_cols)
        nosq_df['me_tag'] = "adjusted_controlled_phase"
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        adj_df = pd.concat([adj_df, nosq_df])
        output_data = pd.concat([df, adj_df])
    # Incidence of cancers with the procedure is estimated elsewhere, so there
    #      is no need to preserve the unadjusted data
    else:
        output_data = adj_df
    return (output_data[uid_cols + type_cols])