def project_data():
    ''' Runs pipeline to project previously-combined incidence data
        Requires incidence data that are unique by location_id-year-sex-age-acause
    '''
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    df = pd.read_csv(input_file)
    df = project_to_special_locations(df)
    df = project_ihme_location_estimates(df)
    df.to_csv(output_file, index=False)
    print("incidence data projected")
Example #2
def get_age_frmat_map(frmat_type):
    ''' Loads map with indicators for which age groups need splitting
    '''
    if frmat_type == "im_frmat":
        resource = pd.read_csv(
            utils.get_path('im_frmat_map', process="mi_dataset"))
    elif frmat_type == "frmat":
        resource = pd.read_csv(
            utils.get_path('frmat_map', process="mi_dataset"))
    else:
        raise ValueError("unrecognized frmat_type: {}".format(frmat_type))
    resource = md.stdz_col_formats(
        resource, additional_float_stubs=['age_specific', 'age_split'])
    return (resource)
def run():
    ''' Runs pipeline to combine previously-selected incidence data
        Requires incidence data that are unique by location_id-year-sex-age-acause
    '''
    output_file = utils.get_path("combined_incidence", process="cod_mortality")
    input_file = utils.get_path("prepped_incidence", process="cod_mortality")
    utils.ensure_dir(output_file)
    df = pd.read_csv(input_file)
    print("Combining data to one single entry per uid...")
    df = combine_incidence(df)
    df.to_csv(output_file, index=False)
    return (df)
Example #4
def project_incidence():
    ''' For each IHME location_id, projects estimates based on the input cancer
        rates
        Includes generation of national estimates from subnational estimates
        where national estimates are not present (note: in CoD, such estimates
        are used for validation only)
    '''
    print("   projecting data to ihme demographic specifications...")
    output_file = utils.get_path("projected_incidence",
                                 process="cod_mortality")
    input_file = utils.get_path("combined_incidence", process="cod_mortality")
    pop_uids = [c for c in get_uid_columns() if c != 'acause']
    df = pd.read_csv(input_file)
    # define subset that can be projected to the IHME population
    df = modeled_locations.add_subnational_status(df)
    df = supplement_national_estimates(df)
    # Ensure validity of sdi_quintile
    df = modeled_locations.add_sdi_quintile(df, delete_existing=True)
    # Calculate rate of input
    df.loc[:, 'rate'] = df['cases'] / df['pop']
    df['registry_pop'] = df['pop']
    # Mark data to be projected
    project_to_ihme = (df['sdi_quintile'].eq(5))
    df_sdi5 = df.loc[project_to_ihme, :].copy()
    df_other = df.loc[~project_to_ihme, :].copy()
    # Add IHME population to applicable uids
    del df_sdi5['pop']
    ihme_pop = load_ihme_pop(df.loc[project_to_ihme, 'location_id'].unique())
    df_sdi5 = df_sdi5.merge(ihme_pop)
    # Homogenize population by group where not applying IHME populations
    df_other = staging_functions.homogenize_pop(df_other, uid_cols=pop_uids)
    output = pd.concat([df_other, df_sdi5])
    # reindex with a unique row index so that the series multiplication
    #   below aligns row-for-row
    output['index'] = np.arange(len(output))
    output = output.set_index('index')
    # Broadcast rates to the final population estimate for all locations
    output.loc[(output['pop'].notnull()) & (output['rate'].notnull()),
               'cases'] = output['rate'] * output['pop']
    # Drop registry-specific tags
    output = output.drop(['national_registry', 'full_coverage',
                          'is_subnational', 'registry_pop'],
                         axis=1, errors='ignore')
    assert not output.duplicated(get_uid_columns()).any(), \
        "Duplicates exist after projection"
    assert not output['pop'].isnull().any(), "Missing population data"
    assert len(output) == len(df), "Error during estimate projection"
    output.to_csv(output_file, index=False)
    print("   data projected.")
    return (output)
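The reindexing step above matters because pandas aligns Series arithmetic on the index rather than on position, and after concatenating the two subsets the row labels repeat. A minimal sketch of the same pattern on toy data (the column names mirror the function above; the values are illustrative):

import numpy as np
import pandas as pd

# Two subsets with overlapping default indexes, as after the sdi5/other split
df_a = pd.DataFrame({'rate': [0.1, 0.2], 'pop': [1000, 2000]})
df_b = pd.DataFrame({'rate': [0.3], 'pop': [500]})
output = pd.concat([df_a, df_b])  # index is now 0, 1, 0

# Rebuild a unique index so the Series product below aligns row-for-row
output.index = np.arange(len(output))
has_inputs = output['pop'].notnull() & output['rate'].notnull()
output.loc[has_inputs, 'cases'] = output['rate'] * output['pop']
print(output)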
def run():
    ''' Runs merge between incidence and MIR estimates to generate mortality
            estimate output
    '''
    output_file = utils.get_path("mortality_estimates", process='cod_mortality')
    input_file = utils.get_path("projected_incidence", process="cod_mortality")
    df = pd.read_csv(input_file)
    df = calculate_mortality(df)
    df = apply_recode(df)
    # Validate and save
    df.to_csv(output_file, index=False)
    print("    deaths calculated and recoded.")
    return(df)
def load_mi_estimates():
    ''' Returns the compiled MIR model results, formatted for use with
            incidence selected for the CoD upload
    '''
    print("    formatting mir estimates...")
    uid_columns = get_uid_columns()
    required_columns = uid_columns + ['mi_ratio']
    input_file = utils.get_path("compiled_mir_outputs",  process="cancer_model")
    # load and subset data
    df = pd.read_csv(input_file)
    df.rename(columns={'sex':'sex_id', 'year':'year_id'}, inplace=True)
    df = df.loc[:, required_columns]
    # add extended age groups
    extended_ages = [30, 31, 32, 235]
    if not any(a in df['age_group_id'].unique() for a in extended_ages):
        eightyplus = df.loc[df['age_group_id'] == 21, :].copy()
        for a in extended_ages:
            eightyplus.loc[:,'age_group_id'] = a
            df = pd.concat([df, eightyplus], ignore_index=True)
    df = df.loc[df['age_group_id'] != 21, :]
    return(df)
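The extended-age block above fills the detailed terminal age groups by copying the aggregate '80+' rows (age_group_id 21, relabeled to the GBD ids 30, 31, 32, and 235) before dropping the aggregate. A standalone sketch of that copy-and-relabel pattern with toy values:

import pandas as pd

df = pd.DataFrame({'age_group_id': [10, 21], 'mi_ratio': [0.4, 0.9]})
extended_ages = [30, 31, 32, 235]

if not any(a in df['age_group_id'].unique() for a in extended_ages):
    eightyplus = df.loc[df['age_group_id'] == 21].copy()
    copies = []
    for a in extended_ages:
        relabeled = eightyplus.copy()
        relabeled['age_group_id'] = a
        copies.append(relabeled)
    df = pd.concat([df] + copies, ignore_index=True)
df = df.loc[df['age_group_id'] != 21]
print(df)  # age 21 replaced by four rows carrying the same mi_ratio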
def run():
    ''' Finalizes data for CoD prep, then runs CoD prep's format code
    '''
    finalized_file = utils.get_path("formatted_CoD_data",
                                    process="cod_mortality")
    CoD_format_script = utils.get_path("finalize_CoD_input",
                                       process="cod_mortality")
    input_file = utils.get_path("mortality_estimates", process="cod_mortality")
    df = pd.read_csv(input_file)
    # Ensure that there is a single entry by uid (data will not be collapsed
    #   after this point)
    assert not df.duplicated(get_uid_cols()).any(), \
        "Duplicate values present at input"
    df = refine_by_cc_code(df)
    df = add_subdiv(df)
    df = add_CoD_variables(df)
    df = format_CoD_variables(df)
    df = test_output(df)
    df.to_csv(finalized_file)
    pydo.do_stata(CoD_format_script, arg_list=None)
    return (df)
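Both this function and the projection step rely on the same uniqueness idiom: duplicated(uid_cols) marks the second and later occurrences of each uid combination, so its any() is True exactly when the uids are not unique. For example:

import pandas as pd

df = pd.DataFrame({'location_id': [1, 1, 2],
                   'year_id': [2010, 2010, 2010],
                   'deaths': [5, 7, 3]})
uid_cols = ['location_id', 'year_id']
print(df.duplicated(uid_cols).any())  # True: rows 0 and 1 share a uid
# Collapsing to one entry per uid removes the duplication
print(df.groupby(uid_cols, as_index=False)['deaths'].sum())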
Example #8
def submit_rdp(input_data, this_dataset, is_resubmission):
    ''' Returns full dataset after redistribution.
        Separates data by submission requirement before submitting rdp for
        only those data that require it
    '''
    def submission_requirement(df, uid):
        return needs_rdp(df[df['uid'] == uid], this_dataset)

    def output_file_function(id):
        return get_rdp_file(this_dataset,
                            which_file='split_output',
                            splitNum=id[2])

    # create a list of the uids that require redistribution and set aside a
    #   dataframe of the uids that do not require redistribution
    rdp_code_location = utils.get_path("redistribution",
                                       base="code_repo",
                                       process="mi_dataset")
    worker_script = rdp_code_location + "/rdp_worker.py"
    output_uids = md.get_uid_cols(7)
    header = "cncRDP_{}_{}".format(this_dataset.dataset_id,
                                   this_dataset.data_type_id)
    rdp_input_file = get_rdp_file(this_dataset, which_file='rdp_input')
    prepped_df = prep_input(input_data, this_dataset)
    submitted_data, unsubmitted_data = cup.split_submission_data(
        prepped_df, 'uid', submission_requirement, rdp_input_file)
    uid_list = submitted_data['uid'].unique().tolist()
    rdp_job_dict = cup.generate_prep_workers(worker_script,
                                             list_of_uids=uid_list,
                                             ds_instance=this_dataset,
                                             job_header=header,
                                             is_resubmission=is_resubmission,
                                             pace_interval=0.05)
    output_files = cup.get_results(rdp_job_dict,
                                   output_file_function,
                                   parent_process_name="rdp",
                                   noisy_checker=is_resubmission,
                                   add_resubmission_argument=is_resubmission,
                                   wait_time=5)
    # Re-combine compiled results with the set-aside data, before collapsing
    #   and testing
    final_results = pe.append_files(output_files)
    final_results = pd.concat([final_results, unsubmitted_data])
    # Re-set all 'under 5' data, then collapse to combine it with any existing
    #       'under 5' data
    final_results.loc[final_results['age'].lt(7) |
                      (final_results['age'].gt(90)
                       & final_results['age'].lt(95)), 'age'] = 2
    final_results = dft.collapse(final_results,
                                 by_cols=output_uids,
                                 combine_cols=this_dataset.metric)
    return (final_results)
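The split-and-recombine pattern above keeps the cluster from processing uids that do not need redistribution. A minimal sketch of the partition step, with a hypothetical stand-in for cup.split_submission_data (the real cup helper also takes an output path for the submitted data; this stub only partitions):

import pandas as pd

def split_data(df, uid_col, needs_processing):
    ''' Hypothetical stand-in: partition rows into those whose uid needs
        processing and those that can be set aside untouched '''
    submit_uids = [u for u in df[uid_col].unique()
                   if needs_processing(df, u)]
    submit = df[uid_col].isin(submit_uids)
    return df[submit].copy(), df[~submit].copy()

def needs_rdp_stub(d, u):
    # e.g. a uid needs redistribution if it contains a garbage code
    return d.loc[d['uid'] == u, 'cause'].eq('C80').any()

df = pd.DataFrame({'uid': [1, 1, 2],
                   'cause': ['C80', 'C34', 'C50'],
                   'cases': [3, 5, 2]})
submitted, set_aside = split_data(df, 'uid', needs_rdp_stub)
print(submitted['uid'].unique())  # [1]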
Example #9
def load_package_set(df):
    ''' loads the package set linked to the coding system
    '''
    code_version = df.coding_system.unique()[0]
    params_dir = utils.get_path('mi_dataset_resources', process="mi_dataset")
    package_path = params_dir + '/redistribution/packagesets_{}.dta'.format(
        code_version)
    package = pd.read_stata(package_path)
    assert len(package) == 1, "Incorrect number of source labels in "\
        "packagesets_{}. Expected 1, got {}. Redistribution failed."\
        .format(code_version, len(package))
    return (package.package_set_id.unique()[0])
def generate_code_index(input_codes):
    ''' Returns an index of all possible ICD10 codes with attached number 
            indicating order of appearance and tag for viability. "Viable" tag
            indicates either an official code or an unofficial code that exists 
            in the data
    '''
    if not isinstance(input_codes, tuple):
        input_codes = tuple(input_codes)
    # Import list of ICD10 codes and define code index
    code_list_path = (utils.get_path('mi_input') + "/_resources/" +
        "subtotal_recalculation/list_of_official_NUMERIC_ICD10_codes.csv")
    ICD10_code_list = pd.read_csv(code_list_path)
    ICD10_code_list.sort_values(by=['ICD10_code'], inplace=True)
    ICD10_code_list = tuple(ICD10_code_list['ICD10_code'])
    ICD10_code_index = {}
    order_num = 1
    for k in ['C', 'D']:
        under_10_alternate = ['00', '01', '02',
                              '03', '04', '05', '06', '07', '08', '09']

        for o in under_10_alternate + list(range(10, 100)):
            kode = '{}{}'.format(k, o)
            ICD10_code_index[kode] = {}
            ICD10_code_index[kode]['order'] = order_num
            if kode in ICD10_code_list or kode in input_codes:
                ICD10_code_index[kode]['viable'] = True
            else:
                ICD10_code_index[kode]['viable'] = False
            order_num += 1

            for d in range(0, 10):
                kode = '{}{}.{}'.format(k, o, d)
                ICD10_code_index[kode] = {}
                ICD10_code_index[kode]['order'] = order_num
                if kode in ICD10_code_list or kode in input_codes:
                    ICD10_code_index[kode]['viable'] = True
                else:
                    ICD10_code_index[kode]['viable'] = False
                order_num += 1

                for e in range(0, 10):
                    kode = '{}{}.{}{}'.format(k, o, d, e)
                    ICD10_code_index[kode] = {}
                    ICD10_code_index[kode]['order'] = order_num
                    if kode in ICD10_code_list or kode in input_codes:
                        ICD10_code_index[kode]['viable'] = True
                    else:
                        ICD10_code_index[kode]['viable'] = False
                    order_num += 1
    return(ICD10_code_index)
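The returned dictionary maps every generated code string ('C00' through 'D99.99', at three nesting levels) to its order of appearance and a viability flag, so downstream lookups are plain dict accesses. An illustrative slice of the structure (the order values here are made up):

code_index = {
    'C34':    {'order': 241, 'viable': True},   # official three-digit code
    'C34.1':  {'order': 243, 'viable': True},   # unofficial, but in the data
    'C34.12': {'order': 245, 'viable': False},  # neither official nor observed
}
viable_decimals = [c for c in code_index
                   if c.startswith('C34.') and code_index[c]['viable']]
print(viable_decimals)  # ['C34.1']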
Example #11
def run(input_data, PACKAGE_MAP, TEMP_FOLDER):
    ''' Manages formatting and redistribution for the input data, then saves
            the output
    '''
    if int(input_data['freq'].sum()) == 0:
        print("Data sums to zero.")
        return (input_data)

    # Define the output columns and the uids used to calculate proportions
    output_cols = [
        'registry_index', 'year_start', 'year_end', 'year_id', 'sex',
        'coding_system', 'split_group', 'age', 'cause', 'freq'
    ]
    proportion_uids = [
        'dev_status', 'super_region', 'region', 'subnational_level1',
        'subnational_level2', 'country', 'location_id', 'registry_index',
        'year_start', 'year_end', 'year_id', 'sex', 'coding_system',
        'split_group', 'age'
    ]
    residual_cause = 'ZZZ'
    resources_dir = utils.get_path("mi_dataset_resources",
                                   process="mi_dataset")
    package_folder = '{}/redistribution/{}'.format(resources_dir, PACKAGE_MAP)
    cause_map_file = package_folder + '/cause_map.csv'
    cause_map = pd.read_csv(cause_map_file)

    packages = get_packages(input_data, package_folder, cause_map)
    if len(packages) == 0:
        print("No packages available with which to redistribute this data.")
        return (input_data)

    start_time = time.time()
    prepped_data, proportion_metadata = prep_data(
        input_data, proportion_uids, residual_cause=residual_cause)
    evaluated_cause_map = evaluate_cause_restrictions(cause_map,
                                                      proportion_metadata,
                                                      proportion_uids)
    result, diagnostics = apply_packages(prepped_data,
                                         proportion_metadata,
                                         evaluated_cause_map,
                                         packages,
                                         residual_cause=residual_cause)
    output_data = result.merge(proportion_metadata, on='proportion_id')
    output_data = output_data.loc[:, output_cols]
    output_data['freq'] = output_data['freq'].fillna(0)
    diff = output_data['freq'].sum() - input_data['freq'].sum()
    assert abs(diff) < 1, "Difference from input after rdp is too large"
    return (output_data)
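Conceptually, apply_packages moves the garbage-coded freq onto real causes within each proportion group, which is why the function can assert afterward that totals are conserved. A much-simplified sketch of one such reallocation, assuming the residual cause 'ZZZ' is redistributed proportionally to the observed causes (the real packages carry their own weights and cause restrictions):

import pandas as pd

df = pd.DataFrame({'proportion_id': [1, 1, 1],
                   'cause': ['C18', 'C34', 'ZZZ'],
                   'freq': [30.0, 10.0, 8.0]})

def redistribute_residual(group, residual_cause='ZZZ'):
    # Reallocate the residual row's freq proportionally to observed causes
    residual = group.loc[group['cause'] == residual_cause, 'freq'].sum()
    kept = group.loc[group['cause'] != residual_cause].copy()
    kept['freq'] += residual * kept['freq'] / kept['freq'].sum()
    return kept

result = df.groupby('proportion_id', group_keys=False).apply(
    redistribute_residual)
assert abs(result['freq'].sum() - df['freq'].sum()) < 1  # totals conserved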
Example #12
def load_surv_folder(cnf_model_run_id):
    ''' Using the rel_survival_version_id attached to the run record, returns
            the dated relative-survival folder for that version
    '''
    surv_folder = utils.get_path("relative_survival",
                                 process="nonfatal_model")
    record = nd.get_run_record(cnf_model_run_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version',
                                     uniq_col='rel_survival_version_id',
                                     val=rs_version)
    suffix = str(this_version.at[0, 'date_updated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return (rs_folder)
Example #13
def load_lambda_file(cnf_model_run_id):
    ''' Using the cnf_lambda_version_id attached to the run record, returns
            the lambda file path dated to that version
    '''
    lambda_file_default = utils.get_path("lambda_values",
                                         process="nonfatal_model")
    record = nd.get_run_record(cnf_model_run_id)
    lambda_version = record.at[0, 'cnf_lambda_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='cnf_lambda_version',
                                     uniq_col='cnf_lambda_version_id',
                                     val=lambda_version)
    suffix = str(this_version.at[0, 'date_updated'])
    lambda_file = lambda_file_default.replace("<date>", suffix)
    return (lambda_file)
def generate_estimates(acause,
                       location_id,
                       cnf_model_run_id,
                       is_resubmission=False):
    ''' Runs a subprocess that passes all arguments to the R script that 
            calculates incidence draws
    '''
    r_script = utils.get_path("calculate_incidence", process="nonfatal_model")
    cmd = "bash {shl} {scr} {ac} {l} {id} {rs}".format(
        shl=utils.get_cluster_setting("r_shell"),
        scr=r_script,
        ac=acause,
        l=location_id,
        id=cnf_model_run_id,
        rs=int(is_resubmission))
    print(cmd)
    subprocess.call(cmd, shell=True)
    return (True)
Example #15
def update_repness(df):
    ''' Returns dataframe with updated representativeness status, superseding
            the value with one from the override file (if present)
    '''
    repres_update_file = utils.get_path("representativeness_override",
                                        process="staging")
    repres_update = pd.read_csv(repres_update_file)
    repres_update = repres_update[['country_id', 'grouping', 'representative']]
    repres_update.rename(columns={'representative': 'update_rep'},
                         inplace=True)
    df = modeled_locations.add_country_id(df)
    df.loc[df['location_id'] == df['country_id'], 'grouping'] = 'national'
    df.loc[df['grouping'] != 'national', 'grouping'] = 'subnational'
    df = df.merge(repres_update, how='left')
    df.loc[df['update_rep'].notnull(), 'representative'] = df['update_rep']
    df.loc[df['representative'].isnull(), 'representative'] = 0
    df = df.drop(['update_rep', 'grouping'], axis=1)
    return (df)
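The merge-then-prefer-update pattern above generalizes to any override table. A minimal sketch with toy values (the real override file is keyed on country_id and grouping):

import pandas as pd

df = pd.DataFrame({'country_id': [6, 62],
                   'grouping': ['national', 'national'],
                   'representative': [1, 1]})
override = pd.DataFrame({'country_id': [62],
                         'grouping': ['national'],
                         'update_rep': [0]})

df = df.merge(override, how='left')  # joins on the shared key columns
has_update = df['update_rep'].notnull()
df.loc[has_update, 'representative'] = df.loc[has_update, 'update_rep']
df = df.drop(columns=['update_rep'])
print(df)  # country 62 now carries the overridden value 0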
Example #16
def manage_split(source_cid, target_cids, proportion_meids, work_dir,
                 description):
    ''' Manages the split of the source_cid followed by saving of the targets, 
            then returns boolean indication of success
    '''
    utils.ensure_dir(work_dir)
    # split model
    df = split_cod_model(source_cause_id=source_cid,
                         target_cause_ids=target_cids,
                         target_meids=proportion_meids,
                         output_dir=work_dir)
    print("Split data saved to " + work_dir + " at " +
          utils.display_timestamp())

    # Generate a list of arguments (one for each child me)
    save_args_template = "--target {targ} --desc {desc} --indir {dir}"
    save_arg_list = []
    for t in target_cids:
        save_arg_list += [
            save_args_template.format(targ=t, desc=description, dir=work_dir)
        ]
    # Start jobs
    header = description.replace(" ", "_")
    save_worker = utils.get_path("save_cod_worker", process="cancer_model")
    job_dict = cluster_tools.create_jobs(script_path=save_worker,
                                         job_header=header,
                                         memory_request=50,
                                         id_list=target_cids,
                                         script_args=save_arg_list,
                                         use_argparse=True,
                                         project_name="cancer")
    for i in job_dict:
        job_dict[i]['job'].launch()

    # Check for results
    job_descrip = description + " upload"
    success_df = cluster_tools.wait_for_results(job_dict,
                                                jobs_description=job_descrip,
                                                noisy_checker=False,
                                                max_minutes=30)
    success = cluster_tools.validate_success(success_df, description)
    return (success)
Example #17
def split_liver():
    ''' Submits the liver-cancer-specific information to the split manager
    '''
    # set source and targets
    source_cid = 417  # parent cause_id
    target_cids = [996, 418, 419, 420, 421]  # cause_ids
    proportion_meids = [18763, 2470, 2471, 2472, 2473]  # proportion me_ids
    years = list(range(1980, int(utils.get_gbd_parameter("max_year")) + 1))
    description = "lvr_cncr_split"
    liver_model_path = utils.get_path('cod_liver_splits',
                                      process='cancer_model')
    work_dir = "{}/{}".format(liver_model_path, utils.display_timestamp())
    # Run split
    print(utils.display_timestamp())
    success = manage_split(source_cid, target_cids, proportion_meids, work_dir,
                           description)
    if success:
        print("All CoD liver splits uploaded. " + utils.display_timestamp())
    else:
        print("Error during CoD splits for liver cancer")
Example #18
def add_location_hierarchy_info(df):
    ''' Returns the dataframe (df) with added location information: region,
            super_region, subnational_status, etc.
        Stops RDP if there is a problem with the location information
    '''
    print("    Adding location information.")
    input_len = len(df)
    # Reformat/convert variables and ages
    loc_info_dir = utils.get_path('mi_dataset_resources', process="mi_dataset")
    loc_info_path = loc_info_dir + '/redistribution/location_hierarchy.dta'
    location_hierarchy = pd.read_stata(loc_info_path)
    location_hierarchy = location_hierarchy[[
        'location_id', 'dev_status', 'super_region', 'region', 'country',
        'subnational_level1', 'subnational_level2'
    ]]
    df = df.merge(location_hierarchy, how='left', on='location_id')
    assert not df.location_id.isnull().any(), \
        "Cannot redistribute. Unmapped location ids present."
    assert len(df) == input_len, \
        "ERROR: data lost while adding location metadata"
    return (df)
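A left merge like the one above produces NaNs rather than errors for unmapped keys. One way to surface those rows explicitly is the merge indicator, sketched here on toy frames (the real hierarchy comes from location_hierarchy.dta):

import pandas as pd

data = pd.DataFrame({'location_id': [101, 999], 'cases': [4, 2]})
hierarchy = pd.DataFrame({'location_id': [101], 'region': ['High-income']})

merged = data.merge(hierarchy, how='left', on='location_id', indicator=True)
unmapped = merged.loc[merged['_merge'] == 'left_only', 'location_id']
print(unmapped.tolist())  # [999]: ids missing from the hierarchy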
def run(dataset_id, data_type_id, uid):
    ''' Preps data for recalculation then recalculates as necessary
    '''
    this_dataset = md.MI_Dataset(dataset_id, 2, data_type_id)
    dataset_name = this_dataset.name
    metric = this_dataset.metric
    input_file = run_sr.get_sr_file(this_dataset, "sr_input")
    # Exit if output already exists
    output_file = run_sr.get_sr_file(this_dataset, 'split_output', uid)
    print(output_file)
    if os.path.exists(output_file):
        print("     output file found for uid " + str(uid))
        return(None)
    # Define locations for error logs written during recalculation
    negative_data_ok = is_exception(dataset_id, data_type_id)
    error_folder = utils.get_path("mi_input", base='j_temp')
    subcause_issue_file = '{}/subcause_issue/{}_{}_uid_{}.txt'.format(
        error_folder, dataset_name, data_type_id, uid)
    exception_file = '{}/negative_data/{}_{}_uid_{}.csv'.format(
        error_folder, dataset_name, data_type_id, uid)
    for d in [subcause_issue_file, exception_file, error_folder]:
        utils.ensure_dir(d)
    print("    removing subtotals from uid {}...".format(uid))
    # add data for the given uid
    df = pd.read_hdf(input_file, 'split_{}'.format(uid))
    # Create a list of possible codes so that decimal subcauses are only added 
    #   if available
    input_cause_list = sorted(df['orig_cause'].unique().tolist())
    # create a dictionary for codes in the selected uid and attach the uid's 
    #   data
    uid_subset = {}
    input_data = {}
    # process decimals first and ranges last to ensure that nested causes are 
    #   removed
    for c in sorted(df['orig_cause'].unique().tolist()):
        uid_subset[c] = {}
        input_data[c] = {}
        uid_subset[c]['codes'] = []
        uid_subset[c]['subcodes'] = []
        if "-" not in c and "," not in c:
            uid_subset[c]['codes'].append(c)
            # add subcodes to 'subcode' key
            for subcode in sorted(df.loc[df['orig_cause'].eq(c), 'cause']
                                  .dropna().unique().tolist()):
                if subcode != c:
                    uid_subset[c]['subcodes'].append(subcode)
            # if none of the subcodes appear in the list, set the cause as a 
            #   subcode of itself (prevents the addition of unused decimal 
            #   causes)
            if not len(uid_subset[c]['subcodes']):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
            elif (not any('{}.'.format(sub[:3]) in check 
                    for check in input_cause_list 
                        for sub in uid_subset[c]['subcodes'])):
                uid_subset[c]['subcodes'] = uid_subset[c]['codes']
        else:
            for code in sorted(df['cause'].where(
                df['orig_cause'].eq(c)).dropna().unique().tolist()):
                uid_subset[c]['codes'].append(code)
                uid_subset[c]['subcodes'].append(code)
        # create other lists associated with the cause and add the metric data
        uid_subset[c]['subcauses_remaining'] = []
        uid_subset[c]['codes_removed'] = []
        uid_subset[c]['causes_removed'] = []
        uid_subset[c]['data'] = df.loc[df['cause'].eq(c),
                                        ['age', metric]].set_index('age')
        input_data[c]['data'] = uid_subset[c]['data']
        input_data[c]['codes'] = uid_subset[c]['codes']
    # Determine subcauses and highest number of causes remaining (how many 
    #   subcauses are contained within each cause)
    uid_set = set_subcauses(uid_subset, subcause_issue_file)
    highest_level = determine_highest_level(uid_set)
    # remove lowest level codes from parent causes
    if highest_level == 0:
        print('     no subcauses present.')
    else:
        subcauses_removed = True
        while subcauses_removed:
            uid_set, subcauses_removed = remove_subcauses(
                uid_set, uid, exception_file)
            # remove duplicates
            uid_set = remove_duplicates(uid_set)
            # re-set subcauses and num_subcause_remaining
            uid_set, highest_level = set_subcauses(
                uid_set, subcause_issue_file,)
            print("     subcauses removed.")
    # Prepare Output
    print("saving output...")
    output = pd.DataFrame(
        columns=['cause', 'codes_remaining', 'codes_removed', 'age', metric])
    for c in uid_set:
        # format cause information
        cause_data = pd.DataFrame(
            columns=['cause', 'codes_remaining', 'codes_removed'])
        cause_data.loc[0, ['cause']] = c
        # if nothing was removed, or there was only a single cause, or all of 
        #   the input codes are still present, set the codes remaining as the 
        #   cause
        if (not len(uid_set[c]['codes_removed']) or 
            ("-" not in c and "," not in c) or 
            set(input_data[c]['codes']) <= set(uid_set[c]['codes'])):
            cause_data.loc[0, ['codes_remaining']] = c
        else:
            cause_data.loc[0, ['codes_remaining']] = ','.join(
                convert_to_range(uid_set[c]['codes']))
        cause_data.loc[0, ['codes_removed']] = ','.join(
            convert_to_range(uid_set[c]['codes_removed']))
        # format output data
        output_data = uid_set[c]['data']
        output_data['age'] = output_data.index
        output_data['cause'] = c
        orig_data = input_data[c]['data']
        orig_data['age'] = orig_data.index
        orig_data = orig_data.rename(
            columns={metric: 'orig_metric_value'})
        orig_data['cause'] = c
        # combine and add to output
        final = pd.merge(output_data, cause_data, on='cause')
        final = pd.merge(final, orig_data, on=['cause', 'age'])
        output = pd.concat([output, final], ignore_index=True)
    # Create output dataset
    output['uniqid'] = uid
    # Update encoding (bug fix to work around pandas to_stata issue)
    output = md.stdz_col_formats(output, additional_float_stubs='uniqid')
    # Export results
    output.to_csv(output_file, index=False)
    print('\n Done!')
    time.sleep(1)
    return(None)
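Stripped of the range handling, nesting, and exception logging, the core of the recalculation above is one subtraction: remove the known subcause counts from the parent's subtotal so the parent keeps only its residual. A heavily simplified sketch of that single step:

import pandas as pd

df = pd.DataFrame({'cause': ['C10', 'C10.1', 'C10.2'],
                   'age': [10, 10, 10],
                   'cases': [20.0, 6.0, 5.0]})

parent, subcodes = 'C10', ['C10.1', 'C10.2']
by_age = df.set_index(['cause', 'age'])['cases']
residual = by_age.loc[parent] - sum(by_age.loc[s] for s in subcodes)
df.loc[df['cause'] == parent, 'cases'] = residual.values
print(df)  # C10 keeps the 9.0 cases not already coded to a subcause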