Example #1
def add_required_columns(df, uid_cols, data_version_id, db):
    ''' Adds the required columns for upload
    '''
    print('adding required columns...')
    final_df = df.copy()
    final_df['data_version_id'] = data_version_id

    # rename column names
    final_df.rename(columns={'national': 'representative_id',
                             'sex': 'sex_id',
                             'year': 'year_id'},
                    inplace=True)

    # add temp cf columns
    cf_var_cols = ['cf_final_low_rd', 'cf_final_high_rd',
                   'cf_final_low_ss', 'cf_final_high_ss',
                   'cf_final_low_total', 'cf_final_high_total',
                   'variance_rd_logit_cf', 'variance_rd_log_dr']
    for col in cf_var_cols:
        final_df[col] = 0

    # add cause_id (these lookups are currently unused in this function)
    cancer_link = cdb.db_api()
    gbd_id = utils.get_gbd_parameter('current_gbd_round')

    # other columns that are constant 
    final_df['underlying_nid'] = np.nan
    final_df['source_id'] = 68 # cancer default 
    final_df['data_type_id'] = 2 # cancer registry

    # add site labels. upload to site table if new sources are present 
    final_df = map_site_id(final_df, db)
    return(final_df)
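A minimal sketch of the rename-and-add-constants pattern above, on toy data (pandas only; the values are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'national': [1, 0], 'sex': [1, 2], 'year': [2010, 2011]})
out = df.rename(columns={'national': 'representative_id',
                         'sex': 'sex_id',
                         'year': 'year_id'})
out['underlying_nid'] = np.nan  # constant columns, as in the function
out['source_id'] = 68           # cancer default
out['data_type_id'] = 2         # cancer registry
print(out.columns.tolist())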
Example #2
def add_coverage_metadata(df):
    ''' Returns the dataframe with a boolean indication of whether each
            registry covers its associated location_id
    '''
    merge_col = ['registry_index']
    metadata_cols = ['full_coverage', 'national_registry']
    assert all(c in df.columns for c in merge_col), \
        "add_coverage_metadata requires {} column(s)".format(merge_col)
    if (all(c in df.columns for c in metadata_cols) and
            df.loc[:, metadata_cols].notnull().all().all()):
        return(df)
    else:
        assert df.loc[:, merge_col].notnull().all().all(), \
            "add_coverage_metadata cannot accept nulls for {} column(s)".format(
                merge_col)
    input_len = len(df)
    df = df.drop(labels=metadata_cols, axis=1, errors='ignore')
    reg_df = cdb.db_api().get_table("registry")
    reg_df.rename(
        columns={'coverage_of_location_id': 'full_coverage'}, inplace=True)
    reg_df.loc[(reg_df['location_id'] == reg_df['country_id']) &
               reg_df['full_coverage'].isin([1]), 'national_registry'] = 1
    reg_df.loc[reg_df['national_registry'].isnull(), 'national_registry'] = 0
    df = df.merge(reg_df[merge_col + metadata_cols], how='left', on=merge_col)
    assert len(df) == input_len, "Data dropped while adding coverage metadata"
    return(df)
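A stand-in for the registry merge above, on toy data (the real metadata comes from the "registry" table): derive 'national_registry' and confirm the left merge preserves row count.

import pandas as pd

reg_df = pd.DataFrame({'registry_index': ['a', 'b'],
                       'location_id': [10, 11],
                       'country_id': [10, 99],
                       'full_coverage': [1, 1]})
reg_df.loc[(reg_df['location_id'] == reg_df['country_id']) &
           reg_df['full_coverage'].isin([1]), 'national_registry'] = 1
reg_df.loc[reg_df['national_registry'].isnull(), 'national_registry'] = 0

df = pd.DataFrame({'registry_index': ['a', 'b', 'a']})
merged = df.merge(reg_df[['registry_index', 'full_coverage',
                          'national_registry']], how='left',
                  on='registry_index')
assert len(merged) == len(df), "Data dropped while adding coverage metadata"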
Example #3
def get_me_info(parent_meid):
    ''' Returns the modelable_entity_ids (meids) of the "child" (component) MEs,
            meids of the proportions to generate those children MEs, and the 
            me_tag of the survival phase associated with the modelable entity
    '''
    # Load modelable_entity_ids from the cnf_model_entity_table.
    db_link = cdb.db_api("cancer_db")
    me_table = db_link.get_table('cnf_model_entity')
    parent_info = me_table.loc[
        me_table['modelable_entity_id'].eq(parent_meid), :].sort_values(
            'me_tag').copy().reset_index()
    parent_cause = parent_info['acause'].item()
    me_tag = parent_info['me_tag'].item()
    me_info = me_table[
        me_table['is_active'].eq(1)
        & me_table['acause'].str.startswith(parent_cause)
        & me_table['me_tag'].isin([me_tag, "subcause_proportion"])
        & me_table['cancer_model_type'].eq("split_custom_epi")].sort_values(
            ['acause', 'me_tag']).copy().reset_index()
    children_mes = me_info.loc[me_info['me_tag'].eq(me_tag),
                               'modelable_entity_id'].to_dict(into=OrderedDict)
    proportion_mes = me_info.loc[me_info['me_tag'].eq("subcause_proportion"),
                                 'modelable_entity_id'].to_dict(
                                     into=OrderedDict)
    return (list(children_mes.values()),
            list(proportion_mes.values()), me_tag)
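The to_dict(into=OrderedDict) calls above preserve the sorted row order of the meids; the pandas pattern in isolation:

from collections import OrderedDict
import pandas as pd

s = pd.Series([1901, 1902], name='modelable_entity_id')  # toy meids
ordered = s.to_dict(into=OrderedDict)
print(list(ordered.values()))  # [1901, 1902], in row order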
Example #4
def update_cancer_mortality_version(lsid): 
    ''' Updates cod_mortality_version table and inserts CoD's launch_set_id 
        for versioning purposes.
    '''
    # get max_id 
    db_link = cdb.db_api() 
    cod_df = db_link.get_table('cod_mortality_version')
    max_id = cod_df['cod_mortality_version_id'].max()
    
    # update Cancer's database with CoD's launch_set_id
    update_query = '''
    UPDATE cancer_db.{tbl}
    SET cod_launch_set_id = {l_id}
    WHERE cod_mortality_version_id={cod_id}
    '''.format(tbl='cod_mortality_version', l_id=int(lsid), cod_id=max_id)
    db_link.run_query(update_query)
    print("cancer's cod_mortality_version table updated!")
    return 
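How the UPDATE template above renders once formatted (the ids below are placeholders, not real version ids):

update_query = '''
UPDATE cancer_db.{tbl}
SET cod_launch_set_id = {l_id}
WHERE cod_mortality_version_id={cod_id}
'''.format(tbl='cod_mortality_version', l_id=123, cod_id=7)
print(update_query)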
Example #5
def is_exception(dataset_id, data_type_id):
    ''' Determines if dataset is flagged such that negative values are accepted
    '''
    db_link = cdb.db_api()
    tbl = db_link.get_table("prep_exception")
    # True if any row matches this dataset, data type, and exception status
    is_exception = (tbl['dataset_id'].eq(dataset_id)
                    & tbl['data_type_id'].eq(data_type_id)
                    & tbl['prep_exception_type_id'].eq(1)
                    & tbl['processing_status_id'].eq(2)).any()
    return (is_exception)
Example #6
def get_cod_description(): 
    ''' Pulls notes from cod_mortality_version table from the cancer database
    '''
    # grab description 
    db_link = cdb.db_api() 
    cod_df = db_link.get_table('cod_mortality_version')
    max_id = cod_df['cod_mortality_version_id'].max()
    desc = cod_df.loc[cod_df['cod_mortality_version_id'].eq(max_id),
                      'notes'].iloc[0]

    return (desc)
Example #7
def procedure_me_id(acause):
    ''' Returns the modelable_entity_id of the active 'procedure_proportion'
            modelable entity for the given acause, or None if none exists
    '''
    me_table = (cdb.db_api('cancer_db')).get_table('cnf_model_entity')
    me_id = me_table.loc[me_table['is_active'].eq(1)
                         & me_table['acause'].eq(acause)
                         & me_table['me_tag'].eq('procedure_proportion'),
                         'modelable_entity_id']
    if len(me_id) == 0:
        me_id = None
    else:
        me_id = me_id.item()
    return (me_id)
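The len-then-.item() guard above matters because pandas' Series.item() raises unless the Series holds exactly one element; in isolation:

import pandas as pd

me_id = pd.Series([], dtype='int64')  # no matching modelable entity
me_id = None if len(me_id) == 0 else me_id.item()
print(me_id)  # None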
Example #8
def query_data_version(database, table, launch_set_id):
    ''' Returns the max data_version_id recorded for the given launch_set_id
    '''
    query_string = '''
        SELECT MAX(data_version_id)
        FROM cod.{db_table}
        WHERE launch_set_id={lsid}
    '''.format(db_table=table,lsid=launch_set_id)
    conn = cdb.create_connection_string(database)
    max_id = cdb.execute_query(query_string, conn)
    print('DATA VERSION ID SET TO...{}'.format(max_id)) 
    return max_id
Example #9
def query_launch_set_id(db,tb): 
    ''' Looks up the current max launch_set_id in the given table for the
        current user
    '''
    query_string = '''
        SELECT MAX(launch_set_id)
        FROM cod.{db_table}
        WHERE username='{usr}'
    '''.format(db_table=tb, usr=getpass.getuser())
    conn = cdb.create_connection_string(db)
    max_id = cdb.execute_query(query_string, conn)
    print('LAUNCH SET ID SET TO...{}'.format(max_id)) 
    return max_id
Example #10
def get_sites(db): 
    ''' Fetch metadata for site labels 
    '''
    query_string = '''
        SELECT site_id, site_name
        FROM cod.site
    '''
    conn = cdb.create_connection_string(db)
    site_df = cdb.execute_query(query_string, conn)

    # ensure empty string instead of null values 
    site_df.loc[site_df['site_id'].eq(2) & site_df['site_name'].isnull(),
                'site_name'] = ''
    return site_df 
Example #11
def check_columns(df, table, db):
    ''' Checks that all required columns are present before upload
    '''
    cod_cols = load_db_cols(table, db)

    # remove id and update cols from the check
    cod_cols = list(cod_cols[1:])
    update_cols = get_update_settings()
    for col in update_cols:
        cod_cols.remove(col)
    assert set(cod_cols).difference(set(df.columns)) == set(), \
        "Missing columns in dataset: \n{}".format(
            set(cod_cols).difference(df.columns))
    return(None)
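The set-difference assertion used above, standalone on illustrative column names:

import pandas as pd

required = ['year_id', 'sex_id', 'cf']
df = pd.DataFrame(columns=['year_id', 'sex_id', 'cf', 'extra'])
missing = set(required).difference(set(df.columns))
assert missing == set(), "Missing columns in dataset: \n{}".format(missing)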
Example #12
def load_surv_folder(cnf_model_version_id):
    ''' Using the cnf_model_version_id, returns the relative-survival folder
            stamped with that version's generation date
    '''
    surv_folder = utils.get_path("relative_survival",
                                 process="nonfatal_model")
    record = nd.get_run_record(cnf_model_version_id)
    rs_version = record.at[0, 'rel_survival_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='rel_survival_version', 
                                uniq_col='rel_survival_version_id',
                                val=rs_version)
    suffix = str(this_version.at[0, 'date_generated'])
    rs_folder = surv_folder.replace("<date>", suffix)
    return(rs_folder)
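A sketch of the "<date>" substitution this function performs; the folder path and datestamp here are illustrative, not actual output of utils.get_path:

surv_folder = "<relative_survival folder>/<date>"
suffix = "2019_06_01"
rs_folder = surv_folder.replace("<date>", suffix)
print(rs_folder)  # "<relative_survival folder>/2019_06_01"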
Example #13
def load_lambda_file(cnf_model_version_id):
    ''' Using the cnf_model_version_id, returns the lambda-values file path
            stamped with that version's generation date
    '''
    lambda_file_default = utils.get_path(
        "lambda_values", process="nonfatal_model")
    record = nd.get_run_record(cnf_model_version_id)
    lambda_version = record.at[0, 'cnf_lambda_version_id']
    db_link = cdb.db_api()
    this_version = db_link.get_entry(table_name='cnf_lambda_version', 
                                uniq_col='cnf_lambda_version_id',
                                val=lambda_version)
    suffix = str(this_version.at[0, 'date_generated'])
    lambda_file = lambda_file_default.replace("<date>", suffix)
    return(lambda_file)
Example #14
def run_nonzero_floor():
    ''' Enforces cause-, age-, sex-, and year-specific minimums on non-zero
        cause fractions: cause fractions may be 0, but not arbitrarily small.
    '''
    print('running nonzero floor regression...')
    db_link = cdb.db_api()
    nr_df = pd.read_csv(utils.get_path(process='cod_mortality', 
                                    key='noise_reduced'))
    cause_df = db_link.get_table('cod_model_entity')
    cause_df = cause_df.loc[cause_df['is_active'].eq(1),['cause_id','acause']]
    cause_metadata = nr_df.merge(cause_df, how='left', on='acause')
    cause_hierarchy = get_cause_hierarchy()
    nonzero_floorer = ra.NonZeroFloorer(cause_metadata)
    df = nonzero_floorer.get_computed_dataframe(get_pop(), get_env(), cause_hierarchy)
    df.to_csv(<FILE PATH>)
    return 
Example #15
def get_current_mi_results(cod_mortality_version_id):
    ''' Returns the current/best compiled MI results
    '''
    db_link = cdb.db_api()
    mor_config = db_link.get_table('cod_mortality_version')

    # get mir_model_version_id for this mortality version
    mir_id = mor_config.loc[mor_config['cod_mortality_version_id'] ==
                            cod_mortality_version_id,
                            'mir_model_version_id'].iloc[-1]
    print('using mir model version id...{}'.format(mir_id))
    mi_path = utils.get_path(process='mir_model', key='compiled_mir_results')

    # replace suffix with model_version_id
    mi_path = mi_path.replace('<mir_model_version_id>', str(mir_id))
    compiled_mi_df = pd.read_csv(mi_path)
    return compiled_mi_df
Example #16
def update_data_version(database, table, launch_set_id):
    ''' Marks the data version for the given launch_set_id as active
        (sets status to '1')
    '''
    clear_prev_data_version_status(database, table)
    update_query = """
        UPDATE cod.{tbl}
        SET status='1'
        WHERE launch_set_id = {lsid}
    """.format(
        tbl=table,
        lsid=launch_set_id
    )
    db_link = cdb.db_api(database)
    db_link.run_query(update_query)

    print('updated data version table!')
    return
Example #17
def update_launch_set(database, table, launch_set_id):
    ''' Records the launch_set_end timestamp for the given launch_set_id
    '''
    launch_set_end = make_db_datestamp()
    update_query = """
        UPDATE cod.{tbl}
        SET launch_set_end = '{lse}'
        WHERE launch_set_id = {lsid}
    """.format(
        tbl=table, lse=launch_set_end,
        lsid=launch_set_id
    )
    db_link = cdb.db_api(database)
    db_link.run_query(update_query)

    print('updated launch_set_end to...{}'.format(launch_set_end))
    return
Example #18
def check_all_entries_uploaded(database, table, data_version_id):
    ''' Verifies that the number of rows uploaded for this data_version_id
        matches the number of rows in the upload file
    '''
    upload_file = pd.read_csv(_get_upload_path('data'))
    count_query = '''
        SELECT COUNT(data_version_id)
        FROM cod.{tbl}
        WHERE data_version_id={d_id}
    '''.format(tbl=table, d_id=data_version_id)
    conn = cdb.create_connection_string(database)
    data_count = cdb.execute_query(count_query, conn)
    if len(upload_file) == data_count:
        print("ALL DATA POINTS SUCCESSFULLY UPLOADED")
    else:
        print("ERROR: some data points were not uploaded. Please check which "
              "data points failed to upload.")
    return
Example #19
def _add_ihme_pop_marker(df):
    ''' Returns the dataframe with an added 'ihme_pop_ok' column indicating whether
        ihme population estimates may be merged with the uid
    '''
    if 'sdi_quintile' not in df.columns:
        df = modeled_locations.add_sdi_quintile(df)
    if 'full_coverage' not in df.columns:
        df = add_coverage_metadata(df)
    ds_df = cdb.db_api().get_table("dataset")
    df.loc[:, 'ihme_pop_ok'] = 0
    for dsid in df['dataset_id'].unique():
        pop_ok = ds_df.loc[ds_df['dataset_id'] ==
                           dsid, 'can_use_ihme_pop'].values[0]
        if pop_ok == 1:
            df.loc[df['dataset_id'] == dsid, 'ihme_pop_ok'] = pop_ok
    ihme_pop_ok = (df['sdi_quintile'].isin([5]) &
                   (df['full_coverage'].isin([1])))
    df.loc[ihme_pop_ok, 'ihme_pop_ok'] = 1
    return(df)
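A vectorized alternative to the per-dataset loop above, on toy data: map can_use_ihme_pop by dataset_id, then apply the sdi/full-coverage override. The column names mirror the function; the values are made up.

import pandas as pd

ds_df = pd.DataFrame({'dataset_id': [1, 2], 'can_use_ihme_pop': [1, 0]})
df = pd.DataFrame({'dataset_id': [1, 2, 2],
                   'sdi_quintile': [3, 5, 5],
                   'full_coverage': [0, 1, 0]})
pop_ok = ds_df.set_index('dataset_id')['can_use_ihme_pop']
df['ihme_pop_ok'] = df['dataset_id'].map(pop_ok).fillna(0).astype(int)
df.loc[df['sdi_quintile'].isin([5]) & df['full_coverage'].isin([1]),
       'ihme_pop_ok'] = 1
print(df)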
Example #20
def sequelae_fractions(acause):
    ''' Defines fractions from lit review to be used when splitting sequela
    '''
    # Set fractions of the population receiving treatment according to
    #   what is basically their primary disability (sequela)
    pros_incont_frac = 0.18  # pct. who primarily develop incontinence
    pros_impot_frac = 0.55  # pct. who primarily develop impotence
    # Define dict
    fractions = {
        'neo_prostate': {
            # Fractions used to calculate the controlled phase
            18781: {
                'fraction': pros_impot_frac
            },  # with impotence
            18782: {
                'fraction': pros_incont_frac
            },  # with incontinence
            # Fractions used to calculate the metrics of sequela beyond ten years
            18784: {
                'fraction': pros_impot_frac
            },
            18785: {
                'fraction': pros_incont_frac
            }
        }
    }
    # Add me_tags to dict (enables later linking of data to modelable_entity_id)
    me_tbl = cdb.db_api().get_table("cnf_model_entity")
    meids = list(fractions['neo_prostate'].keys())
    for me in meids:
        if me_tbl.loc[me_tbl['modelable_entity_id'].eq(me),
                      'is_active'].item() == 0:
            del fractions['neo_prostate'][me]
        else:
            tag = me_tbl.loc[me_tbl['modelable_entity_id'].eq(me),
                             'me_tag'].item()
            fractions['neo_prostate'][me]['me_tag'] = tag
    if acause in fractions:
        return (fractions[acause])
    else:
        return (False)
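A hedged sketch of how the returned fractions dict might be consumed downstream; only the meids come from the function above, while the me_tags and prevalence value are illustrative:

import pandas as pd

fractions = {18781: {'fraction': 0.55, 'me_tag': 'impotence'},
             18782: {'fraction': 0.18, 'me_tag': 'incontinence'}}
controlled = pd.DataFrame({'year_id': [2010], 'prevalence': [0.002]})
for meid, info in fractions.items():
    split = controlled.assign(
        modelable_entity_id=meid,
        prevalence=controlled['prevalence'] * info['fraction'])
    print(split)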
Example #21
def load_durations(acause):
    ''' Returns sequela durations for the given acause, mapping causes
        without their own entries to a proxy cause
    '''
    db_link = cdb.db_api('cancer_db')
    if acause.startswith("neo_liver_"):
        sequelae_cause = "neo_liver"
    elif acause == "neo_leukemia_other":
        sequelae_cause = "neo_leukemia_ll_chronic"
    elif acause == "neo_nmsc":
        sequelae_cause = "neo_nmsc_scc"
    elif acause == "neo_other_cancer":
        sequelae_cause = "neo_other"
    else:
        sequelae_cause = acause
    sq_df = db_link.get_table('sequelae_durations')
    this_sq = sq_df.loc[sq_df['acause'] == sequelae_cause, :].copy()
    this_sq.loc[:, 'acause'] = acause
    assert len(this_sq) > 0, "Error loading sequela durations"
    assert this_sq['sequela_duration'].notnull().all(), \
        "Error loading sequela durations: null values present"
    return (this_sq[['acause', 'me_tag', 'sequela_duration']])
Example #22
def add_representativeness(df):
    ''' Adds 'representative' integer indicating whether data are representative
            of their attached location_id
    '''
    def _avg_repness(regs, rep_table):
        ''' If all registries are representative, returns 1. Else returns 0
        '''
        try:
            if not isinstance(regs, (tuple, list)):
                try:
                    regs = list(literal_eval(regs))
                except Exception:
                    regs = list(regs)
            rep = rep_table.loc[rep_table['registry_index'].isin(regs),
                                'representative_of_location_id']
            if len(rep) == 0:
                return(0)
            else:
                return(rep.min())
        except Exception:
            return(0)

    print("adding representative status...")
    db_link = cdb.db_api("cancer_db")
    # Add representative status based on the input registries
    rep_status = db_link.get_table(
        "registry")[['registry_index', 'representative_of_location_id']]
    rep_df = pd.DataFrame(
        {'registry_index': df['registry_index'].unique().tolist()})
    get_repness = partial(_avg_repness, rep_table=rep_status)
    rep_df.loc[:, 'representative'] = rep_df['registry_index'].apply(
        get_repness)
    output = df.merge(rep_df, on='registry_index')
    output = update_repness(output)
    assert len(output) == len(
        df), "add_representativeness is adding or deleting data"
    return(output)
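The partial-then-apply pattern above, in isolation: bind the lookup table once with functools.partial, then apply over registry tuples (toy data):

from functools import partial
import pandas as pd

rep_status = pd.DataFrame({'registry_index': ['r1', 'r2'],
                           'representative_of_location_id': [1, 0]})

def _avg_repness(regs, rep_table):
    rep = rep_table.loc[rep_table['registry_index'].isin(regs),
                        'representative_of_location_id']
    return 0 if len(rep) == 0 else rep.min()

get_repness = partial(_avg_repness, rep_table=rep_status)
print(get_repness(('r1', 'r2')))  # 0: not every registry is representative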
Example #23
def execute_upload(df, table, db):
    '''Uploads the dataframe to the CoD database. Loads the columns of the
        target table and re-orders the dataframe columns to the appropriate
        order before writing and uploading.
    '''
    print('executing upload...')
    # order columns same as the table
    cod_cols = load_db_cols(table, db)
    cod_cols = list(cod_cols[1:])
    update_cols = get_update_settings()
    for col in update_cols:
        cod_cols.remove(col)

    # ASSERT columns to upload exist before subsetting
    assert set(cod_cols).difference(set(df.columns)) == set(), \
        "Missing columns in dataset: \n{}".format(
            set(cod_cols).difference(df.columns))
    upload_df = df[cod_cols]
    
    # upload dataframe 
    write_table_to_csv(upload_df, table)
    upload_to_db(table, db)
Example #24
    def _check_all_floors_exist(self, nzf_df):
        ''' Checks that all expected cancers, ages, and years are present and
            have nonzero floor values
        '''
        def _remove_ages_less_than(a, b):
            ''' Removes ages below `b` from list `a`, keeping the under-5
                age_group_ids (2, 3, 4) when b is 5
            '''
            orig_list = a.copy()
            for val in orig_list:
                if b == 5 and val in [2, 3, 4]:
                    continue
                if val < b:
                    a.remove(val)
            return a

        print("CHECKING FOR ALL CAUSES, AGES, and YEARS...")
        # create cause_list
        db_link = cdb.db_api(db_connection_name='cancer_db')
        gbd_id = utils.get_gbd_parameter('current_gbd_round')
        registry_entity = db_link.get_table('registry_input_entity')
        registry_entity = registry_entity.loc[
            registry_entity['gbd_round_id'].eq(gbd_id)
            & registry_entity['is_active'].eq(1), ]
        cancer_metadata = registry_entity[[
            'acause', 'cause_id', 'yll_age_start', 'yll_age_end'
        ]]
        causes_checklist = registry_entity['acause'].unique().tolist()

        # exceptions for nonzero floors
        for exc in ['neo_nmsc_bcc', 'neo_ben_intest', 'neo_ben_utr',
                    'neo_ben_other', 'neo_ben_brain', '_gc']:
            causes_checklist.remove(exc)

        # create year_list
        year_start = utils.get_gbd_parameter('min_year_cod')
        year_end = utils.get_gbd_parameter('max_year')  # + 1 for GBD2020
        year_checklist = list(range(year_start, year_end))

        # sex & age_id checklist
        age_id_checklist = [
            5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 31,
            32, 235, 2, 3, 4
        ]  #age_ids for 0-95 ages
        sex_checklist = [1, 2]

        # print any causes/years/sexes that are expected and missing
        for cancer in causes_checklist:
            print('working on...{}'.format(cancer))
            subset = nzf_df.loc[nzf_df['acause'].eq(cancer), ]
            age_start = int(
                cancer_metadata.loc[cancer_metadata['acause'].eq(cancer),
                                    'yll_age_start'].iloc[0])
            # conversion from age to GBD age_group_id (five-year bins)
            age_start = (age_start // 5) + 5
            if len(subset) == 0:
                print('MISSING CAUSE... {} '.format(cancer))
            missing_ages = set(age_id_checklist) - set(
                subset['age_group_id'].unique().tolist())
            missing_ages = list(missing_ages)
            missing_ages = _remove_ages_less_than(missing_ages, age_start)
            if len(missing_ages) > 0:
                print('missing the following ages for {}: {}'.format(
                    cancer, missing_ages))
            missing_sexes = set(sex_checklist) - set(
                subset['sex_id'].unique().tolist())
            if len(missing_sexes) > 0:
                print('missing the following sexes for {}: {}'.format(
                    cancer, missing_sexes))
            missing_years = set(year_checklist) - set(
                subset['year_id'].unique().tolist())
            if len(missing_years) > 0:
                print('missing the following years for {}: {}'.format(
                    cancer, missing_years))
        return
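The age-to-age_group_id conversion used above, in isolation; it assumes the standard GBD five-year groups, which hold for ages 5-79 (ids 6-20), while the under-5 and 80+ ids are handled separately:

def age_to_age_group_id(age_start):
    # five-year bins: e.g. yll_age_start 15 -> age_group_id 8 (ages 15-19)
    return (age_start // 5) + 5

assert age_to_age_group_id(15) == 8   # ages 15-19
assert age_to_age_group_id(75) == 20  # ages 75-79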