Esempio n. 1
0
def get_units(cols_input):
    """Return the unit label for every variable name in ``cols_input``.

    The Castor study named in ``../user_settings.ini`` is queried for its
    option groups and its structure. For each variable that appears in the
    numeric-variable mapping returned by ``get_unit_lookup_dict()``, the unit
    option whose conversion factor equals 1.0 is looked up and its
    'Option Name' is used as the label.

    Args:
        cols_input: List-like of variable names. It is only read; the result
            is built in a separate Series so the input is never overwritten.

    Returns:
        list: One entry per element of ``cols_input``: the option name of
        the unit with conversion factor 1.0, or ``''`` when the variable is
        not a known numeric variable or none of its units has factor 1.0.

    Raises:
        KeyError: If the settings file lacks the ``CastorCredentials``
            section or one of its keys (``config.read`` silently skips a
            missing file).
        IndexError: If the unit field is absent from the study structure or
            the option group has no option with the matching value.
    """
    # connect to castor api to fetch information on variable lists
    config = configparser.ConfigParser()
    config.read('../user_settings.ini')  # create this once and never upload
    credentials = config['CastorCredentials']

    castor = ca.CastorApi(credentials['local_private_path'])
    castor.select_study_by_name(credentials['study_name'])
    optiongroups = castor.request_study_export_optiongroups()
    studystruct = castor.request_study_export_structure()

    cols = pd.Series(cols_input)
    # Build the result from scratch instead of blanking a second
    # ``pd.Series(cols_input)``: that Series can share its buffer with the
    # input (and with ``cols``), so blanking it would also wipe the names
    # that are about to be looked up.
    units = pd.Series('', index=cols.index, dtype=object)

    lookup_dict, numeric_vars = get_unit_lookup_dict()
    for variable in cols.to_list():
        if variable not in numeric_vars:
            continue
        unit_field = numeric_vars[variable]

        # the one with 1.0 as conversion factor is used.
        base_codes = [
            code for code, factor in lookup_dict[unit_field].items()
            if factor == 1.0
        ]
        if not base_codes:
            continue

        # These lookups depend only on the unit field, so do them once per
        # variable rather than once per matching conversion factor.
        option_group_id = studystruct.loc[
            studystruct['Field Variable Name'] == unit_field,
            'Field Option Group']
        options = optiongroups.loc[
            optiongroups['Option Group Id'] == option_group_id.values[0],
            ['Option Name', 'Option Value']]
        option_codes = options['Option Value'].values.astype(int)

        for code in base_codes:
            unit = options['Option Name'][option_codes == code]
            units[cols == variable] = unit.values[0]
    return units.to_list()
Esempio n. 2
0
def transform_numeric_features(data, data_struct):
    """Convert every numeric variable to one common unit.

    Calculates all variables to the same unit, according to a handmade
    mapping in unit_lookup.py. For each numeric column present in ``data``
    the accompanying unit column (an integer option code) is replaced by its
    conversion factor and the measurement is multiplied by that factor.
    Afterwards all unit columns are dropped.

    Args:
        data: DataFrame holding the measurements and their unit columns.
            It is not modified; the conversion is done on a copy.
        data_struct: Study structure. It is not used here and is returned
            unchanged.

    Returns:
        tuple: ``(data, data_struct)`` where ``data`` is the converted copy
        without the unit columns.
    """
    # Work on a copy: the unit and value columns are overwritten below, and
    # the caller's frame must not be left half-converted.
    data = data.copy()

    unit_dict, var_numeric = get_unit_lookup_dict()
    numeric_columns = is_in_columns(var_numeric.keys(), data)

    # Unit columns whose factor -999 is replaced by (WBC column / 100).
    # NOTE(review): presumably these values were entered as a percentage of
    # the white blood cell count - confirm against unit_lookup.py.
    wbc_column_for_unit = {
        'units_lymph': 'WBC_2_1',
        'units_neutro': 'WBC_2_1',
        'lymph_units_1': 'WBC_2',
        'neutro_units_2': 'WBC_2',
    }

    for col in numeric_columns:
        unit_col = var_numeric[col]
        factors = unit_dict[unit_col]

        # Missing unit codes become -1; codes without an entry in the
        # lookup come out as None.
        data[unit_col] = data[unit_col] \
            .fillna(-1) \
            .astype(int) \
            .apply(lambda code: factors.get(code))

        if unit_col in wbc_column_for_unit:
            wbc_col = wbc_column_for_unit[unit_col]
            has_999 = data[unit_col] == -999
            data.loc[has_999, unit_col] = data.loc[has_999, wbc_col] \
                .astype(float).div(100)

        has_value = data[col].notna()
        data.loc[has_value, col] = data.loc[has_value, col].astype(float) \
            * data.loc[has_value, unit_col].astype(float)

    data = data.drop(is_in_columns(unit_dict.keys(), data), axis=1)

    return data, data_struct
def transform_numeric_features(data, data_struct):
    """Convert numeric variables to one unit and split PaO2 by sample type.

    Two steps are performed on a copy of ``data``:

    1. Calculates all variables to the same unit, according to a handmade
       mapping in unit_lookup.py: each unit column (an integer option code)
       is replaced by its conversion factor, the measurement is multiplied
       by that factor, and the unit columns are dropped.
    2. ``PaO2`` and ``PaO2_1`` are each replaced by one column per named
       option of ``PaO2_sample_type`` / ``PaO2_sample_type_1``. A row holds
       the measured value in the column of its own sample type and 0 in the
       others; rows without a PaO2 value are missing in all of them.

    Args:
        data: DataFrame with the measurements, their unit columns and the
            PaO2 sample-type columns. It is not modified.
        data_struct: Study structure with (at least) the columns
            'Field Variable Name', 'Option Name' and 'Option Value'. It is
            only read and returned unchanged.

    Returns:
        tuple: ``(data, data_struct)`` where ``data`` is the transformed
        copy.

    Raises:
        IndexError: If a PaO2 sample-type field is absent from
            ``data_struct``.
    """
    data = data.copy()

    unit_dict, var_numeric = get_unit_lookup_dict()
    numeric_columns = is_in_columns(var_numeric.keys(), data)

    # Unit columns whose factor -999 is replaced by (WBC column / 100).
    # NOTE(review): presumably these values were entered as a percentage of
    # the white blood cell count - confirm against unit_lookup.py.
    wbc_column_for_unit = {
        'units_lymph': 'WBC_2_1',
        'units_neutro': 'WBC_2_1',
        'lymph_units_1': 'WBC_2',
        'neutro_units_2': 'WBC_2',
    }

    for col in numeric_columns:
        unit_col = var_numeric[col]
        factors = unit_dict[unit_col]

        # Missing unit codes become -1; codes without an entry in the
        # lookup come out as None.
        data[unit_col] = data[unit_col] \
            .fillna(-1) \
            .astype(int) \
            .apply(lambda code: factors.get(code))

        if unit_col in wbc_column_for_unit:
            wbc_col = wbc_column_for_unit[unit_col]
            has_999 = data[unit_col] == -999
            data.loc[has_999, unit_col] = data.loc[has_999, wbc_col] \
                .astype(float).div(100)

        has_value = data[col].notna()
        data.loc[has_value, col] = data.loc[has_value, col].astype(float) \
            * data.loc[has_value, unit_col].astype(float)

    data = data.drop(is_in_columns(unit_dict.keys(), data), axis=1)

    # PaO2 and PaO2_1 are measured in several ways, as described in
    # PaO2_sample_type and PaO2_sample_type_1; give every named sample type
    # its own column.
    for suffix in ['', '_1']:
        value_col = 'PaO2' + suffix
        type_col = 'PaO2_sample_type' + suffix

        options = data_struct.loc[
            data_struct['Field Variable Name'] == type_col,
            ['Option Name', 'Option Value']].iloc[0, :]
        col_dict = dict(zip(options['Option Value'], options['Option Name']))

        # Non-strings are missing values. isinstance (rather than an exact
        # type comparison) also accepts str subclasses such as numpy.str_.
        colnames = [
            value_col + '_' + name
            for name in col_dict.values() if isinstance(name, str)
        ]
        # Start from float zeros: the columns receive measured values and
        # missing markers below, which an integer column cannot hold
        # without being upcast.
        df = pd.DataFrame(0.0, columns=colnames, index=data.index)

        for value, name in col_dict.items():
            if str(name) == 'nan':
                # occurs only once, so better drop it
                continue
            colname = value_col + '_' + str(name)
            is_measure_type = data[type_col] == value
            df.loc[is_measure_type, colname] = data.loc[is_measure_type,
                                                        value_col]
        # No measurement at all: missing for every sample type, not 0.
        df.loc[data[value_col].isna(), :] = None

        data = data.drop(columns=[value_col, type_col])
        data = pd.concat([data, df], axis=1)
    return data, data_struct
Esempio n. 4
0
def _recode_answers(answers, fill_code, mapping):
    """Translate integer answer codes through ``mapping``.

    Missing answers are first replaced by ``fill_code``; codes that have no
    entry in ``mapping`` come out as None. ``answers`` may be a Series or a
    DataFrame; the result has the same shape.
    """
    def lookup(code):
        return mapping.get(code)

    codes = answers.fillna(fill_code).astype(int)
    if isinstance(codes, pd.DataFrame):
        # Element-wise per column; avoids the deprecated DataFrame.applymap.
        return codes.apply(lambda column: column.map(lookup))
    return codes.map(lookup)


def _fields_offering(data_struct, data, predicate):
    """Return the columns of ``data`` whose option list satisfies ``predicate``.

    Only fields whose 'Option Name' entry is a list are considered.
    """
    offered = data_struct['Option Name'].apply(
        lambda names: isinstance(names, list) and predicate(names))
    return is_in_columns(
        data_struct.loc[offered.astype(bool),
                        'Field Variable Name'].to_list(), data)


def transform_binary_features(data, data_struct):
    """Recode radio-button answers to numeric values.

    Fields are grouped by the option names listed in ``data_struct`` and
    recoded in ``data``:

    * fields offering both "Yes" and "No" -> 0 / 1 / missing,
    * fields offering "Yes - Probable" -> 0 / .5 / 1 / missing,
    * fields offering "Yes" and "Unknown" -> 0 / 1 / missing,
    * five fields with a hand-written mapping ('Bacteria', 'Smoking',
      'CT_thorax_performed', 'facility_transfer', 'culture').

    The hand-written mappings are keyed on the raw option codes, so they are
    applied to the answers as they were on entry and their result replaces
    whatever the generic passes produced for those columns.

    In ``data_struct`` the 'Field Type' of 'MH_HF' is set to 'radio', radio
    fields that are unit selectors are relabelled 'unit', and every radio
    field not handled above is relabelled 'category_1'.

    Args:
        data: DataFrame with one column per field. Modified in place.
        data_struct: Study structure with the columns 'Field Variable Name',
            'Field Type' and 'Option Name'. Modified in place.

    Returns:
        tuple: ``(data, data_struct)``, the same objects that were passed in.

    Raises:
        KeyError: If one of the five hand-coded columns is missing from
            ``data``.
    """
    value_na = None
    dict_yes_no = {0: 0, 1: 1, 2: 0, 3: value_na, 9: value_na, 9999: value_na}
    # 1 -> 1, 2 -> .5, 3 -> 0, 4 -> missing
    dict_yp = {0: 0, 1: 1, 2: .5, 3: 0, 4: value_na}
    dict_yu = {0: 0, 1: 1, 9999: value_na}
    # [Yes, no, stopped_smoking] --> [1, 0, .5]
    dict_smoke = {0: 0, 1: 1, 2: 0, 3: .5, 4: value_na}

    # (column, code used for missing answers, raw code -> value)
    hand_coded = [
        ('Bacteria', 3, dict_yes_no),
        ('Smoking', 4, dict_smoke),
        ('CT_thorax_performed', 3, {0: 0, 1: 0, 2: 1, 3: 0}),
        ('facility_transfer', 3, dict_yes_no),
        ('culture', 1, {0: 0, 1: 0, 2: 1, 3: 2}),
    ]
    other_radio_vars = [name for name, _, _ in hand_coded]
    # Keep the raw codes: the generic passes below may overwrite these
    # columns, and the hand-written maps must not be applied to values that
    # were already recoded.
    raw_hand_coded = data.loc[:, other_radio_vars].copy()

    # Some fixes for erroneous field types
    data_struct.loc[data_struct['Field Variable Name'] == 'MH_HF',
                    'Field Type'] = 'radio'

    radio_fields = data_struct.loc[data_struct['Field Type'] == 'radio',
                                   'Field Variable Name'].to_list()

    # Find all answers with Yes No and re-value them
    vars_yes_no = _fields_offering(
        data_struct, data,
        lambda names: "Yes" in names and "No" in names)
    if len(vars_yes_no) > 0:
        data.loc[:, vars_yes_no] = _recode_answers(
            data.loc[:, vars_yes_no], 3, dict_yes_no)

    # Find all answers with Yes probable
    vars_yes_probable = _fields_offering(
        data_struct, data,
        lambda names: "YES - Probable" in names or "Yes - Probable" in names)
    if len(vars_yes_probable) > 0:
        data.loc[:, vars_yes_probable] = _recode_answers(
            data.loc[:, vars_yes_probable], 4, dict_yp)

    # NOTE in current implementation all unknowns are caught by is_yes_no
    # Find all answers with Unknown (cardiac variables)
    vars_yes_unknown = _fields_offering(
        data_struct, data,
        lambda names: ("Unknown" in names or "unknown" in names)
        and "Yes" in names)
    if len(vars_yes_unknown) > 0:
        data.loc[:, vars_yes_unknown] = _recode_answers(
            data.loc[:, vars_yes_unknown], 9999, dict_yu)

    # Hand code some other variables. The results are written back to
    # ``data``; computing them without assigning would leave the columns
    # untouched.
    for name, fill_code, mapping in hand_coded:
        data[name] = _recode_answers(raw_hand_coded[name], fill_code, mapping)

    # Radio fields that merely select a unit get their own field type.
    unit_dict, _ = get_unit_lookup_dict()
    is_unit_field = (data_struct['Field Type'] == 'radio') \
        & data_struct['Field Variable Name'].isin(unit_dict.keys())
    vars_units = data_struct.loc[is_unit_field,
                                 'Field Variable Name'].to_list()
    data_struct.loc[data_struct['Field Variable Name'].isin(vars_units),
                    'Field Type'] = 'unit'

    # All other variables
    handled_vars = set(vars_yes_no + vars_yes_probable + other_radio_vars
                       + vars_yes_unknown + vars_units)
    vars_other = is_in_columns(
        [v for v in radio_fields if v not in handled_vars], data)
    data_struct.loc[data_struct['Field Variable Name'].isin(vars_other),
                    'Field Type'] = 'category_1'

    return data, data_struct