def get_units(cols_input):
    """Return the reference unit name for every requested column.

    Reads the Castor settings from '../user_settings.ini', downloads the
    study's option groups and structure through the Castor API and, for
    each column that is listed in the numeric-variable mapping returned by
    get_unit_lookup_dict(), looks up the option name of the unit whose
    conversion factor is 1.0.

    Args:
        cols_input: column names to look up (anything pd.Series accepts).

    Returns:
        A list with one entry per input column: the unit name, or '' when
        the column is not a known numeric variable or none of its units
        has a conversion factor of 1.0.
    """
    # Connect to the Castor API to fetch information on the variable lists.
    settings = configparser.ConfigParser()
    settings.read('../user_settings.ini')  # create this once and never upload
    credentials = settings['CastorCredentials']
    api = ca.CastorApi(credentials['local_private_path'])
    api.select_study_by_name(credentials['study_name'])
    optiongroups = api.request_study_export_optiongroups()
    studystruct = api.request_study_export_structure()

    cols = pd.Series(cols_input)
    units = pd.Series(cols_input)
    units[:] = ''

    lookup_dict, numeric_vars = get_unit_lookup_dict()
    for variable in cols.to_list():
        if variable not in numeric_vars:
            continue
        unit_var = numeric_vars[variable]
        for ind, conversion in lookup_dict[unit_var].items():
            # Only the unit with 1.0 as conversion factor is used.
            if conversion != 1.0:
                continue
            is_unit_field = studystruct['Field Variable Name'] == unit_var
            group_ids = studystruct['Field Option Group'][is_unit_field]
            name_and_value = optiongroups[['Option Name', 'Option Value']]
            in_group = optiongroups['Option Group Id'] == group_ids.values[0]
            options = name_and_value[in_group]
            option_names = options['Option Name']
            is_this_unit = options['Option Value'].values.astype(int) == ind
            unit = option_names[is_this_unit]
            units[cols == variable] = unit.values[0]
    return units.to_list()
def transform_numeric_features(data, data_struct):
    """Convert all numeric variables to the same unit.

    Every numeric column is multiplied by the conversion factor that
    belongs to the unit selected in its companion unit column, according
    to the handmade mapping returned by get_unit_lookup_dict()
    (unit_lookup.py). The unit columns are dropped afterwards.

    NOTE(review): this file appears to contain a second, later definition
    of transform_numeric_features; if both live at module level the later
    one wins - confirm which one is intended.
    NOTE(review): if two numeric columns shared one unit column, the second
    pass would re-map already converted factors - confirm the mapping is
    one-to-one.

    Args:
        data: DataFrame holding the numeric columns and their unit columns.
            It is not modified; a copy is converted and returned.
        data_struct: study structure; passed through unchanged.

    Returns:
        Tuple (data, data_struct) with the converted copy of data.
    """
    # Work on a copy: the conversion writes into unit and value columns
    # before the frame is rebound by drop(), which would otherwise leave the
    # caller's DataFrame half-converted.
    data = data.copy()

    unit_dict, var_numeric = get_unit_lookup_dict()
    numeric_columns = is_in_columns(var_numeric.keys(), data)

    wbc_value_study = 'WBC_2_1'  # = 'units_lymph', 'units_neutro'
    wbc_value_report = 'WBC_2'  # = 'lymph_units_1', 'neutro_units_2'

    for col in numeric_columns:
        unit_col = var_numeric[col]
        # Replace the selected unit option by its conversion factor; a
        # missing selection is looked up as option -1.
        data[unit_col] = data[unit_col] \
            .fillna(-1) \
            .astype(int) \
            .apply(lambda x: unit_dict[unit_col].get(x))

        # A factor of -999 is a sentinel: the factor becomes the matching
        # WBC value divided by 100 (presumably the unit is a percentage of
        # the white blood cell count - confirm against unit_lookup.py).
        if unit_col in ['units_lymph', 'units_neutro']:
            has_999 = data[unit_col] == -999
            data.loc[has_999, unit_col] = data.loc[has_999, wbc_value_study] \
                .astype(float).div(100)
        elif unit_col in ['lymph_units_1', 'neutro_units_2']:
            has_999 = data[unit_col] == -999
            data.loc[has_999, unit_col] = data.loc[has_999, wbc_value_report] \
                .astype(float).div(100)

        # Only rows that actually hold a measurement are converted.
        has_value = data[col].notna()
        data.loc[has_value, col] = data.loc[has_value, col].astype(float) \
            * data.loc[has_value, unit_col].astype(float)

    data = data.drop(is_in_columns(unit_dict.keys(), data), axis=1)
    return data, data_struct
def _pao2_columns_by_sample_type(data, data_struct, suffix):
    """Build one PaO2 column per sample type for the 'PaO2' + suffix variable."""
    value_col = 'PaO2' + suffix
    type_col = 'PaO2_sample_type' + suffix

    options = data_struct.loc[
        data_struct['Field Variable Name'] == type_col,
        ['Option Name', 'Option Value']].iloc[0, :]
    col_dict = dict(zip(options['Option Value'], options['Option Name']))

    # Non-string option names are missing values. The same filter is used
    # for the column list and for the fill loop, so such a name can never
    # add an unplanned column through .loc enlargement.
    named_types = {value: name for value, name in col_dict.items()
                   if isinstance(name, str)}
    colnames = [value_col + '_' + name for name in named_types.values()]

    # Float zeros: the columns receive float measurements and missing
    # values, which an integer column could only take by silent upcasting.
    df = pd.DataFrame(0.0, columns=colnames, index=data.index)
    for value, name in named_types.items():
        if name == 'nan':
            # occurs only once, so better drop it
            continue
        is_measure_type = data[type_col] == value
        df.loc[is_measure_type, value_col + '_' + name] = \
            data.loc[is_measure_type, value_col]

    # Without a measurement none of the per-type columns is known.
    df.loc[data[value_col].isna(), :] = None
    return df


def transform_numeric_features(data, data_struct):
    """Convert numeric variables to one unit and split PaO2 by sample type.

    First, every numeric column is multiplied by the conversion factor
    that belongs to the unit selected in its companion unit column,
    according to the handmade mapping returned by get_unit_lookup_dict()
    (unit_lookup.py); the unit columns are dropped afterwards. Second,
    'PaO2' and 'PaO2_1' are each replaced by one column per sample type
    named in data_struct ('PaO2_sample_type' / 'PaO2_sample_type_1').

    NOTE(review): if two numeric columns shared one unit column, the second
    pass would re-map already converted factors - confirm the mapping is
    one-to-one.

    Args:
        data: DataFrame with the numeric columns, their unit columns and
            the PaO2 columns. It is not modified; a copy is transformed.
        data_struct: study structure with 'Field Variable Name',
            'Option Name' and 'Option Value' columns; passed through
            unchanged.

    Returns:
        Tuple (data, data_struct) with the transformed copy of data.

    Raises:
        IndexError: if data_struct has no row for a PaO2 sample type field.
    """
    data = data.copy()

    unit_dict, var_numeric = get_unit_lookup_dict()
    numeric_columns = is_in_columns(var_numeric.keys(), data)

    wbc_value_study = 'WBC_2_1'  # = 'units_lymph', 'units_neutro'
    wbc_value_report = 'WBC_2'  # = 'lymph_units_1', 'neutro_units_2'

    for col in numeric_columns:
        unit_col = var_numeric[col]
        # Replace the selected unit option by its conversion factor; a
        # missing selection is looked up as option -1.
        data[unit_col] = data[unit_col] \
            .fillna(-1) \
            .astype(int) \
            .apply(lambda x: unit_dict[unit_col].get(x))

        # A factor of -999 is a sentinel: the factor becomes the matching
        # WBC value divided by 100 (presumably the unit is a percentage of
        # the white blood cell count - confirm against unit_lookup.py).
        if unit_col in ['units_lymph', 'units_neutro']:
            has_999 = data[unit_col] == -999
            data.loc[has_999, unit_col] = data.loc[has_999, wbc_value_study] \
                .astype(float).div(100)
        elif unit_col in ['lymph_units_1', 'neutro_units_2']:
            has_999 = data[unit_col] == -999
            data.loc[has_999, unit_col] = data.loc[has_999, wbc_value_report] \
                .astype(float).div(100)

        # Only rows that actually hold a measurement are converted.
        has_value = data[col].notna()
        data.loc[has_value, col] = data.loc[has_value, col].astype(float) \
            * data.loc[has_value, unit_col].astype(float)

    data = data.drop(is_in_columns(unit_dict.keys(), data), axis=1)

    # PaO2 and PaO2_1 are measured in several ways, as described in
    # PaO2_sample_type and PaO2_sample_type_1: replace each of them by one
    # column per sample type.
    for p in ['', '_1']:
        df = _pao2_columns_by_sample_type(data, data_struct, p)
        data = data.drop(columns=['PaO2' + p, 'PaO2_sample_type' + p])
        data = pd.concat([data, df], axis=1)

    return data, data_struct
def transform_binary_features(data, data_struct):
    """Recode radio-button answers to numeric values and label field types.

    Variables are selected by the option names listed in data_struct:
      * options containing "Yes" and "No": recoded with dict_yes_no;
      * options containing "YES - Probable" / "Yes - Probable": dict_yp;
      * options containing "Yes" and "Unknown"/"unknown": dict_yu;
      * a few hand-coded variables (see other_radio_vars).
    Missing answers are first filled with a code that maps to None (or to
    0 for 'CT_thorax_performed' and 'culture'); codes absent from a
    mapping also become None.

    Afterwards radio fields that are unit selectors (keys of the unit
    lookup dict) get 'Field Type' 'unit', and all remaining radio fields
    present in data get 'Field Type' 'category_1'.

    Both data and data_struct are modified in place and also returned.

    Args:
        data: DataFrame with one column per variable.
        data_struct: study structure with 'Field Variable Name',
            'Field Type' and 'Option Name' columns.

    Returns:
        Tuple (data, data_struct).
    """
    value_na = None
    dict_yes_no = {0: 0, 1: 1, 2: 0, 3: value_na, 9: value_na, 9999: value_na}
    dict_yp = {
        0: 0,
        1: 1,
        2: .5,
        3: 0,
        4: value_na
    }  # [1, 2, 3, 4 ] --> [1, .5, 0, missing]
    dict_yu = {0: 0, 1: 1, 9999: value_na}
    dict_smoke = {
        0: 0,
        1: 1,
        2: 0,
        3: .5,
        4: value_na
    }  # [Yes, no, stopped_smoking] --> [1, 0, .5]

    # Some fixes for erroneous field types
    data_struct.loc[data_struct['Field Variable Name'] == 'MH_HF',
                    'Field Type'] = 'radio'

    radio_fields = data_struct.loc[data_struct['Field Type'] == 'radio',
                                   'Field Variable Name'].to_list()

    # Find all answers with Yes No and re-value them
    def if_yes_no(x):
        return 1 if isinstance(x, list) and ("Yes" in x and "No" in x) else 0

    is_yes_no = data_struct['Option Name'].apply(if_yes_no) == 1
    vars_yes_no = is_in_columns(
        data_struct.loc[is_yes_no, 'Field Variable Name'].to_list(), data)
    data.loc[:, vars_yes_no] = data.loc[:, vars_yes_no].fillna(3).astype(
        int).applymap(lambda x: dict_yes_no.get(x))

    # Find all answers with Yes probable
    def if_yes_probable(x):
        return 1 if isinstance(x, list) and (
            "YES - Probable" in x or "Yes - Probable" in x) else 0

    is_yes_probable = data_struct['Option Name'].apply(if_yes_probable) == 1
    vars_yes_probable = is_in_columns(
        data_struct.loc[is_yes_probable, 'Field Variable Name'].to_list(),
        data)
    data.loc[:, vars_yes_probable] = data.loc[:, vars_yes_probable].fillna(
        4).astype(int).applymap(lambda x: dict_yp.get(x))

    # NOTE in current implementation all unknowns are caught by is_yes_no
    # Find all answers with Unknown (cardiac variables)
    def if_unknown(x):
        return 1 if isinstance(x, list) \
            and (("Unknown" in x or "unknown" in x) and ("Yes" in x)) \
            else 0

    has_unknown = data_struct['Option Name'].apply(if_unknown) == 1
    vars_yes_unknown = is_in_columns(
        data_struct.loc[has_unknown, 'Field Variable Name'].to_list(), data)
    data.loc[:, vars_yes_unknown] = data.loc[:, vars_yes_unknown].fillna(
        9999).astype(int).applymap(lambda x: dict_yu.get(x))

    # Hand code some other variables: (fill value for missing, mapping)
    other_radio_vars = [
        'Bacteria', 'Smoking', 'CT_thorax_performed', 'facility_transfer',
        'culture'
    ]
    hand_coded = {
        'Bacteria': (3, dict_yes_no),
        'Smoking': (4, dict_smoke),
        'CT_thorax_performed': (3, {0: 0, 1: 0, 2: 1, 3: 0}),
        'facility_transfer': (3, dict_yes_no),
        'culture': (1, {0: 0, 1: 0, 2: 1, 3: 2}),
    }
    # A column recoded by one of the generic passes above no longer holds
    # raw option codes, so a raw-code mapping must not be applied to it.
    # NOTE(review): confirm against the study's option lists.
    already_recoded = set(vars_yes_no + vars_yes_probable + vars_yes_unknown)
    for var in other_radio_vars:
        if var not in data.columns or var in already_recoded:
            continue
        fill_value, mapping = hand_coded[var]
        # Series.apply returns a new Series; it has to be stored back,
        # otherwise the recoding has no effect.
        data[var] = data[var].fillna(fill_value) \
            .astype(int) \
            .apply(lambda x: mapping.get(x))

    # Radio fields that select a unit are labelled as such.
    unit_dict, _ = get_unit_lookup_dict()
    vars_units = data_struct.loc[
        (data_struct['Field Type'] == 'radio') &
        data_struct['Field Variable Name'].isin(unit_dict.keys()),
        'Field Variable Name'].to_list()
    data_struct.loc[data_struct.loc[:, 'Field Variable Name']
                    .isin(vars_units), 'Field Type'] = 'unit'

    # All other variables
    handled_vars = set(vars_yes_no + vars_yes_probable + other_radio_vars
                       + vars_yes_unknown + vars_units)
    vars_other = is_in_columns(
        [v for v in radio_fields if v not in handled_vars], data)
    data_struct.loc[data_struct['Field Variable Name'].isin(vars_other),
                    'Field Type'] = 'category_1'

    return data, data_struct