import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from mlflow import log_artifact  # assumed: artifacts are tracked with MLflow

import data_utils


def risk_cases_encoder(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)

    # Category values that flag a potentially life-threatening case.
    mot_esp_cases = [
        'referencia', 'contra_referencia', 'urgencias',
        'entrega de medicamentos', 'citas de consulta medica especializada',
        'procedimientos y/o servicios', 'enfermedades raras o hu',
    ]
    afec_edad_cases = ['de 6 a 12 años', 'de 0 a 5 años', 'de 13 a 17 años', 'mayor de 63 años']
    cie10_cases = ['vih', 'tumores malignos', 'maternas', 'trasplantados']

    dataset['CASO_RIESGO'] = dataset['MOTIVO_ESPECIFICO'].apply(lambda value: contains(value, mot_esp_cases))
    dataset['POBESPECIAL'] = dataset['AFEC_POBESPECIAL'].apply(lambda value: value != 'no aplica')
    dataset['EDAD_RIESGO'] = dataset['AFEC_EDADR'].apply(lambda value: contains(value, afec_edad_cases))
    dataset['CIE10_RIESGO'] = dataset['CIE_10'].apply(lambda value: contains(value, cie10_cases))

    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded_features = data_utils.encode_features(features, labels)
    encoded_features['RIESGO_VIDA'] = labels

    encoded_features.to_csv("datasets/experiments/risk_cases_encoder.csv", index=False)
    log_artifact("datasets/experiments/risk_cases_encoder.csv")

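# `contains` is referenced above but not defined in this module; a minimal
# sketch of the assumed helper: True when any of the given substrings occurs
# in the value.
def contains(value, cases):
    return any(case in str(value) for case in cases)
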
def clean_dataset(path):
    dataset = data_utils.get_dataset()
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset.to_csv(path, index=False)

def missing_state_and_imputing(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)

    # '0' marks a missing value in this dataset; impute it with the most
    # frequent value of each affected column.
    imp = SimpleImputer(missing_values='0', strategy="most_frequent")
    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])

    # Keep an explicit missing-state flag for every imputed feature.
    for feature in zero_values:
        dataset[f'{feature}_is_missing'] = dataset[feature].apply(lambda value: 0 if value == '0' else 1)

    dataset[zero_values] = imp.fit_transform(dataset[zero_values])

    # Label-encode the remaining non-numeric features.
    non_numeric_features = list(dataset.dtypes[dataset.dtypes == 'object'].index)
    dataset[non_numeric_features] = dataset[non_numeric_features].applymap(str)
    dataset[non_numeric_features] = label_encode(dataset[non_numeric_features])

    dataset.to_csv("datasets/experiments/missing_state_and_imputing.csv", index=False)
    log_artifact("datasets/experiments/missing_state_and_imputing.csv")

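# `label_encode` is also assumed rather than defined in this file; a minimal
# sketch that encodes each column independently with sklearn's LabelEncoder.
from sklearn.preprocessing import LabelEncoder


def label_encode(dataframe):
    return dataframe.apply(lambda col: LabelEncoder().fit_transform(col.astype(str)))
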
def basic(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)
    dataset = label_encode(dataset)
    dataset.to_csv("datasets/experiments/basic.csv", index=False)
    log_artifact("datasets/experiments/basic.csv")

def target_encoder(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded_features = data_utils.encode_features(features, labels)
    encoded_features['RIESGO_VIDA'] = labels

    encoded_features.to_csv("datasets/experiments/target_encoder.csv", index=False)
    # Log the artifact like the other experiments do.
    log_artifact("datasets/experiments/target_encoder.csv")

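# `data_utils.encode_features` is defined elsewhere; since it receives both
# features and labels, it is presumably a supervised (target) encoder. A
# hypothetical equivalent, an assumption rather than the project's actual
# implementation:
import category_encoders as ce


def encode_features_sketch(features, labels):
    # Replace each category with a blend of the per-category target mean
    # and the global target mean.
    return ce.TargetEncoder().fit_transform(features, labels)
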
def imputing(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)

    # '0' marks a missing value (stored as a string, not NaN); replace it
    # with the most frequent non-missing value of each affected column.
    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])
    dataset[zero_values] = dataset[zero_values].apply(
        lambda col: col.mask(col.eq('0'), col[col.ne('0')].mode()[0])
    )

    dataset = dataset.applymap(str)
    dataset = label_encode(dataset)
    dataset.to_csv("datasets/experiments/imputing.csv", index=False)
    log_artifact("datasets/experiments/imputing.csv")

def missing_state(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)

    # Add a binary missing-state flag for every feature that uses '0' as its
    # missing-value marker, without imputing the values themselves.
    zero_values = set(dataset.columns[dataset.eq('0').mean() > 0])
    for feature in zero_values:
        dataset[f'{feature}_is_missing'] = dataset[feature].apply(lambda f: 1 if f == '0' else 0)

    features_columns = [column for column in dataset.columns if '_is_missing' not in column]
    dataset[features_columns] = label_encode(dataset[features_columns])

    dataset.to_csv("datasets/experiments/missing_state.csv", index=False)
    log_artifact("datasets/experiments/missing_state.csv")

def target_encoder_only_complains(dataset):
    # Keep only actual complaints: drop information requests and queries.
    dataset = dataset[
        (dataset['PQR_TIPOPETICION'] != 'peticion de informacion')
        & (dataset['PQR_TIPOPETICION'] != 'consulta y/o solicitud de informacion')
    ]
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded_features = data_utils.encode_features(features, labels)
    encoded_features['RIESGO_VIDA'] = labels

    encoded_features.to_csv("datasets/experiments/target_encoder_only_complains.csv", index=False)
    log_artifact("datasets/experiments/target_encoder_only_complains.csv")

def normalizing(dataset):
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index(drop=True)
    dataset = label_encode(dataset)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)

    # log1p to compress the range, then scale every feature to [0, 1].
    features[features.columns] = features[features.columns].apply(lambda x: np.log(x + 1))
    scaler = MinMaxScaler()
    features[features.columns] = scaler.fit_transform(features[features.columns])

    dataset = features
    dataset['RIESGO_VIDA'] = labels.values
    dataset.to_csv("datasets/experiments/normalizing.csv", index=False)
    log_artifact("datasets/experiments/normalizing.csv")

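# A hypothetical driver showing how these experiment builders might be run
# together; `data_utils.get_dataset()` is the loader already used by
# clean_dataset above. Each experiment gets its own copy, since several of
# them add columns to the frame in place.
if __name__ == '__main__':
    raw = data_utils.get_dataset()
    for experiment in (basic, imputing, missing_state,
                       missing_state_and_imputing, normalizing,
                       risk_cases_encoder, target_encoder,
                       target_encoder_only_complains):
        experiment(raw.copy())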