Beispiel #1
0
def clean_dataset(path):
    """Build the cleaned dataset and write it to ``path`` as CSV.

    The raw data comes from ``data_utils.get_dataset()`` and is passed, in
    order, through ``clean_afec_dpto``, ``clean_riesgo_vida``,
    ``clean_cie_10`` and ``remove_features`` from ``data_utils``.

    Args:
        path: Destination given to ``to_csv``; the index is not written.
    """
    pipeline = (
        data_utils.clean_afec_dpto,
        data_utils.clean_riesgo_vida,
        data_utils.clean_cie_10,
        data_utils.remove_features,
    )
    frame = data_utils.get_dataset()
    for step in pipeline:
        frame = step(frame)
    frame.to_csv(path, index=False)
def missing_state_and_imputing(dataset):
    """Flag '0' placeholders, impute them, label-encode and export the result.

    After the shared ``data_utils`` cleaning steps, every column that holds
    the string ``'0'`` at least once gets a companion ``<column>_is_missing``
    indicator (1 where the value was ``'0'``, 0 otherwise). The placeholder is
    then replaced with the most frequent value of its column, object-typed
    columns are stringified and label-encoded, and the frame is written to
    ``datasets/experiments/missing_state_and_imputing.csv`` and passed to
    ``log_artifact``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/missing_state_and_imputing.csv"

    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    # Per-column mask: only the columns that actually contain the '0'
    # placeholder are flagged and imputed.
    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])

    # The indicators must be derived before imputation overwrites the '0's.
    for feature in zero_values:
        dataset[f'{feature}_is_missing'] = dataset[feature].eq('0').astype(int)

    # Fitting the imputer on an empty column selection would fail, so skip it
    # when no column carries the placeholder.
    if zero_values:
        imp = SimpleImputer(missing_values='0', strategy="most_frequent")
        dataset[zero_values] = imp.fit_transform(dataset[zero_values])

    # Only object-typed columns are encoded; the integer indicator columns
    # are left as they are.
    non_numeric_features = list(dataset.columns[dataset.dtypes == 'object'])

    dataset[non_numeric_features] = dataset[non_numeric_features].applymap(str)
    dataset[non_numeric_features] = label_encode(dataset[non_numeric_features])
    dataset.to_csv(output_path, index=False)
    log_artifact(output_path)
def risk_cases_encoder(dataset):
    """Add risk-indicator columns, encode the features and export the result.

    Four columns are derived before ``data_utils.remove_features`` runs:

    * ``CASO_RIESGO``  - ``contains(MOTIVO_ESPECIFICO, <motive list>)``
    * ``POBESPECIAL``  - ``AFEC_POBESPECIAL`` differs from ``'no aplica'``
    * ``EDAD_RIESGO``  - ``contains(AFEC_EDADR, <age-range list>)``
    * ``CIE10_RIESGO`` - ``contains(CIE_10, <diagnosis list>)``

    The features are then encoded against ``RIESGO_VIDA`` with
    ``data_utils.encode_features`` and written, together with the label, to
    ``datasets/experiments/risk_cases_encoder.csv``; the file is passed to
    ``log_artifact``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/risk_cases_encoder.csv"

    risky_motives = [
        'referencia',
        'contra_referencia',
        'urgencias',
        'entrega de medicamentos',
        'citas de consulta medica especializada',
        'procedimientos y/o servicios',
        'enfermedades raras o hu',
    ]
    risky_age_ranges = ['de 6 a 12 años', 'de 0 a 5 años', 'de 13 a 17 años', 'mayor de 63 años']
    risky_diagnoses = ['vih', 'tumores malignos', 'maternas', 'trasplantados']

    frame = data_utils.clean_cie_10(
        data_utils.clean_riesgo_vida(data_utils.clean_afec_dpto(dataset))
    )

    # The indicators are built from columns that must still be present, so
    # this happens before remove_features is applied.
    frame['CASO_RIESGO'] = frame['MOTIVO_ESPECIFICO'].apply(
        lambda motive: contains(motive, risky_motives)
    )
    frame['POBESPECIAL'] = frame['AFEC_POBESPECIAL'].apply(
        lambda population: population != 'no aplica'
    )
    frame['EDAD_RIESGO'] = frame['AFEC_EDADR'].apply(
        lambda age_range: contains(age_range, risky_age_ranges)
    )
    frame['CIE10_RIESGO'] = frame['CIE_10'].apply(
        lambda diagnosis: contains(diagnosis, risky_diagnoses)
    )

    frame = data_utils.remove_features(frame)
    frame = frame.reset_index().drop(columns=['index'])

    target = frame[['RIESGO_VIDA']]
    predictors = frame.drop(columns=['RIESGO_VIDA'])

    encoded = data_utils.encode_features(predictors, target)
    encoded['RIESGO_VIDA'] = target

    encoded.to_csv(output_path, index=False)
    log_artifact(output_path)
def basic(dataset):
    """Export the baseline experiment: cleaned and label-encoded data.

    Runs the shared ``data_utils`` cleaning steps, renumbers the rows,
    label-encodes the whole frame and writes it to
    ``datasets/experiments/basic.csv``, which is then passed to
    ``log_artifact``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/basic.csv"

    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)

    # drop=True discards the old index instead of turning it into an 'index'
    # column; otherwise that column would be encoded and exported as a feature.
    dataset = dataset.reset_index(drop=True)

    dataset = label_encode(dataset)
    dataset.to_csv(output_path, index=False)
    log_artifact(output_path)
def target_encoder(dataset):
    """Export the dataset with features encoded against the target column.

    After the shared ``data_utils`` cleaning steps the frame is split into the
    ``RIESGO_VIDA`` label and the remaining features, the features are encoded
    with ``data_utils.encode_features(features, labels)``, the label is added
    back and the result is written to
    ``datasets/experiments/target_encoder.csv``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/target_encoder.csv"

    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)

    encoded_features = data_utils.encode_features(features, labels)

    encoded_features['RIESGO_VIDA'] = labels
    encoded_features.to_csv(output_path, index=False)
    # Every other experiment writer in this file hands its CSV to
    # log_artifact; do the same here so this output is not left untracked.
    log_artifact(output_path)
def imputing(dataset):
    """Replace '0' placeholders with the column mode, encode and export.

    After the shared ``data_utils`` cleaning steps, every column that holds
    the string ``'0'`` at least once is imputed: ``'0'`` entries (and NaN
    entries of those columns) are replaced with the most frequent of the
    remaining values. The frame is then stringified, label-encoded, written to
    ``datasets/experiments/imputing.csv`` and passed to ``log_artifact``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/imputing.csv"

    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])
    for feature in zero_values:
        column = dataset[feature]
        # fillna alone never touches the '0' strings, so mark them explicitly.
        missing = column.eq('0') | column.isna()
        # The mode is taken over observed values only, so the placeholder
        # cannot be chosen as its own replacement.
        observed_modes = column[~missing].mode()
        if observed_modes.empty:
            # Nothing observed in this column; leave it unchanged.
            continue
        dataset[feature] = column.mask(missing, observed_modes.iloc[0])

    dataset = dataset.applymap(str)
    dataset = label_encode(dataset)

    dataset.to_csv(output_path, index=False)
    log_artifact(output_path)
def missing_state(dataset):
    """Add missing-value indicator columns, label-encode and export.

    After the shared ``data_utils`` cleaning steps, every column that holds
    the string ``'0'`` at least once gets a companion ``<column>_is_missing``
    indicator (1 where the value is ``'0'``, 0 otherwise). All columns whose
    name does not contain ``_is_missing`` are label-encoded, and the frame is
    written to ``datasets/experiments/missing_state.csv`` and passed to
    ``log_artifact``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/missing_state.csv"

    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    # A list (not a set) keeps the original column order, so the indicator
    # columns land in the same order in the CSV on every run.
    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])
    for feature in zero_values:
        dataset[f'{feature}_is_missing'] = dataset[feature].eq('0').astype(int)

    # The indicators are already numeric; only the remaining columns are encoded.
    features_columns = [column for column in dataset.columns if '_is_missing' not in column]

    dataset[features_columns] = label_encode(dataset[features_columns])
    dataset.to_csv(output_path, index=False)
    log_artifact(output_path)
def target_encoder_only_complains(dataset):
    """Export the target-encoded dataset with information requests excluded.

    Rows whose ``PQR_TIPOPETICION`` is ``'peticion de informacion'`` or
    ``'consulta y/o solicitud de informacion'`` are removed first. The rest
    goes through the shared ``data_utils`` cleaning steps, the features are
    encoded against ``RIESGO_VIDA`` with ``data_utils.encode_features`` and the
    result is written to
    ``datasets/experiments/target_encoder_only_complains.csv`` and passed to
    ``log_artifact``.

    Args:
        dataset: Raw data frame containing a ``PQR_TIPOPETICION`` column.
    """
    output_path = "datasets/experiments/target_encoder_only_complains.csv"

    petition_type = dataset['PQR_TIPOPETICION']
    is_information_request = (
        (petition_type == 'peticion de informacion')
        | (petition_type == 'consulta y/o solicitud de informacion')
    )
    complaints = dataset[~is_information_request]

    complaints = data_utils.clean_afec_dpto(complaints)
    complaints = data_utils.clean_riesgo_vida(complaints)
    complaints = data_utils.clean_cie_10(complaints)
    complaints = data_utils.remove_features(complaints)
    complaints = complaints.reset_index().drop(columns=['index'])

    target = complaints[['RIESGO_VIDA']]
    predictors = complaints.drop(columns=['RIESGO_VIDA'])

    encoded = data_utils.encode_features(predictors, target)
    encoded['RIESGO_VIDA'] = target

    encoded.to_csv(output_path, index=False)
    log_artifact(output_path)
def normalizing(dataset):
    """Export a label-encoded dataset with log-transformed, min-max scaled features.

    After the shared ``data_utils`` cleaning steps and ``label_encode``, every
    column except ``RIESGO_VIDA`` is transformed with ``log(x + 1)`` and then
    rescaled by a freshly fitted ``MinMaxScaler``. The untouched label is added
    back, and the frame is written to ``datasets/experiments/normalizing.csv``
    and passed to ``log_artifact``.

    Args:
        dataset: Raw data frame accepted by the ``data_utils`` cleaners.
    """
    output_path = "datasets/experiments/normalizing.csv"

    cleaned = data_utils.clean_afec_dpto(dataset)
    cleaned = data_utils.clean_riesgo_vida(cleaned)
    cleaned = data_utils.clean_cie_10(cleaned)
    cleaned = data_utils.remove_features(cleaned)
    encoded = label_encode(cleaned.reset_index().drop(columns=['index']))

    target = encoded[['RIESGO_VIDA']]
    predictors = encoded.drop(columns=['RIESGO_VIDA'])
    columns = predictors.columns

    # Log transform first, then rescale the transformed values.
    predictors[columns] = predictors[columns].apply(lambda column: np.log(column + 1))
    predictors[columns] = MinMaxScaler().fit_transform(predictors[columns])

    # The label is re-attached by position, not by index alignment.
    predictors['RIESGO_VIDA'] = target.values
    predictors.to_csv(output_path, index=False)
    log_artifact(output_path)
def cie10_only_complains(dataset):
    """Export complaints enriched with CIE-10 catalogue attributes, target-encoded.

    Steps, in order:

    1. Drop rows whose ``PQR_TIPOPETICION`` is ``'peticion de informacion'``
       or ``'consulta y/o solicitud de informacion'``.
    2. Left-join ``datasets/CIE10.csv`` (``;``-separated) on ``CIE_10`` against
       the lower-cased ``DESCRIPCION_COD_CIE_10_04`` and drop the join and
       description columns.
    3. Fill missing catalogue attributes with ``'no_cie10'`` and keep only rows
       whose ``CAPITULO`` was matched.
    4. Derive ``CIE10_SEXO``, ``AFEC_EDADR_INF``, ``AFEC_EDADR_SUP`` and
       ``CIE10_RANGO_EDAD`` through the helpers ``cie10_sexo``, ``to_year``,
       ``get_edad_inf``, ``get_edad_sup`` and ``in_range`` (defined outside
       this function), then drop the columns they were derived from.
    5. Apply ``clean_afec_dpto``, ``clean_riesgo_vida`` and ``remove_features``
       from ``data_utils``, encode the features against ``RIESGO_VIDA`` and
       write ``datasets/experiments/cie10_only_complains.csv``, which is then
       passed to ``log_artifact``.

    Args:
        dataset: Raw data frame with at least ``PQR_TIPOPETICION``, ``CIE_10``
            and ``AFEC_EDADR`` columns.
    """
    output_path = "datasets/experiments/cie10_only_complains.csv"

    dataset = dataset[
        (dataset['PQR_TIPOPETICION'] != 'peticion de informacion') &
        (dataset['PQR_TIPOPETICION'] != 'consulta y/o solicitud de informacion')
    ]

    cie10_df = pd.read_csv('datasets/CIE10.csv', sep=';')
    # The join key is lower-cased on the catalogue side only.
    # NOTE(review): presumably CIE_10 in `dataset` is already lower-case - confirm.
    cie10_df['DESCRIPCION_COD_CIE_10_04'] = cie10_df['DESCRIPCION_COD_CIE_10_04'].apply(
        lambda value: value.lower()
    )
    dataset = pd.merge(
        left=dataset,
        right=cie10_df,
        how='left',
        left_on='CIE_10',
        right_on='DESCRIPCION_COD_CIE_10_04',
    )

    dataset = dataset.drop(
        ['CIE_10', 'NOMBRE_CAPITULO', 'DESCRIPCION_COD_CIE_10_03', 'DESCRIPCION_COD_CIE_10_04'],
        axis=1,
    )

    cie10_columns = [
        'CAPITULO',
        'COD_CIE_10_03',
        'COD_CIE_10_04',
        'SEXO',
        'LIMITE_INFERIOR_EDAD',
        'LIMITE_SUPERIOR_EDAD',
    ]

    # Rows without a catalogue match (and empty catalogue cells) get a sentinel.
    dataset[cie10_columns] = dataset[cie10_columns].fillna('no_cie10')
    # .copy() because new columns are assigned on this filtered frame below;
    # without it the assignments would target a slice of the merged frame.
    dataset = dataset[dataset['CAPITULO'] != 'no_cie10'].copy()

    dataset['CIE10_SEXO'] = dataset['SEXO'].apply(cie10_sexo)
    dataset['LIMITE_INFERIOR_EDAD_Y'] = dataset['LIMITE_INFERIOR_EDAD'].apply(to_year)
    dataset['LIMITE_SUPERIOR_EDAD_Y'] = dataset['LIMITE_SUPERIOR_EDAD'].apply(to_year)
    dataset['AFEC_EDADR_INF'] = dataset['AFEC_EDADR'].apply(get_edad_inf)
    dataset['AFEC_EDADR_SUP'] = dataset['AFEC_EDADR'].apply(get_edad_sup)
    # in_range receives the whole row, so it must run before the helper
    # columns are dropped.
    dataset['CIE10_RANGO_EDAD'] = dataset.apply(in_range, axis=1)

    dataset = dataset.drop(
        [
            'SEXO',
            'LIMITE_INFERIOR_EDAD',
            'LIMITE_SUPERIOR_EDAD',
            'LIMITE_INFERIOR_EDAD_Y',
            'LIMITE_SUPERIOR_EDAD_Y',
            'AFEC_EDADR',
        ],
        axis=1,
    )

    # clean_cie_10 is not applied here: the CIE_10 column was dropped above.
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)

    encoded_features = data_utils.encode_features(features, labels)

    encoded_features['RIESGO_VIDA'] = labels

    encoded_features.to_csv(output_path, index=False)
    log_artifact(output_path)