def preprocess(data, data_struct):
    ''' Run the full preprocessing pipeline over `data`.

    The steps are, in order: drop every column whose entry in
    `data_struct` belongs to one of the excluded form collections, apply
    `fix_single_errors`, run each per-datatype transformer, and finally
    let `select_data` prune what is left.

    Returns the `(data, data_struct)` pair handed back by `select_data`.
    '''

    # Form collections whose fields are discarded up front
    excluded_forms = ['!!! CARDIO (OLD DON’T USE)!!!',
                      'Vascular medicine (OPTIONAL)',
                      'CARDIOLOGY - CAPACITY (OPTIONAL)']
    in_excluded_form = data_struct.loc[:, 'Form Collection Name'].isin(excluded_forms)
    cols = data_struct.loc[in_excluded_form, 'Field Variable Name'].to_list()
    data = data.drop(is_in_columns(cols, data), axis=1)

    # Fix single errors
    data = fix_single_errors(data)

    # Transform variables; each step takes and returns (data, data_struct)
    transformers = (
        transform_binary_features,
        transform_categorical_features,
        transform_numeric_features,
        transform_time_features,
        transform_string_features,
        transform_calculated_features,
    )
    for transform in transformers:
        data, data_struct = transform(data, data_struct)

    # Final column selection is delegated to select_data
    data, data_struct = select_data(data, data_struct)

    return data, data_struct
# Example no. 2
# 0
def preprocess(data, col_dict, field_types):
    ''' Transform `data` one field type at a time and return the result.

    Field names are looked up in `field_types` by matching its
    'Field type' column and reading the 'Variable name' column.
    `col_dict` is accepted but not used in this function.

    NOTE: the binary step assigns into `data` before it is rebound, so
    the frame passed in by the caller is modified in place.

    TODO: Remove all columns that represent if a value is measured
    TODO: Rotate  - All values such that higher value should be better outcome
    TODO: Automatic outlier detection
    '''

    variable_names = field_types['Variable name']
    kinds = field_types['Field type']

    def keep_present(candidates):
        # Reads `data` at call time on purpose: `data` is rebound by the
        # steps below, and each lookup must see the current columns.
        return [name for name in candidates if name in data.columns]

    # Binary features --> Most radio button fields
    radio_names = variable_names[kinds == 'radio'].tolist()
    radio_fields = keep_present(radio_names)
    data[radio_fields] = transform_binary_features(data[radio_fields])

    # Categorical --> Some radio button fields, dropdown fields, checkbox fields
    category_names = variable_names[kinds.isin(['dropdown', 'checkbox'])].tolist()
    category_fields = keep_present(category_names)
    # NOTE: categorical radio fields are selected in util
    data = transform_categorical_features(data, category_fields, radio_fields)

    # Numeric
    numeric_names = variable_names[kinds == 'numeric'].tolist()
    numeric_fields = keep_present(numeric_names)
    data = transform_numeric_features(data, numeric_fields)

    # Time
    data = transform_time_features(data)

    # String --> passed on as listed, without filtering against data.columns
    string_fields = variable_names[kinds == 'string'].tolist()
    data = transform_string_features(data, string_fields)

    return select_data(data)
def preprocess(data, data_struct):
    ''' Run the preprocessing pipeline over `data`.

    Applies `fix_single_errors`, then each per-datatype transformer in
    order, and finally `select_data`.

    Returns the `(data, data_struct)` pair handed back by `select_data`.
    '''

    # Fix single errors
    data = fix_single_errors(data)

    # Transform variables; each step takes and returns (data, data_struct)
    pipeline = (
        transform_binary_features,
        transform_categorical_features,
        transform_numeric_features,
        transform_time_features,
        transform_string_features,
        transform_calculated_features,
    )
    for step in pipeline:
        data, data_struct = step(data, data_struct)

    # Final column selection is delegated to select_data
    return select_data(data, data_struct)