Esempio n. 1
0
    def test_transform_with_response(self):
        """Check Automater.transform when 'habitat' is configured as the response.

        Three scenarios are exercised against the mushroom observations:
        ``df_out=False``, ``df_out=True``, and ``df_out=True`` with the
        response column absent from the frame being transformed.
        """
        observations = lib.load_mushroom()
        expected_rows = 8124
        columns = ['odor', 'habitat']

        # Two categorical variables, df_out=False: result unpacks into a pair
        plain_auto = Automater(categorical_vars=columns,
                               df_out=False,
                               response_var='habitat')
        plain_auto.fit(observations)

        features, _response = plain_auto.transform(observations)
        self.assertEqual((expected_rows,), features[0].shape)

        # Two categorical variables, df_out=True: result keeps both columns
        frame_auto = Automater(categorical_vars=columns,
                               df_out=True,
                               response_var='habitat')
        frame_auto.fit(observations)

        with_response = frame_auto.transform(observations)
        self.assertEqual(expected_rows, len(with_response.index))
        self.assertEqual((expected_rows, 2), with_response.shape)
        self.assertCountEqual(columns, with_response.columns)

        # Response column missing from the input: reuse the fitted df_out=True
        # Automater on a frame that only holds 'odor'.
        input_only_columns = ['odor']
        input_only_df = observations[input_only_columns]
        without_response = frame_auto.transform(input_only_df)
        self.assertEqual(expected_rows, len(without_response.index))
        self.assertEqual((expected_rows, 1), without_response.shape)
        self.assertCountEqual(input_only_columns, without_response.columns)
def main():
    """Fit a small Keras classifier on the mushroom data using Automater.

    Loads the observations, splits them into train and test sets, fits an
    Automater with ``class`` as the response variable, trains a dense network
    on the transformed training data, and prints the predictions for the
    test observations after mapping them back through the Automater.
    """
    # Load data and report what is in it
    observations = lib.load_mushroom()
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(observations['class'].value_counts()))

    # Every column used here is categorical; there are no numerical or text ones
    categorical_vars = [
        'class',
        'cap-shape',
        'cap-surface',
        'cap-color',
        'bruises',
        'odor',
        'gill-attachment',
        'gill-spacing',
        'gill-size',
        'gill-color',
        'stalk-shape',
        'stalk-root',
        'stalk-surface-above-ring',
        'stalk-surface-below-ring',
        'stalk-color-above-ring',
        'stalk-color-below-ring',
        'veil-type',
        'veil-color',
        'ring-number',
        'ring-type',
        'spore-print-color',
        'population',
        'habitat',
    ]

    # Work on copies of the split so the new prediction column is added to an
    # independent frame
    train_split, test_split = train_test_split(observations)
    train_observations = train_split.copy()
    test_observations = test_split.copy()

    # Create and fit Automater
    auto = Automater(numerical_vars=[],
                     categorical_vars=categorical_vars,
                     text_vars=[],
                     response_var='class')
    auto.fit(train_observations)

    # auto.transform, auto.input_nub, auto.input_layers, auto.output_nub and
    # auto.loss come from the Automater; the rest is core Keras
    train_X, train_y = auto.transform(train_observations)
    test_X, _ = auto.transform(test_observations)

    network = auto.input_nub
    network = Dense(32)(network)
    network = Dense(32, activation='relu')(network)
    network = Dense(32)(network)
    network = auto.output_nub(network)

    model = Model(inputs=auto.input_layers, outputs=network)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])
    model.fit(train_X, train_y)

    raw_predictions = model.predict(test_X)

    # Inverse transform model output to get usable results, and store them
    # alongside the test observations
    prediction_column = auto.response_var + '_pred'
    test_observations[prediction_column] = auto.inverse_transform_output(
        raw_predictions)
    print('Predictions: {}'.format(test_observations[prediction_column]))
Esempio n. 3
0
    def test_fit(self):
        """Fitting on two categorical columns flags the Automater as fitted
        and builds one input-mapper feature per column."""
        observations = lib.load_mushroom()
        columns = ['odor', 'habitat']

        # Two variables
        automater = Automater(categorical_vars=columns)
        automater.fit(observations)

        self.assertEqual(Automater, type(automater))
        self.assertEqual(columns, automater._user_provided_variables)
        self.assertTrue(automater.fitted)

        # The transformation pipeline must have been built / trained: the first
        # element of every built feature is the column list it was built for
        built_columns = [
            feature[0] for feature in automater.input_mapper.built_features
        ]
        self.assertEqual([['odor'], ['habitat']], built_columns)
Esempio n. 4
0
    def test_transform_no_response(self):
        """Check Automater.transform when no response variable is configured.

        With ``df_out=False`` the result unpacks into ``(X, y)`` and ``y`` must
        be ``None``. With ``df_out=True`` the result must have 8124 rows and
        exactly the two input columns.
        """
        train_df = lib.load_mushroom()

        # Two categorical variables, df_out = False
        test_columns = ['odor', 'habitat']
        auto = Automater(categorical_vars=test_columns, df_out=False)
        auto.fit(train_df)

        (X, y) = auto.transform(train_df)
        self.assertEqual((8124, ), X[0].shape)
        # Identity check rather than ``==``: an equality comparison against an
        # array-like y would be ambiguous instead of failing cleanly.
        self.assertIsNone(y)

        # Two categorical variables, df_out = True
        test_columns = ['odor', 'habitat']
        auto = Automater(categorical_vars=test_columns, df_out=True)
        auto.fit(train_df)

        transformed = auto.transform(train_df)
        self.assertEqual(8124, len(transformed.index))
        self.assertEqual((8124, 2), transformed.shape)
        self.assertCountEqual(test_columns, transformed.columns)
Esempio n. 5
0
    def test_categorical_whole(self):
        """End-to-end run with categorical inputs: fit an Automater, train a
        Keras model on the transformed data, and predict on held-out rows that
        no longer contain the response column."""
        # Set up data set: random row mask, rows under the threshold go to train
        observations = lib.load_mushroom()
        in_train = numpy.random.rand(len(observations)) < 0.95
        train_rows = observations[in_train]
        held_out_rows = observations[~in_train]
        columns = ['odor', 'habitat', 'population', 'class']

        # Create and train the Automater
        automater = Automater(categorical_vars=columns, response_var='class')
        automater.fit(train_rows)
        X_train, y_train = automater.transform(train_rows)

        # Pull the network entry and exit points out of the Automater
        entry_nub = automater.input_nub
        exit_nub = automater.output_nub

        # Create DL model: input nub -> Dense(30) -> output nub
        hidden = Dense(30)(entry_nub)
        outputs = exit_nub(hidden)

        model = Model(inputs=automater.input_layers, outputs=outputs)
        model.compile(optimizer='Adam', loss=automater.loss)

        # Train DL model
        model.fit(X_train, y_train)

        # Transform the held-out rows without the response column and predict
        held_out_rows = held_out_rows.drop('class', axis=1)
        X_test, _ = automater.transform(held_out_rows)
        model.predict(X_test)
Esempio n. 6
0
    def test_boolean(self):
        """End-to-end run mixing categorical inputs with one boolean input
        derived from the 'population' column."""
        frame = lib.load_mushroom()
        # Boolean feature: True where population equals 's'
        frame['population_bool'] = frame['population'] == 's'

        # Random row mask, rows under the threshold go to train
        in_train = numpy.random.rand(len(frame)) < 0.95
        train_rows = frame[in_train]
        held_out_rows = frame[~in_train]

        automater = Automater(categorical_vars=['odor', 'habitat', 'class'],
                              boolean_vars=['population_bool'],
                              response_var='class')

        automater.fit(train_rows)
        X_train, y_train = automater.transform(train_rows)

        # Pull the network entry and exit points out of the Automater
        entry_nub = automater.input_nub
        exit_nub = automater.output_nub

        # Create DL model: input nub -> Dense(30) -> output nub
        hidden = Dense(30)(entry_nub)
        outputs = exit_nub(hidden)

        model = Model(inputs=automater.input_layers, outputs=outputs)
        model.compile(optimizer='Adam', loss=automater.loss)

        # Train DL model
        model.fit(X_train, y_train)

        # Transform the held-out rows without the response column and predict
        held_out_rows = held_out_rows.drop('class', axis=1)
        X_test, _ = automater.transform(held_out_rows)
        model.predict(X_test)
    def test_whole(self):
        """Build and compile a minimal model from the Categorical datatype's
        own input nub, output nub and suggested loss for 'cap-shape'."""
        variable = 'cap-shape'

        # Create datatype
        categorical = Categorical()

        # Load observations
        frame = lib.load_mushroom()

        # Transform observations with the datatype's default pipeline
        feature_spec = ([variable], categorical.default_transformation_pipeline)
        mapper = DataFrameMapper([feature_spec], df_out=True)
        transformed = mapper.fit_transform(frame)

        # Create network: the input nub feeds straight into the output nub
        input_layer, entry_nub = categorical.input_nub_generator(
            variable, transformed)
        exit_nub = categorical.output_nub_generator(variable, transformed)
        outputs = exit_nub(entry_nub)

        model = Model(input_layer, outputs)
        model.compile(optimizer='adam',
                      loss=categorical.output_suggested_loss())
Esempio n. 8
0
    def test_create_input_nub_numerical(self):
        """Check that _create_input_nub yields one input layer per variable
        listed in the variable type dict (zero, one and three variables)."""
        # TODO rename function, there is no numerical input
        observations = lib.load_mushroom()

        # Zero variables: no input layers are expected
        input_layers, _ = Automater()._create_input_nub(
            {'categorical_vars': []}, observations)
        self.assertEqual(list(), input_layers)

        # One variable, then multiple variables, passed as numerical_vars
        # TODO Check layer type
        cases = (
            (['odor'], 1),
            (['odor', 'habitat', 'population'], 3),
        )
        for columns, expected_layers in cases:
            type_dict = {'numerical_vars': columns}
            automater = Automater(numerical_vars=columns)
            input_layers, _ = automater._create_input_nub(
                type_dict, observations)
            self.assertEqual(expected_layers, len(input_layers))