Ejemplo n.º 1
0
    def test_unsupervised(self):
        """Check an Automater created without an output variable.

        Covers the state before ``fit``, the state after ``fit``, and
        ``transform`` with both ``df_out=False`` and ``df_out=True``.
        """
        observations = lib.load_lending_club()

        # Train /test split; each split is replaced by its own copy
        train_observations, test_observations = train_test_split(observations)
        train_observations = train_observations.copy()
        test_observations = test_observations.copy()

        # Unsupervised
        data_type_dict = {
            'numerical': [
                'loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec',
                'revol_bal', 'revol_util', 'total_acc', 'pub_rec_bankruptcies'
            ],
            'categorical': [
                'term', 'grade', 'emp_length', 'home_ownership', 'loan_status',
                'addr_state', 'application_type'
            ],
            'text': ['desc', 'purpose', 'title']
        }
        auto = Automater(data_type_dict=data_type_dict)
        self.assertFalse(auto.supervised)

        # With no output variable, every declared variable is expected to be
        # an input variable
        expected_input_vars = [
            var for var_list in data_type_dict.values() for var in var_list
        ]
        self.assertCountEqual(expected_input_vars, auto.input_vars)
        self.assertIsNone(auto.output_var)
        self.assertIsInstance(auto.input_mapper, DataFrameMapper)
        self.assertIsNone(auto.output_mapper)
        self.assertFalse(auto.fitted)

        # The response variable check must fail when there is no output variable
        self.assertRaises(AssertionError, auto._check_has_response_var)

        # Test fit
        auto.fit(train_observations)
        self.assertTrue(auto.fitted)

        self.assertIsNotNone(auto.input_mapper.built_features)
        self.assertIsInstance(auto.input_layers, list)
        self.assertEqual(len(expected_input_vars), len(auto.input_layers))
        self.assertIsNotNone(auto.input_nub)

        # Output side stays unset after fitting
        self.assertIsNone(auto.output_nub)
        self.assertIsNone(auto.output_mapper)

        # Test transform, df_out=False
        X, y = auto.transform(test_observations)
        self.assertIsInstance(X, list)
        self.assertIsNone(y)
        self.assertEqual(test_observations.shape[0],
                         X[0].shape[0])  # Correct number of rows back

        # Test transform, df_out=True
        transformed_observations = auto.transform(test_observations,
                                                  df_out=True)
        self.assertIsInstance(transformed_observations, pandas.DataFrame)
        self.assertEqual(
            test_observations.shape[0],
            transformed_observations.shape[0])  # Correct number of rows back
Ejemplo n.º 2
0
def main():
    """Fit an Automater on the lending club data and train a Keras model.

    The data is split into train and test sets, transformed by the Automater,
    and used to fit a model built from ``auto.input_nub`` and
    ``auto.output_nub``. When ``save_results`` is set, the model and the
    transformed / predicted arrays are written to ``lib.get_temp_dir()``.
    """
    # List out which components are supplied by Automater
    # In this example, we're utilizing X and y generated by the Automater, auto.input_nub, auto.input_layers,
    # auto.output_nub, and auto.suggest_loss

    save_results = True

    # Load data
    observations = lib.load_lending_club()
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(observations['loan_status'].value_counts()))

    # Train /test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # List out variable types
    data_type_dict = {'numerical': ['loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                                    'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
                                    'revol_util',
                                    'total_acc', 'pub_rec_bankruptcies'],
                      'categorical': ['term', 'grade', 'emp_length', 'home_ownership', 'loan_status', 'addr_state',
                                      'application_type', 'disbursement_method'],
                      'text': ['desc', 'purpose', 'title']}
    output_var = 'loan_status'

    # Create and fit Automater
    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)
    auto.fit(train_observations)

    # Transform data
    # NOTE(review): fit_transform presumably fits again after the fit() call
    # above; kept as-is to preserve behavior -- confirm whether plain
    # transform() would be sufficient here.
    train_X, train_y = auto.fit_transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    # Create and fit keras (deep learning) model.

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='adam', loss=auto.suggest_loss())

    model.fit(train_X, train_y)

    # Make model predictions and inverse transform model predictions, to get usable results
    pred_test_y = model.predict(test_X)
    # NOTE(review): the inverse-transformed predictions are not stored or saved
    auto.inverse_transform_output(pred_test_y)

    # Save all results
    if save_results:
        temp_dir = lib.get_temp_dir()
        model.save(os.path.join(temp_dir, 'model.h5py'))

        # (file name, object) pairs, written in the same order as before
        artifacts = [
            ('train_X.pkl', train_X),
            ('train_y.pkl', train_y),
            ('test_X.pkl', test_X),
            ('test_y.pkl', test_y),
            ('pred_test_y.pkl', pred_test_y),
        ]
        for file_name, artifact in artifacts:
            # Context manager closes (and flushes) each file handle, even if
            # pickling fails part-way through
            with open(os.path.join(temp_dir, file_name), 'wb') as file_handle:
                pickle.dump(artifact, file_handle)
def main():
    """Train a small Keras model on lending club data with ``loan_status`` as the response.

    Categorical columns are null-filled and cast to strings, the Automater
    transforms the observations, and a three-layer model is fit for a single
    epoch with a 20% validation split.
    """
    logging.getLogger().setLevel(logging.INFO)

    # When set, only a 100 row random sample of the data is used
    test_run = True

    observations = load_lending_club()
    if test_run:
        observations = observations.sample(n=100)

    # Variable lists handed to keras_pandas, grouped by data type
    text_vars = ['desc', 'title']
    numerical_vars = [
        'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc',
        'installment', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec',
        'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate',
        'revol_util'
    ]
    categorical_vars = [
        'term', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
        'verification_status', 'issue_d', 'pymnt_plan', 'purpose',
        'addr_state', 'initial_list_status', 'application_type',
        'disbursement_method', 'loan_status'
    ]

    # Replace nulls with the literal 'None', then cast every value to str
    for column in categorical_vars:
        null_filled = observations[column].fillna('None')
        observations[column] = null_filled.apply(str)

    auto = Automater(categorical_vars=categorical_vars,
                     numerical_vars=numerical_vars,
                     text_vars=text_vars,
                     response_var='loan_status')

    X, y = auto.fit_transform(observations)

    # Input nub -> user-defined hidden layers -> output nub
    hidden = auto.input_nub
    hidden = Dense(8)(hidden)
    hidden = Dense(16, activation='relu')(hidden)
    hidden = Dense(8)(hidden)
    output = auto.output_nub(hidden)

    model = Model(inputs=auto.input_layers, outputs=output)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    logging.warning(
        'Settle in! This training normally takes about 5-20 minutes on CPU')
    model.fit(X, y, epochs=1, validation_split=.2)
Ejemplo n.º 4
0
    def test_lending(self):
        """Check that ``load_lending_club`` returns a DataFrame with the expected columns."""
        # Column names the loaded frame must contain (compared ignoring order)
        expected_columns = [
            'id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
            'term', 'int_rate', 'installment', 'grade', 'sub_grade',
            'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
            'verification_status', 'issue_d', 'loan_status', 'pymnt_plan',
            'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti',
            'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
            'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
            'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
            'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
            'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
            'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
            'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
            'last_credit_pull_d', 'collections_12_mths_ex_med',
            'mths_since_last_major_derog', 'policy_code', 'application_type',
            'annual_inc_joint', 'dti_joint', 'verification_status_joint',
            'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
            'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
            'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
            'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
            'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
            'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
            'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
            'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
            'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_bc_dlq',
            'mths_since_recent_inq', 'mths_since_recent_revol_delinq',
            'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl',
            'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',
            'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
            'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
            'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75',
            'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim',
            'total_bal_ex_mort', 'total_bc_limit',
            'total_il_high_credit_limit', 'revol_bal_joint',
            'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths',
            'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',
            'sec_app_open_act_il', 'sec_app_num_rev_accts',
            'sec_app_chargeoff_within_12_mths',
            'sec_app_collections_12_mths_ex_med',
            'sec_app_mths_since_last_major_derog', 'hardship_flag',
            'hardship_type', 'hardship_reason', 'hardship_status',
            'deferral_term', 'hardship_amount', 'hardship_start_date',
            'hardship_end_date', 'payment_plan_start_date', 'hardship_length',
            'hardship_dpd', 'hardship_loan_status',
            'orig_projected_additional_accrued_interest',
            'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
            'disbursement_method', 'debt_settlement_flag',
            'debt_settlement_flag_date', 'settlement_status',
            'settlement_date', 'settlement_amount', 'settlement_percentage',
            'settlement_term'
        ]

        loaded = load_lending_club()

        # The loader must hand back a pandas DataFrame
        self.assertIsInstance(loaded, pandas.DataFrame)

        # Same column names, same multiplicity, any order
        self.assertCountEqual(expected_columns, loaded.columns)
Ejemplo n.º 5
0
    def test_inverse_transform_numerical_response(self):
        """Check ``inverse_transform_output`` for a numerical response variable.

        Fits an Automater with ``funded_amnt`` as the response, trains a small
        model for one epoch, and asserts that the inverse-transformed
        predictions have neither mean ~0 nor standard deviation ~1
        (presumably to confirm they are no longer on a standardized scale).
        """

        # Load data
        observations = lib.load_lending_club()

        # Set to test run
        observations = observations.sample(n=100)

        # Declare variable types
        categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
                            'issue_d',
                            'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type',
                            'disbursement_method', 'loan_status']
        numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment', 'dti',
                          'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
                          'int_rate', 'revol_util']

        text_vars = ['desc', 'title']

        # Manual null filling
        for categorical_var in categorical_vars:
            observations[categorical_var] = observations[categorical_var].fillna('None')
            observations[categorical_var] = observations[categorical_var].apply(str)

        auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars, text_vars=text_vars,
                         response_var='funded_amnt')

        X, y = auto.fit_transform(observations)

        # Start model with provided input nub
        x = auto.input_nub

        # Fill in your own hidden layers
        x = Dense(8)(x)
        x = Dense(16, activation='relu')(x)
        x = Dense(8)(x)

        # End model with provided output nub
        x = auto.output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        # NOTE(review): 'accuracy' is tracked although the response is
        # numerical -- confirm this metric is intended here
        model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

        # Train model
        logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
        model.fit(X, y, epochs=1, validation_split=.2)
        unscaled_preds = model.predict(X)

        logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))

        scaled_preds = auto.inverse_transform_output(unscaled_preds)

        logging.debug('scaled_preds: {}'.format(list(scaled_preds)))

        # assertNotAlmostEqual replaces the deprecated assertNotAlmostEquals
        # alias, which was removed in Python 3.12
        self.assertNotAlmostEqual(0, numpy.mean(scaled_preds))

        self.assertNotAlmostEqual(1, numpy.std(scaled_preds))
Ejemplo n.º 6
0
def main():
    """Train a Keras model on lending club data with ``loan_amnt`` as the response.

    Percent-formatted columns are converted to numbers, selected columns are
    null-filled with 0, and the data is split into train and test sets. The
    Automater is fit on the train set, a model is trained, and the
    inverse-transformed test predictions are stored in a new column of the
    test set and printed.
    """

    # Load data
    observations = lib.load_lending_club()
    print('Observation columns: {}'.format(list(observations.columns)))
    class_balance = observations['loan_status'].value_counts()
    print('Class balance:\n {}'.format(class_balance))

    # Heuristic data transformations
    percent_columns = ['int_rate', 'revol_util']
    for column in percent_columns:
        # Strip out percent signs, then coerce anything unparseable to NaN
        without_percent = observations[column].apply(
            lambda value: str(value).replace('%', ''))
        observations[column] = pandas.to_numeric(without_percent,
                                                 errors='coerce')

    zero_filled_columns = ['mths_since_last_delinq', 'annual_inc_joint']
    for column in zero_filled_columns:
        # Heuristic null filling for some variables
        observations[column] = observations[column].fillna(0)

    # List out variable types
    text_vars = ['desc', 'purpose', 'title']
    categorical_vars = [
        'term', 'grade', 'emp_length', 'home_ownership', 'addr_state',
        'application_type', 'disbursement_method'
    ]
    numerical_vars = [
        'loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
        'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
        'revol_util', 'total_acc', 'pub_rec_bankruptcies'
    ]

    # Train /test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # Create and fit Automater
    auto = Automater(numerical_vars=numerical_vars,
                     categorical_vars=categorical_vars,
                     text_vars=text_vars,
                     response_var='loan_amnt')
    auto.fit(train_observations)

    # Create and fit keras (deep learning) model
    # The auto.transform, auto.input_nub, auto.input_layers, and auto.loss are provided by keras-pandas, and
    # everything else is core Keras
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    hidden = auto.input_nub
    hidden = Dense(32)(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    hidden = Dense(32)(hidden)
    output = auto.output_nub(hidden)

    model = Model(inputs=auto.input_layers, outputs=output)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    model.fit(train_X, train_y)

    test_y_pred = model.predict(test_X)

    # Inverse transform model output, to get usable results and save all results
    usable_predictions = auto.inverse_transform_output(test_y_pred)
    prediction_column = auto.response_var + '_pred'
    test_observations[prediction_column] = usable_predictions
    print('Predictions: {}'.format(test_observations[prediction_column]))
Ejemplo n.º 7
0
    def test_supervised(self):
        """Check an Automater created with ``loan_status`` as the output variable.

        Covers the state before ``fit``, the state after ``fit``,
        ``transform`` with both ``df_out=False`` and ``df_out=True``,
        ``suggest_loss``, building and fitting a Keras model from the
        provided nubs, and ``inverse_transform_output``.
        """
        observations = lib.load_lending_club()

        # Train /test split; each split is replaced by its own copy
        train_observations, test_observations = train_test_split(observations)
        train_observations = train_observations.copy()
        test_observations = test_observations.copy()

        # Supervised
        data_type_dict = {
            'numerical': [
                'loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec',
                'revol_bal', 'revol_util', 'total_acc', 'pub_rec_bankruptcies'
            ],
            'categorical': [
                'term', 'grade', 'emp_length', 'home_ownership', 'loan_status',
                'addr_state', 'application_type'
            ],
            'text': ['desc', 'purpose', 'title']
        }
        output_var = 'loan_status'

        auto = Automater(data_type_dict=data_type_dict, output_var=output_var)

        self.assertTrue(auto.supervised)

        # Expected inputs are all declared variables except the output
        # variable; the flattened list is a new object, so removing from it
        # does not touch data_type_dict
        expected_input_vars = [
            var for var_list in data_type_dict.values() for var in var_list
        ]
        expected_input_vars.remove(output_var)
        self.assertCountEqual(expected_input_vars, auto.input_vars)
        self.assertEqual(output_var, auto.output_var)
        self.assertIsInstance(auto.input_mapper, DataFrameMapper)
        self.assertIsInstance(auto.output_mapper, DataFrameMapper)
        self.assertFalse(auto.fitted)
        # The fitted check must fail before fit() has been called
        self.assertRaises(AssertionError, auto._check_fitted)

        # Test fit
        auto.fit(train_observations)
        self.assertTrue(auto.fitted)

        self.assertIsNotNone(auto.input_mapper.built_features)
        self.assertIsInstance(auto.input_layers, list)
        self.assertEqual(len(expected_input_vars), len(auto.input_layers))
        self.assertIsNotNone(auto.input_nub)

        self.assertIsNotNone(auto.output_nub)
        self.assertIsNotNone(auto.output_mapper.built_features)

        # Test transform, df_out=False
        train_X, train_y = auto.transform(train_observations)
        test_X, test_y = auto.transform(test_observations)
        self.assertIsInstance(test_X, list)
        self.assertIsInstance(test_y, numpy.ndarray)
        self.assertEqual(test_observations.shape[0],
                         test_X[0].shape[0])  # Correct number of rows back
        self.assertEqual(test_observations.shape[0],
                         test_y.shape[0])  # Correct number of rows back

        # Test transform, df_out=True
        transformed_observations = auto.transform(test_observations,
                                                  df_out=True)
        self.assertIsInstance(transformed_observations, pandas.DataFrame)
        self.assertEqual(
            test_observations.shape[0],
            transformed_observations.shape[0])  # Correct number of rows back

        # Test suggest_loss
        suggested_loss = auto.suggest_loss()
        self.assertTrue(callable(suggested_loss))

        # Test model building

        x = auto.input_nub
        x = Dense(32)(x)
        x = auto.output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.suggest_loss())
        model.fit(train_X, train_y)

        pred_y = model.predict(test_X)

        # Test inverse_transform_output
        inv_transformed_pred_y = auto.inverse_transform_output(pred_y)
        self.assertEqual(test_observations.shape[0],
                         inv_transformed_pred_y.shape[0])