def test_column_transformer_special_strings():

    # one 'drop' -> ignore
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'drop', [1])])
    exp = np.array([[0.], [1.], [2.]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer(
        [('trans1', 'drop', [0]), ('trans2', 'drop', [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))

    # 'passthrough'
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'passthrough', [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # None itself / other string is not valid
    for val in [None, 'other']:
        ct = ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', val, [1])])
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit_transform, X_array)
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit, X_array)
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df),
                        ct2.fit_transform(X_df))
@pytest.mark.parametrize("key", [[0], np.array([0]), slice(0, 1),
                                 np.array([True, False])])
def test_column_transformer_remainder_numpy(key):
    # test different ways that columns are specified with passthrough
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)])
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert_true(sparse.issparse(X_trans))
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
def test_column_transformer_negative_column_indexes():
    X = np.random.randn(2, 2)
    X_categories = np.array([[1], [2]])
    X = np.concatenate([X, X_categories], axis=1)

    ohe = OneHotEncoder(categories='auto')

    tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
    tf_2 = ColumnTransformer([('ohe', ohe,  [2])], remainder='passthrough')
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
def test_2D_transformer_output():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_no_remaining_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'
@pytest.mark.parametrize("key", [[0], slice(0, 1), np.array([True, False]),
                                 ['first']])
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
def test_2D_transformer_output_pandas():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['col1', 'col2'])

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', TransNo2D(), 'col1')])
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.fit_transform, X_df)
    ct.fit(X_df)
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.transform, X_df)
def test_column_transformer_get_set_params():
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(), [1])])

    exp = {'n_jobs': 1,
           'remainder': 'drop',
           'trans1': ct.transformers[0][1],
           'trans1__copy': True,
           'trans1__with_mean': True,
           'trans1__with_std': True,
           'trans2': ct.transformers[1][1],
           'trans2__copy': True,
           'trans2__with_mean': True,
           'trans2__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert_dict_equal(ct.get_params(), exp)

    ct.set_params(trans1__with_mean=False)
    assert_false(ct.get_params()['trans1__with_mean'])

    ct.set_params(trans1='passthrough')
    exp = {'n_jobs': 1,
           'remainder': 'drop',
           'trans1': 'passthrough',
           'trans2': ct.transformers[1][1],
           'trans2__copy': True,
           'trans2__with_mean': True,
           'trans2__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert_dict_equal(ct.get_params(), exp)
def test_column_transformer_no_estimators():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).astype('float').T
    ct = ColumnTransformer([], remainder=StandardScaler())

    params = ct.get_params()
    assert params['remainder__with_mean']

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][2] == [0, 1, 2]
def test_column_transformer_get_set_params_with_remainder():
    ct = ColumnTransformer([('trans1', StandardScaler(), [0])],
                           remainder=StandardScaler())

    exp = {'n_jobs': 1,
           'remainder': ct.remainder,
           'remainder__copy': True,
           'remainder__with_mean': True,
           'remainder__with_std': True,
           'trans1': ct.transformers[0][1],
           'trans1__copy': True,
           'trans1__with_mean': True,
           'trans1__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert ct.get_params() == exp

    ct.set_params(remainder__with_std=False)
    assert not ct.get_params()['remainder__with_std']

    ct.set_params(trans1='passthrough')
    exp = {'n_jobs': 1,
           'remainder': ct.remainder,
           'remainder__copy': True,
           'remainder__with_mean': True,
           'remainder__with_std': False,
           'trans1': 'passthrough',
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert ct.get_params() == exp
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
def test_column_transformer_named_estimators():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(with_std=False), [1])])
    assert_false(hasattr(ct, 'transformers_'))
    ct.fit(X_array)
    assert_true(hasattr(ct, 'transformers_'))
    assert_true(isinstance(ct.named_transformers_['trans1'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans1, StandardScaler))
    assert_true(isinstance(ct.named_transformers_['trans2'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans2, StandardScaler))
    assert_false(ct.named_transformers_.trans2.with_std)
    # check that the transformers are fitted
    assert_equal(ct.named_transformers_.trans1.mean_, 1.)
def test_column_transformer_drop_all_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    #  SparseMatrixTrans creates 3 features for each column, thus:
    assert X_trans.shape == (3, 3)
    assert_array_equal(X_trans.toarray(), np.eye(3))
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
@pytest.mark.parametrize(
    "key", [[0], slice(0, 1), np.array([True, False]), ['first'], 'pd-index',
            np.array(['first'], dtype=object)])
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, six.string_types) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])
@pytest.mark.parametrize("key", [[0], np.array([0]), slice(0, 1),
                                 np.array([True, False, False])])
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_2D_transformer_output():

    class TransNo2D(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])

    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    with pytest.warns(DataConversionWarning):
        # TODO: this warning is not very useful in this case, would be good
        # to get rid of it
        assert_array_equal(ct.fit_transform(X_list), expected_result)
        assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2],
                        [2, 4, 6],
                        [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_callable_specifier():
    # assert that function gets the full array / dataframe
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([[0, 1, 2]]).T

    def func(X):
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    def func(X):
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ['first']

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)
def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0]))
def test_column_transformer_sparse_threshold():
    X_array = np.array([['a', 'b'], ['A', 'B']], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5
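    # ColumnTransformer only keeps the stacked output sparse when at least one
    # transformer returns a sparse matrix *and* the overall density (nnz / size)
    # is below ``sparse_threshold``; otherwise the result is densified.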

    # apply threshold even if all sparse
    col_trans = ColumnTransformer([('trans1', OneHotEncoder(), [0]),
                                   ('trans2', OneHotEncoder(), [1])],
                                  sparse_threshold=0.2)
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for thres in [0.75001, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert sparse.issparse(res)
        assert col_trans.sparse_output_

    for thres in [0.75, 0]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=False), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_
def test_column_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first1D = np.array([0, 1, 2])
    X_res_second1D = np.array([2, 4, 6])
    X_res_first = X_res_first1D.reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # single column 1D / 2D
        (0, X_res_first),
        ([0], X_res_first),
        # list-like
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda x: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), [0]),
                              ('trans2', Trans(), [1])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_res_first1D,
                     transformer_weights['trans2'] * X_res_second1D]).T
    assert_array_equal(both.fit_transform(X_array), res)
    assert_array_equal(both.fit(X_array).transform(X_array), res)
    assert len(both.transformers_) == 2

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_array).transform(X_array), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
Example No. 28
def prepare_data(train_df_raw, test_df_raw, data_prep_dict):
    '''
    
        Function to process raw data into required modelling data
        
        Inputs:
            1. train_df_raw - Dataframe
            2. test_df_raw  - Dataframe
            3. data_prep_dict - Dictionary
        
        Outputs:
            1. train_df_processed - Dataframe
            2. test_df_processed - Dataframe
    '''

    #quick check to apply data processing on both train and test combined
    #train_df_raw = pd.concat([train_df_raw,test_df_raw],axis = 0)

    #avoid SimpleImputer errors by manually filling missing values up front
    train_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    test_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    train_df_raw.fillna('missing', inplace=True)
    test_df_raw.fillna('missing', inplace=True)

    #modify data values to convert categorical raw attributes to potential numeric features

    train_df_raw.replace({'14+': '14'}, inplace=True)
    train_df_raw['Holding_Policy_Duration'] = train_df_raw[
        'Holding_Policy_Duration'].astype(float)
    test_df_raw.replace({'14+': '14'}, inplace=True)
    test_df_raw['Holding_Policy_Duration'] = test_df_raw[
        'Holding_Policy_Duration'].astype(float)

    #freeze data types
    train_df_raw[data_prep_dict['one_hot_encode']] = train_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)
    test_df_raw[data_prep_dict['one_hot_encode']] = test_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)

    #target encode required attributes
    for target_encode_col in data_prep_dict['target_encode']:
        encoding_dict = train_df_raw.groupby(
            target_encode_col)[TARGET].mean().to_dict()
        train_df_raw[target_encode_col] = train_df_raw[target_encode_col].map(
            encoding_dict)
        test_df_raw[target_encode_col] = test_df_raw[target_encode_col].map(
            encoding_dict)

    #fill missing Region Codes
    #city_code_means = train_df_raw.groupby(['City_Code'])[TARGET].mean().reset_index()
    #test_df_raw['Region_Code'] = test_df_raw.apply(
    #lambda row: city_code_means[TARGET][city_code_means.City_Code ==
    #                                    row['City_Code']].values[0]
    #                                if row['Region_Code'] not in train_df_raw['Region_Code'].unique() else row['Region_Code'],
    #                            axis=1
    #                        )

    #define set of transformation steps per raw attribute present in the data

    column_transformer_1 = ColumnTransformer(
        [('one_hot_encode', OneHotEncoder(sparse=False, drop='if_binary'),
          data_prep_dict['one_hot_encode'])],
        remainder='passthrough',
        verbose=True)

    #build and fit the column transformer on train data
    train_df_processed = column_transformer_1.fit_transform(train_df_raw)
    #apply the column transformer on test data
    test_df_processed = column_transformer_1.transform(test_df_raw)

    #convert numpy arrays into pandas dataframe for further analysis
    train_df_processed_1 = pd.DataFrame(
        train_df_processed, columns=column_transformer_1.get_feature_names())
    test_df_processed_1 = pd.DataFrame(
        test_df_processed, columns=column_transformer_1.get_feature_names())

    column_transformer_2 = ColumnTransformer(
        [('passthrough', 'passthrough',
          [col for col in train_df_processed_1.columns
           if col not in data_prep_dict['standard_scale']]),
         ('standard_scale', StandardScaler(),
          data_prep_dict['standard_scale'])],
        remainder='passthrough',
        verbose=True)

    #build and fit the column transformer on train data
    train_df_processed_2 = column_transformer_2.fit_transform(
        train_df_processed_1)
    #apply the column transformer on test data
    test_df_processed_2 = column_transformer_2.transform(test_df_processed_1)

    #recreate column names in the correct order, to understand feature importances
    train_df_processed_out = pd.DataFrame(
        train_df_processed_2,
        columns=[
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ] + data_prep_dict['standard_scale'])
    test_df_processed_out = pd.DataFrame(
        test_df_processed_2,
        columns=[
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ] + data_prep_dict['standard_scale'])

    #progress logger
    print('Data preparation completed, returning processed data')

    return train_df_processed_out, test_df_processed_out
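
# Hypothetical usage sketch (not part of the original script): the dict keys below
# mirror how prepare_data() consumes data_prep_dict, while the column names are
# assumptions for illustration only.
#
#   data_prep_dict = {
#       'one_hot_encode': ['Accomodation_Type'],     # categorical -> one-hot encoded
#       'target_encode': ['City_Code'],              # categorical -> target-mean encoded
#       'standard_scale': ['Reco_Policy_Premium'],   # numeric -> standardized
#   }
#   train_ready, test_ready = prepare_data(train_df, test_df, data_prep_dict)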
          "land_use_type_3_fraction", "land_use_type_4_fraction", "land_use_type_5_fraction",
          "land_use_type_6_fraction", "land_use_type_9_fraction"]
y_list = "observe_O3"

start = time()
file = "F:/graduation_thesis/new_all_data/model/all_model_data.csv"

data = pd.read_csv(file)
data.dropna(inplace=True)
data = data.sample(1000)
features = data[x_list]
labels = data[y_list]

numeric_features = x_list
numeric_transformer = Pipeline(steps=[('imp2', SimpleImputer(missing_values=-999, strategy='mean'))])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features)])
X = preprocessor.fit_transform(features)

Y = labels
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
model = RandomForestRegressor(bootstrap=True, criterion='mse',
                              max_depth=60, max_features='auto',
                              max_leaf_nodes=None, min_impurity_decrease=0.0,
                              min_impurity_split=None, min_samples_leaf=5,
                              min_samples_split=90, min_weight_fraction_leaf=0.0,
                              n_estimators=1200, n_jobs=-1, oob_score=False,
                              random_state=None, verbose=0, warm_start=False)

print("model training")
model.fit(x_train, y_train)
print("model training finished")
def test_column_transformer_get_feature_names():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans', Trans(), [0, 1])])
    # raise correct error when not fitted
    assert_raises(NotFittedError, ct.get_feature_names)
    # raise correct error when no feature names are available
    ct.fit(X_array)
    assert_raise_message(AttributeError,
                         "Transformer trans (type Trans) does not provide "
                         "get_feature_names", ct.get_feature_names)

    # working example
    X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
                  [{'c': 5}, {'c': 6}]], dtype=object).T
    ct = ColumnTransformer(
        [('col' + str(i), DictVectorizer(), i) for i in range(2)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b', 'col1__c'])

    # passthrough transformers not supported
    ct = ColumnTransformer([('trans', 'passthrough', [0, 1])])
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
                           remainder='passthrough')
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    # drop transformer
    ct = ColumnTransformer(
        [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b'])
Example No. 31
    1.0074863283776458, 0.20239896852403538, -0.043678728558593366,
    -0.13929748680369286, 1.3163604645710438, -0.3699637766938669,
    -0.6149300604558857, -0.854369594993175, 0.263445277972641,
    0.5712416961268142
]

# In[36]:

ct = pd.DataFrame([test_country], columns=df.columns)

# In[37]:

cols2 = df.select_dtypes(['int64', 'float64']).columns
pl = Pipeline(steps=[('imp', SimpleImputer(
    strategy='median')), ('scaler', StandardScaler())])
tf = ColumnTransformer(transformers=[('number', pl, cols2)], n_jobs=-1)
tf.fit(df)

# In[38]:


def q4():
    res = tf.transform(ct)[0][cols2.get_loc('Arable')]
    return round(float(res), 3)


# ## Question 5
#
# Find the number of _outliers_ in the `Net_migration` variable according to the _boxplot_ method, i.e. using the rule:
#
# $$x \notin [Q1 - 1.5 \times \text{IQR}, Q3 + 1.5 \times \text{IQR}] \Rightarrow x \text{ is an outlier}$$
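
# A plausible sketch for this question (the original solution cell is truncated
# here; the function name `q5` follows the `q4` pattern above and is an assumption):


def q5():
    net = df['Net_migration'].dropna()
    q1, q3 = net.quantile([0.25, 0.75])
    iqr = q3 - q1
    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    outliers = net[(net < low) | (net > high)]
    return int(outliers.shape[0])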
Example No. 32
admissionData = pd.read_csv("admissions_data.csv")

admissionData = admissionData.drop(["Serial No."], axis=1)
labels = admissionData.iloc[:, -1]

# remove uni rating and TOEFL score - unethical?
# remove serial no. and research - irrelevant info
features = admissionData.iloc[:, [0, 3, 4, 5, 6]]

# split dataset into train and test
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# scale/normalise dataset features
ct = ColumnTransformer([("normalize", Normalizer(), [0, 1, 2, 3])],
                       remainder='passthrough')
features_train = ct.fit_transform(features_train)
features_test = ct.transform(features_test)

learning_rate = 0.001
num_epochs = 20

# create neural network

#  admissionsModel = build_model(features_train, learning_rate)  # rewrite this function
#  admissionsModel.fit(features_train, labels_train, epochs=20, batch_size=1, verbose=1)
history1 = fit_model(build_model(features_train, learning_rate),
                     features_train, labels_train, learning_rate, num_epochs)

#  need to return the fitted model into a graph somehow here
Example No. 33
from core.utils.common_transformers import TypeSelector, FeatureSquarer
from core.utils.common_estimators import RandomBinaryClassifier

# sys.path.append('D:/GitRepos/github/PythonTestCode/prod_test')


# transformer tests
df = pd.DataFrame(data=[[1, 2, 'chad'], [4, 5, 'John']],
                  columns=['col1', 'col2', 'col3'])

float_pipeline = Pipeline(steps=[('float_squarer', FeatureSquarer())])
float_pipeline.fit(df)
float_pipeline.transform(df)

transformer_list = [('float', float_pipeline, ['col1', 'col2'])]
preprocessor = ColumnTransformer(transformer_list)

int_data = Pipeline(steps=[('column_extractor', TypeSelector('int64'))])

int_data.fit_transform(df)

# estimator test
X = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))
y = pd.Series(np.random.choice(['setosa', 'virginica'], 100, p=[0.3, 0.7]))
y_test = pd.Series(np.random.choice(['setosa', 'virginica'],
                                    100, p=[0.3, 0.7]))

model = RandomBinaryClassifier()

model.fit(X, y)
Example No. 34
class DogeDataLoader:
    def __init__(self,
                 filename,
                 categorical_cols,
                 target_col,
                 seq_length,
                 batch_size,
                 preprocessor=True,
                 prediction_window=1):
        '''
        :param filename: path to the csv dataset
        :param categorical_cols: name of the categorical columns, if None pass empty list
        :param target_col: name of the targeted column
        :param seq_length: window length to use
        :param prediction_window: window length to predict
        :param preprocessor: if normalize data or not
        :param batch_size: batch size
        '''
        self.data = self.read_and_preprocess(filename)
        self.categorical_cols = categorical_cols
        self.numerical_cols = list(
            set(self.data.columns) - set(categorical_cols) - set(target_col))
        self.target_col = target_col
        self.seq_length = seq_length
        self.prediction_window = prediction_window
        self.batch_size = batch_size
        self.preprocessor = preprocessor
        self.preprocess = ColumnTransformer(
            [
                ("scaler", StandardScaler(), self.numerical_cols),
                #("encoder", OneHotEncoder(), self.categorical_cols)
            ],
            remainder="passthrough")

    def read_and_preprocess(self, filename):
        # Reading
        df = pd.read_csv(filename)
        # Reorder and resetting index
        df = df[::-1].reset_index(drop=True)
        # Preprocessing 'Change' column
        df['Change %'] = df['Change %'].str.replace("%", "")
        df['Change %'] = pd.to_numeric(df['Change %'].str.replace(",", ""))
        # Preprocessing 'Vol.' column
        vols = [el for el in df['Vol.']]
        for num, el in enumerate(vols):
            # Check if is billion
            isB = el[-1] == 'B'
            try:
                el = float(el[:-1])
            except ValueError:
                print("Value Error at row ", num)
                el = vols[num - 1]
            if isB:
                el = el * 1000
            vols[num] = el
        df['Vol.'] = vols
        # Dropping Date column
        df.pop('Date')
        # Done, returning dataframe
        return df

    def preprocess_data(self):
        '''
        Preprocessing function
        '''
        X = self.data.drop(self.target_col, axis=1)
        y = self.data[self.target_col]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=0.8,
                                                            shuffle=False)
        if self.preprocessor is not None:
            X_train = self.preprocess.fit_transform(X_train)
            # transform only: reuse the statistics fitted on the training split
            X_test = self.preprocess.transform(X_test)

        if self.target_col:
            return X_train, X_test, y_train.values, y_test.values
        return X_train, X_test

    def frame_series(self, X, y=None):
        '''
        Function used to prepare the data for time series prediction
        :param X: set of features
        :param y: targeted value to predict
        :return: TensorDataset
        '''
        nb_obs, nb_features = X.shape
        features, target, y_hist = [], [], []

        for i in range(1, nb_obs - self.seq_length - self.prediction_window):
            features.append(
                torch.FloatTensor(X[i:i + self.seq_length, :]).unsqueeze(0))

        features_var = torch.cat(features)

        if y is not None:
            for i in range(1,
                           nb_obs - self.seq_length - self.prediction_window):
                target.append(
                    torch.tensor(y[i + self.seq_length:i + self.seq_length +
                                   self.prediction_window]))
            target_var = torch.cat(target)
            return TensorDataset(features_var, target_var)
        return TensorDataset(features_var)

    def get_loaders(self):
        '''
        Preprocess and frame the dataset
        :return: DataLoaders associated to training and testing data
        '''

        X_train, X_test, y_train, y_test = self.preprocess_data()

        train_dataset = self.frame_series(X_train, y_train)
        test_dataset = self.frame_series(X_test, y_test)

        train_iter = DataLoader(train_dataset,
                                batch_size=self.batch_size,
                                shuffle=False,
                                drop_last=True)
        test_iter = DataLoader(test_dataset,
                               batch_size=self.batch_size,
                               shuffle=False,
                               drop_last=True)
        return train_iter, test_iter
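
# Hypothetical usage sketch (not part of the original module; the CSV path and the
# target column name are assumptions based on the Investing.com-style columns that
# read_and_preprocess() parses above):
#
#   loader = DogeDataLoader(filename="DOGE_Historical_Data.csv",
#                           categorical_cols=[],
#                           target_col=["Price"],
#                           seq_length=30,
#                           batch_size=16)
#   train_iter, test_iter = loader.get_loaders()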
Example No. 35
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# For text, we:
# 1. Impute missing values with the string "missing"
# 2. Tfidf encode the text, using 1-grams and 2-grams.
text_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("tfidf", MultiColumnTfidfVectorizer(ngram_range=(1, 2))),
])

# Sparse preprocessing pipeline, for models such as Ridge that handle sparse input well
sparse_preprocessing_pipeline = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_selector),
    ("cat", categorical_pipeline, categorical_selector),
    ("txt", text_pipeline, text_selector),
])


# Modified TruncatedSVD that doesn't fail if n_components > ncols
class MyTruncatedSVD(TruncatedSVD):
    def fit_transform(self, X, y=None):
        if X.shape[1] <= self.n_components:
            self.n_components = X.shape[1] - 1
        return TruncatedSVD.fit_transform(self, X=X, y=y)


# Dense preprocessing pipeline, for models such as XGboost that do not do well with
# extremely wide, sparse data
# This preprocessing will work with linear models such as Ridge too
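
# A plausible sketch of the dense preprocessing pipeline described above (the
# original code is truncated here; the step names and n_components value are
# assumptions):
dense_preprocessing_pipeline = Pipeline(steps=[
    ("preprocess", sparse_preprocessing_pipeline),
    # TruncatedSVD accepts sparse input and returns a dense, low-dimensional array
    ("svd", MyTruncatedSVD(n_components=100)),
])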
Example No. 36
])

categorical_features = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
                   'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
                   'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
                   'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                   'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
                   'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
                   'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
categorical_transforms = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('scaler', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transforms, numeric_features),
    ('cat', categorical_transforms, categorical_features)
])

# append classifier to preprocessor

classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0))
])

# remove unuseful columns
drop_labels = ['Id']
X = train_dataset.drop(labels=drop_labels, axis=1)
y = train_dataset['SalePrice']

# # fit local
Example No. 37
# Label Encoder

labelEncoder_previsores = LabelEncoder()
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# One Hot Encoder
oneHotEncoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'),
      [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = oneHotEncoder.fit_transform(previsores).toarray()

# Y
labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

# Data scaling
##### Partial scaling #####
# scalerCols = previsores[:, 102:]
# scaler = StandardScaler()
# previsores[:, 102:] = scaler.fit_transform(scalerCols)
##### Full scaling #####
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
# %%
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

categorical_processor = OneHotEncoder(handle_unknown="ignore")
numerical_processor = StandardScaler()

# %% [markdown]
# Subsequently, create a `ColumnTransformer` to redirect the specific columns
# to a preprocessing pipeline.

# %%
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('cat-preprocessor', categorical_processor, categorical_columns),
    ('num-preprocessor', numerical_processor, numerical_columns)
])

# %% [markdown]
# Finally, concatenate the preprocessing pipeline with a logistic regression.

# %%
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocessor, LogisticRegression())

# %% [markdown]
# Use a `RandomizedSearchCV` to find the best set of hyperparameters by tuning
# the following parameters for the `LogisticRegression` model:
# - `C` with values ranging from 0.001 to 10. You can use a log-uniform
#   distribution for this parameter (a minimal sketch follows below).
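
# %% [markdown]
# A minimal sketch of such a search, assuming the training data from earlier in the
# notebook is available as `data_train`/`target_train` (these names are assumptions,
# since that cell is not shown here):

# %%
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {"logisticregression__C": loguniform(0.001, 10)}
model_random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_iter=10, cv=5, n_jobs=2)
# model_random_search.fit(data_train, target_train)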
Example No. 39
#Implementing the transformation to numbers and assigning the result
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

#Create dummy variables to make better use of the data

from sklearn.compose import ColumnTransformer

column_transform = ColumnTransformer(
    [("encoder", OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')

#Transform the categorical target variable into numbers
labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

previsores = scaler.fit_transform(previsores)

#Generate a training sample - predictors (previsores) and class (classe)
#Generate a test sample - predictors and class
from sklearn.model_selection import train_test_split
Example No. 40
# Split into input (X) and output (y)
X = df[X_cols]
y = df[y_col]

# Declare the Imputers that will fill in the NaN values
imputer_media = SimpleImputer(strategy='mean')
imputer_moda = SimpleImputer(strategy='most_frequent')

# Declare the Scalers that will standardize the data
scaler_media = StandardScaler()
scaler_moda = StandardScaler()

# Create a ColumnTransformer for the SimpleImputer
imputer = ColumnTransformer([
    ('imputer_media', imputer_media, slice(0, 8)),
    ('imputer_moda', imputer_moda, slice(8, len(X.columns))),
])

# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, slice(0, 8)),
                            ('scaler_moda', scaler_moda,
                             slice(8, len(X.columns)))])

# Create the Pipeline combining the ColumnTransformers and the classifier
pipeline = Pipeline([('imputer', imputer), ('scaler', scaler),
                     ('svm',
                      SVC(random_state=RANDOM_STATE,
                          class_weight=CLASS_WEIGHT,
                          probability=True))])

# Inner CV (stratified 2-fold, 5-times GridSearchCV to obtain the best parameters; a sketch follows below)
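
# A plausible sketch of that inner search (the original code is truncated here;
# the parameter grid values are assumptions):
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

param_grid = {'svm__C': [0.1, 1, 10], 'svm__gamma': ['scale', 0.01, 0.001]}
inner_cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=5,
                                   random_state=RANDOM_STATE)
grid_search = GridSearchCV(pipeline, param_grid, cv=inner_cv, scoring='roc_auc')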
Example No. 41
def main():

    # Lazy import libraries
    from rlearnlib.utils import (
        predefined_estimators,
        load_training_data,
        save_training_data,
        option_to_list,
        scoring_metrics,
        check_class_weights,
    )
    from rlearnlib.raster import RasterStack

    try:
        import sklearn

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")

    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd

    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator --------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(model_name, random_state, n_jobs,
                                            hyperparams)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options -------------------------------------------------
    if (mode == "classification" and balance is True
            and model_name not in check_class_weights()):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning(
            "Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation global accuracy requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(classif_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                classif_file))

    # permutation-based feature importances require scikit-learn >= 0.22
    if importances:
        if sklearn.__version__ < "0.22":
            gs.fatal("Feature importances calculation requires scikit-learn "
                     "version >= 0.22")

    if fimp_file:
        if importances is False:
            gs.fatal(
                'Output of feature importance requires the "f" flag to be set')

        if not os.path.exists(os.path.dirname(fimp_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                fimp_file))

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation predictions requires "
                     "cross-validation cv > 1")

        if not os.path.exists(os.path.dirname(preds_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                preds_file))

    # define RasterStack ------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ---------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}

    else:
        gs.message("Extracting training data")

        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:

                if mode == "classification":
                    src_cats = {v: k for (k, v, m) in src.cats}
                    class_labels = {k: k for k in np.unique(y)}
                    class_labels.update(src_cats)
                else:
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            if y.dtype in (np.object_, np.object):
                from sklearn.preprocessing import LabelEncoder

                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal("No training pixels or pixels in imagery group ...check "
                     "computational region")

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(X,
                                          y,
                                          cat,
                                          group_id,
                                          random_state=random_state)

        if save_training != "":
            save_training_data(save_training, X, y, cat, class_labels,
                               group_id, stack.names)

    # cross validation settings -----------------------------------------------
    # inner resampling method (cv=2)
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold

    if any(param_grid) is True:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=3)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=3)
        else:
            inner = GroupKFold(n_splits=3)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights ------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        class_weights = compute_class_weight(class_weight="balanced",
                                             classes=(y),
                                             y=y)
        fit_params = {"sample_weight": class_weights}

    else:
        class_weights = None
        fit_params = {}

    # preprocessing -----------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(remainder="passthrough",
                                  transformers=[("onehot", enc,
                                                 stack.categorical)])

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                (
                    "scaling",
                    scaler,
                    np.setxor1d(range(stack.count),
                                stack.categorical).astype("int"),
                ),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans),
                              ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid) is True:
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # estimator training ------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))
    if balance is True and group_id is not None:
        estimator.fit(X, y, groups=group_id, **fit_params)
    elif balance is True and group_id is None:
        estimator.fit(X, y, **fit_params)
    else:
        estimator.fit(X, y)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " +
             str(v)) for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation --------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        if (mode == "classification"
                and cv > np.histogram(y, bins=np.unique(y))[0].min()):
            gs.message(os.linesep)
            gs.fatal("Number of cv folds is greater than number of samples in "
                     "some classes ")

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (mode == "classification" and len(np.unique(y)) == 2
                and all([0, 1] == np.unique(y))):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        preds = cross_val_predict(estimator,
                                  X,
                                  y,
                                  groups=group_id,
                                  cv=outer,
                                  n_jobs=n_jobs,
                                  fit_params=fit_params)
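        # group_id is forwarded as the keyword-only `groups` argument; when it
        # is not None, the GroupKFold splitter built above keeps all samples
        # sharing a group id inside a single fold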

        test_idx = [test for train, test in outer.split(X, y)]
        n_fold = np.zeros((0, ))

        for fold in range(outer.get_n_splits()):
            n_fold = np.hstack((n_fold, np.repeat(fold,
                                                  test_idx[fold].shape[0])))

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}

        preds = pd.DataFrame(data=preds,
                             columns=["y_pred", "y_true", "cat", "fold"])
        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            score_mean = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).mean())

            score_std = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).std())

            gs.message(name + "\t" + str(score_mean.round(3)) + "\t" +
                       str(score_std.round(3)))

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            text_file = open(preds_file + "t", "w")
            text_file.write('"Real", "Real", "integer", "integer"')
            text_file.close()
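            # the sidecar file created by appending "t" (e.g. predictions.csvt)
            # is the OGR/GRASS column-type descriptor for the four csv columns
            # written above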

    # feature importances -----------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        feature_names = deepcopy(stack.names)
        feature_names = [i.split("@")[0] for i in feature_names]

        fimp = pd.DataFrame({
            "feature": feature_names,
            "importance": fimp["importances_mean"],
            "std": fimp["importances_std"],
        })

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(row["feature"] + "\t" + str(row["importance"]) + "\t" +
                       str(row["std"]))

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
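    # A usage sketch (not part of this module's flow): the persisted tuple can
    # later be restored with joblib.load, e.g.
    #     estimator, y, class_labels = joblib.load(model_save)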
Example #42
    0.5712416961268142
]

# In[14]:

test_data = pd.DataFrame([test_country], columns=countries.columns)

# In[15]:

data_features = countries.select_dtypes('number').columns

data_pipeline = Pipeline(steps=[('imputer', SimpleImputer(
    strategy='median')), ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', data_pipeline,
                                                data_features)],
                                 remainder='drop')

preprocessor.fit(countries)

# In[16]:


def q4():
    arable = preprocessor.transform(test_data)[0][data_features.get_loc(
        'Arable')]
    return float(round(arable, 3))


# ## Question 5
#
Example #43
band_gap = train_set.drop('Eg(G0W0;eV)', axis=1, inplace=False)
band_gap_label = train_set['Eg(G0W0;eV)'].copy()

# %%
band_gap_num = band_gap.drop('Compound', axis=1)

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])
band_gap_tr = pipe.fit_transform(band_gap_num)
# %%
num_attribs = list(band_gap_num)
cat_attribs = ['Compound']

full_pipe = ColumnTransformer([('num', pipe, num_attribs),
                               ('cat', OrdinalEncoder(), cat_attribs)])
band_gap_prepared = full_pipe.fit_transform(band_gap)
# %%
# OrdinalEncoder().categories
# band_gap_prepared_df = pd.DataFrame(band_gap_prepared)
# band_gap_prepared_df.head(10)
# %%
lin_reg = LinearRegression()
lin_reg.fit(band_gap_prepared, band_gap_label)
# %%
band_gap_prediction = lin_reg.predict(band_gap_prepared)
zip_sample = zip(band_gap_prediction, band_gap_label)
for i, j in zip_sample:
    print(i, j)

bg_mse = mean_squared_error(band_gap_prediction, band_gap_label)
df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
df["Exposure"] = df["Exposure"].clip(upper=1)
df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)

log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log),
    StandardScaler()
)

column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
            ["VehAge", "DrivAge"]),
        ("onehot_categorical", OneHotEncoder(),
            ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("passthrough_numeric", "passthrough",
            ["BonusMalus"]),
        ("log_scaled_numeric", log_scale_transformer,
            ["Density"]),
    ],
    remainder="drop",
)
X = column_trans.fit_transform(df)

# Insurances companies are interested in modeling the Pure Premium, that is
# the expected total claim amount per unit of exposure for each policyholder
# in their portfolio:
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

# This can be indirectly approximated by a 2-step modeling: the product of the
# Frequency times the average claim amount per claim:
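# A minimal sketch of that two-step split (the "Frequency" and "AvgClaimAmount"
# column names are chosen here for illustration):
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)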
Example #45
    num_attr = features_1.dtypes == 'float'
    cat_attr = ~num_attr

    for i in range(len(feature_nam)):
        if cat_attr[i]:
            print(feature_nam[i])
            val = feature_nam[i]
            features_1[val].fillna(features_1[val].value_counts().index[0],
                                   inplace=True)

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

preprocess = ColumnTransformer([
    ("num", num_pipeline, num_attr),
    ("cat", OneHotEncoder(), cat_attr),
])

features_prepared = preprocess.fit_transform(features_1)
features_prepared_2 = preprocess.fit_transform(features_2)

# Set up train and test arrays
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_prepared, target_1, random_state=0)


#=====================================================================================================#
# Prediction models
print(); print('=============== Prediction Models ===============')
nam_model = []
type_model = []
dataset = pd.read_csv('Bank_Predictions.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
#convert gender and country to number data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

label_encoder_x_1 = LabelEncoder()
X[:, 2] = label_encoder_x_1.fit_transform(X[:,2])
transformer = ColumnTransformer(
    transformers=[
        ("OneHot",        # Just a name
         OneHotEncoder(), # The transformer class
         [1]              # The column(s) to be applied on.
         )
    ],
    remainder='passthrough'  # do not apply anything to the remaining columns
)
X = transformer.fit_transform(X.tolist())

X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
Sscale = StandardScaler()
Example #47
filename = cwd + '/default of credit card clients.xls'
nanDict = {}
df = pd.read_excel(filename,
                   header=1,
                   skiprows=0,
                   index_col=0,
                   na_values=nanDict)

df.rename(index=str,
          columns={"default payment next month": "targets"},
          inplace=True)

# Assume that pay_0 is actually meant to be pay_1

print(df.columns)

# There is no pay_1 column -->

# Features and targets
X = df.loc[:, df.columns != 'targets'].values
y = df.loc[:, df.columns == 'targets'].values

# Categorical variables to one-hot's
onehotencoder = OneHotEncoder(categories="auto")

X = ColumnTransformer([
    ("", onehotencoder, [3]),
], remainder="passthrough").fit_transform(X)

print(X)
#def dense_identity(X):
#    return X.todense()

text_features = ['text_feat']
text_transformer = Pipeline(steps=[('vec', CountVectorizer())])  #,
#('to_dense', FunctionTransformer(func=dense_identity, validate=True, accept_sparse=True))])

numeric_features = ['numeric_feat']  #['mkt_ret']
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0.)
            ), ('scaler', StandardScaler())])

# combine features preprocessing
preprocessor = ColumnTransformer(
    transformers=[('text', text_transformer,
                   'text_feat'), ('num', numeric_transformer,
                                  numeric_features)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

X1 = pipeline.fit_transform(X)
print('Expected (2, 13), got', X1.shape)

X2 = text_transformer.fit_transform(X['text_feat'])
print('Single pipeline works as expected:', X2.shape)

#%%
data = pd.DataFrame(
    data={
        'text_feat': ['This is my first sentence.', 'This is my second.'],
        'numeric_feat': [1, 2],
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of \'drop\', \'passthrough\', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer((Trans(), [0]))
    assert ct.remainder == 'drop'
Example #50

#create dummy model and evaluate
model = DummyClassifier(strategy='constant', constant=1)
scores = evaluate_model(X, y, model)
print(X.shape, y.shape, Counter(y))
print('Dummy Classifier  :  ')
print('Mean F2 :%.3f (%.3f)' % (mean(scores), std(scores)))

models, names = get_models()
results = list()
#evaluate each model and print results

for i in range(len(models)):
    # one hot encode categorical, normalize numerical
    ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix),
                            ('n', MinMaxScaler(), num_ix)])
    # wrap the model in a pipeline
    pipeline = Pipeline(steps=[('t', ct), ('m', models[i])])
    # evaluate the model and store results
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)
    # summarize and store
    print('>%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))

#boxplot of results
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

results = list()
models, names = get_under_sample_models()
for i in range(len(models)):
Example #51
# convert texts to numbers
housing_cat = housing[['ocean_proximity']]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
ordinal_encoder.categories_

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    # ('attribs_adder', Combined)
    ('std_scaler', StandardScaler())
])

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)


Example #52
train_set, test_set = train_test_split(df, test_size=0.2)
test_set.to_csv("test_set.csv")
#%%
# Vectorization for categorical data
# Categorical fields to be encoded using the one-hot method
loans_cat_1hot = ["flag_fthb", "ppmt_pnlty", "st"]

# Normalization for numerical data
loans_num_norm = [
    "orig_loan_term", "loan_age", "fico", "mi_pct", "cltv", "dti", "ltv",
    "int_rt", "current_int_rt", "median_income", "unemployment_rate",
    "house_index"
]

# Resample the data
pipeline = ColumnTransformer([("num", Normalizer(), loans_num_norm),
                              ("cat", OneHotEncoder(), loans_cat_1hot)])

X_train = pipeline.fit_transform(train_set)

smte = SMOTE(random_state=42, k_neighbors=3)
res_train, res_target = smte.fit_sample(X_train, train_set["time_to_d"])

res_target = res_target.reshape(-1, 1)
res_target = OneHotEncoder(
    categories="auto").fit_transform(res_target).toarray()

# %%
# save resampled data
np.save("res_train.npy", res_train.toarray())
np.save("res_target.npy", res_target)
Example #53
train = pd.merge(train, u_user, how='left', left_on='user', right_on='u_id')
train.drop('u_id', axis=1, inplace=True)
train = pd.merge(train, u_item, how='left', left_on='item', right_on='m_id')
train.drop('m_id', axis=1, inplace=True)
train.drop(['user', 'item'], axis=1, inplace=True)
# %%
test = pd.merge(test, u_user, how='left', left_on='user', right_on='u_id')
test.drop('u_id', axis=1, inplace=True)
test = pd.merge(test, u_item, how='left', left_on='item', right_on='m_id')
test.drop('m_id', axis=1, inplace=True)
test.drop(['user', 'item'], axis=1, inplace=True)

# %%
ct = ColumnTransformer([
                        # ('u_i_onehot',OneHotEncoder(categories=[range(1, n_user + 1), range(1, n_item + 1)], sparse=False,dtype=np.int), ['user', 'item']),
                        ('gender_onehot', OneHotEncoder(dtype=np.int, sparse=False),['gender', 'occupation', 'zip_code'])
                        ],
                       remainder='passthrough')
ct.fit(train)
X_train = ct.transform(train)
X_test = ct.transform(test)

# %%
# feature dimension and the dimension of V
n_feature = X_train.shape[1]
k = 10
# %%
# define the weights
w0 = tf.Variable(initial_value=tf.truncated_normal(shape=[1]), name='w0')
w = tf.Variable(initial_value=tf.truncated_normal(shape=[n_feature]), name='w')
V = tf.Variable(initial_value=tf.truncated_normal(shape=[k, n_feature]), name='V')
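# A hedged sketch (assuming the standard second-order factorization-machine
# form) of how w0, w and V would typically be combined into a prediction for a
# placeholder X_batch of shape [None, n_feature]:
#     linear = w0 + tf.reduce_sum(tf.multiply(w, X_batch), axis=1)
#     pairwise = 0.5 * tf.reduce_sum(
#         tf.square(tf.matmul(X_batch, V, transpose_b=True))
#         - tf.matmul(tf.square(X_batch), tf.square(V), transpose_b=True),
#         axis=1)
#     y_hat = linear + pairwise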
Example #54
def binning(df, **kwargs):
    return df.apply(pd.cut, axis=0, **kwargs)


winddir_discretizer = Pipeline([
    ('binning',
     FunctionTransformer(binning,
                         kw_args={
                             'bins': [0, 45, 90, 135, 180, 225, 270, 315, 360],
                             'retbins': False
                         })),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('minmax', MinMaxScaler(), ['sum_wind_last_3_hours']),
    ('winddir_discretizer', winddir_discretizer, ['wind_direction'])
],
                                 remainder=StandardScaler())
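# Note: ColumnTransformer's remainder also accepts an estimator, so every column
# not listed above is standardized with StandardScaler instead of being dropped
# or passed through unchanged.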

pipe = Pipeline([('preprocess', preprocessor),
                 ('forest',
                  RandomForestRegressor(max_depth=70,
                                        n_estimators=200,
                                        n_jobs=-1))])

param_grid = {
    'forest__max_depth': [20, 100],
    'forest__n_estimators': [200, 1000]
}

model = GridSearchCV(pipe, param_grid, n_jobs=-1)
Example #55
]

# get numerical columns
numerical_cols = [
    cname for cname in x_train.columns
    if x_train[cname].dtype in ['int64', 'float64']
]

# preprocessing for numerical data
numerical_transformer = SimpleImputer()

#preprocessing for categorical data
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[(
    'num', numerical_transformer,
    numerical_cols), ('cat', categorical_transformer, categorical_cols)])

model = RandomForestRegressor()
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

clf.fit(x_train, y_train)

preds = clf.predict(test)

#### output model results
output = pd.DataFrame({'Id': test.index, 'SalePrice': preds})

output.to_csv('submission.csv', index=False)
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based

        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),

        # int keys: positional

        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),

        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_df['first'],
                     transformer_weights['trans2'] * X_df['second']]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert both.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through

    class TransAssert(BaseEstimator):

        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert_true(isinstance(X, (pd.DataFrame, pd.Series)))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
Example #57
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
X[:, 2] = LabelEncoder_X.fit_transform(X[:, 2])
X[:, 4] = LabelEncoder_X.fit_transform(X[:, 4])
X[:, 5] = LabelEncoder_X.fit_transform(X[:, 5])
X[:, 7] = LabelEncoder_X.fit_transform(X[:, 7])
X[:, 8] = LabelEncoder_X.fit_transform(X[:, 8])
X[:, 10] = LabelEncoder_X.fit_transform(X[:, 10])
X[:, 12] = LabelEncoder_X.fit_transform(X[:, 12])

# Processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder', OneHotEncoder(), [5])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
X = X[:, 1:]

# Processing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder', OneHotEncoder(), [8])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
X = X[:, 1:]

# Spliting the data value in train and test
from sklearn.model_selection import train_test_split
Example #58
# In[8]:

data.drop('bool_of_active', axis=1, inplace=True)
data

# In[9]:

data.drop('step_count', axis=1, inplace=True)
data

# In[10]:

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [("mood", OneHotEncoder(), [0])], remainder="passthrough"
)  # The last arg ([0]) is the list of columns you want to transform in this step
x = ct.fit_transform(data)
x

# In[11]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=0)

# In[12]:

from sklearn.naive_bayes import GaussianNB
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import pandas as pd

# Importing the dataset 
dataset = pd.read_csv('50_Startups.csv') 
X = dataset.iloc[ : , :-1].values
y = dataset.iloc[ : , -1].values

# Encoding Categorical Data 
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))

# avoiding the dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)
#missing data management
#from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

#imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis =0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

#Encoding categorical values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])

columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])],
                                      remainder='passthrough')
X = np.array(columnTransformer.fit_transform(X), dtype=str)

LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)
# columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])],     
#                                       remainder='passthrough')
# y=np.array(columnTransformer.fit_transform(y),dtype=np.str)

#Splitting datasets into test and training sets nb train sz + tst sz = 1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()