def test_column_transformer_callable_specifier():
    # assert that function gets the full array / dataframe
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([[0, 1, 2]]).T

    def func(X):
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([('trans', Trans(), func)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    def func(X):
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ['first']

    ct = ColumnTransformer([('trans', Trans(), func)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)
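# The tests in this section rely on a few toy transformers (Trans,
# DoubleTrans, SparseMatrixTrans) whose definitions are not included here;
# TransNo2D is defined inline in one test further down. A minimal sketch
# consistent with how they are used below (the exact original bodies are
# assumptions):
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator


class Trans(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # 1D Series -> 2D DataFrame
        if hasattr(X, 'to_frame'):
            return X.to_frame()
        # 1D array -> 2D array
        if getattr(X, 'ndim', 2) == 1:
            return np.atleast_2d(X).T
        return X


class DoubleTrans(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return 2 * X


class SparseMatrixTrans(BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # ignores the input columns and returns an identity CSR matrix
        n_samples = len(X)
        return sparse.eye(n_samples, n_samples).tocsr()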
def test_column_transformer_special_strings():

    # one 'drop' -> ignore
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'drop', [1])])
    exp = np.array([[0.], [1.], [2.]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer(
        [('trans1', 'drop', [0]), ('trans2', 'drop', [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))

    # 'passthrough'
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'passthrough', [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # None itself / other string is not valid
    # (use the loop variable, otherwise only None was ever tested)
    for val in [None, 'other']:
        ct = ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', val, [1])])
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit_transform, X_array)
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit, X_array)
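# Beyond the tests: the same 'drop'/'passthrough' special strings let
# application code keep or discard raw columns without writing a
# transformer. A self-contained sketch (column names and data invented):
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({'age': [25, 32, 47],
                   'user_id': [101, 102, 103],
                   'income': [30000., 45000., 52000.]})
ct = ColumnTransformer([
    ('keep_age', 'passthrough', ['age']),     # copied through unchanged
    ('drop_id', 'drop', ['user_id']),         # explicitly removed
    ('scale', StandardScaler(), ['income']),  # standardized
])
print(ct.fit_transform(df))  # -> age column followed by scaled income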
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array),
                       X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'
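# The error message above mentions that remainder also accepts an estimator,
# which the DoubleTrans tests further down exercise. A self-contained sketch
# (data invented):
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.array([[0., 1., 2.], [2., 4., 6.], [8., 6., 4.]]).T
ct = ColumnTransformer([('scale', StandardScaler(), [0])],
                       remainder=MinMaxScaler())  # applied to columns 1 and 2
print(ct.fit_transform(X))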
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)])
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert_true(sparse.issparse(X_trans))
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
def test_column_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first1D = np.array([0, 1, 2])
    X_res_second1D = np.array([2, 4, 6])
    X_res_first = X_res_first1D.reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # single column 1D / 2D
        (0, X_res_first),
        ([0], X_res_first),
        # list-like
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda x: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), [0]),
                              ('trans2', Trans(), [1])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_res_first1D,
                     transformer_weights['trans2'] * X_res_second1D]).T
    assert_array_equal(both.fit_transform(X_array), res)
    assert_array_equal(both.fit(X_array).transform(X_array), res)
    assert len(both.transformers_) == 2

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_array).transform(X_array),
                       0.1 * X_res_both)
    assert len(both.transformers_) == 1
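# The callable specifier tested above with lambdas is most useful when the
# columns are only known at fit time. A self-contained sketch (column names
# invented):
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


def numeric_columns(X):
    # called with the full DataFrame at fit time
    return X.select_dtypes('number').columns.tolist()


df = pd.DataFrame({'a': [1., 2.], 'b': ['x', 'y'], 'c': [3., 4.]})
ct = ColumnTransformer([('scale', StandardScaler(), numeric_columns)],
                       remainder='drop')
print(ct.fit_transform(df))  # only 'a' and 'c' are scaled and kept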
def test_2D_transformer_output():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))
def test_2D_transformer_output_pandas():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['col1', 'col2'])

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', TransNo2D(), 'col1')])
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.fit_transform, X_df)
    ct.fit(X_df)
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.transform, X_df)
def test_column_transformer_named_estimators():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(with_std=False), [1])])
    assert_false(hasattr(ct, 'transformers_'))
    ct.fit(X_array)
    assert_true(hasattr(ct, 'transformers_'))
    assert_true(isinstance(ct.named_transformers_['trans1'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans1, StandardScaler))
    assert_true(isinstance(ct.named_transformers_['trans2'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans2, StandardScaler))
    assert_false(ct.named_transformers_.trans2.with_std)
    # check these are the fitted transformers
    assert_equal(ct.named_transformers_.trans1.mean_, 1.)
def test_column_transformer_remainder_numpy(key):
    # test different ways that columns are specified with passthrough
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
def test_2D_transformer_output():

    class TransNo2D(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0]))
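# sparse_threshold, exercised in the two fits above, controls densification:
# the stacked output is kept sparse only if its overall density is below the
# threshold. A self-contained sketch (data invented):
import pandas as pd
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'cat': ['a', 'b', 'a'], 'num': [1., 2., 3.]})
ct_keep_sparse = ColumnTransformer([('oh', OneHotEncoder(), ['cat'])],
                                   remainder='passthrough',
                                   sparse_threshold=0.9)
ct_force_dense = ColumnTransformer([('oh', OneHotEncoder(), ['cat'])],
                                   remainder='passthrough',
                                   sparse_threshold=0.0)
print(sparse.issparse(ct_keep_sparse.fit_transform(df)))  # True
print(sparse.issparse(ct_force_dense.fit_transform(df)))  # False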
def test_column_transformer_no_remaining_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])],
                           remainder=DoubleTrans())
    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)

    # specify to drop remaining columns
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array),
                       X_res_both[:, ::-1])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop' or 'passthrough'",
        ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop' or 'passthrough'",
        ct.fit_transform, X_array)
def test_column_transformer_empty_columns(pandas, column):
    # test case that ensures the ColumnTransformer also works when a given
    # transformer doesn't have any columns to work on
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    if pandas:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X_array, columns=['first', 'second'])
    else:
        X = X_array

    ct = ColumnTransformer([('trans1', Trans(), [0, 1]),
                            ('trans2', Trans(), column)])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[1][1], Trans)

    ct = ColumnTransformer([('trans1', Trans(), column),
                            ('trans2', Trans(), [0, 1])])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[0][1], Trans)

    ct = ColumnTransformer([('trans', Trans(), column)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)

    fixture = np.array([[], [], []])
    ct = ColumnTransformer([('trans', Trans(), column)], remainder='drop')
    assert_array_equal(ct.fit_transform(X), fixture)
    assert_array_equal(ct.fit(X).transform(X), fixture)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)
def test_column_transformer_get_feature_names():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans', Trans(), [0, 1])])
    # raise correct error when not fitted
    assert_raises(NotFittedError, ct.get_feature_names)
    # raise correct error when no feature names are available
    ct.fit(X_array)
    assert_raise_message(AttributeError,
                         "Transformer trans (type Trans) does not provide "
                         "get_feature_names", ct.get_feature_names)

    # working example
    X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
                  [{'c': 5}, {'c': 6}]], dtype=object).T
    ct = ColumnTransformer(
        [('col' + str(i), DictVectorizer(), i) for i in range(2)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b', 'col1__c'])

    # passthrough transformers not supported
    ct = ColumnTransformer([('trans', 'passthrough', [0, 1])])
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
                           remainder='passthrough')
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    # drop transformer
    ct = ColumnTransformer(
        [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b'])
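# As the test above shows, ColumnTransformer.get_feature_names prefixes each
# output column with the transformer's name ('col0__a', ...). A
# self-contained sketch of the same behaviour with OneHotEncoder (this uses
# the older get_feature_names API targeted by these tests; recent
# scikit-learn renamed it get_feature_names_out):
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'city': ['paris', 'tokyo'], 'size': ['s', 'm']})
ct = ColumnTransformer([('oh', OneHotEncoder(), ['city', 'size'])])
ct.fit(df)
print(ct.get_feature_names())
# e.g. ['oh__x0_paris', 'oh__x0_tokyo', 'oh__x1_m', 'oh__x1_s']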
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, six.string_types) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    with pytest.warns(DataConversionWarning):
        # TODO: this warning is not very useful in this case, would be good
        # to get rid of it
        assert_array_equal(ct.fit_transform(X_list), expected_result)
        assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           ('scaler', StandardScaler())])

categorical_features = [
    'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
    'weathersit'
]
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

preprocessor.fit(hour_x_train)
x_train = preprocessor.transform(hour_x_train).todense()
x_val = preprocessor.transform(hour_x_val).todense()

# Save encoder
with open('encoder.p', 'wb') as f:
    pickle.dump(preprocessor, f)
print('Predictors prepared')

# Prepare targets
y_train = hour_y_train.values.astype(float)
y_val = hour_y_val.values.astype(float)
print('all data prepared')

# Test different weight decays
weight_decay_list = 10**np.linspace(-5, 3, 10)
val_loss_list = []
result_path_list = []
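# The encoder.p saved above can be reloaded at serving time so that incoming
# rows go through exactly the fitted preprocessing. A short sketch; new_data
# is a stand-in for a DataFrame with the same columns as hour_x_train:
import pickle

with open('encoder.p', 'rb') as f:
    fitted_preprocessor = pickle.load(f)
x_new = fitted_preprocessor.transform(new_data).todense()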
train_df = pd.read_csv('training.csv')
X = train_df.drop('Instance', axis=1)
X = X.drop('Income in EUR', axis=1)
y = train_df['Income in EUR']

X_pred = pd.read_csv('test.csv')
X_pred = X_pred.drop('Income', axis=1)
X_pred = X_pred.drop('Instance', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True,
                                                    test_size=0.1)

ct = ColumnTransformer(
    transformers=[('num_imp', SimpleImputer(strategy='median'),
                   [0, 2, 4, 9]),
                  ('cat_imp', SimpleImputer(strategy='most_frequent'),
                   [1, 3, 5, 6, 7, 8])],
    remainder='passthrough')
ct.fit(X_train, y_train)
X_train = ct.transform(X_train)
X_test = ct.transform(X_test)

jobs = X_train[:, 6]
senior_job_terms = ['senior', 'manager', 'doctor', 'lawyer', 'analyst',
                    'programmer', 'specialist', 'supervisor', 'chief']
senior_job = []
for j in jobs:
    found = False
    for s in senior_job_terms:
        if s in j:
            senior_job.append('yes')
            found = True
            break
    if not found:
        senior_job.append('no')
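# The nested loop above can be replaced by a single vectorized pandas call.
# A sketch under the same assumptions (jobs holds strings; the regex joins
# the senior_job_terms list from above):
import pandas as pd

pattern = '|'.join(senior_job_terms)
senior_job = (pd.Series(jobs)
              .str.contains(pattern, na=False)
              .map({True: 'yes', False: 'no'})
              .tolist())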
Xtest_new1 = d_tr.drop(['ID'], axis=1)

featuresObject = ['season', 'year', 'month', 'hours', 'is_business_day',
                  'is_holiday']
for var in featuresObject:
    Xtest_new1[var] = Xtest_new1[var].astype('category')
Xtest_new1.info()

Xtest_new = Xtest_new1.copy()
col_name_test = [f for f in Xtest_new.columns if Xtest_new[f].dtype == float]
type(col_name_test)
numeric_features_test = Xtest_new[col_name_test]
type(numeric_features_test)
Xtest_new[col_name_test].dtypes

# Numeric variables: fit() only returns the fitted transformer, so
# fit_transform() is needed here to get the transformed array
X_te = ct_num.fit_transform(numeric_features_test)
Xtest_new[col_name_test] = pd.DataFrame(X_te,
                                        columns=numeric_features_test.columns,
                                        index=list(X_test.index.values))
Xtest_new.info()
type(X_te)
print(type(X_te))
print(X_te[0])

numeric_features_test = scaler.transform(numeric_features_test.values)

# Handle the categorical variables (same fix: fit_transform, not fit)
d_1he_test = ct.fit_transform(Xtest_new)
Xtest_new.info()
Xtrain_new.info()
d_encoded_data = pd.DataFrame(d_1he_test, columns=ct.get_feature_names(),
                              index=list(X_test.index.values))
d_encoded_data.drop(['oh_enc__x0_2016', 'oh_enc__x1_1', 'oh_enc__x2_0',
                     'oh_enc__x3_0', 'oh_enc__x4_0', 'oh_enc__x5_fall'],
                    inplace=True, axis=1)
df_concat = pd.concat([Xtest_new.reset_index(drop=True),
                       d_encoded_data.reset_index(drop=True)], axis=1)
# The head of this categorical pipeline is cut off in the snippet; the
# imputer step below is an assumption, only the onehot step is original.
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

turbine_type_feature = ['turbine_type']
turbine_type_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value='HAWT')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('ttype', turbine_type_transformer, turbine_type_feature)
], remainder="drop")

preprocessor.fit(raw_data)

joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))


def input_fn(input_data, content_type):
    """Parse input data payload.

    We currently only take csv input. Since we need to process both
    labelled and unlabelled data we first determine whether the label
    column is present by looking at how many columns were provided.
    """
    if content_type == 'text/csv':
        # Read the raw input data as CSV.
        df = pd.read_csv(StringIO(input_data), header=None)
class TransformerClass(BaseEstimator, TransformerMixin):
    """ TransformerClass """

    def __init__(self):
        pass

    def create_pipeline_for_categorical_params(self) -> Pipeline:
        """ processing of categorical params """
        return Pipeline([("OH", OneHotEncoder())])

    def create_pipeline_for_numerical_params(self) -> Pipeline:
        """ processing of numerical params """
        return Pipeline([("impute",
                          SimpleImputer(missing_values=np.nan,
                                        strategy="mean"))])

    def create(self, params: FeatureParams) -> ColumnTransformer:
        """ create transformer pipeline """
        self.transformer = ColumnTransformer([
            (
                "pipeline_for_categorical_params",
                self.create_pipeline_for_categorical_params(),
                params.categorical,
            ),
            (
                "pipeline_for_numerical_params",
                self.create_pipeline_for_numerical_params(),
                params.numerical,
            ),
        ])
        return self.transformer

    def save(self, path_to_save: str) -> str:
        """ save transformer to disk """
        with open(path_to_save, "wb") as file:
            pickle.dump(self.transformer, file)
        return path_to_save

    def load(self, path_to_save: str) -> ColumnTransformer:
        """ load transformer from disk """
        with open(path_to_save, "rb") as file:
            self.transformer = pickle.load(file)
        return self.transformer

    def fit(self, df: pd.DataFrame,
            params: FeatureParams) -> "TransformerClass":
        """ fit the transformer to input data (returns self) """
        self.create(params)
        self.transformer.fit(df)
        return self

    def fit_transform(self, df: pd.DataFrame,
                      params: FeatureParams) -> pd.DataFrame:
        """ fit the transformer to input data and transform the data """
        self.create(params)
        return pd.DataFrame(self.transformer.fit_transform(df))

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """ transform the data with already fitted transformer """
        return pd.DataFrame(self.transformer.transform(df))
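# A usage sketch for TransformerClass; FeatureParams is not defined in this
# snippet, so the keyword constructor below is an assumption (any object
# with .categorical / .numerical column lists would do):
import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue'], 'size': [1.0, None]})
params = FeatureParams(categorical=['color'], numerical=['size'])

tc = TransformerClass()
features = tc.fit_transform(df, params)  # fit + encode in one step
tc.save('transformer.pkl')               # persist the fitted transformer

tc2 = TransformerClass()
tc2.load('transformer.pkl')              # restore it in a fresh object
features_again = tc2.transform(df)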
# Calculate important parameters.
n_patients_train = X_train.shape[0]
n_features = X_train.shape[1]

# %% [markdown]
# ### Pre-process data

# %%
# Standardization
cols_standardize = [
    'grade', 'age', 'n_positive_nodes', 'progesterone', 'estrogen'
]
X_ct = ColumnTransformer([('standardizer', StandardScaler(),
                           cols_standardize)])
X_ct.fit(X_train[cols_standardize])
X_train[cols_standardize] = X_ct.transform(X_train[cols_standardize])
X_test[cols_standardize] = X_ct.transform(X_test[cols_standardize])

Y_scaler = StandardScaler().fit(Y_train)
Y_train['T'] = Y_scaler.transform(Y_train)
Y_test['T'] = Y_scaler.transform(Y_test)

# %%
# Sorting
sort_idx = np.argsort(Y_train.to_numpy(), axis=None)[::-1]
X_train = X_train.loc[sort_idx, :]
Y_train = Y_train.loc[sort_idx, :]
E_train = E_train.loc[sort_idx, :]
class EasyPreprocessor(BaseEstimator, TransformerMixin):
    """A simple preprocessor.

    Detects variable types, encodes everything as floats for use with
    sklearn. Applies one-hot encoding, missing value imputation and scaling.

    Attributes
    ----------
    ct_ : ColumnTransformer
        Main container for all transformations.

    columns_ : pandas columns
        Columns of training data.

    dtypes_ : Series of dtypes
        Dtypes of training data columns.

    types_ : DataFrame
        Inferred input types.

    Parameters
    ----------
    scale : boolean, default=True
        Whether to scale continuous data.

    verbose : int, default=0
        Control output verbosity.
    """

    def __init__(self, scale=True, verbose=0, types=None):
        self.verbose = verbose
        self.scale = scale
        self.types = types

    def fit(self, X, y=None):
        """A reference implementation of a fitting function for a
        transformer.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples.
        y : None
            There is no need of a target in a transformer, yet the pipeline
            API requires this parameter.

        Returns
        -------
        self : object
            Returns self.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.columns_ = X.columns
        self.dtypes_ = X.dtypes
        if self.types is None:
            # FIXME some sanity check?
            types = detect_types(X, verbose=self.verbose)
        else:
            types = self.types

        types = types.copy()
        # low card int encoded as categorical and continuous for now:
        types.loc[types.low_card_int, 'continuous'] = True
        types.loc[types.low_card_int, 'categorical'] = True

        # go over variable blocks:
        # check for missing values, scale, etc.
        steps_categorical = []
        if X.loc[:, types.categorical].isna().any(axis=None):
            steps_categorical.append(
                SimpleImputer(strategy='constant',
                              fill_value='dabl_missing'))
        steps_categorical.append(
            OneHotEncoder(categories='auto', handle_unknown='ignore',
                          sparse=False))
        pipe_categorical = make_pipeline(*steps_categorical)

        steps_continuous = []
        if (X.loc[:, types.continuous].isna().any(axis=None)
                or types['dirty_float'].any()):
            # we could skip the imputer here, but if there's dirty
            # floats, they'll have NaN, and we reuse the cont pipeline
            steps_continuous.append(SimpleImputer(strategy='median'))
        if self.scale:
            steps_continuous.append(StandardScaler())
        # if X.loc[:, types['continuous']].isnull().values.any():
        # FIXME doesn't work if missing values only in dirty column
        pipe_continuous = make_pipeline(*steps_continuous)

        # FIXME only have one imputer/standard scaler in all
        # (right now copied in dirty floats and floats)
        pipe_dirty_float = make_pipeline(
            DirtyFloatCleaner(),
            make_column_transformer((pipe_continuous, select_cont),
                                    remainder="passthrough"))

        # construct column transformer
        transformer_cols = []
        if types['continuous'].any():
            transformer_cols.append(
                ('continuous', pipe_continuous, types['continuous']))
        if types['categorical'].any():
            transformer_cols.append(
                ('categorical', pipe_categorical, types['categorical']))
        if types['dirty_float'].any():
            # FIXME we're not really handling this here any more?
            # (yes we are)
            transformer_cols.append(
                ('dirty_float', pipe_dirty_float, types['dirty_float']))

        if not len(transformer_cols):
            raise ValueError("No feature columns found")
        self.ct_ = ColumnTransformer(transformer_cols, sparse_threshold=.1)

        self.ct_.fit(X)

        self.input_shape_ = X.shape
        self.types_ = types
        # Return the transformer
        return self

    def get_feature_names(self):
        # this can go soon hopefully
        feature_names = []
        for name, trans, cols in self.ct_.transformers_:
            if name == "continuous":
                # there should be no all-NaN columns in the imputer
                if (trans.steps[0][0] == "simpleimputer"
                        and np.isnan(trans.steps[0][1].statistics_).any()):
                    raise ValueError("So unexpected! Looks like the imputer"
                                     " dropped some all-NaN columns."
                                     " Try calling 'clean' on your data"
                                     " first.")
                feature_names.extend(cols.index[cols])
            elif name == 'categorical':
                # this is the categorical pipe, extract one hot encoder
                ohe = trans.steps[-1][1]
                # FIXME that is really strange?!
                ohe_cols = self.columns_[self.columns_.map(cols)]
                feature_names.extend(ohe.get_feature_names(ohe_cols))
            elif name == "remainder":
                assert trans == "drop"
            elif name == "dirty_float":
                raise ValueError(
                    "Can't compute feature names when handling dirty "
                    "floats. Call 'clean' as a workaround")
            else:
                raise ValueError(
                    "Can't compute feature names for {}".format(name))
        return feature_names

    def transform(self, X):
        """A reference implementation of a transform function.

        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_transformed : array of shape = [n_samples, n_features]
            The transformed input samples.
        """
        # Check if fit had been called
        check_is_fitted(self, ['ct_'])
        return self.ct_.transform(X)
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based
        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),
        # int keys: positional
        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_df['first'],
                     transformer_weights['trans2'] * X_df['second']]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through
    class TransAssert(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert_true(isinstance(X, (pd.DataFrame, pd.Series)))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    # use X_df2 (not X_df) so the integer column names set up above are
    # actually exercised
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
def test_check_preprocessing_1(self):
    """
    Test check preprocessing on multiple preprocessing
    """
    train = pd.DataFrame({
        'Onehot1': ['A', 'B', 'A', 'B'],
        'Onehot2': ['C', 'D', 'C', 'D'],
        'Binary1': ['E', 'F', 'E', 'F'],
        'Binary2': ['G', 'H', 'G', 'H'],
        'Ordinal1': ['I', 'J', 'I', 'J'],
        'Ordinal2': ['K', 'L', 'K', 'L'],
        'BaseN1': ['M', 'N', 'M', 'N'],
        'BaseN2': ['O', 'P', 'O', 'P'],
        'Target1': ['Q', 'R', 'Q', 'R'],
        'Target2': ['S', 'T', 'S', 'T'],
        'other': ['other', np.nan, 'other', 'other']
    })

    y = pd.DataFrame(data=[0, 1, 0, 0], columns=['y'])

    enc_onehot = ce.OneHotEncoder(cols=['Onehot1', 'Onehot2']).fit(train)
    train_onehot = enc_onehot.transform(train)
    enc_binary = ce.BinaryEncoder(
        cols=['Binary1', 'Binary2']).fit(train_onehot)
    train_binary = enc_binary.transform(train_onehot)
    enc_ordinal = ce.OrdinalEncoder(
        cols=['Ordinal1', 'Ordinal2']).fit(train_binary)
    train_ordinal = enc_ordinal.transform(train_binary)
    enc_basen = ce.BaseNEncoder(
        cols=['BaseN1', 'BaseN2']).fit(train_ordinal)
    train_basen = enc_basen.transform(train_ordinal)
    enc_target = ce.TargetEncoder(cols=['Target1', 'Target2']).fit(
        train_basen, y)

    input_dict1 = dict()
    input_dict1['col'] = 'Onehot2'
    input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                       index=['C', 'D', 'missing'])
    input_dict1['data_type'] = 'object'

    input_dict2 = dict()
    input_dict2['col'] = 'Binary2'
    input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                       index=['G', 'H', 'missing'])
    input_dict2['data_type'] = 'object'

    input_dict = dict()
    input_dict['col'] = 'state'
    input_dict['mapping'] = pd.Series(data=['US', 'FR-1', 'FR-2'],
                                      index=['US', 'FR', 'FR'])
    input_dict['data_type'] = 'object'

    input_dict3 = dict()
    input_dict3['col'] = 'Ordinal2'
    input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                       index=['K', 'L', 'missing'])
    input_dict3['data_type'] = 'object'
    list_dict = [input_dict2, input_dict3]

    y = pd.DataFrame(data=[0, 1], columns=['y'])
    train = pd.DataFrame({
        'city': ['chicago', 'paris'],
        'state': ['US', 'FR'],
        'other': ['A', 'B']
    })
    enc = ColumnTransformer(transformers=[('onehot', skp.OneHotEncoder(),
                                           ['city', 'state'])],
                            remainder='drop')
    enc.fit(train, y)

    wrong_prepro = skp.OneHotEncoder().fit(train, y)

    check_preprocessing([enc_onehot, enc_binary, enc_ordinal, enc_basen,
                         enc_target, input_dict1, list_dict])
    for preprocessing in [enc_onehot, enc_binary, enc_ordinal, enc_basen,
                          enc_target]:
        check_preprocessing(preprocessing)

    check_preprocessing(input_dict2)
    check_preprocessing(enc)
    check_preprocessing(None)

    with self.assertRaises(Exception):
        check_preprocessing(wrong_prepro)
def test_multiple_encoding_columntransfomers(self):
    """
    Test multiple preprocessing columntransformers
    """
    train = pd.DataFrame({
        'Onehot1': ['A', 'B'], 'Onehot2': ['C', 'D'],
        'Binary1': ['E', 'F'], 'Binary2': ['G', 'H'],
        'Ordinal1': ['I', 'J'], 'Ordinal2': ['K', 'L'],
        'BaseN1': ['M', 'N'], 'BaseN2': ['O', 'P'],
        'Target1': ['Q', 'R'], 'Target2': ['S', 'T'],
        'other': ['other', np.nan]
    })

    contributions = pd.DataFrame(
        [[1, 0, 1, 1, 1, 0, 1, 1, 3, 0, -3.5, 0, 4, 4, 5, 5, 0, 6, 7, 0,
          8, 9, 10],
         [.5, .5, 2, 0, .5, .5, 2, 0, 1.5, 1.5, 5.5, -2, -4, -4, -5, -5,
          8.5, -2.5, -7, 14, -8, -9, -10]],
        index=['index1', 'index2'])

    expected_contrib = pd.DataFrame(
        {
            'onehot_skp_Onehot1': [1., 1.],
            'onehot_skp_Onehot2': [2, 2],
            'onehot_ce_Onehot1': [1., 1.],
            'onehot_ce_Onehot2': [2, 2],
            'binary_ce_Binary1': [3., 3.],
            'binary_ce_Binary2': [-3.5, 3.5],
            'ordinal_ce_Ordinal1': [4, -4],
            'ordinal_ce_Ordinal2': [4, -4],
            'ordinal_skp_Ordinal1': [5, -5],
            'ordinal_skp_Ordinal2': [5, -5],
            'basen_ce_BaseN1': [6., 6.],
            'basen_ce_BaseN2': [7, 7],
            'target_ce_Target1': [8, -8],
            'target_ce_Target2': [9, -9],
            22: [10, -10]
        },
        index=['index1', 'index2'])

    y = pd.DataFrame(data=[0, 1], columns=['y'])

    enc = ColumnTransformer(
        transformers=[
            ('onehot_skp', skp.OneHotEncoder(), ['Onehot1', 'Onehot2']),
            ('onehot_ce', ce.OneHotEncoder(), ['Onehot1', 'Onehot2']),
            ('binary_ce', ce.BinaryEncoder(), ['Binary1', 'Binary2']),
            ('ordinal_ce', ce.OrdinalEncoder(), ['Ordinal1', 'Ordinal2']),
            ('ordinal_skp', skp.OrdinalEncoder(), ['Ordinal1', 'Ordinal2']),
            ('basen_ce', ce.BaseNEncoder(), ['BaseN1', 'BaseN2']),
            ('target_ce', ce.TargetEncoder(), ['Target1', 'Target2'])
        ],
        remainder='passthrough')
    enc.fit(train, y)

    input_dict1 = dict()
    input_dict1['col'] = 'Onehot2'
    input_dict1['mapping'] = pd.Series(data=['C', 'D', np.nan],
                                       index=['C', 'D', 'missing'])
    input_dict1['data_type'] = 'object'

    input_dict2 = dict()
    input_dict2['col'] = 'Binary2'
    input_dict2['mapping'] = pd.Series(data=['G', 'H', np.nan],
                                       index=['G', 'H', 'missing'])
    input_dict2['data_type'] = 'object'

    input_dict3 = dict()
    input_dict3['col'] = 'Ordinal2'
    input_dict3['mapping'] = pd.Series(data=['K', 'L', np.nan],
                                       index=['K', 'L', 'missing'])
    input_dict3['data_type'] = 'object'
    list_dict = [input_dict2, input_dict3]

    original = inverse_transform_contributions(
        contributions, [enc, input_dict1, list_dict])

    pd.testing.assert_frame_equal(expected_contrib, original)
# - length: Longest shell measurement
# - diameter: Diameter perpendicular to length
# - height: Height with meat in shell
# - whole_weight: Weight of whole abalone
# - shucked_weight: Weight of meat
# - viscera_weight: Gut weight (after bleeding)
# - shell_weight: Weight after being dried
#
# Categorical Features:
# - sex: categories encoded as strings {'M', 'F', 'I'} where 'I' is Infant

numeric_features = list(feature_columns_names)
numeric_features.remove('sex')
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           ('scaler', StandardScaler())])

categorical_features = ['sex']
categorical_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value='missing')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder="drop")

preprocessor.fit(concat_data)

joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))
# The head of categorical_pipe is cut off in this snippet; given the later
# lookup of an 'encoder' step, it is presumably something like:
categorical_pipe = Pipeline(
    [
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)
numerical_pipe = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ]
)
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_pipe, categorical),
    ('num', numerical_pipe, numerical)]
)

# Fit and transform training data
preprocessor.fit(X_train)
cat = preprocessor.named_transformers_['cat']['encoder'] \
    .get_feature_names(categorical)
columns = np.append(cat, numerical)
X_train_transformed = pd.DataFrame(preprocessor.transform(X_train),
                                   columns=columns)
X_train_transformed.head()

'''
Step 4 : Do some serious ML
'''


def create_baseline_classifiers(seed=seed):
    """Create a list of baseline classifiers.

    Parameters
class RepeatingBasisFunction(TransformerMixin, BaseEstimator):
    """
    This is a transformer for features with some form of circularity.
    E.g. for days of the week you might face the problem that, conceptually,
    day 7 is as close to day 6 as it is to day 1, while numerically their
    distance is different. This transformer remedies that problem.

    The transformer selects a column and transforms it with a given number
    of repeating (radial) basis functions. Radial basis functions are
    bell-curve shaped functions which take the original data as input. The
    basis functions are equally spaced over the input range. The key feature
    of repeating basis functions is that they are continuous when moving
    from the max to the min of the input range. As a result these repeating
    basis functions can capture how close each datapoint is to the center of
    each repeating basis function, even when the input data has a circular
    nature.

    :type column: int or list, default=0
    :param column: Indexes the data on its second axis. Integers are
        interpreted as positional columns, while strings can reference
        DataFrame columns by name.

    :type remainder: {'drop', 'passthrough'}, default="drop"
    :param remainder: By default, only the specified column is transformed,
        and the non-specified columns are dropped. By specifying
        ``remainder='passthrough'``, all remaining columns will be
        automatically passed through. This subset of columns is concatenated
        with the output of the transformer.

    :type n_periods: int, default=12
    :param n_periods: number of basis functions to create, i.e., the number
        of columns that will exit the transformer.

    :type input_range: tuple or None, default=None
    :param input_range: the values at which the data repeats itself. For
        example, for days of the week this is (1, 7). If input_range=None
        it is inferred from the training data.
    """

    def __init__(self, column=0, remainder="drop", n_periods=12,
                 input_range=None):
        self.column = column
        self.remainder = remainder
        self.n_periods = n_periods
        self.input_range = input_range

    def fit(self, X, y=None):
        self.pipeline_ = ColumnTransformer(
            [
                (
                    "repeatingbasis",
                    _RepeatingBasisFunction(
                        n_periods=self.n_periods,
                        input_range=self.input_range
                    ),
                    [self.column],
                )
            ],
            remainder=self.remainder,
        )
        self.pipeline_.fit(X, y)
        return self

    def transform(self, X):
        check_is_fitted(self, ["pipeline_"])
        return self.pipeline_.transform(X)
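# A usage sketch for RepeatingBasisFunction on day-of-week data; the class
# above wraps _RepeatingBasisFunction (not shown here), so this assumes both
# are importable from the same module:
import numpy as np
import pandas as pd

df = pd.DataFrame({'day_of_week': np.tile(np.arange(1, 8), 3)})
rbf = RepeatingBasisFunction(column='day_of_week', n_periods=4,
                             input_range=(1, 7))
X_rbf = rbf.fit(df).transform(df)
print(X_rbf.shape)  # (21, 4): one column per basis function
# day 7 and day 1 now get similar encodings, reflecting the circularity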
def build_model(dataframe=None, target_column=None,
                numerical_transformer=None, categorical_transformer=None,
                pca=False, algorithm=None, balance_data=False,
                grid_search=False, params=None, hashing=False,
                hash_size=500, project_path=None, **kwargs):

    # algorithm = algorithm.copy()
    data_path = os.path.join(project_path, 'data/')
    try:
        os.mkdir(data_path)
    except FileExistsError:
        pass

    identify_columns(dataframe, target_column, output_path=data_path,
                     **kwargs)

    model_preprocessor_pipeline = os.path.join(project_path, 'model/')
    try:
        os.mkdir(model_preprocessor_pipeline)
    except FileExistsError:
        pass

    if os.path.exists(f"{project_path}/data/metadata/store_file.yaml"):
        config = yaml.safe_load(
            open(f"{project_path}/data/metadata/store_file.yaml"))
        numerical_attribute = config['num_feat']
        categorical_attribute = config['cat_feat']
        lower_categorical_attribute = config['lower_cat']
        hash_features = config['hash_feat']
        input_columns = config['input_columns']
    else:
        raise ValueError(
            f'path: no file found in "{project_path}/data/metadata/"')

    if hashing:
        hash_transformer = Pipeline([
            ('imputer',
             SimpleImputer(strategy='constant', fill_value='Missing')),
            ('hasher',
             FeatureHasher(n_features=hash_size, input_type='string'))
        ])
        categorical_attribute = lower_categorical_attribute
        data_transformer = ColumnTransformer(transformers=[
            ('numerical', numerical_transformer, numerical_attribute),
            ('categorical', categorical_transformer, categorical_attribute),
            ('hasher', hash_transformer, hash_features)
        ])
        use_cols = (numerical_attribute + categorical_attribute
                    + hash_features)
    else:
        data_transformer = ColumnTransformer(transformers=[
            ('numerical', numerical_transformer, numerical_attribute),
            ('categorical', categorical_transformer, categorical_attribute)
        ])
        use_cols = numerical_attribute + categorical_attribute

    train_df = manage_columns(dataframe, columns=input_columns,
                              select_columns=True)
    y = dataframe[target_column]

    if balance_data:
        oversample = SMOTE()
        data_transformer.fit(train_df)
        encoder = data_transformer.transform(train_df)
        X, y = oversample.fit_resample(encoder, y)
        train_df = X
        X_train_copy = encoder
        X_train, X_test, y_train, y_test = train_test_split(
            train_df, y, stratify=y, test_size=0.20, random_state=0)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            train_df, y, stratify=y, test_size=0.20, random_state=0)
        X_train_copy = X_train.copy()
        data_transformer.fit(X_train_copy)
        X_train_copy = data_transformer.transform(X_train_copy)

    if pca:
        print_devider('Applying PCA to the data')
        if scipy.sparse.issparse(X_train_copy):
            X_train_array = X_train_copy.toarray()
        # elif isinstance(X_train_copy, np.ndarray):
        #     X_train_array = X_train_copy
        else:
            X_train_array = X_train_copy
        # X_train_copy = data_transformer.fit_transform(X_train_copy)
        pca_ = PCA().fit(X_train_array)
        pca_evr = pca_.explained_variance_ratio_
        cumsum_ = np.cumsum(pca_evr)
        dim_95 = np.argmax(cumsum_ >= 0.95) + 1
        instances_, dims_ = X_train_copy.shape
        dim_reduction = PCA(dim_95)
        print(f"\nDimension reduced from {dims_} to {dim_95} "
              f"while retaining 95% of variance.")
        if hashing:
            preprocessor = Pipeline(steps=[
                ('data_transformer', data_transformer),
                ('to_dense', DenseTransformer()),
                ('reduce_dim', dim_reduction)])
        else:
            preprocessor = Pipeline(steps=[
                ('data_transformer', data_transformer),
                ('reduce_dim', dim_reduction)])
    else:
        if hashing:
            preprocessor = Pipeline(steps=[
                ('data_transformer', data_transformer),
                ('to_dense', DenseTransformer())])
        else:
            preprocessor = Pipeline(steps=[
                ('data_transformer', data_transformer)])

    classifier = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('model', algorithm)])

    exclusive_keyword = ['model', 'fit', 'hyperparameters']
    if params:
        model_params = {}
        fit_params = {}
        hyperparameters_params = {}
        keys = [key for key in params.keys()]
        for first_key in keys:
            for key, value in params[first_key].items():
                key = f'model__{key}'
                if first_key == 'model':
                    model_params[key] = value
                elif first_key == 'fit':
                    fit_params[key] = value
                elif first_key == 'hyperparameters':
                    hyperparameters_params[key] = value
                else:
                    raise ValueError(
                        "params: only one of parameters {} should be "
                        "set".format(exclusive_keyword))

    if grid_search:
        # We can utilize the params grid to check for the best
        # hyperparameters or transformers. The syntax is
        # pipeline_step_name__parameter, chained for nested pipelines;
        # the 'model__' prefix was already applied when collecting
        # hyperparameters_params above.
        parameters_grid = dict(hyperparameters_params)

        # Doing a Grid Search
        grid_search = GridSearchCV(classifier, param_grid=parameters_grid)
        # fitting on our dataset
        grid_search.fit(X_train, y_train)

        model_path = os.path.join(model_preprocessor_pipeline,
                                  f'{algorithm.__class__.__name__}.pkl')
        store_pipeline(grid_search.best_estimator_, model_path)

        # set config to diagram for visualizing the pipelines/composite
        # estimator
        set_config(display='diagram')
        # Let's visualize the best estimator from the grid search.
        output = grid_search.best_estimator_
        # saving pipeline as html format
        with open(f'{model_preprocessor_pipeline}/'
                  'titanic_data_pipeline_estimator.html', 'w') as f:
            f.write(estimator_html_repr(grid_search.best_estimator_))
    else:
        if params:
            kwargsList = inspect.getfullargspec(algorithm.fit)[0]
            if len(fit_params) > 0:
                # X_val = data_transformer.transform(X_test)
                # if 'eval_set' in kwargsList:
                #     fit_params['model__eval_set'] = (X_test, y_test)
                try:
                    classifier.set_params(**model_params)
                    classifier.fit(X_train, y_train, **fit_params)
                except Exception:
                    if 'cat_features' in kwargsList:
                        cate_features_index = [
                            X_train.columns.get_loc(col)
                            for col in X_train.columns
                        ][len(numerical_attribute):]
                        fit_params['model__cat_features'] = \
                            cate_features_index
                        classifier.set_params(**model_params)
                        classifier.fit(X_train, y_train, **fit_params)
            else:
                try:
                    classifier.set_params(**model_params)
                    classifier.fit(X_train, y_train)
                except Exception:
                    if 'cat_features' in kwargsList:
                        cate_features_index = [
                            X_train.columns.get_loc(col)
                            for col in X_train.columns
                        ][len(numerical_attribute):]
                        fit_params['model__cat_features'] = \
                            cate_features_index
                        classifier.set_params(**model_params)
                        classifier.fit(X_train, y_train, **fit_params)
                    else:
                        classifier.set_params(**model_params)
                        classifier.fit(X_train, y_train)
        else:
            # fall back to a plain fit when no params are given (added so
            # the default params=None path still trains the pipeline)
            classifier.fit(X_train, y_train)

        model_path = os.path.join(model_preprocessor_pipeline,
                                  f'{algorithm.__class__.__name__}.pkl')
        store_pipeline(classifier, model_path)

        # set config to diagram for visualizing the pipelines/composite
        # estimators
        set_config(display='diagram')
        output = classifier
        with open(f'{model_preprocessor_pipeline}/'
                  'titanic_data_pipeline_estimator.html', 'w') as f:
            f.write(estimator_html_repr(classifier))

    # X_test = data_transformer.transform(X_test)
    y_pred = output.predict(X_test)

    print_devider('Metric Performance')
    met_perf = get_scores(y_test, y_pred)
    print(f'\nMetric performance on test data\n{met_perf}\n\n')

    print('\nconfusion matrix')
    print(confusion_matrix(y_test, y_pred))

    return output
train.drop('m_id', axis=1, inplace=True)
train.drop(['user', 'item'], axis=1, inplace=True)

# %%
test = pd.merge(test, u_user, how='left', left_on='user', right_on='u_id')
test.drop('u_id', axis=1, inplace=True)
test = pd.merge(test, u_item, how='left', left_on='item', right_on='m_id')
test.drop('m_id', axis=1, inplace=True)
test.drop(['user', 'item'], axis=1, inplace=True)

# %%
ct = ColumnTransformer([
    # ('u_i_onehot',
    #  OneHotEncoder(categories=[range(1, n_user + 1), range(1, n_item + 1)],
    #                sparse=False, dtype=np.int),
    #  ['user', 'item']),
    ('gender_onehot', OneHotEncoder(dtype=np.int, sparse=False),
     ['gender', 'occupation', 'zip_code'])
], remainder='passthrough')
ct.fit(train)
X_train = ct.transform(train)
X_test = ct.transform(test)

# %%
# Feature dimension and the dimension of V
n_feature = X_train.shape[1]
k = 10

# %%
# Define the weights
w0 = tf.Variable(initial_value=tf.truncated_normal(shape=[1]), name='w0')
w = tf.Variable(initial_value=tf.truncated_normal(shape=[n_feature]),
                name='w')
V = tf.Variable(initial_value=tf.truncated_normal(shape=[k, n_feature]),
                name='V')

# %%
X = tf.placeholder(dtype='float', shape=[None, n_feature], name="X")
y = tf.placeholder(dtype='float', shape=[None, 1], name='y')
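# %%
# The variables above are the standard factorization machine parameters
# (global bias w0, linear weights w, factor matrix V). A sketch of the FM
# output using the usual O(kn) pairwise-interaction identity
# 0.5 * sum_f ((X v_f)^2 - X^2 v_f^2), written for the TF1 graph API used
# above (the loss choice below is an assumption):
linear_term = w0 + tf.reduce_sum(tf.multiply(w, X), axis=1, keepdims=True)
pair_term = 0.5 * tf.reduce_sum(
    tf.square(tf.matmul(X, tf.transpose(V)))                 # (X V^T)^2
    - tf.matmul(tf.square(X), tf.transpose(tf.square(V))),   # X^2 (V^2)^T
    axis=1, keepdims=True)
y_hat = linear_term + pair_term              # shape [None, 1]
loss = tf.reduce_mean(tf.square(y - y_hat))  # e.g. squared error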
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['sex']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder="drop")

preprocessor.fit(concat_data)

joblib.dump(preprocessor, os.path.join(args.model_dir, "model.joblib"))

print("saved model!")


def input_fn(input_data, content_type):
    """Parse input data payload.

    We currently only take csv input. Since we need to process both
    labelled and unlabelled data we first determine whether the label
    column is present by looking at how many columns were provided.
    """
    if content_type == 'text/csv':
        # Read the raw input data as CSV.
cat_attrb_selected = [
    "Suburb", "Type", "Method", "SellerG", "Date", "CouncilArea",
    "Regionname"
]
cat_pipeline = Pipeline([("select_cat",
                          DataFrameSelector(cat_attrb_selected)),
                         ("imputer", MostFrequentImputer()),
                         ('cat_encoder',
                          OneHotEncoder(handle_unknown='ignore',
                                        sparse=False))])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attrb_selected),
    ("cat", cat_pipeline, cat_attrb_selected),
])
full_pipeline.fit(data[data.Price.notnull()])

#SVM = pickle.load(open('model/Pickle/SVM.pkl', 'rb'))
#KNN = pickle.load(open('model/Pickle/KNN.pkl', 'rb'))
#RF = pickle.load(open('model/Pickle/RF.pkl', 'rb'))
#SVM_Grid = pickle.load(open('model/Pickle/svm_grid.pkl', 'rb'))
#KNN_Grid = pickle.load(open('model/Pickle/knn_grid.pkl', 'rb'))
#RF_Random = pickle.load(open('model/Pickle/rf_random.pkl', 'rb'))
# print(SVM.predict(full_pipeline.transform(data.head(1))))

test = [
    2, 2.5, 2, 1, 1, 202, -37.7996, 144.9984, "Abbotsford", "h", "S",
    "Biggin", "3/12/2016", "Yarra City Council", "Northern Metropolitan"
]
features = num_attrb_selected + cat_attrb_selected
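# To score the single raw row above, it has to be wrapped in a one-row
# DataFrame with the expected column names before going through the fitted
# pipeline. A sketch (the model variable stands in for any of the pickled
# estimators commented out above):
import pandas as pd

test_df = pd.DataFrame([test], columns=features)
X_test_row = full_pipeline.transform(test_df)
# print(model.predict(X_test_row))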
# In[14]:

test_data = pd.DataFrame([test_country], columns=countries.columns)

# In[15]:

data_features = countries.select_dtypes('number').columns
data_pipeline = Pipeline(steps=[('imputer',
                                 SimpleImputer(strategy='median')),
                                ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', data_pipeline,
                                                data_features)],
                                 remainder='drop')
preprocessor.fit(countries)

# In[16]:


def q4():
    arable = preprocessor.transform(test_data)[0][
        data_features.get_loc('Arable')]
    return float(round(arable, 3))

# ## Question 5
#
# Find the number of outliers in the `Net_migration` variable according to
# the boxplot method, i.e. using the rule:
#
# $$x \notin [Q1 - 1.5 \times \text{IQR}, Q3 + 1.5 \times \text{IQR}]
# \Rightarrow x \text{ is an outlier}$$
def test_column_transformer_dataframe(): pd = pytest.importorskip('pandas') X_array = np.array([[0, 1, 2], [2, 4, 6]]).T X_df = pd.DataFrame(X_array, columns=['first', 'second']) X_res_first = np.array([0, 1, 2]).reshape(-1, 1) X_res_both = X_array cases = [ # String keys: label based # scalar ('first', X_res_first), # list (['first'], X_res_first), (['first', 'second'], X_res_both), # slice (slice('first', 'second'), X_res_both), # int keys: positional # scalar (0, X_res_first), # list ([0], X_res_first), ([0, 1], X_res_both), (np.array([0, 1]), X_res_both), # slice (slice(0, 1), X_res_first), (slice(0, 2), X_res_both), # boolean mask (np.array([True, False]), X_res_first), (pd.Series([True, False], index=['first', 'second']), X_res_first), ] for selection, res in cases: ct = ColumnTransformer([('trans', Trans(), selection)], remainder='drop') assert_array_equal(ct.fit_transform(X_df), res) assert_array_equal(ct.fit(X_df).transform(X_df), res) # callable that returns any of the allowed specifiers ct = ColumnTransformer([('trans', Trans(), lambda X: selection)], remainder='drop') assert_array_equal(ct.fit_transform(X_df), res) assert_array_equal(ct.fit(X_df).transform(X_df), res) ct = ColumnTransformer([('trans1', Trans(), ['first']), ('trans2', Trans(), ['second'])]) assert_array_equal(ct.fit_transform(X_df), X_res_both) assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' ct = ColumnTransformer([('trans1', Trans(), [0]), ('trans2', Trans(), [1])]) assert_array_equal(ct.fit_transform(X_df), X_res_both) assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' # test with transformer_weights transformer_weights = {'trans1': .1, 'trans2': 10} both = ColumnTransformer([('trans1', Trans(), ['first']), ('trans2', Trans(), ['second'])], transformer_weights=transformer_weights) res = np.vstack([transformer_weights['trans1'] * X_df['first'], transformer_weights['trans2'] * X_df['second']]).T assert_array_equal(both.fit_transform(X_df), res) assert_array_equal(both.fit(X_df).transform(X_df), res) assert len(both.transformers_) == 2 assert ct.transformers_[-1][0] != 'remainder' # test multiple columns both = ColumnTransformer([('trans', Trans(), ['first', 'second'])], transformer_weights={'trans': .1}) assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) assert len(both.transformers_) == 1 assert ct.transformers_[-1][0] != 'remainder' both = ColumnTransformer([('trans', Trans(), [0, 1])], transformer_weights={'trans': .1}) assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both) assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both) assert len(both.transformers_) == 1 assert ct.transformers_[-1][0] != 'remainder' # ensure pandas object is passes through class TransAssert(BaseEstimator): def fit(self, X, y=None): return self def transform(self, X, y=None): assert_true(isinstance(X, (pd.DataFrame, pd.Series))) if isinstance(X, pd.Series): X = X.to_frame() return X ct = ColumnTransformer([('trans', TransAssert(), 'first')], remainder='drop') ct.fit_transform(X_df) ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])]) ct.fit_transform(X_df) # integer column spec + integer column names -> still use positional X_df2 = X_df.copy() X_df2.columns = [1, 0] ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop') 
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
def test_check_consistency_model_features_5(self): """ Unit test check_consistency_model_features 5 """ train = pd.DataFrame( { 'city': ['chicago', 'paris'], 'state': ['US', 'FR'], 'other': [5, 10] }, index=['index1', 'index2']) features_dict = None columns_dict = { i: features for i, features in enumerate(train.columns) } features_types = { features: str(train[features].dtypes) for features in train.columns } mask_params = None enc = ColumnTransformer(transformers=[ ('Ordinal_ce', ce.OrdinalEncoder(), ['city', 'state']), ('Ordinal_skp', skp.OrdinalEncoder(), ['city', 'state']) ], remainder='passthrough') enc_2 = ColumnTransformer(transformers=[ ('Ordinal_ce', ce.OrdinalEncoder(), ['city', 'state']), ('Ordinal_skp', skp.OrdinalEncoder(), ['city', 'state']) ], remainder='drop') enc.fit(train) train_1 = pd.DataFrame( enc.transform(train), columns=["city_ce", "state_ce", "city_skp", "state_skp", "other"]) train_1["y"] = np.array([1, 0]) clf_1 = cb.CatBoostClassifier(n_estimators=1) \ .fit(train_1[["city_ce", "state_ce", "city_skp", "state_skp", "other"]], train_1['y']) enc_2.fit(train) train_2 = pd.DataFrame( enc_2.transform(train), columns=["city_ce", "state_ce", "city_skp", "state_skp"]) train_2["y"] = np.array([1, 0]) clf_2 = cb.CatBoostClassifier(n_estimators=1) \ .fit(train_2[["city_ce", "state_ce", "city_skp", "state_skp"]], train_2['y']) enc_3 = ce.OneHotEncoder(cols=['city', 'state']) enc_3.fit(train) train_3 = enc_3.transform(train) train_3["y"] = np.array([1, 0]) clf_3 = cb.CatBoostClassifier(n_estimators=1) \ .fit(train_3[["city_1", "city_2", "state_1", "state_2", "other"]], train_3['y']) dict_4 = { 'col': 'state', 'mapping': pd.Series(data=[1, 2], index=['US', 'FR']), 'data_type': 'object' } dict_5 = { 'col': 'city', 'mapping': pd.Series(data=[1, 2], index=['chicago', 'paris']), 'data_type': 'object' } enc_4 = [enc_3, [dict_4]] enc_5 = [enc_3, [dict_4, dict_5]] check_consistency_model_features(features_dict, clf_1, columns_dict, features_types, mask_params, enc, list_preprocessing=[enc]) check_consistency_model_features(features_dict, clf_2, columns_dict, features_types, mask_params, enc_2, list_preprocessing=[enc_2]) check_consistency_model_features(features_dict, clf_3, columns_dict, features_types, mask_params, enc_3, list_preprocessing=[enc_3]) check_consistency_model_features(features_dict, clf_3, columns_dict, features_types, mask_params, enc_4, list_preprocessing=enc_4) check_consistency_model_features(features_dict, clf_3, columns_dict, features_types, mask_params, enc_5, list_preprocessing=enc_5)
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
        # ('vect', CountVectorizer(), 'Model')
        # ('scale', StandardScaler(), all_features)
        # ('iter', IterativeImputer(max_iter=10, random_state=0), ['New_Price'])
    ])

"""
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LinearRegression())])
"""

X = df.drop(['Price', 'Location_Type', 'Name', 'Mileage', 'New_Price'], axis=1)
y = df['Price']
X_test = tf.drop(['Location_Type', 'Name'], axis=1)
# pd.concat replaces DataFrame.append, which is deprecated in recent pandas
all_data = pd.concat([X, X_test])
preprocessor.fit(all_data)
X = preprocessor.transform(X)
X_test = preprocessor.transform(X_test)
X_train, X_validate, y_train, y_validate = train_test_split(X, y,
                                                            test_size=0.1)
xgb = xgboost.XGBRegressor(n_estimators=200, learning_rate=0.08, gamma=0,
                           subsample=0.55, colsample_bytree=0.75,
                           max_depth=20)
xgb.fit(X_train, y_train)
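# The validation split above goes unused in this excerpt; a minimal evaluation
# sketch, assuming sklearn.metrics is available:
from sklearn.metrics import mean_squared_error, r2_score

val_pred = xgb.predict(X_validate)
print('validation R^2 :', r2_score(y_validate, val_pred))
print('validation RMSE:', mean_squared_error(y_validate, val_pred) ** 0.5)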
def get_preprocessor_pipeline(df: pd.DataFrame) -> np.ndarray:
    """
    Builds the preprocessing transformation as a multi-step ColumnTransformer,
    persists the fitted preprocessor, and returns the transformed data.

    Args:
        df: Dataframe

    Returns:
        Data transformed by the fitted preprocessor
    """
    numeric_features = list(NUMERICAL_COLUMNS.keys())
    numeric_features += ['HouseAge', 'HouseAgeRemodel', 'GarageAge']
    ordinal_features = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                        'HeatingQC', 'KitchenQual', 'GarageQual']
    categorical_features = list(set(CATEGORICAL_COLUMNS.keys()) -
                                set(ordinal_features))

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformers = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories='auto',
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_1 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Fa', 'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_2 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_3 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Not Available', 'Po', 'Fa',
                                                'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_4 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Not Available', 'Po', 'Fa',
                                                'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_5 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Not Available', 'No', 'Mn',
                                                'Av', 'Gd']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_6 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Not Available', 'Unf', 'LwQ',
                                                'Rec', 'BLQ', 'ALQ', 'GLQ']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_7 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Not Available', 'Unf', 'LwQ',
                                                'Rec', 'BLQ', 'ALQ', 'GLQ']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_8 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_9 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])
    ordinal_transformer_10 = Pipeline(steps=[
        ('encoder', OrdinalEncoder(categories=[['Not Available', 'Po', 'Fa',
                                                'TA', 'Gd', 'Ex']],
                                   handle_unknown='use_encoded_value',
                                   unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformers, categorical_features),
            ('ord1', ordinal_transformer_1, ['ExterQual']),
            ('ord2', ordinal_transformer_2, ['ExterCond']),
            ('ord3', ordinal_transformer_3, ['BsmtQual']),
            ('ord4', ordinal_transformer_4, ['BsmtCond']),
            ('ord5', ordinal_transformer_5, ['BsmtExposure']),
            ('ord6', ordinal_transformer_6, ['BsmtFinType1']),
            ('ord7', ordinal_transformer_7, ['BsmtFinType2']),
            ('ord8', ordinal_transformer_8, ['HeatingQC']),
            ('ord9', ordinal_transformer_9, ['KitchenQual']),
            ('ord10', ordinal_transformer_10, ['GarageQual'])]
    )

    joblib.dump(preprocessor.fit(df),
                filename=os.path.join(MODEL_DIR,
                                      PREPROCESSING_PIPELINE_FILE_NAME))
    upload_files(filename=PREPROCESSING_PIPELINE_FILE_NAME,
                 source_file_path=os.path.join(
                     MODEL_DIR, PREPROCESSING_PIPELINE_FILE_NAME))
    # the preprocessor was already fitted above, so transform is enough here
    return preprocessor.transform(df)
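# A usage sketch for the persisted preprocessor, assuming the same MODEL_DIR /
# PREPROCESSING_PIPELINE_FILE_NAME constants and a hypothetical new frame
# `df_new` with the training columns:
import os

import joblib

fitted = joblib.load(os.path.join(MODEL_DIR, PREPROCESSING_PIPELINE_FILE_NAME))
X_new = fitted.transform(df_new)  # df_new: hypothetical unseen data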
num_attr = ['tenure', 'MonthlyCharges', 'TotalCharges'] testing_churn = telecom[['Churn']] testing_churn = testing_churn['Churn'].map({'Yes': 1, 'No': 0}) telecom.drop('Churn', axis=1, inplace=True) ######## Pipeline ######## num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('std_scale', MinMaxScaler(feature_range=(-1, 1)))]) full_pipeline = ColumnTransformer([('num', num_pipeline, num_attr), ('ordinal', OrdinalEncoder(), ordinal_attr), ('cat', OneHotEncoder(drop='first', sparse=False), dummy)]) # sample_telecom = telecom[0:3] fitting = full_pipeline.fit(telecom) cat_names = full_pipeline.named_transformers_.cat.get_feature_names(dummy) def feature_ext(sample): int_cols = {'tenure': int, 'MonthlyCharges': float, 'TotalCharges': float} val = [ 'customerID', 'tenure', 'PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies' ] sample.columns = val sample = sample.astype(int_cols) testing = fitting.transform(sample)
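# ColumnTransformer output follows the order the transformers were listed in
# ('num', then 'ordinal', then 'cat'); `ordinal_attr` and `dummy` are defined
# before this excerpt, so using them here is an assumption:
output_columns = list(num_attr) + list(ordinal_attr) + list(cat_names)
transformed_df = pd.DataFrame(fitting.transform(telecom),
                              columns=output_columns)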
    -0.13929748680369286, 1.3163604645710438, -0.3699637766938669,
    -0.6149300604558857, -0.854369594993175, 0.263445277972641,
    0.5712416961268142
]

# In[36]:

ct = pd.DataFrame([test_country], columns=df.columns)

# In[37]:

cols2 = df.select_dtypes(['int64', 'float64']).columns
pl = Pipeline(steps=[('imp', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler())])
tf = ColumnTransformer(transformers=[('number', pl, cols2)], n_jobs=-1)
tf.fit(df)

# In[38]:

def q4():
    res = tf.transform(ct)[0][cols2.get_loc('Arable')]
    return round(float(res), 3)

# ## Question 5
#
# Find the number of _outliers_ in the `Net_migration` variable according to the _boxplot_ method, i.e., using the rule:
#
# $$x \notin [Q1 - 1.5 \times \text{IQR}, Q3 + 1.5 \times \text{IQR}] \Rightarrow x \text{ is an outlier}$$
#
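# A sketch of one common shape for the answer, assuming `df` is the frame
# fitted above and that separate lower/upper outlier counts are wanted:

def q5():
    net = df['Net_migration'].dropna()
    q1, q3 = net.quantile(0.25), net.quantile(0.75)
    iqr = q3 - q1
    below = int((net < q1 - 1.5 * iqr).sum())  # outliers under the lower fence
    above = int((net > q3 + 1.5 * iqr).sum())  # outliers over the upper fence
    return below, above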
preproc_scale = ColumnTransformer(transformers=[('num', StandardScaler(),
                                                 feat_num_idx)])

# for models that don't require scaling, we want to pass these features through:
preproc_num_pass = ColumnTransformer(transformers=[('num', 'passthrough',
                                                    feat_num_idx)])

# Categorical:
cat_cols = ['registered_via']
cat_cols_idx = [list(df_feat1.columns).index(x) for x in cat_cols]
preproc_ohe = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(categories='auto'), cat_cols_idx)])

# fit to get feature names
preproc_ohe.fit(df_feat1)
feat_ohe = preproc_ohe.named_transformers_['cat'].get_feature_names()
feat_ohe = feat_ohe.tolist()
feat_ohe = list(map(fix_ohe_names, feat_ohe))

# Boolean: pass through
pass_cols = [
    'payment_method_most_common_mode', 'is_auto_renew_mode', 'is_cancel_mode'
]
pass_cols_idx = [list(df_feat1.columns).index(x) for x in pass_cols]
preproc_pass = ColumnTransformer(transformers=[
    ('as_is', 'passthrough', pass_cols_idx),
])

# Scaling
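# The per-group transformers above can also be combined into one
# ColumnTransformer when a model needs every feature group at once; a sketch,
# assuming the index lists are disjoint:
preproc_all = ColumnTransformer(transformers=[
    ('num', StandardScaler(), feat_num_idx),                   # scaled numeric block
    ('cat', OneHotEncoder(categories='auto'), cat_cols_idx),   # one-hot categorical block
    ('as_is', 'passthrough', pass_cols_idx),                   # boolean block, unchanged
])
X_all = preproc_all.fit_transform(df_feat1)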
def main(): if len(sys.argv) < 3: print( "Not enough arguments specified\n Usage: lasso.py <x features path> <y target path> <outdir>" ) sys.exit(1) else: # print command line arguments for arg in sys.argv[0:]: print(arg) #Load X features data X_path = sys.argv[1] print('Loading the X features at {}'.format(X_path)) X_train = pd.read_csv(X_path, index_col=0) X_train = X_train.sort_index(axis=0) Y_path = sys.argv[2] print('Loading Y target at {}'.format(Y_path)) y_train = pd.read_csv(Y_path, index_col=0) y_train = y_train.sort_index(axis=0) y_target = y_train[:-1].columns #Load the numeric and categorical feature names num_feat = pd.read_csv("data/numerical_features_ffq.csv", delimiter=',', header=0) cat_feat = pd.read_csv("data/categorical_features_ffq.csv", delimiter=',', header=0) zero_feat = pd.read_csv("data/ffq_var_with_zeroes.csv", delimiter=",", header=0) #Define the numeric and categorical features numerical_features = [ col for col in X_train.columns if col in num_feat.values ] numeric_nonzero = [ col for col in numerical_features if col not in zero_feat.values ] numeric_zeroes = [ col for col in X_train.columns if col in zero_feat.values ] categorical_features = [ col for col in X_train.columns if col in cat_feat.values ] print('Setting up ColumnTransformer...') numeric_transformer = Pipeline( steps=[('log', FunctionTransformer(np.log)), ('scaler', StandardScaler())]) #set up pipeline for numeric variables with zeroes zero_transformer = Pipeline( steps=[('yeo', PowerTransformer(method="yeo-johnson", standardize=True))]) #Set up the categorical pipeline #define the unique levels of each category X_cat = X_train[categorical_features] enc = OneHotEncoder(handle_unknown="error", sparse=False) enc.fit(X_cat) enc.transform(X_cat) cat_levels = enc.categories_ #define the categorical transformer categorical_transformer = Pipeline(steps=[( 'onehot', OneHotEncoder( handle_unknown='error', sparse=False, categories=cat_levels))]) #Set up ColumnTransformer prep = ColumnTransformer(transformers=[( 'num', numeric_transformer, numeric_nonzero), ('yeo', zero_transformer, numeric_zeroes), ('cat', categorical_transformer, categorical_features)]) model = TransformedTargetRegressor(Lasso(random_state=0), func=np.log, inverse_func=np.exp) #Set up the pipeline print('Setting up pipeline...') pipeline = Pipeline(steps=[('preprocessor', prep), ('lasso', model)]) #Set up the param grid and CV param_grid = {'lasso__regressor__alpha': np.logspace(-4, -1, 50)} #define inner and outer cv inner_cv = KFold(n_splits=10, shuffle=True, random_state=0) outer_cv = KFold(n_splits=10, shuffle=True, random_state=0) refit = 'r2' pscore = make_scorer(pcc) scoring = { 'r2': make_scorer(r2_score), 'MAE': make_scorer(mean_absolute_error), 'pearson': pscore } #create output sinks outer_loop_r2 = [] outer_loop_pcc = [] outer_loop_mae = [] inner_loop_won_params = [] inner_loop_accuracy_scores = [] inner_loop_coefs = [] inner_loop_best_cv_results = [] # Looping through the outer loop, feeding each training set into a grid_search as the inner loop for train_index, test_index in outer_cv.split(X_train, y_train): grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring=scoring, refit="r2", n_jobs=-1) # inner loop grid_search.fit(X_train.iloc[train_index], y_train.iloc[train_index]) inner_results = pd.DataFrame(grid_search.cv_results_) inner_best_scores = inner_results[inner_results['rank_test_r2'] == 1] # The best hyper parameters from grid_search is now being tested on the unseen outer loop test 
            # data.
            pred = grid_search.predict(X_train.iloc[test_index])

            # Appending the "winning" hyper parameters and their associated accuracy score
            outer_loop_r2.append(r2_score(y_train.iloc[test_index], pred))
            outer_loop_mae.append(
                mean_absolute_error(y_train.iloc[test_index], pred))
            outer_loop_pcc.append(
                sp.stats.pearsonr(y_train.iloc[test_index], pred)[0])
            inner_loop_won_params.append(grid_search.best_params_)
            inner_loop_best_cv_results.append(inner_best_scores)
            inner_loop_coefs.append(
                grid_search.best_estimator_.named_steps['lasso'].regressor_.coef_)
            inner_loop_accuracy_scores.append(grid_search.best_score_)

        for i in zip(inner_loop_won_params, outer_loop_r2,
                     inner_loop_accuracy_scores):
            print(i)
        print('Mean of outer loop accuracy score:', np.mean(outer_loop_r2))

        # save the results
        cv_savepath = sys.argv[3]

        # save outer loop scores
        outer_results = pd.DataFrame()
        outer_results['r2'] = outer_loop_r2
        outer_results['mae'] = outer_loop_mae
        outer_results['pcc'] = outer_loop_pcc
        outer_results['pcc'] = outer_results['pcc'].str.get(0)
        outer_name = 'outer_loop_results_for_{}'.format(y_target[0]) + '.csv'
        outer_path = cv_savepath + outer_name
        outer_results.to_csv(outer_path, index=True)

        # save the inner loop results
        inner_results = pd.concat(inner_loop_best_cv_results)
        inner_name = 'inner_loop_results_for_{}'.format(y_target[0]) + '.csv'
        inner_path = cv_savepath + inner_name
        inner_results.to_csv(inner_path, index=True)

        # get the feature names
        prep.fit(X_train)
        feature_names = get_transformer_feature_names(prep)

        # save the inner loop coefs
        inner_feat_df = pd.DataFrame(inner_loop_coefs).T
        inner_feat_df['Feature'] = feature_names
        inner_feat_df = inner_feat_df.set_index(['Feature'])
        inner_coef_name = 'inner_loop_coefs_for_{}'.format(y_target[0]) + '.csv'
        inner_coef_path = cv_savepath + inner_coef_name
        inner_feat_df.to_csv(inner_coef_path, index=True)

        # save the model
        mod_name = 'lasso_{}'.format(y_target[0]) + '.pkl'
        filename = cv_savepath + mod_name
        dump(grid_search.best_estimator_, open(filename, 'wb'))

        print("\nResults saved to {}".format(cv_savepath))
        print("\nModel saved to {}".format(filename))
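# A reload sketch for the estimator pickled above, assuming the `dump` used in
# main() is pickle.dump; `model_path` and `X_new` are hypothetical:
import pickle

with open(model_path, 'rb') as f:
    best_model = pickle.load(f)
preds = best_model.predict(X_new)  # X_new: unseen data with the training columns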
train_df.head() train_data = train_df.drop(['bruises'], axis=1) test_data = test_df.drop(['bruises'], axis=1) y_train_label = train_df['bruises'] y_test_label = test_df['bruises'] # ### Encoding categorical variables categorical_transformer = Pipeline(steps=[('woe', ce.OrdinalEncoder())]) categorical_features = train_data.select_dtypes(include=['object']).columns preprocessor = ColumnTransformer(transformers=[('cat', categorical_transformer, categorical_features)]) # Feature transformation preprocessor.fit(train_data) X_train = preprocessor.transform(train_data) X_test = preprocessor.transform(test_data) le = LabelEncoder() label_encoder = le.fit(y_train_label) y_train = label_encoder.transform(y_train_label) y_test = label_encoder.transform(y_test_label) print("Training Size:", X_train.shape, "Testing Size:", X_test.shape) comparision_df = pd.DataFrame(columns=[ 'Algorithm', 'Implementation', 'Depth', 'Bag Size', 'Accuracy' ]) MAX_TREE_DEPTH = [3, 5] BAG_SIZE = [10, 20]
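# A sketch of how rows could be appended to the comparison frame during the
# depth / bag-size sweep; the accuracy value is a placeholder, not a result:
for depth in MAX_TREE_DEPTH:
    for bag_size in BAG_SIZE:
        accuracy = 0.0  # placeholder: would come from a model trained with these settings
        comparision_df.loc[len(comparision_df)] = [
            'RandomForest', 'sklearn', depth, bag_size, accuracy
        ]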
my_trans = ColumnTransformer([ ("bin_weather", pip_weather, ["weather"]), ("bin and encode atemp", KBinsDiscretizer(n_bins=3, encode="onehot-dense", strategy="uniform"), ["atemp"]), ("interaction term work and hour", PolynomialFeatures(interaction_only=True, include_bias=False, degree=2), ["workingday", "hour"]), ("poly 2nd degree and scale", pip_scale_poly, ["atemp", "humidity"]), ("scale", MinMaxScaler(), ["windspeed"]), ("passthrough", "passthrough", ["datetime", "peek_hours"]), ("one hot encode", OneHotEncoder(), ["month", "season"]), ]) my_trans.fit(X_train) X_trans = my_trans.transform(X_train) X_trans = pd.DataFrame(X_trans, columns=[ "good_weath_cond", "bad_weath_cond", "low_temp", "medium_temp", "high_temp", "workingday", "hour", "inter_work_hour", "atemp", "humidity", "atemp^2", "interaction_atemp_hum", "humidity^2", "windspeed", "datetime", "peek_hours", "Jan", "Feb", "March", "Apr", "May", "Jun", "july", "Aug", "Sept", "Oct", "Nov", "Dec", "Spring", "Summer", "Autumn", "Winter" ]) X_trans.set_index("datetime", inplace=True) X_trans.head()
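# On scikit-learn >= 1.1 (an assumption about the installed version), the
# hand-written column list above can be cross-checked against the names the
# fitted transformer reports, in the same output order:
print(my_trans.get_feature_names_out())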
numcols = ["total_bill"] catcols = xtrain.select_dtypes("category").columns ohe = OneHotEncoder() ss = StandardScaler() prep_pl_categorical = Pipeline([("OHE", ohe)]) prep_pl_numeric = Pipeline([("Scaling", ss)]) ct = ColumnTransformer([("1", prep_pl_categorical, catcols), ("2", prep_pl_numeric, numcols)]) # %% ct.fit(xtrain) xtrain = ct.transform(xtrain) xtest = ct.transform(xtest) # %% new_input = pd.DataFrame( { "total_bill": 16.99, "sex": "Female", "smoker": "No", "day": "Sun", "time": "Dinner", "size": 2 }, index=[0])
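# %%
# A usage sketch: the fitted ColumnTransformer encodes the new row exactly like
# the training data (one-hot block first, scaled numeric last, per the order above)
new_encoded = ct.transform(new_input)
new_encoded.shape  # one row: OHE columns plus the scaled total_bill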
class CoreferenceClassifier:
    def __init__(self, training_instances_iterator, classifier='SVM'):
        if classifier not in {
                'NaiveBayes', 'Perceptron', 'SVM', 'MaxEnt', 'RandomForest',
                '__existing'
        }:
            print("ERROR: {} is not a valid classifier.".format(classifier),
                  file=sys.stderr)
            print("Valid classifiers: ", file=sys.stderr)
            print(
                "\t'NaiveBayes'\n\t'Perceptron'\n\t'SVM'\n\t'MaxEnt'\n\t'RandomForest'",
                file=sys.stderr)
            print("(Default = 'SVM')", file=sys.stderr)
            sys.exit(1)

        # Classifier model
        self.classifier = classifier
        # Transformer that prepares data for training the model and making predictions
        self.column_transformer = None

        # leave constructor if loading an already trained model
        if classifier == '__existing':
            pass
        else:
            # Scaler and OneHotEncoder to adapt feature vectors to model
            self.column_transformer = ColumnTransformer([
                ('NumericalData', StandardScaler(), [0]),
                ('CategoricalData', OneHotEncoder(), slice(1, 11))
            ])
            self.column_transformer.fit(
                [[0, '+', '+', '+', '+', '+', '+', '+', '+', '+', '+'],
                 [0, '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'],
                 [
                     0, 'unknown', 'unknown', 'unknown', 'unknown', 'unknown',
                     'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
                 ]])

            # incremental learning of model
            if self.classifier == 'SVM':
                self.classifier = SGDClassifier(loss='hinge')
            elif self.classifier == 'Perceptron':
                self.classifier = Perceptron()
            elif self.classifier == 'NaiveBayes':
                self.classifier = BernoulliNB()
            elif self.classifier == 'MaxEnt':
                self.classifier = SGDClassifier(loss='log')
            elif self.classifier == 'RandomForest':
                self.classifier = RandomForestClassifier(warm_start=True)

            # over sampler to cope with unevenly balanced class distributions
            # (there are a lot more non-coreferent mention-pairs than coreferent mention-pairs)
            over_sampler = RandomOverSampler()
            under_sampler = RandomUnderSampler()

            for instances in training_instances_iterator:
                feature_matrix = [x[2:13] for x in instances]
                labels = [x[13] for x in instances]
                if len(set(labels)) > 1:
                    feature_matrix, labels = over_sampler.fit_resample(
                        feature_matrix, labels)
                # update Scaler
                num_data = [[x[0]] for x in feature_matrix]
                self.column_transformer.named_transformers_[
                    'NumericalData'].partial_fit(num_data)
                del num_data
                # transform feature vectors
                feature_matrix = self.column_transformer.transform(
                    feature_matrix)
                # update the model
                if classifier == 'RandomForest':
                    self.classifier.fit(feature_matrix, labels)
                else:
                    self.classifier.partial_fit(feature_matrix, labels,
                                                classes=['+', '-'])

    # predict returns a vector containing the predicted classes for an input vector or matrix
    def predict(self, data):
        # transform data so it fits the model
        try:
            data = self.column_transformer.transform(data)
        except ValueError:
            data = [data]
            data = self.column_transformer.transform(data)
        # make predictions
        return self.classifier.predict(data)

    def predict_mention_pair(self, feature_vector):
        # transform feature vector so it fits the model
        feature_vector = self.column_transformer.transform([feature_vector])
        # make prediction
        pred = self.classifier.predict(feature_vector)
        return pred[0]

    def transform(self, data):
        return self.column_transformer.transform(data)

    # saves trained model in a binary file
    def save_binary(self, filename):
        binary = open(filename, 'wb')
        pickle.dump((self.classifier, self.column_transformer), binary)
        binary.close()

    # reads a trained model from a binary file
    # Usage: classifier = CoreferenceClassifier.load_binary(filename)
    @classmethod
    def load_binary(cls, filename):
        # '__existing' skips the training setup; the loaded models replace the placeholders
        classifier = CoreferenceClassifier([], classifier='__existing')
        binary = open(filename, 'rb')
        models = pickle.load(binary)
        binary.close()
        classifier.classifier = models[0]
        classifier.column_transformer = models[1]
        return classifier
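# A round-trip sketch of the save/load API above; `training_batches` and the
# file name are hypothetical, and the instance layout (features at x[2:13],
# label at x[13]) follows the constructor:
clf = CoreferenceClassifier(training_batches, classifier='Perceptron')
clf.save_binary('coref_model.bin')
restored = CoreferenceClassifier.load_binary('coref_model.bin')
label = restored.predict_mention_pair(
    [0.5, '+', '-', '+', 'unknown', '+', '-', '+', '-', '+', 'unknown'])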
    stratify=y)

numerical_features = X.select_dtypes("int64").columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[(
    'num', numeric_transformer, numerical_features), (
        'cat', categorical_transformer, categorical_features)])

transf = preprocessor.fit(X)
transf

xtrain_prepared = transf.transform(X_train)
xtrain_prepared.shape

xtest_prepared = transf.transform(X_test)
xtest_prepared.shape

"""## MLP Model

The investigation done on this MLP model was to determine the best values for
the learning rate and the ideal number of iterations the network needs.
"""

mlp = MLPClassifier(max_iter=1000)

params_mlp = {
enc.fit([['male', 0, 3], ['male', 1, 0], ['female', 2, 1], ['female', 0, 2]])
enc.categories_

# %%
enc.transform([['male', 0, 3], ['none', 1, 0], ['male', 0, 2]]).toarray()
enc.get_feature_names()

# %%
# ColumnTransformer
from sklearn.compose import ColumnTransformer

categorical_features = [0]
enc = OneHotEncoder(handle_unknown='ignore')
clt = ColumnTransformer([('name', enc, categorical_features)],
                        remainder='passthrough')
clt.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
clt.transform([[0, 2, 3]])

# %%
# fit_transform
enc = OneHotEncoder(sparse=False)
ans = enc.fit_transform([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
ans

# %%
enc = OneHotEncoder()
ans = enc.fit_transform([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
ans.toarray()

# %%
# StandardScaler: remove the mean and scale to unit variance