def test_column_transformer_special_strings():
    # one 'drop' -> ignore
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'drop', [1])])
    exp = np.array([[0.], [1.], [2.]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer(
        [('trans1', 'drop', [0]), ('trans2', 'drop', [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))

    # 'passthrough'
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'passthrough', [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # None itself / other string is not valid
    for val in [None, 'other']:
        ct = ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', val, [1])])
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit_transform, X_array)
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit, X_array)
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df))
def test_column_transformer_remainder_numpy(key):
    # test different ways that columns are specified with passthrough
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
def test_column_transformer_negative_column_indexes():
    X = np.random.randn(2, 2)
    X_categories = np.array([[1], [2]])
    X = np.concatenate([X, X_categories], axis=1)

    ohe = OneHotEncoder(categories='auto')

    tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
    tf_2 = ColumnTransformer([('ohe', ohe, [2])], remainder='passthrough')
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))
def test_column_transformer_no_remaining_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'
def test_2D_transformer_output_pandas():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['col1', 'col2'])

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', TransNo2D(), 'col1')])
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.fit_transform, X_df)
    ct.fit(X_df)
    assert_raise_message(ValueError, "the 'trans1' transformer should be 2D",
                         ct.transform, X_df)
def test_column_transformer_get_set_params():
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(), [1])])

    exp = {'n_jobs': 1,
           'remainder': 'drop',
           'trans1': ct.transformers[0][1],
           'trans1__copy': True,
           'trans1__with_mean': True,
           'trans1__with_std': True,
           'trans2': ct.transformers[1][1],
           'trans2__copy': True,
           'trans2__with_mean': True,
           'trans2__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert_dict_equal(ct.get_params(), exp)

    ct.set_params(trans1__with_mean=False)
    assert_false(ct.get_params()['trans1__with_mean'])

    ct.set_params(trans1='passthrough')
    exp = {'n_jobs': 1,
           'remainder': 'drop',
           'trans1': 'passthrough',
           'trans2': ct.transformers[1][1],
           'trans2__copy': True,
           'trans2__with_mean': True,
           'trans2__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert_dict_equal(ct.get_params(), exp)
def test_column_transformer_no_estimators():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).astype('float').T
    ct = ColumnTransformer([], remainder=StandardScaler())

    params = ct.get_params()
    assert params['remainder__with_mean']

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][2] == [0, 1, 2]
def test_column_transformer_get_set_params_with_remainder():
    ct = ColumnTransformer([('trans1', StandardScaler(), [0])],
                           remainder=StandardScaler())

    exp = {'n_jobs': 1,
           'remainder': ct.remainder,
           'remainder__copy': True,
           'remainder__with_mean': True,
           'remainder__with_std': True,
           'trans1': ct.transformers[0][1],
           'trans1__copy': True,
           'trans1__with_mean': True,
           'trans1__with_std': True,
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert ct.get_params() == exp

    ct.set_params(remainder__with_std=False)
    assert not ct.get_params()['remainder__with_std']

    ct.set_params(trans1='passthrough')
    exp = {'n_jobs': 1,
           'remainder': ct.remainder,
           'remainder__copy': True,
           'remainder__with_mean': True,
           'remainder__with_std': False,
           'trans1': 'passthrough',
           'transformers': ct.transformers,
           'transformer_weights': None}

    assert ct.get_params() == exp
def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(
                ct.fit(X_sparse).transform(X_sparse), res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)
def test_column_transformer_named_estimators():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans1', StandardScaler(), [0]),
                            ('trans2', StandardScaler(with_std=False), [1])])
    assert_false(hasattr(ct, 'transformers_'))
    ct.fit(X_array)
    assert_true(hasattr(ct, 'transformers_'))
    assert_true(isinstance(ct.named_transformers_['trans1'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans1, StandardScaler))
    assert_true(isinstance(ct.named_transformers_['trans2'], StandardScaler))
    assert_true(isinstance(ct.named_transformers_.trans2, StandardScaler))
    assert_false(ct.named_transformers_.trans2.with_std)
    # check that the named transformers are fitted
    assert_equal(ct.named_transformers_.trans1.mean_, 1.)
def test_column_transformer_drop_all_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    # SparseMatrixTrans creates 3 features for each column, thus:
    assert X_trans.shape == (3, 3)
    assert_array_equal(X_trans.toarray(), np.eye(3))
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, six.string_types) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])
def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_2D_transformer_output():

    class TransNo2D(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            return X

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    # if one transformer is dropped, test that name is still correct
    ct = ColumnTransformer([('trans1', 'drop', 0),
                            ('trans2', TransNo2D(), 1)])
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.fit_transform, X_array)
    ct.fit(X_array)
    assert_raise_message(ValueError, "the 'trans2' transformer should be 2D",
                         ct.transform, X_array)
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    with pytest.warns(DataConversionWarning):
        # TODO: this warning is not very useful in this case, would be good
        # to get rid of it
        assert_array_equal(ct.fit_transform(X_list), expected_result)
        assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)
def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])
def test_column_transformer_callable_specifier():
    # assert that the callable gets the full array / dataframe
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([[0, 1, 2]]).T

    def func(X):
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    def func(X):
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ['first']

    ct = ColumnTransformer([('trans', Trans(), func)],
                           remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)
def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))
def test_column_transformer_sparse_stacking():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.8)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert sparse.issparse(X_trans)
    assert_equal(X_trans.shape, (X_trans.shape[0], X_trans.shape[0] + 1))
    assert_array_equal(X_trans.toarray()[:, 1:], np.eye(X_trans.shape[0]))
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    col_trans = ColumnTransformer([('trans1', Trans(), [0]),
                                   ('trans2', SparseMatrixTrans(), 1)],
                                  sparse_threshold=0.1)
    col_trans.fit(X_array)
    X_trans = col_trans.transform(X_array)
    assert not sparse.issparse(X_trans)
    assert X_trans.shape == (X_trans.shape[0], X_trans.shape[0] + 1)
    assert_array_equal(X_trans[:, 1:], np.eye(X_trans.shape[0]))
def test_column_transformer_sparse_threshold():
    X_array = np.array([['a', 'b'], ['A', 'B']], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    # apply threshold even if all sparse
    col_trans = ColumnTransformer([('trans1', OneHotEncoder(), [0]),
                                   ('trans2', OneHotEncoder(), [1])],
                                  sparse_threshold=0.2)
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for thres in [0.75001, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert sparse.issparse(res)
        assert col_trans.sparse_output_

    for thres in [0.75, 0]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=False), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_
def test_column_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first1D = np.array([0, 1, 2])
    X_res_second1D = np.array([2, 4, 6])
    X_res_first = X_res_first1D.reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # single column 1D / 2D
        (0, X_res_first),
        ([0], X_res_first),
        # list-like
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda x: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), [0]),
                              ('trans2', Trans(), [1])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_res_first1D,
                     transformer_weights['trans2'] * X_res_second1D]).T
    assert_array_equal(both.fit_transform(X_array), res)
    assert_array_equal(both.fit(X_array).transform(X_array), res)
    assert len(both.transformers_) == 2

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_array).transform(X_array),
                       0.1 * X_res_both)
    assert len(both.transformers_) == 1
def prepare_data(train_df_raw, test_df_raw, data_prep_dict):
    '''
    Process raw data into the data required for modelling.

    Inputs:
        1. train_df_raw - DataFrame
        2. test_df_raw - DataFrame
        3. data_prep_dict - Dictionary

    Outputs:
        1. train_df_processed - DataFrame
        2. test_df_processed - DataFrame
    '''
    # quick check to apply data processing on both train and test combined
    # train_df_raw = pd.concat([train_df_raw, test_df_raw], axis=0)

    # avoid SimpleImputer errors by manually assigning missing values
    train_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    test_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    train_df_raw.fillna('missing', inplace=True)
    test_df_raw.fillna('missing', inplace=True)

    # modify data values to convert categorical raw attributes to potential
    # numeric features
    train_df_raw.replace({'14+': '14'}, inplace=True)
    train_df_raw['Holding_Policy_Duration'] = train_df_raw[
        'Holding_Policy_Duration'].astype(float)
    test_df_raw.replace({'14+': '14'}, inplace=True)
    test_df_raw['Holding_Policy_Duration'] = test_df_raw[
        'Holding_Policy_Duration'].astype(float)

    # freeze data types
    train_df_raw[data_prep_dict['one_hot_encode']] = train_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)
    test_df_raw[data_prep_dict['one_hot_encode']] = test_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)

    # target encode required attributes
    for target_encode_col in data_prep_dict['target_encode']:
        encoding_dict = train_df_raw.groupby(
            target_encode_col)[TARGET].mean().to_dict()
        train_df_raw[target_encode_col] = train_df_raw[
            target_encode_col].map(encoding_dict)
        test_df_raw[target_encode_col] = test_df_raw[
            target_encode_col].map(encoding_dict)

    # fill missing Region Codes
    # city_code_means = train_df_raw.groupby(
    #     ['City_Code'])[TARGET].mean().reset_index()
    # test_df_raw['Region_Code'] = test_df_raw.apply(
    #     lambda row: city_code_means[TARGET][city_code_means.City_Code ==
    #                                         row['City_Code']].values[0]
    #     if row['Region_Code'] not in train_df_raw['Region_Code'].unique()
    #     else row['Region_Code'],
    #     axis=1)

    # define the set of transformation steps per raw attribute in the data
    column_transformer_1 = ColumnTransformer(
        [('one_hot_encode', OneHotEncoder(sparse=False, drop='if_binary'),
          data_prep_dict['one_hot_encode'])],
        remainder='passthrough', verbose=True)

    # build and fit the column transformer on train data
    train_df_processed = column_transformer_1.fit_transform(train_df_raw)
    # apply the column transformer on test data
    test_df_processed = column_transformer_1.transform(test_df_raw)

    # convert numpy arrays into pandas dataframes for further analysis
    train_df_processed_1 = pd.DataFrame(
        train_df_processed, columns=column_transformer_1.get_feature_names())
    test_df_processed_1 = pd.DataFrame(
        test_df_processed, columns=column_transformer_1.get_feature_names())

    column_transformer_2 = ColumnTransformer(
        [('passthrough', 'passthrough',
          [col for col in train_df_processed_1.columns
           if col not in data_prep_dict['standard_scale']]),
         ('standard_scale', StandardScaler(),
          data_prep_dict['standard_scale'])],
        remainder='passthrough', verbose=True)

    # build and fit the column transformer on train data
    train_df_processed_2 = column_transformer_2.fit_transform(
        train_df_processed_1)
    # apply the column transformer on test data
    test_df_processed_2 = column_transformer_2.transform(test_df_processed_1)

    # recreate column names in the correct order, to understand feature
    # importances
    scaled_cols = data_prep_dict['standard_scale']
    unscaled_cols = [col for col in train_df_processed_1.columns
                     if col not in scaled_cols]
    train_df_processed_out = pd.DataFrame(
        train_df_processed_2, columns=unscaled_cols + scaled_cols)
    test_df_processed_out = pd.DataFrame(
        test_df_processed_2, columns=unscaled_cols + scaled_cols)

    # progress logger
    print('Target encoding completed, returning processed data')

    return train_df_processed_out, test_df_processed_out
"land_use_type_3_fraction", "land_use_type_4_fraction", "land_use_type_5_fraction", "land_use_type_6_fraction", "land_use_type_9_fraction"] y_list = "observe_O3" start = time() file = "F:/graduation_thesis/new_all_data/model/all_model_data.csv" data = pd.read_csv(file) data.dropna(inplace=True) data = data.sample(1000) features = data[x_list] labels = data[y_list] numeric_features = x_list numeric_transformer = Pipeline(steps=[('imp2', SimpleImputer(missing_values=-999, strategy='mean'))]) preprocessor = ColumnTransformer(transformers=[ ('num', numeric_transformer, numeric_features)]) X = preprocessor.fit_transform(features) Y = labels x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2) model = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=60, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=5, min_samples_split=90, min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False) print("model training") model.fit(x_train, y_train) print("model training finished")
def test_column_transformer_get_feature_names():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans', Trans(), [0, 1])])
    # raise correct error when not fitted
    assert_raises(NotFittedError, ct.get_feature_names)
    # raise correct error when no feature names are available
    ct.fit(X_array)
    assert_raise_message(AttributeError,
                         "Transformer trans (type Trans) does not provide "
                         "get_feature_names",
                         ct.get_feature_names)

    # working example
    X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
                  [{'c': 5}, {'c': 6}]], dtype=object).T
    ct = ColumnTransformer(
        [('col' + str(i), DictVectorizer(), i) for i in range(2)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b', 'col1__c'])

    # passthrough transformers not supported
    ct = ColumnTransformer([('trans', 'passthrough', [0, 1])])
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
                           remainder='passthrough')
    ct.fit(X)
    assert_raise_message(
        NotImplementedError, 'get_feature_names is not yet supported',
        ct.get_feature_names)

    # drop transformer
    ct = ColumnTransformer(
        [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b'])
    1.0074863283776458, 0.20239896852403538, -0.043678728558593366,
    -0.13929748680369286, 1.3163604645710438, -0.3699637766938669,
    -0.6149300604558857, -0.854369594993175, 0.263445277972641,
    0.5712416961268142
]

# In[36]:

ct = pd.DataFrame([test_country], columns=df.columns)

# In[37]:

cols2 = df.select_dtypes(['int64', 'float64']).columns
pl = Pipeline(steps=[('imp', SimpleImputer(strategy='median')),
                     ('scaler', StandardScaler())])
tf = ColumnTransformer(transformers=[('number', pl, cols2)], n_jobs=-1)
tf.fit(df)

# In[38]:

def q4():
    res = tf.transform(ct)[0][cols2.get_loc('Arable')]
    return round(float(res), 3)

# ## Question 5
#
# Find the number of outliers in the `Net_migration` variable according to
# the boxplot method, i.e. using the rule:
#
# $$x \notin [Q1 - 1.5 \times \text{IQR}, Q3 + 1.5 \times \text{IQR}] \Rightarrow x \text{ is an outlier}$$
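# A minimal sketch of the boxplot rule stated above, assuming `df` holds the
# countries data; the notebook's actual answer function is not shown in the
# source, so the function name here is hypothetical:

def count_net_migration_outliers():
    col = df['Net_migration'].dropna()
    q1, q3 = col.quantile(0.25), col.quantile(0.75)
    iqr = q3 - q1
    below = col < q1 - 1.5 * iqr   # left of the lower fence
    above = col > q3 + 1.5 * iqr   # right of the upper fence
    return int((below | above).sum())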
admissionData = pd.read_csv("admissions_data.csv")
admissionData = admissionData.drop(["Serial No."], axis=1)
labels = admissionData.iloc[:, -1]

# remove uni rating and TOEFL score - unethical?
# remove serial no. and research - irrelevant info
features = admissionData.iloc[:, [0, 3, 4, 5, 6]]

# split dataset into train and test
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# scale/normalise dataset features
ct = ColumnTransformer([("normalize", Normalizer(), [0, 1, 2, 3])],
                       remainder='passthrough')
features_train = ct.fit_transform(features_train)
features_test = ct.transform(features_test)

learning_rate = 0.001
num_epochs = 20

# create neural network
# admissionsModel = build_model(features_train, learning_rate)
# rewrite this function
# admissionsModel.fit(features_train, labels_train, epochs=20, batch_size=1,
#                     verbose=1)
history1 = fit_model(build_model(features_train, learning_rate),
                     features_train, labels_train, learning_rate, num_epochs)
# need to return the fitted model into a graph somehow here
from core.utils.common_transformers import TypeSelector, FeatureSquarer
from core.utils.common_estimators import RandomBinaryClassifier

# sys.path.append('D:/GitRepos/github/PythonTestCode/prod_test')

# transformer tests
df = pd.DataFrame(data=[[1, 2, 'chad'], [4, 5, 'John']],
                  columns=['col1', 'col2', 'col3'])

float_pipeline = Pipeline(steps=[('float_squarer', FeatureSquarer())])
float_pipeline.fit(df)
float_pipeline.transform(df)

transformer_list = [('float', float_pipeline, ['col1', 'col2'])]
preprocessor = ColumnTransformer(transformer_list)

int_data = Pipeline(steps=[('column_extractor', TypeSelector('int64'))])
int_data.fit_transform(df)

# estimator test
X = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))
y = pd.Series(np.random.choice(['setosa', 'virginica'], 100, p=[0.3, 0.7]))
y_test = pd.Series(np.random.choice(['setosa', 'virginica'], 100,
                                    p=[0.3, 0.7]))

model = RandomBinaryClassifier()
model.fit(X, y)
class DogeDataLoader:
    def __init__(self, filename, categorical_cols, target_col, seq_length,
                 batch_size, preprocessor=True, prediction_window=1):
        '''
        :param filename: path to the csv dataset
        :param categorical_cols: names of the categorical columns, if None
            pass an empty list
        :param target_col: name of the target column
        :param seq_length: window length to use
        :param prediction_window: window length to predict
        :param preprocessor: whether to normalize the data or not
        :param batch_size: batch size
        '''
        self.data = self.read_and_preprocess(filename)
        self.categorical_cols = categorical_cols
        # wrap target_col in a list so set() treats it as a single column
        # name rather than iterating over its characters
        self.numerical_cols = list(
            set(self.data.columns) - set(categorical_cols) -
            set([target_col]))
        self.target_col = target_col
        self.seq_length = seq_length
        self.prediction_window = prediction_window
        self.batch_size = batch_size
        self.preprocessor = preprocessor
        self.preprocess = ColumnTransformer(
            [
                ("scaler", StandardScaler(), self.numerical_cols),
                # ("encoder", OneHotEncoder(), self.categorical_cols)
            ],
            remainder="passthrough")

    def read_and_preprocess(self, filename):
        # Reading
        df = pd.read_csv(filename)
        # Reorder and reset index
        df = df[::-1].reset_index(drop=True)

        # Preprocessing 'Change' column
        df['Change %'] = df['Change %'].str.replace("%", "")
        df['Change %'] = pd.to_numeric(df['Change %'].str.replace(",", ""))

        # Preprocessing 'Vol.' column
        vols = [el for el in df['Vol.']]
        for num, el in enumerate(vols):
            # Check whether the value is quoted in billions
            isB = el[-1] == 'B'
            try:
                el = float(el[:-1])
            except ValueError:
                print("Value Error at row ", num)
                el = vols[num - 1]
            if isB:
                el = el * 1000
            vols[num] = el
        df['Vol.'] = vols

        # Dropping Date column
        df.pop('Date')

        # Done, returning dataframe
        return df

    def preprocess_data(self):
        '''
        Preprocessing function
        '''
        X = self.data.drop(self.target_col, axis=1)
        y = self.data[self.target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.8, shuffle=False)

        if self.preprocessor:
            X_train = self.preprocess.fit_transform(X_train)
            # transform only: re-fitting on the test split would leak test
            # statistics into the scaler
            X_test = self.preprocess.transform(X_test)

        if self.target_col:
            return X_train, X_test, y_train.values, y_test.values
        return X_train, X_test

    def frame_series(self, X, y=None):
        '''
        Prepare the data for time series prediction.

        :param X: set of features
        :param y: target value to predict
        :return: TensorDataset
        '''
        nb_obs, nb_features = X.shape
        features, target = [], []

        for i in range(1, nb_obs - self.seq_length - self.prediction_window):
            features.append(
                torch.FloatTensor(X[i:i + self.seq_length, :]).unsqueeze(0))
        features_var = torch.cat(features)

        if y is not None:
            for i in range(1,
                           nb_obs - self.seq_length - self.prediction_window):
                target.append(
                    torch.tensor(y[i + self.seq_length:
                                   i + self.seq_length +
                                   self.prediction_window]))
            target_var = torch.cat(target)
            return TensorDataset(features_var, target_var)
        return TensorDataset(features_var)

    def get_loaders(self):
        '''
        Preprocess and frame the dataset.

        :return: DataLoaders associated to training and testing data
        '''
        X_train, X_test, y_train, y_test = self.preprocess_data()

        train_dataset = self.frame_series(X_train, y_train)
        test_dataset = self.frame_series(X_test, y_test)

        train_iter = DataLoader(train_dataset, batch_size=self.batch_size,
                                shuffle=False, drop_last=True)
        test_iter = DataLoader(test_dataset, batch_size=self.batch_size,
                               shuffle=False, drop_last=True)
        return train_iter, test_iter
("imputer", SimpleImputer(strategy="constant", fill_value="missing")), ("onehot", OneHotEncoder(handle_unknown="ignore")), ]) # For text, we: # 1. Impute missing values with the string "missing" # 2. Tfidf encode the text, using 1-grams and 2-grams. text_pipeline = Pipeline(steps=[ ("imputer", SimpleImputer(strategy="constant", fill_value="missing")), ("tfidf", MultiColumnTfidfVectorizer(ngram_range=(1, 2))), ]) # Sparse preprocessing pipeline, for models such as Ridge that handle sparse input well sparse_preprocessing_pipeline = ColumnTransformer(transformers=[ ("num", numeric_pipeline, numeric_selector), ("cat", categorical_pipeline, categorical_selector), ("txt", text_pipeline, text_selector), ]) # Modified TruncatedSVD that doesn't fail if n_components > ncols class MyTruncatedSVD(TruncatedSVD): def fit_transform(self, X, y=None): if X.shape[1] <= self.n_components: self.n_components = X.shape[1] - 1 return TruncatedSVD.fit_transform(self, X=X, y=y) # Dense preprocessing pipeline, for models such as XGboost that do not do well with # extremely wide, sparse data # This preprocessing will work with linear models such as Ridge too
])

categorical_features = ['MSZoning', 'Street', 'Alley', 'LotShape',
                        'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
                        'Neighborhood', 'Condition1', 'Condition2',
                        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
                        'Exterior1st', 'Exterior2nd', 'MasVnrType',
                        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
                        'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                        'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
                        'Electrical', 'KitchenQual', 'Functional',
                        'FireplaceQu', 'GarageType', 'GarageFinish',
                        'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
                        'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

categorical_transforms = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transforms, numeric_features),
    ('cat', categorical_transforms, categorical_features)
])

# append classifier to preprocessor
classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=10,
                                          criterion='entropy',
                                          random_state=0))
])

# remove unhelpful columns
drop_labels = ['Id']
X = train_dataset.drop(labels=drop_labels, axis=1)
y = train_dataset['SalePrice']

# # fit local
# Label Encoder
labelEncoder_previsores = LabelEncoder()
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# One Hot Encoder
oneHotEncoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'),
      [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = oneHotEncoder.fit_transform(previsores).toarray()

# y (target)
labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

# Data scaling
##### Partial scaling #####
# scalerCols = previsores[:, 102:]
# scaler = StandardScaler()
# previsores[:, 102:] = scaler.fit_transform(scalerCols)

##### Full scaling #####
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
# %%
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

categorical_processor = OneHotEncoder(handle_unknown="ignore")
numerical_processor = StandardScaler()

# %% [markdown]
# Subsequently, create a `ColumnTransformer` to dispatch the specific columns
# to a preprocessing pipeline.

# %%
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('cat-preprocessor', categorical_processor, categorical_columns),
    ('num-preprocessor', numerical_processor, numerical_columns)
])

# %% [markdown]
# Finally, concatenate the preprocessing pipeline with a logistic regression.

# %%
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocessor, LogisticRegression())

# %% [markdown]
# Use a `RandomizedSearchCV` to find the best set of hyperparameters by tuning
# the following parameters for the `LogisticRegression` model:
# - `C` with values ranging from 0.001 to 10. You can use a log-uniform
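# %% [markdown]
# A minimal sketch of the search described above, assuming `data` and
# `target` are defined elsewhere in the notebook; the distribution and
# iteration count are assumptions, not taken from the source.

# %%
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    # step name generated by make_pipeline, double-underscore parameter
    "logisticregression__C": loguniform(0.001, 10),
}
search = RandomizedSearchCV(model, param_distributions=param_distributions,
                            n_iter=10, random_state=0)
# search.fit(data, target)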
# Apply the transformation to numbers and assign the result
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# Create dummy variables to make better use of the data
from sklearn.compose import ColumnTransformer
column_transform = ColumnTransformer(
    [("encoder", OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')

# Transform the categorical target variable into numbers
labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Generate a training sample - previsores and classe
# Generate a test sample - previsores and classe
from sklearn.model_selection import train_test_split
# Split into input (X) and output (y)
X = df[X_cols]
y = df[y_col]

# Declare the Imputers that will fill in the NaN values
imputer_media = SimpleImputer(strategy='mean')
imputer_moda = SimpleImputer(strategy='most_frequent')

# Declare the Scalers that will standardize the data
scaler_media = StandardScaler()
scaler_moda = StandardScaler()

# Create a ColumnTransformer for the SimpleImputer
imputer = ColumnTransformer([
    ('imputer_media', imputer_media, slice(0, 8)),
    ('imputer_moda', imputer_moda, slice(8, len(X.columns))),
])

# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, slice(0, 8)),
                            ('scaler_moda', scaler_moda,
                             slice(8, len(X.columns)))])

# Create the Pipeline combining the ColumnTransformers and the classifier
pipeline = Pipeline([('imputer', imputer), ('scaler', scaler),
                     ('svm', SVC(random_state=RANDOM_STATE,
                                 class_weight=CLASS_WEIGHT,
                                 probability=True))])

# Inner CV (2-fold, 5-times stratified GridSearchCV to obtain the best
# parameters); a sketch of this setup follows below.
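# A minimal sketch of the inner CV described in the comment above; the
# parameter grid and scoring metric are assumptions, not taken from the
# source:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

inner_cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=5,
                                   random_state=RANDOM_STATE)
grid = GridSearchCV(pipeline,
                    param_grid={'svm__C': [0.1, 1, 10]},  # hypothetical grid
                    cv=inner_cv, scoring='roc_auc')
# grid.fit(X, y)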
def main():
    # Lazy import libraries
    from rlearnlib.utils import (
        predefined_estimators,
        load_training_data,
        save_training_data,
        option_to_list,
        scoring_metrics,
        check_class_weights,
    )
    from rlearnlib.raster import RasterStack

    try:
        import sklearn

        if sklearn.__version__ < "0.20":
            gs.fatal(
                "Package python3-scikit-learn 0.20 or newer is not installed")
    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd
    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator ---------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(model_name, random_state, n_jobs,
                                            hyperparams)

    # remove dict keys that are incompatible with the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options --------------------------------------------------
    if (mode == "classification" and balance is True
            and model_name not in check_class_weights()):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning(
            "Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation global accuracy requires "
                     "cross-validation cv > 1")
        if not os.path.exists(os.path.dirname(classif_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                classif_file))

    # feature importances require a recent scikit-learn version
    if importances:
        if sklearn.__version__ < "0.22":
            gs.fatal("Feature importances calculation requires scikit-learn "
                     "version >= 0.22")

    if fimp_file:
        if importances is False:
            gs.fatal(
                'Output of feature importance requires the "f" flag to be set')
        if not os.path.exists(os.path.dirname(fimp_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                fimp_file))

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal("Output of cross-validation predictions requires "
                     "cross-validation cv > 1")
        if not os.path.exists(os.path.dirname(preds_file)):
            gs.fatal("Directory for output file {} does not exist".format(
                preds_file))

    # define RasterStack -------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ----------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            a = pd.DataFrame({"response": y, "labels": class_labels})
            a = a.drop_duplicates().values
            class_labels = {k: v for (k, v) in a}
    else:
        gs.message("Extracting training data")

        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:
                if mode == "classification":
                    src_cats = {v: k for (k, v, m) in src.cats}
                    class_labels = {k: k for k in np.unique(y)}
                    class_labels.update(src_cats)
                else:
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            if y.dtype == np.object_:
                from sklearn.preprocessing import LabelEncoder

                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal("No training pixels or pixels in imagery group ...check "
                     "computational region")

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(X, y, cat, group_id,
                                          random_state=random_state)

        if save_training != "":
            save_training_data(save_training, X, y, cat, class_labels,
                               group_id, stack.names)

    # cross validation settings ------------------------------------------------
    # inner resampling method (cv=2)
    from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                         GroupKFold, KFold)

    if any(param_grid) is True:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=3)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=3)
        else:
            inner = GroupKFold(n_splits=3)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights -------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        # classes must be the sorted unique labels, not the raw label vector
        class_weights = compute_class_weight(class_weight="balanced",
                                             classes=np.unique(y), y=y)
        fit_params = {"sample_weight": class_weights}
    else:
        class_weights = None
        fit_params = {}

    # preprocessing ------------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("onehot", enc, stack.categorical)])

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                (
                    "scaling",
                    scaler,
                    np.setxor1d(range(stack.count),
                                stack.categorical).astype("int"),
                ),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans),
                              ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if any(param_grid) is True:
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # estimator training -------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))

    if balance is True and group_id is not None:
        estimator.fit(X, y, groups=group_id, **fit_params)
    elif balance is True and group_id is None:
        estimator.fit(X, y, **fit_params)
    else:
        estimator.fit(X, y)

    # message best hyperparameter setup and optionally save using pandas
    if any(param_grid) is True:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") +
             " = " + str(v))
            for (k, v) in estimator.best_params_.items()
        ]

        for i in optimal_pars:
            gs.message(i)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation ---------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        if (mode == "classification"
                and cv > np.histogram(y, bins=np.unique(y))[0].min()):
            gs.message(os.linesep)
            gs.fatal("Number of cv folds is greater than number of samples "
                     "in some classes ")

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (mode == "classification" and len(np.unique(y)) == 2
                and all([0, 1] == np.unique(y))):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        preds = cross_val_predict(estimator, X, y, group_id, cv=outer,
                                  n_jobs=n_jobs, fit_params=fit_params)

        test_idx = [test for train, test in outer.split(X, y)]
        n_fold = np.zeros((0, ))

        for fold in range(outer.get_n_splits()):
            n_fold = np.hstack((n_fold,
                                np.repeat(fold, test_idx[fold].shape[0])))

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}

        preds = pd.DataFrame(data=preds,
                             columns=["y_pred", "y_true", "cat", "fold"])

        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            score_mean = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).mean())

            score_std = (preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])).std())

            gs.message(name + "\t" + str(score_mean.round(3)) + "\t" +
                       str(score_std.round(3)))

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)
            text_file = open(preds_file + "t", "w")
            text_file.write('"Real", "Real", "integer", "integer"')
            text_file.close()

    # feature importances ------------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        feature_names = deepcopy(stack.names)
        feature_names = [i.split("@")[0] for i in feature_names]

        fimp = pd.DataFrame({
            "feature": feature_names,
            "importance": fimp["importances_mean"],
            "std": fimp["importances_std"],
        })

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(row["feature"] + "\t" + str(row["importance"]) +
                       "\t" + str(row["std"]))

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
    0.5712416961268142
]

# In[14]:

test_data = pd.DataFrame([test_country], columns=countries.columns)

# In[15]:

data_features = countries.select_dtypes('number').columns

data_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[('num', data_pipeline,
                                                data_features)],
                                 remainder='drop')
preprocessor.fit(countries)

# In[16]:

def q4():
    arable = preprocessor.transform(test_data)[0][data_features.get_loc(
        'Arable')]
    return float(round(arable, 3))

# ## Question 5
#
band_gap = train_set.drop('Eg(G0W0;eV)', axis=1, inplace=False)
band_gap_label = train_set['Eg(G0W0;eV)'].copy()

# %%
band_gap_num = band_gap.drop('Compound', axis=1)

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])

band_gap_tr = pipe.fit_transform(band_gap_num)

# %%
num_attribs = list(band_gap_num)
cat_attribs = ['Compound']

full_pipe = ColumnTransformer([('num', pipe, num_attribs),
                               ('cat', OrdinalEncoder(), cat_attribs)])

band_gap_prepared = full_pipe.fit_transform(band_gap)

# %%
# OrdinalEncoder().categories
# band_gap_prepared_df = pd.DataFrame(band_gap_prepared)
# band_gap_prepared_df.head(10)

# %%
lin_reg = LinearRegression()
lin_reg.fit(band_gap_prepared, band_gap_label)

# %%
band_gap_prediction = lin_reg.predict(band_gap_prepared)
zip_sample = zip(band_gap_prediction, band_gap_label)
for i, j in zip_sample:
    print(i, j)

bg_mse = mean_squared_error(band_gap_prediction, band_gap_label)
df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) log_scale_transformer = make_pipeline( FunctionTransformer(func=np.log), StandardScaler() ) column_trans = ColumnTransformer( [ ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), ("onehot_categorical", OneHotEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), ("passthrough_numeric", "passthrough", ["BonusMalus"]), ("log_scaled_numeric", log_scale_transformer, ["Density"]), ], remainder="drop", ) X = column_trans.fit_transform(df) # Insurances companies are interested in modeling the Pure Premium, that is # the expected total claim amount per unit of exposure for each policyholder # in their portfolio: df["PurePremium"] = df["ClaimAmount"] / df["Exposure"] # This can be indirectly approximated by a 2-step modeling: the product of the # Frequency times the average claim amount per claim:
num_attr = features_1.dtypes == 'float'
cat_attr = ~num_attr

for i in range(len(feature_nam)):
    if cat_attr[i]:
        print(feature_nam[i])
        val = feature_nam[i]
        features_1[val].fillna(features_1[val].value_counts().index[0],
                               inplace=True)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

preprocess = ColumnTransformer([
    ("num", num_pipeline, num_attr),
    ("cat", OneHotEncoder(), cat_attr),
])

features_prepared = preprocess.fit_transform(features_1)
# transform only, so the second dataset is encoded with the columns and
# statistics learned from the first
features_prepared_2 = preprocess.transform(features_2)

# Set up train and test arrays
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_prepared,
                                                    target_1, random_state=0)

# ====================================================================== #
# Prediction models
print()
print('=============== Prediction Models ===============')
nam_model = []
type_model = []
dataset = pd.read_csv('Bank_Predictions.csv')
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Encoding categorical data
# convert gender and country to numeric data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

label_encoder_x_1 = LabelEncoder()
X[:, 2] = label_encoder_x_1.fit_transform(X[:, 2])

transformer = ColumnTransformer(
    transformers=[
        ("OneHot",         # just a name
         OneHotEncoder(),  # the transformer class
         [1])              # the column(s) to apply it to
    ],
    remainder='passthrough'  # do not apply anything to the remaining columns
)
X = transformer.fit_transform(X.tolist())
# drop the first dummy column to avoid the dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
Sscale = StandardScaler()
filename = cwd + '/default of credit card clients.xls'
nanDict = {}
df = pd.read_excel(filename, header=1, skiprows=0, index_col=0,
                   na_values=nanDict)
df.rename(index=str,
          columns={"default payment next month": "targets"},
          inplace=True)

# Assume that PAY_0 is actually meant to be PAY_1
print(df.columns)  # there is no PAY_1 -->

# Features and targets
X = df.loc[:, df.columns != 'targets'].values
y = df.loc[:, df.columns == 'targets'].values

# Categorical variables to one-hots
onehotencoder = OneHotEncoder(categories="auto")

X = ColumnTransformer([
    ("", onehotencoder, [3]),
], remainder="passthrough").fit_transform(X)
print(X)
# def dense_identity(X):
#     return X.todense()

text_features = ['text_feat']
text_transformer = Pipeline(steps=[('vec', CountVectorizer())])
# ('to_dense', FunctionTransformer(func=dense_identity, validate=True,
#                                  accept_sparse=True))])

numeric_features = ['numeric_feat']  # ['mkt_ret']
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0.)),
           ('scaler', StandardScaler())])

# combine features preprocessing
preprocessor = ColumnTransformer(
    transformers=[('text', text_transformer, 'text_feat'),
                  ('num', numeric_transformer, numeric_features)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

X1 = pipeline.fit_transform(X)
print('Expected (2, 13), got', X1.shape)

X2 = text_transformer.fit_transform(X['text_feat'])
print('Single pipeline works as expected:', X2.shape)

#%%
data = pd.DataFrame(
    data={
        'text_feat': ['This is my first sentence.', 'This is my second.'],
        'numeric_feat': [1, 2],
def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array),
                       X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'
# create a dummy model and evaluate it as a baseline
model = DummyClassifier(strategy='constant', constant=1)
scores = evaluate_model(X, y, model)
print(X.shape, y.shape, Counter(y))
print('Dummy Classifier:')
print('Mean F2: %.3f (%.3f)' % (mean(scores), std(scores)))

models, names = get_models()
results = list()
# evaluate each model and print results
for i in range(len(models)):
    # one-hot encode categorical columns, normalize numerical columns
    ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix),
                            ('n', MinMaxScaler(), num_ix)])
    # wrap the model in a pipeline
    pipeline = Pipeline(steps=[('t', ct), ('m', models[i])])
    # evaluate the model and store results
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)
    # summarize and store
    print('>%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))

# boxplot of results
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

results = list()
models, names = get_under_sample_models()
for i in range(len(models)):
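
# evaluate_model() and get_models() are defined elsewhere in this script.
# A plausible sketch of the evaluator, assuming "F2" means fbeta_score with
# beta=2 under repeated stratified k-fold (the fold counts are assumptions):
from numpy import mean, std
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

def evaluate_model(X, y, model):
    # F2 weighs recall more heavily than precision (beta=2)
    f2 = make_scorer(fbeta_score, beta=2)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring=f2, cv=cv, n_jobs=-1)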
# convert texts to numbers
housing_cat = housing[['ocean_proximity']]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
ordinal_encoder.categories_

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    # ('attribs_adder', Combined)
    ('std_scaler', StandardScaler())
])

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
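
# A quick sanity check would reuse the objects fitted above on a few
# training rows; a sketch (housing and housing_labels are assumed to come
# from earlier in this script):
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))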
train_set, test_set = train_test_split(df, test_size=0.2)
test_set.to_csv("test_set.csv")
#%%
# Vectorization for categorical data:
# categorical fields to be encoded with the one-hot method
loans_cat_1hot = ["flag_fthb", "ppmt_pnlty", "st"]

# Normalization for numerical data
# (note: Normalizer rescales each sample/row to unit norm, not each feature)
loans_num_norm = [
    "orig_loan_term", "loan_age", "fico", "mi_pct", "cltv", "dti", "ltv",
    "int_rt", "current_int_rt", "median_income", "unemployment_rate",
    "house_index"
]

# Preprocess, then resample the data
pipeline = ColumnTransformer([("num", Normalizer(), loans_num_norm),
                              ("cat", OneHotEncoder(), loans_cat_1hot)])
X_train = pipeline.fit_transform(train_set)

smte = SMOTE(random_state=42, k_neighbors=3)
res_train, res_target = smte.fit_resample(X_train, train_set["time_to_d"])
res_target = res_target.reshape(-1, 1)
res_target = OneHotEncoder(
    categories="auto").fit_transform(res_target).toarray()

# %%
# save resampled data
np.save("res_train.npy", res_train.toarray())
np.save("res_target.npy", res_target)
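
# A quick check that SMOTE actually balanced the classes; res_target is
# already one-hot encoded at this point, so compare class-index counts
# (a sketch, not part of the original script):
from collections import Counter
print("original:  ", Counter(train_set["time_to_d"]))
print("resampled: ", Counter(res_target.argmax(axis=1)))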
train = pd.merge(train, u_user, how='left', left_on='user', right_on='u_id')
train.drop('u_id', axis=1, inplace=True)
train = pd.merge(train, u_item, how='left', left_on='item', right_on='m_id')
train.drop('m_id', axis=1, inplace=True)
train.drop(['user', 'item'], axis=1, inplace=True)
# %%
test = pd.merge(test, u_user, how='left', left_on='user', right_on='u_id')
test.drop('u_id', axis=1, inplace=True)
test = pd.merge(test, u_item, how='left', left_on='item', right_on='m_id')
test.drop('m_id', axis=1, inplace=True)
test.drop(['user', 'item'], axis=1, inplace=True)
# %%
ct = ColumnTransformer([
    # ('u_i_onehot', OneHotEncoder(categories=[range(1, n_user + 1),
    #                                          range(1, n_item + 1)],
    #                              sparse=False, dtype=int),
    #  ['user', 'item']),
    ('gender_onehot', OneHotEncoder(dtype=int, sparse=False),
     ['gender', 'occupation', 'zip_code'])
], remainder='passthrough')
ct.fit(train)
X_train = ct.transform(train)
X_test = ct.transform(test)
# %%
# feature dimension and the dimension of V
n_feature = X_train.shape[1]
k = 10
# %%
# define the weights
w0 = tf.Variable(initial_value=tf.truncated_normal(shape=[1]), name='w0')
w = tf.Variable(initial_value=tf.truncated_normal(shape=[n_feature]), name='w')
V = tf.Variable(initial_value=tf.truncated_normal(shape=[k, n_feature]),
                name='V')
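
# w0, w and V are the parameters of a factorization machine. The usual
# O(kn) trick for the pairwise term is
#   sum_{i<j} <v_i, v_j> x_i x_j
#     = 0.5 * sum_f [ (sum_i V[f,i] x_i)^2 - sum_i V[f,i]^2 x_i^2 ].
# A sketch of the forward pass in the same TF1 style (the placeholder X is
# an assumption, not part of the original script):
X = tf.placeholder(tf.float32, shape=[None, n_feature], name='X')
linear = w0 + tf.reduce_sum(tf.multiply(w, X), axis=1)  # w0 + <w, x>
interactions = 0.5 * tf.reduce_sum(
    tf.square(tf.matmul(X, tf.transpose(V))) -
    tf.matmul(tf.square(X), tf.transpose(tf.square(V))),
    axis=1)  # pairwise interaction term
y_hat = linear + interactions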
def binning(df, **kwargs):
    return df.apply(pd.cut, axis=0, **kwargs)

winddir_discretizer = Pipeline([
    ('binning', FunctionTransformer(binning, kw_args={
        'bins': [0, 45, 90, 135, 180, 225, 270, 315, 360],
        'retbins': False
    })),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('minmax', MinMaxScaler(), ['sum_wind_last_3_hours']),
    ('winddir_discretizer', winddir_discretizer, ['wind_direction'])
], remainder=StandardScaler())

pipe = Pipeline([('preprocess', preprocessor),
                 ('forest', RandomForestRegressor(max_depth=70,
                                                  n_estimators=200,
                                                  n_jobs=-1))])

param_grid = {
    'forest__max_depth': [20, 100],
    'forest__n_estimators': [200, 1000]
}
model = GridSearchCV(pipe, param_grid, n_jobs=-1)
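
# The grid search is only constructed above; a sketch of fitting it
# (X_train / y_train are assumptions: a frame with the
# 'sum_wind_last_3_hours' and 'wind_direction' columns, plus a target):
model.fit(X_train, y_train)
print(model.best_params_)
print(model.best_score_)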
]

# get numerical columns
numerical_cols = [
    cname for cname in x_train.columns
    if x_train[cname].dtype in ['int64', 'float64']
]

# preprocessing for numerical data
numerical_transformer = SimpleImputer()

# preprocessing for categorical data
categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='most_frequent')),
           ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_cols),
                  ('cat', categorical_transformer, categorical_cols)])

model = RandomForestRegressor()
clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
clf.fit(x_train, y_train)
preds = clf.predict(test)

# output model results
output = pd.DataFrame({'Id': test.index, 'SalePrice': preds})
output.to_csv('submission.csv', index=False)
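
# Before relying on the submission file, a hold-out estimate helps; a sketch
# with cross-validated MAE (that this is the competition metric is an
# assumption):
from sklearn.model_selection import cross_val_score
scores = -cross_val_score(clf, x_train, y_train,
                          scoring='neg_mean_absolute_error', cv=5)
print('MAE: %.0f (+/- %.0f)' % (scores.mean(), scores.std()))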
def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based
        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),
        # int keys: positional
        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_df['first'],
                     transformer_weights['trans2'] * X_df['second']]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert both.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert both.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through
    class TransAssert(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert_true(isinstance(X, (pd.DataFrame, pd.Series)))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
X[:, 2] = LabelEncoder_X.fit_transform(X[:, 2])
X[:, 4] = LabelEncoder_X.fit_transform(X[:, 4])
X[:, 5] = LabelEncoder_X.fit_transform(X[:, 5])
X[:, 7] = LabelEncoder_X.fit_transform(X[:, 7])
X[:, 8] = LabelEncoder_X.fit_transform(X[:, 8])
X[:, 10] = LabelEncoder_X.fit_transform(X[:, 10])
X[:, 12] = LabelEncoder_X.fit_transform(X[:, 12])

# One-hot encode column 5, then drop the first dummy column
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([('encoder', OneHotEncoder(), [5])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
X = X[:, 1:]

# One-hot encode column 8; note that the previous transform reordered the
# columns (encoded first, passthrough after) and dropped one, so this index
# refers to the transformed array, not to column 8 of the original data
ct = ColumnTransformer([('encoder', OneHotEncoder(), [8])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype=float)
X = X[:, 1:]

# Splitting the data into train and test sets
from sklearn.model_selection import train_test_split
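
# Encoding both columns in a single ColumnTransformer sidesteps the index
# shifting entirely, since both indices then refer to the same
# (label-encoded) array; with drop='first' (scikit-learn >= 0.21) the
# manual dummy-trap slicing is also unnecessary. A sketch, not the
# original author's code:
ct_both = ColumnTransformer(
    [('enc5', OneHotEncoder(drop='first'), [5]),
     ('enc8', OneHotEncoder(drop='first'), [8])],
    remainder='passthrough')
X_encoded = ct_both.fit_transform(X)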
# In[8]: data.drop('bool_of_active', axis=1, inplace=True) data # In[9]: data.drop('step_count', axis=1, inplace=True) data # In[10]: from sklearn.compose import ColumnTransformer ct = ColumnTransformer( [("mood", OneHotEncoder(), [0])], remainder="passthrough" ) # The last arg ([0]) is the list of columns you want to transform in this step x = ct.fit_transform(data) x # In[11]: from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0) # In[12]: from sklearn.naive_bayes import GaussianNB
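
# In[13]:

# Cell 12 only imports GaussianNB; a sketch of the likely next cell
# (the variable names here are assumptions):
classifier = GaussianNB()
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))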
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Encoding Categorical Data
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))

# avoiding the dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
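
# A sketch of the prediction step the final comment announces:
y_pred = regressor.predict(X_test)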
# missing data management
# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])],
                                      remainder='passthrough')
X = np.array(columnTransformer.fit_transform(X), dtype=float)

LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)
# columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])],
#                                       remainder='passthrough')
# y = np.array(columnTransformer.fit_transform(y), dtype=float)

# Splitting datasets into training and test sets
# (note: train size + test size = 1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
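
# The scaler is only constructed above; a sketch of the usual next step
# (fit on the training set only, then apply the same transform to test):
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)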