def fit(self, X, y=None):
    self.encoder = OrdinalEncoder(categories=[self.order + list(self.undefined)])
    self.encoder.fit([[x] for x in self.order + list(self.undefined)])  # Argument irrelevant
    return self
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

scale = StandardScaler()

# The data is available in a GitHub repository
file = "https://raw.githubusercontent.com/fhernanb/datos/master/iris.txt"
datos = pd.read_csv(file, sep='\t')
datos.head()

# Select the variables of interest
datos = datos[["Species", "Sepal.Width", "Sepal.Length"]]

# Convert the Species variable to numeric
enc = OrdinalEncoder()
enc.fit(datos[["Species"]])
datos["y"] = enc.transform(datos[["Species"]])

# Exploring the response variable
import seaborn as sns
sns.countplot(x='Species', data=datos)

# Creating X and y
y = datos["y"]
X = datos[["Sepal.Length", "Sepal.Width"]]

# Scaling the values of X
scaledX = scale.fit_transform(X)

# Creating train and test
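# Hypothetical continuation (the original snippet stops at the comment above):
# a conventional train/test split of the scaled features; the split ratio and
# random seed are assumptions for the example.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    scaledX, y, test_size=0.3, random_state=42)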
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE, mean_absolute_error as MAE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

outback = pd.read_csv('../data/outback.csv')

ordinal = ['condition', 'title_status']  # Ordinal Encoder
categorical = ['cylinders', 'fuel', 'transmission', 'paint_color', 'model']  # OHE
numerical = ['year', 'miles']

y = outback.USD
X = outback.drop('USD', axis=1)

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = ColumnTransformer(transformers=[
    ('ordinalEncoder', encoder, ordinal),
    ('oneHotEncoder', ohe, categorical)
], remainder='passthrough')

regressor = RandomForestRegressor(n_estimators=20)
pipe = Pipeline(steps=[('preprocess', ct), ('model', regressor)])

parameters = {
    'model__n_estimators': list(range(5, 110, 5)),
# Alternative 3 - Fill the missing values with the median of the data
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True)

# The next step is to convert the text attributes to numbers, because that is
# what machine learning models work with. There are two ways to do this.
# First way - assign each category an integer, depending on how many
# categories there are. The problem is that models usually assume that the
# further apart two numbers are, the less similar the categories are.
from sklearn.preprocessing import OrdinalEncoder

housing_cat = housing[["ocean_proximity"]]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
""" print(housing_cat_encoded[:10]) """

# Now we use another method, OneHotEncoder: it creates a vector with as many
# entries as there are categories for the attribute, sets the entry of the
# matching category to one and all the others to zero.
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()
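# Illustrative sketch (not part of the original notebook): the same toy column
# encoded both ways, to make the distance caveat above concrete. The toy
# values below are assumptions for the example.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

toy = np.array([["INLAND"], ["NEAR BAY"], ["ISLAND"], ["INLAND"]])
print(OrdinalEncoder().fit_transform(toy))            # [[0.], [2.], [1.], [0.]]
print(OneHotEncoder().fit_transform(toy).toarray())   # one 0/1 column per category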
def test_ordinal_encoder_raise_categories_shape():
    cats = ['Low', 'Medium', 'High']
    msg = ("Categories are expected to be either list or array-like, but ")
    with pytest.raises(TypeError, match=msg):
        _ = OrdinalEncoder(categories=cats)
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

# data/default_of_credit_card_clients/default_of_credit_card_clients.xls
dataset = pd.read_excel(
    r'..\..\data\default_of_credit_card_clients\default_of_credit_card_clients.xls',
    skiprows=1)
dataset.pop('ID')
y = LabelEncoder().fit_transform(
    dataset.pop('default payment next month').values)

cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value=-99))
# This is for training
ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
# This is for testing
oe_step = ('le', OrdinalEncoder())
num_si_step = ('si', SimpleImputer(strategy='median'))
sc_step = ('sc', StandardScaler())

cat_pipe = Pipeline([cat_si_step, ohe_step])
num_pipe = Pipeline([num_si_step, sc_step])
bin_pipe = Pipeline([oe_step])

transformers = [
    ('cat', cat_pipe, [
        'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
        'PAY_6'
    ]),
    ('num', num_pipe, [
        'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
        'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
    return (jobs.reshape((len(jobs), 1)))


def sector(df, sector_dict):
    sectors = df.EmploymentSector.apply(lambda x: sector_dict[x]).to_numpy()
    return (sectors.reshape((len(sectors), 1)))


transformer_JobTitle = FunctionTransformer(lambda df: JobTitle(df, job_dict))
transformer_sector = FunctionTransformer(lambda df: sector(df, sector_dict))

# categorical columns with ordinal encoding
cat_cols = ["ManageStaff", "EmploymentStatus"]
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='constant'),
    OrdinalEncoder(),
)
transformer_nk = make_column_transformer(
    (cat_pipeline, cat_cols),
    (transformer_JobTitle, ["JobTitle"]),
    (transformer_sector, ["EmploymentSector"]))


def get_elements(key):
    L = key.split(",")
    Result = []
    i = 0
    while i < len(L):
        word = L[i]
        if word.find("(") != -1:
            while word.find(")") == -1:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)

gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(categorical_features=range(4)),
)

# %%
#
# Let's evaluate our gradient boosting model with the mean absolute error of
# the relative demand averaged across our 5 time-based cross-validation splits:
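# (Sketch under assumptions: the original example defines the feature matrix X
# and the target y earlier; the time-based splitter below is illustrative.)
from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(n_splits=5)  # assumed 5 time-ordered splits
cv_results = cross_validate(
    gbrt_pipeline, X, y,
    cv=ts_cv,
    scoring="neg_mean_absolute_error",
)
mae = -cv_results["test_score"]
print(f"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}")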
    ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, '坏瓜'],
    ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, '坏瓜'],
    ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, '坏瓜'],
    ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, '坏瓜'],
    ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, '坏瓜'],
    ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, '坏瓜'],
    ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, '坏瓜']]

# List of feature names
labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感', '密度', '含糖率']

# Build the feature matrix and the label vector
X = np.array(dataSet)[:, :8]
Y = np.array(dataSet)[:, 8]

# Encode X
from sklearn.preprocessing import OrdinalEncoder
oriencode = OrdinalEncoder(categories='auto')
oriencode.fit(X[:, :6])
Xdata = oriencode.transform(X[:, :6])  # encoded data
print(oriencode.categories_)  # inspect the category labels
Xdata = np.hstack((Xdata, X[:, 6:].astype(float)))

# Encode Y
from sklearn.preprocessing import LabelEncoder
labelencode = LabelEncoder()
labelencode.fit(Y)
Ylabel = labelencode.transform(Y)  # the encoded labels
labelencode.classes_  # inspect the class labels
labelencode.inverse_transform(Ylabel)  # recover the original labels

X = pd.DataFrame(Xdata, columns=labels)
Y = pd.Series(Ylabel)
#     print('Number of non-attacks: ', y.value_counts()[0])
#     print('Number of attacks: ', y.value_counts()[1])
# else:
#     # Undersampling
#     ros = RandomUnderSampler(random_state=42)
#     X, y = ros.fit_resample(X, y)
#     print('Number of non-attacks: ', y.value_counts()[0])
#     print('Number of attacks: ', y.value_counts()[1])

###############################################################################
### Encode categorical features
print('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder()
df['flg'] = my_encoder.fit_transform(df['flg'].values.reshape(-1, 1))
df['pr'] = my_encoder.fit_transform(df['pr'].values.reshape(-1, 1))
print('Objects:', list(df.select_dtypes(['object']).columns))

###############################################################################
## Quick sanity check
###############################################################################
display_general_information(df)

###############################################################################
## Split dataset into train and test sets
###############################################################################
### Dataset too big? Drop, uncomment the next lines.
print('Shape of the data:', data_all.shape)
delete = ['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'id_name',
          'latest_query_time', 'source', 'loans_latest_time',
          'first_transaction_time']
data_all = data_all.drop(delete, axis=1)
print('Shape after dropping unused columns:', data_all.shape)
data_all = data_all.drop(['student_feature'], axis=1)

from sklearn.impute import SimpleImputer
for i in range(data_all.shape[1]):
    feature = data_all.iloc[:, i].values.reshape(-1, 1)  # feature matrices in sklearn must be 2-D
    imp_mode = SimpleImputer(strategy='most_frequent')
    data_all.iloc[:, i] = imp_mode.fit_transform(feature)

from sklearn.preprocessing import OrdinalEncoder
data_all['reg_preference_for_trad'] = OrdinalEncoder().fit_transform(
    data_all['reg_preference_for_trad'].values.reshape(-1, 1))

# Find the index of the label column
for i in range(data_all.shape[1]):
    if data_all.columns[i] == 'status':
        print(i)
y = data_all.iloc[:, 38]
X = data_all.drop(['status'], axis=1)

# Split the dataset
X = data_all.drop(['status'], axis=1)
y = data_all['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
def test_date_pipeline():
    df = create_dataset(num=0, cat=0, date=5, target=False, size=5000)
    train_df = df.iloc[:-1000]
    test_df = df.iloc[-1000:]
    date_pipeline = Pipeline(steps=[
        Step("date", DateFeatures()),
        Step(
            "derived_processing",
            ColumnsProcessor(branches=[
                Step("num_derived", Wrap(StandardScaler()), types=[VarType.NUM]),
                Step(
                    "cat_derived",
                    OrdCat(min_support=0, use_other=False),
                    types=[VarType.CAT],
                ),
            ]),
        ),
    ])
    train = date_pipeline.fit_transform(to_task_data(train_df))
    test = date_pipeline.transform(to_task_data(test_df))
    for data in [train, test]:
        assert data.column_types[:5] == [ColumnType(VarType.NUM)] * 5
        assert set([c.var_type for c in data.column_types[5:]]) == set([VarType.CAT])
        assert all([c.level > 0 for c in data.column_types[5:]])

    date_features = DateFeatures()
    dates_train = date_features.fit_transform(to_task_data(train_df))
    dates_test = date_features.transform(to_task_data(test_df))
    num_train = take_columns(dates_train, types=[VarType.NUM])
    cat_train = take_columns(dates_train, types=[VarType.CAT])
    scaler = StandardScaler()
    enc = OrdinalEncoder()
    num_train = scaler.fit_transform(num_train.X)
    cat_train = enc.fit_transform(cat_train.X)
    cat_train = cat_train + 1
    assert np.all(
        np.isclose(num_train, take_columns(train, types=[VarType.NUM]).X))
    assert np.all(
        np.isclose(cat_train, take_columns(train, types=[VarType.CAT]).X))

    num_test = take_columns(dates_test, types=[VarType.NUM])
    cat_test = take_columns(dates_test, types=[VarType.CAT])
    num_test = scaler.transform(num_test.X)
    cat_test = enc.transform(cat_test.X)
    cat_test = cat_test + 1
    assert np.all(
        np.isclose(num_test, take_columns(test, types=[VarType.NUM]).X))
    assert np.all(
        np.isclose(cat_test, take_columns(test, types=[VarType.CAT]).X))
    keras.layers.Dense(50, activation='relu'),
    keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model4.compile('adam', 'binary_crossentropy')
model4.fit(xtrain.values, ytrain, validation_data=(xtest, ytest), epochs=100, batch_size=32)
eval_model(ytest, model4.predict_classes(xtest.values))

#%%
from sklearn.preprocessing import OrdinalEncoder

df['international_plan'] = df['international_plan'].map({'No': 0, 'Yes': 1})
df['voice_mail_plan'] = df['voice_mail_plan'].map({'No': 0, 'Yes': 1})
xtrain, xtest, ytrain, ytest = train_test_split(df.drop('churn', axis=1), df.churn)
xtrain_state = xtrain['state']
xtest_state = xtest['state']
oe = OrdinalEncoder()
xtrain_state = oe.fit_transform(xtrain_state.values.reshape(-1, 1))
xtest_state = oe.transform(xtest_state.values.reshape(-1, 1))
xtrain = xtrain.drop('state', axis=1)
xtest = xtest.drop('state', axis=1)

#%%
import keras

state_in = keras.Input(shape=(1,))
rest_in = keras.Input(shape=(18,))
emb = keras.layers.Embedding(51, 5)(state_in)
emb_reshaped = keras.layers.Reshape((5,))(emb)
concat = keras.layers.Concatenate()([emb_reshaped, rest_in])
d1 = keras.layers.Dense(50, activation='relu')(concat)
d2 = keras.layers.Dense(25, activation='relu')(d1)
out = keras.layers.Dense(1, activation='sigmoid')(d2)
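#%%
# Hypothetical continuation (not in the original cells): wire the functional
# graph above into a keras.Model and train it on the two-input data; the name
# model5 and the training settings are assumptions.
model5 = keras.Model(inputs=[state_in, rest_in], outputs=out)
model5.compile('adam', 'binary_crossentropy')
model5.fit([xtrain_state, xtrain.values], ytrain,
           validation_data=([xtest_state, xtest.values], ytest),
           epochs=100, batch_size=32)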
def fit(self, X, y=None):
    self.encoder = OrdinalEncoder(categories=[self.values + list(self.undefined)], dtype=int)
    self.encoder.fit([[x] for x in self.values + list(self.undefined)])  # Argument irrelevant
    return self
def encode_objects(X_train):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    return X_train_enc
              'Native Hawaiian/Oth Pac Island': '5',
              'Not Applicable': '-1',
              'NHISP': '3',
              'not available': '-1',
              '2+RACE': '6',
              ' ': '-1'}
enc_ethnicity = ethnicity.copy()
for key, val in D_matching.items():
    enc_ethnicity[ethnicity == key] = val
enc_ethnicity = enc_ethnicity.to_numpy(dtype=int)[p]

employer = Xg[:, 301:305]
enc_employer = (employer * np.arange(1, 5)).sum(axis=1)
clean_jt_enc = OrdinalEncoder().fit_transform(clean_jt[:, None])[:, 0]

Xg_matching = np.zeros((len(Xg), 4))
Xg_matching[:, 0] = clean_jt_enc
Xg_matching[:, 1] = Xg[:, 300]
Xg_matching[:, 2] = enc_employer
Xg_matching[:, 3] = enc_ethnicity

Xg_fuzzy = Xg[:, :303].copy()
Xg_fuzzy[:, 301] = enc_employer
Xg_fuzzy[:, 302] = enc_ethnicity

# ------------- ATE estimation with machine-learning models ------------- #

def AIPW_estimator(ms, mg, Xs, Xg, ys, yg):
    """
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(dtype=int)
transformed = oe.fit_transform(X_train_census[categorical_features])
transformed = pd.DataFrame(data=transformed,
                           columns=categorical_features,
                           index=X_train_census.index)
# In[19]:

# Augment training data
print("Augment data...")
df_augm_x, augm_y = DuplicateData(BattleResults, y)

# In[22]:

# Define preprocessing pipeline and fit sklearn scalers to data
numerical_attributes = [
    'HP_1', 'Attack_1', 'Defense_1', 'Sp_Atk_1', 'Sp_Def_1', 'Speed_1',
    'HP_2', 'Attack_2', 'Defense_2', 'Sp_Atk_2', 'Sp_Def_2', 'Speed_2'
]
type_attributes = ['Type1_1', 'Type2_1', 'Type1_2', 'Type2_2']
TypeEncoder = OrdinalEncoder()

full_pipeline = Pipeline([
    ("Add Types", AddPokemonTypes()),
    (
        "Individual Feature Preprocessing",
        ColumnTransformer([
            ("Drop", "drop",
             ['Name_1', 'Name_2', 'Price_1', 'Price_2', 'BattleResult']),
            ("Numerical Attributes", StandardScaler(), numerical_attributes),  # Faulty when not using augmented data
            ("Boolean", "passthrough", ['Legendary_1', 'Legendary_2']),
            ("Level", LevelScaler(), ['Level_1', 'Level_2']),
            ("Weather", OrdinalEncoder(), ['WeatherAndTime']),
            ("Types", TypeEncoder, type_attributes)
        ])),
def main():
    config = Configuration()  # Get config for data directory

    checker = ConfigChecker(config, None, 'preprocessing', training=None)
    checker.pre_init_checks()

    config.import_timestamps()
    number_data_sets = len(config.datasets)

    # list of all examples
    examples: [np.ndarray] = []
    labels_of_examples: [str] = []
    failure_times_of_examples: [str] = []
    window_times_of_examples: [str] = []

    attributes = None

    for i in range(number_data_sets):
        print('\n\nImporting dataframe ' + str(i) + '/' + str(number_data_sets - 1) + ' from file')

        # read the imported dataframe from the saved file
        path_to_file = config.datasets[i][0] + config.filename_pkl_cleaned

        with open(path_to_file, 'rb') as f:
            df: pd.DataFrame = pickle.load(f)

        # cleaning moved to separate script because of computational demands
        # df = clean_up_dataframe(df, config)

        # split the dataframe into the configured cases
        cases_df, labels_df, failures_df = split_by_cases(df, i, config)
        print("cases_df: ", len(cases_df))
        print("labels_df: ", len(labels_df))
        print("failures_df: ", len(failures_df), ": ", failures_df)

        if i == 0:
            attributes = np.stack(df.columns, axis=0)

        del df
        gc.collect()

        # split the case into examples, which are added to the list of all examples
        number_cases = len(cases_df)
        for y in range(number_cases):
            df = cases_df[y]

            if len(df) <= 0:
                print(i, y, 'empty')
                print("df: ", df)
                continue

            start = df.index[0]
            end = df.index[-1]
            secs = (end - start).total_seconds()
            print('\nSplitting case', y, '/', number_cases - 1, 'into examples. Length:', secs,
                  " start: ", start, " end: ", end)
            split_into_examples(df, labels_df[y], examples, labels_of_examples,
                                config.time_series_length, config.interval_in_seconds, config,
                                failure_times_of_examples, failures_df[y],
                                window_times_of_examples, y, i)
        del cases_df, labels_df, failures_df
        gc.collect()

    # convert lists of arrays to numpy array
    examples_array = np.stack(examples, axis=0)
    labels_array = np.stack(labels_of_examples, axis=0)
    failure_times_array = np.stack(failure_times_of_examples, axis=0)
    window_times_array = np.stack(window_times_of_examples, axis=0)

    del examples, labels_of_examples, failure_times_of_examples, window_times_of_examples
    gc.collect()

    # print("config.use_over_lapping_windows: ", config.use_over_lapping_windows)
    if config.use_over_lapping_windows:
        print('\nExecute train/test split with failure case consideration')
        # define groups for GroupShuffleSplit
        enc = OrdinalEncoder()
        enc.fit(failure_times_array.reshape(-1, 1))
        failure_times_array_groups = enc.transform(failure_times_array.reshape(-1, 1))
        # print("groups: ", failure_times_array_groups)
        # group_kfold = GroupKFold(n_splits=2)
        gss = GroupShuffleSplit(n_splits=1, test_size=config.test_split_size,
                                random_state=config.random_seed)

        for train_idx, test_idx in gss.split(examples_array, labels_array,
                                             failure_times_array_groups):
            print("TRAIN:", train_idx, "TEST:", test_idx)
            # split_idx in gss.split(examples_array, labels_array, failure_times_array_groups)
            # train_idx = split_idx[0]
            # test_idx = split_idx[1]
            # print("train_idx:", train_idx)
            x_train, x_test = examples_array[train_idx], examples_array[test_idx]
            y_train, y_test = labels_array[train_idx], labels_array[test_idx]
            failure_times_train, failure_times_test = failure_times_array[train_idx], failure_times_array[test_idx]
            window_times_train, window_times_test = window_times_array[train_idx], window_times_array[test_idx]

            print("X_train: ", x_train.shape, " X_test: ", x_test.shape)
            print("Y_train: ", y_train.shape, " Y_test: ", y_test.shape)
            print("Failure_times_train: ",
                  failure_times_train.shape, " Failure_times_test: ", failure_times_test.shape)
            print("Window_times_train: ", window_times_train.shape,
                  " Window_times_test: ", window_times_test.shape)

            print("Classes in the train set: ", np.unique(y_train))
            print("Classes in the test set: ", np.unique(y_test))
            # print("Classes in train and test set: ", np.unique(np.concatenate(y_train, y_test)))
    else:
        # split into train and test data set
        print('\nExecute train/test split')
        x_train, x_test, y_train, y_test = train_test_split(
            examples_array, labels_array, test_size=config.test_split_size,
            random_state=config.random_seed)

    # Sort both datasets by the cases for easier handling
    '''
    x_train = x_train[y_train.argsort()]
    y_train = np.sort(y_train)
    x_test = x_test[y_test.argsort()]
    y_test = np.sort(y_test)
    '''

    print('Training data set shape: ', x_train.shape)
    print('Training label set shape: ', y_train.shape)
    print('Test data set shape: ', x_test.shape)
    print('Test label set shape: ', y_test.shape, '\n')

    # normalize each sensor stream to contain values in [0,1]
    x_train, x_test = normalise(x_train, x_test, config)

    x_train, x_test = x_train.astype('float32'), x_test.astype('float32')

    # save the np arrays
    print('\nSave to np arrays in ' + config.training_data_folder)
    print('Step 1/5')
    np.save(config.training_data_folder + 'train_features_4_.npy', x_train)
    print('Step 2/5')
    np.save(config.training_data_folder + 'test_features_4_.npy', x_test)
    print('Step 3/5')
    np.save(config.training_data_folder + 'train_labels_4_.npy', y_train)
    print('Step 4/5')
    np.save(config.training_data_folder + 'test_labels_4_.npy', y_test)
    print('Step 5/5')
    np.save(config.training_data_folder + 'feature_names_4_.npy', attributes)
    print()

    if config.use_over_lapping_windows:
        print('Saving additional data if overlapping windows are used')
        # Contains the associated time of a failure (if not no failure) for each example
        print('Step 1/4')
        np.save(config.training_data_folder + 'train_failure_times_4_.npy', failure_times_train)
        print('Step 2/4')
        np.save(config.training_data_folder + 'test_failure_times_4_.npy', failure_times_test)
        print('Step 3/4')
        # Contains the start and end time stamp for each training example
        np.save(config.training_data_folder + 'train_window_times_4_.npy', window_times_train)
        print('Step 4/4')
        np.save(config.training_data_folder + 'test_window_times_4_.npy', window_times_test)
# %%
# We define a predictive model based on a random forest. Therefore, we will
# apply the following preprocessing steps:
#
# - use :class:`~sklearn.preprocessing.OrdinalEncoder` to encode the
#   categorical features;
# - use :class:`~sklearn.impute.SimpleImputer` to fill missing values for
#   numerical features using a mean strategy.

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                     unknown_value=-1,
                                     encoded_missing_value=-1)
numerical_pipe = SimpleImputer(strategy="mean")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

rf = Pipeline([
    ("preprocess", preprocessing),
    ("classifier", RandomForestClassifier(random_state=42)),
])
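# %%
# Sketch of how the pipeline above would typically be used; the names X_train,
# X_test, y_train, y_test are assumed to come from an earlier train/test split.
rf.fit(X_train, y_train)
print(f"RF train accuracy: {rf.score(X_train, y_train):.3f}")
print(f"RF test accuracy: {rf.score(X_test, y_test):.3f}")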
def encode_feature(df, feature):
    df[feature] = df[feature].fillna("Unknown")
    ord_enc = OrdinalEncoder()
    return ord_enc.fit_transform(df[[feature]]).reshape((len(df[[feature]]), 1))
adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0)

# This line is currently required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42))
])
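# %%
# Sketch (not part of the original cell): fit the full pipeline on the training
# split defined above and report its accuracy on the held-out split.
model.fit(data_train, target_train)
print(f"Test accuracy: {model.score(data_test, target_test):.3f}")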
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
hist_one_hot = make_pipeline(one_hot_encoder,
                             HistGradientBoostingRegressor(random_state=42))

# %%
# Gradient boosting estimator with ordinal encoding
# -------------------------------------------------
# Next, we create a pipeline that will treat categorical features as if they
# were ordered quantities, i.e. the categories will be encoded as 0, 1, 2,
# etc., and treated as continuous features.

from sklearn.preprocessing import OrdinalEncoder
import numpy as np

ordinal_encoder = make_column_transformer(
    (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
     make_column_selector(dtype_include='category')),
    remainder='passthrough')

hist_ordinal = make_pipeline(ordinal_encoder,
                             HistGradientBoostingRegressor(random_state=42))

# %%
# Gradient boosting estimator with native categorical support
# ------------------------------------------------------------
# We now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator
# that will natively handle categorical features. This estimator will not treat
# categorical features as ordered quantities.
#
# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
# Subset training and testing maps
X_train = X[1000:3000, 1000:3000, 0:X.shape[2]]
y_train = y[1000:3000, 1000:3000]
X_test = X[3000:4000, 1000:3000, 0:X.shape[2]]
y_test = y[3000:4000, 1000:3000]

print("X_train shape {}".format(X_train.shape))
print("y_train shape {}".format(y_train.shape))
print("X_test shape {}".format(X_test.shape))
print("y_test shape {}".format(y_test.shape))

# In[ ]:

# Encode response arrays
y_train = OrdinalEncoder().fit_transform(y_train)
y_test = OrdinalEncoder().fit_transform(y_test)

# In[ ]:

# Plot training map
plt.rcParams['figure.figsize'] = [6.4, 4.8]
plt.rcParams['figure.dpi'] = 144
plt.subplot(121)
plt.imshow(X_train[:, :, 1], cmap=plt.cm.Greys_r)
plt.title('X (training)')
plt.subplot(122)
plt.imshow(y_train, cmap=plt.cm.get_cmap('magma'))
plt.title('y (training)')
data_categorical = data[categorical_columns]
data_categorical.head()

# %%
print(f"The dataset is composed of {data_categorical.shape[1]} features")

# %% [markdown]
# ### Encoding ordinal categories
#
# The most intuitive strategy is to encode each category with a different
# number. The `OrdinalEncoder` will transform the data in such a manner.

# %%
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()
data_encoded = encoder.fit_transform(data_categorical)

print(f"The encoded dataset contains {data_encoded.shape[1]} features")
data_encoded[:5]

# %% [markdown]
# We can see that the categories have been encoded for each feature (column)
# independently. We can also note that the number of features before and after
# the encoding is the same.
#
# However, one has to be careful when using this encoding strategy. Using this
# integer representation can lead downstream models to assume that the
# categories are ordered: 0 is smaller than 1, which is smaller than 2, etc.
#
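# %%
# Illustrative sketch (not in the original notebook): when the categories do
# have a meaningful order, it can be passed explicitly so the integer codes
# follow that order instead of the default lexicographic one. The column name
# and category levels below are assumptions for the example.
import pandas as pd

edu = pd.DataFrame({"education": ["HS-grad", "Masters", "Preschool"]})  # toy data
education_order = [["Preschool", "HS-grad", "Some-college",
                    "Bachelors", "Masters", "Doctorate"]]
OrdinalEncoder(categories=education_order).fit_transform(edu)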
#plt.show()

# Use z-score to handle outliers
idx_bmi = find_outliers(data['bmi'])
data = data.loc[idx_bmi == False]
#print(data.info())
#print(data.head())

print("\n############ Before encoding ############")
print(data.head())
print(data.info())

# Encoding: OrdinalEncoder
ordinalencoder = OrdinalEncoder()

# gender
# ['Female', 'Male', 'Other'] = [0, 1, 2]
gender = data[["gender"]]
gender_encod = ordinalencoder.fit_transform(gender)
data['gender'] = gender_encod

# ever_married
# ['No', 'Yes'] = [0, 1]
married = data[["ever_married"]]
married_encod = ordinalencoder.fit_transform(married)
data['ever_married'] = married_encod

# work_type
2] titanic["Fare_cat"] = pd.cut(titanic.Fare_p_person, bins=[0, 13, 30, 513], include_lowest=True) titanic["Age_cat"] = pd.cut(titanic["Age"].astype(int), bins=[-1, 20, 40, 60, 90]) sex_pip = Pipeline([ ("one_hot", OneHotEncoder(categories=[pd.Series.unique(titanic.Sex)])) ]) fare_cat_pip = Pipeline([ ("fare_std", OrdinalEncoder(categories=[pd.Series.unique(titanic.Fare_cat)])) ]) pclass_pip = Pipeline([("pclass_std", StandardScaler())]) age_cat_pip = Pipeline([ ("age_cat_std", OrdinalEncoder(categories=[pd.Series.unique(titanic.Age_cat)])) ]) fsize_pip = Pipeline([("fsize_std", StandardScaler())]) cabin_pip = Pipeline([ ("cabin_1hot", OneHotEncoder(categories=[pd.Series.unique(titanic.Cabin_letter)])) ])
test_original = mlib.csv_to_df(path)
test_df = test_original.copy()

# Create list of features desired for training
feature_list = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_list = ['Survived']

# Define Numeric Pipeline
num_pipe = Pipeline([('imputer_mean', SimpleImputer(strategy='mean')),
                     ('std_scalar', StandardScaler())])

# Define Categorical Pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    #('ohe', OneHotEncoder()),
    ('oe', OrdinalEncoder())
])

# Combining Pipes into full pipeline - Train Data
full_pipeline, train_features, target_features, post_trans_train_feature = mlib.Full_PipeLine(
    train_df, feature_list, target_list, num_pipe, cat_pipe)

# Combining Pipes into full pipeline - Test Data
full_pipeline_test, test_features, empty, empty = mlib.Full_PipeLine(
    test_df, feature_list, [], num_pipe, cat_pipe)

# Transform data using final combined pipeline - Train
train_features_prep = full_pipeline.fit_transform(train_features)

# Transform data using final combined pipeline - Test
# (transform only, so the imputers, scaler and encoder fitted on the training
# data are reused instead of being refit on the test data)
test_features_prep = full_pipeline.transform(test_features)
def fit_transform(self, X):
    self.encoder = OrdinalEncoder()
    self.n_categorical = (X.dtypes == "category").sum()
    self.encoder.fit(X[X.columns[:self.n_categorical]])
    return self.transform(X)