def test_raises_error_if_df_contains_na(df_enc_big, df_enc_big_na):
    """Check that both ``fit`` and ``transform`` raise ValueError on NA data.

    Only the call that is expected to fail sits inside each
    ``pytest.raises`` block, so a ``ValueError`` raised while building or
    fitting the encoder during set-up cannot make the test pass by accident.
    """
    # fit: must raise when the dataset passed to fit contains NA
    encoder = OneHotEncoder()
    with pytest.raises(ValueError):
        encoder.fit(df_enc_big_na)

    # transform: the encoder is fitted on df_enc_big first (this must
    # succeed), then transforming the dataset that contains NA must raise
    encoder = OneHotEncoder()
    encoder.fit(df_enc_big)
    with pytest.raises(ValueError):
        encoder.transform(df_enc_big_na)
def test_get_feature_names_out(df_enc_binary):
    """Check ``get_feature_names_out`` for several encoder configurations.

    Exercises the default encoder, ``drop_last=True``,
    ``drop_last_binary=True`` and ``top_categories=1``, and finally checks
    that invalid ``input_features`` arguments raise ``ValueError``.
    """
    passthrough = ["var_num"]
    categorical = ["var_A", "var_B", "var_C", "var_D"]

    # --- default configuration ---
    encoder = OneHotEncoder()
    encoder.fit(df_enc_binary)
    expected = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_C_UHU",
        "var_D_OHO",
        "var_D_EHE",
    ]
    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=categorical)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=categorical[0:2])
    assert names == expected[0:6]
    names = encoder.get_feature_names_out(input_features=[categorical[0]])
    assert names == expected[0:3]

    # --- drop_last=True ---
    encoder = OneHotEncoder(drop_last=True)
    encoder.fit(df_enc_binary)
    expected = [
        "var_A_A",
        "var_A_B",
        "var_B_A",
        "var_B_B",
        "var_C_AHA",
        "var_D_OHO",
    ]
    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=categorical)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=categorical[0:2])
    assert names == expected[0:4]
    names = encoder.get_feature_names_out(input_features=[categorical[0]])
    assert names == expected[0:2]

    # --- drop_last_binary=True ---
    encoder = OneHotEncoder(drop_last_binary=True)
    encoder.fit(df_enc_binary)
    expected = [
        "var_A_A",
        "var_A_B",
        "var_A_C",
        "var_B_A",
        "var_B_B",
        "var_B_C",
        "var_C_AHA",
        "var_D_OHO",
    ]
    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=categorical)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=[categorical[0]])
    assert names == expected[0:3]
    names = encoder.get_feature_names_out(input_features=[categorical[3]])
    assert names == [expected[-1]]

    # --- top_categories=1 ---
    encoder = OneHotEncoder(top_categories=1)
    encoder.fit(df_enc_binary)
    expected = ["var_A_B", "var_B_A", "var_C_AHA", "var_D_EHE"]
    names = encoder.get_feature_names_out(input_features=None)
    assert names == passthrough + expected
    names = encoder.get_feature_names_out(input_features=categorical)
    assert names == expected
    names = encoder.get_feature_names_out(input_features=categorical[0:2])
    assert names == expected[0:2]
    names = encoder.get_feature_names_out(input_features=[categorical[3]])
    assert names == [expected[3]]

    # --- invalid input_features (checked on the last fitted encoder) ---
    # a bare string instead of a list
    with pytest.raises(ValueError):
        encoder.get_feature_names_out("var_A")

    # a list that includes the name "hola", which is not among the
    # variables used above
    with pytest.raises(ValueError):
        encoder.get_feature_names_out(["var_A", "hola"])
st.selectbox('Do you want the house to be furnished ?', ('Yes', 'No'))) security_doors = word_convert( st.selectbox('Do you want security doors ?', ('Yes', 'No'))) cctv = word_convert( st.selectbox('Do you want CCTV surveillance ?', ('Yes', 'No'))) bq = word_convert(st.selectbox('Do you want Boys Quarters ?', ('Yes', 'No'))) gym = word_convert(st.selectbox('Do you need gym facilities ?', ('Yes', 'No'))) pool = word_convert(st.selectbox('Do you need swimming pool ?', ('Yes', 'No'))) # Modeling step # Encoding Step encode = OneHotEncoder() target = data['Price'] features = data.drop('Price', 1) encode.fit(features) features = encode.transform(features) # Getting the target and features variables # print(data.head()) X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0) # Creating the algorithm class model = RandomForestRegressor() # Creating algorithm object model.fit(X_train, y_train) # Predicted values
# Build a model using all the features: one-hot encode the selected
# categorical columns, then fit a logistic regression.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
num_features = numeric_frame.drop(columns='Survived').columns
del numeric_frame  # temporary only; keep the namespace as it was
num_features
# %%
cat_features = df.select_dtypes(include=['category', 'object']).columns
cat_features
# %%
# Every column except the target
features = df.drop(columns='Survived').columns.to_list()
features
# %%
onehot = OneHotEncoder(
    variables=['Pclass', 'Sex', 'Embarked'],
    drop_last=False,
)
# %%
onehot.fit(df[features])
# Preview of the encoded frame
onehot.transform(df[features]).head()
# %%
X = onehot.transform(df[features])
y = df['Survived']
print(X.shape)
# %%
# Separate into train and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
logistic_model = logistic_model.fit(X_train, y_train)
print(logistic_model)
def encorder(self, y):
    """Fit a new ``OneHotEncoder`` on ``y`` and return ``y`` transformed by it.

    ``y`` is presumably a dataframe (per the original docstring) — confirm
    against callers. A fresh encoder is created on every call and is not
    stored on the instance.
    """
    one_hot = OneHotEncoder()
    one_hot.fit(y)
    encoded = one_hot.transform(y)
    return encoded