# Column-name lists. Within this section only cat_columns2 is referenced
# (by cat_pipeline); the other two are kept for code outside this section.
cat_columns2 = ["PassengerId", "Name", "Embarked"]
cat_columns3 = ["Fare", "Cabin"]
cat_columns4 = ["Cabin"]

# Numeric features: median imputation followed by robust scaling.
num_pipeline = Pipeline(
    [
        # ("columns",DropColumns(cat_columns)),
        ("imputer", SimpleImputer(strategy="median")),
        # The bounds have to be passed as ``quantile_range``. Positionally,
        # ``RobustScaler(1, 99)`` binds them to with_centering/with_scaling
        # (or raises TypeError where those arguments are keyword-only), so
        # the 1st-99th percentile range was never actually applied.
        ("std_scaler", RobustScaler(quantile_range=(1.0, 99.0))),
    ]
)

# Only the DropColumns step (a helper defined elsewhere) is active; the
# imputing and one-hot encoding steps are left disabled.
cat_pipeline = Pipeline(
    [
        ("columns", DropColumns(cat_columns2)),
        # ('imputer_cat', SimpleImputer( strategy="most_frequent")),
        # ("cat3",OneHotEncoder(handle_unknown="ignore", sparse= False))
    ]
)

# Single step: the AddFeaturesFromCat helper (defined elsewhere).
cat_pipeline2 = Pipeline(
    [
        ("cabins", AddFeaturesFromCat()),
    ]
)

# GetDeck (defined elsewhere) followed by a dense one-hot encoding that
# ignores categories unseen during fit.
# NOTE(review): scikit-learn 1.2 renamed ``sparse`` to ``sparse_output``;
# the old spelling is kept to match the version this file was written for.
cat_pipeline3 = Pipeline(
    [
        ("decks", GetDeck()),
        ("decks2", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)
# NOTE(review): the next line is the tail of a list-of-steps literal whose
# opening lies above this section; it is kept verbatim.
("cat3", OneHotEncoder(handle_unknown="ignore", sparse=False)) ])

# Only the GetDeck step is active; the alternative steps are commented out.
cat_transform_pipeline = Pipeline([
    ('cabin', GetDeck()),
    # ('decks', Combine()),
    # ("decks2",OneHotEncoder(handle_unknown="ignore", sparse= False))
    # ("decks2",OrdinalEncoder())
])

# Routes each column group to its own pipeline. The pipelines and column
# lists not defined in this section come from elsewhere in the file.
transformer = ColumnTransformer([
    ("num", num_pipeline, num_columns),
    ("num_transform", num_to_transform_pipeline, num_columns_to_transform),
    ("cat_stay", cat_stay_pipeline, cat_columns_stay),
    ("cat_transform", cat_transform_pipeline, cat_columns_to_transform)
])

# Step "1" applies DropColumns(columns_to_drop) before step "2", the
# ColumnTransformer above.
full_pipeline = Pipeline([("1", DropColumns(columns_to_drop)), ("2", transformer)])
# train_data.drop(["PassengerId","Name"], axis= 1, inplace= True)
#%%
# Disabled alternative: a stratified 70/30 split keyed on the "Sex" column.
# split = StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=42)
# for train_index, test_index in split.split(train_data,train_data["Sex"]):
#     train_data_train = train_data.iloc[train_index]
#     val_data = train_data.iloc[test_index]

# Separate the "Survived" target from the features, then hold out 10% of the
# rows for validation with a fixed seed.
X_train_data = train_data.drop("Survived", axis=1)
y_train_data = train_data["Survived"]
X_train, X_val, y_train, y_val = train_test_split(X_train_data, y_train_data, random_state=42, test_size=0.1)
#%%
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
from titanic_functions import (
    plot_learning_curves,
    get_num_of_cabins,
    get_real_fare,
    get_deck,
    DropColumns,
    AddFeaturesFromCat,
    GetDeck,
    print_results,
)

# Directory that contains this script.
here = os.path.dirname(os.path.abspath(__file__))

# Small hand-written frame used to try out the custom transformers.
# NOTE(review): 'np.nan' in col2 is a plain string, not a missing value --
# confirm that this is intended.
d = {
    "col1": [1, 10, 5, 2, 17, 3, 27],
    "col2": ["a", "b", "np.nan", "c", "d", "e", "f"],
    "col3": ["sd", "asdb", "asdas", "cgh", "fgh", "fgh", "werf"],
    "col4": [0.2, 0.10, 0.35, 0.52, 0.6917, 30, 0.627],
}
data = pd.DataFrame(d)
data
#%%
# Dense one-hot encoder; built but currently not wired into the transformer
# (its entry below is commented out).
pipeline1 = Pipeline(
    steps=[("1", OneHotEncoder(handle_unknown="ignore", sparse=False))]
)
# DropColumns helper configured with "col1".
pipeline2 = Pipeline(steps=[("1", DropColumns(["col1"]))])

# Only pipeline2 is applied, and it receives every column of the frame.
transformer = ColumnTransformer(
    transformers=[
        # ("p1",pipeline1,data.columns),
        ("p2", pipeline2, data.columns),
    ]
)
full = Pipeline(steps=[("f1", transformer)])

data_prepared = full.fit_transform(data)
type(data_prepared)