def fit_transform(self, X, y):
    """Fit the encoder and return out-of-fold encoded training data.

    For every CV fold the encoding mapping is learned on the training
    split only and then applied to the held-out split, so each row is
    encoded without seeing its own target (avoids target leakage).

    Returns a DataFrame when ``self.return_df`` is True, otherwise the
    underlying numpy array.
    """
    self.fit(X, y)
    splitter = CustomFolds(validation_scheme=self.fold_scheme,
                           num_folds=self.folds,
                           random_state=self.random_state,
                           shuffle=self.shuffle)
    X = self.convert_input(X)
    encoded = deepcopy(X)
    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(encoded)):
        # create_encoding_dict also returns an information value; it is
        # not needed for the transform itself.
        mapping, _iv = self.create_encoding_dict(X.iloc[train_idx], y[train_idx])
        encoded.iloc[valid_idx] = self.apply_encoding(X.iloc[valid_idx], mapping)
    return encoded if self.return_df else encoded.values
def fit(self, x, y, use_oof=False, n_jobs=-1):
    """Fit the wrapped classifier, optionally as out-of-fold CV models.

    Parameters
    ----------
    x, y : array-like training features and labels; must support numpy
        fancy indexing (``x[train_index]``).
    use_oof : bool
        When True, fit one clone of ``self.model`` per CV fold produced by
        ``CustomFolds``; when False, fit a single model.
    n_jobs : int
        Forwarded to each cloned model.

    Returns
    -------
    self, with ``self.fitted_models`` populated (and ``self.indices`` when
    ``use_oof`` is True).
    """
    if not hasattr(self.model, 'fit'):
        raise Exception("Model/algorithm needs to implement fit()")
    fitted_models = []
    # 'random' asks for a fresh seed on every call; anything else is used
    # verbatim as the seed.
    seed = random.randint(0, 1000) if self.random_state == 'random' else self.random_state
    if use_oof:
        folds = CustomFolds(num_folds=self.n_splits,
                            random_state=seed,
                            shuffle=self.shuffle,
                            validation_scheme=self.validation_scheme)
        # Materialize the split so self.indices remains reusable later
        # (e.g. for building OOF predictions) even if split() yields lazily.
        self.indices = list(folds.split(x, y, group=self.cv_group_col))
        for train_index, test_index in self.indices:
            model = clone(self.model)
            model.n_jobs = n_jobs
            if isinstance(model, LGBMClassifier) and self.early_stopping_rounds is not None:
                model.fit(X=x[train_index], y=y[train_index],
                          eval_set=[(x[test_index], y[test_index]),
                                    (x[train_index], y[train_index])],
                          verbose=100, eval_metric='auc',
                          early_stopping_rounds=self.early_stopping_rounds)
            elif isinstance(model, XGBClassifier) and self.early_stopping_rounds is not None:
                model.fit(X=x[train_index], y=y[train_index],
                          eval_set=[(x[test_index], y[test_index])],
                          verbose=100, eval_metric='auc',
                          early_stopping_rounds=self.early_stopping_rounds)
            elif isinstance(model, CatBoostClassifier) and self.early_stopping_rounds is not None:
                # CatBoost takes the patience via the od_wait attribute
                # rather than a fit() argument.
                model.od_wait = int(self.early_stopping_rounds)
                model.fit(x[train_index], y[train_index],
                          cat_features=self.categorical_features_indices,
                          eval_set=(x[test_index], y[test_index]),
                          use_best_model=True, verbose=100)
            else:
                # LogisticRegression and every other estimator: plain fit.
                # (The original had a LogisticRegression elif that was
                # byte-identical to this fallback — merged.)
                model.fit(x[train_index], y[train_index])
            fitted_models.append(model)
    else:
        model = clone(self.model)
        model.n_jobs = n_jobs
        x_train, x_val, y_train, y_val = train_test_split(
            x, y, test_size=0.2, shuffle=True, random_state=seed)
        if (isinstance(model, (LGBMClassifier, XGBClassifier))
                and self.early_stopping_rounds is not None):
            model.fit(X=x_train, y=y_train, eval_set=[(x_val, y_val)],
                      verbose=100, eval_metric='auc',
                      early_stopping_rounds=self.early_stopping_rounds)
        else:
            # BUG FIX: previously model.fit(x, y) ran unconditionally,
            # refitting on the full data and discarding the early-stopped
            # model. The full-data fit is now the fallback only.
            model.fit(x, y)
        fitted_models.append(model)
    self.fitted_models = fitted_models
    return self
def fit(self, x, y, use_oof=False, n_jobs=-1):
    """Fit the wrapped classifier, optionally as out-of-fold CV models.

    LogisticRegression folds are fitted on RobustScaler-transformed
    features; the per-fold scalers are kept in ``self.scaler_models`` so
    the same transform can be replayed at prediction time.

    Parameters
    ----------
    x, y : array-like training features and labels; must support numpy
        fancy indexing (``x[train_index]``).
    use_oof : bool
        When True, fit one clone of ``self.model`` per CV fold; when
        False, fit a single model.
    n_jobs : int
        Forwarded to each cloned model.

    Returns
    -------
    self, with ``self.fitted_models`` and ``self.scaler_models`` set.
    """
    if not hasattr(self.model, 'fit'):
        raise Exception("Model/algorithm needs to implement fit()")
    fitted_models = []
    scaler_models = []
    if use_oof:
        folds = CustomFolds(num_folds=self.n_splits,
                            random_state=self.random_state,
                            shuffle=self.shuffle,
                            validation_scheme=self.validation_scheme)
        self.indices = folds.split(x, y, group=self.cv_group_col)
        for i, (train_index, test_index) in enumerate(self.indices):
            model = clone(self.model)
            model.n_jobs = n_jobs
            if isinstance(model, LGBMClassifier) and self.early_stopping_rounds is not None:
                model.fit(X=x[train_index], y=y[train_index],
                          eval_set=[(x[test_index], y[test_index]),
                                    (x[train_index], y[train_index])],
                          verbose=100, eval_metric='auc',
                          early_stopping_rounds=self.early_stopping_rounds)
            elif isinstance(model, XGBClassifier) and self.early_stopping_rounds is not None:
                model.fit(X=x[train_index], y=y[train_index],
                          eval_set=[(x[test_index], y[test_index])],
                          verbose=50, eval_metric='auc',
                          early_stopping_rounds=self.early_stopping_rounds)
            elif isinstance(model, CatBoostClassifier) and self.early_stopping_rounds is not None:
                # CatBoost takes the patience via od_wait, not a fit() kwarg.
                model.od_wait = int(self.early_stopping_rounds)
                model.fit(x[train_index], y[train_index],
                          cat_features=self.categorical_features_indices,
                          eval_set=(x[test_index], y[test_index]),
                          use_best_model=True, verbose=10)
            elif isinstance(model, LogisticRegression):
                # Scale features for the linear model and keep the fitted
                # scaler so prediction can apply the identical transform.
                scaler = RobustScaler()
                xtrain = scaler.fit_transform(x[train_index])
                scaler_models.append(scaler)
                model.fit(xtrain, y[train_index])
            else:
                model.fit(x[train_index], y[train_index])
            fitted_models.append(model)
    else:
        model = clone(self.model)
        model.n_jobs = n_jobs
        x_train, x_val, y_train, y_val = train_test_split(
            x, y, test_size=0.2, shuffle=True, random_state=100)
        if (isinstance(model, (LGBMClassifier, XGBClassifier))
                and self.early_stopping_rounds is not None):
            model.fit(X=x_train, y=y_train, eval_set=[(x_val, y_val)],
                      verbose=False, eval_metric='auc',
                      early_stopping_rounds=self.early_stopping_rounds)
        else:
            # BUG FIX: previously model.fit(x, y) ran unconditionally,
            # refitting on the full data and discarding the early-stopped
            # model. The full-data fit is now the fallback only.
            model.fit(x, y)
        fitted_models.append(model)
    self.fitted_models = fitted_models
    self.scaler_models = scaler_models
    return self
def fit(self, x, y, use_oof=False, n_jobs=-1):
    """Fit the wrapped regressor, optionally as out-of-fold CV models.

    Parameters
    ----------
    x, y : array-like training features and targets; must support numpy
        fancy indexing (``x[train_index]``).
    use_oof : bool
        When True, fit one clone of ``self.model`` per CV fold produced
        by ``CustomFolds``; when False, fit a single model.
    n_jobs : int
        Forwarded to each cloned model.

    Returns
    -------
    self, with ``self.fitted_models`` populated (and ``self.indices``
    when ``use_oof`` is True).
    """
    if not hasattr(self.model, 'fit'):
        raise Exception("Model/algorithm needs to implement fit()")
    fitted_models = []
    if use_oof:
        folds = CustomFolds(num_folds=self.n_splits,
                            random_state=self.random_state,
                            shuffle=self.shuffle,
                            validation_scheme=self.validation_scheme)
        self.indices = folds.split(x, y, group=self.cv_group_col)
        for i, (train_index, test_index) in enumerate(self.indices):
            model = clone(self.model)
            model.n_jobs = n_jobs
            if isinstance(model, LGBMRegressor) and self.early_stopping_rounds is not None:
                model.fit(X=x[train_index], y=y[train_index],
                          eval_set=[(x[test_index], y[test_index]),
                                    (x[train_index], y[train_index])],
                          verbose=100, eval_metric='rmse',
                          early_stopping_rounds=self.early_stopping_rounds)
            elif isinstance(model, XGBRegressor) and self.early_stopping_rounds is not None:
                model.fit(X=x[train_index], y=y[train_index],
                          eval_set=[(x[test_index], y[test_index])],
                          verbose=100, eval_metric='rmse',
                          early_stopping_rounds=self.early_stopping_rounds)
            else:
                model.fit(x[train_index], y[train_index])
                # Fold validation score; **0.5 presumably turns an MSE-style
                # scoring_metric into RMSE — TODO confirm against its
                # definition. Parenthesized print works on Python 2 and 3.
                print(scoring_metric(y[test_index], model.predict(x[test_index])) ** 0.5)
            fitted_models.append(model)
    else:
        model = clone(self.model)
        model.n_jobs = n_jobs
        x_train, x_val, y_train, y_val = train_test_split(
            x, y, test_size=0.2, shuffle=True, random_state=100)
        if (isinstance(model, (LGBMRegressor, XGBRegressor))
                and self.early_stopping_rounds is not None):
            model.fit(X=x_train, y=y_train, eval_set=[(x_val, y_val)],
                      verbose=False, eval_metric='rmse',
                      early_stopping_rounds=self.early_stopping_rounds)
        else:
            # BUG FIX: previously model.fit(x, y) ran unconditionally,
            # refitting on the full data and discarding the early-stopped
            # model. The full-data fit is now the fallback only.
            model.fit(x, y)
        fitted_models.append(model)
    self.fitted_models = fitted_models
    return self