def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y have different lengths, or if ``handle_missing`` is
        ``'error'`` and a column to be encoded contains a null.
    """
    # unite the input into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.use_default_cols:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # Reduce over both axes. ``.any().bool()`` only works when exactly
        # one column is selected and raises for every other column count.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.mapping = self.fit_leave_one_out(X, y, cols=self.cols)

    # Transform the training data once to learn the output column names.
    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        # Generated columns whose training variance is at most 10e-5 (= 1e-4).
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            for name in self.drop_cols:
                self.feature_names.remove(name)
        except (KeyError, ValueError) as e:
            # list.remove signals a missing item with ValueError; KeyError is
            # kept so the handler stays a superset of the previous one.
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y have different lengths, or if ``handle_missing`` is
        ``'error'`` and a column to be encoded contains a null.
    """
    # unite the input into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # if columns aren't passed, just use every string column
    if self.use_default_cols:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # ``.any().any()`` collapses the per-column flags to one bool for any
        # number of columns; ``.any().bool()`` raised unless there was
        # exactly one column.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    categories = self.fit_leave_one_out(X, y, cols=self.cols)
    self.mapping = categories

    # Run the fitted encoder over the training data to record output names.
    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        # Keep only generated columns with training variance <= 10e-5 (1e-4).
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for col in self.drop_cols:
                self.feature_names.remove(col)
        except (KeyError, ValueError) as e:
            # A missing entry makes list.remove raise ValueError, which the
            # former ``except KeyError`` could never intercept.
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target info (such as transform test set)
    override_return_df : bool
        When True a DataFrame is returned even if ``self.return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied. When there are no columns
        to encode, the converted X is returned as is.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the number of columns differs from the
        fitted one, X and y differ in length, nulls are present while
        ``handle_missing == 'error'``, or unknown categories are present
        while ``handle_unknown == 'error'``.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # unite the input into pandas types
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (
            X.shape[1],
            self._dim,
        ))

    # Checked after conversion (as in fit) so that non-DataFrame inputs can
    # be indexed by column name.
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # if we are encoding the training data, we have to check the target
    if y is not None:
        y = util.convert_input_vector(y, X.index)
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not list(self.cols):
        return X

    X = self.ordinal_encoder.transform(X)

    if self.handle_unknown == 'error':
        # -1 is the value checked for to detect categories unseen during fit.
        if X[self.cols].isin([-1]).any().any():
            raise ValueError('Unexpected categories found in dataframe')

    X = self.target_encode(X, y)

    if self.drop_invariant:
        for col in self.drop_cols:
            # ``axis`` must be passed by keyword: pandas 2 no longer accepts
            # it positionally.
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    return X.values
def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) \
        -> Union[pd.DataFrame, np.ndarray]:
    """
    Fit models for each fold, then transform X

    Rows whose target is null are treated as test data: they are passed to
    ``_fit_train`` separately, with ``None`` as the target.

    Args:
        X:
            Data
        y:
            Target
        fit_params:
            Additional parameters passed to models

    Returns:
        Transformed version of X. It is a pd.DataFrame if X is a
        ``pd.DataFrame`` and ``return_same_type`` is True, otherwise the
        underlying ``.values`` array.
    """
    assert len(X) == len(y)
    self._pre_train(y)

    input_was_frame = isinstance(X, pd.DataFrame)
    X = convert_input(X)
    y = convert_input_vector(y, X.index)

    unlabeled = y.isnull()
    if unlabeled.any():
        # y == null is regarded as test data
        labeled = ~unlabeled
        encoded = X.copy()
        encoded.loc[labeled, :] = self._fit_train(X[labeled], y[labeled], **fit_params)
        encoded.loc[unlabeled, :] = self._fit_train(X[unlabeled], None, **fit_params)
    else:
        encoded = self._fit_train(X, y, **fit_params)

    encoded = self._post_transform(self._post_fit(encoded, y))

    if self.return_same_type and input_was_frame:
        return encoded
    return encoded.values
def fit_transform(self, X: Union[pd.DataFrame, np.ndarray], y: pd.Series = None, **fit_params) -> Union[pd.DataFrame, np.ndarray]:
    """
    Fit on (X, y) and return the transformed X.

    The work is delegated to ``_pre_train``, ``_fit_train``, ``_post_fit``
    and ``_post_transform``. Rows whose target is null are regarded as test
    data and are sent through ``_fit_train`` with ``None`` as the target.

    Args:
        X:
            Data
        y:
            Target; must have the same length as X.
        fit_params:
            Extra keyword arguments forwarded to ``_fit_train``.

    Returns:
        The transformed data: a DataFrame when X was a DataFrame and
        ``return_same_type`` is set, otherwise its ``.values`` array.
    """
    assert len(X) == len(y)
    self._pre_train(y)

    came_in_as_frame = isinstance(X, pd.DataFrame)
    X = convert_input(X)
    y = convert_input_vector(y, X.index)

    missing_target = y.isnull()
    if missing_target.any():  # missing values exist
        # y == null is regarded as test data
        has_target = ~missing_target
        out = X.copy()
        out.loc[has_target, :] = self._fit_train(X[has_target], y[has_target], **fit_params)
        out.loc[missing_target, :] = self._fit_train(X[missing_target], None, **fit_params)
    else:
        out = self._fit_train(X, y, **fit_params)

    out = self._post_transform(self._post_fit(out, y))

    if self.return_same_type and came_in_as_frame:
        return out
    return out.values
def run(self, train: TYPE_DATASET, test: TYPE_DATASET, target: TYPE_DATASET, groups: Optional[pd.Series] = None, verbose: bool = True):
    """Train ``self.trainer`` on every split of ``self.cv``.

    Out-of-fold predictions are written to ``self.oof``. When ``test`` is
    given, the per-fold test predictions are summed into
    ``self.predictions`` (the sum is not divided by the number of folds
    here).

    Args:
        train: Training features.
        test: Test features, or None to skip test prediction.
        target: Training target.
        groups: Group labels forwarded to ``self.cv.split``.
        verbose: When True, print the fold number and split sizes.

    NOTE(review): the per-fold scores are collected in a local list that is
    neither stored nor returned in the visible code — confirm whether that
    is intended.
    """
    train = convert_input(train)
    target = convert_input_vector(target, train.index)

    if test is not None:
        test = convert_input(test)
        self.predictions = np.zeros(len(test))

    self.oof = np.zeros(len(train))
    fold_scores = []

    folds = self.cv.split(train, target, groups)
    for fold_no, (fit_rows, holdout_rows) in enumerate(folds, start=1):
        if verbose:
            print('Fold: {}/{}'.format(fold_no, self.cv.n_splits))
            print('Length train: {} / valid: {}'.format(
                len(fit_rows), len(holdout_rows)))

        fit_x = train.iloc[fit_rows]
        fit_y = target.iloc[fit_rows]
        holdout_x = train.iloc[holdout_rows]
        holdout_y = target.iloc[holdout_rows]

        self.trainer.train(fit_x, fit_y)
        self.oof[holdout_rows] = self.trainer.predict(holdout_x)

        if test is not None:
            self.predictions += self.trainer.predict(test)

        if self.scoring is None:
            continue
        fold_scores.append(self.scoring(holdout_y, self.oof[holdout_rows]))
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target information (such as transform test set)
    override_return_df : bool
        When True a DataFrame is returned even if ``self.return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied. When there are no columns
        to encode, the converted X is returned as is.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the number of columns differs from the
        fitted one, X and y differ in length, or nulls are present while
        ``handle_missing == 'error'``.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # unite the input into pandas types
    X = util.convert_input(X)

    # then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # Checked after conversion (as in fit) so non-DataFrame inputs work, and
    # reduced with ``.any().any()`` because ``.any().bool()`` raises unless
    # exactly one column is selected.
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # if we are encoding the training data, we have to check the target
    if y is not None:
        y = util.convert_input_vector(y, X.index).astype(float)
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    X = self.transform_leave_one_out(X, y, mapping=self.mapping)

    if self.drop_invariant:
        for col in self.drop_cols:
            # ``axis`` must be passed by keyword: pandas 2 no longer accepts
            # it positionally.
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    return X.values
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    When the data are used for model training, it is important to also pass
    the target in order to apply leave one out.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target information (such as transform test set)
    override_return_df : bool
        When True a DataFrame is returned even if ``self.return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied. When there are no columns
        to encode, the converted X is returned as is.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the number of columns differs from the
        fitted one, X and y differ in length, nulls are present while
        ``handle_missing == 'error'``, or unknown categories are present
        while ``handle_unknown == 'error'``.
    """
    if self._dim is None:
        raise ValueError(
            'Must train encoder before it can be used to transform data.')

    # Unite the input into pandas types
    X = util.convert_input(X)

    # Then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (
            X.shape[1],
            self._dim,
        ))

    # Checked after conversion (as in fit) so that non-DataFrame inputs can
    # be indexed by column name.
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # If we are encoding the training data, we have to check the target
    if y is not None:
        y = util.convert_input_vector(y, X.index).astype(float)
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not list(self.cols):
        return X

    # Do not modify the input argument
    X = X.copy(deep=True)

    X = self.ordinal_encoder.transform(X)

    if self.handle_unknown == 'error':
        # -1 is the value checked for to detect categories unseen during fit.
        if X[self.cols].isin([-1]).any().any():
            raise ValueError('Unexpected categories found in dataframe')

    # Loop over the columns and replace the nominal values with the numbers
    X = self._score(X, y)

    # Postprocessing
    # Note: We should not even convert these columns.
    if self.drop_invariant:
        for col in self.drop_cols:
            # ``axis`` must be passed by keyword: pandas 2 no longer accepts
            # it positionally.
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    return X.values
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y have different lengths, or if ``handle_missing`` is
        ``'error'`` and a column to be encoded contains a null.
    """
    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    self.mapping = self._train(X_ordinal, y)

    # Transform the training data once to learn the output column names.
    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for name in self.drop_cols:
                self.feature_names.remove(name)
        except (KeyError, ValueError) as e:
            # list.remove signals a missing item with ValueError; KeyError is
            # kept so the handler stays a superset of the previous one.
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    The training routine is selected by ``self.model``: ``'independent'``,
    ``'pooled'``, ``'beta'`` or ``'binary'``. Only ``'binary'`` validates
    that the target consists of the two values 0 and 1.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y have different lengths; if ``handle_missing`` is
        ``'error'`` and a column to be encoded contains a null; if
        ``self.model`` is not a recognized option; or, for the ``'binary'``
        model, if y has missing values or is not made of 0 and 1.
    """
    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # Reduce over both axes. ``.any().bool()`` only works when exactly
        # one column is selected and raises for every other column count.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    if self.model == 'independent':
        self.mapping = self._train_independent(X_ordinal, y)
    elif self.model == 'pooled':
        self.mapping = self._train_pooled(X_ordinal, y)
    elif self.model == 'beta':
        self.mapping = self._train_beta(X_ordinal, y)
    elif self.model == 'binary':
        # Check for missing values first: NaN shows up as its own entry in
        # ``unique()``, so a 0/1 target with NaNs would otherwise be
        # rejected with the less helpful "not binary" message.
        if y.isnull().any():
            raise ValueError(
                "The target column y must not contain missing values.")

        # The label must be binary with values {0,1}
        unique = y.unique()
        if len(unique) != 2:
            raise ValueError(
                "The target column y must be binary. But the target contains "
                + str(len(unique)) + " unique value(s).")
        if np.max(unique) < 1:
            raise ValueError(
                "The target column y must be binary with values {0, 1}. Value 1 was not found in the target."
            )
        if np.min(unique) > 0:
            raise ValueError(
                "The target column y must be binary with values {0, 1}. Value 0 was not found in the target."
            )

        # Perform the training
        self.mapping = self._train_log_odds_ratio(X_ordinal, y)
    else:
        raise ValueError("model='" + str(self.model) + "' is not a recognized option")

    # Transform the training data once to learn the output column names.
    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        try:
            for name in self.drop_cols:
                self.feature_names.remove(name)
        except (KeyError, ValueError) as e:
            # list.remove signals a missing item with ValueError; KeyError is
            # kept so the handler stays a superset of the previous one.
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def transform(self, X, y=None, override_return_df=False):
    """Perform the transformation to new categorical data.

    When the data are used for model training, it is important to also pass
    the target in order to apply leave one out.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
    y : array-like, shape = [n_samples] when transform by leave one out
        None, when transform without target information (such as transform test set)
    override_return_df : bool
        When True a DataFrame is returned even if ``self.return_df`` is False.

    Returns
    -------
    p : array, shape = [n_samples, n_numeric + N]
        Transformed values with encoding applied. When there are no columns
        to encode, the converted X is returned as is.

    Raises
    ------
    ValueError
        If the encoder is not fitted, the number of columns differs from the
        fitted one, X and y differ in length, nulls are present while
        ``handle_missing == 'error'``, or unknown categories are present
        while ``handle_unknown == 'error'``.
    """
    if self._dim is None:
        raise ValueError('Must train encoder before it can be used to transform data.')

    # Unite the input into pandas types
    X = util.convert_input(X)

    # Then make sure that it is the right size
    if X.shape[1] != self._dim:
        raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

    # Checked after conversion (as in fit) so non-DataFrame inputs work, and
    # reduced with ``.any().any()`` because ``.any().bool()`` raises unless
    # exactly one column is selected.
    if self.handle_missing == 'error':
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    # If we are encoding the training data, we have to check the target
    if y is not None:
        y = util.convert_input_vector(y, X.index).astype(float)
        if X.shape[0] != y.shape[0]:
            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    if not self.cols:
        return X

    # Do not modify the input argument
    X = X.copy(deep=True)

    X = self.ordinal_encoder.transform(X)

    if self.handle_unknown == 'error':
        # -1 is the value checked for to detect categories unseen during fit.
        if X[self.cols].isin([-1]).any().any():
            raise ValueError('Unexpected categories found in dataframe')

    # Loop over columns and replace nominal values with WOE
    X = self._score(X, y)

    # Postprocessing
    # Note: We should not even convert these columns.
    if self.drop_invariant:
        for col in self.drop_cols:
            # ``axis`` must be passed by keyword: pandas 2 no longer accepts
            # it positionally.
            X.drop(col, axis=1, inplace=True)

    if self.return_df or override_return_df:
        return X
    return X.values
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------
    self : encoder
        Returns self.

    Raises
    ------
    ValueError
        If X and y have different lengths, or if ``handle_missing`` is
        ``'error'`` and a column to be encoded contains a null.
    """
    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == 'error':
        # Reduce over both axes. ``.any().bool()`` only works when exactly
        # one column is selected and raises for every other column count.
        if X[self.cols].isnull().any().any():
            raise ValueError('Columns to be encoded can not contain null')

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown='value',
        handle_missing='value'
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    self.mapping = self._train(X_ordinal, y)

    # Transform the training data once to learn the output column names.
    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        self.drop_cols = []
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
        try:
            for name in self.drop_cols:
                self.feature_names.remove(name)
        except (KeyError, ValueError) as e:
            # list.remove signals a missing item with ValueError; KeyError is
            # kept so the handler stays a superset of the previous one.
            if self.verbose > 0:
                print("Could not remove column from feature names."
                      "Not found in generated cols.\n{}".format(e))

    return self
def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
                   X_train: Union[pd.DataFrame, np.ndarray],
                   y: Union[pd.Series, np.ndarray],
                   X_test: Union[pd.DataFrame, np.ndarray] = None,
                   cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                   groups: Optional[pd.Series] = None,
                   predict_proba: bool = False,
                   eval_func: Optional[Callable] = None,
                   logger: Optional[Logger] = None,
                   on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None,
                   fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
                   importance_type: str = 'gain',
                   early_stopping: bool = True,
                   type_of_target: str = 'auto') -> CVResult:
    """
    Evaluate metrics by cross-validation. It also records out-of-fold prediction and test prediction.

    Args:
        estimator:
            The object to be used in cross-validation. For list inputs, ``estimator[i]`` is trained on i-th fold.
            A single estimator is reused (refit) on every fold.
        X_train:
            Training data
        y:
            Target
        X_test:
            Test data (Optional). If specified, prediction on the test data is performed using ensemble of models.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (the instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a "Group" cv instance (e.g., ``GroupKFold``).
        predict_proba:
            If true, call ``predict_proba`` instead of ``predict`` for both out-of-fold and test predictions.
            For two-column probability output only the second column is kept.
        eval_func:
            Function used for logging and returning scores. Called as ``eval_func(y_true, y_pred)``.
        logger:
            logger. Defaults to ``getLogger(__name__)``.
        on_each_fold:
            called after each fold is fitted, with (idx_fold, model, X_fold, y_fold) where ``X_fold`` / ``y_fold``
            are the training part of the fold.
        fit_params:
            Parameters passed to the fit method of the estimator. A callable is invoked as
            ``fit_params(idx_fold, train_index, valid_index)`` and must return the parameters for that fold.
        importance_type:
            The type of feature importance to be used to calculate result.
            Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
        early_stopping:
            If ``True``, ``eval_set`` will be added to ``fit_params`` for each fold.
            ``early_stopping_rounds = 100`` will also be appended to fit_params if it does not already have one.
            Applied only to ``LGBMModel`` and ``CatBoost`` estimators.
        type_of_target:
            The type of target variable. If ``auto``, it is inferred by ``multiclass.type_of_target``.
            For ``multiclass`` the predictions get one column per distinct non-null target value.
    Returns:
        Namedtuple with following members

        * oof_prediction (numpy array, shape (len(X_train),)):
            The predicted value on out-of-fold validation data.
        * test_prediction (numpy array, shape (len(X_test),)):
            The predicted value on test data, averaged over folds. ``None`` if X_test is ``None``.
        * scores (list of float, shape (nfolds+1,)):
            ``scores[i]`` denotes validation score in i-th fold.
            ``scores[-1]`` is the overall score. Empty list if ``eval_func`` is not specified.
        * importance (list of pandas DataFrame, shape (nfolds,)):
            ``importance[i]`` denotes feature importance in i-th fold model.
            If the estimator is not GBDT, an empty list is returned.

    Example:
        >>> from sklearn.datasets import make_regression
        >>> from sklearn.linear_model import Ridge
        >>> from sklearn.metrics import mean_squared_error
        >>> from nyaggle.validation import cross_validate

        >>> X, y = make_regression(n_samples=8)
        >>> model = Ridge(alpha=1.0)
        >>> pred_oof, pred_test, scores, _ = cross_validate(
        ...     model,
        ...     X_train=X[:3, :],
        ...     y=y[:3],
        ...     X_test=X[3:, :],
        ...     cv=3,
        ...     eval_func=mean_squared_error)
        >>> print(pred_oof)
        [-101.1123267 ,   26.79300693,   17.72635528]
        >>> print(pred_test)
        [-10.65095894 -12.18909059 -23.09906427 -17.68360714 -20.08218267]
        >>> print(scores)
        [71912.80290003832, 15236.680239881942, 15472.822033121925, 34207.43505768073]
    """
    cv = check_cv(cv, y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    # Convert before any pandas-only call: ``y`` may arrive as a numpy array,
    # which has no ``nunique``.
    X_train = convert_input(X_train)
    y = convert_input_vector(y, X_train.index)
    if X_test is not None:
        X_test = convert_input(X_test)

    n_output_cols = y.nunique(dropna=True) if type_of_target == 'multiclass' else 1

    # Ask for the split count once, always with the data, so splitters whose
    # count depends on X / y / groups are handled the same way everywhere.
    n_splits = cv.get_n_splits(X_train, y, groups)

    if isinstance(estimator, list):
        assert len(estimator) == n_splits, "Number of estimators should be same to nfolds."
    else:
        estimator = [estimator] * n_splits

    if logger is None:
        logger = getLogger(__name__)

    def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
        if _predict_proba:
            proba = model.predict_proba(x)
            return proba[:, 1] if proba.shape[1] == 2 else proba
        return model.predict(x)

    def _empty_prediction(n_rows: int):
        # 2-D only when there is more than one output column.
        if n_output_cols > 1:
            return np.zeros((n_rows, n_output_cols))
        return np.zeros(n_rows)

    oof = _empty_prediction(len(X_train))
    # Marks the rows that received an out-of-fold prediction; the overall
    # score is computed on those rows only.
    evaluated = np.full(len(X_train), False)
    test = _empty_prediction(len(X_test)) if X_test is not None else None

    scores = []
    importance = []

    for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y, groups)):
        start_time = time.time()

        train_x, train_y = X_train.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], y.iloc[valid_idx]

        if fit_params is None:
            fit_params_fold = {}
        elif callable(fit_params):
            fit_params_fold = fit_params(n, train_idx, valid_idx)
        else:
            # Shallow copy so the per-fold additions below do not leak into
            # the caller's dict.
            fit_params_fold = copy.copy(fit_params)

        is_gbdt = isinstance(estimator[n], (LGBMModel, CatBoost))
        if is_gbdt and early_stopping:
            if 'eval_set' not in fit_params_fold:
                fit_params_fold['eval_set'] = [(valid_x, valid_y)]
            if 'early_stopping_rounds' not in fit_params_fold:
                fit_params_fold['early_stopping_rounds'] = 100

        estimator[n].fit(train_x, train_y, **fit_params_fold)

        oof[valid_idx] = _predict(estimator[n], valid_x, predict_proba)
        evaluated[valid_idx] = True

        if X_test is not None:
            test += _predict(estimator[n], X_test, predict_proba)

        if on_each_fold is not None:
            on_each_fold(n, estimator[n], train_x, train_y)

        if is_gbdt:
            importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))

        if eval_func is not None:
            score = eval_func(valid_y, oof[valid_idx])
            scores.append(score)
            logger.info('Fold {} score: {}'.format(n, score))

        elapsed = time.time() - start_time
        logger.debug('{:.3f} sec / fold'.format(elapsed))

    if eval_func is not None:
        score = eval_func(y.loc[evaluated], oof[evaluated])
        scores.append(score)
        logger.info('Overall score: {}'.format(score))

    # Test predictions were summed fold by fold; average them here.
    predicted = test / n_splits if X_test is not None else None

    return CVResult(oof, predicted, scores, importance)
def test_convert_input_vector(self):
    """Check ``convert_input_vector`` across the supported input kinds.

    Inputs without their own index must take the supplied one, pandas
    inputs must keep theirs, scalars become length-one Series, ambiguous
    2-D inputs must raise ValueError, and empty inputs are only required
    not to fail in an unhelpful way.
    """
    index = [2, 3, 4]

    def check_three_rows(result, expected_index, err_msg=''):
        self.assertTrue(isinstance(result, pd.Series))
        self.assertEqual(3, len(result))
        np.testing.assert_array_equal(result.index, expected_index, err_msg)

    # Plain containers: the supplied index is applied.
    for raw in (
        [0, 1, 0],                    # list
        [[0, 1, 0]],                  # list of lists (row)
        [[0], [1], [0]],              # list of lists (column)
        np.array([1, 0, 1]),          # np vector
        np.array([[1, 0, 1]]),        # np matrix row
        np.array([[1], [0], [1]]),    # np matrix column
    ):
        check_three_rows(convert_input_vector(raw, index), [2, 3, 4])

    # pandas inputs: the original index wins over the supplied one.
    for labelled in (
        pd.Series([0, 1, 0], index=[4, 5, 6]),            # series
        pd.DataFrame({'y': [0, 1, 0]}, index=[4, 5, 6]),  # dataFrame
    ):
        check_three_rows(convert_input_vector(labelled, index), [4, 5, 6],
                         'We want to preserve the original index')

    check_three_rows(convert_input_vector((0, 1, 0), index), [2, 3, 4])  # tuple

    for scalar in (0, 'a'):  # scalar
        result = convert_input_vector(scalar, [2])
        self.assertTrue(isinstance(result, pd.Series))
        self.assertEqual(1, len(result))
        self.assertTrue(result.index == [2])

    # multiple columns and rows should cause an error because it is unclear which column/row to use as the target
    for ambiguous in (
        pd.DataFrame({'col1': [0, 1, 0], 'col2': [1, 0, 1]}),
        np.array([[0, 1], [1, 0], [0, 1]]),
        [[0, 1], [1, 0], [0, 1]],
    ):
        self.assertRaises(ValueError, convert_input_vector, ambiguous, index)

    # edge scenarios (it is ok to raise an exception but please, provide then a helpful exception text)
    for empty in (pd.Series(), [], [[]], pd.DataFrame()):
        _ = convert_input_vector(empty, [])
def run(trainer: Trainer, train: TYPE_DATASET, test: Optional[TYPE_DATASET] = None, target: Optional[TYPE_DATASET] = None, scoring: Optional[Callable] = None, cv: TYPE_CV = None, groups: Optional[pd.Series] = None, logger: Optional[logging.RootLogger] = None, type_of_target: str = 'auto'):
    """Cross-validate ``trainer`` and collect its predictions.

    For each split of ``cv`` the trainer is fitted on the training rows and
    predicts the held-out rows (and ``test`` when given), using
    ``predict_proba`` if ``trainer.is_classifier`` is true and ``predict``
    otherwise.

    Args:
        trainer: Object exposing ``train``, ``predict`` / ``predict_proba``,
            ``is_classifier`` and ``get_feature_importance``.
        train: Training features.
        test: Optional test features.
        target: Training target.
        scoring: Optional ``scoring(y_true, y_pred)`` callable; its results
            are only logged.
        cv: Splitter providing ``split``, ``n_splits`` and ``get_n_splits``.
        groups: Group labels forwarded to the splitter.
        logger: Logger to use; defaults to ``getLogger(__name__)``.
        type_of_target: ``'auto'`` infers it with ``multiclass.type_of_target``;
            ``'multiclass'`` allocates one output column per distinct target.

    Returns:
        Tuple ``(oof, prediction, feature_importance)``: out-of-fold
        predictions, test predictions averaged over the splits (None without
        ``test``), and the per-fold feature importances.
    """
    if logger is None:
        logger = getLogger(__name__)

    train, target, groups = indexable(train, target, groups)
    train = convert_input(train)
    target = convert_input_vector(target, train.index)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(target)
    n_output_cols = target.nunique(dropna=True) if type_of_target == 'multiclass' else 1

    def _blank(n_rows):
        # 2-D only when there is more than one output column.
        if n_output_cols > 1:
            return np.zeros((n_rows, n_output_cols))
        return np.zeros(n_rows)

    def _infer(frame):
        if trainer.is_classifier:
            return trainer.predict_proba(frame)
        return trainer.predict(frame)

    oof = _blank(len(train))
    predictions = None
    if test is not None:
        test = convert_input(test)
        predictions = _blank(len(test))

    feature_importance = []
    scores = []

    for idx, (trn_idx, val_idx) in enumerate(cv.split(train, target, groups)):
        logger.info('Fold: {}/{}'.format(idx + 1, cv.n_splits))
        logger.info('Length train: {} / valid: {}'.format(
            len(trn_idx), len(val_idx)))

        fit_x, fit_y = train.iloc[trn_idx], target.iloc[trn_idx]
        holdout_x, holdout_y = train.iloc[val_idx], target.iloc[val_idx]

        trainer.train(fit_x, fit_y)
        oof[val_idx] = _infer(holdout_x)

        if test is not None:
            predictions += _infer(test)

        if scoring is not None:
            fold_score = scoring(holdout_y, oof[val_idx])
            logger.info("Fold {} Score: {}".format(idx, fold_score))
            scores.append(fold_score)

        feature_importance.append(trainer.get_feature_importance())

    if scoring is not None:
        logger.info("Overall Score: {}".format(scoring(target, oof)))

    if test is None:
        prediction = None
    else:
        # The per-fold test predictions were summed; average them.
        prediction = predictions / cv.get_n_splits(train, target, groups)

    return oof, prediction, feature_importance
def stacking(test_predictions: List[np.ndarray],
             oof_predictions: List[np.ndarray],
             y: pd.Series,
             estimator: Optional[BaseEstimator] = None,
             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
             groups: Optional[pd.Series] = None,
             type_of_target: str = 'auto',
             eval_func: Optional[Callable] = None) -> EnsembleResult:
    """
    Perform stacking on predictions.

    The first-level predictions are placed side by side as feature columns
    and a second-level model is cross-validated on them.

    Args:
        test_predictions:
            List of predicted values on test data.
        oof_predictions:
            List of predicted values on out-of-fold training data.
        y:
            Target value
        estimator:
            Estimator used for the 2nd-level model.
            If ``None``, a grid-searched linear model is used: ``Ridge`` for a
            ``continuous`` target, ``LogisticRegression`` otherwise.
        cv:
            int, cross-validation generator or an iterable which determines the cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (the instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a "Group" cv instance (e.g., ``GroupKFold``).
        type_of_target:
            The type of target variable. If ``auto``, type is inferred by ``sklearn.utils.multiclass.type_of_target``.
            Otherwise, ``binary``, ``continuous``, or ``multiclass`` are supported.
        eval_func:
            Evaluation metric used for calculating result score.
    Returns:
        ``EnsembleResult`` built, in this order, from

        * the test prediction of the 2nd-level cross-validation,
        * its out-of-fold prediction,
        * the overall score (last entry of the cross-validation scores), or
          ``None`` when no score was computed.
    """
    assert len(oof_predictions) == len(test_predictions), "Number of oof and test predictions should be same"

    def _as_feature_matrix(preds):
        # 1-D predictions become single columns before horizontal stacking.
        columns = preds
        if preds[0].ndim == 1:
            columns = [p.reshape(len(p), -1) for p in preds]
        return np.hstack(columns)

    X_train = convert_input(_as_feature_matrix(oof_predictions))
    y = convert_input_vector(y, X_train.index)
    X_test = convert_input(_as_feature_matrix(test_predictions))

    assert len(X_train) == len(y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    if estimator is None:
        # if estimator is None, tuned linear estimator is used
        if type_of_target == 'continuous':
            # NOTE(review): ``normalize`` was removed from Ridge in newer
            # scikit-learn releases — confirm the supported version range.
            base_model = Ridge(normalize=True, random_state=0)
            search_space = {
                'alpha': [0.001, 0.01, 0.1, 1, 10],
            }
        else:
            base_model = LogisticRegression(random_state=0)
            search_space = {
                'penalty': ['l1', 'l2'],
                'C': [0.001, 0.01, 0.1, 1, 10],
            }
        tuner = GridSearchCV(base_model, search_space, cv=cv)
        tuner.fit(X_train, y, groups=groups)
        estimator = tuner.best_estimator_

    result = cross_validate(estimator, X_train, y, X_test, cv=cv, groups=groups, eval_func=eval_func)

    if result.scores:
        score = result.scores[-1]
    else:
        score = None

    return EnsembleResult(result.test_prediction, result.oof_prediction, score)
def fit(self, X, y, **kwargs):
    """Fit encoder according to X and binary y.

    A parent ``MEstimateEncoder`` is fitted first on the columns listed as
    values of ``self.feature_mapping``. The columns in ``self.cols`` are then
    ordinal-encoded and passed to ``self._train`` to build ``self.mapping``.

    Parameters
    ----------

    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape = [n_samples]
        Binary target values.

    Returns
    -------

    self : encoder
        Returns self.

    Raises
    ------

    ValueError
        If X and y differ in length, if ``handle_missing`` is ``"error"``
        and a column to be encoded contains nulls, or if a column appears
        both as a key and as a value of ``self.feature_mapping``.

    """

    # Create parent encoder and fit it
    self.parent_cols = list(self.feature_mapping.values())
    self.parent_encoder = MEstimateEncoder(
        verbose=self.verbose,
        cols=self.parent_cols,
        drop_invariant=self.drop_invariant,
        return_df=self.return_df,
        handle_unknown=self.handle_unknown,
        handle_missing=self.handle_missing,
        random_state=self.random_state,
        randomized=self.randomized,
        sigma=self.sigma,
        m=self.m_prior,
    )
    self.parent_encoder.fit(X, y)

    # Unite parameters into pandas types
    X = util.convert_input(X)
    y = util.convert_input_vector(y, X.index).astype(float)

    # The lengths must be equal
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

    self._dim = X.shape[1]

    # If columns aren't passed, just use every string column
    if self.cols is None:
        self.cols = util.get_obj_cols(X)
    else:
        self.cols = util.convert_cols_to_list(self.cols)

    if self.handle_missing == "error":
        if X[self.cols].isnull().any().any():
            raise ValueError("Columns to be encoded can not contain null")

    # Check that children and parents are disjoint
    children = set(self.feature_mapping.keys())
    parents = set(self.feature_mapping.values())
    if len(children.intersection(parents)) > 0:
        raise ValueError("No column should be a child and a parent")

    self.ordinal_encoder = OrdinalEncoder(
        verbose=self.verbose,
        cols=self.cols,
        handle_unknown="value",
        handle_missing="value",
    )
    self.ordinal_encoder = self.ordinal_encoder.fit(X)
    X_ordinal = self.ordinal_encoder.transform(X)

    # Training
    self.mapping = self._train(X_ordinal, y)

    X_temp = self.transform(X, override_return_df=True)
    self.feature_names = X_temp.columns.tolist()

    # Store column names with approximately constant variance on the training data
    if self.drop_invariant:
        generated_cols = util.get_generated_cols(X, X_temp, self.cols)
        self.drop_cols = [
            x for x in generated_cols if X_temp[x].var() <= 10e-5
        ]
        for col in self.drop_cols:
            try:
                self.feature_names.remove(col)
            except ValueError as e:
                # list.remove reports a missing item with ValueError (not
                # KeyError); handle it per column so that one missing name
                # does not stop the remaining columns from being removed.
                if self.verbose > 0:
                    print("Could not remove column from feature names."
                          "Not found in generated cols.\n{}".format(e))

    return self
def test_convert_input_vector(self):
    """Check ``convert_input_vector`` against the supported input shapes.

    Covers list / tuple / numpy inputs (which must adopt the supplied index),
    pandas inputs (which must keep their own index), scalars, ambiguous
    two-dimensional inputs (which must raise ``ValueError``) and empty inputs.
    """
    index = [2, 3, 4]
    keep_index_msg = 'We want to preserve the original index'

    # (input vector, index expected on the result, assertion message)
    three_element_cases = [
        ([0, 1, 0], [2, 3, 4], ''),  # list
        ([[0, 1, 0]], [2, 3, 4], ''),  # list of lists (row)
        ([[0], [1], [0]], [2, 3, 4], ''),  # list of lists (column)
        (np.array([1, 0, 1]), [2, 3, 4], ''),  # np vector
        (np.array([[1, 0, 1]]), [2, 3, 4], ''),  # np matrix row
        (np.array([[1], [0], [1]]), [2, 3, 4], ''),  # np matrix column
        (pd.Series([0, 1, 0], index=[4, 5, 6]), [4, 5, 6], keep_index_msg),  # series
        (pd.DataFrame({'y': [0, 1, 0]}, index=[4, 5, 6]), [4, 5, 6], keep_index_msg),  # dataFrame
        ((0, 1, 0), [2, 3, 4], ''),  # tuple
    ]
    for vector, expected_index, err_msg in three_element_cases:
        result = convert_input_vector(vector, index)
        self.assertTrue(isinstance(result, pd.Series))
        self.assertEqual(3, len(result))
        np.testing.assert_array_equal(result.index, expected_index, err_msg)

    # scalars become a single-element series carrying the given index
    for scalar in (0, 'a'):
        result = convert_input_vector(scalar, [2])
        self.assertTrue(isinstance(result, pd.Series))
        self.assertEqual(1, len(result))
        self.assertTrue(result.index == [2])

    # multiple columns and rows should cause an error because it is unclear
    # which column/row to use as the target
    ambiguous_inputs = (
        pd.DataFrame({'col1': [0, 1, 0], 'col2': [1, 0, 1]}),
        np.array([[0, 1], [1, 0], [0, 1]]),
        [[0, 1], [1, 0], [0, 1]],
    )
    for matrix in ambiguous_inputs:
        with self.assertRaises(ValueError):
            convert_input_vector(matrix, index)

    # edge scenarios (it is ok to raise an exception but please, provide then
    # a helpful exception text)
    for empty in (pd.Series(), [], [[]], pd.DataFrame()):
        convert_input_vector(empty, [])