def set_params(self, **params): """Set parameters for the wrapper and the wrapped estimator. This method is required for compatibility with GridSearchCV. :param params: A dictionary of parameters for the wrapper and wrapped estimator. If a key doesn't match the name of a wrapper parameter, it is assumed to be for the wrapped estimator. TODO: it would be better to do what sklearn's pipeline does and provide some namespacing in case the wrapper and wrapped class share a parameter name :return: self """ if not params: return self valid_params = self.get_params(deep=True) model_params = self.model_params wrapper_params = {} for key, value in params.iteritems(): if key in valid_params: wrapper_params[key] = value else: model_params[key] = value wrapper_params['model_params'] = model_params BaseEstimator.set_params(self, **wrapper_params) return self
def set_params(self, **params): """ Custom setting of parameters for generative models. All parameters that start with 'x_prep', 'y_prep', 'y_post' are delegated to respective preprocessors. """ elements = {'augm', 'X_prep', 'Y_prep', 'Y_post', 'model'} self_params = { k: v for k, v in params.items() if not any(k.startswith(p.lower()) for p in elements) } BaseEstimator.set_params(self, **self_params) # set attributes of elements for e in elements: element = getattr(self, e) if isinstance(element, BaseEstimator): subprm = { k[len(e) + 2:]: v for k, v in params.items() if k.startswith(e.lower()) } element.set_params(**subprm) return self
def set_params(self, **params): """ Set the parameters of this estimator. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ items = self.steps names, _ = zip(*items) keys = list(params.keys()) for name in keys: if '__' not in name and name in names: # replace an estimator self._replace_estimator('steps', name, params.pop(name)) if callable(params[name]): # use a callable or function to set parameters params[name] = params[name](params) elif params[name] in keys: # set one arg from another params[name] = params[params[name]] BaseEstimator.set_params(self, **params) return self
def set_params(self, **kwargs): """Set the parameters of this class. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ normal_params, special_params = {}, {} for key, val in kwargs.items(): if any(key.startswith(prefix) for prefix in self.prefixes_): special_params[key] = val else: normal_params[key] = val BaseEstimator.set_params(self, **normal_params) for key, val in special_params.items(): if key.endswith('_'): raise ValueError("Not sure: Should this ever happen?") else: setattr(self, key, val) if any(key.startswith('criterion') for key in special_params): self.initialize_criterion() if any(key.startswith('callbacks') for key in special_params): self.initialize_callbacks() if any(key.startswith('module') for key in special_params): self.initialize_module() self.initialize_optimizer() if any(key.startswith('optimizer') for key in special_params): self.initialize_optimizer() return self
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    smallest_loss = np.inf
    best_params = {"bostonfeaturestransformer__degree": 1,
                   "linearregressor__reg_lambda": 0.2}
    for lam in lambda_range:
        for deg in degree_range:
            model.set_params(linearregressor__reg_lambda=lam,
                             bostonfeaturestransformer__degree=deg)
            avg_mse = 0.0
            for train_i, test_i in kf.split(X):
                model.fit(X[train_i], y[train_i])
                y_pred = model.predict(X[test_i])
                # Mean squared error on the held-out fold.
                avg_mse += np.square(y[test_i] - y_pred).mean()
            avg_mse /= k_folds
            # Check if the current params are the best so far.
            if avg_mse <= smallest_loss:
                smallest_loss = avg_mse
                best_params = {"linearregressor__reg_lambda": lam,
                               "bostonfeaturestransformer__degree": deg}
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    params = {
        'linearregressor__reg_lambda': lambda_range,
        'bostonfeaturestransformer__degree': degree_range
    }
    kf = KFold(n_splits=k_folds)
    best_params = ParameterGrid(params)[0]
    best_r_2 = 0.0
    for p_dict in ParameterGrid(params):
        cur_mse = 0.0
        curr_r_2 = 0.0
        model.set_params(**p_dict)
        for train_index, test_index in kf.split(X):
            model.fit(X[train_index], y=y[train_index])
            mse, rsq = evaluate_accuracy(y[test_index],
                                         model.predict(X[test_index]))
            cur_mse += mse
            curr_r_2 += rsq
        cur_mse /= k_folds
        curr_r_2 /= k_folds
        # Select by the highest average R^2 across folds.
        if curr_r_2 > best_r_2:
            best_r_2 = curr_r_2
            best_params = p_dict
    # ========================
    return best_params
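# The notes in the cv_best_hyperparams variants recommend the built-in
# sklearn utilities; a minimal sketch of the same search via GridSearchCV,
# assuming the pipeline exposes the same 'linearregressor__reg_lambda' and
# 'bostonfeaturestransformer__degree' parameter names:
from sklearn.model_selection import GridSearchCV

def cv_best_hyperparams_gridsearch(model, X, y, k_folds,
                                   degree_range, lambda_range):
    param_grid = {
        'linearregressor__reg_lambda': lambda_range,
        'bostonfeaturestransformer__degree': degree_range,
    }
    # GridSearchCV clones the model, runs k-fold CV per combination,
    # and keeps the combination with the best mean test score.
    search = GridSearchCV(model, param_grid, cv=k_folds, scoring='r2')
    search.fit(X, y)
    return search.best_params_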
def set_cpu_params(estimator: BaseEstimator, num_cpus: int) -> None:
    """Sets all CPU-related params to num_cpus (incl. nested)."""
    cpu_params = {
        param: num_cpus
        for param in estimator.get_params(deep=True)
        if any(
            param.endswith(cpu_param_name)
            for cpu_param_name in SKLEARN_CPU_PARAM_NAMES)
    }
    estimator.set_params(**cpu_params)
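# SKLEARN_CPU_PARAM_NAMES is referenced above but not defined in this
# snippet; the definition below is an assumption ('n_jobs' is sklearn's
# usual CPU-count parameter name), followed by a usage sketch:
SKLEARN_CPU_PARAM_NAMES = ('n_jobs',)  # assumed contents

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), RandomForestClassifier())
# Sets the nested parameter randomforestclassifier__n_jobs to 4;
# StandardScaler has no matching parameter and is left untouched.
set_cpu_params(pipe, num_cpus=4)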
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    best_params = {
        'bostonfeaturestransformer__degree': 0,
        'linearregressor__reg_lambda': 0
    }
    # R^2 can be negative, so start from -inf rather than 0.
    best_score = -np.inf
    for reg_lambda in lambda_range:
        for deg in degree_range:
            hypers = {
                'bostonfeaturestransformer__degree': deg,
                'linearregressor__reg_lambda': reg_lambda
            }
            model.set_params(**hypers)
            scores = sklearn.model_selection.cross_validate(
                model, X, y, scoring='r2', cv=k_folds)
            mean_score = np.mean(scores['test_score'])
            if mean_score > best_score:
                best_score = mean_score
                best_params['linearregressor__reg_lambda'] = reg_lambda
                best_params['bostonfeaturestransformer__degree'] = deg
    # ========================
    return best_params
def set_params(self, **kwargs): """Update the parameters of the feature extractor.""" # We don't want non-functional arguments polluting kwargs params = kwargs.copy() for k in ['function', 'target']: params.pop(k, None) self.kwargs.update(params) BaseEstimator.set_params(self, **kwargs)
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = None
    min_mse = np.inf
    for curr_degree in degree_range:
        for curr_lambda in lambda_range:
            params = dict(linearregressor__reg_lambda=curr_lambda,
                          bostonfeaturestransformer__degree=curr_degree)
            model.set_params(**params)
            mse = 0
            counter = 0
            for train_index, test_index in kf.split(X):
                counter += 1
                model.fit(X[train_index], y[train_index])
                y_pred = model.predict(X[test_index])
                mse += np.mean((y[test_index] - y_pred) ** 2)
            avg_mse = mse / counter
            if avg_mse < min_mse:
                best_params = params
                min_mse = avg_mse
    # ========================
    return best_params
def set_params(self, **params):
    # If feature selection is disabled, keep all features (k='all');
    # a 'k' value in params may still override this below.
    if 'use_feature_selection' in params:
        if not params['use_feature_selection']:
            BaseEstimator.set_params(self, k='all')
    BaseEstimator.set_params(self, **params)
    return self
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    params_grid = sklearn.model_selection.ParameterGrid({
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    })
    best_params = None
    # Average R^2 across folds; it can be negative, so start from -inf.
    best_score = -np.inf
    for param in params_grid:
        model.set_params(**param)
        avg_score = 0.0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            avg_score += r2_score(y_test, y_pred)
        avg_score /= k_folds
        if avg_score > best_score:
            best_score = avg_score
            best_params = param
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    params_grid = {
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    }
    best_params = None
    min_mse = np.inf
    for params in sklearn.model_selection.ParameterGrid(params_grid):
        model.set_params(**params)
        curr_mse = 0
        for train_idx, test_idx in kf.split(X):
            train_x, train_y = X[train_idx], y[train_idx]
            test_x, test_y = X[test_idx], y[test_idx]
            model.fit(train_x, train_y)
            y_pred = model.predict(test_x)
            curr_mse += mse_score(test_y, y_pred)
        mean_mse = curr_mse / k_folds
        if mean_mse < min_mse:
            min_mse = mean_mse
            best_params = params
    # ========================
    return best_params
def set_params(self, **kwargs): """Set the parameters of this class. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ self._check_deprecated_params(**kwargs) normal_params, cb_params, special_params = {}, {}, {} for key, val in kwargs.items(): if key.startswith('callbacks'): cb_params[key] = val elif any(key.startswith(prefix) for prefix in self.prefixes_): special_params[key] = val else: normal_params[key] = val BaseEstimator.set_params(self, **normal_params) for key, val in special_params.items(): if key.endswith('_'): raise ValueError("Not sure: Should this ever happen?") else: setattr(self, key, val) if cb_params: # callbacks need special treatmeant since they are list of tuples self.initialize_callbacks() self._set_params_callback(**cb_params) if any(key.startswith('criterion') for key in special_params): self.initialize_criterion() if any(key.startswith('module') for key in special_params): self.initialize_module() self.initialize_optimizer() if any(key.startswith('optimizer') for key in special_params): # Model selectors such as GridSearchCV will set the # parameters before .initialize() is called, therefore we # need to make sure that we have an initialized model here # as the optimizer depends on it. if not hasattr(self, 'module_'): self.initialize_module() self.initialize_optimizer() vars(self).update(kwargs) return self
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    DEGREE_PARAM = "bostonfeaturestransformer__degree"
    LAMBDA_PARAM = "linearregressor__reg_lambda"
    results = {}
    for degree in degree_range:
        for reg_lambda in lambda_range:
            params = model.get_params()
            params[DEGREE_PARAM] = degree
            params[LAMBDA_PARAM] = reg_lambda
            model.set_params(**params)
            # neg_mean_squared_error: higher (closer to 0) is better.
            scores = sklearn.model_selection.cross_val_score(
                model, X, y, scoring="neg_mean_squared_error", cv=k_folds)
            score = np.mean(scores)
            results[score] = params
    best_params = max(results.items(), key=lambda x: x[0])[1]
    # ========================
    return best_params
def __init__(self, blending_regressor: BaseEstimator, model_name: str,
             params: dict):
    super().__init__(model_name, params)
    self.blend_model = BlendingRegressor(
        blending_regressor.set_params(**params))
    self.MODELS_SERIALIZING_BASEPATH = self.path.join(
        self.MODELS_SERIALIZING_BASEPATH, MACHINE_LEARNING_TECHNIQUE_NAME)
    self.SERIALIZE_FILENAME_PREFIX = SERIALIZE_FILENAME_PREFIX
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    from sklearn.model_selection import cross_validate
    best_score = None
    best_lambda = None
    best_degree = None
    for current_degree in degree_range:
        for current_lambda in lambda_range:
            model.set_params(
                linearregressor__reg_lambda=current_lambda,
                bostonfeaturestransformer__degree=current_degree)
            scores = cross_validate(model, X, y, cv=k_folds, scoring='r2')
            avg_score = np.average(scores['test_score'])
            # The first combination must record its hyperparameters too,
            # otherwise None could be returned when it happens to be best.
            if best_score is None or avg_score > best_score:
                best_degree = current_degree
                best_lambda = current_lambda
                best_score = avg_score
    best_params = {}
    best_params['bostonfeaturestransformer__degree'] = best_degree
    best_params['linearregressor__reg_lambda'] = best_lambda
    return best_params
def set_params(self, **params):
    BaseEstimator.set_params(self, **params)
    # If feature selection is disabled, keep all features (k='all').
    if 'use_feature_selection' in params:
        if not params['use_feature_selection']:
            BaseEstimator.set_params(self, k='all')
    return self
def set_params(self, **params): """Set the parameters of this estimator. The optimizer works on simple estimators as well as on nested objects (such as pipelines). The latter have parameters of the form ``<component>__<parameter>`` so that it's possible to update each component of a nested object. Returns ------- self """ G_agg = params.pop('G_agg', None) if G_agg == 'full' and self.G_agg != 'full': if hasattr(self, 'components_'): self.G_ = self.components_.dot(self.components_.T) self.G_agg = 'full' BaseEstimator.set_params(self, **params)
def set_params(self, **params): """Set the parameters of this estimator. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ return BaseEstimator.set_params(self, **params)
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    from copy import deepcopy as dc
    score_optimal = float('-inf')
    parameter = model.get_params()
    best_params = parameter.copy()
    for d in degree_range:
        for l in lambda_range:
            # Update both the nested estimator objects and the
            # corresponding flattened '__' keys.
            parameter['bostonfeaturestransformer'].degree = d
            parameter['bostonfeaturestransformer__degree'] = d
            parameter['linearregressor'].reg_lambda = l
            parameter['linearregressor__reg_lambda'] = l
            score = np.mean(
                sklearn.model_selection.cross_validate(
                    model.set_params(**parameter), X, y, cv=k_folds,
                    scoring='neg_mean_squared_error')['test_score'])
            # Compare before updating the optimum; updating score_optimal
            # first would make the best_params update unreachable.
            if score > score_optimal:
                score_optimal = score
                best_params = dc(parameter)
    # ========================
    return best_params
def train_test_model(model: BaseEstimator, param_dict, data_dict,
                     pp_dict=PP_DICT, savefig=None, save_model=True):
    X_train, X_test, y_train, y_test = DataManager.load_tts_data(**data_dict)
    log_tf = pp_dict.get('log_tf', True)
    models = []
    if pp_dict:
        if pp_dict.get('std_scale', False):
            with_mean = pp_dict.get('with_mean', True)
            models.append(StandardScaler(with_mean=with_mean))
        n_comp = pp_dict.get('n_components', -1)
        if n_comp > 0:
            models.append(PCA(n_components=n_comp))
    model.set_params(**param_dict)
    models.append(model)
    pipeline = make_pipeline(*models)
    y_tf = (y_train if not log_tf else DataManager.log_tf(y_train)).iloc[:, 0]
    pipeline.fit(X_train, y_tf)
    m_err, r2 = compute_regression_result(pipeline, X_train, X_test,
                                          y_train, y_test, log_tf=log_tf)
    if save_model:
        store_pipeline(pipeline, param_dict, data_dict, pp_dict, m_err, r2)
    return pipeline, m_err, r2
def set_params(self, **params): """ Set the parameters of this estimator. The method works on simple estimators as well as on nested objects (such as pipelines). The latter have parameters of the form ``<component>__<parameter>`` so that it's possible to update each component of a nested object. Returns ------- self """ if self.compat: BaseEstimator.set_params(self, **params) else: for p in self._get_param_names(): if p in params: setattr(self, p, params.pop(p)) self.estimator.set_params(**params) return self
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = {}
    best_mse = None
    best_degree = None
    best_lambda = None
    for degree in degree_range:
        for lambda_r in lambda_range:
            mse = 0
            cnt = 0
            model.set_params(bostonfeaturestransformer__degree=degree,
                             linearregressor__reg_lambda=lambda_r)
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model.fit(X_train, y_train)
                y_test_pred = model.predict(X_test)
                mse += np.sum((y_test - y_test_pred) ** 2)
                cnt += y_test.shape[0]
            mse /= cnt
            if best_mse is None or best_mse > mse:
                best_mse = mse
                best_degree = degree
                best_lambda = lambda_r
    best_params['bostonfeaturestransformer__degree'] = best_degree
    best_params['linearregressor__reg_lambda'] = best_lambda
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    best_params = None
    # ====== YOUR CODE: ======
    best_mse = np.inf
    # Splitting the data k-fold
    k_folder = sklearn.model_selection.KFold(k_folds)
    # Iterating over all parameter combinations
    for degree_param in degree_range:
        for lambda_param in lambda_range:
            # Defining current params and setting the model
            params = {
                'bostonfeaturestransformer__degree': degree_param,
                'linearregressor__reg_lambda': lambda_param
            }
            model.set_params(**params)
            avg_mse = 0
            # Checking params on all k folds
            for train_indices, val_indices in k_folder.split(X):
                train_X, train_y = X[train_indices], y[train_indices]
                val_X, val_y = X[val_indices], y[val_indices]
                # Training model on the training split
                model.fit(train_X, train_y)
                # Evaluating MSE on the validation split
                y_pred = model.predict(val_X)
                mse = np.mean((val_y - y_pred) ** 2)
                avg_mse += mse
            # Averaging over all k folds
            avg_mse = avg_mse / k_folds
            # Updating best params
            if avg_mse < best_mse:
                best_mse = avg_mse
                best_params = params
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    best_params = None
    best_loss = None
    for degree in degree_range:
        for lambda_val in lambda_range:
            # create the hyperparameters for the model
            params = dict(linearregressor__reg_lambda=lambda_val,
                          bostonfeaturestransformer__degree=degree)
            model.set_params(**params)
            # k-fold split
            kf = KFold(n_splits=k_folds)
            loss = 0
            for fold_ind_train, fold_ind_test in kf.split(X):
                # getting the fold values
                train_X_fold = X[fold_ind_train]
                train_y_fold = y[fold_ind_train]
                test_X_fold = X[fold_ind_test]
                test_y_fold = y[fold_ind_test]
                model.fit(train_X_fold, train_y_fold)
                y_pred = model.predict(test_X_fold)
                mse = np.mean((test_y_fold - y_pred) ** 2)
                loss += mse
            # Keep the parameters with the lowest accumulated loss.
            if best_loss is None or loss < best_loss:
                best_loss = loss
                best_params = params
    # ========================
    return best_params
def _train_and_evaluate(estimator: BaseEstimator,
                        params: Dict[str, Any],
                        model_id: int,
                        grid_search_context: Dict[str, Any]) -> None:
    # Unpack the grid search context.
    output_dir = grid_search_context['output_dir']
    cross_validation = grid_search_context['cross_validation']
    validation_file = grid_search_context['validation_file']
    target_col = grid_search_context['target_col']
    training_file = grid_search_context['training_file']

    param_str = ", ".join(
        ["{}={}".format(param_name, param_value)
         for param_name, param_value in params.items()])
    logger.info("Training and evaluating model {}: {}"
                .format(model_id, param_str))

    model_file = "{}/model_{}.pkl".format(output_dir, model_id)
    results_file = "{}/results_{}.json".format(output_dir, model_id)

    # If the results file already exists, skip this pass.
    if os.path.exists(results_file):
        logger.info("Model {} already exists, skipping.".format(model_id))
        return

    # Initialize the estimator with the params.
    estimator.set_params(**params)

    cv_results = {}
    # Perform cross validation if selected.
    if cross_validation is not None:
        logger.info("Cross validating model {} for {} folds.".format(
            model_id, cross_validation))
        cv_results = _cross_validate(estimator, model_id,
                                     grid_search_context)

    logger.info(
        "Training model {} and evaluating the model on the training set."
        .format(model_id))
    estimator, training_results = _train_model(estimator,
                                               grid_search_context)
    logger.info("Model {} trained in {:.3f} seconds.".format(
        model_id, training_results["training_time_total"]))
    logger.info(
        "Model {} training set prediction time: {:.3f} for {} records."
        .format(model_id,
                training_results["training_total_prediction_time"],
                training_results["training_total_prediction_records"]))

    # If the validation set is defined, use _evaluate_model to evaluate the
    # model. Otherwise this stays an empty dict (so the results merge below
    # never references an unbound name).
    validation_results = {}
    if validation_file is not None:
        logger.info(
            "Evaluating model {} on the validation set.".format(model_id))
        validation_results = _evaluate_model(
            estimator,
            grid_search_context['X_validation'],
            grid_search_context['y_validation'],
            grid_search_context,
            "validation")
        if len(validation_results) > 0:
            logger.info(
                "Model {} validation set evaluation time: {:.3f} for {} records."
                .format(
                    model_id,
                    validation_results["validation_total_prediction_time"],
                    validation_results["validation_total_prediction_records"]))

    # Construct and write the results for this run.
    results = {
        "training_file": training_file,
        "target": target_col,
        "model_file": model_file,
        "model_id": model_id,
        **cv_results,
        **training_results,
        **validation_results,
        **params
    }

    # Add the validation set file if present.
    if validation_file:
        results["validation_file"] = validation_file

    # Write the results _after_ the model.
    logger.info("Writing estimator for model {} to {}."
                .format(model_id, model_file))
    joblib.dump(estimator, model_file)
    logger.info("Writing results for model {} to {}."
                .format(model_id, results_file))
    with open(results_file, 'w') as results_out:
        results_out.write(json.dumps(results) + "\n")
def set_params(self, **params):
    BaseEstimator.set_params(self, **params)
def __init__(self,
             base_regressor: BaseEstimator,
             quantiles: list = None,
             quantile_range: Tuple = None,
             step: float = None,
             **base_params):
    """Initializes the QuantileRegressor instance.

    Initializes the quantile regressor by supplying the underlying sklearn
    estimator as well as fixed quantiles or a quantile range and step size.

    Args:
        base_regressor: The underlying sklearn estimator. Must implement a
            fit and predict method as well as accept loss and alpha
            parameters.
        quantiles, optional: List of quantiles on which the model should
            be trained. If no list is provided, the model falls back on
            the quantile_range and step parameters.
        quantile_range, optional: Tuple with a lower and an upper quantile
            bound which provide a range of quantiles on which the model
            should be trained.
        step, optional: Step size which is used to create the model
            quantile range.
        **base_params: Optional keyword arguments which will be passed on
            to the ``base_regressor``.

    Examples:
        The below example illustrates how an instance of the
        QuantileRegressor class can be initialized with a sklearn
        GradientBoostingRegressor instance.

        >>> gbr = GradientBoostingRegressor()
        >>> quantile_reg = QuantileRegressor(gbr, quantiles=[0.4, 0.5, 0.55])
    """
    assert {'loss', 'alpha'}.issubset(base_regressor.get_params().keys()), \
        'Provided base_regressor instance doesn\'t accept a quantile loss function.'
    assert quantiles is not None or (quantile_range is not None and step is not None), \
        'The variable quantiles or the variables quantile_range and step must be specified.'

    # The base model is always trained on the median (alpha=0.5).
    params = {'loss': 'quantile', 'alpha': 0.5}
    base_regressor = clone(base_regressor)
    base_regressor.set_params(**base_params)
    base_regressor.set_params(**params)
    self.base_regressor = base_regressor
    self.fit_quantiles = quantiles
    self.quantile_range = quantile_range
    self.step = step

    model_dict = {}
    self._quantiles = [0.5]
    model_dict['0.5'] = base_regressor
    self.model_dict = model_dict

    # Fall back on the generated range when no explicit list was given.
    quantiles = self.__quantile_creator() \
        if self.fit_quantiles is None and quantile_range is not None and step is not None \
        else self.fit_quantiles
    all_models = [self._create_model_from_quantile(q) for q in quantiles]
    for i in range(len(quantiles)):
        # model_dict keys are strings, so compare against the string form;
        # comparing the raw float would always miss and overwrite '0.5'.
        if '{}'.format(quantiles[i]) not in self.model_dict:
            self.model_dict['{}'.format(quantiles[i])] = all_models[i]
    self._quantiles = sorted(set(self._quantiles + quantiles))