def _benchmark_from_data(
    experiment: Experiment,
    *,
    estimator: BaseEstimator,
    X_train: DataType,
    y_train: TargetType,
    X_test: DataType,
    y_test: TargetType,
    save_train: bool = False,
) -> None:
    with _add_timing(experiment, "fit_time"):
        estimator.fit(X_train, y_train)
    _append_info(experiment, "fitted_estimator", estimator)

    with _add_timing(experiment, "score_time"):
        test_score = estimator.score(X_test, y_test)
    _append_info(experiment, "test_score", test_score)

    if save_train:
        train_score = estimator.score(X_train, y_train)
        _append_info(experiment, "train_score", train_score)

    for output in ("transform", "predict"):
        method = getattr(estimator, output, None)
        if method is not None:
            with _add_timing(experiment, f"{output}_time"):
                _append_info(experiment, f"{output}", method(X_test))
def cross_validate(estimator: BaseEstimator, X: pd.DataFrame, y: pd.DataFrame,
                   num_splits: int, save_name: str) -> None:
    """
    Perform stratified k-fold cross-validation and call error_profile at the end
    to generate an error report for an sklearn model.

    :param estimator: sklearn classification model
    :param X: dataframe containing the data
    :param y: dataframe containing class labels corresponding to X
    :param num_splits: number of folds for k-fold cross-validation
    :param save_name: save name for the error-profile plots (file extension will be appended)
    :return: None
    """
    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=0)
    predictions = {"test": [], "train": []}
    y_true = {"test": [], "train": []}
    for train_index, test_index in splitter.split(X, y):
        estimator.fit(X.iloc[train_index, :], y.iloc[train_index, 0])
        test_pred = estimator.predict(X.iloc[test_index, :])
        train_pred = estimator.predict(X.iloc[train_index, :])
        predictions["train"].append(train_pred)
        predictions["test"].append(test_pred)
        y_true["train"].append(np.array(y.iloc[train_index])[:, 0])
        y_true["test"].append(np.array(y.iloc[test_index])[:, 0])
    error_profile(y_true, predictions, model_type=save_name)
def loop_snippet(clf: BaseEstimator, repeat: int, x, y, xt):
    time_table = []
    for i in range(repeat):
        start = time.perf_counter()
        clf.fit(x, y)
        clf.predict(xt)
        time_table.append(time.perf_counter() - start)
    return time_table
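# A minimal usage sketch for loop_snippet (not from the source): the dataset and the
# LogisticRegression estimator below are illustrative assumptions, shown only to
# demonstrate how the helper is called and that it returns one wall-clock time per run.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X_all, y_all = make_classification(n_samples=200, n_features=10, random_state=0)
    timings = loop_snippet(LogisticRegression(max_iter=200), repeat=5,
                           x=X_all[:150], y=y_all[:150], xt=X_all[150:])
    print(f"fit+predict per run: min={min(timings):.4f}s, max={max(timings):.4f}s")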
def test_determine_offset(model: BaseEstimator, expected_offset: int):
    """
    Determine the correct output offset from the model.
    """
    X, y = np.random.random((100, 10)), np.random.random((100, 10))
    model.fit(X, y)
    offset = ModelBuilder._determine_offset(model, X)
    assert offset == expected_offset
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    smallest_loss = np.inf
    best_params = {"bostonfeaturestransformer__degree": 1,
                   "linearregressor__reg_lambda": 0.2}
    count = 0
    for lam in lambda_range:
        for deg in degree_range:
            model.set_params(linearregressor__reg_lambda=lam,
                             bostonfeaturestransformer__degree=deg)
            avg_mse = 0.0
            count += 1
            for train_i, test_i in kf.split(X):
                x_train = X[train_i]
                y_train = y[train_i]
                model.fit(x_train, y_train)
                y_pred = model.predict(X[test_i])
                avg_mse += np.square(y[test_i] - y_pred).sum() / (2 * X.shape[0])
            avg_mse /= k_folds
            # Check whether the current params are the best so far
            if avg_mse <= smallest_loss:
                smallest_loss = avg_mse
                best_params = {"linearregressor__reg_lambda": lam,
                               "bostonfeaturestransformer__degree": deg}
    # ========================
    print(count)
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    params = {
        'linearregressor__reg_lambda': lambda_range,
        'bostonfeaturestransformer__degree': degree_range
    }
    kf = KFold(n_splits=k_folds)
    best_params = ParameterGrid(params)[0]
    best_mse = np.inf
    best_r_2 = 0.0
    for p_dict in ParameterGrid(params):
        cur_acc = 0.0
        curr_r_2 = 0.0
        model.set_params(**p_dict)
        for train_index, test_index in kf.split(X):
            model.fit(X[train_index], y=y[train_index])
            mse, rsq = evaluate_accuracy(y[test_index], model.predict(X[test_index]))
            cur_acc += mse
            curr_r_2 += rsq
        cur_acc /= k_folds
        curr_r_2 /= k_folds
        if curr_r_2 > best_r_2:
            best_r_2 = curr_r_2
            best_params = p_dict
    # ========================
    return best_params
def test_data_types(est: BaseEstimator, feature, target):
    if hasattr(est, 'fit'):
        # Meaning a Handler or Robust Model
        est.fit(feature, target).predict(feature)
    elif hasattr(est, 'detect'):
        est.detect(feature, target)
    elif hasattr(est, 'simulate_noise'):
        est.simulate_noise(feature, target)
    else:
        raise Exception(f"Unsupported estimator type: {type(est).__name__}")
def auto_mlflow(
    run_name: str,
    model_name: BaseEstimator,
    data_params: dict = None,
    X: np.ndarray = "X_train",
    y: np.ndarray = "y_train",
) -> None:
    """
    Wrapper function that automates the application of mlflow to a model training event.

    Args:
        run_name (str): Desired name of the run; this will appear in the database
        model_name (BaseEstimator): Variable name of the sklearn estimator object
            (must refer to an already instantiated model)
        data_params (dict, optional): Dictionary containing params on the data,
            e.g. {'standard_scaled': False}. Defaults to None.
        X (np.ndarray, optional): Feature array. Defaults to "X_train".
        y (np.ndarray, optional): Target array. Defaults to "y_train".

    Returns:
        None: Logs data to mlflow and prints a representation of the evaluation
        scores to the console
    """
    with mlflow.start_run(run_name=run_name):
        model_name.fit(X, y)

        no_val_rmse, no_val_r2, val_rmse_scores, cv_mean, cv_std, cv_cov = score_model(
            model_name, X, y
        )

        model_params = model_name.get_params()
        mlflow.log_params(data_params)
        mlflow.log_params(model_params)
        mlflow.log_metrics(
            {
                "no_val_rmse": no_val_rmse,
                "no_val_r2": no_val_r2,
                "cv_score_1": val_rmse_scores[0],
                "cv_score_2": val_rmse_scores[1],
                "cv_score_3": val_rmse_scores[2],
                "cv_score_4": val_rmse_scores[3],
                "cv_score_5": val_rmse_scores[4],
                "cv_mean": cv_mean,
                "cv_std": cv_std,
                "cv_cov": cv_cov,
            }
        )
        mlflow.sklearn.log_model(model_name, "model")

    return None
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = None
    min_mse = np.inf
    for curr_degree in degree_range:
        for curr_lambda in lambda_range:
            params = dict(linearregressor__reg_lambda=curr_lambda,
                          bostonfeaturestransformer__degree=curr_degree)
            model.set_params(**params)
            mse = 0
            counter = 0
            for train_index, test_index in kf.split(X):
                counter = counter + 1
                model.fit(X[train_index], y[train_index])
                y_pred = model.predict(X[test_index])
                mse = mse + np.mean((y[test_index] - y_pred) ** 2)
            avg_mse = mse / counter
            print("avg_mse:", avg_mse, " lambda:", curr_lambda, " degree:", curr_degree)
            if avg_mse < min_mse:
                best_params = params
                min_mse = avg_mse
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    params_grid = sklearn.model_selection.ParameterGrid({
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    })
    best_params = None
    best_score = -np.inf  # R^2 is maximized; start from -inf so best_params is always set
    for param in params_grid:
        model.set_params(**param)
        avg_score = 0.0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            avg_score += r2_score(y_test, y_pred)
        avg_score /= k_folds
        if avg_score > best_score:
            best_score = avg_score
            best_params = param
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    # params = model.get_params()
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    params_grid = {
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    }
    min_acc = np.inf
    for params in list(sklearn.model_selection.ParameterGrid(params_grid)):
        model.set_params(**params)
        curr_acc = 0
        for train_idx, test_idx in kf.split(X):
            train_x, train_y = X[train_idx], y[train_idx]
            test_x, test_y = X[test_idx], y[test_idx]
            model.fit(train_x, train_y)
            y_pred = model.predict(test_x)
            curr_acc += mse_score(test_y, y_pred)
        mean = curr_acc / k_folds
        if mean < min_acc:
            min_acc = mean
            best_params = params
    # ========================
    return best_params
def fit_transform(self, X_train: pd.DataFrame, X_test: pd.DataFrame,
                  y_train: pd.Series, y_test: pd.Series,
                  model: BaseEstimator) -> List:
    """
    Parameters
    ----------
    X_train: Training data
    X_test: Test set
    y_train: Target for the training set
    y_test: Target for the test set
    model: Model compatible with the sklearn estimator API

    Return value
    ------------
    The selected set of features
    """
    if self.permutation_importance_df is None:
        self.__fe = FeatureImportance(X_train, X_test, y_train, y_test,
                                      model, self.metric_name)
        self.permutation_importance_df = self.__fe.get_n_permutation_importance(self.n)
    self.permutation_importance_df = self.permutation_importance_df.sort_values(
        'permutation_' + self.metric_name, ascending=False)

    selected_features = []
    for i, col in enumerate(self.permutation_importance_df['features']):
        selected_features.append(col)
        if self.verbose:
            print('Fitting model on {0} features'.format(i + 1))
        model.fit(X_train[selected_features], y_train)
        current_metric = self.metric(model, X_test[selected_features], y_test)
        if self.verbose:
            print(self.metric_name + ' = {0}'.format(current_metric))
        self.subsets_.append({
            'score_' + self.metric_name: current_metric,
            'feature_names': list(selected_features)
        })
        if (i > self.early_stopping_rounds
                and current_metric - self.subsets_[i - self.early_stopping_rounds]
                ['score_' + self.metric_name] < self.epsilon):
            break
    return selected_features[:-self.early_stopping_rounds]
def _fit_step(self, transformer: BaseEstimator, ids: Tuple, is_final: bool,
              X: pd.DataFrame, y: Iterable = None, **fit_params):
    # make transformer unique for each CV split
    transformer.train_ = tuple(X.index)
    transformer.features_ = tuple(X.columns)

    # load transformer from database
    transformer_loaded, ids_loaded = self._load(transformer, ids)
    is_loaded = False if transformer_loaded is None else True
    if is_loaded:
        transformer = transformer_loaded
        ids = ids_loaded

    # fit final step
    if is_final:
        if not is_loaded:
            transformer.fit(X, y, **fit_params)
    # fit intermediate steps
    else:
        if not is_loaded:
            transformer.fit(X, y, **fit_params)
        transformed_data = transformer.transform(X)
        if isinstance(transformed_data, tuple):
            X, y = transformed_data
        else:
            Xnp = transformed_data
            # reshape input data
            if Xnp.shape != X.shape:
                if isinstance(X, pd.DataFrame):
                    X = X.iloc[:, transformer.get_support()]
                else:
                    X = pd.DataFrame(Xnp)

    # save transformer
    if not is_loaded:
        ids = self._save(transformer, ids)

    return transformer, ids, X
def _train_model(estimator: BaseEstimator,
                 grid_search_context: Dict[str, Any]) -> Tuple[BaseEstimator, Dict[str, Any]]:
    X = grid_search_context['X_train']
    y = grid_search_context['y_train']
    fit_params = grid_search_context['fit_params']

    fit_start = time()
    estimator.fit(X, y, **fit_params)
    fit_end = time()

    results = _evaluate_model(estimator, X, y, grid_search_context, "training")
    results["training_time_total"] = fit_end - fit_start
    return estimator, results
def out_of_fold(self, estimator: BaseEstimator,
                train_x, train_y, valid_x, valid_y):
    # For LightGBM and CatBoost, pass the parameters below at fit time
    fit_params = {}
    if type(estimator).__name__ in ('LGBMClassifier', 'CatBoostClassifier',):
        if 'eval_set' not in fit_params:
            fit_params['eval_set'] = [(valid_x, valid_y)]
        if 'early_stopping_rounds' not in fit_params:
            fit_params['early_stopping_rounds'] = 100

    estimator.fit(train_x, train_y, **fit_params)
    oof = self.make_pred(estimator, valid_x)
    return oof
def train_fchl(rep_computer: FCHLRepresentation, model: BaseEstimator,
               mols: List[str], y: List[float], n_jobs: int = 1,
               y_lower: List[float] = None) -> BaseEstimator:
    """Retrain an FCHL-based model

    Args:
        rep_computer: Tool used to compute the FCHL-compatible representations
            for each molecule
        model: Model to be retrained
        mols: List of molecules (XYZ format) in training set
        y: List of other properties to predict
        n_jobs: Number of threads to use for generating representations
        y_lower: Lower-fidelity estimate of the property. Used for delta learning models
    Returns:
        Retrained model
    """
    # Convert the input molecules into FCHL-ready inputs
    rep_computer.n_jobs = n_jobs
    reps = rep_computer.transform(mols)

    # Retrain the model
    if y_lower is not None:
        y = np.subtract(y, y_lower)
    return model.fit(reps, y)
def finalize_model(
    model: BaseEstimator,
    X_train: CSVData,
    Y_train: CSVData,
    X_test: CSVData,
    test_ids: CSVData,
    output: str,
    smote_fn: SamplerFnType = None,
    outlier_detection: Any = None,
    header: Tuple[str, str] = ("id", "y"),
    label_indexing: int = 0,
    export_int: bool = False,
) -> None:
    """Train the model on the complete data and generate the submission file.

    Parameters
    ----------
    model: The model
    X_train: The training data
    Y_train: The training labels
    X_test: The test data
    test_ids: The IDs for the test data
    output: The path where to dump the output
    smote_fn: The function that takes labels and returns SMOTE
    outlier_detection: Optional outlier detector used to drop outliers from the training data
    header: The header for the submission CSV
    label_indexing: What to start indexing the label from
    export_int: Whether to export the CSV as integers
    """
    print("Training model...")
    if outlier_detection is not None:
        outliers = outlier_detection.fit_predict(X_train)
        X_train = X_train[outliers == 1]
        Y_train = Y_train[outliers == 1]
    if smote_fn:
        smote = smote_fn(Y_train)
        X_train, Y_train = smote.fit_resample(X_train, Y_train)
    model.fit(X_train, Y_train)
    print("Model trained")

    Y_pred = model.predict(X_test) + label_indexing
    submission: Any = np.stack([test_ids, Y_pred], 1)  # Add IDs
    create_submission_file(output, submission, header=header, export_int=export_int)
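# A hypothetical smote_fn for finalize_model (not from the source): it only illustrates
# the documented contract "takes labels and returns SMOTE", capping k_neighbors by the
# size of the rarest class so SMOTE does not fail on very small classes.
from imblearn.over_sampling import SMOTE

def example_smote_fn(labels: np.ndarray) -> SMOTE:
    smallest_class = int(np.bincount(labels.astype(int)).min())
    return SMOTE(k_neighbors=min(5, max(1, smallest_class - 1)), random_state=0)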
def instantiate_and_fit(
    index: pd.DataFrame,
    fold: pd.DataFrame,
    X: np.ndarray,
    y: pd.DataFrame,
    estimator: BaseEstimator,
    n_splits: int = 5,
    param_grid: Optional[Dict[str, Any]] = None,
) -> BaseEstimator:
    assert fold.shape[0] == index.shape[0]
    assert fold.shape[0] == X.shape[0]
    assert fold.shape[0] == y.shape[0]

    fold_vals = fold.ravel()
    train_inds = fold_vals == "train"
    val_inds = fold_vals == "val"
    if val_inds.sum():
        raise NotImplementedError("Explicit validation indices not yet supported.")

    y = y.values.ravel()
    nan_row, nan_col = np.nonzero(np.isnan(X) | np.isinf(X))
    if len(nan_row):
        logger.warning(
            f"Setting {len(nan_row)} NaN elements to zero before fitting {estimator}."
        )
        X[nan_row, nan_col] = 0

    logger.info(f"Fitting {estimator} on data (shape: {X.shape})")
    if param_grid is not None:
        group_k_fold = GroupKFold(n_splits=n_splits).split(
            X[train_inds], y[train_inds], index.trial.values[train_inds])
        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                                   verbose=10, cv=list(group_k_fold))
        grid_search.fit(X[train_inds], y[train_inds])
        return grid_search.best_estimator_

    estimator.fit(X[train_inds], y[train_inds])
    return estimator
def link_prediction_pipeline(graph: nx.Graph, embeddings: np.array, id2node: list,
                             node2id: list, classifier: BaseEstimator, **kwargs) -> dict:
    non_edges_train, non_edges_test, edges_train, edges_test = (
        kwargs["non_edges_train"], kwargs["non_edges_test"],
        kwargs["edges_train"], kwargs["edges_test"])

    X_train, X_test, Y_train, Y_test = link_pred_train_test_split(
        embeddings, node2id, **kwargs)

    # Classify
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    y_true = Y_test

    return {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro"),
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred)
    }
def fit_and_suppress_warnings(logger: PicklableClientLogger, pipeline: BaseEstimator,
                              X: Dict[str, Any], y: Any) -> BaseEstimator:
    @no_type_check
    def send_warnings_to_log(message, category, filename, lineno,
                             file=None, line=None) -> None:
        logger.debug('%s:%s: %s:%s', filename, lineno, category.__name__, message)
        return

    with warnings.catch_warnings():
        warnings.showwarning = send_warnings_to_log
        pipeline.fit(X, y)

    return pipeline
def node_classification_pipeline(graph: nx.Graph, embeddings: np.ndarray, id2node: list,
                                 node2id: list, classifier: BaseEstimator, **kwargs) -> dict:
    test_size = kwargs["test_size"]
    node_vectors = embeddings
    labels = np.array([graph.nodes[word]["community"] for word in id2node])

    node_vectors_train, node_vectors_test, labels_train, labels_test = train_test_split(
        node_vectors, labels, test_size=test_size)

    classifier.fit(node_vectors_train, labels_train)
    y_pred = classifier.predict(node_vectors_test)
    y_true = labels_test

    return {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro")
    }
def train_ss_ensemble(
    clf: BaseEstimator,
    params: SSEnsembleParams,
    X: np.ndarray,
    y: np.ndarray,
    lb_mask: np.ndarray,
):
    rng = np.random.RandomState(params.random_state)

    # We want to return out-of-bag predictions
    ulb_mask = ~lb_mask
    # TODO: not really necessary but we could set them all to zero from the outside as well
    y = y.copy()
    y[ulb_mask] = 0

    ulb_indices = ulb_mask.nonzero()[0]
    y_oob_sum = np.zeros(len(y))
    y_oob_hit = np.zeros(len(y))

    for i in range(params.n_estimators):
        bag_ulb_indices = rng.choice(ulb_indices, size=params.n_samples)
        bag_lb_indices = lb_mask.nonzero()[0]
        bag_indices = np.concatenate([bag_ulb_indices, bag_lb_indices])

        X_bag = X[bag_indices]
        y_bag = y[bag_indices]

        oob_mask = np.ones(len(y), dtype="bool")
        oob_mask[bag_indices] = False
        X_oob = X[oob_mask]

        clf = clone(clf)
        clf.fit(X_bag, y_bag)
        y_oob = clf.predict_proba(X_oob)

        y_oob_sum[oob_mask] += y_oob[:, 1]
        y_oob_hit[oob_mask] += 1

    return y_oob_sum / y_oob_hit
def test_get_metadata_helper(model: BaseEstimator, expect_empty_dict: bool):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """
    X, y = np.random.random((1000, 4)), np.random.random((1000,))
    model.fit(X, y)

    metadata = ModelBuilder._extract_metadata_from_model(model)

    # All the metadata we've implemented so far is 'history', so we'll check that
    if not expect_empty_dict:
        assert "history" in metadata
        assert all(name in metadata["history"]
                   for name in ("params", "loss", "accuracy"))
    else:
        assert dict() == metadata
def pca_fit(
    ds: Dataset,
    est: BaseEstimator,
    *,
    variable: str = "call_alternate_allele_count",
    check_missing: bool = True,
) -> BaseEstimator:
    """Fit PCA estimator"""
    AC = _allele_counts(ds, variable, check_missing=check_missing)
    return est.fit(da.asarray(AC).T)
def test_vector_alignment(self):
    # Mock out a generic scikit-learn classifier
    mocked_model = BaseEstimator()
    mocked_model.fit = MagicMock()
    mocked_model.predict = MagicMock(return_value=[True])

    # Create a simple data frame extending to January 15
    date_sequence = pd.date_range('1/1/2011', periods=15, freq='D')
    time_series = pd.DataFrame({
        # This column will be accessed by name to generate the targets vector.
        'Violent Crime Committed?': [True, True] + [False] * 13,
        # Actual time series used for nonsequential prediction will contain more than one column.
        # However, we just need to verify that it grabs the correct slices of each column,
        # so one stand-in column will suffice.
        'Other Data': [0] * 10 + [1] * 5
    }, index=date_sequence)

    # Construct a NonsequentialPredictor with the mock
    predictor = NonsequentialPredictor(time_series, model=mocked_model)

    # The date to predict comes before the end of the time series,
    # so all rows from the 13th on should be discarded
    date_to_predict = datetime.date(2011, 1, 13)

    # The mock always predicts True, so predict() should return True
    self.assertTrue(predictor.predict(date_to_predict))
    # And both fit and predict should have been called
    self.assertTrue(mocked_model.fit.called)
    self.assertTrue(mocked_model.predict.called)

    # When feeding training data to the sklearn model, predict() needs to align each day
    # of the time series with whether a violent crime was committed the NEXT day.
    # Thus, the first element of the 'Violent Crime Committed?' column should have been
    # removed before being used as the model's targets vector,
    # because it has no previous day to partner with.
    expected_targets = [True] + [False] * 11
    # Similarly, the last element of any other column (in this case, 'Other Data')
    # should only go up to the day before the day we're trying to predict
    expected_features = [[0]] * 10 + [[1]] * 2

    # Get the two arguments passed to mocked_model
    fit_args = mocked_model.fit.call_args
    observed_features = fit_args[0][0]
    observed_targets = fit_args[0][1]

    # Equality tests with numpy arrays are wonky, so convert numpy arrays to Python lists
    self.assertEqual(observed_targets.tolist(), expected_targets)
    self.assertEqual(observed_features.tolist(), expected_features)

    # Confirm the correct argument was passed to predict
    print(mocked_model.predict.call_args)
    observed_day_to_predict = mocked_model.predict.call_args[0][0]
    self.assertEqual(observed_day_to_predict.tolist(), [[1]])
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    model = sklearn.model_selection.GridSearchCV(
        estimator=model,
        param_grid={
            'linearregressor__reg_lambda': lambda_range,
            'bostonfeaturestransformer__degree': degree_range
        },
        scoring=sklearn.metrics.make_scorer(mse_score, greater_is_better=False),
        cv=k_folds)
    model.fit(X, y)
    best_params = model.best_params_
    # ========================
    return best_params
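# A minimal sketch (not from the source) of how the "step__param" keys used in the
# cv_best_hyperparams variants above map onto an sklearn Pipeline: the step names
# "poly" and "ridge" below are illustrative assumptions; get_params()/set_params()
# expose them as "poly__degree" and "ridge__alpha", which is exactly the naming
# convention GridSearchCV and set_params rely on.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([("poly", PolynomialFeatures()), ("ridge", Ridge())])
search = GridSearchCV(
    pipe,
    param_grid={"poly__degree": [1, 2, 3], "ridge__alpha": [0.1, 1.0, 10.0]},
    scoring="neg_mean_squared_error",
    cv=3,
)
X_demo, y_demo = np.random.random((60, 3)), np.random.random(60)
search.fit(X_demo, y_demo)
print(search.best_params_)  # keys follow the "<step>__<param>" naming convention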
def _fit_and_suppress_warnings(logger: Union[logging.Logger, PicklableClientLogger],
                               model: BaseEstimator,
                               X: np.ndarray,
                               y: np.ndarray) -> BaseEstimator:
    def send_warnings_to_log(
        message: Union[Warning, str],
        category: Type[Warning],
        filename: str,
        lineno: int,
        file: Optional[TextIO] = None,
        line: Optional[str] = None,
    ) -> None:
        logger.debug('%s:%s: %s:%s' % (filename, lineno, str(category), message))
        return

    with warnings.catch_warnings():
        warnings.showwarning = send_warnings_to_log
        model.fit(X, y)

    return model
def _lookahead(points: np.ndarray, model: BaseEstimator, train_ixs: List[int],
               obs_labels: List[float], x: np.ndarray, label: float):
    """
    Does a lookahead at what the model would be if (x, label) were added to the known set.

    If the model implements the partial_fit API from sklearn, then that will be used.
    Otherwise, the model is retrained from scratch.

    Args:
        points (ndarray): Pool of data points indexed by train_ixs
        model (BaseEstimator): sklearn model to be retrained
        train_ixs (ndarray): Indices of currently-labeled set
        obs_labels (ndarray): Labels for each labeled entry
        x (ndarray): Data point to simulate being labeled
        label (float): Simulated label
    """
    # If partial-fit available, use it
    if hasattr(model, "partial_fit"):
        return model.partial_fit([x], [label], [0, 1])

    # Update the training set
    X_train = np.concatenate([points[train_ixs], [x]])
    obs_labels = np.concatenate([obs_labels, [label]])

    # Refit the model
    return model.fit(X_train, obs_labels)
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = {}
    best_mse = None
    best_degree = None
    best_lambda = None
    for degree in degree_range:
        for lambda_r in lambda_range:
            mse = 0
            cnt = 0
            model.set_params(bostonfeaturestransformer__degree=degree,
                             linearregressor__reg_lambda=lambda_r)
            # model = sklearn.pipeline.make_pipeline(
            #     BiasTrickTransformer(),
            #     BostonFeaturesTransformer(degree),
            #     LinearRegressor(lambda_r)
            # )
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model.fit(X_train, y_train)
                y_test_pred = model.predict(X_test)
                mse += np.sum((y_test - y_test_pred) ** 2)
                cnt += y_test.shape[0]
            mse /= cnt
            if best_mse is None or best_mse > mse:
                best_mse = mse
                best_degree = degree
                best_lambda = lambda_r
    best_params['bostonfeaturestransformer__degree'] = best_degree
    best_params['linearregressor__reg_lambda'] = best_lambda
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built-in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    best_params = None
    # ====== YOUR CODE: ======
    best_accr = np.inf
    # Splitting the data k-fold
    k_folder = sklearn.model_selection.KFold(k_folds)
    # Iterating over all parameters
    for degree_param in degree_range:
        for lambda_param in lambda_range:
            # Defining current params and setting the model
            params = {
                'bostonfeaturestransformer__degree': degree_param,
                'linearregressor__reg_lambda': lambda_param
            }
            model.set_params(**params)
            avg_accur = 0
            # Checking params on all k folds
            for train_indices, val_indices in k_folder.split(X):
                train_X, train_y = X[train_indices], y[train_indices]
                val_X, val_y = X[val_indices], y[val_indices]
                # Training model on training set
                model.fit(train_X, train_y)
                # Evaluate accuracy on validation set
                y_pred = model.predict(val_X)
                mse = np.mean((val_y - y_pred) ** 2)
                avg_accur += mse
            # Calculating avg over all k folds
            avg_accur = avg_accur / k_folds
            # Updating best params
            if avg_accur < best_accr:
                best_accr = avg_accur
                best_params = params
    # ========================
    return best_params
def plot_feature_importance(
    estimator: BaseEstimator,
    X_train: pd.DataFrame,
    y_train: Optional[pd.DataFrame] = None,
    top_n: int = 10,
    figsize: Tuple[int, int] = (8, 8),
    plot_error_bars: bool = True,
    print_table: bool = True,
) -> Tuple[plt.Figure, pd.DataFrame]:
    """Plot feature importances of a tree-based sklearn estimator.

    Args:
        estimator (BaseEstimator): sklearn-based estimator
        X_train (pd.DataFrame): training set features
        y_train (Optional[pd.DataFrame], optional): training set target values. Defaults to None.
        top_n (int, optional): top n feature importances to plot. Defaults to 10.
        figsize (Tuple[int, int], optional): Defaults to (8, 8).
        plot_error_bars (bool, optional): whether to plot error bars (std). Defaults to True.
        print_table (bool, optional): whether to print the table after the plot. Defaults to True.

    Raises:
        AttributeError: when feature_importances_ does not exist for the estimator

    Returns:
        plt.Figure: feature importances plot
        pd.DataFrame: df with feature name, importance, std based on trees
    """
    if not hasattr(estimator, "feature_importances_"):
        estimator.fit(X_train.values, y_train.values.ravel())
        if not hasattr(estimator, "feature_importances_"):
            raise AttributeError(
                f"{estimator.__class__.__name__} does not have feature_importances_ attribute"
            )

    feat_imp = pd.DataFrame({
        "feature": X_train.columns,
        "importance": estimator.feature_importances_
    })
    try:
        feat_imp["std"] = np.std(
            [tree.feature_importances_ for tree in estimator.estimators_], axis=0)
    except AttributeError:
        if plot_error_bars:
            logger.warning(
                f"cannot plot error bars for this estimator: {estimator.__class__.__name__}"
            )
            plot_error_bars = False

    feat_imp = feat_imp.sort_values(by="importance", ascending=False).iloc[:top_n]
    feat_imp = feat_imp.set_index("feature", drop=True)
    feat_imp = feat_imp.sort_values(by="importance", ascending=True)

    plot_kwargs = dict(
        title=f"Features Importances for {estimator.__class__.__name__}",
        figsize=figsize)
    if plot_error_bars is True:
        plot_kwargs["xerr"] = "std"
    fig = feat_imp.plot.barh(**plot_kwargs)
    plt.xlabel("Feature Importance")

    if print_table is True:
        from IPython.display import display
        msg = f" Top {top_n} features in descending order of importance "
        print(f"\n{msg:-^100}\n")
        display(feat_imp.sort_values(by="importance", ascending=False))

    return fig, feat_imp