def cross_validate(estimator: BaseEstimator, X: pd.DataFrame, y: pd.DataFrame,
                   num_splits: int, save_name: str) -> None:
    """
    Function to perform cross validation and call error_profile at the end to
    generate an error report for a sklearn model.
    :param estimator: SkLearn classification model
    :param X: dataframe containing data
    :param y: dataframe containing class labels corresponding to X
    :param num_splits: number of folds for k-fold cross validation
    :param save_name: save name for error profile plots (file extension will be appended)
    :return: None
    """
    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=0)
    predictions = {"test": [], "train": []}
    y_true = {"test": [], "train": []}
    for train_index, test_index in splitter.split(X, y):
        estimator.fit(X.iloc[train_index, :], y.iloc[train_index, 0])
        test_pred = estimator.predict(X.iloc[test_index, :])
        train_pred = estimator.predict(X.iloc[train_index, :])
        predictions["train"].append(train_pred)
        predictions["test"].append(test_pred)
        y_true["train"].append(np.array(y.iloc[train_index])[:, 0])
        y_true["test"].append(np.array(y.iloc[test_index])[:, 0])
    error_profile(y_true, predictions, model_type=save_name)
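# Hedged usage sketch (not from the original codebase): cross_validate above relies on the
# module-level pandas/numpy/StratifiedKFold imports and an error_profile() helper defined
# elsewhere; the dataset, classifier, and save name below are illustrative assumptions.
def _example_cross_validate():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    iris = load_iris(as_frame=True)
    X = iris.data                # feature DataFrame
    y = iris.target.to_frame()   # labels as a single-column DataFrame, as expected above
    cross_validate(LogisticRegression(max_iter=1000), X, y, num_splits=5, save_name="logreg")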
def loop_snippet(clf: BaseEstimator, repeat: int, x, y, xt):
    """Time `repeat` rounds of fitting `clf` on (x, y) and predicting on xt."""
    time_table = []
    for i in range(repeat):
        start = time.perf_counter()
        clf.fit(x, y)
        clf.predict(xt)
        time_table.append(time.perf_counter() - start)
    return time_table
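# Illustrative timing sketch for loop_snippet (all names below are assumptions): timings vary
# with the estimator and data sizes, so treat the output as a rough, machine-dependent benchmark.
def _example_loop_snippet():
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.default_rng(0)
    x = rng.normal(size=(500, 10))
    y = rng.integers(0, 2, size=500)
    xt = rng.normal(size=(100, 10))

    times = loop_snippet(DecisionTreeClassifier(random_state=0), repeat=5, x=x, y=y, xt=xt)
    print("fit+predict seconds per round:", [round(t, 4) for t in times])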
def _model_predict(self, model: BaseEstimator, data: pd.DataFrame) -> np.ndarray:
    if self._task._task_type == BINARY_CLASSIFICATION:
        predictions = model.predict_proba(data)
    elif self._task._task_type == MULTI_CLASS_CLASSIFICATION:
        predictions = model.predict(data)
    elif self._task._task_type == REGRESSION:
        predictions = model.predict(data)
    else:
        # Guard against returning an undefined name for unknown task types.
        raise ValueError(f"Unsupported task type: {self._task._task_type}")
    return predictions
def evaluate(self, model: BaseEstimator, X, y, X_test, y_test):
    metrics_logger = MetricsLogger(
        classes=audioset.ontology.MUSIC_GENRE_CLASSES,
        classmetrics_filepath=self.classmetrics_filepath,
        show_top_classes=25,
        class_sort_key='ap'
    )

    logging.info('---- Train stats ----')
    predictions = model.predict(X)
    metrics_logger.log(predictions, y)

    logging.info('---- Test stats ----')
    predictions = model.predict(X_test)
    metrics_logger.log(predictions, y_test, show_classes=True)
def build_submission(model_sj: BaseEstimator, model_iq: BaseEstimator,
                     test_features_sj: pd.DataFrame, test_features_iq: pd.DataFrame,
                     raw_path: str, pred_path: str, name: str) -> pd.DataFrame:
    submission = pd.read_csv(os.path.join(raw_path, 'submission_format.csv'))
    y_pred_sj = model_sj.predict(test_features_sj)
    y_pred_iq = model_iq.predict(test_features_iq)
    y_pred = np.concatenate((y_pred_sj, y_pred_iq))
    submission['total_cases'] = np.round(y_pred).astype(int)
    submission.to_csv(os.path.join(pred_path, name + '.csv'), index=False)
    return submission
def summarize_feature_comparisons(
    base_clf: BaseEstimator, comparison_clfs: Dict[str, BaseEstimator], X_test, y_test
):
    from mlxtend.evaluate import mcnemar, cochrans_q, mcnemar_table

    summary_dict = collections.OrderedDict()
    mcnemar_tbs = dict()

    # create list of predicted values
    base_y_predict = base_clf.predict(X_test)
    y_predictions = [base_y_predict]
    for name, clf in comparison_clfs.items():
        y_predict = clf.predict(X_test)

        # form mcnemar tables against base classifier
        # (mcnemar_table returns a numpy array, so store it directly)
        tb = mcnemar_table(y_test, base_y_predict, y_predict)
        mcnemar_tbs[f"base vs {name}"] = tb

        # store predictions per classifier
        y_predictions.append(y_predict)

    # first run cochrans Q test
    qstat, pval = cochrans_q(y_test, *y_predictions)
    summary_dict["cochrans_q"] = qstat
    summary_dict["cochrans_q_pval"] = pval

    # run mcnemars test against all the predictions
    for name, table in mcnemar_tbs.items():
        chi2stat, pval = mcnemar(table, exact=True)
        summary_dict[f"mcnemar_{name}_chi2stat"] = chi2stat
        summary_dict[f"mcnemar_{name}_pval"] = pval

    return summary_dict
def standard_report(
    estimator: BaseEstimator,
    X_test: Union[pd.DataFrame, np.ndarray],
    y_test: Union[pd.Series, np.ndarray],
    zero_division: str = "warn",
) -> None:
    """Display standard report of diagnostic metrics and plots for classification.

    Parameters
    ----------
    estimator : BaseEstimator
        Fitted classification estimator for evaluation.
    X_test : DataFrame or ndarray of shape (n_samples, n_features)
        Predictor test set.
    y_test : Series or ndarray of shape (n_samples,)
        Target test set.
    zero_division : str, optional
        Value to return for division by zero: 0, 1, or 'warn'.
    """
    table = classification_report(y_test, estimator.predict(X_test),
                                  zero_division=zero_division, heatmap=True)
    classification_plots(estimator, X_test, y_test)
    display(table)
def evaluate_fchl(rep_computer: FCHLRepresentation, model: BaseEstimator, mols: List[str],
                  n_jobs: int = 1, y_lower: List[float] = None) -> np.ndarray:
    """Run an FCHL-based model

    Args:
        rep_computer: Tool used to compute the FCHL-compatible representations for each molecule
        model: Model to be evaluated
        mols: List of molecules (XYZ format) to evaluate
        n_jobs: Number of threads to use for generating representations
        y_lower: Lower-fidelity estimate of the property. Used for delta learning models
    Returns:
        Results from the inference
    """
    # Convert the input molecules into FCHL-ready inputs
    rep_computer.n_jobs = n_jobs
    reps = rep_computer.transform(mols)

    # Run the model; keep the result as an ndarray to match the annotated return type
    y_pred = np.asarray(model.predict(reps))
    if y_lower is not None:
        # Delta-learning models predict a correction on top of the lower-fidelity estimate
        y_pred = np.add(y_pred, y_lower)
    return y_pred
def generate(model: base.BaseEstimator, sentences: List[List[str]]) -> None:
    """Tag the sentences with the given model.

    Parameters
    ----------
    model : BaseEstimator
        Trained tagging model used to predict a tag for each token.
    sentences : list
        List of lists of strings representing the sentences to tag.
    """
    print(f"Tagging {len(sentences)} sentences.")

    # Since the models were trained on the lemmatized version of the words,
    # we also lemmatize them when tagging unlabeled sentences.
    lemmatizer = stem.WordNetLemmatizer()
    for sentence in sentences:
        # Convert to the lemmatized versions
        lemmatized = [lemmatizer.lemmatize(w.lower()) for w in sentence]

        # Convert to conllu.TokenList because models expect that.
        # Since they are essentially dicts, we build them that way.
        tags = model.predict([[{"lemma": w} for w in lemmatized]])

        print("Word\tTag")
        for w, t in zip(sentence, tags[0]):
            print(f"{w}\t{t}")
        print()
def max_std_sampling(regressor: BaseEstimator, X: modALinput,
                     n_instances: int = 1, random_tie_break=False,
                     **predict_kwargs) -> np.ndarray:
    """
    Regressor standard deviation sampling strategy.

    Args:
        regressor: The regressor for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.
        **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the
            CommitteeRegressor.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    _, std = regressor.predict(X, return_std=True, **predict_kwargs)
    std = std.reshape(X.shape[0], )

    if not random_tie_break:
        return multi_argmax(std, n_instances=n_instances)

    return shuffled_argmax(std, n_instances=n_instances)
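# Hedged sketch (assumptions noted): max_std_sampling needs a regressor whose predict()
# accepts return_std=True, e.g. sklearn's GaussianProcessRegressor; multi_argmax and
# shuffled_argmax are assumed to come from modAL's utilities, as in the function above.
def _example_max_std_sampling():
    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor

    rng = np.random.default_rng(0)
    X_train = rng.uniform(0.0, 10.0, size=(20, 1))
    y_train = np.sin(X_train).ravel()
    X_pool = rng.uniform(0.0, 10.0, size=(200, 1))

    gpr = GaussianProcessRegressor(random_state=0).fit(X_train, y_train)
    query = max_std_sampling(gpr, X_pool, n_instances=5)
    print("query result (highest-uncertainty pool instances):", query)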
def evaluate(df: pd.DataFrame, target_column: Text, clf: BaseEstimator) -> Dict:
    """Evaluate classifier on a dataset
    Args:
        df {pandas.DataFrame}: dataset
        target_column {Text}: target column name
        clf {sklearn.base.BaseEstimator}: classifier (trained model)
    Returns:
        Dict: Dict of reported metrics
            'f1' - F1 score
            'cm' - Confusion Matrix
            'actual' - true values for test data
            'predicted' - predicted values for test data
    """
    # Get X and Y
    y_test = df.loc[:, target_column].values.astype('int32')
    X_test = df.drop(target_column, axis=1).values.astype('float32')

    prediction = clf.predict(X_test)
    f1 = f1_score(y_true=y_test, y_pred=prediction, average='macro')
    # confusion_matrix expects (y_true, y_pred) in that order
    cm = confusion_matrix(y_test, prediction)

    return {'f1': f1, 'cm': cm, 'actual': y_test, 'predicted': prediction}
def run_inference(
        self,
        batch: Sequence[numpy.ndarray],
        model: BaseEstimator,
        **kwargs) -> Iterable[PredictionResult]:
    # vectorize data for better performance
    vectorized_batch = numpy.stack(batch, axis=0)
    predictions = model.predict(vectorized_batch)
    return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
def _predict_regression(self, X: np.ndarray, model: BaseEstimator, task_type: int,
                        Y_train: Optional[np.ndarray] = None) -> np.ndarray:
    def send_warnings_to_log(
        message: Union[Warning, str],
        category: Type[Warning],
        filename: str,
        lineno: int,
        file: Optional[TextIO] = None,
        line: Optional[str] = None,
    ) -> None:
        self.logger.debug('%s:%s: %s:%s' % (filename, lineno, str(category), message))
        return

    with warnings.catch_warnings():
        warnings.showwarning = send_warnings_to_log
        Y_pred = model.predict(X)

    if len(Y_pred.shape) == 1:
        Y_pred = Y_pred.reshape((-1, 1))

    return Y_pred
def get_preds_probas(
    est: BaseEstimator, X_test: DataFrame, y_test: Series, mapper_dict: Dict
) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Compare .predict_proba() and manually extracted prediction probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()

    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
def decision_boundary(self, x: np.ndarray, y: np.ndarray, model: BaseEstimator):
    x0 = x[:, 0]
    x1 = x[:, 1]
    x_min, x_max = x0.min() - 1, x0.max() + 1
    y_min, y_max = x1.min() - 1, x1.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

    z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    # np.str was removed in NumPy 1.24; the builtin str does the same job here
    z = z.astype(str)
    y = [str(label) for label in y]

    fig = px.scatter(x=x0, y=x1, color=y)
    contour = go.Contour(z=z,
                         x=np.arange(x_min, x_max, 0.1),
                         y=np.arange(y_min, y_max, 0.1),
                         line_width=0,
                         colorscale=[[0, '#ff9900'], [1, '#6666ff']],
                         opacity=0.4,
                         showscale=False)
    fig.add_trace(contour)
    fig.update_layout(title='Decision boundary', legend_title='Label')
    pyo.iplot(fig)
def evaluate_model(self, model: BaseEstimator, xtest: np.ndarray, ytest: np.ndarray) -> ModelStats:
    """Get the accuracy, recall, precision of this model"""
    ypreds = model.predict(xtest)
    # sklearn metrics expect (y_true, y_pred); precision and recall are not symmetric
    return ModelStats(accuracy=accuracy_score(ytest, ypreds),
                      precision=precision_score(ytest, ypreds),
                      recall=recall_score(ytest, ypreds))
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    smallest_loss = np.inf
    best_params = {"bostonfeaturestransformer__degree": 1,
                   "linearregressor__reg_lambda": 0.2}
    for lam in lambda_range:
        for deg in degree_range:
            model.set_params(linearregressor__reg_lambda=lam,
                             bostonfeaturestransformer__degree=deg)
            avg_mse = 0.0
            for train_i, test_i in kf.split(X):
                x_train = X[train_i]
                y_train = y[train_i]
                model.fit(x_train, y_train)
                y_pred = model.predict(X[test_i])
                avg_mse += np.square(y[test_i] - y_pred).sum() / (2 * X.shape[0])
            avg_mse /= k_folds
            # Check if the current params are the best so far
            if avg_mse <= smallest_loss:
                smallest_loss = avg_mse
                best_params = {"linearregressor__reg_lambda": lam,
                               "bostonfeaturestransformer__degree": deg}
    # ========================
    return best_params
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    params = {
        'linearregressor__reg_lambda': lambda_range,
        'bostonfeaturestransformer__degree': degree_range
    }
    kf = KFold(n_splits=k_folds)
    best_params = ParameterGrid(params)[0]
    best_mse = np.inf
    best_r_2 = 0.0
    for p_dict in ParameterGrid(params):
        cur_acc = 0.0
        curr_r_2 = 0.0
        model.set_params(**p_dict)
        for train_index, test_index in kf.split(X):
            model.fit(X[train_index], y=y[train_index])
            mse, rsq = evaluate_accuracy(y[test_index], model.predict(X[test_index]))
            cur_acc += mse
            curr_r_2 += rsq
        cur_acc /= k_folds
        curr_r_2 /= k_folds
        if curr_r_2 > best_r_2:
            best_r_2 = curr_r_2
            best_params = p_dict
    # ========================
    return best_params
def score_model(estimator: BaseEstimator, X: np.ndarray, y: np.ndarray) -> tuple:
    """
    Runs a cross_val_score with cv = 5 on arrays X, y with a neg mean squared error score.
    Performs the RMSE conversion and prints out scores.

    Args:
        estimator (BaseEstimator): Trained sklearn estimator object (Regressor)
        X (np.ndarray): Feature array
        y (np.ndarray): Target array

    Returns:
        no_val_rmse: [np.float64] RMSE score based on the training data
        no_val_r2: [np.float64] R^2 score based on the training data
        val_rmse_scores: [np.ndarray] Series of RMSE scores from cross validation
        cv_mean: [np.float64] Mean of all cross-validated RMSE scores
        cv_std: [np.float64] StDev of all cross-validated RMSE scores
        cv_cov: [np.float64] CoV of all cross-validated RMSE scores (CoV = StDev / Mean)
    """
    val_scores = cross_val_score(estimator, X, y, cv=5, scoring="neg_mean_squared_error")
    val_scores = val_scores * -1
    val_rmse_scores = np.sqrt(val_scores)

    no_val_mse = mean_squared_error(y, estimator.predict(X))
    no_val_rmse = np.sqrt(no_val_mse)
    no_val_r2 = r2_score(y, estimator.predict(X))

    cv_mean = np.mean(val_rmse_scores)
    cv_std = np.std(val_rmse_scores)
    cv_cov = cv_std / cv_mean

    print("Non-validation Scores")
    print("-----------")
    print(f"RMSE (No Val): {np.round(no_val_rmse, 3)}")
    print(f"R^2 (No Val): {np.round(no_val_r2, 3)}")
    print()
    print("Validation Scores")
    print("-----------")
    print(f"RMSE's: {np.round(val_rmse_scores, 3)}")
    print(f"Mean: {np.round(cv_mean, 3)}")
    print(f"StDev: {np.round(cv_std, 3)}")
    print(f"CoV: {np.round(cv_cov, 3)}")

    return no_val_rmse, no_val_r2, val_rmse_scores, cv_mean, cv_std, cv_cov
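# Hedged usage sketch for score_model above: the estimator must already be fitted on (X, y),
# since the function also scores the training data directly; the data and model are made up here,
# and mean_squared_error / r2_score / cross_val_score are assumed imported at module level.
def _example_score_model():
    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = X @ np.arange(1.0, 6.0) + rng.normal(scale=0.1, size=200)

    est = Ridge().fit(X, y)
    no_val_rmse, no_val_r2, *_ = score_model(est, X, y)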
def predict(model: BaseEstimator, sample: list) -> int:
    """Make a prediction.

    Returns:
        A -1 indicating an outlier or 1 indicating a normal value.
    """
    # Wrap the single sample in a list because predict() expects a 2d input
    result = model.predict([sample])
    return int(result[0])
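# Illustrative sketch (assumed model and data): predict() above expects an estimator that
# follows the -1 (outlier) / 1 (inlier) convention, such as sklearn's IsolationForest.
def _example_outlier_predict():
    import numpy as np
    from sklearn.ensemble import IsolationForest

    rng = np.random.default_rng(0)
    X_train = rng.normal(size=(200, 3))
    clf = IsolationForest(random_state=0).fit(X_train)

    print(predict(clf, [10.0, 10.0, 10.0]))  # far from the training cloud: likely -1
    print(predict(clf, [0.0, 0.0, 0.0]))     # near the training cloud: likely 1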
def test_vector_alignment(self):
    # Mock out a generic scikit-learn classifier
    mocked_model = BaseEstimator()
    mocked_model.fit = MagicMock()
    mocked_model.predict = MagicMock(return_value=[True])

    # Create a simple data frame extending to January 15
    date_sequence = pd.date_range('1/1/2011', periods=15, freq='D')
    time_series = pd.DataFrame({
        # This column will be accessed by name to generate the targets vector.
        'Violent Crime Committed?': [True, True] + [False]*13,
        # Actual time series used for nonsequential prediction will contain more than one column.
        # However, we just need to verify that it grabs the correct slices of each column,
        # so one stand-in column will suffice.
        'Other Data': [0]*10 + [1]*5
    }, index=date_sequence)

    # Construct a NonsequentialPredictor with the mock
    predictor = NonsequentialPredictor(time_series, model=mocked_model)

    # The date to predict comes before the end of the time series,
    # so all rows from the 13th on should be discarded
    date_to_predict = datetime.date(2011, 1, 13)

    # The mock always predicts True, so predict() should return True
    self.assertTrue(predictor.predict(date_to_predict))
    # And both fit and predict should have been called
    self.assertTrue(mocked_model.fit.called)
    self.assertTrue(mocked_model.predict.called)

    # When feeding training data to the sklearn model, predict() needs to align each day
    # of the time series with whether a violent crime was committed the NEXT day.
    # Thus, the first element of the 'Violent Crime Committed?' column should have been removed
    # before being used as the model's targets vector because it has no previous day to partner with.
    expected_targets = [True] + [False]*11
    # Similarly, any other column (in this case, 'Other Data') should only go up to
    # the day before the day we're trying to predict
    expected_features = [[0]]*10 + [[1]]*2

    # Get the two arguments passed to mocked_model.fit
    fit_args = mocked_model.fit.call_args
    observed_features = fit_args[0][0]
    observed_targets = fit_args[0][1]

    # Equality tests with numpy arrays are wonky, so convert numpy arrays to Python lists
    self.assertEqual(observed_targets.tolist(), expected_targets)
    self.assertEqual(observed_features.tolist(), expected_features)

    # Confirm the correct argument was passed to predict
    observed_day_to_predict = mocked_model.predict.call_args[0][0]
    self.assertEqual(observed_day_to_predict.tolist(), [[1]])
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    #
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    best_params = None
    min_mse = np.inf
    for curr_degree in degree_range:
        for curr_lambda in lambda_range:
            params = dict(linearregressor__reg_lambda=curr_lambda,
                          bostonfeaturestransformer__degree=curr_degree)
            model.set_params(**params)
            mse = 0
            counter = 0
            for train_index, test_index in kf.split(X):
                counter = counter + 1
                model.fit(X[train_index], y[train_index])
                y_pred = model.predict(X[test_index])
                mse = mse + np.mean((y[test_index] - y_pred)**2)
            avg_mse = mse / counter
            print("avg_mse:", avg_mse, " lambda:", curr_lambda, " degree:", curr_degree)
            if avg_mse < min_mse:
                best_params = params
                min_mse = avg_mse
    # ========================
    return best_params
def score_data(data: pd.DataFrame, model: BaseEstimator) -> pd.DataFrame:
    """Score data using model."""
    feature_columns = [
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)'
    ]
    label_to_classes_map = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

    X = data[feature_columns].values
    data['predicted_labels'] = model.predict(X)
    data['predicted_class'] = (
        data['predicted_labels'].apply(lambda e: label_to_classes_map[e]))
    return data
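# Hedged sketch for score_data above: the column names match sklearn's iris frame, so a
# classifier fitted on those four features is assumed; everything below is illustrative.
def _example_score_data():
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    iris = load_iris(as_frame=True)
    clf = DecisionTreeClassifier(random_state=0).fit(iris.data.values, iris.target)

    frame = iris.frame.drop(columns="target")
    scored = score_data(frame, clf)
    print(scored[["predicted_labels", "predicted_class"]].head())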
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(k_folds)
    params_grid = sklearn.model_selection.ParameterGrid({
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    })
    # R^2 can be negative, so start from -inf (a 0 start could leave best_params unset)
    best_score = -np.inf
    best_params = None
    for param in params_grid:
        model.set_params(**param)
        avg_score = 0.0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            avg_score += r2_score(y_test, y_pred)
        avg_score /= k_folds
        if avg_score > best_score:
            best_score = avg_score
            best_params = param
    # ========================
    return best_params
def compute_score(model: BaseEstimator, designs: List[str]) -> List[float]:
    """Assign a score to a series of designs given a machine learning model

    Args:
        model (BaseEstimator): Scikit-learn model
        designs ([str]): List of strings describing the designs
    """
    # Run inference
    y_pred = model.predict(designs)

    # Return results
    return y_pred.tolist()
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds,
                        degree_range, lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """
    # TODO: Do K-fold cross validation to find the best hyperparameters
    # Notes:
    # - You can implement it yourself or use the built in sklearn utilities
    #   (recommended). See the docs for the sklearn.model_selection package
    #   http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    # - If your model has more hyperparameters (not just lambda and degree)
    #   you should add them to the search.
    # - Use get_params() on your model to see what hyperparameters it has
    #   and their names. The parameters dict you return should use the same
    #   names as keys.
    # - You can use MSE or R^2 as a score.
    # ====== YOUR CODE: ======
    kf = sklearn.model_selection.KFold(n_splits=k_folds)
    params_grid = {
        'bostonfeaturestransformer__degree': degree_range,
        'linearregressor__reg_lambda': lambda_range
    }
    best_params = None
    min_mse = np.inf
    for params in sklearn.model_selection.ParameterGrid(params_grid):
        model.set_params(**params)
        total_mse = 0
        for train_idx, test_idx in kf.split(X):
            train_x, train_y = X[train_idx], y[train_idx]
            test_x, test_y = X[test_idx], y[test_idx]
            model.fit(train_x, train_y)
            y_pred = model.predict(test_x)
            total_mse += mse_score(test_y, y_pred)
        mean_mse = total_mse / k_folds
        if mean_mse < min_mse:
            min_mse = mean_mse
            best_params = params
    # ========================
    return best_params
def fit(self, X, original_y):
    # Start the ensemble with a dummy estimator that always predicts zero
    base_est = BaseEstimator()
    base_est.predict = lambda X: np.zeros(X.shape[0], dtype=float)
    self.estimators_ = [base_est]

    for i in range(self.n_estimators):
        # Fit each new estimator to the loss gradient at the current ensemble prediction
        grad = self.loss_grad(original_y, self._predict(X))
        estimator = deepcopy(self.base_regressor)
        estimator.fit(X, grad)
        self.estimators_.append(estimator)

    self.out_ = self._outliers(grad)
    self.feature_importances_ = self._calc_feature_imps()
    return self
def finalize_model(
    model: BaseEstimator,
    X_train: CSVData,
    Y_train: CSVData,
    X_test: CSVData,
    test_ids: CSVData,
    output: str,
    smote_fn: SamplerFnType = None,
    outlier_detection: Any = None,
    header: Tuple[str, str] = ("id", "y"),
    label_indexing: int = 0,
    export_int: bool = False,
) -> None:
    """Train the model on the complete data and generate the submission file.

    Parameters
    ----------
    model: The model
    X_train: The training data
    Y_train: The training labels
    X_test: The test data
    test_ids: The IDs for the test data
    output: The path where to dump the output
    smote_fn: The function that takes labels and returns SMOTE
    outlier_detection: Optional outlier detector used to filter the training data
    header: The header for the submission CSV
    label_indexing: What to start indexing the label from
    export_int: Whether to export the CSV as integers
    """
    print("Training model...")

    if outlier_detection is not None:
        outliers = outlier_detection.fit_predict(X_train)
        X_train = X_train[outliers == 1]
        Y_train = Y_train[outliers == 1]

    if smote_fn:
        smote = smote_fn(Y_train)
        X_train, Y_train = smote.fit_resample(X_train, Y_train)

    model.fit(X_train, Y_train)
    print("Model trained")

    Y_pred = model.predict(X_test) + label_indexing
    submission: Any = np.stack([test_ids, Y_pred], 1)  # Add IDs
    create_submission_file(output, submission, header=header, export_int=export_int)
def log_performance(
    X_test: np.ndarray,
    y_test_binarized: np.ndarray,
    model: BaseEstimator,
    binarizer: MultiLabelBinarizer,
    logger: Logger,
) -> None:
    """Logs performance of the model to the log file"""
    y_test_pred_binarized = model.predict(X_test)
    logger.info("-" * 80)
    logger.info("**EVALUATION\nClassification Report \n**")
    logger.info(
        classification_report(y_test_binarized,
                              y_test_pred_binarized,
                              target_names=binarizer.classes_,
                              zero_division=1))
    logger.info("\nAccuracy Score: {}".format(
        accuracy_score(y_test_binarized, y_test_pred_binarized)))
def link_prediction_pipeline(graph: nx.Graph, embeddings: np.ndarray, id2node: list,
                             node2id: list, classifier: BaseEstimator, **kwargs) -> dict:
    non_edges_train, non_edges_test, edges_train, edges_test = \
        kwargs["non_edges_train"], kwargs["non_edges_test"], \
        kwargs["edges_train"], kwargs["edges_test"]

    X_train, X_test, Y_train, Y_test = link_pred_train_test_split(
        embeddings, node2id, **kwargs)

    # Classify
    classifier.fit(X_train, Y_train)
    y_pred = classifier.predict(X_test)
    y_true = Y_test

    return {
        "micro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="micro"),
        "macro_f1": f1_score(y_true=y_true, y_pred=y_pred, average="macro"),
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred)
    }
def eval(model: base.BaseEstimator, test_data: List[conllu.TokenList]) -> None:
    """Evaluate a model using the provided dataset.

    Parameters
    ----------
    model : BaseEstimator
        Trained tagging model to evaluate.
    test_data : list
        List of sentences represented as `conllu.TokenList`.
    """
    print(f"Evaluating with {len(test_data)} sentences.")

    y_test = feature_extraction.extract_tags(test_data)
    y_pred = model.predict(test_data)

    accuracy = metrics.accuracy(y_test, y_pred)
    amb_accuracy = metrics.ambiguous_accuracy(test_data, y_test, y_pred)

    print("Model accuracy:", accuracy)
    print("Model ambiguous words accuracy:", amb_accuracy)