def _benchmark_from_data(
    experiment: Experiment,
    *,
    estimator: BaseEstimator,
    X_train: DataType,
    y_train: TargetType,
    X_test: DataType,
    y_test: TargetType,
    save_train: bool = False,
) -> None:
    """Fit ``estimator``, evaluate it, and log the results on ``experiment``.

    Steps, in order:

    1. ``estimator.fit(X_train, y_train)`` runs under
       ``_add_timing(experiment, "fit_time")``; the estimator is then logged
       as ``"fitted_estimator"``.
    2. ``estimator.score(X_test, y_test)`` runs under
       ``_add_timing(experiment, "score_time")``; the value is logged as
       ``"test_score"``.
    3. When ``save_train`` is true, the score on the training data is logged
       as ``"train_score"`` (this call is not wrapped in a timing context).
    4. For each of ``transform`` and ``predict`` that the estimator provides,
       the method is called on ``X_test`` under a ``"<name>_time"`` timing
       context and its output is logged under ``"<name>"``.

    All logging goes through ``_append_info``; nothing is returned.
    """
    # NOTE(review): the bookkeeping calls are kept outside the timed regions
    # so that only the fit/score work itself is measured -- confirm this
    # matches the intended layout.
    with _add_timing(experiment, "fit_time"):
        estimator.fit(X_train, y_train)
    _append_info(experiment, "fitted_estimator", estimator)

    with _add_timing(experiment, "score_time"):
        held_out_score = estimator.score(X_test, y_test)
    _append_info(experiment, "test_score", held_out_score)

    if save_train:
        _append_info(
            experiment,
            "train_score",
            estimator.score(X_train, y_train),
        )

    # Not every estimator is both a transformer and a predictor, so each
    # optional method is looked up dynamically and skipped when absent.
    for output in ("transform", "predict"):
        producer = getattr(estimator, output, None)
        if producer is None:
            continue
        with _add_timing(experiment, f"{output}_time"):
            _append_info(experiment, f"{output}", producer(X_test))
def _validate(self, model: BaseEstimator, X: np.ndarray, y: np.ndarray):
    """Score ``model`` on a single train/test partition of ``X`` and ``y``.

    The partition comes from ``train_test_split(X, y, **self._options)``.
    ``model`` is only scored here, never fitted.

    Returns a dict with keys ``"train"`` and ``"test"``; each value is the
    result of ``model.score`` on that part, wrapped with ``np.array``.
    """
    parts = train_test_split(X, y, **self._options)
    fit_inputs, held_out_inputs, fit_targets, held_out_targets = parts

    # Dict displays evaluate in source order, so the training part is scored
    # before the held-out part.
    return {
        "train": np.array(model.score(fit_inputs, fit_targets)),
        "test": np.array(model.score(held_out_inputs, held_out_targets)),
    }
def eval_metrics(
    model: BaseEstimator, *data: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Print AUC and score for the train and test splits.

    Args:
        model: Estimator exposing ``score`` and ``decision_function``. It is
            only evaluated here, never fitted.
        *data: Exactly four positional values, in the order
            ``X_train, X_test, y_train, y_test``.

    Returns:
        The ``decision_function`` outputs for the training inputs and for the
        testing inputs, in that order.

    Raises:
        ValueError: If ``data`` does not hold exactly four items (raised by
            the tuple unpacking).
    """
    X_train, X_test, y_train, y_test = data

    decision_values = []
    # Both splits get the same treatment; only the printed label differs.
    for label, inputs, targets in (
        ("Training", X_train, y_train),
        ("Testing", X_test, y_test),
    ):
        acc = model.score(inputs, targets)
        scores = model.decision_function(inputs)
        # roc_auc_score is all that is reported; the ROC curve points that
        # used to be computed alongside it were never used and are omitted.
        auc = roc_auc_score(targets, scores)
        print(f"{label} set: AUC={auc}, acc={acc}")
        decision_values.append(scores)

    return decision_values[0], decision_values[1]
def evaluate(clf: BaseEstimator, X: np.ndarray, y: np.ndarray) -> float:
    """Return the score of the classifier on the given data.

    Delegates to ``clf.score(X, y)``; for scikit-learn classifiers this is
    the mean accuracy.

    Args:
        clf (BaseEstimator): a classifier exposing ``score``
        X (np.ndarray): the input samples
        y (np.ndarray): the expected targets for ``X``

    Returns:
        float: the value reported by ``clf.score``
    """
    accuracy = clf.score(X, y)
    return accuracy
def _train_classifier(
        self,
        attribute_embeddings_dict: List[Dict[str, np.ndarray]],
        estimator: BaseEstimator = LogisticRegression,
        estimator_params: Dict[str, Any] = {
            'solver': 'liblinear',
            'max_iter': 10000,
        },
        random_state: Union[int, None] = None,
        print_model_evaluation: bool = False) -> Tuple[BaseEstimator, float]:
    """Train the sentiment classifier from the provided attribute embeddings.

    The embeddings of attribute set 0 are labelled ``-1`` and those of
    attribute set 1 are labelled ``+1``. The data is then split with a
    stratified, shuffled ``train_test_split`` (``test_size=0.33``), the
    estimator is fitted on the training part and scored on the held-out part.

    NOTE(review): the label polarity above is the one the previous
    implementation produced for equally sized attribute sets; it is kept so
    that ``classes_`` / ``predict_proba`` column order does not change for
    existing callers. Confirm it matches what the callers expect.

    Parameters
    ----------
    attribute_embeddings_dict : List[Dict[str, np.ndarray]]
        A sequence whose items 0 and 1 are dicts mapping each attribute key
        to its embedding, one dict per attribute set.
    estimator : BaseEstimator, optional
        A scikit-learn classifier class (not an instance); it is instantiated
        here with ``estimator_params``. By default ``LogisticRegression``.
    estimator_params : dict, optional
        Keyword arguments used to instantiate the classifier, by default
        { 'solver': 'liblinear', 'max_iter': 10000, }. The dict is copied and
        never modified.
    random_state : Union[int, None], optional
        Seed that makes the split and the classifier reproducible. When not
        None it is also passed to the classifier as ``random_state``.
        By default None.
    print_model_evaluation : bool, optional
        Indicates whether the classification report on the held-out part is
        printed after the training process is completed, by default False.

    Returns
    -------
    Tuple[BaseEstimator, float]
        The trained classifier and the value of its ``score`` method on the
        held-out part (mean accuracy for scikit-learn classifiers).

    Raises
    ------
    Exception
        If, after the split, one of the attribute sets has no training
        examples.
    """
    # Copy before adding the seed: the default dict is created once and
    # shared by every call, so writing into it would leak ``random_state``
    # into later calls (and into a dict supplied by the caller).
    estimator_params = dict(estimator_params)
    if random_state is not None:
        estimator_params['random_state'] = random_state

    attribute_0_embeddings = np.array(
        list(attribute_embeddings_dict[0].values()))
    attribute_1_embeddings = np.array(
        list(attribute_embeddings_dict[1].values()))

    # One label per embedding, sized from the set it belongs to, so labels
    # stay aligned with their rows even when the two sets differ in size.
    attribute_0_labels = -np.ones(attribute_0_embeddings.shape[0])
    attribute_1_labels = np.ones(attribute_1_embeddings.shape[0])

    attributes_embeddings = np.concatenate(
        (attribute_0_embeddings, attribute_1_embeddings))
    attributes_labels = np.concatenate(
        (attribute_0_labels, attribute_1_labels))

    split = train_test_split(attributes_embeddings,
                             attributes_labels,
                             shuffle=True,
                             random_state=random_state,
                             test_size=0.33,
                             stratify=attributes_labels)
    X_embeddings_train, X_embeddings_test, y_train, y_test = split

    num_train_attribute_0_examples = np.count_nonzero(y_train == -1)
    num_train_attribute_1_examples = np.count_nonzero(y_train == 1)

    # Both classes must be represented in the training part.
    if num_train_attribute_0_examples < 1:
        raise Exception(
            'After dividing the datset using stratified train_test_split, '
            'the attribute 0 has 0 training examples.')

    if num_train_attribute_1_examples < 1:
        raise Exception(
            'After dividing the datset using stratified train_test_split, '
            'the attribute 1 has 0 training examples.')

    classifier = estimator(**estimator_params)
    classifier.fit(X_embeddings_train, y_train)

    # evaluate
    score = classifier.score(X_embeddings_test, y_test)

    if print_model_evaluation:
        # Predictions are only needed for the printed report.
        y_pred = classifier.predict(X_embeddings_test)
        print("Classification Report:\n{}".format(
            classification_report(y_test,
                                  y_pred,
                                  labels=classifier.classes_)))

    return classifier, score