def test_train_model_errors_correctly_when_not_scored(
    self, pipeline_logistic: Pipeline, tmp_path: pathlib.Path, train_iris_dataset
):
    """Expect an MLToolingError when training and saving without scoring.

    The estimator is trained and then saved inside a ``log`` context, but
    never scored; the error message must match
    "You haven't scored the estimator".
    """
    unscored_model = Model(pipeline_logistic)
    expected_error = pytest.raises(
        MLToolingError, match="You haven't scored the estimator"
    )

    # A single ``with`` enters both managers in the same order as nesting them.
    with expected_error, unscored_model.log(str(tmp_path)):
        unscored_model.train_estimator(train_iris_dataset)
        storage = FileStorage(tmp_path)
        unscored_model.save_estimator(storage)
def test_randomsearch_can_log_with_context_manager(
    self, feature_union_classifier, train_iris_dataset, tmp_path
):
    """Smoke-test a two-iteration random search run inside a ``log`` context.

    There are no explicit assertions: the test passes when the search
    finishes without raising and its return value unpacks into exactly
    two items.
    """
    model = Model(feature_union_classifier)
    model.config.RUN_DIR = tmp_path
    penalty_choices = {"estimator__penalty": ["l1", "l2"]}

    with model.log("randomsearch_union_test"):
        _, _ = model.randomsearch(
            train_iris_dataset,
            param_distributions=penalty_choices,
            n_iter=2,
        )
def test_gridsearch_can_log_with_context_manager(
    self, feature_union_classifier, train_iris_dataset, tmp_path
):
    """Smoke-test a grid search over two penalties run inside a ``log`` context.

    There are no explicit assertions: the test passes when the search
    finishes without raising and its return value unpacks into exactly
    two items.
    """
    model = Model(feature_union_classifier)
    model.config.RUN_DIR = tmp_path
    penalty_grid = {"estimator__penalty": ["l1", "l2"]}

    with model.log("gridsearch_union_test"):
        _, _ = model.gridsearch(train_iris_dataset, param_grid=penalty_grid)
def test_log_context_manager_works_as_expected(self, regression: Model):
    """Check that ``log`` toggles LOG and RUN_DIR on entry and restores them on exit.

    Before and after the context, ``config.LOG`` is False and ``RUN_DIR`` is
    named "runs". Inside it, ``LOG`` is True and ``RUN_DIR`` is a "test"
    directory whose parent is "runs".
    """
    # Before entering: logging is off and the run directory is the default.
    assert regression.config.LOG is False
    initial_dir = regression.config.RUN_DIR
    assert initial_dir.name == "runs"

    with regression.log("test"):
        # Inside: logging is on and RUN_DIR points at runs/test.
        assert regression.config.LOG is True
        active_dir = regression.config.RUN_DIR
        assert active_dir.name == "test"
        assert active_dir.parent.name == "runs"

    # After leaving: both settings are back to their initial values.
    assert regression.config.LOG is False
    restored_dir = regression.config.RUN_DIR
    assert restored_dir.name == "runs"
    assert "test" not in restored_dir.parts
def test_save_estimator_saves_logging_dir_correctly(
    self, mock_hash: MagicMock, classifier: Model, tmp_path: pathlib.Path
):
    """Save an estimator while logging and check the files that result.

    Verifies that the path returned by ``save_estimator`` exists, that the
    first ``*.yaml`` file found under ``tmp_path`` has "LogisticRegression"
    in its path, and that ``mock_hash`` was called exactly once.

    ``mock_hash`` is supplied from outside this block (presumably by a patch
    decorator that is not visible here -- confirm against the full file).
    """
    mock_hash.return_value = "1234"

    with classifier.log(str(tmp_path)):
        storage = FileStorage(tmp_path)
        saved_path = classifier.save_estimator(storage)

    assert saved_path.exists()

    yaml_paths = [str(path) for path in tmp_path.rglob("*.yaml")]
    assert "LogisticRegression" in yaml_paths[0]

    mock_hash.assert_called_once()
def test_log_context_manager_logs_when_scoring_model(
    self, tmp_path: pathlib.Path, train_iris_dataset
):
    """Scoring inside a ``log`` context must write a log matching the result.

    A ``LinearRegression`` model is scored while logging to
    ``tmp_path / "runs"``. Every ``LinearRegression_*`` file found below that
    directory is parsed as YAML and compared with the returned result: the
    logged ``metrics.r2`` must equal ``result.metrics.score`` and the logged
    ``estimator_name`` must equal ``result.model.estimator_name``.
    """
    model = Model(LinearRegression())
    runs = tmp_path / "runs"

    with model.log(str(runs)):
        result = model.score_estimator(train_iris_dataset)

    log_files = list(runs.rglob("LinearRegression_*"))
    # Guard against a vacuous pass: with no matching files the loop below
    # would run zero times and none of the comparisons would execute.
    assert log_files, f"no LinearRegression_* log file was written under {runs}"

    for log_file in log_files:
        with log_file.open() as f:
            log_result = yaml.safe_load(f)

        assert result.metrics.score == log_result["metrics"]["r2"]
        assert result.model.estimator_name == log_result["estimator_name"]
def train_model(year, month, day, graphs=True, clf=None):
    """Score and save a model for one AirBnB dataset, optionally saving plots.

    Args:
        year: Forwarded to ``AirBnBDataset`` as ``year``.
        month: Forwarded to ``AirBnBDataset`` as ``month``.
        day: Forwarded to ``AirBnBDataset`` as ``day``.
        graphs: When truthy, render the feature-importance, residuals and
            prediction-error plots and save each one under ``VISUALIZATIONS``.
        clf: Estimator to wrap in ``Model``. ``None`` (the default) creates a
            fresh ``RandomForestRegressor()`` on every call, so separate
            calls never share one estimator instance.

    Returns:
        The result object returned by ``model.score_estimator(dataset)``.
    """
    # A default of ``RandomForestRegressor()`` in the signature is evaluated
    # once, at definition time, and would be shared by every defaulted call.
    if clf is None:
        clf = RandomForestRegressor()

    dataset = AirBnBDataset(year=year, month=month, day=day)
    dataset.create_train_test()

    model = Model(clf, feature_pipeline=features)
    result = model.score_estimator(dataset)

    # NOTE(review): N_JOBS is assigned only after scoring has finished, so it
    # cannot influence the score_estimator call above -- confirm whether it
    # was meant to be set before scoring.
    model.config.N_JOBS = 6

    with model.log("randomforest"):
        model.save_estimator()

    if graphs:
        # NOTE(review): the feature-importance plot is saved under the name
        # "confusion_matrix.png". The name is kept as-is because other code
        # or documents may reference it -- confirm and rename if unintended.
        result.plot.feature_importance()
        plt.savefig(VISUALIZATIONS / "confusion_matrix.png")

        result.plot.residuals()
        plt.savefig(VISUALIZATIONS / "residuals.png")

        result.plot.prediction_error()
        plt.savefig(VISUALIZATIONS / "prediction_error.png")

    return result