def test_classification_models_fit_correct(data_fixture, request):
    """Every suitable classification ML model should reach the ROC-AUC threshold on test data.

    Naive Bayes variants ('bernb', 'multinb') only need to beat random (0.5).
    """
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)
    roc_threshold = 0.95
    logger = default_log('default_test_logger')

    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(
            task_type=TaskTypesEnum.classification,
            data_type=data.data_type, tags=['ml'])

    for model_name in model_names:
        logger.info(f"Test classification model: {model_name}.")
        model = Model(operation_type=model_name)
        # bind the fitted operation to a real name — it is used below,
        # so `_` (the discard convention) was misleading
        fitted_operation, train_predicted = model.fit(data=train_data)
        test_pred = model.predict(fitted_operation=fitted_operation,
                                  data=test_data,
                                  is_fit_pipeline_stage=False)
        roc_on_test = get_roc_auc(valid_data=test_data,
                                  predicted_data=test_pred)
        if model_name not in ['bernb', 'multinb']:
            assert roc_on_test >= roc_threshold
        else:
            # Naive Bayes models only have to outperform a random classifier
            assert roc_on_test >= 0.5
def test_log_clustering_fit_correct(data_fixture, request):
    """k-means fitted on normalized data must label every sample as cluster 0 or 1."""
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Normalize the features before clustering
    preprocessing = Pipeline(PrimaryNode('normalization'))
    preprocessing.fit(train_data)
    normalized_data = preprocessing.predict(train_data)

    clustering_model = Model(operation_type='kmeans')
    _, train_predicted = clustering_model.fit(data=normalized_data)

    # exactly two clusters, labelled 0 and 1
    assert all(np.unique(train_predicted.predict) == [0, 1])
def test_svc_fit_correct(data_fixture, request):
    """SVC fitted on normalized data should separate the training set well (ROC-AUC >= 0.95)."""
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Normalize the features before fitting the classifier
    preprocessing = Pipeline(PrimaryNode('normalization'))
    preprocessing.fit(train_data)
    normalized_data = preprocessing.predict(train_data)

    svc_model = Model(operation_type='svc')
    _, train_predicted = svc_model.fit(data=normalized_data)

    roc_on_train = get_roc_auc(valid_data=train_data,
                               predicted_data=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_random_forest_fit_correct(data_fixture, request):
    """Random forest fitted on normalized data should fit the training set well (ROC-AUC >= 0.95)."""
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Normalize the features before fitting the classifier
    preprocessing = Chain(PrimaryNode('normalization'))
    preprocessing.fit(train_data)
    normalized_data = preprocessing.predict(train_data)

    forest_model = Model(operation_type='rf')
    _, train_predicted = forest_model.fit(data=normalized_data)

    roc_on_train = get_roc_auc(valid_data=train_data,
                               predicted_data=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_log_regression_fit_correct(classification_dataset):
    """Logistic regression fitted on normalized data should fit the training set well (ROC-AUC >= 0.95)."""
    data = classification_dataset
    train_data, test_data = train_test_data_setup(data=data)

    # Normalize the features before fitting the classifier
    preprocessing = Chain(PrimaryNode('normalization'))
    preprocessing.fit(train_data)
    normalized_data = preprocessing.predict(train_data)

    logit_model = Model(operation_type='logit')
    _, train_predicted = logit_model.fit(data=normalized_data)

    roc_on_train = get_roc_auc(valid_data=train_data,
                               predicted_data=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_node_factory_log_reg_correct(data_setup):
    """A PrimaryNode built from the 'logit' identifier must wrap a Model operation."""
    operation_id = 'logit'
    node = PrimaryNode(operation_type=operation_id)

    assert node.__class__ == PrimaryNode
    # the node's internal operation must be the same class the Model factory produces
    assert node.operation.__class__ == Model(operation_type=operation_id).__class__
def test_ts_models_fit_correct():
    """Every suitable time-series model should forecast with MAE below twice the target variance."""
    train_data, test_data = get_ts_data(forecast_length=5)
    logger = default_log('default_test_logger')

    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(
            task_type=TaskTypesEnum.ts_forecasting, tags=['time_series'])

    for model_name in model_names:
        logger.info(f"Test time series model: {model_name}.")
        model = Model(operation_type=model_name)
        # deepcopy: some models mutate the input data during fit
        # bind the fitted operation to a real name — it is used below,
        # so `_` (the discard convention) was misleading
        fitted_operation, train_predicted = model.fit(data=deepcopy(train_data))
        test_pred = model.predict(fitted_operation=fitted_operation,
                                  data=test_data,
                                  is_fit_pipeline_stage=False)
        mae_value_test = mean_absolute_error(y_true=test_data.target,
                                             y_pred=test_pred.predict[0])

        # loose sanity bound: forecast error should not exceed 2x target variance
        mae_threshold = np.var(test_data.target) * 2
        assert mae_value_test < mae_threshold
def test_regression_models_fit_correct():
    """Every suitable regression ML model should score MSE below the target variance."""
    data = get_synthetic_regression_data(n_samples=1000, random_state=42)
    train_data, test_data = train_test_data_setup(data)
    logger = default_log('default_test_logger')

    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(
            task_type=TaskTypesEnum.regression, tags=['ml'])

    for model_name in model_names:
        logger.info(f"Test regression model: {model_name}.")
        model = Model(operation_type=model_name)
        # bind the fitted operation to a real name — it is used below,
        # so `_` (the discard convention) was misleading
        fitted_operation, train_predicted = model.fit(data=train_data)
        test_pred = model.predict(fitted_operation=fitted_operation,
                                  data=test_data,
                                  is_fit_pipeline_stage=False)
        # mean_squared_error returns MSE (not RMSE) — name it accordingly
        mse_value_test = mean_squared_error(y_true=test_data.target,
                                            y_pred=test_pred.predict)

        # np.var == np.std ** 2: a model beating the target variance beats
        # the trivial predict-the-mean baseline
        mse_threshold = np.var(test_data.target)
        assert mse_value_test < mse_threshold
def test_logger_write_logs_correctly():
    """Fitting an unsuitable model must write the evaluation-strategy error into the log file."""
    test_file_path = str(os.path.dirname(__file__))
    test_log_file = os.path.join(test_file_path, 'test_log.log')
    test_log = default_log('test_log', log_file=test_log_file)

    # Model data preparation
    csv_path = os.path.join('../data', 'advanced_classification.csv')
    data = InputData.from_csv(os.path.join(test_file_path, csv_path))
    train_data, test_data = train_test_data_setup(data=data)

    try:
        # 'knnreg' is a regression model: fitting it on classification data is
        # expected to fail and emit an error message through the logger
        knn = Model(operation_type='knnreg', log=test_log)
        model, _ = knn.fit(train_data, is_fit_chain_stage=True)
    except Exception:
        print('Captured error')

    # fail clearly if the log file was never created, instead of crashing
    # with a NameError on an unbound `content` variable
    assert os.path.exists(test_log_file), 'log file was not created'
    # do not reuse the name of the csv-path variable for the file handle
    with open(test_log_file, 'r') as log_handle:
        content = log_handle.readlines()
    release_log(logger=test_log, log_file=test_log_file)
    assert 'Can not find evaluation strategy' in content[0]
def get_operation(self) -> Operation:
    """ Factory method returns the desired object of the 'Data_operation'
    or 'Model' class which depends on model_type variable
    """
    # dispatch table: operation_type -> constructor
    constructors = {'model': Model,
                    'data_operation': DataOperation}

    operation_class = constructors.get(self.operation_type)
    if operation_class is None:
        raise ValueError(f'Operation type {self.operation_type} is not supported')

    return operation_class(operation_type=self.operation_name)