def test_fine_tune_all_nodes(data_fixture, request):
    """Check that tuning all nodes does not lower test ROC AUC.

    The chain is fitted and scored before tuning, then tuned with
    ``fine_tune_all_nodes``, fitted again via ``fit_from_scratch`` and
    scored once more. Both scores are rounded to two decimals before the
    comparison.
    """
    dataset = request.getfixturevalue(data_fixture)
    train_part, test_part = train_test_data_setup(data=dataset)

    classification_chain = get_class_chain()

    # Baseline prediction (cache disabled for the initial fit)
    classification_chain.fit(train_part, use_cache=False)
    baseline_output = classification_chain.predict(test_part)

    # Tuning followed by a fresh fit and prediction
    classification_chain.fine_tune_all_nodes(
        train_part,
        max_lead_time=timedelta(minutes=1),
        iterations=30)
    classification_chain.fit_from_scratch(train_part)
    tuned_output = classification_chain.predict(test_part)

    def rounded_roc(output):
        # ROC AUC against the test target, rounded to two decimals
        return round(roc(y_true=test_part.target, y_score=output.predict), 2)

    roc_before = rounded_roc(baseline_output)
    roc_after = rounded_roc(tuned_output)

    print(f'Before tune test {roc_before}')
    print(f'After tune test {roc_after}', '\n')

    assert roc_after >= roc_before
def test_chain_hierarchy_fit_correct(data_setup):
    """Fit a diamond-shaped chain of 'logit' nodes and check its structure.

    One primary node feeds two secondary nodes, which both feed the final
    node. The test checks the root descriptive id, chain length and depth,
    the number of train predictions and that the final node got fitted.
    """
    train_part, _ = train_test_data_setup(data_setup)

    source_node = PrimaryNode(operation_type='logit')
    left_branch = SecondaryNode(operation_type='logit',
                                nodes_from=[source_node])
    right_branch = SecondaryNode(operation_type='logit',
                                 nodes_from=[source_node])
    top_node = SecondaryNode(operation_type='logit',
                             nodes_from=[left_branch, right_branch])

    chain = Chain()
    for node in (source_node, left_branch, right_branch, top_node):
        chain.add_node(node)

    chain.unfit()
    fit_output = chain.fit(input_data=train_part)

    expected_root_id = ('((/n_logit_default_params;)/'
                        'n_logit_default_params;;(/'
                        'n_logit_default_params;)/'
                        'n_logit_default_params;)/'
                        'n_logit_default_params')

    assert chain.root_node.descriptive_id == expected_root_id
    assert chain.length == 4
    assert chain.depth == 3
    assert fit_output.predict.shape[0] == train_part.target.shape[0]
    assert top_node.fitted_operation is not None
def test_tune_certain_node_with_tune_class_correctly(data_fixture, request):
    """Check that tuning one node through ``Tune`` does not lower ROC AUC.

    The node with model id 4 of a four-depth chain is tuned. ROC AUC on the
    test split, rounded to one decimal, is compared before and after.
    """
    dataset = request.getfixturevalue(data_fixture)
    train_part, test_part = train_test_data_setup(data=dataset)

    source_chain = create_four_depth_chain()
    source_chain.fit(train_part, use_cache=False)
    baseline_output = source_chain.predict(test_part)

    node_to_tune = 4

    tuner = Tune(source_chain)
    tuned_chain = tuner.fine_tune_certain_node(
        model_id=node_to_tune,
        input_data=train_part,
        max_lead_time=timedelta(minutes=1),
        iterations=30)
    tuned_chain.fit_from_scratch(train_part)
    tuned_output = tuned_chain.predict(test_part)

    def rounded_roc(output):
        # ROC AUC against the test target, rounded to one decimal
        return round(roc(y_true=test_part.target, y_score=output.predict), 1)

    roc_before = rounded_roc(baseline_output)
    roc_after = rounded_roc(tuned_output)

    print(f'Before tune test {roc_before}')
    print(f'After tune test {roc_after}', '\n')

    assert roc_after >= roc_before
def test_fine_tune_primary_nodes(data_fixture, request):
    """Check that tuning primary nodes does not increase test MSE.

    A regression chain is fitted and scored, tuned with
    ``fine_tune_primary_nodes``, fitted again via ``fit_from_scratch`` and
    scored once more.
    """
    dataset = request.getfixturevalue(data_fixture)
    train_part, test_part = train_test_data_setup(data=dataset)

    regression_chain = get_regr_chain()

    # Baseline prediction (cache disabled for the initial fit)
    regression_chain.fit(train_part, use_cache=False)
    baseline_output = regression_chain.predict(test_part)

    # Tuning of the primary nodes
    regression_chain.fine_tune_primary_nodes(
        train_part,
        max_lead_time=timedelta(minutes=1),
        iterations=10)

    # Prediction after tuning
    regression_chain.fit_from_scratch(train_part)
    tuned_output = regression_chain.predict(test_part)

    mse_before = mse(y_true=test_part.target, y_pred=baseline_output.predict)
    mse_after = mse(y_true=test_part.target, y_pred=tuned_output.predict)

    print(f'Before tune test {mse_before}')
    print(f'After tune test {mse_after}', '\n')

    assert mse_after <= mse_before
def get_synthetic_ts_data_period(n_steps=1000, forecast_length=1,
                                 max_window_size=50):
    """Generate a synthetic periodic series and split it into train/test.

    The target is an ARMA sample with a linear combination of two index
    ramps and a sine component added. The two ramps are also used as the
    feature columns.

    :param n_steps: number of generated samples
    :param forecast_length: passed to ``TsForecastingParams``
    :param max_window_size: passed to ``TsForecastingParams``
    :return: result of ``train_test_data_setup`` for the generated data
    """
    series = ArmaProcess().generate_sample(nsample=n_steps)

    ramp = np.arange(0, n_steps)
    shifted_ramp = np.arange(0, n_steps) + 1

    # Trend first, then the periodic component (same order of operations)
    series = series + ramp * 0.0005 - shifted_ramp * 0.0001
    series = series + np.sin(ramp / 50)

    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size,
                                             return_all_steps=False)
    task = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    data = InputData(idx=np.arange(0, n_steps),
                     features=np.asarray([ramp, shifted_ramp]).T,
                     target=series,
                     task=task,
                     data_type=DataTypesEnum.ts)

    return train_test_data_setup(data)
def data_setup():
    """Build small train/test classification datasets from breast-cancer data.

    The samples are shuffled with a fixed seed, the first 100 are kept and
    split with ``train_test_data_setup``. Both parts are then re-wrapped
    into ``InputData`` objects whose ``idx`` starts from zero.

    :return: tuple ``(train_data, test_data)``
    """
    task = Task(TaskTypesEnum.classification)
    predictors, response = load_breast_cancer(return_X_y=True)

    np.random.seed(1)
    # A single shared permutation keeps every feature row paired with its
    # own label; shuffling the two arrays separately would decouple them.
    selected_rows = np.random.permutation(len(predictors))[:100]
    predictors = predictors[selected_rows]
    response = response[selected_rows]

    input_data = InputData(idx=np.arange(0, len(predictors)),
                           features=predictors,
                           target=response,
                           task=task,
                           data_type=DataTypesEnum.table)
    train_data, test_data = train_test_data_setup(data=input_data)

    # Re-wrap both parts so that their indices are renumbered from zero
    train_data = InputData(features=train_data.features,
                           target=train_data.target,
                           idx=np.arange(0, len(train_data.target)),
                           task=task,
                           data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data.features,
                          target=test_data.target,
                          idx=np.arange(0, len(test_data.target)),
                          task=task,
                          data_type=DataTypesEnum.table)

    return train_data, test_data
def test_knn_classification_tune_correct(data_fixture, request):
    """Check that a tuned kNN model can match the untuned one on test data.

    The untuned model gives the reference ROC AUC. Tuning is repeated three
    times; the best tuned score must be at least the reference score, and
    the reference score must exceed a fixed threshold.
    """
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    # Reference: model fitted without tuning
    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)
    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    # Several independent tuning attempts
    roc_on_test_tuned_list = []
    for _ in range(3):
        knn_for_tune = Model(model_type='knn')
        model, _ = knn_for_tune.fine_tune(data=train_data,
                                          iterations=10,
                                          max_lead_time=timedelta(minutes=1))
        test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                    data=test_data)
        roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                    y_score=test_predicted_tuned)
        roc_on_test_tuned_list.append(roc_on_test_tuned)

    roc_threshold = 0.6

    # Compare the best tuned score itself. Reducing the scores with `.any()`
    # would yield a bool, and `True >= roc_on_test` holds for any ROC AUC.
    assert max(roc_on_test_tuned_list) >= roc_on_test > roc_threshold
def get_cholesterol_data():
    """Load the cholesterol CSV as a regression task and split it.

    :return: tuple ``(train, test)`` from ``train_test_data_setup``
    """
    csv_path = join(str(project_root()),
                    'cases/data/cholesterol/cholesterol.csv')
    regression_task = Task(TaskTypesEnum.regression)
    dataset = InputData.from_csv(csv_path, task=regression_task)

    train_part, test_part = train_test_data_setup(dataset)
    return train_part, test_part
def get_kc2_data():
    """Load the kc2 CSV as a classification task and split it.

    :return: tuple ``(train, test)`` from ``train_test_data_setup``
    """
    csv_path = join(str(project_root()), 'cases/data/kc2/kc2.csv')
    classification_task = Task(TaskTypesEnum.classification)
    dataset = InputData.from_csv(csv_path, task=classification_task)

    train_part, test_part = train_test_data_setup(dataset)
    return train_part, test_part
def run_text_problem_from_saved_meta_file(path):
    """Run the text-problem chain on data loaded from a meta file.

    :param path: meta file path passed to ``InputData.from_text_meta_file``
    """
    text_data = InputData.from_text_meta_file(meta_file_path=path)
    train_part, test_part = train_test_data_setup(text_data, split_ratio=0.7)

    score = execute_chain_for_text_problem(train_part, test_part)
    print(f'meta_file metric: {score}')
def test_pca_model_removes_redunant_features_correct():
    """Check that the PCA data model returns fewer columns than it was given.

    The dataset has 100 features, 5 of which are requested as informative.
    """
    informative_count = 5
    dataset = classification_dataset_with_redunant_features(
        n_samples=1000,
        n_features=100,
        n_informative=informative_count)
    train_part, _ = train_test_data_setup(data=dataset)

    pca_model = Model(model_type='pca_data_model')
    _, transformed_train = pca_model.fit(data=train_part)

    source_width = dataset.features.shape[1]
    assert transformed_train.shape[1] < source_width
def get_dataset(task_type: str):
    """Return train data, test data and a quality threshold for a task type.

    :param task_type: one of 'regression', 'classification', 'clustering'
        or 'ts_forecasting'
    :return: tuple ``(train_data, test_data, threshold)``
    :raises ValueError: if ``task_type`` is not one of the supported names
    """
    if task_type == 'regression':
        data = get_synthetic_regression_data()
        train_data, test_data = train_test_data_setup(data)
        threshold = np.std(test_data.target) * 0.05
    elif task_type == 'classification':
        data = get_iris_data()
        train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
        threshold = 0.95
    elif task_type == 'clustering':
        data = get_synthetic_input_data(n_samples=10000)
        train_data, test_data = train_test_data_setup(data)
        threshold = 0.5
    elif task_type == 'ts_forecasting':
        train_data, test_data = get_synthetic_ts_data_period(forecast_length=12)
        # Numeric threshold: the standard deviation of the test target
        # (`np.std`, not the string conversion `np.str`).
        threshold = np.std(test_data.target)
    else:
        raise ValueError('Incorrect type of machine learning task')

    return train_data, test_data, threshold
def test_log_clustering_fit_correct(data_fixture, request):
    """Check that kmeans fitted on scaled data produces the labels 0 and 1."""
    dataset = request.getfixturevalue(data_fixture)

    scaler = ScalingWithImputation().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)

    train_part, _ = train_test_data_setup(data=dataset)

    clustering_model = Model(model_type='kmeans')
    _, train_labels = clustering_model.fit(data=train_part)

    found_labels = np.unique(train_labels)
    assert all(found_labels == [0, 1])
def test_regression_chain_fit_correct():
    """Check that the regression chain reaches a low RMSE on the test split.

    The threshold is 5% of the standard deviation of the full target.
    """
    dataset = get_synthetic_regression_data()
    regression_chain = generate_chain()
    train_part, test_part = train_test_data_setup(dataset)

    regression_chain.fit(input_data=train_part)
    _, test_rmse = get_rmse_value(regression_chain, train_part, test_part)

    allowed_rmse = np.std(dataset.target) * 0.05
    assert test_rmse < allowed_rmse
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that a secondary node's result ignores the order of its parents.

    Two chains are built from the same operations with the parents listed in
    a different order; their descriptive ids and their train and test
    predictions must match. The parents of the already fitted chain are then
    reordered and the chain is refitted; predictions must still match.
    """
    train_part, test_part = train_test_data_setup(data_setup)

    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_node = PrimaryNode(operation_type='knn')
    root_node = SecondaryNode(operation_type='xgboost',
                              nodes_from=[logit_node, lda_node, knn_node])

    chain = Chain()
    for node in (logit_node, lda_node, knn_node, root_node):
        chain.add_node(node)

    # Independent copies of the primary nodes for the second chain
    logit_copy = deepcopy(logit_node)
    lda_copy = deepcopy(lda_node)
    knn_copy = deepcopy(knn_node)
    root_node_shuffled = SecondaryNode(
        operation_type='xgboost',
        nodes_from=[knn_copy, logit_copy, lda_copy])

    chain_shuffled = Chain()
    # Nodes are also added in a different order
    for node in (root_node_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    train_output = chain.fit(input_data=train_part)
    train_output_shuffled = chain_shuffled.fit(input_data=train_part)

    # Fit results should be invariant
    assert (chain.root_node.descriptive_id ==
            chain_shuffled.root_node.descriptive_id)
    assert np.equal(train_output.predict,
                    train_output_shuffled.predict).all()

    test_output = chain.predict(input_data=test_part)
    test_output_shuffled = chain_shuffled.predict(input_data=test_part)

    # Predict results should be invariant
    assert np.equal(test_output.predict,
                    test_output_shuffled.predict).all()

    # Reorder the parents of the root node in the already fitted chain
    parents = chain.nodes[3].nodes_from
    chain.nodes[3].nodes_from = [parents[2], parents[0], parents[1]]
    chain.nodes[3].unfit()
    chain.fit(train_part)

    test_output_reordered = chain.predict(input_data=test_part)

    # Predict results should be invariant
    assert np.equal(test_output.predict,
                    test_output_reordered.predict).all()
def run_text_problem_from_meta_file():
    """Run the text-problem chain on ``data/spam/spamham.csv``.

    The path is resolved to an absolute one before it is passed to
    ``InputData.from_text_meta_file``.
    """
    relative_path = os.path.join('data', 'spam', 'spamham.csv')
    absolute_path = os.path.abspath(relative_path)

    text_data = InputData.from_text_meta_file(meta_file_path=absolute_path)
    train_part, test_part = train_test_data_setup(text_data, split_ratio=0.7)

    score = execute_chain_for_text_problem(train_part, test_part)
    print(f'meta_file metric: {score}')
def get_synthetic_ts_data_period(n_steps=1000, forecast_length=5):
    """Generate a synthetic periodic series and split it into train/test.

    The series is an ARMA sample with a linear combination of two index
    ramps and a sine component added. It is used as both features and
    target of the resulting ``InputData``.

    :param n_steps: number of generated samples
    :param forecast_length: passed to ``TsForecastingParams``
    :return: result of ``train_test_data_setup`` for the generated data
    """
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001
    periodicity = np.sin(x1 / 50)
    simulated_data = simulated_data + periodicity

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    data = InputData(idx=np.arange(0, n_steps),
                     features=simulated_data,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)

    # The data is split exactly once; the result goes straight to the caller
    return train_test_data_setup(data)
def test_log_clustering_fit_correct(data_fixture, request):
    """Check that kmeans on normalized train data produces the labels 0 and 1.

    The data is normalized by a one-node 'normalization' chain that is
    fitted on the train split and then applied to it.
    """
    dataset = request.getfixturevalue(data_fixture)
    train_part, _ = train_test_data_setup(data=dataset)

    # Fit the scaling chain and apply it to the same train data
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_part)
    normalized_train = normalizer.predict(train_part)

    clustering_model = Model(operation_type='kmeans')
    _, train_output = clustering_model.fit(data=normalized_train)

    found_labels = np.unique(train_output.predict)
    assert all(found_labels == [0, 1])
def test_log_regression_fit_correct(classification_dataset):
    """Check that the 'logit' model reaches ROC AUC >= 0.95 on train data."""
    dataset = classification_dataset

    scaler = ScalingWithImputation().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)

    train_part, _ = train_test_data_setup(data=dataset)

    logit_model = Model(model_type='logit')
    _, train_output = logit_model.fit(data=train_part)

    train_roc = get_roc_auc(train_part, train_output)
    min_roc = 0.95

    assert train_roc >= min_roc
def test_output_mode_labels():
    """Check the 'labels' output mode of a chain fitted on iris data.

    The label output must have one value per test sample, contain exactly
    the classes 0, 1 and 2, and differ from the default output.
    """
    iris = get_iris_data()
    chain = chain_simple()
    train_part, test_part = train_test_data_setup(iris, shuffle_flag=True)

    chain.fit(input_data=train_part)
    label_output = chain.predict(input_data=test_part, output_mode='labels')
    default_output = chain.predict(input_data=test_part)

    assert len(label_output.predict) == len(test_part.target)
    assert set(label_output.predict) == {0, 1, 2}
    assert not np.array_equal(default_output.predict, label_output.predict)
def test_random_forest_fit_correct(data_fixture, request):
    """Check that the 'rf' model reaches ROC AUC >= 0.95 on train data."""
    dataset = request.getfixturevalue(data_fixture)

    scaler = ScalingWithImputation().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)

    train_part, _ = train_test_data_setup(data=dataset)

    forest_model = Model(model_type='rf')
    _, train_output = forest_model.fit(data=train_part)

    train_roc = get_roc_auc(train_part, train_output)
    min_roc = 0.95

    assert train_roc >= min_roc
def test_multiclassification_chain_fit_correct():
    """Check that the simple chain exceeds 0.95 macro one-vs-one ROC AUC on iris."""
    iris = get_iris_data()
    chain = chain_simple()
    train_part, test_part = train_test_data_setup(iris, shuffle_flag=True)

    chain.fit(input_data=train_part)
    test_output = chain.predict(input_data=test_part)

    test_roc = roc_auc(y_true=test_part.target,
                       y_score=test_output.predict,
                       multi_class='ovo',
                       average='macro')

    assert test_roc > 0.95
def tune_node(self, input_data, loss_function, node_index, loss_params=None):
    """ Tune hyperparameters of a single node of the chain

    :param input_data: data that is split into train and validation parts
    :param loss_function: function used to score predictions
    :param node_index: index of the node in ``self.chain.nodes``
    :param loss_params: optional parameters for the loss function
    :return: chain returned by ``self.final_check``
    """
    # Hold-out split of the provided data
    train_input, predict_input = train_test_data_setup(input_data)
    test_target = np.array(predict_input.target)

    self.is_need_to_maximize = _greater_is_better(target=test_target,
                                                  loss_function=loss_function,
                                                  loss_params=loss_params)

    # Check the source metrics before any tuning
    self.init_check(train_input, predict_input, test_target,
                    loss_function, loss_params)

    operation_name = str(self.chain.nodes[node_index].operation.operation_type)

    # Parameters of the node that can be optimized
    node_params = get_node_params(node_id=node_index,
                                  operation_name=operation_name)

    if node_params is not None:
        # Run the optimization for the selected node
        self._optimize_node(node_id=node_index,
                            train_input=train_input,
                            predict_input=predict_input,
                            test_target=test_target,
                            node_params=node_params,
                            iterations_per_node=self.iterations,
                            seconds_per_node=self.max_seconds,
                            loss_function=loss_function,
                            loss_params=loss_params)
    else:
        print(f'"{operation_name}" operation has no parameters to optimize')

    # Final validation of the chain after the tuning step
    return self.final_check(train_input=train_input,
                            predict_input=predict_input,
                            test_target=test_target,
                            tuned_chain=self.chain,
                            loss_function=loss_function,
                            loss_params=loss_params)
def test_eval_strategy_logreg(data_setup):
    """Check that a 'logit' node predicts as many values as sklearn does.

    A scikit-learn ``LogisticRegression`` is fitted on the same split as a
    reference; only the number of predictions is compared.
    """
    train_part, test_part = train_test_data_setup(data=data_setup)

    reference_model = LogisticRegression(C=10.,
                                         random_state=1,
                                         solver='liblinear',
                                         max_iter=10000,
                                         verbose=0)
    reference_model.fit(train_part.features, train_part.target)
    reference_prediction = reference_model.predict(test_part.features)

    logit_node = PrimaryNode(model_type='logit')
    logit_node.fit(input_data=train_part)
    node_output = logit_node.predict(input_data=test_part)

    assert len(node_output.predict) == len(reference_prediction)
def test_output_mode_full_probs():
    """Check the 'full_probs' and 'probs' output modes on binary data.

    'probs' must equal the default output and be one-dimensional, while
    'full_probs' must differ from it and have two columns.
    """
    binary_data = get_binary_classification_data()
    chain = chain_simple()
    train_part, test_part = train_test_data_setup(binary_data,
                                                  shuffle_flag=True)

    chain.fit(input_data=train_part)
    full_output = chain.predict(input_data=test_part,
                                output_mode='full_probs')
    default_output = chain.predict(input_data=test_part)
    probs_output = chain.predict(input_data=test_part, output_mode='probs')

    sample_count = len(test_part.target)

    assert not np.array_equal(probs_output.predict, full_output.predict)
    assert np.array_equal(probs_output.predict, default_output.predict)
    assert full_output.predict.shape == (sample_count, 2)
    assert probs_output.predict.shape == (sample_count, )
def test_model_fit_and_predict_correctly():
    """Check the chain's train and test AUC thresholds on synthetic data."""
    dataset = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)
    chain = generate_chain()
    train_part, test_part = train_test_data_setup(dataset)

    chain.fit(input_data=train_part)
    train_roc, test_roc = get_roc_auc_value(chain, train_part, test_part)

    train_threshold = get_auc_threshold(train_roc)
    test_threshold = get_auc_threshold(test_roc)

    assert train_threshold >= CORRECT_MODEL_AUC_THR
    assert test_threshold >= CORRECT_MODEL_AUC_THR
def synthetic_benchmark_composing_example():
    """Fit a two-level chain on a synthetic benchmark and print its ROC scores.

    The benchmark dataset is generated with the help of a separately fitted
    chain. Its first ten feature rows and targets are printed as a preview.
    """
    sample_count = 5000
    feature_count = 10

    generator_chain = separately_fit_chain(samples=sample_count,
                                           features_amount=feature_count,
                                           classes=2)
    benchmark = synthetic_benchmark_dataset(samples_amount=sample_count,
                                            features_amount=feature_count,
                                            fitted_chain=generator_chain)

    print(f'Synthetic features: {benchmark.features[:10]}')
    print(f'Synthetic target: {benchmark.target[:10]}')

    train_part, test_part = train_test_data_setup(benchmark)

    evaluated_chain = two_level_chain()
    evaluated_chain.fit(input_data=train_part, use_cache=False)

    print(f'ROC score on train: {roc_value(evaluated_chain, train_part)}')
    print(f'ROC score on test {roc_value(evaluated_chain, test_part)}')
def test_regression_chain_with_datamodel_fit_correct():
    """Check the prediction shape of a chain that includes a data-model node.

    A 'lasso' root receives a 'ridge' node and a 'direct_data_model' node as
    parents; the prediction shape must equal the test target shape.
    """
    dataset = get_synthetic_regression_data()
    train_part, test_part = train_test_data_setup(dataset)

    passthrough_node = PrimaryNode('direct_data_model')
    ridge_node = PrimaryNode('ridge')

    lasso_root = SecondaryNode('lasso')
    lasso_root.nodes_from = [ridge_node, passthrough_node]

    chain = Chain(lasso_root)
    chain.fit(train_part)
    test_output = chain.predict(test_part)

    assert test_output.predict.shape == test_part.target.shape
def test_logger_manager_keeps_loggers_correctly():
    """Check the logger count reported by ``LogManager`` after a chain fit.

    The manager cache is cleared first. After a four-depth chain is fitted,
    ``LogManager().debug['loggers_number']`` must equal 4.
    """
    LogManager().clear_cache()

    chain = create_four_depth_chain()
    expected_count = 4

    relative_csv = os.path.join('../data', 'advanced_classification.csv')
    tests_dir = str(os.path.dirname(__file__))
    dataset = InputData.from_csv(os.path.join(tests_dir, relative_csv))
    train_part, _ = train_test_data_setup(data=dataset)

    chain.fit(train_part)

    actual_count = LogManager().debug['loggers_number']
    assert actual_count == expected_count
def test_chain_with_clusters_fit_correct():
    """Check that the mean test ROC AUC over five runs exceeds 0.5.

    The mean over several independent runs is analysed because the
    clustering is stochastic.
    """
    roc_on_test_values = []

    for _ in range(5):
        data = get_synthetic_input_data(n_samples=10000)
        chain = generate_chain()
        train_data, test_data = train_test_data_setup(data)

        chain.fit(input_data=train_data)
        _, roc_on_test = get_roc_auc_value(chain, train_data, test_data)
        roc_on_test_values.append(roc_on_test)

    # Average once over all runs. Averaging the running value with each new
    # score would start from 0 and halve the weight of every earlier run.
    mean_roc_on_test = np.mean(roc_on_test_values)

    roc_threshold = 0.5
    assert mean_roc_on_test > roc_threshold