def get_rmse_value(chain: Chain, train_data: InputData, test_data: InputData) -> tuple:
    """
    Score the chain on the train and test samples with ``ts_mse``

    :param chain: chain whose ``predict`` method is called for both samples
    :param train_data: data used to obtain the train score
    :param test_data: data used to obtain the test score
    :return: four values - train score, test score, chain output for the
        train data and chain output for the test data
    """
    train_pred = chain.predict(input_data=train_data)
    test_pred = chain.predict(input_data=test_data)

    # The test score is computed first, but the returned tuple is train-first
    rmse_value_test = ts_mse(obs=test_data.target, pred=test_pred.predict)
    rmse_value_train = ts_mse(obs=train_data.target, pred=train_pred.predict)

    # The previous annotation promised two floats although both prediction
    # objects are returned as well
    return rmse_value_train, rmse_value_test, train_pred, test_pred
def get_roc_auc_value(chain: Chain, train_data: InputData, test_data: InputData) -> (float, float):
    """
    Calculate ROC AUC of the chain predictions for the train and test samples

    :param chain: chain whose ``predict`` method is called for both samples
    :param train_data: data used to obtain the train score
    :param test_data: data used to obtain the test score
    :return: ROC AUC for the train data, ROC AUC for the test data
    """
    prediction_on_train = chain.predict(input_data=train_data)
    prediction_on_test = chain.predict(input_data=test_data)

    test_score = roc_auc(y_true=test_data.target,
                         y_score=prediction_on_test.predict)
    train_score = roc_auc(y_true=train_data.target,
                          y_score=prediction_on_train.predict)

    return train_score, test_score
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """
    Check that the order of parents of a secondary node, and the order in
    which nodes are added to the chain, do not change fit/predict results
    """
    train, test = train_test_data_setup(data_setup)

    # Reference chain: three primary nodes feeding a single root
    logit_node, lda_node, knn_node = (
        PrimaryNode(operation_type=name) for name in ('logit', 'lda', 'knn')
    )
    root = SecondaryNode(operation_type='xgboost',
                         nodes_from=[logit_node, lda_node, knn_node])

    chain = Chain()
    for node in (logit_node, lda_node, knn_node, root):
        chain.add_node(node)

    # Independent copies of the primary nodes for the second chain
    logit_copy, lda_copy, knn_copy = (
        deepcopy(node) for node in (logit_node, lda_node, knn_node)
    )
    root_shuffled = SecondaryNode(operation_type='xgboost',
                                  nodes_from=[knn_copy, logit_copy, lda_copy])

    # change order of nodes in list
    chain_shuffled = Chain()
    for node in (root_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    fitted_output = chain.fit(input_data=train)
    fitted_output_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert np.equal(fitted_output.predict, fitted_output_shuffled.predict).all()

    test_output = chain.predict(input_data=test)
    test_output_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_output.predict, test_output_shuffled.predict).all()

    # change parents order for the nodes fitted chain
    parents = chain.nodes[3].nodes_from
    chain.nodes[3].nodes_from = [parents[position] for position in (2, 0, 1)]
    chain.nodes[3].unfit()
    chain.fit(train)
    test_output_re_shuffled = chain.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_output.predict, test_output_re_shuffled.predict).all()
def get_rmse_value(chain: Chain, train_data: InputData, test_data: InputData) -> (float, float):
    """
    Calculate the ``mse(..., squared=False)`` score of the chain predictions
    for the train and test samples

    :param chain: chain whose ``predict`` method is called for both samples
    :param train_data: data used to obtain the train score
    :param test_data: data used to obtain the test score
    :return: score for the train data, score for the test data
    """
    prediction_on_train = chain.predict(input_data=train_data)
    prediction_on_test = chain.predict(input_data=test_data)

    test_score = mse(y_true=test_data.target,
                     y_pred=prediction_on_test.predict,
                     squared=False)
    train_score = mse(y_true=train_data.target,
                      y_pred=prediction_on_train.predict,
                      squared=False)

    return train_score, test_score
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """
    Fit a stacked scikit-learn style pipeline and a chain on the same data
    and print the ROC AUC of both (labelled TPOT and FEDOT in the output)

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for test
    :return: ROC AUC of the chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_pipeline = make_pipeline(StackingEstimator(estimator=BernoulliNB()),
                                     RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(stacked_pipeline.steps, 'random_state', 1)

    stacked_pipeline.fit(x_train, y_train)
    # Column 1 of predict_proba is used as the score
    pipeline_scores = stacked_pipeline.predict_proba(x_test)[:, 1]
    pipeline_roc_auc = roc_auc(y_true=y_test, y_score=pipeline_scores)
    print(f'ROC AUC for TPOT: {pipeline_roc_auc}')

    # Chain: scaling feeds bernb, and both of them feed the rf root
    scaling = PrimaryNode('scaling')
    bernb = SecondaryNode('bernb', nodes_from=[scaling])
    forest = SecondaryNode('rf', nodes_from=[bernb, scaling])
    chain = Chain(forest)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    chain_roc_auc = roc_auc(y_true=y_test, y_score=chain_output.predict)
    print(f'ROC AUC for FEDOT: {chain_roc_auc}')

    return chain_roc_auc
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run chain with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # timedelta.seconds holds only the seconds component and silently drops
    # whole days, so the full duration is taken from total_seconds() instead
    node_tpot.operation.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def test_save_load_fitted_atomized_chain_correctly():
    """
    Save a fitted chain with nested atomized models, load it back and check
    that the structure is preserved and the refitted loaded chain is not
    worse on the test data (by mean squared error)
    """
    save_name = 'test_save_load_fitted_atomized_chain_correctly'

    chain = create_chain_with_several_nested_atomized_model()
    train_data, test_data = create_data_for_train()
    chain.fit(train_data)

    saved_json = chain.save_chain(save_name)
    load_path = create_correct_path(save_name)

    restored_chain = Chain()
    restored_chain.load_chain(load_path)
    resaved_json = restored_chain.save_chain(
        'test_save_load_fitted_atomized_chain_correctly_loaded')

    assert chain.length == restored_chain.length
    assert saved_json == resaved_json

    prediction_before = chain.predict(test_data)

    # The loaded chain is fitted again before it is used for prediction
    restored_chain.fit(train_data)
    prediction_after = restored_chain.predict(test_data)

    mse_before = mean_squared_error(y_true=test_data.target,
                                    y_pred=prediction_before.predict)
    mse_after = mean_squared_error(y_true=test_data.target,
                                   y_pred=prediction_after.predict)

    assert mse_after <= mse_before
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """
    Run a chain with the "tpot" and "lda" models as parents of an "rf" root

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: value passed to the "tpot" model as
        ``max_run_time_sec`` (converted to whole seconds)

    :return: ROC AUC of the chain on the validation data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # timedelta.seconds holds only the seconds component and silently drops
    # whole days, so the full duration is taken from total_seconds() instead
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def chain_tuning(nodes_to_tune: str, chain: Chain, train_data: InputData,
                 test_data: InputData, local_iter: int,
                 tuner_iter_num: int = 50) -> (float, list):
    """
    Tune the chain several times and collect ROC AUC on the test data

    :param nodes_to_tune: 'primary' selects ``chain.fine_tune_primary_nodes``,
        'root' selects ``chain.fine_tune_all_nodes``
    :param chain: chain to tune
    :param train_data: data used for tuning and for the fit after tuning
    :param test_data: data used to calculate ROC AUC after each iteration
    :param local_iter: number of tune-fit-predict iterations
    :param tuner_iter_num: value passed to the tuner as ``iterations``
    :return: mean ROC AUC over the iterations, list of ROC AUC per iteration
    :raises ValueError: if ``nodes_to_tune`` is neither 'primary' nor 'root'
    """
    scores_per_iteration = []

    # Reject unknown modes before anything is printed or tuned
    if nodes_to_tune not in ('primary', 'root'):
        raise ValueError('Invalid type of nodes. Nodes must be primary or root')

    if nodes_to_tune == 'primary':
        print('primary_node_tuning')
        tune = chain.fine_tune_primary_nodes
    else:
        print('root_node_tuning')
        tune = chain.fine_tune_all_nodes

    for iteration in range(local_iter):
        print(f'current local iteration {iteration}')

        # Chain tuning
        tune(train_data, iterations=tuner_iter_num)

        # After tuning prediction
        chain.fit(train_data)
        tuned_prediction = chain.predict(test_data)

        # Metrics
        scores_per_iteration.append(
            roc_auc(y_true=test_data.target, y_score=tuned_prediction.predict))

    mean_score = float(np.mean(scores_per_iteration))
    return mean_score, scores_per_iteration
def test_forecast_with_exog():
    """
    Check that a chain with a 'lagged' node and an 'exog' node feeding a
    'linear' root reproduces the expected synthetic series exactly
    """
    (train_source_ts, predict_source_ts,
     train_exog_ts, predict_exog_ts, ts_test) = synthetic_with_exogenous_ts()

    # Source data for lagged node
    lagged_data = {'fit': train_source_ts, 'predict': predict_source_ts}
    node_lagged = PrimaryNode('lagged', node_data=lagged_data)

    # Set window size for lagged transformation
    # (window_size is not defined here; it comes from the enclosing scope)
    node_lagged.custom_params = {'window_size': window_size}

    # Exogenous variable for exog node
    exog_data = {'fit': train_exog_ts, 'predict': predict_exog_ts}
    node_exog = PrimaryNode('exog', node_data=exog_data)

    chain = Chain(SecondaryNode('linear', nodes_from=[node_lagged, node_exog]))
    chain.fit()

    forecast = chain.predict()
    flat_forecast = np.ravel(np.array(forecast.predict))

    assert tuple(flat_forecast) == tuple(ts_test)
def apply_model_to_data(model: Chain, data_path: str):
    """
    Apply the chain to the data from the given file and attach the labels

    :param model: chain used for prediction
    :param data_path: path passed to ``create_multi_clf_examples_from_excel``
    :return: dataframe returned by that helper with an added 'forecast' column
    """
    frame, csv_path = create_multi_clf_examples_from_excel(data_path,
                                                           return_df=True)

    # No target column is read, the data is used for prediction only
    unlabelled_data = InputData.from_csv(csv_path, target_column=None)
    chain_output = model.predict(unlabelled_data)

    frame['forecast'] = probs_to_labels(chain_output.predict)
    return frame
def calculate_validation_metric(chain: Chain, dataset_to_validate: InputData) -> float:
    """
    Calculate ROC AUC of the chain on the validation dataset

    :param chain: chain used for prediction
    :param dataset_to_validate: data with the target to compare against
    :return: ROC AUC value
    """
    # the execution of the obtained composite models
    chain_output = chain.predict(dataset_to_validate)

    # the quality assessment for the simulation results
    return roc_auc(y_true=dataset_to_validate.target,
                   y_score=chain_output.predict)
def validate_model_quality(model: Chain, data_path: str):
    """
    Calculate the macro-averaged one-vs-one ROC AUC of the chain on a csv file

    :param model: chain used for prediction
    :param data_path: path to the csv file with data for validation
    :return: ROC AUC rounded to 3 decimal places
    """
    dataset_to_validate = InputData.from_csv(data_path)

    predicted_labels = model.predict(dataset_to_validate).predict

    # The target must come from the dataset that was just loaded and predicted;
    # the previous code read `test_data`, which is not defined in this function
    roc_auc_valid = round(roc_auc(y_true=dataset_to_validate.target,
                                  y_score=predicted_labels,
                                  multi_class='ovo',
                                  average='macro'), 3)
    return roc_auc_valid
def test_ts_forecasting_lagged_data_operation():
    """
    Check that a 'lagged' -> 'ridge' chain produces a forecast with as many
    elements as the expected values
    """
    train_input, predict_input, y_test = get_time_series()

    lagged = PrimaryNode('lagged')
    chain = Chain(SecondaryNode('ridge', nodes_from=[lagged]))

    chain.fit_from_scratch(train_input)
    chain_output = chain.predict(predict_input)

    flat_forecast = np.ravel(chain_output.predict)
    flat_expected = np.ravel(y_test)

    assert len(flat_forecast) == len(flat_expected)
def execute_chain_for_text_problem(train_data, test_data):
    """
    Fit a 'text_clean' -> 'tfidf' -> 'multinb' chain and score it with ROC AUC

    :param train_data: data used to fit the chain
    :param test_data: data used for prediction and as the source of the target
    :return: ROC AUC of the chain on the test data
    """
    text_clean = PrimaryNode('text_clean')
    tfidf = SecondaryNode('tfidf', nodes_from=[text_clean])
    classifier = SecondaryNode('multinb', nodes_from=[tfidf])

    chain = Chain(classifier)
    chain.fit(train_data)

    chain_output = chain.predict(test_data)

    return roc_auc(y_true=test_data.target, y_score=chain_output.predict)
def test_log_clustering_fit_correct(data_fixture, request):
    """
    Check that 'kmeans' fitted on normalized train data yields exactly the
    labels 0 and 1

    :param data_fixture: name of the fixture that provides the data
    :param request: pytest request object used to resolve the fixture by name
    """
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling chain. Fit predict it
    scaling_chain = Chain(PrimaryNode('normalization'))
    scaling_chain.fit(train_data)
    scaled_data = scaling_chain.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    # array_equal compares shape as well as values, so a different number of
    # labels fails this assertion instead of raising from the `==` broadcast
    # or from all() being applied to a scalar
    assert np.array_equal(np.unique(train_predicted.predict), [0, 1])
def test_ts_forecasting_smoothing_data_operation():
    """
    Check that chains starting with each smoothing operation, followed by
    'lagged' and 'ridge', forecast as many elements as the expected values
    """
    train_input, predict_input, y_test = get_time_series()

    for operation_name in ('smoothing', 'gaussian_filter'):
        smoothing = PrimaryNode(operation_name)
        lagged = SecondaryNode('lagged', nodes_from=[smoothing])
        ridge = SecondaryNode('ridge', nodes_from=[lagged])

        chain = Chain(ridge)
        chain.fit_from_scratch(train_input)

        chain_output = chain.predict(predict_input)
        flat_forecast = np.ravel(chain_output.predict)

        assert len(flat_forecast) == len(np.ravel(y_test))
def execute_chain_for_text_problem(train_data, test_data):
    """
    Fit a 'tfidf' -> 'multinb' chain with manually set preprocessing
    strategies and score it with ROC AUC

    :param train_data: data used to fit the chain
    :param test_data: data used for prediction and as the source of the target
    :return: ROC AUC of the chain on the test data
    """
    tfidf = PrimaryNode('tfidf',
                        manual_preprocessing_func=TextPreprocessingStrategy)
    classifier = SecondaryNode('multinb',
                               nodes_from=[tfidf],
                               manual_preprocessing_func=EmptyStrategy)

    chain = Chain(nodes=[classifier, tfidf])
    chain.fit(train_data)

    chain_output = chain.predict(test_data)

    return roc_auc(y_true=test_data.target, y_score=chain_output.predict)
def test_regression_chain_with_datamodel_fit_correct():
    """
    Check that a 'lasso' root fed by 'ridge' and 'direct_data_model' nodes
    predicts an array with the same shape as the test target
    """
    train_data, test_data = train_test_data_setup(get_synthetic_regression_data())

    data_node = PrimaryNode('direct_data_model')
    ridge = PrimaryNode('ridge')

    # Parents are assigned after the root node is created
    lasso = SecondaryNode('lasso')
    lasso.nodes_from = [ridge, data_node]

    chain = Chain(lasso)
    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    assert chain_output.predict.shape == test_data.target.shape
def get_value(cls, chain: Chain, reference_data: InputData) -> float:
    """
    Evaluate the metric for the chain on the reference data

    :param chain: chain used for prediction
    :param reference_data: data passed to the chain and to ``cls.metric``
    :return: metric value, or ``cls.default_value`` if the evaluation raised
    """
    value = cls.default_value
    try:
        results = chain.predict(reference_data, output_mode=cls.output_mode)

        if reference_data.task.task_type == TaskTypesEnum.ts_forecasting:
            # Convert prediction into one-dimensional array
            results.predict = np.ravel(np.array(results.predict))

        # The metric call is the same for every task type
        value = cls.metric(reference_data, results)
    except Exception as error:
        # Best effort: the error is reported and the default value is returned
        print(f'Metric evaluation error: {error}')
    return value
def test_log_regression_fit_correct(classification_dataset):
    """
    Check that 'logit' fitted on normalized train data reaches a ROC AUC of
    at least 0.95 on the train sample
    """
    train_data, _ = train_test_data_setup(data=classification_dataset)

    # Scaling chain. Fit predict it
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_data)
    normalized_train = normalizer.predict(train_data)

    logit = Model(operation_type='logit')
    _, fitted_output = logit.fit(data=normalized_train)

    train_roc_auc = get_roc_auc(valid_data=train_data,
                                predicted_data=fitted_output)

    minimal_roc_auc = 0.95
    assert train_roc_auc >= minimal_roc_auc
def test_random_forest_fit_correct(data_fixture, request):
    """
    Check that 'rf' fitted on normalized train data reaches a ROC AUC of at
    least 0.95 on the train sample

    :param data_fixture: name of the fixture that provides the data
    :param request: pytest request object used to resolve the fixture by name
    """
    dataset = request.getfixturevalue(data_fixture)
    train_data, _ = train_test_data_setup(data=dataset)

    # Scaling chain. Fit predict it
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_data)
    normalized_train = normalizer.predict(train_data)

    forest = Model(operation_type='rf')
    _, fitted_output = forest.fit(data=normalized_train)

    train_roc_auc = get_roc_auc(valid_data=train_data,
                                predicted_data=fitted_output)

    minimal_roc_auc = 0.95
    assert train_roc_auc >= minimal_roc_auc
def test_pca_model_removes_redunant_features_correct():
    """
    Check that 'pca' applied to normalized data returns fewer columns than
    the source dataset has
    """
    dataset = classification_dataset_with_redunant_features(n_samples=1000,
                                                            n_features=100,
                                                            n_informative=5)
    train_data, _ = train_test_data_setup(data=dataset)

    # Scaling chain. Fit predict it
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_data)
    normalized_train = normalizer.predict(train_data)

    pca = DataOperation(operation_type='pca')
    _, fitted_output = pca.fit(data=normalized_train)

    columns_after = fitted_output.predict.shape[1]
    columns_before = dataset.features.shape[1]

    assert columns_after < columns_before
def test_classification_data_operations():
    """
    Check that every listed data operation followed by 'logit' predicts as
    many elements as the expected values
    """
    train_input, predict_input, y_test = get_small_classification_dataset()

    operations_to_check = ('kernel_pca', 'pca', 'scaling', 'normalization',
                           'poly_features', 'rfe_lin_class',
                           'rfe_non_lin_class')

    for operation_name in operations_to_check:
        operation_node = PrimaryNode(operation_name)
        chain = Chain(SecondaryNode('logit', nodes_from=[operation_node]))

        # Fit and predict for chain
        chain.fit_from_scratch(train_input)
        chain_output = chain.predict(predict_input)

        assert len(chain_output.predict) == len(y_test)
def get_value(cls, chain: Chain, reference_data: InputData) -> float:
    """
    Evaluate the metric for the chain on the reference data

    :param chain: chain used for prediction
    :param reference_data: data passed to the chain and to ``cls.metric``
    :return: metric value, or ``cls.default_value`` if the evaluation raised
    :raises ValueError: if ``cls.default_value`` is None
    """
    metric = cls.default_value

    # Only a missing default is an error; a truthiness check would also reject
    # a legitimate default of 0 or 0.0
    if metric is None:
        raise ValueError('Default value for metric not found')

    try:
        results = chain.predict(reference_data)

        if reference_data.task.task_type == TaskTypesEnum.ts_forecasting:
            # The mask is built once from the unfiltered prediction and then
            # applied to both the target and the prediction
            not_nan_mask = ~np.isnan(results.predict)

            # Shallow copy, so the caller's object keeps its original target
            new_reference_data = copy(reference_data)
            new_reference_data.target = new_reference_data.target[not_nan_mask]
            results.predict = results.predict[not_nan_mask]

            metric = cls.metric(new_reference_data, results)
        else:
            metric = cls.metric(reference_data, results)
    except Exception as ex:
        # Best effort: the error is reported and the default value is returned
        print(f'Metric evaluation error: {ex}')
    return metric
def run_import_export_example(chain_path):
    """
    Fit a chain on a generated regression dataset, save it, load it into a new
    chain and print the first predictions before export and after import

    :param chain_path: path passed to ``chain.save`` and ``create_correct_path``
    """
    x_train, y_train, x_test, _ = get_regression_dataset(
        {'informative': 1, 'bias': 0.0}, 100, 2)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    # The data for prediction carries no target
    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.table)

    # Get chain and fit it
    chain = get_chain()
    chain.fit_from_scratch(train_input)

    output_before_export = chain.predict(predict_input)
    prediction_before_export = np.array(output_before_export.predict)
    print(f'Before export {prediction_before_export[:4]}')

    # Export it
    chain.save(path=chain_path)

    # Import chain
    load_path = create_correct_path(chain_path)
    restored_chain = Chain()
    restored_chain.load(load_path)

    output_after_import = restored_chain.predict(predict_input)
    prediction_after_export = np.array(output_after_import.predict)
    print(f'After import {prediction_after_export[:4]}')
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """
    Fit a stacked scikit-learn style pipeline and a chain on the same data
    and print the ROC AUC of both

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for test
    :return: ROC AUC of the chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_pipeline = make_pipeline(StackingEstimator(estimator=BernoulliNB()),
                                     RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(stacked_pipeline.steps, 'random_state', 1)

    stacked_pipeline.fit(x_train, y_train)
    # Column 1 of predict_proba is used as the score
    pipeline_scores = stacked_pipeline.predict_proba(x_test)[:, 1]

    pipeline_roc_auc = roc_auc(y_true=y_test, y_score=pipeline_scores)
    print(pipeline_roc_auc)

    chain = Chain()
    data_node = PrimaryNode('direct_data_model')
    bernb = PrimaryNode('bernb')
    forest = SecondaryNode('rf')

    # Parents are appended to the list the root node already owns
    for parent in (data_node, bernb):
        forest.nodes_from.append(parent)

    chain.add_node(forest)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    chain_roc_auc = roc_auc(y_true=y_test, y_score=chain_output.predict)
    print(chain_roc_auc)

    return chain_roc_auc
def test_chain_with_datamodel_fit_correct(data_setup):
    """
    Check that an 'rf' root fed by 'bernb' and 'logit' nodes produces labels
    with the same shape as the test target
    """
    train_data, test_data = train_test_data_setup(data_setup)

    chain = Chain()
    logit = PrimaryNode('logit')
    bernb = PrimaryNode('bernb')

    # Parents are assigned after the root node is created
    forest = SecondaryNode('rf')
    forest.nodes_from = [bernb, logit]

    for node in (logit, bernb, forest):
        chain.add_node(node)

    chain.fit(train_data)

    probabilities = chain.predict(test_data).predict
    labels = np.asarray(probs_to_labels(probabilities))

    assert labels.shape == test_data.target.shape
def test_chain_with_custom_params_for_model(data_setup):
    """
    Check that custom parameters set on the 'knn' root change the prediction
    compared with an identical chain that has no custom parameters set
    """
    knn_params = {'n_neighbors': 1, 'weights': 'uniform', 'p': 1}

    logit = PrimaryNode(operation_type='logit')
    lda = PrimaryNode(operation_type='lda')
    knn = SecondaryNode(operation_type='knn', nodes_from=[logit, lda])

    chain = Chain()
    chain.add_node(knn)

    # The reference copy is taken before the custom parameters are set
    reference_chain = deepcopy(chain)
    chain.root_node.custom_params = knn_params

    reference_chain.fit(data_setup)
    chain.fit(data_setup)

    prediction_with_params = chain.predict(data_setup).predict
    reference_prediction = reference_chain.predict(data_setup).predict

    assert not np.array_equal(prediction_with_params, reference_prediction)
def test_regression_chain_with_data_operation_fit_correct():
    """
    Check that a regression chain with a shared 'scaling' node predicts an
    array with the same shape as the test target
    """
    train_data, test_data = train_test_data_setup(get_synthetic_regression_data())

    #            linear
    #           /      \
    #        lasso    ridge
    #          |        |
    #          |   ransac_lin_reg
    #           \      /
    #           scaling
    scaling = PrimaryNode('scaling')
    ransac = SecondaryNode('ransac_lin_reg', nodes_from=[scaling])
    lasso = SecondaryNode('lasso', nodes_from=[scaling])
    ridge = SecondaryNode('ridge', nodes_from=[ransac])
    linear = SecondaryNode('linear', nodes_from=[lasso, ridge])

    chain = Chain(linear)
    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    assert chain_output.predict.shape == test_data.target.shape