Ejemplo n.º 1
0
def get_rmse_value(chain: Chain, train_data: InputData,
                   test_data: InputData) -> tuple:
    """Predict on the train and test samples and score both with ``ts_mse``.

    :param chain: chain used to produce the predictions
    :param train_data: data for the train-sample prediction and its target
    :param test_data: data for the test-sample prediction and its target

    :return: four-element tuple ``(train metric, test metric,
        train prediction, test prediction)``
    """
    train_pred = chain.predict(input_data=train_data)
    test_pred = chain.predict(input_data=test_data)

    # NOTE(review): the names say "rmse" but the value is whatever ts_mse
    # returns - confirm whether a square root is applied there
    rmse_value_test = ts_mse(obs=test_data.target, pred=test_pred.predict)
    rmse_value_train = ts_mse(obs=train_data.target, pred=train_pred.predict)

    return rmse_value_train, rmse_value_test, train_pred, test_pred
Ejemplo n.º 2
0
def get_roc_auc_value(chain: Chain, train_data: InputData,
                      test_data: InputData) -> (float, float):
    """Return ROC AUC of the chain predictions on the train and test samples.

    :param chain: chain used to produce the predictions
    :param train_data: train sample (features and target)
    :param test_data: test sample (features and target)

    :return: pair ``(ROC AUC on train, ROC AUC on test)``
    """

    def _score(sample, output):
        # Compare the sample target with the predicted values
        return roc_auc(y_true=sample.target, y_score=output.predict)

    output_on_train = chain.predict(input_data=train_data)
    output_on_test = chain.predict(input_data=test_data)

    score_on_test = _score(test_data, output_on_test)
    score_on_train = _score(train_data, output_on_train)

    return score_on_train, score_on_test
Ejemplo n.º 3
0
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that results do not depend on the order of a node's parents."""
    train, test = train_test_data_setup(data_setup)

    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_node = PrimaryNode(operation_type='knn')
    root = SecondaryNode(operation_type='xgboost',
                         nodes_from=[logit_node, lda_node, knn_node])

    chain = Chain()
    for node in (logit_node, lda_node, knn_node, root):
        chain.add_node(node)

    # The second chain gets its own copies of the parent nodes
    logit_copy = deepcopy(logit_node)
    lda_copy = deepcopy(lda_node)
    knn_copy = deepcopy(knn_node)

    root_shuffled = SecondaryNode(operation_type='xgboost',
                                  nodes_from=[knn_copy, logit_copy, lda_copy])

    chain_shuffled = Chain()
    # nodes are added in a different order than in the first chain
    for node in (root_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    fit_output = chain.fit(input_data=train)
    fit_output_shuffled = chain_shuffled.fit(input_data=train)

    # fit results must match for both parent orders
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert np.equal(fit_output.predict, fit_output_shuffled.predict).all()

    predict_output = chain.predict(input_data=test)
    predict_output_shuffled = chain_shuffled.predict(input_data=test)

    # predict results must match for both parent orders
    assert np.equal(predict_output.predict,
                    predict_output_shuffled.predict).all()

    # rotate the parents of the already fitted node, then refit
    parents = chain.nodes[3].nodes_from
    chain.nodes[3].nodes_from = [parents[position] for position in (2, 0, 1)]
    chain.nodes[3].unfit()
    chain.fit(train)
    predict_output_rotated = chain.predict(input_data=test)

    # predict results must still match the first prediction
    assert np.equal(predict_output.predict,
                    predict_output_rotated.predict).all()
Ejemplo n.º 4
0
def get_rmse_value(chain: Chain, train_data: InputData,
                   test_data: InputData) -> (float, float):
    """Return the ``mse(..., squared=False)`` score on the train and test samples.

    :param chain: chain used to produce the predictions
    :param train_data: train sample (features and target)
    :param test_data: test sample (features and target)

    :return: pair ``(score on train, score on test)``
    """

    def _score(sample, output):
        # squared=False is forwarded to mse unchanged
        return mse(y_true=sample.target, y_pred=output.predict, squared=False)

    output_on_train = chain.predict(input_data=train_data)
    output_on_test = chain.predict(input_data=test_data)

    score_on_test = _score(test_data, output_on_test)
    score_on_train = _score(train_data, output_on_train)

    return score_on_train, score_on_test
Ejemplo n.º 5
0
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Fit an exported sklearn pipeline and a chain on the same data.

    ROC AUC of both models on the test file is printed.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for test

    :return: ROC AUC of the chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_nb = StackingEstimator(estimator=BernoulliNB())
    exported_pipeline = make_pipeline(stacked_nb, RandomForestClassifier())
    # Same random state for every step of the exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(x_train, y_train)
    # Second column of predict_proba is used as the score
    tpot_scores = exported_pipeline.predict_proba(x_test)[:, 1]

    roc_auc_value = roc_auc(y_true=y_test, y_score=tpot_scores)
    print(f'ROC AUC for TPOT: {roc_auc_value}')

    # Chain: scaling -> bernb, then rf on top of bernb and scaling
    node_scaling = PrimaryNode('scaling')
    node_bernb = SecondaryNode('bernb', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_bernb, node_scaling])
    chain = Chain(node_rf)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=y_test, y_score=chain_output.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
Ejemplo n.º 6
0
def run_chain_from_automl(train_file_path: str,
                          test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Run a chain that contains an AutoML ("tpot") model in one of its nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # total_seconds() covers the whole interval; the .seconds attribute
    # drops the days part, so e.g. timedelta(days=1) would give 0
    node_tpot.operation.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Ejemplo n.º 7
0
def test_save_load_fitted_atomized_chain_correctly():
    """Check that a fitted chain with nested atomized models survives save/load.

    The loaded chain must have the same length and the same saved json, and
    after refitting its MSE on the test data must not exceed the MSE of the
    source chain.
    """
    source_chain = create_chain_with_several_nested_atomized_model()

    train_data, test_data = create_data_for_train()
    source_chain.fit(train_data)

    saved_json = source_chain.save_chain(
        'test_save_load_fitted_atomized_chain_correctly')

    load_path = create_correct_path(
        'test_save_load_fitted_atomized_chain_correctly')

    restored_chain = Chain()
    restored_chain.load_chain(load_path)
    resaved_json = restored_chain.save_chain(
        'test_save_load_fitted_atomized_chain_correctly_loaded')

    assert source_chain.length == restored_chain.length
    assert saved_json == resaved_json

    source_output = source_chain.predict(test_data)

    # The restored chain is fitted again before predicting
    restored_chain.fit(train_data)
    restored_output = restored_chain.predict(test_data)

    source_mse = mean_squared_error(y_true=test_data.target,
                                    y_pred=source_output.predict)
    restored_mse = mean_squared_error(y_true=test_data.target,
                                      y_pred=restored_output.predict)

    assert restored_mse <= source_mse
Ejemplo n.º 8
0
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Run a chain that contains an AutoML ("tpot") model in one of its nodes.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: time limit passed to the "tpot" model

    :return: ROC AUC of the chain on the validation data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # total_seconds() covers the whole interval; the .seconds attribute
    # drops the days part, so e.g. timedelta(days=1) would give 0
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Ejemplo n.º 9
0
def chain_tuning(nodes_to_tune: str, chain: Chain, train_data: InputData,
                 test_data: InputData, local_iter: int,
                 tuner_iter_num: int = 50) -> (float, list):
    """Tune the chain several times and collect ROC AUC on the test data.

    :param nodes_to_tune: 'primary' or 'root', selects the tuning method
    :param chain: chain to tune
    :param train_data: data used for tuning and fitting
    :param test_data: data used for the ROC AUC estimation
    :param local_iter: how many tune-fit-predict rounds to run
    :param tuner_iter_num: value passed to the tuner as ``iterations``

    :return: mean ROC AUC over the rounds and the list of per-round values
    :raises ValueError: if ``nodes_to_tune`` is neither 'primary' nor 'root'
    """
    if nodes_to_tune not in ('primary', 'root'):
        raise ValueError('Invalid type of nodes. Nodes must be primary or root')

    if nodes_to_tune == 'primary':
        print('primary_node_tuning')
        tune = chain.fine_tune_primary_nodes
    else:
        print('root_node_tuning')
        tune = chain.fine_tune_all_nodes

    scores_on_test = []
    for round_number in range(local_iter):
        print(f'current local iteration {round_number}')

        # Tune, then refit and predict with the tuned chain
        tune(train_data, iterations=tuner_iter_num)
        chain.fit(train_data)
        tuned_output = chain.predict(test_data)

        scores_on_test.append(roc_auc(y_true=test_data.target,
                                      y_score=tuned_output.predict))

    return float(np.mean(scores_on_test)), scores_on_test
Ejemplo n.º 10
0
def test_forecast_with_exog():
    """Check the forecast of a 'lagged' + 'exog' -> 'linear' chain.

    The flattened forecast must be equal to the expected test series.
    """
    (train_source_ts, predict_source_ts,
     train_exog_ts, predict_exog_ts,
     ts_test) = synthetic_with_exogenous_ts()

    # Source series is attached to the lagged node
    lagged_data = {'fit': train_source_ts, 'predict': predict_source_ts}
    node_lagged = PrimaryNode('lagged', node_data=lagged_data)
    # Window size for the lagged transformation
    node_lagged.custom_params = {'window_size': window_size}

    # Exogenous series is attached to the exog node
    exog_data = {'fit': train_exog_ts, 'predict': predict_exog_ts}
    node_exog = PrimaryNode('exog', node_data=exog_data)

    chain = Chain(SecondaryNode('linear', nodes_from=[node_lagged, node_exog]))

    # Data is already attached to the nodes, so no arguments are passed
    chain.fit()
    forecast = chain.predict()

    flat_forecast = np.ravel(np.array(forecast.predict))

    assert tuple(flat_forecast) == tuple(ts_test)
Ejemplo n.º 11
0
def apply_model_to_data(model: Chain, data_path: str):
    """Predict labels for the data in ``data_path`` and store them in a dataframe.

    :param model: chain used for the prediction
    :param data_path: path passed to ``create_multi_clf_examples_from_excel``

    :return: dataframe with the predicted labels in the 'forecast' column
    """
    frame, csv_path = create_multi_clf_examples_from_excel(data_path,
                                                           return_df=True)
    # The data is loaded without a target column
    unlabeled_data = InputData.from_csv(csv_path, target_column=None)
    model_output = model.predict(unlabeled_data)
    frame['forecast'] = probs_to_labels(model_output.predict)
    return frame
Ejemplo n.º 12
0
def calculate_validation_metric(chain: Chain,
                                dataset_to_validate: InputData) -> float:
    """Return ROC AUC of the chain prediction on ``dataset_to_validate``.

    :param chain: chain used for the prediction
    :param dataset_to_validate: data with the features and the true target
    """
    # run the chain on the validation data
    chain_output = chain.predict(dataset_to_validate)
    # compare the prediction with the true target
    return roc_auc(y_true=dataset_to_validate.target,
                   y_score=chain_output.predict)
Ejemplo n.º 13
0
def validate_model_quality(model: Chain, data_path: str):
    """Compute the macro one-vs-one ROC AUC of ``model`` on the data in ``data_path``.

    :param model: chain used for the prediction
    :param data_path: path to the csv file passed to ``InputData.from_csv``

    :return: ROC AUC rounded to 3 decimal places
    """
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    # Score against the target of the dataset loaded above; the previous
    # code read `test_data`, a name that is not defined in this function
    roc_auc_valid = round(
        roc_auc(y_true=dataset_to_validate.target,
                y_score=predicted_labels,
                multi_class='ovo',
                average='macro'), 3)
    return roc_auc_valid
Ejemplo n.º 14
0
def test_ts_forecasting_lagged_data_operation():
    """Check the forecast length of a 'lagged' -> 'ridge' chain."""
    train_input, predict_input, y_test = get_time_series()

    lagged = PrimaryNode('lagged')
    chain = Chain(SecondaryNode('ridge', nodes_from=[lagged]))

    chain.fit_from_scratch(train_input)
    forecast = np.ravel(chain.predict(predict_input).predict)

    # Flattened forecast and flattened target must have the same length
    assert len(forecast) == len(np.ravel(y_test))
Ejemplo n.º 15
0
def execute_chain_for_text_problem(train_data, test_data):
    """Fit a 'text_clean' -> 'tfidf' -> 'multinb' chain and score it.

    :param train_data: data used to fit the chain
    :param test_data: data used for the prediction and the ROC AUC estimation

    :return: ROC AUC of the chain prediction on ``test_data``
    """
    cleaning = PrimaryNode('text_clean')
    vectorizing = SecondaryNode('tfidf', nodes_from=[cleaning])
    classifier = SecondaryNode('multinb', nodes_from=[vectorizing])

    chain = Chain(classifier)
    chain.fit(train_data)

    chain_output = chain.predict(test_data)

    return roc_auc(y_true=test_data.target, y_score=chain_output.predict)
Ejemplo n.º 16
0
def test_log_clustering_fit_correct(data_fixture, request):
    """Check that 'kmeans' fitted on the output of a 'normalization' chain
    produces exactly the labels 0 and 1.

    :param data_fixture: name of the fixture that provides the data
    :param request: pytest request object used to resolve the fixture
    """
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling chain. Fit predict it
    scaling_chain = Chain(PrimaryNode('normalization'))
    scaling_chain.fit(train_data)
    scaled_data = scaling_chain.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    # array_equal also compares shapes, so a different number of labels
    # fails the assertion instead of raising inside all()
    assert np.array_equal(np.unique(train_predicted.predict), [0, 1])
Ejemplo n.º 17
0
def test_ts_forecasting_smoothing_data_operation():
    """Check the forecast length for chains that start with a smoothing node.

    For each smoothing operation the chain is
    smoothing -> 'lagged' -> 'ridge'.
    """
    train_input, predict_input, y_test = get_time_series()

    for operation_name in ('smoothing', 'gaussian_filter'):
        smoothing = PrimaryNode(operation_name)
        lagged = SecondaryNode('lagged', nodes_from=[smoothing])
        chain = Chain(SecondaryNode('ridge', nodes_from=[lagged]))

        chain.fit_from_scratch(train_input)
        forecast = np.ravel(chain.predict(predict_input).predict)

        # Flattened forecast and flattened target must have the same length
        assert len(forecast) == len(np.ravel(y_test))
Ejemplo n.º 18
0
def execute_chain_for_text_problem(train_data, test_data):
    """Fit a 'tfidf' -> 'multinb' chain and score it.

    :param train_data: data used to fit the chain
    :param test_data: data used for the prediction and the ROC AUC estimation

    :return: ROC AUC of the chain prediction on ``test_data``
    """
    vectorizer = PrimaryNode(
        'tfidf', manual_preprocessing_func=TextPreprocessingStrategy)
    classifier = SecondaryNode('multinb',
                               nodes_from=[vectorizer],
                               manual_preprocessing_func=EmptyStrategy)

    # Both nodes are passed explicitly, the model node first
    chain = Chain(nodes=[classifier, vectorizer])
    chain.fit(train_data)

    chain_output = chain.predict(test_data)

    return roc_auc(y_true=test_data.target, y_score=chain_output.predict)
Ejemplo n.º 19
0
def test_regression_chain_with_datamodel_fit_correct():
    """Check the prediction shape of a chain with a 'direct_data_model' node.

    The chain is 'lasso' on top of 'ridge' and 'direct_data_model'.
    """
    train_data, test_data = train_test_data_setup(
        get_synthetic_regression_data())

    direct_data = PrimaryNode('direct_data_model')
    ridge = PrimaryNode('ridge')
    lasso = SecondaryNode('lasso')
    # Parents are assigned after the node is created
    lasso.nodes_from = [ridge, direct_data]

    chain = Chain(lasso)
    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    assert chain_output.predict.shape == test_data.target.shape
Ejemplo n.º 20
0
    def get_value(cls, chain: Chain, reference_data: InputData) -> float:
        """Evaluate ``cls.metric`` for the chain prediction on ``reference_data``.

        Any exception raised during the prediction or the metric calculation
        is printed and ``cls.default_value`` is returned instead.

        :param chain: chain used for the prediction
        :param reference_data: data for the prediction and the metric
        """
        fallback = cls.default_value
        try:
            results = chain.predict(reference_data,
                                    output_mode=cls.output_mode)

            if reference_data.task.task_type == TaskTypesEnum.ts_forecasting:
                # The forecast is flattened into a one-dimensional array
                results.predict = np.ravel(np.array(results.predict))

            return cls.metric(reference_data, results)
        except Exception as ex:
            print(f'Metric evaluation error: {ex}')
        return fallback
Ejemplo n.º 21
0
def test_log_regression_fit_correct(classification_dataset):
    """Check that 'logit' reaches ROC AUC of at least 0.95 on the train data.

    The model is fitted on the output of a 'normalization' chain.
    """
    train_data, _ = train_test_data_setup(data=classification_dataset)

    # Fit the scaling chain and transform the train data with it
    scaler = Chain(PrimaryNode('normalization'))
    scaler.fit(train_data)
    scaled_train = scaler.predict(train_data)

    model = Model(operation_type='logit')
    _, fit_output = model.fit(data=scaled_train)

    train_roc = get_roc_auc(valid_data=train_data,
                            predicted_data=fit_output)
    assert train_roc >= 0.95
Ejemplo n.º 22
0
def test_random_forest_fit_correct(data_fixture, request):
    """Check that 'rf' reaches ROC AUC of at least 0.95 on the train data.

    The model is fitted on the output of a 'normalization' chain.

    :param data_fixture: name of the fixture that provides the data
    :param request: pytest request object used to resolve the fixture
    """
    dataset = request.getfixturevalue(data_fixture)
    train_data, _ = train_test_data_setup(data=dataset)

    # Fit the scaling chain and transform the train data with it
    scaler = Chain(PrimaryNode('normalization'))
    scaler.fit(train_data)
    scaled_train = scaler.predict(train_data)

    model = Model(operation_type='rf')
    _, fit_output = model.fit(data=scaled_train)

    train_roc = get_roc_auc(valid_data=train_data,
                            predicted_data=fit_output)
    assert train_roc >= 0.95
Ejemplo n.º 23
0
def test_pca_model_removes_redunant_features_correct():
    """Check that 'pca' returns fewer columns than the source data has.

    The operation is fitted on the output of a 'normalization' chain.
    """
    dataset = classification_dataset_with_redunant_features(n_samples=1000,
                                                            n_features=100,
                                                            n_informative=5)
    train_data, _ = train_test_data_setup(data=dataset)

    # Fit the scaling chain and transform the train data with it
    scaler = Chain(PrimaryNode('normalization'))
    scaler.fit(train_data)
    scaled_train = scaler.predict(train_data)

    pca = DataOperation(operation_type='pca')
    _, fit_output = pca.fit(data=scaled_train)

    # Column count after the transformation against the source column count
    assert fit_output.predict.shape[1] < dataset.features.shape[1]
Ejemplo n.º 24
0
def test_classification_data_operations():
    """Check the prediction length for each data operation followed by 'logit'."""
    train_input, predict_input, y_test = get_small_classification_dataset()

    operation_names = ('kernel_pca', 'pca', 'scaling', 'normalization',
                       'poly_features', 'rfe_lin_class', 'rfe_non_lin_class')

    for operation_name in operation_names:
        preprocessing = PrimaryNode(operation_name)
        chain = Chain(SecondaryNode('logit', nodes_from=[preprocessing]))

        # Fit the chain from scratch and predict with it
        chain.fit_from_scratch(train_input)
        chain_output = chain.predict(predict_input)

        assert len(chain_output.predict) == len(y_test)
Ejemplo n.º 25
0
    def get_value(cls, chain: Chain, reference_data: InputData) -> float:
        """Evaluate ``cls.metric`` for the chain prediction on ``reference_data``.

        Any exception raised during the prediction or the metric calculation
        is printed and ``cls.default_value`` is returned instead.

        :param chain: chain used for the prediction
        :param reference_data: data for the prediction and the metric

        :raises ValueError: if ``cls.default_value`` is falsy
        """
        fallback = cls.default_value
        if not fallback:
            raise ValueError('Default value for metric not found')
        try:
            results = chain.predict(reference_data)

            if reference_data.task.task_type != TaskTypesEnum.ts_forecasting:
                return cls.metric(reference_data, results)

            # For forecasting, positions where the prediction is NaN are
            # dropped from both the target copy and the prediction
            trimmed_reference = copy(reference_data)
            not_nan = ~np.isnan(results.predict)
            trimmed_reference.target = trimmed_reference.target[not_nan]
            results.predict = results.predict[not_nan]
            return cls.metric(trimmed_reference, results)
        except Exception as ex:
            print(f'Metric evaluation error: {ex}')
        return fallback
Ejemplo n.º 26
0
def run_import_export_example(chain_path):
    """Fit a chain, save it, load it back and print predictions of both.

    :param chain_path: path used to save the chain and to build the load path
    """
    x_train, y_train, x_test, _ = get_regression_dataset(
        {'informative': 1, 'bias': 0.0}, 100, 2)

    # Regression task shared by the train and predict data
    task = Task(TaskTypesEnum.regression)

    def _table_input(features, target):
        # Rows are indexed from 0 to len(features) - 1
        return InputData(idx=np.arange(len(features)),
                         features=features,
                         target=target,
                         task=task,
                         data_type=DataTypesEnum.table)

    train_input = _table_input(x_train, y_train)
    # No target is given for the prediction
    predict_input = _table_input(x_test, None)

    # Fit the source chain and predict with it
    chain = get_chain()
    chain.fit_from_scratch(train_input)

    prediction_before_export = np.array(chain.predict(predict_input).predict)
    print(f'Before export {prediction_before_export[:4]}')

    # Export
    chain.save(path=chain_path)

    # Import into a new chain
    load_path = create_correct_path(chain_path)
    new_chain = Chain()
    new_chain.load(load_path)

    prediction_after_export = np.array(
        new_chain.predict(predict_input).predict)

    print(f'After import {prediction_after_export[:4]}')
Ejemplo n.º 27
0
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Fit an exported sklearn pipeline and a chain on the same data.

    ROC AUC of both models on the test file is printed.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for test

    :return: ROC AUC of the chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_nb = StackingEstimator(estimator=BernoulliNB())
    exported_pipeline = make_pipeline(stacked_nb, RandomForestClassifier())
    # Same random state for every step of the exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(x_train, y_train)
    # Second column of predict_proba is used as the score
    pipeline_scores = exported_pipeline.predict_proba(x_test)[:, 1]

    roc_auc_value = roc_auc(y_true=y_test, y_score=pipeline_scores)
    print(roc_auc_value)

    chain = Chain()
    node_first = PrimaryNode('direct_data_model')
    node_second = PrimaryNode('bernb')
    node_third = SecondaryNode('rf')

    # Parents are appended one by one to the 'rf' node
    for parent in (node_first, node_second):
        node_third.nodes_from.append(parent)

    chain.add_node(node_third)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=y_test, y_score=chain_output.predict)
    print(roc_auc_value)

    return roc_auc_value
Ejemplo n.º 28
0
def test_chain_with_datamodel_fit_correct(data_setup):
    """Check the shape of the labels predicted by an 'rf' chain.

    The chain is 'rf' on top of 'bernb' and 'logit'.
    """
    train_data, test_data = train_test_data_setup(data_setup)

    chain = Chain()

    logit = PrimaryNode('logit')
    bernb = PrimaryNode('bernb')
    forest = SecondaryNode('rf')
    # Parents are assigned after the node is created
    forest.nodes_from = [bernb, logit]

    for node in (logit, bernb, forest):
        chain.add_node(node)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)
    labels = np.asarray(probs_to_labels(chain_output.predict))

    assert labels.shape == test_data.target.shape
Ejemplo n.º 29
0
def test_chain_with_custom_params_for_model(data_setup):
    """Check that custom parameters of the root 'knn' node change the prediction."""
    knn_params = dict(n_neighbors=1, weights='uniform', p=1)

    logit = PrimaryNode(operation_type='logit')
    lda = PrimaryNode(operation_type='lda')
    knn = SecondaryNode(operation_type='knn', nodes_from=[logit, lda])

    tuned_chain = Chain()
    tuned_chain.add_node(knn)
    # The copy is taken before the custom parameters are set
    default_chain = deepcopy(tuned_chain)

    tuned_chain.root_node.custom_params = knn_params

    default_chain.fit(data_setup)
    tuned_chain.fit(data_setup)

    tuned_prediction = tuned_chain.predict(data_setup).predict
    default_prediction = default_chain.predict(data_setup).predict

    assert not np.array_equal(tuned_prediction, default_prediction)
Ejemplo n.º 30
0
def test_regression_chain_with_data_operation_fit_correct():
    """Check the prediction shape of a two-branch regression chain.

    Both branches start from the same 'scaling' node.
    """
    train_data, test_data = train_test_data_setup(
        get_synthetic_regression_data())

    #           linear
    #       /           \
    #     ridge          |
    #       |            |
    # ransac_lin_reg   lasso
    #        \         /
    #          scaling
    scaling = PrimaryNode('scaling')
    ransac = SecondaryNode('ransac_lin_reg', nodes_from=[scaling])
    lasso = SecondaryNode('lasso', nodes_from=[scaling])
    ridge = SecondaryNode('ridge', nodes_from=[ransac])
    root = SecondaryNode('linear', nodes_from=[lasso, ridge])

    chain = Chain(root)
    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    assert chain_output.predict.shape == test_data.target.shape