Example #1
0
def chain_tuning(nodes_to_tune: str, chain: Chain, train_data: InputData,
                 test_data: InputData, local_iter: int,
                 tuner_iter_num: int = 50) -> (float, list):
    """Repeatedly tune, refit and score a chain, collecting ROC AUC on the test data.

    :param nodes_to_tune: 'primary' selects ``chain.fine_tune_primary_nodes``,
        'root' selects ``chain.fine_tune_all_nodes``
    :param chain: chain that is tuned, fitted and used for prediction
    :param train_data: data passed to the tuning method and to ``chain.fit``
    :param test_data: data passed to ``chain.predict`` and used as the scoring target
    :param local_iter: number of tune/fit/predict rounds
    :param tuner_iter_num: value forwarded as ``iterations`` to the tuning method

    :return: mean ROC AUC over all rounds and the list of per-round ROC AUC values
    :raises ValueError: if ``nodes_to_tune`` is neither 'primary' nor 'root'
    """
    # Reject unknown modes first, then pick the tuning method
    if nodes_to_tune not in ('primary', 'root'):
        raise ValueError('Invalid type of nodes. Nodes must be primary or root')

    if nodes_to_tune == 'primary':
        print('primary_node_tuning')
        tune = chain.fine_tune_primary_nodes
    else:
        print('root_node_tuning')
        tune = chain.fine_tune_all_nodes

    test_scores = []
    for round_idx in range(local_iter):
        print(f'current local iteration {round_idx}')

        tune(train_data, iterations=tuner_iter_num)

        # Refit after tuning and score the prediction on the test data
        chain.fit(train_data)
        tuned_output = chain.predict(test_data)
        test_scores.append(roc_auc(y_true=test_data.target,
                                   y_score=tuned_output.predict))

    mean_score = float(np.mean(test_scores))
    return mean_score, test_scores
Example #2
0
def test_save_load_fitted_atomized_chain_correctly():
    """Save a fitted chain with nested atomized models, load it back and compare.

    Checks that the loaded chain has the same length and serializes to the
    same JSON, and that after refitting its test MSE is not larger than the
    MSE of the original chain.
    """
    saved_name = 'test_save_load_fitted_atomized_chain_correctly'
    resaved_name = 'test_save_load_fitted_atomized_chain_correctly_loaded'

    source_chain = create_chain_with_several_nested_atomized_model()
    train_data, test_data = create_data_for_train()
    source_chain.fit(train_data)

    # Serialize the fitted chain, then restore it into an empty chain
    source_json = source_chain.save_chain(saved_name)
    saved_path = create_correct_path(saved_name)

    restored_chain = Chain()
    restored_chain.load_chain(saved_path)
    restored_json = restored_chain.save_chain(resaved_name)

    assert source_chain.length == restored_chain.length
    assert source_json == restored_json

    source_output = source_chain.predict(test_data)

    restored_chain.fit(train_data)
    restored_output = restored_chain.predict(test_data)

    source_mse = mean_squared_error(y_true=test_data.target,
                                    y_pred=source_output.predict)
    restored_mse = mean_squared_error(y_true=test_data.target,
                                      y_pred=restored_output.predict)

    assert restored_mse <= source_mse
Example #3
0
def test_forecast_with_exog():
    """Forecast of a lagged + exogenous chain must reproduce the synthetic test series."""
    (train_source_ts, predict_source_ts,
     train_exog_ts, predict_exog_ts, ts_test) = synthetic_with_exogenous_ts()

    # Each primary node receives its own fit/predict data
    lagged_inputs = {'fit': train_source_ts, 'predict': predict_source_ts}
    exog_inputs = {'fit': train_exog_ts, 'predict': predict_exog_ts}

    lagged = PrimaryNode('lagged', node_data=lagged_inputs)
    # Window size for the lagged transformation
    lagged.custom_params = {'window_size': window_size}
    exog = PrimaryNode('exog', node_data=exog_inputs)

    linear_root = SecondaryNode('linear', nodes_from=[lagged, exog])
    chain = Chain(linear_root)
    chain.fit()

    forecast_output = chain.predict()
    flat_forecast = np.ravel(np.array(forecast_output.predict))

    assert tuple(flat_forecast) == tuple(ts_test)
Example #4
0
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Fit a chain that contains a 'tpot' node and return its ROC AUC on the test data.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: time limit handed to the 'tpot' model, in whole seconds

    :return: ROC AUC computed on the test data (also printed)
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # ``timedelta.seconds`` is only the seconds component (0..86399) and
    # silently drops whole days, so take the full duration instead
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #5
0
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Score an exported TPOT pipeline and a FEDOT chain on the same csv data.

    Both ROC AUC values are printed; only the FEDOT one is returned.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for test
    :return: ROC AUC of the FEDOT chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_nb = StackingEstimator(estimator=BernoulliNB())
    tpot_pipeline = make_pipeline(stacked_nb, RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(tpot_pipeline.steps, 'random_state', 1)

    tpot_pipeline.fit(x_train, y_train)
    tpot_scores = tpot_pipeline.predict_proba(x_test)[:, 1]
    tpot_roc_auc = roc_auc(y_true=y_test, y_score=tpot_scores)
    print(f'ROC AUC for TPOT: {tpot_roc_auc}')

    # FEDOT chain: scaling feeds both 'bernb' and the final 'rf'
    scaling = PrimaryNode('scaling')
    bernb = SecondaryNode('bernb', nodes_from=[scaling])
    forest = SecondaryNode('rf', nodes_from=[bernb, scaling])
    fedot_chain = Chain(forest)

    fedot_chain.fit(train_data)
    fedot_output = fedot_chain.predict(test_data)

    fedot_roc_auc = roc_auc(y_true=y_test, y_score=fedot_output.predict)
    print(f'ROC AUC for FEDOT: {fedot_roc_auc}')

    return fedot_roc_auc
Example #6
0
def run_chain_from_automl(train_file_path: str,
                          test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run chain with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for chain
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # ``timedelta.seconds`` is only the seconds component (0..86399) and
    # silently drops whole days, so take the full duration instead
    node_tpot.operation.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #7
0
    def fine_tune_certain_node(self, model_id, input_data: InputData, iterations: int = 30,
                               max_lead_time: timedelta = timedelta(minutes=5)):
        """
        Tune the sub-chain rooted at the node with the given model id
        and return a new chain built from the updated template

        :param int model_id: number of the certain model in the chain.
        Look for it in exported json file of your model.
        :param input_data: data used for fitting the sub-chain and for tuning
        :param iterations: max number of iterations
        :param max_lead_time: max time available for tuning process
        :return: chain object converted from the updated chain template
        """

        # Build and fit a standalone chain from the subtree of the chosen node
        node_chain = Chain()
        subtree_root = extract_subtree_root(root_model_id=model_id,
                                            chain_template=self.chain_template)
        node_chain.add_node(subtree_root)
        node_chain.fit(input_data=input_data, use_cache=False)

        tuner = Tune(node_chain)
        tuned_chain = tuner.fine_tune_root_node(input_data=input_data,
                                                iterations=iterations,
                                                max_lead_time=max_lead_time)

        # Write the tuned root back into the template of this chain
        self._update_template(model_id=model_id,
                              updated_node=tuned_chain.root_node)

        rebuilt_chain = Chain()
        self.chain_template.convert_to_chain(chain=rebuilt_chain)
        return rebuilt_chain
Example #8
0
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Fit and predict results must not depend on the order of a secondary node's parents."""
    train, test = train_test_data_setup(data_setup)

    logit, lda, knn = (PrimaryNode(operation_type=name)
                       for name in ('logit', 'lda', 'knn'))
    root = SecondaryNode(operation_type='xgboost',
                         nodes_from=[logit, lda, knn])

    chain = Chain()
    for node in (logit, lda, knn, root):
        chain.add_node(node)

    # Independent copies of the primary nodes for the shuffled chain
    logit_copy, lda_copy, knn_copy = (deepcopy(node)
                                      for node in (logit, lda, knn))
    root_shuffled = SecondaryNode(operation_type='xgboost',
                                  nodes_from=[knn_copy, logit_copy, lda_copy])

    chain_shuffled = Chain()
    # change order of nodes in list
    for node in (root_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    fit_output = chain.fit(input_data=train)
    fit_output_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert np.equal(fit_output.predict, fit_output_shuffled.predict).all()

    predict_output = chain.predict(input_data=test)
    predict_output_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(predict_output.predict,
                    predict_output_shuffled.predict).all()

    # change parents order for the nodes fitted chain
    parents = chain.nodes[3].nodes_from
    chain.nodes[3].nodes_from = [parents[2], parents[0], parents[1]]
    chain.nodes[3].unfit()
    chain.fit(train)
    predict_output_reordered = chain.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(predict_output.predict,
                    predict_output_reordered.predict).all()
Example #9
0
 def metric_for_nodes(self, metric_function, train_data: InputData,
                      test_data: InputData, is_chain_shared: bool,
                      chain: Chain) -> float:
     """Validate and fit the chain, then return its metric value on the test data.

     Any exception raised on the way is logged and ``max_int_value`` is returned instead.

     :param metric_function: callable invoked as ``metric_function(chain, test_data)``
     :param train_data: data used to fit the chain
     :param test_data: data passed to the metric function
     :param is_chain_shared: if True, the chain is wrapped in ``SharedChain`` with ``self.shared_cache``
     :param chain: chain to assess
     :return: metric value, or ``max_int_value`` if the assessment failed
     """
     try:
         validate(chain)
         candidate = (SharedChain(base_chain=chain, shared_cache=self.shared_cache)
                      if is_chain_shared else chain)
         candidate.fit(input_data=train_data)
         score = metric_function(candidate, test_data)
     except Exception as ex:
         self.log.info(f'Error in chain assessment during composition: {ex}. Continue.')
         score = max_int_value
     return score
Example #10
0
def test_arima_chain_fit_correct():
    """Train RMSE of an 'arima' chain must stay below the threshold derived from the test target."""
    train_data, test_data = get_synthetic_ts_data_linear(forecast_length=12)

    arima_chain = Chain(PrimaryNode('arima'))
    arima_chain.fit(input_data=train_data)

    # Only the train error is checked here
    train_rmse, _, _, _ = get_rmse_value(arima_chain, train_data, test_data)

    assert train_rmse < _max_rmse_threshold_by_std(test_data.target)
Example #11
0
def test_regression_chain_period_exog_forecast_multistep_correct():
    """Train and test RMSE of a 'linear' chain on periodic synthetic data stay below 1.5."""
    max_allowed_rmse = 1.5

    train_data, test_data = get_synthetic_ts_data_period(forecast_length=2,
                                                         max_window_size=3)

    linear_chain = Chain(PrimaryNode('linear'))
    linear_chain.fit(input_data=train_data)

    train_rmse, test_rmse, _, _ = get_rmse_value(linear_chain, train_data,
                                                 test_data)

    assert train_rmse < max_allowed_rmse
    assert test_rmse < max_allowed_rmse
Example #12
0
def test_log_clustering_fit_correct(data_fixture, request):
    """'kmeans' fitted on normalized train data must produce exactly the labels 0 and 1.

    :param data_fixture: name of the fixture that provides the dataset
    :param request: pytest request object used to resolve the fixture by name
    """
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling chain. Fit predict it
    scaling_chain = Chain(PrimaryNode('normalization'))
    scaling_chain.fit(train_data)
    scaled_data = scaling_chain.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    # array_equal compares shape and values, so an unexpected number of
    # labels fails the assertion instead of raising from ``==`` / ``all``
    assert np.array_equal(np.unique(train_predicted.predict), [0, 1])
Example #13
0
def test_regression_chain_forecast_multistep_correct():
    """Test RMSE of a 'ridge' chain on periodic data stays below the non-strict threshold."""
    train_data, test_data = get_synthetic_ts_data_period(forecast_length=20,
                                                         max_window_size=30)

    ridge_chain = Chain(PrimaryNode('ridge'))
    ridge_chain.fit(input_data=train_data)

    # Only the test error is checked here
    _, test_rmse, _, _ = get_rmse_value(ridge_chain, train_data, test_data)
    max_allowed_rmse = _max_rmse_threshold_by_std(test_data.target,
                                                  is_strict=False)

    assert test_rmse < max_allowed_rmse
Example #14
0
def test_import_custom_json_object_to_chain_and_fit_correctly_no_exception():
    """A chain loaded from the custom JSON template can be fitted and saved without errors."""
    tests_dir = str(os.path.dirname(__file__))
    template_path = os.path.join(tests_dir,
                                 '../../data/test_custom_json_template.json')

    # Only the train part of the scoring case is needed
    train_file_path, _ = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    loaded_chain = Chain()
    loaded_chain.load_chain(template_path)
    loaded_chain.fit(train_data)

    loaded_chain.save_chain('test_import_custom_json_object_to_chain_and_fit_correctly_no_exception')
Example #15
0
def execute_chain_for_text_problem(train_data, test_data):
    """Fit a text_clean -> tfidf -> multinb chain and return its ROC AUC on the test data.

    :param train_data: data used to fit the chain
    :param test_data: data used for prediction and as the scoring target
    :return: ROC AUC of the prediction on the test data
    """
    cleaner = PrimaryNode('text_clean')
    vectorizer = SecondaryNode('tfidf', nodes_from=[cleaner])
    classifier = SecondaryNode('multinb', nodes_from=[vectorizer])

    text_chain = Chain(classifier)
    text_chain.fit(train_data)
    output = text_chain.predict(test_data)

    return roc_auc(y_true=test_data.target, y_score=output.predict)
Example #16
0
def test_chain_with_wrong_data():
    """Fitting on a 10-point series with forecast_length=10 must raise ValueError."""
    series = np.arange(10)
    forecasting_task = Task(TaskTypesEnum.ts_forecasting,
                            TsForecastingParams(forecast_length=10))
    wrong_data = InputData(idx=series,
                           features=series,
                           target=series,
                           data_type=DataTypesEnum.ts,
                           task=forecasting_task)

    linear_chain = Chain(PrimaryNode('linear'))

    with pytest.raises(ValueError):
        linear_chain.fit(wrong_data)
Example #17
0
def execute_chain_for_text_problem(train_data, test_data):
    """Fit a tfidf -> multinb chain and return its ROC AUC on the test data.

    :param train_data: data used to fit the chain
    :param test_data: data used for prediction and as the scoring target
    :return: ROC AUC of the prediction on the test data
    """
    vectorizer = PrimaryNode('tfidf',
                             manual_preprocessing_func=TextPreprocessingStrategy)
    classifier = SecondaryNode('multinb',
                               nodes_from=[vectorizer],
                               manual_preprocessing_func=EmptyStrategy)

    text_chain = Chain(nodes=[classifier, vectorizer])
    text_chain.fit(train_data)
    output = text_chain.predict(test_data)

    return roc_auc(y_true=test_data.target, y_score=output.predict)
Example #18
0
def test_regression_chain_forecast_onestep_correct():
    """Train and test RMSE of a 'ridge' chain on linear data stay below the strict threshold."""
    train_data, test_data = get_synthetic_ts_data_linear(forecast_length=1,
                                                         max_window_size=10)

    ridge_chain = Chain(PrimaryNode('ridge'))
    ridge_chain.fit(input_data=train_data)

    train_rmse, test_rmse, _, _ = get_rmse_value(ridge_chain, train_data,
                                                 test_data)
    max_allowed_rmse = _max_rmse_threshold_by_std(test_data.target,
                                                  is_strict=True)

    assert train_rmse < max_allowed_rmse
    assert test_rmse < max_allowed_rmse
Example #19
0
def test_regression_chain_with_datamodel_fit_correct():
    """Prediction of a lasso node fed by 'ridge' and 'direct_data_model' has the target's shape."""
    train_data, test_data = train_test_data_setup(get_synthetic_regression_data())

    passthrough = PrimaryNode('direct_data_model')
    ridge = PrimaryNode('ridge')
    lasso = SecondaryNode('lasso')
    lasso.nodes_from = [ridge, passthrough]

    regression_chain = Chain(lasso)
    regression_chain.fit(train_data)
    output = regression_chain.predict(test_data)

    assert output.predict.shape == test_data.target.shape
Example #20
0
def test_pca_model_removes_redunant_features_correct():
    """'pca' fitted on normalized train data must output fewer columns than the source features."""
    data = classification_dataset_with_redunant_features(n_samples=1000,
                                                         n_features=100,
                                                         n_informative=5)
    train_data, test_data = train_test_data_setup(data=data)

    # Normalize the train data with a single-node chain before PCA
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_data)
    normalized_train = normalizer.predict(train_data)

    pca_operation = DataOperation(operation_type='pca')
    _, pca_output = pca_operation.fit(data=normalized_train)

    reduced_width = pca_output.predict.shape[1]
    source_width = data.features.shape[1]
    assert reduced_width < source_width
Example #21
0
def test_fitted_chain_cache_correctness_after_export_and_import():
    """A fitted 'logit' chain that was saved and loaded back can be fitted again."""
    chain_name = 'test_fitted_chain_cache_correctness_after_export_and_import'

    # Only the train part of the scoring case is needed
    train_file_path, _ = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    fitted_chain = Chain(PrimaryNode('logit'))
    fitted_chain.fit(train_data)
    fitted_chain.save_chain(chain_name)

    imported_chain = Chain()
    imported_chain.load_chain(create_correct_path(chain_name))

    refit_output = imported_chain.fit(train_data)

    assert refit_output is not None
Example #22
0
def test_log_regression_fit_correct(classification_dataset):
    """'logit' fitted on normalized train data reaches ROC AUC of at least 0.95 on train."""
    train_data, test_data = train_test_data_setup(data=classification_dataset)

    # Normalize the train data with a single-node chain before the model
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_data)
    normalized_train = normalizer.predict(train_data)

    logit_model = Model(operation_type='logit')
    _, fit_output = logit_model.fit(data=normalized_train)

    train_roc_auc = get_roc_auc(valid_data=train_data,
                                predicted_data=fit_output)
    assert train_roc_auc >= 0.95
Example #23
0
def test_random_forest_fit_correct(data_fixture, request):
    """'rf' fitted on normalized train data reaches ROC AUC of at least 0.95 on train.

    :param data_fixture: name of the fixture that provides the dataset
    :param request: pytest request object used to resolve the fixture by name
    """
    dataset = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=dataset)

    # Normalize the train data with a single-node chain before the model
    normalizer = Chain(PrimaryNode('normalization'))
    normalizer.fit(train_data)
    normalized_train = normalizer.predict(train_data)

    forest_model = Model(operation_type='rf')
    _, fit_output = forest_model.fit(data=normalized_train)

    train_roc_auc = get_roc_auc(valid_data=train_data,
                                predicted_data=fit_output)
    assert train_roc_auc >= 0.95
Example #24
0
def test_chain_hierarchy_fit_correct(data_setup):
    """A diamond-shaped chain of 'logit' nodes fits and reports the expected structure."""
    expected_id = ('((/n_logit_default_params;)/'
                   'n_logit_default_params;;(/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')

    train, _ = train_test_data_setup(data_setup)

    # One primary node feeds two secondary nodes, which are merged by the root
    base = PrimaryNode(operation_type='logit')
    left = SecondaryNode(operation_type='logit', nodes_from=[base])
    right = SecondaryNode(operation_type='logit', nodes_from=[base])
    root = SecondaryNode(operation_type='logit', nodes_from=[left, right])

    chain = Chain()
    for node in (base, left, right, root):
        chain.add_node(node)

    chain.unfit()
    fit_output = chain.fit(input_data=train)

    assert chain.root_node.descriptive_id == expected_id
    assert chain.length == 4
    assert chain.depth == 3
    assert fit_output.predict.shape[0] == train.target.shape[0]
    assert root.fitted_operation is not None
Example #25
0
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Score an exported TPOT pipeline and a FEDOT chain on the same csv data.

    Both ROC AUC values are printed; only the FEDOT one is returned.

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for test
    :return: ROC AUC of the FEDOT chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_nb = StackingEstimator(estimator=BernoulliNB())
    tpot_pipeline = make_pipeline(stacked_nb, RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(tpot_pipeline.steps, 'random_state', 1)

    tpot_pipeline.fit(x_train, y_train)
    tpot_scores = tpot_pipeline.predict_proba(x_test)[:, 1]
    tpot_roc_auc = roc_auc(y_true=y_test, y_score=tpot_scores)
    print(tpot_roc_auc)

    fedot_chain = Chain()
    passthrough = PrimaryNode('direct_data_model')
    bernb = PrimaryNode('bernb')
    forest = SecondaryNode('rf')

    # Attach both primary nodes as parents of the final 'rf' node
    for parent in (passthrough, bernb):
        forest.nodes_from.append(parent)

    fedot_chain.add_node(forest)

    fedot_chain.fit(train_data)
    fedot_output = fedot_chain.predict(test_data)

    fedot_roc_auc = roc_auc(y_true=y_test, y_score=fedot_output.predict)
    print(fedot_roc_auc)

    return fedot_roc_auc
Example #26
0
def test_chain_with_datamodel_fit_correct(data_setup):
    """Labels predicted by an 'rf' node fed by 'bernb' and 'logit' have the target's shape."""
    train_data, test_data = train_test_data_setup(data_setup)

    chain = Chain()

    logit = PrimaryNode('logit')
    bernb = PrimaryNode('bernb')
    forest = SecondaryNode('rf')
    forest.nodes_from = [bernb, logit]

    for node in (logit, bernb, forest):
        chain.add_node(node)

    chain.fit(train_data)
    probabilities = chain.predict(test_data).predict
    labels = np.asarray(probs_to_labels(probabilities))

    assert labels.shape == test_data.target.shape
Example #27
0
    def composer_metric(self, metrics, train_data: InputData,
                        test_data: InputData,
                        chain: Chain) -> Optional[Tuple[Any, ...]]:
        """Validate and fit the chain, then evaluate every requested metric on the test data.

        :param metrics: a single metric or a list of metrics; each one is either
            a callable or an id resolved through ``MetricsRepository().metric_by_id``
        :param train_data: data used to fit the chain if it is not fitted yet
        :param test_data: data passed to each metric as ``reference_data``
        :param chain: chain to assess
        :return: tuple with one value per metric, or None if any step raised an exception
        """
        try:
            validate(chain)
            chain.log = self.log

            if not isinstance(metrics, list):
                metrics = [metrics]

            cache_enabled = self.cache is not None

            if cache_enabled:
                # TODO improve cache
                chain.fit_from_cache(self.cache)

            if not chain.is_fitted:
                self.log.debug(
                    f'Chain {chain.root_node.descriptive_id} fit started')
                chain.fit(input_data=train_data,
                          time_constraint=self.composer_requirements.
                          max_chain_fit_time)
                # Without this guard a missing cache raised AttributeError here,
                # which the broad handler below turned into a failed assessment
                if cache_enabled:
                    self.cache.save_chain(chain)

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics = evaluated_metrics + (metric_func(
                    chain, reference_data=test_data), )

            self.log.debug(
                f'Chain {chain.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

        except Exception as ex:
            # Deliberate best-effort: a failing chain is reported and skipped
            self.log.info(f'Chain assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics
Example #28
0
def test_regression_chain_with_data_operation_fit_correct():
    """Prediction of a branched regression chain that starts with 'scaling' has the target's shape."""
    train_data, test_data = train_test_data_setup(get_synthetic_regression_data())

    #           linear
    #       /           \
    #     ridge          |
    #       |            |
    # ransac_lin_reg   lasso
    #        \         /
    #          scaling
    scaling = PrimaryNode('scaling')
    ransac = SecondaryNode('ransac_lin_reg', nodes_from=[scaling])
    lasso = SecondaryNode('lasso', nodes_from=[scaling])
    ridge = SecondaryNode('ridge', nodes_from=[ransac])
    linear_root = SecondaryNode('linear', nodes_from=[lasso, ridge])

    regression_chain = Chain(linear_root)
    regression_chain.fit(train_data)
    output = regression_chain.predict(test_data)

    assert output.predict.shape == test_data.target.shape
Example #29
0
def test_chain_with_custom_params_for_model(data_setup):
    """Custom params on the root 'knn' node must change the prediction versus default params."""
    knn_params = dict(n_neighbors=1, weights='uniform', p=1)

    logit = PrimaryNode(operation_type='logit')
    lda = PrimaryNode(operation_type='lda')
    knn_root = SecondaryNode(operation_type='knn', nodes_from=[logit, lda])

    custom_chain = Chain()
    custom_chain.add_node(knn_root)
    # Copy before the custom params are set, so the copy keeps the defaults
    default_chain = deepcopy(custom_chain)

    custom_chain.root_node.custom_params = knn_params

    default_chain.fit(data_setup)
    custom_chain.fit(data_setup)

    custom_prediction = custom_chain.predict(data_setup).predict
    default_prediction = default_chain.predict(data_setup).predict

    assert not np.array_equal(custom_prediction, default_prediction)
Example #30
0
def test_chain_sequential_fit_correct(data_setup):
    """A linear chain of four 'logit' nodes fits and reports the expected structure."""
    expected_id = ('(((/n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')

    train, _ = train_test_data_setup(data_setup)

    # Each node takes the previous one as its only parent
    head = PrimaryNode(model_type='logit')
    middle_a = SecondaryNode(model_type='logit', nodes_from=[head])
    middle_b = SecondaryNode(model_type='logit', nodes_from=[middle_a])
    tail = SecondaryNode(model_type='logit', nodes_from=[middle_b])

    chain = Chain()
    for node in (head, middle_a, middle_b, tail):
        chain.add_node(node)

    fit_output = chain.fit(input_data=train, use_cache=False)

    assert chain.root_node.descriptive_id == expected_id
    assert chain.length == 4
    assert chain.depth == 4
    assert fit_output.predict.shape[0] == train.target.shape[0]
    assert tail.cache.actual_cached_state is not None