def get_rmse_value(chain: Chain, train_data: InputData, test_data: InputData) -> (float, float):
    """Score the chain on the train and test samples with ``mse(..., squared=False)``.

    :param chain: chain whose ``predict`` is called on both samples.
    :param train_data: sample the first returned value is computed on.
    :param test_data: sample the second returned value is computed on.
    :return: tuple ``(train_score, test_score)``.
    """
    train_output = chain.predict(input_data=train_data)
    test_output = chain.predict(input_data=test_data)

    def _score(sample, output):
        return mse(y_true=sample.target, y_pred=output.predict, squared=False)

    # the test score is evaluated first, the result is still (train, test)
    test_score = _score(test_data, test_output)
    train_score = _score(train_data, train_output)

    return train_score, test_score
def get_roc_auc_value(chain: Chain, train_data: InputData,
                      test_data: InputData) -> (float, float):
    """Score the chain with ``roc_auc`` on the train and test samples.

    :param chain: chain whose ``predict`` is called on both samples.
    :param train_data: sample the first returned value is computed on.
    :param test_data: sample the second returned value is computed on.
    :return: tuple ``(train_score, test_score)``.
    """
    # predictions are requested for the train sample first, then for the test one
    outputs = {
        'train': (train_data, chain.predict(input_data=train_data)),
        'test': (test_data, chain.predict(input_data=test_data)),
    }

    scores = {}
    for part in ('test', 'train'):
        sample, output = outputs[part]
        scores[part] = roc_auc(y_true=sample.target, y_score=output.predict)

    return scores['train'], scores['test']
Beispiel #3
0
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that the order of a secondary node's parents does not change results."""
    train, test = train_test_data_setup(data_setup)

    logit_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.logit)
    lda_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.lda)
    knn_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.knn)
    root = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[logit_node, lda_node, knn_node])

    chain = Chain()
    for node in (logit_node, lda_node, knn_node, root):
        chain.add_node(node)

    # independent copies of the primary nodes for the second chain
    logit_copy = deepcopy(logit_node)
    lda_copy = deepcopy(lda_node)
    knn_copy = deepcopy(knn_node)
    root_shuffled = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[knn_copy, logit_copy, lda_copy])

    # the nodes are also added to the chain in a different order
    chain_shuffled = Chain()
    for node in (root_shuffled, knn_copy, logit_copy, lda_copy):
        chain_shuffled.add_node(node)

    fitted = chain.fit(input_data=train)
    fitted_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert all(np.equal(fitted.predict, fitted_shuffled.predict))

    predicted = chain.predict(input_data=test)
    predicted_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert all(np.equal(predicted.predict, predicted_shuffled.predict))

    # change parents order for the node of the already fitted chain
    fitted_root = chain.nodes[3]
    parent_a, parent_b, parent_c = fitted_root.nodes_from[:3]
    fitted_root.nodes_from = [parent_c, parent_a, parent_b]
    fitted_root.cache.clear()
    chain.fit(train)
    predicted_re_shuffled = chain.predict(input_data=test)

    # predict results should be invariant
    assert all(np.equal(predicted.predict, predicted_re_shuffled.predict))
Beispiel #4
0
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     is_visualise=False):
    """Fit three forecasting chains and report their validation metrics.

    A composite chain (``get_composite_lstm_chain``), a single ``lstm`` chain
    and a single ``ridge`` chain are fitted on the train file and scored on
    the test file with ``calculate_validation_metric``.

    :param train_file_path: train csv path, joined with ``project_root()``.
    :param test_file_path: validation csv path, joined with ``project_root()``.
    :param forecast_length: passed to ``TsForecastingParams``.
    :param max_window_size: passed to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :return: the metric obtained for the single ``ridge`` chain.
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    def _load(relative_path):
        full_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(full_path,
                                  task=task_to_solve,
                                  data_type=DataTypesEnum.ts)

    def _fit_and_score(model, plot_name):
        model.fit(input_data=dataset_to_train, verbose=False)
        return calculate_validation_metric(
            model.predict(dataset_to_validate), dataset_to_validate,
            plot_name, is_visualise)

    dataset_to_train = _load(train_file_path)
    # a dataset for a final validation of the composed model
    dataset_to_validate = _load(test_file_path)

    composite_chain = get_composite_lstm_chain()

    ridge_chain = Chain()
    ridge_chain.add_node(PrimaryNode('ridge'))

    lstm_chain = Chain()
    lstm_chain.add_node(PrimaryNode('lstm'))

    # fitting order: composite, LSTM only, simple
    rmse_on_valid = _fit_and_score(
        composite_chain, f'full-composite_{forecast_length}')
    rmse_on_valid_lstm_only = _fit_and_score(
        lstm_chain, f'full-lstm-only_{forecast_length}')
    rmse_on_valid_simple = _fit_and_score(
        ridge_chain, f'full-simple_{forecast_length}')

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
Beispiel #5
0
def run_chain_from_automl(train_file_path: str,
                          test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Fit a chain with a ``tpot`` node and print its ROC AUC on the test file.

    The chain is ``rf`` on top of two primary nodes: ``tpot`` and ``lda``.

    :param train_file_path: csv file the chain is fitted on.
    :param test_file_path: csv file the chain is scored on.
    :param max_run_time: time budget handed to the ``tpot`` node
        as the ``max_run_time_sec`` parameter.
    :return: value of ``roc_auc`` on the test data.
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')
    # ``timedelta.seconds`` holds only the seconds component and drops whole
    # days, so the full duration has to be taken from ``total_seconds()``.
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Beispiel #6
0
def chain_tuning(nodes_to_tune: str,
                 chain: Chain,
                 train_data: InputData,
                 test_data: InputData,
                 local_iter: int,
                 tuner_iter_num: int = 50) -> (float, list):
    """Tune the chain several times and collect the ROC AUC after every run.

    :param nodes_to_tune: ``'primary'`` selects ``chain.fine_tune_primary_nodes``,
        ``'root'`` selects ``chain.fine_tune_all_nodes``.
    :param chain: chain to be tuned, refitted and scored.
    :param train_data: data used for tuning and fitting.
    :param test_data: data used for scoring.
    :param local_iter: number of tune/fit/score rounds.
    :param tuner_iter_num: ``iterations`` argument of the tuning method.
    :return: mean of the collected scores and the list of the scores.
    :raises ValueError: if ``nodes_to_tune`` is neither ``'primary'`` nor ``'root'``.
    """
    if nodes_to_tune not in ('primary', 'root'):
        raise ValueError(
            'Invalid type of nodes. Nodes must be primary or root')

    if nodes_to_tune == 'primary':
        print('primary_node_tuning')
        tune = chain.fine_tune_primary_nodes
    else:
        print('root_node_tuning')
        tune = chain.fine_tune_all_nodes

    scores = []
    for round_idx in range(local_iter):
        print(f'current local iteration {round_idx}')

        # tune, then refit and predict with the tuned chain
        tune(train_data, iterations=tuner_iter_num)
        chain.fit(train_data)
        tuned_output = chain.predict(test_data)

        scores.append(roc_auc(y_true=test_data.target,
                              y_score=tuned_output.predict))

    mean_score = float(np.mean(scores))
    return mean_score, scores
def apply_model_to_data(model: Chain, data_path: str):
    """Predict labels for the data behind ``data_path`` and attach them to its frame.

    :param model: chain used for prediction.
    :param data_path: path handed to ``create_multi_clf_examples_from_excel``.
    :return: the frame returned by that helper with a new ``'forecast'`` column
        holding ``probs_to_labels`` of the chain output.
    """
    df, file_path = create_multi_clf_examples_from_excel(data_path,
                                                         return_df=True)
    # the csv is loaded without a target column
    unlabeled = InputData.from_csv(file_path, with_target=False)
    chain_output = model.predict(unlabeled)
    df['forecast'] = probs_to_labels(chain_output.predict)
    return df
Beispiel #8
0
def calculate_validation_metric(chain: Chain, dataset_to_validate: InputData) -> float:
    """Return ``roc_auc`` of the chain's prediction on ``dataset_to_validate``.

    :param chain: chain used for prediction.
    :param dataset_to_validate: data providing both the input and the target.
    :return: value returned by ``roc_auc``.
    """
    chain_output = chain.predict(dataset_to_validate)
    return roc_auc(y_true=dataset_to_validate.target,
                   y_score=chain_output.predict)
Beispiel #9
0
 def get_value(cls, chain: Chain, reference_data: InputData) -> float:
     """Evaluate ``cls.metric`` for the chain's prediction on ``reference_data``.

     If prediction or metric evaluation raises, the error is printed and
     ``cls.default_value`` is returned instead.
     """
     fallback = cls.default_value
     try:
         return cls.metric(reference_data, chain.predict(reference_data))
     except Exception as ex:
         print(f'Metric evaluation error: {ex}')
     return fallback
def validate_model_quality(model: Chain, data_path: str):
    """Score the chain on the csv file at ``data_path`` with ``roc_auc``.

    :param model: chain used for prediction.
    :param data_path: csv file loaded with ``InputData.from_csv``.
    :return: ``roc_auc`` (``multi_class='ovo'``, ``average='macro'``)
        rounded to 3 digits.
    """
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    # The true labels must come from the dataset loaded above; ``test_data``
    # is not defined in this function.
    roc_auc_valid = round(
        roc_auc(y_true=dataset_to_validate.target,
                y_score=predicted_labels,
                multi_class='ovo',
                average='macro'), 3)
    return roc_auc_valid
Beispiel #11
0
    def get_value(chain: Chain, reference_data: InputData) -> float:
        """Return ``accuracy_score`` of the chain on ``reference_data``, rounded to 3 digits.

        The first element of every prediction row is rounded to obtain a label.
        Any exception is printed and ``0.5`` is returned instead.
        """
        try:
            raw_output = chain.predict(reference_data).predict
            labels = [round(row[0]) for row in raw_output]
            accuracy = accuracy_score(y_true=reference_data.target,
                                      y_pred=labels)
            return round(accuracy, 3)
        except Exception as ex:
            print(ex)
            return 0.5
Beispiel #12
0
    def get_value(chain: Chain, reference_data: InputData) -> float:
        """Return ``roc_auc_score`` of the chain on ``reference_data``, rounded to 3 digits.

        Any exception is printed and ``0.5`` is returned instead.
        """
        try:
            chain_output = chain.predict(reference_data)
            auc = roc_auc_score(y_score=chain_output.predict,
                                y_true=reference_data.target)
            return round(auc, 3)
        except Exception as ex:
            print(ex)
            return 0.5
def calculate_validation_metric(chain: Chain, dataset_to_validate: InputData) -> float:
    """Plot the chain's prediction and return ``mse(..., squared=False)`` for it.

    :param chain: chain used for prediction.
    :param dataset_to_validate: data providing both the input and the target.
    :return: value returned by ``mse`` with ``squared=False``.
    """
    chain_output = chain.predict(dataset_to_validate)

    # the plot is drawn before the metric is evaluated
    compare_plot(chain_output, dataset_to_validate)

    rmse_value = mse(y_true=dataset_to_validate.target,
                     y_pred=chain_output.predict,
                     squared=False)
    return rmse_value
Beispiel #14
0
def test_regression_chain_with_datamodel_fit_correct():
    """A ``lasso`` root fed by ``ridge`` and ``direct_data_model`` predicts a target-shaped output."""
    train_data, test_data = train_test_data_setup(
        get_synthetic_regression_data())

    passthrough = PrimaryNode('direct_data_model')
    ridge = PrimaryNode('ridge')
    lasso_root = SecondaryNode('lasso')
    lasso_root.nodes_from = [ridge, passthrough]

    chain = Chain(lasso_root)
    chain.fit(train_data)

    predicted = chain.predict(test_data).predict
    assert predicted.shape == test_data.target.shape
Beispiel #15
0
def test_chain_with_datamodel_fit_correct(data_setup):
    """An ``rf`` root fed by ``bernb`` and ``direct_data_model`` yields one label per target row."""
    train_data, test_data = train_test_data_setup(data_setup)

    chain = Chain()
    passthrough = PrimaryNode('direct_data_model')
    bernb = PrimaryNode('bernb')
    forest_root = SecondaryNode('rf')
    forest_root.nodes_from = [bernb, passthrough]

    for node in (passthrough, bernb, forest_root):
        chain.add_node(node)

    chain.fit(train_data)
    probabilities = chain.predict(test_data).predict
    labels = np.asarray(probs_to_labels(probabilities))

    assert labels.shape == test_data.target.shape
Beispiel #16
0
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Print ``roc_auc`` of an exported pipeline and of an analogous chain.

    :param train_file_path: csv file both models are fitted on.
    :param test_file_path: csv file both models are scored on.
    :return: ``roc_auc`` of the chain (the pipeline's score is only printed).
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    stacked_nb = StackingEstimator(estimator=BernoulliNB())
    exported_pipeline = make_pipeline(stacked_nb, RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(train_data.features, train_data.target)
    # the second column of predict_proba is used as the score
    pipeline_scores = exported_pipeline.predict_proba(test_data.features)[:, 1]
    print(roc_auc(y_true=testing_target, y_score=pipeline_scores))

    chain = Chain()
    data_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')
    rf_root = SecondaryNode('rf')
    for parent in (data_node, bernb_node):
        rf_root.nodes_from.append(parent)

    chain.add_node(rf_root)
    chain.fit(train_data)
    chain_scores = chain.predict(test_data).predict

    roc_auc_value = roc_auc(y_true=testing_target, y_score=chain_scores)
    print(roc_auc_value)

    return roc_auc_value
Beispiel #17
0
def test_chain_with_custom_params_for_model(data_setup):
    """Custom params set on the root node must change the chain's prediction."""
    logit = PrimaryNode(model_type='logit')
    lda = PrimaryNode(model_type='lda')
    knn_root = SecondaryNode(model_type='knn', nodes_from=[logit, lda])

    chain = Chain()
    chain.add_node(knn_root)
    # the copy is taken before the custom params are assigned
    chain_default_params = deepcopy(chain)

    chain.root_node.custom_params = dict(n_neighbors=1, weights='uniform', p=1)

    chain_default_params.fit(data_setup)
    chain.fit(data_setup)

    with_custom = chain.predict(data_setup).predict
    with_default = chain_default_params.predict(data_setup).predict

    assert not np.array_equal(with_custom, with_default)
def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length,
                                max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    """Run a sliding-window forecasting experiment and print its metrics.

    For each of 4 window shifts three chains are fitted from scratch (an
    ``lstm`` chain on the plain data, an ``lstm`` chain on the CRM data and
    the chain from ``get_comp_chain`` on the CRM data); their predictions are
    accumulated with ``merge_datasets`` and finally passed to
    ``calculate_validation_metric``.

    :param train_file_path: csv path (relative to ``project_root()``) that is
        read both as the train and as the validation series.
    :param train_file_path_crm: csv path (relative to ``project_root()``) of the CRM data.
    :param forecast_length: passed to ``TsForecastingParams``.
    :param max_window_size: passed to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :param well_id: printed and forwarded to ``calculate_validation_metric``.
    :return: the indexable result of ``calculate_validation_metric``.
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size,
                                             return_all_steps=False,
                                             make_future_prediction=False)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    def _read_ts(relative_path):
        full_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(full_path,
                                  task=task_to_solve,
                                  data_type=DataTypesEnum.ts,
                                  delimiter=',')

    dataset_to_train = _read_ts(train_file_path)
    # a dataset for a final validation of the composed model
    # (read from the same file as the train one)
    dataset_to_validate = _read_ts(train_file_path)
    dataset_to_train_crm = _read_ts(train_file_path_crm)
    dataset_to_validate_crm = copy(dataset_to_train_crm)

    merged = None
    merged_crm = None
    merged_crm_opt = None

    forecast_window_shift_num = 4
    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        window_start = depth * forecasting_step
        window_end = depth * 2 + depth * (forecasting_step + 1)

        train_window = dataset_to_train.subset(window_start, window_end)
        train_window_crm = dataset_to_train_crm.subset(window_start,
                                                       window_end)

        # validation windows are the train ones shifted by ``depth``
        validate_window = dataset_to_validate.subset(window_start + depth,
                                                     window_end + depth)
        validate_window_crm = dataset_to_validate_crm.subset(
            window_start + depth, window_end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=train_window, verbose=False)
        chain_simple_crm.fit_from_scratch(input_data=train_window_crm,
                                          verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=train_window_crm,
                                       verbose=False)

        step_prediction = chain_simple.predict(validate_window)
        step_prediction_crm = chain_simple_crm.predict(validate_window_crm)
        step_prediction_crm_opt = chain_crm_opt.predict(validate_window_crm)

        merged = merge_datasets(merged, step_prediction, forecasting_step)
        merged_crm = merge_datasets(merged_crm, step_prediction_crm,
                                    forecasting_step)
        merged_crm_opt = merge_datasets(merged_crm_opt,
                                        step_prediction_crm_opt,
                                        forecasting_step)

    metrics = calculate_validation_metric(merged, merged_crm, merged_crm_opt,
                                          dataset_to_validate, well_id,
                                          is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(metrics[0])}')
    print(f'RMSE ML: {round(metrics[1])}')
    print(f'RMSE ML with CRM: {round(metrics[2])}')
    print(f'Evo RMSE ML with CRM: {round(metrics[3])}')

    print(f'DTW CRM: {round(metrics[4])}')
    print(f'DTW ML: {round(metrics[5])}')
    print(f'DTW ML with CRM: {round(metrics[6])}')
    print(f'DTW RMSE ML with CRM: {round(metrics[7])}')

    return metrics
from core.models.model import *
from benchmark.benchmark_utils import get_scoring_case_data_paths

# Script: fit an rf chain on top of tpot and lda nodes and print its ROC AUC
# on the test part of the scoring case data.
train_file_path, test_file_path = get_scoring_case_data_paths()

train_data = InputData.from_csv(train_file_path)
test_data = InputData.from_csv(test_file_path)

# NOTE(review): only testing_target is used below; the other three names
# are assigned but never read in this script.
training_features = train_data.features
testing_features = test_data.features
training_target = train_data.target
testing_target = test_data.target

chain = Chain()
node0 = NodeGenerator.primary_node(ModelTypesIdsEnum.tpot)
node1 = NodeGenerator.primary_node(ModelTypesIdsEnum.lda)
node2 = NodeGenerator.secondary_node(ModelTypesIdsEnum.rf)

# node2 (rf) takes the outputs of node0 (tpot) and node1 (lda) as its parents
node2.nodes_from.append(node0)
node2.nodes_from.append(node1)

chain.add_node(node0)
chain.add_node(node1)
chain.add_node(node2)

chain.fit(train_data)
results = chain.predict(test_data)

# score the chain output against the test target
roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
print(roc_auc_value)
Beispiel #20
0
def roc_value(chain: Chain, dataset_to_validate) -> float:
    """Return ``roc_auc`` of the chain's prediction on ``dataset_to_validate``.

    :param chain: chain used for prediction.
    :param dataset_to_validate: data providing both the input and the target.
    :return: value returned by ``roc_auc``.
    """
    chain_output = chain.predict(dataset_to_validate)
    return roc_auc(y_true=dataset_to_validate.target,
                   y_score=chain_output.predict)
Beispiel #21
0
 def get_value(chain: Chain, reference_data: InputData) -> float:
     """Return ``mean_squared_error`` of the chain's prediction on ``reference_data``."""
     chain_output = chain.predict(reference_data)
     error = mean_squared_error(y_true=reference_data.target,
                                y_pred=chain_output.predict)
     return error
Beispiel #22
0
def synthetic_benchmark_dataset(samples_amount: int,
                                features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset whose labels are
    produced by a chain (TODO: add a reference to the fitting schema).

    The procedure is: generate a sample, label it with the chain, refit the
    chain from scratch on that labelled sample, then generate a second sample
    and label it with the refitted chain. The second sample is returned.

    :param samples_amount: Total amount of samples in the resulted dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset (only 2 is supported).
    :param features_options: features options in key-value suitable for classification_dataset.
    :param fitted_chain: Chain with separately fitted models.
    If None then the chain returned by _default_chain is used.
    Note that the chain is refitted in place with fit_from_scratch.
    :return: Benchmark dataset that is ready to be used by Chain.
    :raises NotImplementedError: if classes_amount is not 2.
    """
    # Validate first so that an unsupported request fails before the
    # default chain is built.
    if classes_amount != 2:
        raise NotImplementedError(
            'Only binary classification tasks are supported')

    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    task = Task(TaskTypesEnum.classification)
    samples_idxs = np.arange(0, samples_amount)

    def _generate_and_label() -> InputData:
        """Generate fresh features and label them with the chain's current state."""
        features, target = classification_dataset(
            samples_amount=samples_amount,
            features_amount=features_amount,
            classes_amount=classes_amount,
            features_options=features_options)
        generated = InputData(idx=samples_idxs,
                              features=features,
                              target=np.expand_dims(target, axis=1),
                              task=task,
                              data_type=DataTypesEnum.table)

        synth_target = fitted_chain.predict(input_data=generated).predict
        return InputData(idx=samples_idxs,
                         features=features,
                         target=_to_labels(synth_target),
                         task=task,
                         data_type=DataTypesEnum.table)

    data_synth_train = _generate_and_label()

    # TODO: fix preproc issues

    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    # the returned sample is labelled by the refitted chain
    return _generate_and_label()