Example #1
def test_fine_tune_all_nodes(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Chain composition
    chain = get_class_chain()

    # Before tuning prediction
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    # Tuning of all chain nodes
    chain.fine_tune_all_nodes(train_data,
                              max_lead_time=timedelta(minutes=1),
                              iterations=30)
    chain.fit_from_scratch(train_data)
    after_tun_root_node_predicted = chain.predict(test_data)

    bfr_tun_roc_auc = round(
        roc(y_true=test_data.target, y_score=before_tuning_predicted.predict),
        2)
    aft_tun_roc_auc = round(
        roc(y_true=test_data.target,
            y_score=after_tun_root_node_predicted.predict), 2)

    print(f'Before tune test {bfr_tun_roc_auc}')
    print(f'After tune test {aft_tun_roc_auc}', '\n')

    assert aft_tun_roc_auc >= bfr_tun_roc_auc
Example #2
def test_chain_hierarchy_fit_correct(data_setup):
    data = data_setup
    train, _ = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[first])
    final = SecondaryNode(operation_type='logit', nodes_from=[second, third])

    chain = Chain()
    for node in [first, second, third, final]:
        chain.add_node(node)

    chain.unfit()
    train_predicted = chain.fit(input_data=train)

    assert chain.root_node.descriptive_id == ('((/n_logit_default_params;)/'
                                              'n_logit_default_params;;(/'
                                              'n_logit_default_params;)/'
                                              'n_logit_default_params;)/'
                                              'n_logit_default_params')

    assert chain.length == 4
    assert chain.depth == 3
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
Example #3
def test_tune_certain_node_with_tune_class_correctly(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    chain = create_four_depth_chain()
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    model_id_to_tune = 4

    tuned_chain = Tune(chain).fine_tune_certain_node(
        model_id=model_id_to_tune,
        input_data=train_data,
        max_lead_time=timedelta(minutes=1),
        iterations=30)

    tuned_chain.fit_from_scratch(train_data)
    after_tun_root_node_predicted = tuned_chain.predict(test_data)

    bfr_tun_roc_auc = round(
        roc(y_true=test_data.target, y_score=before_tuning_predicted.predict),
        1)
    aft_tun_roc_auc = round(
        roc(y_true=test_data.target,
            y_score=after_tun_root_node_predicted.predict), 1)

    print(f'Before tune test {bfr_tun_roc_auc}')
    print(f'After tune test {aft_tun_roc_auc}', '\n')

    assert aft_tun_roc_auc >= bfr_tun_roc_auc
Example #4
def test_fine_tune_primary_nodes(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Chain composition
    chain = get_regr_chain()

    # Before tuning prediction
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    # Chain tuning
    chain.fine_tune_primary_nodes(train_data,
                                  max_lead_time=timedelta(minutes=1),
                                  iterations=10)

    # After tuning prediction
    chain.fit_from_scratch(train_data)
    after_tuning_predicted = chain.predict(test_data)

    # Metrics
    bfr_tun_mse = mse(y_true=test_data.target,
                      y_pred=before_tuning_predicted.predict)
    aft_tun_mse = mse(y_true=test_data.target,
                      y_pred=after_tuning_predicted.predict)

    print(f'Before tune test {bfr_tun_mse}')
    print(f'After tune test {aft_tun_mse}', '\n')

    assert aft_tun_mse <= bfr_tun_mse
Example #5
def get_synthetic_ts_data_period(n_steps=1000,
                                 forecast_length=1,
                                 max_window_size=50):
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    periodicity = np.sin(x1 / 50)

    simulated_data = simulated_data + periodicity

    task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False))

    data = InputData(idx=np.arange(0, n_steps),
                     features=np.asarray([x1, x2]).T,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)

    return train_test_data_setup(data)
Example #6
def data_setup():
    task = Task(TaskTypesEnum.classification)
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    np.random.shuffle(predictors)
    np.random.shuffle(response)
    response = response[:100]
    predictors = predictors[:100]

    input_data = InputData(idx=np.arange(0, len(predictors)),
                           features=predictors,
                           target=response,
                           task=task,
                           data_type=DataTypesEnum.table)
    train_data, test_data = train_test_data_setup(data=input_data)
    train_data_x = train_data.features
    test_data_x = test_data.features
    train_data_y = train_data.target
    test_data_y = test_data.target

    train_data = InputData(features=train_data_x,
                           target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=task,
                           data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x,
                          target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=task,
                          data_type=DataTypesEnum.table)
    return train_data, test_data
Example #7
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    roc_on_test_tuned_list = []
    for _ in range(3):
        knn_for_tune = Model(model_type='knn')
        model, _ = knn_for_tune.fine_tune(data=train_data,
                                          iterations=10,
                                          max_lead_time=timedelta(minutes=1))

        test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                    data=test_data)

        roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                    y_score=test_predicted_tuned)

        roc_on_test_tuned_list.append(roc_on_test_tuned)

    roc_threshold = 0.6
    assert np.array(
        roc_on_test_tuned_list).any() >= roc_on_test > roc_threshold
Example #8
def get_cholesterol_data():
    file_path = 'cases/data/cholesterol/cholesterol.csv'
    full_path = join(str(project_root()), file_path)
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
Example #9
def get_kc2_data():
    file_path = 'cases/data/kc2/kc2.csv'
    full_path = join(str(project_root()), file_path)
    task = Task(TaskTypesEnum.classification)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
Example #10
def run_text_problem_from_saved_meta_file(path):

    data = InputData.from_text_meta_file(meta_file_path=path)

    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

    metric = execute_chain_for_text_problem(train_data, test_data)

    print(f'meta_file metric: {metric}')
Example #11
def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    _, train_predicted = pca.fit(data=train_data)

    assert train_predicted.shape[1] < data.features.shape[1]
Example #12
def get_dataset(task_type: str):
    if task_type == 'regression':
        data = get_synthetic_regression_data()
        train_data, test_data = train_test_data_setup(data)
        threshold = np.std(test_data.target) * 0.05
    elif task_type == 'classification':
        data = get_iris_data()
        train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
        threshold = 0.95
    elif task_type == 'clustering':
        data = get_synthetic_input_data(n_samples=10000)
        train_data, test_data = train_test_data_setup(data)
        threshold = 0.5
    elif task_type == 'ts_forecasting':
        train_data, test_data = get_synthetic_ts_data_period(forecast_length=12)
        threshold = np.std(test_data.target)
    else:
        raise ValueError('Incorrect type of machine learning task')
    return train_data, test_data, threshold
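
The get_dataset helper above maps a task-type string to a synthetic train/test split plus the quality threshold the calling test compares against. A minimal usage sketch, assuming get_dataset and the generator helpers it calls are importable from the same test module:

# Hypothetical usage sketch; the names come from the snippet above and are
# assumed to be importable from the surrounding test module.
train_data, test_data, threshold = get_dataset('classification')
assert threshold == 0.95  # the classification branch returns a fixed ROC AUC threshold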
Example #13
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    kmeans = Model(model_type='kmeans')

    _, train_predicted = kmeans.fit(data=train_data)

    assert all(np.unique(train_predicted) == [0, 1])
Example #14
def test_regression_chain_fit_correct():
    data = get_synthetic_regression_data()

    chain = generate_chain()
    train_data, test_data = train_test_data_setup(data)

    chain.fit(input_data=train_data)
    _, rmse_on_test = get_rmse_value(chain, train_data, test_data)

    rmse_threshold = np.std(data.target) * 0.05
    assert rmse_on_test < rmse_threshold
Example #15
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    data = data_setup
    train, test = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = PrimaryNode(operation_type='lda')
    third = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[first, second, third])

    chain = Chain()
    for node in [first, second, third, final]:
        chain.add_node(node)

    first = deepcopy(first)
    second = deepcopy(second)
    third = deepcopy(third)

    final_shuffled = SecondaryNode(operation_type='xgboost',
                                   nodes_from=[third, first, second])

    chain_shuffled = Chain()
    # change order of nodes in list
    for node in [final_shuffled, third, first, second]:
        chain_shuffled.add_node(node)

    train_predicted = chain.fit(input_data=train)

    train_predicted_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert np.equal(train_predicted.predict,
                    train_predicted_shuffled.predict).all()

    test_predicted = chain.predict(input_data=test)
    test_predicted_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict,
                    test_predicted_shuffled.predict).all()

    # change the order of parent nodes in the already fitted chain
    nodes_for_change = chain.nodes[3].nodes_from
    chain.nodes[3].nodes_from = [
        nodes_for_change[2], nodes_for_change[0], nodes_for_change[1]
    ]
    chain.nodes[3].unfit()
    chain.fit(train)
    test_predicted_re_shuffled = chain.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict,
                    test_predicted_re_shuffled.predict).all()
Example #16
def run_text_problem_from_meta_file():
    data_file_abspath = os.path.abspath(
        os.path.join('data', 'spam', 'spamham.csv'))

    data = InputData.from_text_meta_file(meta_file_path=data_file_abspath)

    train_data, test_data = train_test_data_setup(data, split_ratio=0.7)

    metric = execute_chain_for_text_problem(train_data, test_data)

    print(f'meta_file metric: {metric}')
Example #17
def get_synthetic_ts_data_period(n_steps=1000, forecast_length=5):
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    periodicity = np.sin(x1 / 50)

    simulated_data = simulated_data + periodicity

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    data = InputData(idx=np.arange(0, n_steps),
                     features=simulated_data,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)
    return train_test_data_setup(data)
Example #18
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling chain. Fit predict it
    scaling_chain = Chain(PrimaryNode('normalization'))
    scaling_chain.fit(train_data)
    scaled_data = scaling_chain.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    assert all(np.unique(train_predicted.predict) == [0, 1])
Example #19
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type='logit')

    _, train_predicted = log_reg.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Example #20
def test_output_mode_labels():
    data = get_iris_data()
    chain = chain_simple()
    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)

    chain.fit(input_data=train_data)
    results = chain.predict(input_data=test_data, output_mode='labels')
    results_probs = chain.predict(input_data=test_data)

    assert len(results.predict) == len(test_data.target)
    assert set(results.predict) == {0, 1, 2}

    assert not np.array_equal(results_probs.predict, results.predict)
Example #21
def test_random_forest_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    random_forest = Model(model_type='rf')

    _, train_predicted = random_forest.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Example #22
def test_multiclassification_chain_fit_correct():
    data = get_iris_data()
    chain = chain_simple()
    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)

    chain.fit(input_data=train_data)
    results = chain.predict(input_data=test_data)

    roc_auc_on_test = roc_auc(y_true=test_data.target,
                              y_score=results.predict,
                              multi_class='ovo',
                              average='macro')

    assert roc_auc_on_test > 0.95
Example #23
    def tune_node(self,
                  input_data,
                  loss_function,
                  node_index,
                  loss_params=None):
        """ Method for hyperparameters tuning for particular node"""
        # Train test split
        train_input, predict_input = train_test_data_setup(input_data)
        test_target = np.array(predict_input.target)

        is_need_to_maximize = _greater_is_better(target=test_target,
                                                 loss_function=loss_function,
                                                 loss_params=loss_params)
        self.is_need_to_maximize = is_need_to_maximize

        # Check source metrics for data
        self.init_check(train_input, predict_input, test_target, loss_function,
                        loss_params)

        node = self.chain.nodes[node_index]
        operation_name = str(node.operation.operation_type)

        # Get node's parameters to optimize
        node_params = get_node_params(node_id=node_index,
                                      operation_name=operation_name)

        if node_params is None:
            print(
                f'"{operation_name}" operation has no parameters to optimize')
        else:
            # Apply tuning for current node
            self._optimize_node(node_id=node_index,
                                train_input=train_input,
                                predict_input=predict_input,
                                test_target=test_target,
                                node_params=node_params,
                                iterations_per_node=self.iterations,
                                seconds_per_node=self.max_seconds,
                                loss_function=loss_function,
                                loss_params=loss_params)

        # Validate whether the optimization went well
        final_chain = self.final_check(train_input=train_input,
                                       predict_input=predict_input,
                                       test_target=test_target,
                                       tuned_chain=self.chain,
                                       loss_function=loss_function,
                                       loss_params=loss_params)

        return final_chain
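
A hedged usage sketch for the tune_node method above. The tuner class name (SequentialTuner) and its constructor arguments are assumptions inferred from how tuning is invoked in the other examples; chain and the data variables are assumed to be prepared as in Examples #1 and #3:

# Hypothetical usage sketch for tune_node; SequentialTuner and its constructor
# signature are assumptions, not taken from this snippet.
from datetime import timedelta
from sklearn.metrics import roc_auc_score

tuner = SequentialTuner(chain=chain,                      # chain built as in Example #3
                        task=train_data.task,
                        iterations=30,
                        max_lead_time=timedelta(minutes=1))
tuned_chain = tuner.tune_node(input_data=train_data,      # tune a single node by index
                              loss_function=roc_auc_score,
                              node_index=0)
tuned_chain.fit_from_scratch(train_data)
after_tuning_predicted = tuned_chain.predict(test_data)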
Example #24
def test_eval_strategy_logreg(data_setup):
    data_set = data_setup
    train, test = train_test_data_setup(data=data_set)
    test_skl_model = LogisticRegression(C=10., random_state=1,
                                        solver='liblinear',
                                        max_iter=10000, verbose=0)
    test_skl_model.fit(train.features, train.target)
    expected_result = test_skl_model.predict(test.features)

    test_model_node = PrimaryNode(model_type='logit')
    test_model_node.fit(input_data=train)
    actual_result = test_model_node.predict(input_data=test)

    assert len(actual_result.predict) == len(expected_result)
Example #25
def test_output_mode_full_probs():
    data = get_binary_classification_data()
    chain = chain_simple()
    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)

    chain.fit(input_data=train_data)
    results = chain.predict(input_data=test_data, output_mode='full_probs')
    results_default = chain.predict(input_data=test_data)
    results_probs = chain.predict(input_data=test_data, output_mode='probs')

    assert not np.array_equal(results_probs.predict, results.predict)
    assert np.array_equal(results_probs.predict, results_default.predict)
    assert results.predict.shape == (len(test_data.target), 2)
    assert results_probs.predict.shape == (len(test_data.target), )
Example #26
def test_model_fit_and_predict_correctly():
    """Checks whether the model fits and predict correctly on the synthetic dataset"""
    data = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)

    chain = generate_chain()
    train_data, test_data = train_test_data_setup(data)

    chain.fit(input_data=train_data)
    roc_auc_value_train, roc_auc_value_test = get_roc_auc_value(
        chain, train_data, test_data)
    train_auc_thr = get_auc_threshold(roc_auc_value_train)
    test_auc_thr = get_auc_threshold(roc_auc_value_test)

    assert train_auc_thr >= CORRECT_MODEL_AUC_THR
    assert test_auc_thr >= CORRECT_MODEL_AUC_THR
Example #27
def synthetic_benchmark_composing_example():
    fitted_chain = separately_fit_chain(samples=5000, features_amount=10,
                                        classes=2)
    data = synthetic_benchmark_dataset(samples_amount=5000, features_amount=10,
                                       fitted_chain=fitted_chain)

    print(f'Synthetic features: {data.features[:10]}')
    print(f'Synthetic target: {data.target[:10]}')

    train, test = train_test_data_setup(data)
    simple_chain = two_level_chain()
    simple_chain.fit(input_data=train, use_cache=False)

    print(f'ROC score on train: {roc_value(simple_chain, train)}')
    print(f'ROC score on test: {roc_value(simple_chain, test)}')
Example #28
def test_regression_chain_with_datamodel_fit_correct():
    data = get_synthetic_regression_data()
    train_data, test_data = train_test_data_setup(data)

    node_data = PrimaryNode('direct_data_model')
    node_first = PrimaryNode('ridge')
    node_second = SecondaryNode('lasso')
    node_second.nodes_from = [node_first, node_data]

    chain = Chain(node_second)

    chain.fit(train_data)
    results = chain.predict(test_data)

    assert results.predict.shape == test_data.target.shape
Example #29
def test_logger_manager_keeps_loggers_correctly():
    LogManager().clear_cache()

    chain = create_four_depth_chain()
    expected_number_of_loggers = 4

    file = os.path.join('../data', 'advanced_classification.csv')
    test_file_path = str(os.path.dirname(__file__))
    data = InputData.from_csv(os.path.join(test_file_path, file))
    train_data, _ = train_test_data_setup(data=data)

    chain.fit(train_data)

    actual_number_of_loggers = LogManager().debug['loggers_number']

    assert actual_number_of_loggers == expected_number_of_loggers
Example #30
def test_chain_with_clusters_fit_correct():
    mean_roc_on_test = 0

    # mean ROC AUC is analysed because of stochastic clustering
    for _ in range(5):
        data = get_synthetic_input_data(n_samples=10000)

        chain = generate_chain()
        train_data, test_data = train_test_data_setup(data)

        chain.fit(input_data=train_data)
        _, roc_on_test = get_roc_auc_value(chain, train_data, test_data)
        mean_roc_on_test = np.mean([mean_roc_on_test, roc_on_test])

    roc_threshold = 0.5
    assert mean_roc_on_test > roc_threshold