Esempi in Python per train_test_data_setup, esempi in Python per fedot.core.data.data_split.train_test_data_setup

Esempio n. 1

0

Mostra file

def prepare_multi_modal_data(files_path, task: Task, images_size=(128, 128), with_split=True):
    """Load numeric, image and text views of one dataset with shared binary labels.

    :param files_path: dataset location relative to the project root
    :param task: task passed to every ``InputData`` constructor
    :param images_size: target size passed to ``InputData.from_image``
    :param with_split: when True each dataset is split 50/50 without shuffling;
        otherwise the same object is returned as both train and test part
    :return: train_num, test_num, train_img, test_img, train_text, test_text
    """
    split_ratio = 0.5

    json_dir = os.path.join(str(fedot_project_root()), files_path)
    unpack_archived_data(json_dir)

    numeric = InputData.from_json_files(json_dir, fields_to_use=['votes', 'year'],
                                        label='rating', task=task)

    # Binarise the 'rating' label: values up to 7 become 0, anything else 1
    binary_labels = np.asarray([0 if rating <= 7 else 1 for rating in numeric.target])
    numeric.target = binary_labels

    images_glob = os.path.join(str(fedot_project_root()), f'{files_path}/*.jpeg')
    images = InputData.from_image(images=images_glob, labels=binary_labels,
                                  task=task, target_size=images_size)

    texts = InputData.from_json_files(json_dir, fields_to_use=['plot'],
                                      label='rating', task=task,
                                      data_type=DataTypesEnum.text)
    texts.target = binary_labels

    modalities = (numeric, images, texts)
    if with_split:
        pairs = [train_test_data_setup(modality, shuffle_flag=False, split_ratio=split_ratio)
                 for modality in modalities]
    else:
        pairs = [(modality, modality) for modality in modalities]

    (train_num, test_num), (train_img, test_img), (train_text, test_text) = pairs
    return train_num, test_num, train_img, test_img, train_text, test_text

Esempio n. 2

0

Mostra file

def test_pipeline_hierarchy_fit_correct(data_setup):
    """Fit a four-node logit pipeline (one primary, two branches, one root)
    and check its descriptive id, size, depth and prediction length."""
    train, _ = train_test_data_setup(data_setup)

    primary = PrimaryNode(operation_type='logit')
    left_branch = SecondaryNode(operation_type='logit', nodes_from=[primary])
    right_branch = SecondaryNode(operation_type='logit', nodes_from=[primary])
    root = SecondaryNode(operation_type='logit', nodes_from=[left_branch, right_branch])

    pipeline = Pipeline()
    for node in (primary, left_branch, right_branch, root):
        pipeline.add_node(node)

    pipeline.unfit()
    fit_output = pipeline.fit(input_data=train)

    expected_id = ('((/n_logit_default_params;)/'
                   'n_logit_default_params;;(/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')
    assert pipeline.root_node.descriptive_id == expected_id

    assert pipeline.length == 4
    assert pipeline.depth == 3
    assert fit_output.predict.shape[0] == train.target.shape[0]
    assert root.fitted_operation is not None

Esempio n. 3

0

Mostra file

def run_multi_output_case(path, vis=False):
    """ Launch the multi-output regression case (river level prediction,
    seven target columns '1_day' ... '7_day') and print the MAE

    :param path: path to the file with table
    :param vis: is it needed to visualise predictions
    """
    horizon_columns = [f'{day}_day' for day in range(1, 8)]

    dataset = InputData.from_csv(path,
                                 target_columns=horizon_columns,
                                 columns_to_drop=['date'])
    train, test = train_test_data_setup(dataset)

    automl_model = Fedot(problem='regression')
    automl_model.fit(features=train)
    predicted_array = automl_model.predict(features=test)

    # Flatten both arrays so the metric is computed over all horizons at once
    flat_actual = np.ravel(test.target)
    flat_forecast = np.ravel(predicted_array)

    mae_value = mean_absolute_error(flat_actual, flat_forecast)
    print(f'MAE - {mae_value:.2f}')

    if not vis:
        return
    plot_predictions(predicted_array, test)

Esempio n. 4

0

Mostra file

def test_pipeline_fit_time_constraint(data_fixture, request):
    """Check that fitting under a time constraint is interrupted and that a
    tighter constraint ends sooner than a looser one.

    :param data_fixture: name of the fixture providing the dataset
    :param request: pytest request object used to resolve the fixture by name
    """
    system = platform.system()
    if system == 'Linux':
        # NOTE(review): presumably multiprocessing.set_start_method - confirm against the imports
        set_start_method("spawn", force=True)
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)
    test_pipeline_first = pipeline_first()
    # First attempt: 0.01 minute budget
    time_constraint = datetime.timedelta(minutes=0.01)
    predicted_first = None
    computation_time_first = None
    process_start_time = time.time()
    try:
        predicted_first = test_pipeline_first.fit(input_data=train_data, time_constraint=time_constraint)
    except Exception as ex:
        # Any exception raised here must be exactly a TimeoutError
        received_ex = ex
        computation_time_first = test_pipeline_first.computation_time
        assert type(received_ex) is TimeoutError
    comp_time_proc_with_first_constraint = (time.time() - process_start_time)
    # Second attempt on the same pipeline object: 0.05 minute budget
    time_constraint = datetime.timedelta(minutes=0.05)
    process_start_time = time.time()
    try:
        test_pipeline_first.fit(input_data=train_data, time_constraint=time_constraint)
    except Exception as ex:
        received_ex = ex
        assert type(received_ex) is TimeoutError
    comp_time_proc_with_second_constraint = (time.time() - process_start_time)
    # Reference run: a fresh pipeline fitted without any time constraint
    test_pipeline_second = pipeline_first()
    predicted_second = test_pipeline_second.fit(input_data=train_data)
    computation_time_second = test_pipeline_second.computation_time
    # The tighter budget must finish (wall-clock) before the looser one
    assert comp_time_proc_with_first_constraint < comp_time_proc_with_second_constraint
    # The constrained fit is expected to produce neither a result nor a computation time
    assert computation_time_first is None
    assert predicted_first is None
    assert computation_time_second is not None
    assert predicted_second is not None

Esempio n. 5

0

Mostra file

def data_setup():
    """Build train and test ``InputData`` tables from 100 breast-cancer rows.

    The split parts are re-wrapped so that each one gets a fresh zero-based index.

    NOTE(review): features and targets are shuffled by two separate
    ``np.random.shuffle`` calls, so the original row pairing is not kept -
    confirm this is intended.

    :return: train_data, test_data
    """
    task = Task(TaskTypesEnum.classification)

    def _as_table(features, target):
        # Wrap arrays into a table InputData with a zero-based index
        return InputData(idx=np.arange(0, len(target)),
                         features=features,
                         target=target,
                         task=task,
                         data_type=DataTypesEnum.table)

    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    np.random.shuffle(predictors)
    np.random.shuffle(response)
    predictors, response = predictors[:100], response[:100]

    train_part, test_part = train_test_data_setup(data=_as_table(predictors, response))

    train_data = _as_table(train_part.features, train_part.target)
    test_data = _as_table(test_part.features, test_part.target)
    return train_data, test_data

Esempio n. 6

0

Mostra file

def test_classification_models_fit_correct(data_fixture, request):
    """Fit every 'ml'-tagged classification model from the repository and
    check its ROC AUC on the test part against a per-model threshold.

    :param data_fixture: name of the fixture providing the dataset
    :param request: pytest request object used to resolve the fixture by name
    """
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)
    roc_threshold = 0.95
    logger = default_log('default_test_logger')

    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(
            task_type=TaskTypesEnum.classification,
            data_type=data.data_type,
            tags=['ml'])

    # These models are only required to beat a 0.5 ROC AUC
    relaxed_models = ('bernb', 'multinb')

    for model_name in model_names:
        logger.info(f"Test classification model: {model_name}.")
        model = Model(operation_type=model_name)
        fitted_operation, _ = model.fit(data=train_data)
        test_pred = model.predict(fitted_operation=fitted_operation,
                                  data=test_data,
                                  is_fit_pipeline_stage=False)
        roc_on_test = get_roc_auc(valid_data=test_data,
                                  predicted_data=test_pred)

        required_roc = 0.5 if model_name in relaxed_models else roc_threshold
        assert roc_on_test >= required_roc

Esempio n. 7

0

Mostra file

def get_cholesterol_data():
    """Load cases/data/cholesterol/cholesterol.csv as a regression dataset
    and return its train and test parts."""
    csv_path = join(str(fedot_project_root()),
                    'cases', 'data', 'cholesterol', 'cholesterol.csv')
    dataset = InputData.from_csv(csv_path, task=Task(TaskTypesEnum.regression))

    train, test = train_test_data_setup(dataset)
    return train, test

Esempio n. 8

0

Mostra file

def run_text_problem_from_saved_meta_file(path):
    """Load text data from a meta file, split it with ratio 0.7, run the
    text pipeline and print the resulting metric.

    :param path: path to the meta file
    """
    text_data = InputData.from_text_meta_file(meta_file_path=path)
    train_part, test_part = train_test_data_setup(text_data, split_ratio=0.7)

    metric = execute_pipeline_for_text_problem(train_part, test_part)
    print(f'meta_file metric: {metric}')

Esempio n. 9

0

Mostra file

def get_kc2_data():
    """Load cases/data/kc2/kc2.csv as a classification dataset and return
    its train and test parts."""
    csv_path = join(str(fedot_project_root()), 'cases', 'data', 'kc2', 'kc2.csv')
    dataset = InputData.from_csv(csv_path, task=Task(TaskTypesEnum.classification))

    train, test = train_test_data_setup(dataset)
    return train, test

Esempio n. 10

0

Mostra file

def test_evaluate_individuals():
    """Evaluate four pipelines under a tiny and a generous timeout and check
    how many individuals remain in the population in each case."""
    project_root_path = str(fedot_project_root())
    file_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')
    full_path_train = os.path.join(str(fedot_project_root()), file_path_train)

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    composer = (GPComposerBuilder(task=task)
                .with_requirements(composer_requirements)
                .with_metrics(metric_function)
                .build())

    pipelines_to_evaluate = [
        pipeline_first(),
        pipeline_second(),
        pipeline_third(),
        pipeline_fourth()
    ]

    split_ratio = sample_split_ratio_for_tasks[dataset_to_compose.task.task_type]
    train_data, test_data = train_test_data_setup(dataset_to_compose, split_ratio)
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    adapter = PipelineAdapter()
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())

    def _evaluate_with_timeout(timeout):
        # Build a fresh population and evaluate it in place under the given timeout
        individuals = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
        with OptimisationTimer(timeout=timeout) as t:
            evaluate_individuals(individuals_set=individuals,
                                 objective_function=metric_function_for_nodes,
                                 graph_generation_params=params,
                                 is_multi_objective=False,
                                 timer=t)
        return individuals

    # Tiny budget: only one individual is expected to remain
    population = _evaluate_with_timeout(datetime.timedelta(minutes=0.001))
    assert len(population) == 1
    assert population[0].fitness is not None

    # Generous budget: all four individuals are expected to be evaluated
    population = _evaluate_with_timeout(datetime.timedelta(minutes=5))
    assert len(population) == 4
    assert all(ind.fitness is not None for ind in population)

Esempio n. 11

0

Mostra file

def get_dataset(task_type: str):
    """Return train data, test data and a quality threshold for a task type.

    :param task_type: one of 'regression', 'classification', 'clustering',
        'ts_forecasting'
    :return: train_data, test_data, threshold
    :raises ValueError: for any other task type
    """
    if task_type == 'regression':
        train_data, test_data = train_test_data_setup(get_synthetic_regression_data())
        return train_data, test_data, np.std(test_data.target) * 0.05

    if task_type == 'classification':
        train_data, test_data = train_test_data_setup(get_iris_data(), shuffle_flag=True)
        return train_data, test_data, 0.95

    if task_type == 'clustering':
        train_data, test_data = train_test_data_setup(get_synthetic_input_data(n_samples=1000))
        return train_data, test_data, 0.5

    if task_type == 'ts_forecasting':
        train_data, test_data = get_ts_data(forecast_length=5)
        return train_data, test_data, np.std(test_data.target)

    raise ValueError('Incorrect type of machine learning task')

Esempio n. 12

0

Mostra file

    def compose_pipeline(
        self,
        data: Union[InputData, MultiModalData],
        is_visualise: bool = False,
        is_tune: bool = False,
        on_next_iteration_callback: Optional[Callable] = None
    ) -> Union[Pipeline, List[Pipeline]]:
        """ Function for optimal pipeline structure searching
        :param data: InputData for pipeline composing
        :param is_visualise: is it needed to visualise (not used inside this method)
        :param is_tune: is it needed to tune pipeline after composing TODO integrate new tuner
        :param on_next_iteration_callback: callable forwarded as-is to the optimiser's ``optimise`` call
        :return best_pipeline: obtained result after composing: one pipeline for single-objective optimization;
            For the multi-objective case, the list of the graph is returned.
            In the list, the pipelines are ordered by the descending of primary metric (the first is the best)
        :raises AttributeError: if no optimiser is defined
        """

        # The optimiser must be validated before it is dereferenced below,
        # otherwise the explicit error message could never be reached
        if not self.optimiser:
            raise AttributeError(
                'Optimiser for graph composition is not defined')

        self.optimiser.graph_generation_params.advisor.task = data.task

        if self.composer_requirements.max_pipeline_fit_time:
            set_multiprocess_start_method()

        if self.composer_requirements.cv_folds is not None:
            objective_function_for_pipeline = self._cv_validation_metric_build(
                data)
        else:
            self.log.info(
                "Hold out validation for graph composing was applied.")
            split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
            train_data, test_data = train_test_data_setup(data, split_ratio)
            objective_function_for_pipeline = partial(self.composer_metric,
                                                      self.metrics, train_data,
                                                      test_data)

        if self.cache_path is None:
            self.cache.clear()
        else:
            # A cache path is configured: clear with tmp_only=True and
            # re-create the cache object at that path
            self.cache.clear(tmp_only=True)
            self.cache = OperationsCache(
                self.cache_path, clear_exiting=not self.use_existing_cache)

        best_pipeline = self.optimiser.optimise(
            objective_function_for_pipeline,
            on_next_iteration_callback=on_next_iteration_callback)

        self.log.info('GP composition finished')
        self.cache.clear()
        if is_tune:
            self.tune_pipeline(best_pipeline, data,
                               self.composer_requirements.timeout)
        return best_pipeline

Esempio n. 13

0

Mostra file

def test_regression_pipeline_fit_correct():
    """Fit the generated pipeline on synthetic regression data and require the
    test RMSE to stay below 5% of the target standard deviation."""
    dataset = get_synthetic_regression_data()

    pipeline = generate_pipeline()
    train_part, test_part = train_test_data_setup(dataset)

    pipeline.fit(input_data=train_part)
    _, rmse_on_test = get_rmse_value(pipeline, train_part, test_part)

    assert rmse_on_test < np.std(dataset.target) * 0.05

Esempio n. 14

0

Mostra file

def run_text_problem_from_meta_file():
    """Load data/spam/spamham.csv (relative to the working directory) as a
    text meta file, split it with ratio 0.7, run the text pipeline and print
    the resulting metric."""
    relative_path = os.path.join('data', 'spam', 'spamham.csv')
    text_data = InputData.from_text_meta_file(
        meta_file_path=os.path.abspath(relative_path))

    train_part, test_part = train_test_data_setup(text_data, split_ratio=0.7)

    metric = execute_pipeline_for_text_problem(train_part, test_part)
    print(f'meta_file metric: {metric}')

Esempio n. 15

0

Mostra file

def test_log_clustering_fit_correct(data_fixture, request):
    """Fit k-means on normalised train data and check that exactly the
    cluster labels 0 and 1 are predicted.

    :param data_fixture: name of the fixture providing the dataset
    :param request: pytest request object used to resolve the fixture by name
    """
    data = request.getfixturevalue(data_fixture)
    train_data, _ = train_test_data_setup(data=data)

    # Scaling pipeline. Fit predict it
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    # array_equal also handles a different number of clusters: it yields False
    # (a clean assertion failure) instead of breaking on the shape mismatch
    assert np.array_equal(np.unique(train_predicted.predict), [0, 1])

Esempio n. 16

0

Mostra file

def test_output_mode_labels():
    """Check that output_mode='labels' returns the class labels {0, 1, 2} for
    iris data and differs from the default prediction output."""
    dataset = get_iris_data()
    pipeline = pipeline_simple()
    train_part, test_part = train_test_data_setup(dataset, shuffle_flag=True)

    pipeline.fit(input_data=train_part)
    labels_output = pipeline.predict(input_data=test_part, output_mode='labels')
    default_output = pipeline.predict(input_data=test_part)

    assert len(labels_output.predict) == len(test_part.target)
    assert set(labels_output.predict) == {0, 1, 2}

    assert not np.array_equal(default_output.predict, labels_output.predict)

Esempio n. 17

0

Mostra file

def test_model_fit_and_predict_correctly():
    """Checks whether the model fits and predict correctly on the synthetic dataset"""
    dataset = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)

    pipeline = generate_pipeline()
    train_part, test_part = train_test_data_setup(dataset)

    pipeline.fit(input_data=train_part)
    roc_values = get_roc_auc_value(pipeline, train_part, test_part)
    roc_auc_value_train, roc_auc_value_test = roc_values

    train_auc_thr, test_auc_thr = (get_auc_threshold(roc_auc_value_train),
                                   get_auc_threshold(roc_auc_value_test))

    assert train_auc_thr >= CORRECT_MODEL_AUC_THR
    assert test_auc_thr >= CORRECT_MODEL_AUC_THR

Esempio n. 18

0

Mostra file

def test_multiclassification_pipeline_fit_correct():
    """Fit the simple pipeline on iris data and require a macro-averaged
    one-vs-one ROC AUC above 0.95 on the test part."""
    dataset = get_iris_data()
    pipeline = pipeline_simple()
    train_part, test_part = train_test_data_setup(dataset, shuffle_flag=True)

    pipeline.fit(input_data=train_part)
    prediction = pipeline.predict(input_data=test_part)

    scoring_kwargs = {'multi_class': 'ovo', 'average': 'macro'}
    roc_auc_on_test = roc_auc(y_true=test_part.target,
                              y_score=prediction.predict,
                              **scoring_kwargs)

    assert roc_auc_on_test > 0.95

Esempio n. 19

0

Mostra file

def test_output_mode_full_probs():
    """Compare 'full_probs', 'probs' and default prediction outputs on binary
    classification data: default equals 'probs' (one value per sample), while
    'full_probs' has two columns."""
    dataset = get_binary_classification_data()
    pipeline = pipeline_simple()
    train_part, test_part = train_test_data_setup(dataset, shuffle_flag=True)

    pipeline.fit(input_data=train_part)
    full_probs_output = pipeline.predict(input_data=test_part, output_mode='full_probs')
    default_output = pipeline.predict(input_data=test_part)
    probs_output = pipeline.predict(input_data=test_part, output_mode='probs')

    assert not np.array_equal(probs_output.predict, full_probs_output.predict)
    assert np.array_equal(probs_output.predict, default_output.predict)
    assert full_probs_output.predict.shape == (len(test_part.target), 2)
    assert probs_output.predict.shape == (len(test_part.target),)

Esempio n. 20

0

Mostra file

def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that a secondary node gives the same results regardless of the
    order of its parent nodes.

    Two pipelines with the same three primary nodes feeding one 'xgboost'
    node are built with different parent and insertion order; their
    descriptive ids and their train/test predictions must match. The parents
    of the already fitted pipeline are then reordered, the pipeline is
    refitted and its predictions are compared again.

    :param data_setup: fixture value with the dataset to split
    """
    data = data_setup
    train, test = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = PrimaryNode(operation_type='lda')
    third = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[first, second, third])

    pipeline = Pipeline()
    for node in [first, second, third, final]:
        pipeline.add_node(node)

    # Copy the primary nodes so the second pipeline does not share node
    # objects with the first one
    first = deepcopy(first)
    second = deepcopy(second)
    third = deepcopy(third)

    final_shuffled = SecondaryNode(operation_type='xgboost',
                                   nodes_from=[third, first, second])

    pipeline_shuffled = Pipeline()
    # change order of nodes in list
    for node in [final_shuffled, third, first, second]:
        pipeline_shuffled.add_node(node)

    train_predicted = pipeline.fit(input_data=train)

    train_predicted_shuffled = pipeline_shuffled.fit(input_data=train)

    # train results should be invariant
    assert pipeline.root_node.descriptive_id == pipeline_shuffled.root_node.descriptive_id
    assert np.equal(train_predicted.predict, train_predicted_shuffled.predict).all()

    test_predicted = pipeline.predict(input_data=test)
    test_predicted_shuffled = pipeline_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict, test_predicted_shuffled.predict).all()

    # change parents order for the nodes fitted pipeline
    nodes_for_change = pipeline.nodes[3].nodes_from
    pipeline.nodes[3].nodes_from = [nodes_for_change[2], nodes_for_change[0], nodes_for_change[1]]
    # Only the reordered node is unfitted before the pipeline is fitted again
    pipeline.nodes[3].unfit()
    pipeline.fit(train)
    test_predicted_re_shuffled = pipeline.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict, test_predicted_re_shuffled.predict).all()

Esempio n. 21

0

Mostra file

def multi_target_data_setup():
    """Load the multi-target regression sample (targets '1_day' ... '7_day',
    'date' column dropped) and return its train and test parts."""
    module_dir = str(os.path.dirname(__file__))
    csv_path = os.path.join(module_dir, '../../data/multi_target_sample.csv')

    day_columns = [f'{day}_day' for day in range(1, 8)]

    dataset = InputData.from_csv(csv_path,
                                 target_columns=day_columns,
                                 columns_to_drop=['date'],
                                 task=Task(TaskTypesEnum.regression))
    train, test = train_test_data_setup(dataset)
    return train, test

Esempio n. 22

0

Mostra file

def test_eval_strategy_logreg(data_setup):
    """Check that a 'logit' primary node returns as many predictions as a
    plain scikit-learn LogisticRegression fitted on the same split.

    :param data_setup: fixture value with the dataset to split
    """
    train, test = train_test_data_setup(data=data_setup)

    skl_params = {
        'C': 10.,
        'random_state': 1,
        'solver': 'liblinear',
        'max_iter': 10000,
        'verbose': 0,
    }
    reference_model = LogisticRegression(**skl_params)
    reference_model.fit(train.features, train.target)
    expected_result = reference_model.predict(test.features)

    logit_node = PrimaryNode(operation_type='logit')
    logit_node.fit(input_data=train)
    actual_result = logit_node.predict(input_data=test)

    assert len(actual_result.predict) == len(expected_result)

Esempio n. 23

0

Mostra file

def test_multi_times_analyze_analyze(analyze_method):
    """Check that MultiTimesAnalyze.analyze returns a float and that the
    patched analyze method was called.

    :param analyze_method: object whose ``called`` flag is checked after the run
    """
    # given
    pipeline, train_data, holdout_data, _, result_dir = given_data()
    test_data, valid_data = train_test_data_setup(holdout_data, split_ratio=0.5)

    # when
    analyzer = MultiTimesAnalyze(pipeline=pipeline,
                                 train_data=train_data,
                                 test_data=test_data,
                                 valid_data=valid_data,
                                 case_name='test_case_name',
                                 path_to_save=result_dir)
    analyze_result = analyzer.analyze()

    # then
    assert type(analyze_result) is float
    assert analyze_method.called

Esempio n. 24

0

Mostra file

def test_pca_model_removes_redunant_features_correct():
    """Check that PCA applied to normalised train data yields fewer columns
    than the original 100-feature dataset."""
    dataset = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=5)
    train_part, _ = train_test_data_setup(data=dataset)

    # Normalise the train part before the decomposition
    normalizer = Pipeline(PrimaryNode('normalization'))
    normalizer.fit(train_part)
    normalized_train = normalizer.predict(train_part)

    pca = DataOperation(operation_type='pca')
    _, pca_output = pca.fit(data=normalized_train)

    assert pca_output.predict.shape[1] < dataset.features.shape[1]

Esempio n. 25

0

Mostra file

def test_svc_fit_correct(data_fixture, request):
    """Fit an SVC model on normalised train data and require a train ROC AUC
    of at least 0.95.

    :param data_fixture: name of the fixture providing the dataset
    :param request: pytest request object used to resolve the fixture by name
    """
    dataset = request.getfixturevalue(data_fixture)
    train_part, _ = train_test_data_setup(data=dataset)

    # Normalise the train part before fitting the classifier
    normalizer = Pipeline(PrimaryNode('normalization'))
    normalizer.fit(train_part)
    normalized_train = normalizer.predict(train_part)

    svc = Model(operation_type='svc')
    _, svc_output = svc.fit(data=normalized_train)

    roc_on_train = get_roc_auc(valid_data=train_part,
                               predicted_data=svc_output)
    assert roc_on_train >= 0.95

Esempio n. 26

0

Mostra file

def test_logger_manager_keeps_loggers_correctly():
    """Fit a four-depth pipeline after clearing the log manager cache and
    check that the manager reports exactly five loggers."""
    LogManager().clear_cache()

    pipeline = create_four_depth_pipeline()

    module_dir = str(os.path.dirname(__file__))
    csv_path = os.path.join(module_dir,
                            os.path.join('../data', 'advanced_classification.csv'))
    train_part, _ = train_test_data_setup(data=InputData.from_csv(csv_path))

    pipeline.fit(train_part)

    loggers_count = LogManager().debug['loggers_number']
    assert loggers_count == 5

Esempio n. 27

0

Mostra file

def test_pipeline_with_clusters_fit_correct():
    """Fit the generated pipeline on five synthetic datasets and require the
    arithmetic mean of the test ROC AUC values to exceed 0.5."""
    n_runs = 5
    roc_values = []

    # mean ROC AUC is analysed because of stochastic clustering
    for _ in range(n_runs):
        data = get_synthetic_input_data(n_samples=10000)

        pipeline = generate_pipeline()
        train_data, test_data = train_test_data_setup(data)

        pipeline.fit(input_data=train_data)
        _, roc_on_test = get_roc_auc_value(pipeline, train_data, test_data)
        roc_values.append(roc_on_test)

    # Average once over all runs: a running pairwise mean starting from 0
    # would weight the runs unequally and bias the result downwards
    mean_roc_on_test = np.mean(roc_values)

    roc_threshold = 0.5
    assert mean_roc_on_test > roc_threshold

Esempio n. 28

0

Mostra file

def fit_predict_one_fold(data, pipeline):
    """ Evaluate a pipeline on a single train/test split

    :param data: InputData for validation
    :param pipeline: pipeline to fit from scratch and validate
    :return: array with the test targets and array with the predictions
    """
    train_part, holdout_part = train_test_data_setup(data)

    # Capture the expected values before the pipeline touches the data
    actual = np.array(holdout_part.target)

    pipeline.fit_from_scratch(train_part)
    forecast = np.array(pipeline.predict(holdout_part).predict)

    return actual, forecast

Esempio n. 29

0

Mostra file

def data_setup():
    """Return train and test parts of a 100-row breast-cancer classification table.

    NOTE(review): features and targets are shuffled by two separate
    ``np.random.shuffle`` calls, so the original row pairing is not kept -
    confirm this is intended.

    :return: train_data, test_data
    """
    sample_size = 100

    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    np.random.shuffle(predictors)
    np.random.shuffle(response)
    predictors, response = predictors[:sample_size], response[:sample_size]

    # Wrap the arrays into InputData with a zero-based index
    table = InputData(features=predictors,
                      target=response,
                      idx=np.arange(0, len(predictors)),
                      task=Task(TaskTypesEnum.classification),
                      data_type=DataTypesEnum.table)

    train_data, test_data = train_test_data_setup(table)
    return train_data, test_data

Esempio n. 30

0

Mostra file

def test_model_predictions_on_train_test_random():
    """Checks that model can't predict correctly on random train and test datasets and
    the roc_auc_scores is close to 0.5.
    Both train and test data have no relations between features and target."""
    dataset = get_random_target_data(
        get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1))

    train_part, test_part = train_test_data_setup(dataset)

    pipeline = generate_pipeline()
    pipeline.fit(input_data=train_part)
    roc_values = get_roc_auc_value(pipeline, train_part, test_part)
    roc_auc_value_train, roc_auc_value_test = roc_values

    train_auc_thr, test_auc_thr = (get_auc_threshold(roc_auc_value_train),
                                   get_auc_threshold(roc_auc_value_test))

    assert test_auc_thr <= CORRECT_MODEL_AUC_THR
    assert train_auc_thr <= CORRECT_MODEL_AUC_THR