Example #1
def run_pipeline_from_automl(train_file_path: str,
                             test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    # total_seconds() avoids the day wrap-around of timedelta.seconds
    node_tpot.operation.params = {'max_run_time_sec': int(max_run_time.total_seconds())}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
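The snippet above assumes FEDOT's usual imports; a minimal header sketch (module paths follow the PrimaryNode/SecondaryNode-era FEDOT releases and should be checked against the installed version):

from datetime import timedelta

from sklearn.metrics import roc_auc_score as roc_auc

from fedot.core.data.data import InputData
from fedot.core.pipelines.node import PrimaryNode, SecondaryNode
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.repository.operation_types_repository import OperationTypesRepository

The later examples on this page rely on the same core names.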
Example #2
def test_forecast_with_exog():
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = \
        synthetic_with_exogenous_ts()

    # Source data for lagged node
    node_lagged = PrimaryNode('lagged')
    # Set window size for lagged transformation
    node_lagged.custom_params = {'window_size': window_size}
    # Exogenous variable for exog node
    node_exog = PrimaryNode('exog_ts_data_source')

    node_final = SecondaryNode('linear', nodes_from=[node_lagged, node_exog])
    pipeline = Pipeline(node_final)

    pipeline.fit(input_data=MultiModalData({
        'exog_ts_data_source': train_exog_ts,
        'lagged': train_source_ts
    }))

    forecast = pipeline.predict(
        input_data=MultiModalData({
            'exog_ts_data_source': predict_exog_ts,
            'lagged': predict_source_ts
        }))
    prediction = np.ravel(np.array(forecast.predict))

    assert tuple(prediction) == tuple(ts_test)
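Both `window_size` and `synthetic_with_exogenous_ts` are module-level fixtures that are not shown. A hedged sketch of what preparing such time-series inputs usually looks like (array contents and the `forecast_length` value are illustrative assumptions):

import numpy as np

from fedot.core.data.data import InputData
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams

window_size = 4  # assumed module-level constant used by the test

ts = np.arange(0, 100)
task = Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=10))
train_source_ts = InputData(idx=np.arange(len(ts)), features=ts, target=ts,
                            task=task, data_type=DataTypesEnum.ts)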
Example #3
def test_save_load_fitted_atomized_pipeline_correctly():
    pipeline = create_pipeline_with_several_nested_atomized_model()

    train_data, test_data = create_data_for_train()
    pipeline.fit(train_data)

    json_actual = pipeline.save(
        'test_save_load_fitted_atomized_pipeline_correctly')

    json_path_load = create_correct_path(
        'test_save_load_fitted_atomized_pipeline_correctly')

    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)
    json_expected = pipeline_loaded.save(
        'test_save_load_fitted_atomized_pipeline_correctly_loaded')

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json_expected

    before_save_predicted = pipeline.predict(test_data)

    pipeline_loaded.fit(train_data)
    after_save_predicted = pipeline_loaded.predict(test_data)

    before_save_mse = mean_squared_error(y_true=test_data.target,
                                         y_pred=before_save_predicted.predict)
    after_save_mse = mean_squared_error(y_true=test_data.target,
                                        y_pred=after_save_predicted.predict)

    assert after_save_mse <= before_save_mse
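`create_correct_path` is a test helper that is not shown; a purely hypothetical sketch of its role, assuming `Pipeline.save()` writes a folder with a JSON description named after its argument:

import os

def create_correct_path(save_name: str) -> str:
    # hypothetical: resolve the folder created by Pipeline.save()
    # into the path of the JSON file inside it
    return os.path.join(save_name, f'{save_name}.json')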
Example #4
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()), RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)

    print(f'ROC AUC for TPOT: {roc_auc_value}')

    node_scaling = PrimaryNode('scaling')
    node_bernb = SecondaryNode('bernb', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_bernb, node_scaling])
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
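Besides the FEDOT names sketched under Example #1, this comparison assumes TPOT's and scikit-learn's usual imports (`StackingEstimator` and `set_param_recursive` are standard parts of TPOT's exported pipelines, but the paths should be checked against the installed TPOT version):

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline

from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive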
Example #5
    def _get_metric_value(self, pipeline: Pipeline,
                          metric: MetricByTask) -> float:
        pipeline.fit(self._train_data, use_fitted=False)
        predicted = pipeline.predict(self._test_data)
        metric_value = metric.get_value(true=self._test_data,
                                        predicted=predicted)

        return metric_value
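A hedged call-site sketch for this method (the `MetricByTask` import path and constructor argument are assumptions based on FEDOT's metrics module):

from fedot.core.composer.metrics import MetricByTask
from fedot.core.repository.tasks import TaskTypesEnum

# `objective` stands for an instance of the surrounding class
metric = MetricByTask(TaskTypesEnum.classification)
value = objective._get_metric_value(pipeline=pipeline, metric=metric)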
Example #6
def test_data_preparation_for_multi_target_correct(multi_target_data_setup):
    train, test = multi_target_data_setup
    simple_pipeline = Pipeline(PrimaryNode('linear'))
    simple_pipeline.fit(input_data=train)

    source_shape = test.target.shape
    # Get converted data
    results, new_test = QualityMetric()._simple_prediction(
        simple_pipeline, test)
    number_elements = len(new_test.target)
    assert source_shape[0] * source_shape[1] == number_elements
Example #7
def test_pipeline_with_wrong_data():
    pipeline = Pipeline(PrimaryNode('linear'))
    data_seq = np.arange(0, 10)
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=10))

    data = InputData(idx=data_seq, features=data_seq, target=data_seq,
                     data_type=DataTypesEnum.ts, task=task)

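    # the series has only 10 points while forecast_length is also 10, so there
    # is no training part left; fit is expected to reject such data (assumed
    # intent of the check)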
    with pytest.raises(ValueError):
        pipeline.fit(data)
Example #8
def test_pipeline_unfit(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    pipeline = Pipeline(PrimaryNode('logit'))
    pipeline.fit(data)
    assert pipeline.is_fitted

    pipeline.unfit()
    assert not pipeline.is_fitted
    assert not pipeline.root_node.fitted_operation

    with pytest.raises(ValueError):
        pipeline.predict(data)
Example #9
def execute_pipeline_for_text_problem(train_data, test_data):
    node_text_clean = PrimaryNode('text_clean')
    node_tfidf = SecondaryNode('tfidf', nodes_from=[node_text_clean])
    model_node = SecondaryNode('multinb', nodes_from=[node_tfidf])
    pipeline = Pipeline(model_node)
    pipeline.fit(train_data)

    predicted = pipeline.predict(test_data)

    roc_auc_metric = roc_auc(y_true=test_data.target,
                             y_score=predicted.predict)

    return roc_auc_metric
Example #10
def test_import_custom_json_object_to_pipeline_and_fit_correctly_no_exception():
    current_dir = str(os.path.dirname(__file__))
    file = '../../data/test_custom_json_template.json'
    json_path_load = os.path.join(current_dir, file)

    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    pipeline.fit(train_data)

    pipeline.save('test_import_custom_json_object_to_pipeline_and_fit_correctly_no_exception')
Example #11
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Fit a scaling pipeline and use it to transform the train data
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    assert all(np.unique(train_predicted.predict) == [0, 1])
Example #12
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    data = data_setup
    train, test = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = PrimaryNode(operation_type='lda')
    third = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[first, second, third])

    pipeline = Pipeline()
    for node in [first, second, third, final]:
        pipeline.add_node(node)

    first = deepcopy(first)
    second = deepcopy(second)
    third = deepcopy(third)

    final_shuffled = SecondaryNode(operation_type='xgboost',
                                   nodes_from=[third, first, second])

    pipeline_shuffled = Pipeline()
    # change order of nodes in list
    for node in [final_shuffled, third, first, second]:
        pipeline_shuffled.add_node(node)

    train_predicted = pipeline.fit(input_data=train)

    train_predicted_shuffled = pipeline_shuffled.fit(input_data=train)

    # train results should be invariant
    assert pipeline.root_node.descriptive_id == pipeline_shuffled.root_node.descriptive_id
    assert np.equal(train_predicted.predict, train_predicted_shuffled.predict).all()

    test_predicted = pipeline.predict(input_data=test)
    test_predicted_shuffled = pipeline_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict, test_predicted_shuffled.predict).all()

    # change the parents' order for a node of the fitted pipeline
    nodes_for_change = pipeline.nodes[3].nodes_from
    pipeline.nodes[3].nodes_from = [nodes_for_change[2], nodes_for_change[0], nodes_for_change[1]]
    pipeline.nodes[3].unfit()
    pipeline.fit(train)
    test_predicted_re_shuffled = pipeline.predict(input_data=test)

    # predict results should be invariant
    assert np.equal(test_predicted.predict, test_predicted_re_shuffled.predict).all()
Example #13
def test_multi_modal_pipeline():
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)

    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
Example #14
    def composer_metric(self, metrics, train_data: Union[InputData,
                                                         MultiModalData],
                        test_data: Union[InputData, MultiModalData],
                        pipeline: Pipeline) -> Optional[Tuple[Any]]:
        try:
            validate(pipeline)
            pipeline.log = self.log

            if not isinstance(metrics, list):
                metrics = [metrics]

            if self.cache is not None:
                # TODO improve cache
                pipeline.fit_from_cache(self.cache)

            if not pipeline.is_fitted:
                self.log.debug(
                    f'Pipeline {pipeline.root_node.descriptive_id} fit started'
                )
                pipeline.fit(input_data=train_data,
                             time_constraint=self.composer_requirements.max_pipeline_fit_time)
                try:
                    self.cache.save_pipeline(pipeline)
                except Exception as ex:
                    self.log.info(f'Cache can not be saved: {ex}. Continue.')

            evaluated_metrics = ()
            for metric in metrics:
                if callable(metric):
                    metric_func = metric
                else:
                    metric_func = MetricsRepository().metric_by_id(metric)
                evaluated_metrics += (metric_func(pipeline, reference_data=test_data),)

            self.log.debug(
                f'Pipeline {pipeline.root_node.descriptive_id} with metrics: {list(evaluated_metrics)}'
            )

            # enforce memory cleaning
            pipeline.unfit()
            gc.collect()
        except Exception as ex:
            self.log.info(f'Pipeline assessment warning: {ex}. Continue.')
            evaluated_metrics = None

        return evaluated_metrics
Example #15
def test_svc_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Fit a scaling pipeline and use it to transform the train data
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    svc = Model(operation_type='svc')
    _, train_predicted = svc.fit(data=scaled_data)

    roc_on_train = get_roc_auc(valid_data=train_data,
                               predicted_data=train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Example #16
def test_pipeline_hierarchy_fit_correct(data_setup):
    data = data_setup
    train, _ = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[first])
    final = SecondaryNode(operation_type='logit', nodes_from=[second, third])

    pipeline = Pipeline()
    for node in [first, second, third, final]:
        pipeline.add_node(node)

    pipeline.unfit()
    train_predicted = pipeline.fit(input_data=train)

    assert pipeline.root_node.descriptive_id == (
        '((/n_logit_default_params;)/'
        'n_logit_default_params;;(/'
        'n_logit_default_params;)/'
        'n_logit_default_params;)/'
        'n_logit_default_params')

    assert pipeline.length == 4
    assert pipeline.depth == 3
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
Example #17
def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    # Fit a scaling pipeline and use it to transform the train data
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    pca = DataOperation(operation_type='pca')
    _, train_predicted = pca.fit(data=scaled_data)
    transformed_features = train_predicted.predict

    assert transformed_features.shape[1] < data.features.shape[1]
Example #18
def test_forecast_with_sparse_lagged():
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = \
        synthetic_with_exogenous_ts()

    # Source data for lagged node
    node_lagged = PrimaryNode('sparse_lagged')
    # Set window size for lagged transformation
    node_lagged.custom_params = {'window_size': window_size}

    node_final = SecondaryNode('linear', nodes_from=[node_lagged])
    pipeline = Pipeline(node_final)

    pipeline.fit(input_data=MultiModalData({'sparse_lagged': train_source_ts}))

    forecast = pipeline.predict(
        input_data=MultiModalData({'sparse_lagged': predict_source_ts}))
    # completing fit and predict without an exception is the point of this test
    assert forecast is not None
Example #19
def test_pipeline_with_datamodel_fit_correct(data_setup):
    data = data_setup
    train_data, test_data = train_test_data_setup(data)

    pipeline = Pipeline()

    node_data = PrimaryNode('logit')
    node_first = PrimaryNode('bernb')
    node_second = SecondaryNode('rf')

    node_second.nodes_from = [node_first, node_data]

    pipeline.add_node(node_data)
    pipeline.add_node(node_first)
    pipeline.add_node(node_second)

    pipeline.fit(train_data)
    results = np.asarray(probs_to_labels(pipeline.predict(test_data).predict))

    assert results.shape == test_data.target.shape
Example #20
def test_clean_text_preprocessing():
    test_text = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]

    input_data = InputData(features=test_text,
                           target=[0, 1, 1, 0],
                           idx=np.arange(0, len(test_text)),
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.text)

    preprocessing_pipeline = Pipeline(PrimaryNode('text_clean'))
    preprocessing_pipeline.fit(input_data)

    predicted_output = preprocessing_pipeline.predict(input_data)
    cleaned_text = predicted_output.predict

    assert len(test_text) == len(cleaned_text)
Example #21
def test_regression_pipeline_with_data_operation_fit_correct():
    data = get_synthetic_regression_data()
    train_data, test_data = train_test_data_setup(data)

    #           linear
    #       /           \
    #     ridge          |
    #       |            |
    # ransac_lin_reg   lasso
    #        \         /
    #          scaling
    node_scaling = PrimaryNode('scaling')
    node_ransac = SecondaryNode('ransac_lin_reg', nodes_from=[node_scaling])
    node_lasso = SecondaryNode('lasso', nodes_from=[node_scaling])
    node_ridge = SecondaryNode('ridge', nodes_from=[node_ransac])
    node_root = SecondaryNode('linear', nodes_from=[node_lasso, node_ridge])
    pipeline = Pipeline(node_root)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    assert results.predict.shape == test_data.target.shape
Example #22
def test_pipeline_with_custom_params_for_model(data_setup):
    data = data_setup
    custom_params = dict(n_neighbors=1,
                         weights='uniform',
                         p=1)

    first = PrimaryNode(operation_type='logit')
    second = PrimaryNode(operation_type='lda')
    final = SecondaryNode(operation_type='knn', nodes_from=[first, second])

    pipeline = Pipeline()
    pipeline.add_node(final)
    pipeline_default_params = deepcopy(pipeline)

    pipeline.root_node.custom_params = custom_params

    pipeline_default_params.fit(data)
    pipeline.fit(data)

    custom_params_prediction = pipeline.predict(data).predict
    default_params_prediction = pipeline_default_params.predict(data).predict

    assert not np.array_equal(custom_params_prediction, default_params_prediction)
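As a hedged follow-up (not part of the original test, and assuming `custom_params` can be read back the same way it is assigned), the effect of the assignment can also be checked directly:

assert pipeline.root_node.custom_params == custom_params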