Example #1
def test_vectorize_tfidf_strategy():
    train_text = [
        'This document first',
        'second This document',
        'And one third',
        'Is document first'
    ]
    test_text = ['document allow', 'spam not found', 'is are']

    train_data = InputData(idx=len(train_text),
                           features=train_text,
                           target=[0, 0, 1, 0],
                           data_type=DataTypesEnum.text,
                           task=Task(TaskTypesEnum.classification))
    test_data = InputData(idx=len(test_text),
                          features=test_text,
                          target=[0, 1, 0],
                          data_type=DataTypesEnum.text,
                          task=Task(TaskTypesEnum.classification))

    vectorizer = SkLearnTextVectorizeStrategy(operation_type='tfidf',
                                              params=None)

    vectorizer_fitted = vectorizer.fit(train_data)

    predicted = vectorizer.predict(trained_operation=vectorizer_fitted,
                                   predict_data=test_data,
                                   is_fit_chain_stage=False)
    predicted_labels = predicted.predict

    assert isinstance(vectorizer_fitted, TfidfVectorizer)
    assert len(predicted_labels[0]) == 7
Example #2
def test_multi_modal_data():
    num_samples = 5
    target = np.asarray([0, 0, 1, 0, 1])
    img_data = InputData(
        idx=range(num_samples),
        features=None,  # the real data is not passed in this test
        target=target,
        data_type=DataTypesEnum.text,
        task=Task(TaskTypesEnum.classification))
    tbl_data = InputData(
        idx=range(num_samples),
        features=None,  # the real data is not passed in this test
        target=target,
        data_type=DataTypesEnum.table,
        task=Task(TaskTypesEnum.classification))

    multi_modal = MultiModalData({
        'data_source_img': img_data,
        'data_source_table': tbl_data,
    })

    assert multi_modal.task.task_type == TaskTypesEnum.classification
    assert len(multi_modal.idx) == 5
    assert multi_modal.num_classes == 2
    assert np.array_equal(multi_modal.target, target)
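A minimal access sketch for the container above (assuming, as the construction from a plain dict suggests, that MultiModalData supports dictionary-style lookup by data source name):

multi_modal = MultiModalData({'data_source_img': img_data,
                              'data_source_table': tbl_data})
table_part = multi_modal['data_source_table']  # one modality, retrieved by its key
print(multi_modal.task.task_type)              # the shared task checked in the assertions above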
Example #3
def test_target_task_two_ignore_merge():
    """ The test runs an example of how different targets and tasks will be
    combined. Consider situation when one target should be untouched"""

    # Targets in different outputs
    labels_col = [[1], [1]]
    probabilities_col_1 = [[0.8], [0.7]]
    probabilities_col_2 = [[0.5], [0.5]]
    targets = np.array([labels_col,
                        probabilities_col_1,
                        probabilities_col_2])

    # Flags for targets
    main_targets = [True, False, False]

    # Tasks
    class_task = Task(TaskTypesEnum.classification)
    regr_task = Task(TaskTypesEnum.regression)
    tasks = [class_task, regr_task, regr_task]

    merger = TaskTargetMerger(None)
    target, is_main_target, task = merger.ignored_merge(targets, main_targets, tasks)

    assert is_main_target is True
    assert task.task_type is TaskTypesEnum.classification
Example #4
def test_api_check_data_correct():
    task_type, x_train, x_test, y_train, y_test = get_split_data()
    path_to_train, path_to_test = get_split_data_paths()
    train_data, test_data, threshold = get_dataset(task_type)
    string_data_input = _define_data(ml_task=Task(TaskTypesEnum.regression),
                                     features=path_to_train)
    array_data_input = _define_data(ml_task=Task(TaskTypesEnum.regression),
                                    features=x_train)
    fedot_data_input = _define_data(ml_task=Task(TaskTypesEnum.regression),
                                    features=train_data)
    assert (type(string_data_input) == InputData
            and type(array_data_input) == InputData
            and type(fedot_data_input) == InputData)
Example #5
def test_knn_reg_with_invalid_params_fit_correctly():
    """ The function define a chain with incorrect parameters in the K-nn regression
    model. During the training of the chain, the parameter 'n_neighbors' is corrected
    """
    samples_amount = 100
    k_neighbors = 150

    features_options = {'informative': 1, 'bias': 0.0}
    features_amount = 3
    x_data, y_data = regression_dataset(samples_amount=samples_amount,
                                        features_amount=features_amount,
                                        features_options=features_options,
                                        n_targets=1,
                                        noise=0.0,
                                        shuffle=True)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_data)),
                            features=x_data,
                            target=y_data,
                            task=task,
                            data_type=DataTypesEnum.table)

    # Prepare regression chain
    chain = get_knn_reg_chain(k_neighbors)

    # Fit it
    chain.fit(train_input)

    is_chain_was_fitted = True
    assert is_chain_was_fitted
Example #6
def array_to_input_data(features_array: np.ndarray,
                        target_array: np.ndarray,
                        task: Task = Task(TaskTypesEnum.classification)):
    data_type = _autodetect_data_type(task)
    idx = np.arange(len(features_array))

    return InputData(idx=idx, features=features_array, target=target_array, task=task, data_type=data_type)
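A short usage sketch for the helper above; the arrays are made up for illustration, and the task falls back to the classification default:

x = np.array([[0.1, 1.0], [0.5, 0.2], [0.9, 0.7]])
y = np.array([0, 1, 1])
data = array_to_input_data(features_array=x, target_array=y)
# data.idx is np.arange(3); the data type is autodetected from the task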
Example #7
def prepare_input_data(features, target):
    """ Function create InputData with features """
    x_data_train, x_data_test, y_data_train, y_data_test = train_test_split(
        features,
        target,
        test_size=0.2,
        shuffle=True,
        random_state=10)
    y_data_test = np.ravel(y_data_test)

    # Define regression task
    task = Task(TaskTypesEnum.regression)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_data_train)),
                            features=x_data_train,
                            target=y_data_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_data_test)),
                              features=x_data_test,
                              target=y_data_test,
                              task=task,
                              data_type=DataTypesEnum.table)

    return train_input, predict_input, task
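A usage sketch, assuming features and target are numpy arrays of matching length (the 0.2 test share is fixed inside the helper):

features = np.random.sample((100, 3))
target = np.random.sample((100, 1))
train_input, predict_input, task = prepare_input_data(features, target)
# 80 rows end up in train_input and 20 in predict_input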
Example #8
def run_refinement_scoring_example(train_path, test_path, with_tuning=False):
    """ Function launch example with error modeling for classification task

    :param train_path: path to the csv file with training sample
    :param test_path: path to the csv file with test sample
    :param with_tuning: is it need to tune pipelines or not
    """

    task = Task(TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_path, task=task)
    test_dataset = InputData.from_csv(test_path, task=task)

    # Get and fit pipelines
    no_decompose_c = get_non_refinement_pipeline()
    decompose_c = get_refinement_pipeline()

    no_decompose_c.fit(train_dataset)
    decompose_c.fit(train_dataset)

    # Check metrics for both pipelines
    display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline')
    display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline')

    if with_tuning:
        no_decompose_c.fine_tune_all_nodes(loss_function=roc_auc, loss_params=None,
                                           input_data=train_dataset, iterations=30)

        decompose_c.fine_tune_all_nodes(loss_function=roc_auc, loss_params=None,
                                        input_data=train_dataset, iterations=30)

        display_roc_auc(no_decompose_c, test_dataset, 'Non decomposition pipeline after tuning')
        display_roc_auc(decompose_c, test_dataset, 'With decomposition pipeline after tuning')
Example #9
def prepare_input_data(forecast_length, horizon):
    ts = np.array([
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 101
    ])

    # Define the forecasting task for the requested horizon
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=forecast_length))

    # To avoid data leak
    ts_train = ts[:-horizon]
    train_input = InputData(idx=np.arange(0, len(ts_train)),
                            features=ts_train,
                            target=ts_train,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(ts_train)
    end_forecast = start_forecast + forecast_length
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=ts,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input
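For the 32-element series above, calling the helper with a matching forecast length and holdout horizon gives a 30-element training sample and a two-element forecast index range. A sketch:

train_input, predict_input = prepare_input_data(forecast_length=2, horizon=2)
# train_input.idx covers 0..29, predict_input.idx covers 30..31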
Example #10
def get_time_series():
    """ Function returns time series for time series forecasting task """
    len_forecast = 100
    synthetic_ts = generate_synthetic_data(length=1000)

    train_data = synthetic_ts[:-len_forecast]
    test_data = synthetic_ts[-len_forecast:]

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data)),
                            features=train_data,
                            target=train_data,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_data)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_data,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, test_data
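A sketch of how the returned pieces fit together, reusing the get_ts_chain helper from Example #24 (the window size here is illustrative only):

train_input, predict_input, test_data = get_time_series()
chain = get_ts_chain(window_size=50)
chain.fit(train_input)
forecast = chain.predict(predict_input)
predicted = np.ravel(np.array(forecast.predict))  # same unpacking as in Example #13
# predicted can now be compared against the 100 held-out values in test_data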
Example #11
def get_small_classification_dataset():
    """ Function returns features and target for train and test classification models """
    features_options = {
        'informative': 1,
        'redundant': 0,
        'repeated': 0,
        'clusters_per_class': 1
    }
    x_train, y_train, x_test, y_test = get_classification_dataset(
        features_options=features_options,
        samples_amount=70,
        features_amount=4,
        classes_amount=2)
    # Define classification task
    task = Task(TaskTypesEnum.classification)

    # Prepare data to train the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train,
                            target=y_train,
                            task=task,
                            data_type=DataTypesEnum.table)

    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.table)

    return train_input, predict_input, y_test
Example #12
def get_classification_data(classes_amount: int):
    """ Function generate synthetic dataset for classification task

    :param classes_amount: amount of classes to predict

    :return train_input: InputData for model fit
    :return predict_input: InputData for predict stage
    """

    # Define options for dataset with 800 objects
    features_options = {'informative': 2, 'redundant': 1,
                        'repeated': 1, 'clusters_per_class': 1}
    x_train, y_train, x_test, y_test = get_classification_dataset(features_options,
                                                                  800, 4,
                                                                  classes_amount)
    y_train = y_train.reshape((-1, 1))
    y_test = y_test.reshape((-1, 1))

    # Define classification task
    task = Task(TaskTypesEnum.classification)

    # Prepare data to train and validate the model
    train_input = InputData(idx=np.arange(0, len(x_train)),
                            features=x_train, target=y_train,
                            task=task, data_type=DataTypesEnum.table)
    predict_input = InputData(idx=np.arange(0, len(x_test)),
                              features=x_test, target=y_test,
                              task=task, data_type=DataTypesEnum.table)

    return train_input, predict_input
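A usage sketch (assuming get_classification_dataset splits the 800 generated objects into train and test parts, as the variable names suggest):

train_input, predict_input = get_classification_data(classes_amount=2)
# both InputData objects carry table data with column-vector targets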
Example #13
    def __chain_fit_predict(self, timeseries_train: np.ndarray, len_gap: int):
        """
        The method makes a prediction as a sequence of elements based on a
        training sample. There are two main parts: fit model and predict.

        :param timeseries_train: part of the time series for training the model
        :param len_gap: number of elements in the gap
        :return: array without gaps
        """

        task = Task(TaskTypesEnum.ts_forecasting,
                    TsForecastingParams(forecast_length=len_gap))

        input_data = InputData(idx=np.arange(0, len(timeseries_train)),
                               features=timeseries_train,
                               target=timeseries_train,
                               task=task,
                               data_type=DataTypesEnum.ts)

        # Making predictions for the missing part in the time series
        self.chain.fit_from_scratch(input_data)

        # "Test data" for making prediction for a specific length
        start_forecast = len(timeseries_train)
        end_forecast = start_forecast + len_gap
        idx_test = np.arange(start_forecast, end_forecast)
        test_data = InputData(idx=idx_test,
                              features=timeseries_train,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

        predicted_values = self.chain.predict(test_data)
        predicted_values = np.ravel(np.array(predicted_values.predict))
        return predicted_values
Example #14
def get_synthetic_ts_data_period(n_steps=6000,
                                 forecast_length=1,
                                 max_window_size=50,
                                 with_exog: bool = True) -> InputData:
    x1 = np.arange(0, n_steps) / 10
    x2 = np.arange(0, n_steps) + 1

    x1_exog = np.arange(0, n_steps + forecast_length) / 10
    x2_exog = np.arange(0, n_steps + forecast_length) + 1

    simulated_data = x1 * 0.005 - x2 * 0.001
    periodicity = np.sin(x1 * 0.4)
    random = np.random.normal(0, 0.1, n_steps)
    simulated_data = simulated_data + periodicity + random

    task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=True))

    exog_features = np.asarray([x1_exog, x2_exog]).T
    if not with_exog:
        # without exogenous variables the target series itself serves as features
        exog_features = None
    input_data = InputData(idx=np.arange(0, n_steps),
                           features=exog_features,
                           target=simulated_data,
                           task=task,
                           data_type=DataTypesEnum.ts)
    return input_data
Example #15
def test_composer_with_cv_optimization_correct():
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose, dataset_to_validate = get_data(task)

    models_repo = OperationTypesRepository()
    available_model_types, _ = models_repo.suitable_operation(
        task_type=task.task_type, tags=['simple'])

    metric_function = [
        ClassificationMetricsEnum.ROCAUC_penalty,
        ClassificationMetricsEnum.accuracy, ClassificationMetricsEnum.logloss
    ]

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        timeout=timedelta(minutes=1),
        num_of_generations=2,
        cv_folds=3)

    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    pipeline_evo_composed = composer.compose_pipeline(data=dataset_to_compose,
                                                      is_visualise=False)[0]

    assert isinstance(pipeline_evo_composed, Pipeline)

    pipeline_evo_composed.fit(input_data=dataset_to_compose)
    predicted = pipeline_evo_composed.predict(dataset_to_validate)
    roc_on_valid_evo_composed = roc_auc(y_score=predicted.predict,
                                        y_true=dataset_to_validate.target)

    assert roc_on_valid_evo_composed > 0
Example #16
def test_gp_composer_build_pipeline_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    task = Task(TaskTypesEnum.classification)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC

    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)

    builder = GPComposerBuilder(task).with_requirements(req).with_metrics(
        metric_function)
    gp_composer = builder.build()
    pipeline_gp_composed = gp_composer.compose_pipeline(
        data=dataset_to_compose)

    pipeline_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = pipeline_gp_composed.predict(dataset_to_validate)

    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    assert roc_on_valid_gp_composed > 0.6
Example #17
def data_setup():
    task = Task(TaskTypesEnum.classification)
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    # shuffle features and labels together to keep their correspondence
    shuffled_indices = np.random.permutation(len(predictors))
    predictors = predictors[shuffled_indices[:100]]
    response = response[shuffled_indices[:100]]

    input_data = InputData(idx=np.arange(0, len(predictors)),
                           features=predictors,
                           target=response,
                           task=task,
                           data_type=DataTypesEnum.table)
    train_data, test_data = train_test_data_setup(data=input_data)
    train_data_x = train_data.features
    test_data_x = test_data.features
    train_data_y = train_data.target
    test_data_y = test_data.target

    train_data = InputData(features=train_data_x,
                           target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=task,
                           data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x,
                          target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=task,
                          data_type=DataTypesEnum.table)
    return train_data, test_data
Example #18
def test_gp_composer_with_start_depth(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    available_model_types = ['xgboost', 'knn']
    quality_metric = ClassificationMetricsEnum.ROCAUC
    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=5,
                                 pop_size=5,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5,
                                 start_depth=2)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=scheme_type)
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            quality_metric).with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()
    composer.compose_pipeline(data=dataset_to_compose, is_visualise=True)
    assert all(
        [ind.graph.depth <= 3 for ind in composer.history.individuals[0]])
    assert composer.optimiser.max_depth == 5
Example #19
def get_synthetic_ts_data_period(n_steps=1000,
                                 forecast_length=1,
                                 max_window_size=50):
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    periodicity = np.sin(x1 / 50)

    simulated_data = simulated_data + periodicity

    task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False))

    data = InputData(idx=np.arange(0, n_steps),
                     features=np.asarray([x1, x2]).T,
                     target=simulated_data,
                     task=task,
                     data_type=DataTypesEnum.ts)

    return train_test_data_setup(data)
Example #20
def test_parameter_free_composer_build_chain_correct(data_fixture, request):
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)

    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)
    opt_params = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            metric_function).with_optimiser_parameters(opt_params)
    gp_composer = builder.build()
    chain_gp_composed = gp_composer.compose_chain(data=dataset_to_compose)

    chain_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = chain_gp_composed.predict(dataset_to_validate)

    roc_on_valid_gp_composed = roc_auc(y_true=dataset_to_validate.target,
                                       y_score=predicted_gp_composed.predict)

    assert roc_on_valid_gp_composed > 0.6
Example #21
def prepare_train_test_input(train_part, len_forecast):
    """ Function return prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_part: time series which can be used as predictors for train

    :return train_input: Input Data for fit
    :return predict_input: Input Data for predict
    :return task: Time series forecasting task with parameters
    """

    # Specify the task to solve
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_part)),
                            features=train_part,
                            target=train_part,
                            task=task,
                            data_type=DataTypesEnum.ts)

    start_forecast = len(train_part)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=train_part,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
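A fit-and-forecast sketch built on the helper above; the training series is illustrative, and get_ts_chain is borrowed from Example #24:

train_part = np.random.sample(500)    # illustrative training series
train_input, predict_input, task = prepare_train_test_input(train_part, len_forecast=50)
chain = get_ts_chain(window_size=30)  # window size chosen for illustration only
chain.fit(train_input)
forecast = chain.predict(predict_input)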
Example #22
def prepare_input_data(len_forecast, train_data_features, train_data_target,
                       test_data_features):
    """ Function return prepared data for fit and predict

    :param len_forecast: forecast length
    :param train_data_features: time series which can be used as predictors for train
    :param train_data_target: time series which can be used as target for train
    :param test_data_features: time series which can be used as predictors for prediction

    :return train_input: InputData for fit
    :return predict_input: InputData for predict
    :return task: time series forecasting task with parameters
    """

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(train_data_features)),
                            features=train_data_features,
                            target=train_data_target,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Determine indices for forecast
    start_forecast = len(train_data_features)
    end_forecast = start_forecast + len_forecast
    predict_input = InputData(idx=np.arange(start_forecast, end_forecast),
                              features=test_data_features,
                              target=None,
                              task=task,
                              data_type=DataTypesEnum.ts)

    return train_input, predict_input, task
Example #23
def test_data_merge_function():
    """ Test check is the merge function can find appropriate intersections of
    indices. Set {idx_2} ∈ set {idx_1}, so intersection must be = idx_2
    """

    idx_1 = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    idx_2 = [2, 3, 4, 5, 6, 7, 8, 9]

    task = Task(TaskTypesEnum.regression)
    generated_target = np.random.sample((len(idx_1), 1))
    generated_features = np.random.sample((len(idx_1), 2))

    list_with_outputs = []
    for idx in [idx_1, idx_2]:
        output_data = OutputData(idx=idx,
                                 features=generated_features[idx, :],
                                 predict=generated_target[idx, :],
                                 task=task,
                                 target=generated_target[idx, :],
                                 data_type=DataTypesEnum.table)
        list_with_outputs.append(output_data)

    idx, features, target = DataMerger(list_with_outputs).merge()

    assert tuple(idx) == tuple(idx_2)
Example #24
def test_lagged_with_invalid_params_fit_correctly():
    """ The function define a chain with incorrect parameters in the lagged
    transformation. During the training of the chain, the parameter 'window_size'
    is corrected
    """
    window_size = 600
    len_forecast = 50

    # The length of the time series is 500 elements
    project_root_path = str(project_root())
    file_path = os.path.join(project_root_path,
                             'test/data/short_time_series.csv')
    df = pd.read_csv(file_path)
    time_series = np.array(df['sea_height'])

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=len_forecast))

    train_input = InputData(idx=np.arange(0, len(time_series)),
                            features=time_series,
                            target=time_series,
                            task=task,
                            data_type=DataTypesEnum.ts)

    # Get chain with lagged transformation in it
    chain = get_ts_chain(window_size)

    # Fit it
    chain.fit(train_input)

    is_chain_was_fitted = True
    assert is_chain_was_fitted
Example #25
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # search for models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type, tags=['simple'])

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    builder = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function)
    composer = builder.build()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
Example #26
def test_target_data_from_csv_correct():
    """ Function tests two ways of processing target columns in "from_csv"
    method
    """
    test_file_path = str(os.path.dirname(__file__))
    file = '../../data/multi_target_sample.csv'
    path = os.path.join(test_file_path, file)
    task = Task(TaskTypesEnum.regression)

    # Process one column
    target_column = '1_day'
    one_column_data = InputData.from_csv(path,
                                         target_columns=target_column,
                                         columns_to_drop=['date'],
                                         task=task)

    # Process multiple target columns
    target_columns = [
        '1_day', '2_day', '3_day', '4_day', '5_day', '6_day', '7_day'
    ]
    seven_columns_data = InputData.from_csv(path,
                                            target_columns=target_columns,
                                            columns_to_drop=['date'],
                                            task=task)

    assert one_column_data.target.shape == (499, 1)
    assert seven_columns_data.target.shape == (499, 7)
Example #27
def get_iris_data() -> InputData:
    synthetic_data = load_iris()
    input_data = InputData(idx=np.arange(0, len(synthetic_data.target)),
                           features=synthetic_data.data,
                           target=synthetic_data.target,
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
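The loader pairs naturally with the split helper used throughout these examples. A sketch:

iris_data = get_iris_data()
train_data, test_data = train_test_data_setup(data=iris_data)
# both parts keep the classification task and the table data type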
Example #28
def get_cholesterol_data():
    file_path = 'cases/data/cholesterol/cholesterol.csv'
    full_path = join(str(project_root()), file_path)
    task = Task(TaskTypesEnum.regression)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
Example #29
def get_kc2_data():
    file_path = 'cases/data/kc2/kc2.csv'
    full_path = join(str(project_root()), file_path)
    task = Task(TaskTypesEnum.classification)
    data = InputData.from_csv(full_path, task=task)
    train, test = train_test_data_setup(data)

    return train, test
Example #30
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(current_file_path):
        # TODO change hyperparameters to actual from variable
        model = fit_tpot(train_data,
                         models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(
            output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target
    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
        predicted_labels = predicted
    elif task == TaskTypesEnum.classification:
        predicted, predicted_labels = predict_tpot_class(
            imported_model, predict_data)
    else:
        raise NotImplementedError('Incorrect type of ML task')

    print(f'BEST_model: {imported_model}')

    return true_target, predicted, predicted_labels