Example #1
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = datetime.timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # search for the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    composer = GPComposer()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
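The composed chain returned by get_model can then be reused for prediction. A minimal usage sketch, assuming the FEDOT imports from the example above are in scope and that a classification CSV exists at the placeholder path:

train_csv = 'data/simple_classification.csv'  # placeholder path

# compose a chain within a one-minute budget, then reuse it for prediction
chain = get_model(train_csv, cur_lead_time=datetime.timedelta(minutes=1))

new_data = InputData.from_csv(train_csv,
                              task=Task(TaskTypesEnum.classification))
predicted = chain.predict(new_data)
print(predicted.predict[:5])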
Example #2
def test_dummy_composer_flat_chain_build_correct():
    composer = DummyComposer(DummyChainTypeEnum.flat)
    empty_data = InputData(idx=np.zeros(1),
                           features=np.zeros(1),
                           target=np.zeros(1),
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)

    primary = ['logit']
    secondary = ['logit', 'xgboost']

    composer_requirements = ComposerRequirements(primary=primary,
                                                 secondary=secondary)
    new_chain = composer.compose_chain(
        data=empty_data,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=None)

    assert len(new_chain.nodes) == 3
    assert isinstance(new_chain.nodes[0], PrimaryNode)
    assert isinstance(new_chain.nodes[1], SecondaryNode)
    assert isinstance(new_chain.nodes[2], SecondaryNode)
    assert new_chain.nodes[1].nodes_from[0] is new_chain.nodes[0]
    assert new_chain.nodes[2].nodes_from[0] is new_chain.nodes[1]
    assert new_chain.nodes[0].nodes_from is None
Example #3
def data_setup():
    predictors, response = load_breast_cancer(return_X_y=True)
    np.random.seed(1)
    # use one shared permutation: two independent shuffle() calls would
    # break the feature-target correspondence
    shuffled_idx = np.random.permutation(len(response))
    predictors = predictors[shuffled_idx][:100]
    response = response[shuffled_idx][:100]
    train_data_x, test_data_x = split_train_test(predictors)
    train_data_y, test_data_y = split_train_test(response)
    train_data = InputData(features=train_data_x, target=train_data_y,
                           idx=np.arange(0, len(train_data_y)),
                           task=Task(TaskTypesEnum.classification), data_type=DataTypesEnum.table)
    test_data = InputData(features=test_data_x, target=test_data_y,
                          idx=np.arange(0, len(test_data_y)),
                          task=Task(TaskTypesEnum.classification), data_type=DataTypesEnum.table)
    return train_data, test_data
Example #4
def chain_with_incorrect_task_type():
    first = PrimaryNode(model_type='linear')
    second = PrimaryNode(model_type='linear')
    final = SecondaryNode(model_type='kmeans', nodes_from=[first, second])

    chain = Chain(final)

    return chain, Task(TaskTypesEnum.classification)
Example #5
def get_iris_data() -> InputData:
    synthetic_data = load_iris()
    input_data = InputData(idx=np.arange(0, len(synthetic_data.target)),
                           features=synthetic_data.data,
                           target=synthetic_data.target,
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
Example #6
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(current_file_path):
        # TODO change hyperparameters to actual from variable
        model = fit_tpot(train_data,
                         models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(
            output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target
    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
    elif task == TaskTypesEnum.classification:
        predicted = predict_tpot_class(imported_model, predict_data)
    else:
        print('Incorrect ML task type')
        raise NotImplementedError()

    print(f'BEST_model: {imported_model}')

    return true_target, predicted
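For reference, a hedged invocation sketch for run_tpot. The real ExecutionParams class is defined elsewhere in the benchmark code, so the namedtuple below is a hypothetical stand-in that only mirrors the four attributes the function reads; the CSV paths are placeholders:

from collections import namedtuple

# hypothetical stand-in; the real ExecutionParams class lives elsewhere
ExecutionParamsStub = namedtuple(
    'ExecutionParamsStub', ['train_file', 'test_file', 'case_label', 'task'])

params = ExecutionParamsStub(train_file='data/train.csv',  # placeholder
                             test_file='data/test.csv',    # placeholder
                             case_label='demo',
                             task=TaskTypesEnum.classification)
true_target, predicted = run_tpot(params)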
Example #7
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # search for the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    # generate the optimal chain by composition - the most time-consuming step
    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        is_visualise=False)

    chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose,
                                               iterations=50)

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        ComposerVisualiser.visualise(chain_evo_composed)

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
Example #8
def data_setup() -> InputData:
    predictors, response = load_iris(return_X_y=True)
    np.random.seed(1)
    # use one shared permutation so features and targets stay aligned
    shuffled_idx = np.random.permutation(len(response))
    predictors = predictors[shuffled_idx][:100]
    response = response[shuffled_idx][:100]
    data = InputData(features=predictors, target=response, idx=np.arange(0, 100),
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
    return data
Example #9
def get_synthetic_input_data(n_samples=10000,
                             n_features=10,
                             random_state=None) -> InputData:
    synthetic_data = make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         random_state=random_state)
    input_data = InputData(idx=np.arange(0, len(synthetic_data[1])),
                           features=synthetic_data[0],
                           target=synthetic_data[1],
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
Example #10
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     is_visualise=False):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain = get_composite_lstm_chain()

    chain_simple = Chain()
    node_single = PrimaryNode('ridge')
    chain_simple.add_node(node_single)

    chain_lstm = Chain()
    node_lstm = PrimaryNode('lstm')
    chain_lstm.add_node(node_lstm)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise)

    chain_lstm.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        chain_lstm.predict(dataset_to_validate), dataset_to_validate,
        f'full-lstm-only_{forecast_length}', is_visualise)

    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate), dataset_to_validate,
        f'full-simple_{forecast_length}', is_visualise)

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
Example #11
def classification_dataset_with_redunant_features(n_samples=1000,
                                                  n_features=100,
                                                  n_informative=5
                                                  ) -> InputData:
    synthetic_data = make_classification(n_samples=n_samples,
                                         n_features=n_features,
                                         n_informative=n_informative)

    input_data = InputData(idx=np.arange(0, len(synthetic_data[1])),
                           features=synthetic_data[0],
                           target=synthetic_data[1],
                           task=Task(TaskTypesEnum.classification),
                           data_type=DataTypesEnum.table)
    return input_data
Example #12
def classification_dataset():
    samples = 1000
    x = 10.0 * np.random.rand(samples, ) - 5.0
    x = np.expand_dims(x, axis=1)
    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
    threshold = 0.5
    classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
    classes = np.expand_dims(classes, axis=1)
    data = InputData(features=x,
                     target=classes,
                     idx=np.arange(0, len(x)),
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)

    return data
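The labelling rule above reduces to the sign of x: y = 1/(1 + exp(1/x)) is at or below the 0.5 threshold exactly when 1/x >= 0, i.e. for positive x. A quick standalone check:

import numpy as np

x = np.array([[-2.0], [-0.5], [0.5], [2.0]])
y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
classes = (y > 0.5).astype(float)
print(classes.ravel())  # [1. 1. 0. 0.] - positive x maps to class 0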
Example #13
def get_synthetic_ts_data(n_steps=10000) -> InputData:
    simulated_data = ArmaProcess().generate_sample(nsample=n_steps)
    x1 = np.arange(0, n_steps)
    x2 = np.arange(0, n_steps) + 1

    simulated_data = simulated_data + x1 * 0.0005 - x2 * 0.0001

    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=1, max_window_size=2))

    input_data = InputData(idx=np.arange(0, n_steps),
                           features=np.asarray([x1, x2]).T,
                           target=simulated_data,
                           task=task,
                           data_type=DataTypesEnum.ts)
    return input_data
Example #14
def output_dataset():
    task = Task(TaskTypesEnum.classification)

    samples = 1000
    x = 10.0 * np.random.rand(samples, ) - 5.0
    x = np.expand_dims(x, axis=1)
    threshold = 0.5
    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))
    classes = np.array([0.0 if val <= threshold else 1.0 for val in y])
    classes = np.expand_dims(classes, axis=1)
    data = OutputData(idx=np.arange(0, samples),  # idx must cover all 1000 samples
                      features=x,
                      predict=classes,
                      task=task,
                      data_type=DataTypesEnum.table)

    return data
Example #15
def test_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/simple_classification.csv'
    task = Task(TaskTypesEnum.classification)
    df = pd.read_csv(os.path.join(test_file_path, file))
    data_array = np.array(df).T
    features = data_array[1:-1].T
    target = data_array[-1]
    idx = data_array[0]
    expected_features = InputData(
        features=features,
        target=target,
        idx=idx,
        task=task,
        data_type=DataTypesEnum.table).features
    actual_features = InputData.from_csv(os.path.join(test_file_path,
                                                      file)).features
    # compare element-wise; reducing each array with .all() first would make
    # the assertion vacuous
    assert np.array_equal(expected_features, actual_features)
Example #16
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {
                'informative': features_amount,
                'redundant': 0,
                'repeated': 0,
                'clusters_per_class': 1
            }
            features, target = synthetic_dataset(
                samples_amount=samples,
                features_amount=features_amount,
                classes_amount=classes,
                features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features,
                               target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
Example #17
def from_csv(file_path,
             delimiter=',',
             task: Task = Task(TaskTypesEnum.classification),
             data_type: DataTypesEnum = DataTypesEnum.table,
             with_target=True):
    # body of InputData.from_csv (called as a static method elsewhere in
    # these examples); the decorator is elided in this excerpt
    data_frame = pd.read_csv(file_path, sep=delimiter)
    data_frame = _convert_dtypes(data_frame=data_frame)
    data_array = np.array(data_frame).T
    idx = data_array[0]
    if with_target:
        features = data_array[1:-1].T
        # np.float was removed in NumPy 1.24; use the builtin float instead
        target = data_array[-1].astype(float)
    else:
        features = data_array[1:].T
        target = None
    return InputData(idx=idx,
                     features=features,
                     target=target,
                     task=task,
                     data_type=data_type)
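The parsing above implies a fixed column layout: the first column is the index, the last column is the target (when with_target=True), and everything between them is a feature. A small sketch of a file that satisfies this contract (placeholder path; pandas assumed available as in the snippet):

import pandas as pd

# column order matters: index first, features in the middle, target last
pd.DataFrame({'idx': [0, 1, 2],
              'feature_a': [0.1, 0.5, 0.9],
              'feature_b': [1.0, 2.0, 3.0],
              'target': [0, 1, 1]}).to_csv('tiny.csv', index=False)

data = InputData.from_csv('tiny.csv')  # defaults: classification, table
assert data.features.shape == (3, 2)
assert data.target.shape == (3,)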
Example #18
def synthetic_benchmark_dataset(samples_amount: int,
                                features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset that was obtained using
    the (TODO: add. reference) proposed fitting schema.
    :param samples_amount: Total amount of samples in the resulted dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset.
    :param features_options: features options in key-value suitable for classification_dataset.
    :param fitted_chain: Chain with separately fitted models.
    If None then 3-level balanced tree were fitted and taken as a default.
    :return: Benchmark dataset that is ready to be used by Chain.
    """
    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    if classes_amount != 2:
        raise NotImplementedError(
            'Only binary classification tasks are supported')

    features, target = classification_dataset(
        samples_amount=samples_amount,
        features_amount=features_amount,
        classes_amount=classes_amount,
        features_options=features_options)
    target = np.expand_dims(target, axis=1)

    task = Task(TaskTypesEnum.classification)
    samples_idxs = np.arange(0, samples_amount)

    train = InputData(idx=samples_idxs,
                      features=features,
                      target=target,
                      task=task,
                      data_type=DataTypesEnum.table)

    synth_target = fitted_chain.predict(input_data=train).predict
    synth_labels = _to_labels(synth_target)
    data_synth_train = InputData(idx=np.arange(0, samples_amount),
                                 features=features,
                                 target=synth_labels,
                                 task=task,
                                 data_type=DataTypesEnum.table)

    # TODO: fix preproc issues

    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    features, target = classification_dataset(
        samples_amount=samples_amount,
        features_amount=features_amount,
        classes_amount=classes_amount,
        features_options=features_options)
    target = np.expand_dims(target, axis=1)
    test = InputData(idx=samples_idxs,
                     features=features,
                     target=target,
                     data_type=DataTypesEnum.table,
                     task=task)
    synth_target = fitted_chain.predict(input_data=test).predict
    synth_labels = _to_labels(synth_target)
    data_synth_final = InputData(idx=samples_idxs,
                                 features=features,
                                 data_type=DataTypesEnum.table,
                                 target=synth_labels,
                                 task=task)

    return data_synth_final
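A hedged call sketch for the generator above; the amounts are illustrative and DEFAULT_OPTIONS is taken from the same module:

benchmark_data = synthetic_benchmark_dataset(samples_amount=1000,
                                             features_amount=10)
# the returned target comes from the fitted chain's predictions,
# not from the raw classification_dataset generator
print(benchmark_data.features.shape, benchmark_data.target.shape)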
Example #19
def regression_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/advanced_regression.csv'
    data = InputData.from_csv(os.path.join(test_file_path, file))
    data.task = Task(TaskTypesEnum.regression)
    return data
Example #20
def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length,
                                max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts,
                                          delimiter=',')

    # validation reuses the training series: shifted windows are taken
    # from it at each forecasting step below
    full_path_test = os.path.join(str(project_root()), train_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts,
                                             delimiter=',')

    full_path_train_crm = os.path.join(str(project_root()),
                                       train_file_path_crm)
    dataset_to_train_crm = InputData.from_csv(full_path_train_crm,
                                              task=task_to_solve,
                                              data_type=DataTypesEnum.ts,
                                              delimiter=',')

    dataset_to_validate_crm = copy(dataset_to_train_crm)

    prediction_full = None
    prediction_full_crm = None
    prediction_full_crm_opt = None

    forecast_window_shift_num = 4

    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        # each window is 3 * depth points wide and slides forward by depth
        start = depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_train_local = dataset_to_train.subset(start, end)
        dataset_to_train_local_crm = dataset_to_train_crm.subset(start, end)

        dataset_to_validate_local = dataset_to_validate.subset(
            start + depth, end + depth)
        dataset_to_validate_local_crm = dataset_to_validate_crm.subset(
            start + depth, end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=dataset_to_train_local,
                                      verbose=False)
        chain_simple_crm.fit_from_scratch(
            input_data=dataset_to_train_local_crm, verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=dataset_to_train_local_crm,
                                       verbose=False)

        prediction = chain_simple.predict(dataset_to_validate_local)
        prediction_crm = chain_simple_crm.predict(
            dataset_to_validate_local_crm)
        prediction_crm_opt = chain_crm_opt.predict(
            dataset_to_validate_local_crm)

        prediction_full = merge_datasets(prediction_full, prediction,
                                         forecasting_step)
        prediction_full_crm = merge_datasets(prediction_full_crm,
                                             prediction_crm, forecasting_step)
        prediction_full_crm_opt = merge_datasets(prediction_full_crm_opt,
                                                 prediction_crm_opt,
                                                 forecasting_step)

    rmse_on_valid_simple = calculate_validation_metric(
        prediction_full, prediction_full_crm, prediction_full_crm_opt,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}')
    print(f'RMSE ML: {round(rmse_on_valid_simple[1])}')
    print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}')
    print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}')

    print(f'DTW CRM: {round(rmse_on_valid_simple[4])}')
    print(f'DTW ML: {round(rmse_on_valid_simple[5])}')
    print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}')
    print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}')

    return rmse_on_valid_simple
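The window arithmetic in the loop above is easier to see with concrete numbers. A standalone sketch (no FEDOT imports) that prints the index ranges for depth=100 and four steps; validation uses the same window shifted forward by depth:

depth = 100
forecast_window_shift_num = 4

for forecasting_step in range(forecast_window_shift_num):
    start = depth * forecasting_step
    end = depth * 2 + depth * (forecasting_step + 1)
    print(f'step {forecasting_step}: train [{start}, {end}), '
          f'validate [{start + depth}, {end + depth})')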
Example #21
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type}_{cur_lead_time}_{metric}'
    loaded_model = load_fedot_model(saved_model_name)

    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # search for the models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        metric_function = MetricsRepository().metric_by_id(metric)

        composer_requirements = GPComposerRequirements(
            primary=available_model_types,
            secondary=available_model_types,
            max_arity=3,
            max_depth=3,
            pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8,
            mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time))

        # Create GP-based composer
        composer = GPComposer()

        # generate the optimal chain by composition - the most time-consuming step
        chain_evo_composed = composer.compose_chain(
            data=dataset_to_compose,
            initial_chain=None,
            composer_requirements=composer_requirements,
            metrics=metric_function,
            is_visualise=False)
        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50)
        chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)
        save_fedot_model(chain_evo_composed, saved_model_name)
    else:
        chain_evo_composed = loaded_model

    evo_predicted = chain_evo_composed.predict(dataset_to_validate)

    return dataset_to_validate.target, evo_predicted.predict
Example #22
def classification_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = os.path.join('data', 'advanced_classification.csv')
    return InputData.from_csv(os.path.join(test_file_path, file), task=Task(TaskTypesEnum.classification))
Example #23
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)

    ref_chain = get_composite_lstm_chain()

    available_model_types_primary = ['trend_data_model', 'residual_data_model']

    available_model_types_secondary = [
        'rfr', 'linear', 'ridge', 'lasso', 'additive_data_model'
    ]

    composer = FixedStructureComposer()

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_primary,
        secondary=available_model_types_secondary,
        max_arity=2,
        max_depth=4,
        pop_size=10,
        num_of_generations=10,
        crossover_prob=0,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20))

    chain = composer.compose_chain(data=dataset_to_train,
                                   initial_chain=ref_chain,
                                   composer_requirements=composer_requirements,
                                   metrics=metric_function,
                                   is_visualise=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate),
        dataset_to_validate,
        f'full-composite_{forecast_length}',
        is_visualise=with_visualisation)

    print(f'RMSE composite: {rmse_on_valid}')

    return rmse_on_valid
Example #24
def synthetic_forecasting_problem(forecast_length: int, max_window_size: int):
    task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))
    ts_len = 10
    ts = np.asarray(range(ts_len))

    exog_variable = 10 + np.asarray(range(ts_len)).reshape(-1, 1)

    ts_data = InputData(idx=range(len(ts)),
                        features=exog_variable,
                        target=ts,
                        task=task,
                        data_type=DataTypesEnum.table)

    # shape is (5, 4, 1)
    exog_variable_3d = np.asarray([[[10], [11], [12], [13]],
                                   [[11], [12], [13], [14]],
                                   [[12], [13], [14], [15]],
                                   [[13], [14], [15], [16]],
                                   [[14], [15], [16], [17]]])

    lagged_target_3d_as_feature = np.asarray([[[0], [1], [2], [3]],
                                              [[1], [2], [3], [4]],
                                              [[2], [3], [4], [5]],
                                              [[3], [4], [5], [6]],
                                              [[4], [5], [6], [7]]])

    # now we concatenate the lagged exog variables with the lagged target,
    # so features become (5, 4, 2), i.e.
    # (n-max_window_size-forecast_length+1, max_window_size, amount_exog_features+target_shape)
    feature_3d = np.concatenate(
        (exog_variable_3d, lagged_target_3d_as_feature), axis=2)

    # target is (5, 4, 1)
    # (n-max_window_size-forecast_length+1, max_window_size, target_shape)
    # So the lstm returns predictions of the same shape.
    # To keep only the forecast values, take
    # pred_3d[:, -forecast_length:, :]
    # i.e. the values from the last `forecast_length` timestamps
    target_3d = np.asarray([[[2], [3], [4], [5]], [[3], [4], [5], [6]],
                            [[4], [5], [6], [7]], [[5], [6], [7], [8]],
                            [[6], [7], [8], [9]]])

    ts_data_3d = InputData(idx=range(len(ts)),
                           features=feature_3d,
                           target=target_3d,
                           task=task,
                           data_type=DataTypesEnum.ts_lagged_3d)

    # the lagged format keeps only the values to forecast (future values) in the target;
    # this format is convenient to use with classic regression modules
    # shape is (5, 2), i.e. (n-max_window_size-forecast_length+1, forecast_length * target_shape)
    target_lagged = np.asarray([[4, 5], [5, 6], [6, 7], [7, 8], [8, 9]])

    # in the lagged format the features match feature_3d, but flattened to 2d
    # with shape (n-max_window_size-forecast_length+1, max_window_size * (amount_exog_features + target_shape))
    features_lagged = feature_3d.reshape(feature_3d.shape[0], -1)

    ts_data_lagged = InputData(idx=range(len(ts)),
                               features=features_lagged,
                               target=target_lagged,
                               task=task,
                               data_type=DataTypesEnum.ts_lagged_table)

    return task, ts_len, ts_data, ts_data_3d, ts_data_lagged
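The shape bookkeeping in the comments above can be verified directly. A standalone numpy check of the 3d-to-lagged flattening used for features_lagged:

import numpy as np

# rebuild the toy arrays: exog entry (i, j) is 10 + i + j; lagged target is i + j
exog_variable_3d = (10 + np.arange(5)[:, None] + np.arange(4)[None, :])[..., None]
lagged_target_3d = (np.arange(5)[:, None] + np.arange(4)[None, :])[..., None]

feature_3d = np.concatenate((exog_variable_3d, lagged_target_3d), axis=2)
assert feature_3d.shape == (5, 4, 2)

# flattening keeps one row per sample: (5, 4, 2) -> (5, 8)
features_lagged = feature_3d.reshape(feature_3d.shape[0], -1)
assert features_lagged.shape == (5, 4 * 2)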