Example 1
def run_autokeras(train_file_path: str,
                  test_file_path: str,
                  task: MachineLearningTasksEnum,
                  case_name: str = 'default'):
    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task is MachineLearningTasksEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)

    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    if task is MachineLearningTasksEnum.classification:
        result_metric = {
            'autokeras_roc_auc':
            round(roc_auc_score(test_data.target, predicted), 3)
        }
    else:
        result_metric = {'MSE': round(mse(test_data.target, predicted), 3)}

    return result_metric
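
A minimal invocation sketch for the runner above; the file paths are placeholders and only the call signature is taken from the snippet itself:

# Hypothetical usage (file paths are placeholders, not from the source)
metrics = run_autokeras(train_file_path='scoring_train.csv',
                        test_file_path='scoring_test.csv',
                        task=MachineLearningTasksEnum.classification)
print(metrics)  # {'autokeras_roc_auc': ...} for classification runs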
Example 2
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
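
Several runners in this listing take a `params: 'ExecutionParams'` argument whose class is not shown. Below is a minimal stand-in, inferred only from the attributes the runners access (`train_file`, `test_file`, `case_label`, `task`); the real benchmark class may differ:

from typing import NamedTuple

class ExecutionParams(NamedTuple):
    # Field set inferred from attribute access in the runners above
    train_file: str
    test_file: str
    case_label: str
    task: 'TaskTypesEnum'  # forward reference; the enum comes from the framework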
Example 3
def run_autokeras(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task is TaskTypesEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)

    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    return test_data.target, predicted
Example 4
def run_chain_from_automl(train_file_path: str,
                          test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')
    node_tpot.model.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
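
The chain above is a two-level DAG: the primary `tpot` and `lda` nodes both feed the secondary `rf` node, whose output is the final prediction. A hypothetical call with placeholder paths and a short TPOT budget:

# Placeholder paths; a tight budget keeps the embedded TPOT run quick
roc_auc_value = run_chain_from_automl('scoring_train.csv',
                                      'scoring_test.csv',
                                      max_run_time=timedelta(minutes=2))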
Example 5
def run_h2o(train_file_path: str,
            test_file_path: str,
            task: MachineLearningTasksEnum,
            case_name='h2o_default'):
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predictions = predict_h2o(imported_model, test_frame)

    if task is MachineLearningTasksEnum.classification:
        train_roc_auc_value = round(imported_model.auc(train=True), 3)
        valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
        test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)

        metrics = {
            'H2O_ROC_AUC_train': train_roc_auc_value,
            'H2O_ROC_AUC_valid': valid_roc_auc_value,
            'H2O_ROC_AUC_test': test_roc_auc_value
        }

        print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
        print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
        print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
    else:
        mse_train = imported_model.mse()
        rmse_train = imported_model.rmse()

        metrics = {'H2O_MSE_train': mse_train, 'H2O_RMSE_train': rmse_train}

        print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
        print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")

    h2o.shutdown(prompt=False)

    return metrics
Example 6
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # search for models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    # generate the optimal chain via composition - the most time-consuming step
    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        is_visualise=False)

    chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose,
                                               iterations=50)

    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        ComposerVisualiser.visualise(chain_evo_composed)

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
Example 7
def run_xgb_classifier(train_file: str, test_file: str):
    train_data = InputData.from_csv(train_file)
    test_data = InputData.from_csv(test_file)

    model = XGBClassifier()
    model.fit(train_data.features, train_data.target)

    predicted = model.predict_proba(test_data.features)[:, 1]

    roc_auc_value = round(roc_auc_score(test_data.target, predicted), 3)

    return roc_auc_value
Example 8
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     is_visualise=False):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain = get_composite_lstm_chain()

    chain_simple = Chain()
    node_single = PrimaryNode('ridge')
    chain_simple.add_node(node_single)

    chain_lstm = Chain()
    node_lstm = PrimaryNode('lstm')
    chain_lstm.add_node(node_lstm)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise)

    chain_lstm.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        chain_lstm.predict(dataset_to_validate), dataset_to_validate,
        f'full-lstm-only_{forecast_length}', is_visualise)

    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate), dataset_to_validate,
        f'full-simple_{forecast_length}', is_visualise)

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
Example 9
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # search for models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(
        task_type=task.task_type)

    metric_function = MetricsRepository(). \
        metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    composer = GPComposer()

    # run the search for the best suitable model
    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
Example 10
def apply_model_to_data(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path,
                                                         return_df=True)
    dataset_to_apply = InputData.from_csv(file_path, with_target=False)
    evo_predicted = model.predict(dataset_to_apply)
    df['forecast'] = probs_to_labels(evo_predicted.predict)
    return df
Example 11
def test_string_features_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/scoring_train_cat.csv'
    expected_features = InputData.from_csv(os.path.join(test_file_path,
                                                        file)).features

    assert expected_features.dtype == float
    assert np.isfinite(expected_features).all()
Example 12
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(current_file_path):
        # TODO: use the actual hyperparameters from the models_hyperparameters variable
        model = fit_tpot(train_data,
                         models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(
            output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target
    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
    elif task == TaskTypesEnum.classification:
        predicted = predict_tpot_class(imported_model, predict_data)
    else:
        raise NotImplementedError(f'Unsupported ML task type: {task}')

    print(f'BEST_model: {imported_model}')

    return true_target, predicted
Example 13
def validate_model_quality(model: Chain, data_path: str):
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    roc_auc_valid = round(
        roc_auc(y_true=dataset_to_validate.target,
                y_score=predicted_labels,
                multi_class='ovo',
                average='macro'), 3)
    return roc_auc_valid
Example 14
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()), RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)

    print(roc_auc_value)

    chain = Chain()
    node_first = PrimaryNode('direct_data_model')
    node_second = PrimaryNode('bernb')
    node_third = SecondaryNode('rf')

    node_third.nodes_from.append(node_first)
    node_third.nodes_from.append(node_second)

    chain.add_node(node_third)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example 15
def run_xgboost(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    if task is TaskTypesEnum.classification:
        model = xgb.XGBClassifier(max_depth=2,
                                  learning_rate=1.0,
                                  objective='binary:logistic')
        model.fit(train_data.features, train_data.target)
        predicted = model.predict_proba(test_data.features)[:, 1]
    elif task is TaskTypesEnum.regression:
        xgbr = xgb.XGBRegressor(max_depth=3,
                                learning_rate=0.3,
                                n_estimators=300,
                                objective='reg:squarederror')
        xgbr.fit(train_data.features, train_data.target)
        predicted = xgbr.predict(test_data.features)
    else:
        raise NotImplementedError()
    return test_data.target, predicted
Example 16
def test_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/test_dataset.csv'
    task_type = MachineLearningTasksEnum.classification
    df = pd.read_csv(os.path.join(test_file_path, file))
    data_array = np.array(df).T
    features = data_array[1:-1].T
    target = data_array[-1]
    idx = data_array[0]
    expected_features = InputData(features=features,
                                  target=target,
                                  idx=idx,
                                  task_type=task_type).features.all()
    actual_features = InputData.from_csv(os.path.join(test_file_path,
                                                      file)).features.all()
    assert expected_features == actual_features
Example 17
def test_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/simple_classification.csv'
    task = Task(TaskTypesEnum.classification)
    df = pd.read_csv(os.path.join(test_file_path, file))
    data_array = np.array(df).T
    features = data_array[1:-1].T
    target = data_array[-1]
    idx = data_array[0]
    expected_features = InputData(
        features=features,
        target=target,
        idx=idx,
        task=task,
        data_type=DataTypesEnum.table).features.all()
    actual_features = InputData.from_csv(os.path.join(test_file_path,
                                                      file)).features.all()
    assert expected_features == actual_features
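
Both tests encode the column layout that `InputData.from_csv` is expected to parse: after transposing the frame, row 0 is the index column, the last row is the target, and everything between is features. An illustrative frame with made-up values, sliced the same way the tests do:

import numpy as np
import pandas as pd

# Illustrative layout only (not the real CSV contents):
# column 0 -> idx, middle columns -> features, last column -> target
df = pd.DataFrame({'idx': [0, 1],
                   'feature_1': [1.5, 0.3],
                   'feature_2': [0.2, 1.1],
                   'target': [0, 1]})
data_array = np.array(df).T
features = data_array[1:-1].T  # shape (2, 2): feature_1, feature_2
target = data_array[-1]        # [0, 1]
idx = data_array[0]            # [0, 1]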
Example 18
def classification_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = os.path.join('data', 'advanced_classification.csv')
    return InputData.from_csv(os.path.join(test_file_path, file), task=Task(TaskTypesEnum.classification))
Example 19
def scoring_dataset():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    return train_data, test_data
Example 20
def regression_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/advanced_regression.csv'
    data = InputData.from_csv(os.path.join(test_file_path, file))
    data.task = Task(TaskTypesEnum.regression)
    return data
Example 21
def classification_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/advanced_classification.csv'
    return InputData.from_csv(os.path.join(test_file_path, file))
Example 22
def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length,
                                max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts,
                                          delimiter=',')

    # validation reuses the same series as training; the subsets taken below
    # are shifted forward by `depth`, so scoring happens on later points
    full_path_test = os.path.join(str(project_root()), train_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts,
                                             delimiter=',')

    full_path_train_crm = os.path.join(str(project_root()),
                                       train_file_path_crm)
    dataset_to_train_crm = InputData.from_csv(full_path_train_crm,
                                              task=task_to_solve,
                                              data_type=DataTypesEnum.ts,
                                              delimiter=',')

    dataset_to_validate_crm = copy(dataset_to_train_crm)

    prediction_full = None
    prediction_full_crm = None
    prediction_full_crm_opt = None

    forecast_window_shift_num = 4

    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        start = depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_train_local = dataset_to_train.subset(start, end)
        dataset_to_train_local_crm = dataset_to_train_crm.subset(start, end)

        dataset_to_validate_local = dataset_to_validate.subset(
            start + depth, end + depth)
        dataset_to_validate_local_crm = dataset_to_validate_crm.subset(
            start + depth, end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=dataset_to_train_local,
                                      verbose=False)
        chain_simple_crm.fit_from_scratch(
            input_data=dataset_to_train_local_crm, verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=dataset_to_train_local_crm,
                                       verbose=False)

        prediction = chain_simple.predict(dataset_to_validate_local)
        prediction_crm = chain_simple_crm.predict(
            dataset_to_validate_local_crm)
        prediction_crm_opt = chain_crm_opt.predict(
            dataset_to_validate_local_crm)

        prediction_full = merge_datasets(prediction_full, prediction,
                                         forecasting_step)
        prediction_full_crm = merge_datasets(prediction_full_crm,
                                             prediction_crm, forecasting_step)
        prediction_full_crm_opt = merge_datasets(prediction_full_crm_opt,
                                                 prediction_crm_opt,
                                                 forecasting_step)

    rmse_on_valid_simple = calculate_validation_metric(
        prediction_full, prediction_full_crm, prediction_full_crm_opt,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}')
    print(f'RMSE ML: {round(rmse_on_valid_simple[1])}')
    print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}')
    print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}')

    print(f'DTW CRM: {round(rmse_on_valid_simple[4])}')
    print(f'DTW ML: {round(rmse_on_valid_simple[5])}')
    print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}')
    print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}')

    return rmse_on_valid_simple
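
The sliding-window arithmetic in the loop above is easier to see with concrete numbers; this standalone sketch just prints the index ranges the loop produces for `depth = 100` and four steps:

# Worked example of the window indices used in the loop above
depth = 100
for step in range(4):
    start = depth * step
    end = depth * 2 + depth * (step + 1)
    print(f'step {step}: train [{start}:{end}], '
          f'validate [{start + depth}:{end + depth}]')
# step 0: train [0:300], validate [100:400]
# step 1: train [100:400], validate [200:500]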
Example 23
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)

    ref_chain = get_composite_lstm_chain()

    available_model_types_primary = ['trend_data_model', 'residual_data_model']

    available_model_types_secondary = [
        'rfr', 'linear', 'ridge', 'lasso', 'additive_data_model'
    ]

    composer = FixedStructureComposer()

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_primary,
        secondary=available_model_types_secondary,
        max_arity=2,
        max_depth=4,
        pop_size=10,
        num_of_generations=10,
        crossover_prob=0,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20))

    chain = composer.compose_chain(data=dataset_to_train,
                                   initial_chain=ref_chain,
                                   composer_requirements=composer_requirements,
                                   metrics=metric_function,
                                   is_visualise=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate),
        dataset_to_validate,
        f'full-composite_{forecast_length}',
        is_visualise=with_visualisation)

    print(f'RMSE composite: {rmse_on_valid}')

    return rmse_on_valid
Example 24
def validate_model_quality(model: Chain, data_path: str):
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    roc_auc_valid = round(
        roc_auc(y_true=dataset_to_validate.target,
                y_score=predicted_labels,
                multi_class='ovo',
                average='macro'), 3)
    return roc_auc_valid


if __name__ == '__main__':
    file_path_first = r'./data/example1.xlsx'
    file_path_second = r'./data/example2.xlsx'
    file_path_third = r'./data/example3.xlsx'

    train_file_path, test_file_path = create_multi_clf_examples_from_excel(
        file_path_first)
    test_data = InputData.from_csv(test_file_path)

    fitted_model = get_model(train_file_path)

    ComposerVisualiser.visualise(fitted_model)

    roc_auc_value = validate_model_quality(fitted_model, test_file_path)
    print(f'ROC AUC metric is {roc_auc_value}')

    final_prediction_first = apply_model_to_data(fitted_model,
                                                 file_path_second)
    print(final_prediction_first['forecast'])

    final_prediction_second = apply_model_to_data(fitted_model,
                                                  file_path_third)
    print(final_prediction_second['forecast'])
Example 25
def file_data_setup():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/simple_classification.csv'
    input_data = InputData.from_csv(os.path.join(test_file_path, file))
    input_data.idx = _to_numerical(categorical_ids=input_data.idx)
    return input_data
Example 26
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        gp_optimiser_params: Optional[GPChainOptimiserParameters] = None,
        pop_size=None,
        generations=None):
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    available_model_types, _ = ModelTypesRepository(). \
        suitable_model(task_type=TaskTypesEnum.classification)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    if gp_optimiser_params:
        optimiser_parameters = gp_optimiser_params
    else:
        selection_types = [SelectionTypesEnum.tournament]
        crossover_types = [CrossoverTypesEnum.subtree]
        mutation_types = [
            MutationTypesEnum.simple, MutationTypesEnum.growth,
            MutationTypesEnum.reduce
        ]
        regularization_type = RegularizationTypesEnum.decremental
        optimiser_parameters = GPChainOptimiserParameters(
            selection_types=selection_types,
            crossover_types=crossover_types,
            mutation_types=mutation_types,
            regularization_type=regularization_type)
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=4,
        max_depth=3,
        pop_size=pop_size,
        num_of_generations=generations,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        optimiser_parameters=optimiser_parameters,
        is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed, chain_evo_composed, composer
Example 27
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type}_{cur_lead_time}_{metric}'
    loaded_model = load_fedot_model(saved_model_name)

    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # search for models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        metric_function = MetricsRepository().metric_by_id(metric)

        composer_requirements = GPComposerRequirements(
            primary=available_model_types,
            secondary=available_model_types,
            max_arity=3,
            max_depth=3,
            pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8,
            mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time))

        # Create GP-based composer
        composer = GPComposer()

        # generate the optimal chain via composition - the most time-consuming step
        chain_evo_composed = composer.compose_chain(
            data=dataset_to_compose,
            initial_chain=None,
            composer_requirements=composer_requirements,
            metrics=metric_function,
            is_visualise=False)
        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50)
        chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)
        save_fedot_model(chain_evo_composed, saved_model_name)
    else:
        chain_evo_composed = loaded_model

    evo_predicted = chain_evo_composed.predict(dataset_to_validate)

    return dataset_to_validate.target, evo_predicted.predict
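
Runners such as `run_fedot`, `run_xgboost`, `run_autokeras`, `run_h2o`, and `run_tpot` return a `(target, prediction)` pair and leave scoring to the caller. A hypothetical downstream step for a binary classification case, assuming the `ExecutionParams` stand-in sketched after Example 2:

from sklearn.metrics import roc_auc_score

# `params` is an ExecutionParams instance (see the sketch after Example 2)
true_target, predicted = run_fedot(params)
print(round(roc_auc_score(true_target, predicted), 3))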