Example #1
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
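    # Fit and export a new H2O AutoML model only if no model for this configuration was saved earlier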
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

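    # Connect to the H2O server and restore the previously exported model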
    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
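
A minimal invocation sketch; the ExecutionParams fields assumed here mirror those constructed in Example #8, and the CSV paths are placeholders:

params = ExecutionParams(train_file='train.csv',
                         test_file='test.csv',
                         task=TaskTypesEnum.classification,
                         target_name='target',
                         case_label='h2o_case')
true_target, predicted = run_h2o(params)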
Example #2
def run_autokeras(train_file_path: str,
                  test_file_path: str,
                  task: MachineLearningTasksEnum,
                  case_name: str = 'default'):
    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

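    # Choose the AutoKeras estimator that matches the task type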
    if task is MachineLearningTasksEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)

    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

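    # Compute a task-appropriate quality metric on the test predictions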
    if task is MachineLearningTasksEnum.classification:
        result_metric = {
            'autokeras_roc_auc':
            round(roc_auc_score(test_data.target, predicted), 3)
        }
    else:
        result_metric = {'MSE': round(mse(test_data.target, predicted), 3)}

    return result_metric
Example #3
def run_mlbox(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    target_name = params.target_name
    task = params.task

    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, true_target = separate_target_column(
        test_file_path, target_name)
    paths = [train_file_path, new_test_file_path]

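    # Read the train/test CSVs and drop drifting features with MLBox preprocessing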
    data = Reader(sep=",").train_test_split(paths, target_name)
    data = Drift_thresholder().fit_transform(data)

    score = 'roc_auc' if task is TaskTypesEnum.classification else 'neg_mean_squared_error'

    opt = Optimiser(scoring=score, n_folds=5)
    params = opt.optimise(config_data['space'],
                          data,
                          max_evals=config_data['max_evals'])
    opt.evaluate(params, data)

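    # Fit the tuned pipeline and write predictions to MLBox's 'save' directory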
    Predictor(verbose=False).fit_predict(params, data)

    cur_work_dir = os.path.abspath(os.curdir)

    predicted_df = pd.read_csv(
        os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
    predicted = predicted_df['1.0']

    os.remove(new_test_file_path)

    return true_target, predicted
Example #4
def run_mlbox(train_file_path: str, test_file_path: str, target_name: str,
              task: MachineLearningTasksEnum):
    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, test_target = separate_target_column(
        test_file_path, target_name)
    paths = [train_file_path, new_test_file_path]

    data = Reader(sep=",").train_test_split(paths, target_name)
    data = Drift_thresholder().fit_transform(data)

    score = 'roc_auc' if task is MachineLearningTasksEnum.classification else 'neg_mean_squared_error'

    opt = Optimiser(scoring=score, n_folds=5)
    params = opt.optimise(config_data['space'],
                          data,
                          max_evals=config_data['max_evals'])
    opt.evaluate(params, data)

    Predictor(verbose=False).fit_predict(params, data)

    cur_work_dir = os.path.abspath(os.curdir)

    predicted_df = pd.read_csv(
        os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
    predicted = predicted_df['1.0']
    metric = roc_auc_score(test_target, predicted)

    print(f'ROC_AUC: {metric}')

    os.remove(new_test_file_path)

    return metric
Example #5
def run_autokeras(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task is TaskTypesEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)

    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    return test_data.target, predicted
Example #6
def run_h2o(train_file_path: str,
            test_file_path: str,
            task: MachineLearningTasksEnum,
            case_name='h2o_default'):
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predictions = predict_h2o(imported_model, test_frame)

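    # Report ROC AUC on train/valid/test for classification, MSE/RMSE for regression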
    if task is MachineLearningTasksEnum.classification:
        train_roc_auc_value = round(imported_model.auc(train=True), 3)
        valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
        test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)

        metrics = {
            'H2O_ROC_AUC_train': train_roc_auc_value,
            'H2O_ROC_AUC_valid': valid_roc_auc_value,
            'H2O_ROC_AUC_test': test_roc_auc_value
        }

        print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
        print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
        print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
    else:
        mse_train = imported_model.mse()
        rmse_train = imported_model.rmse()

        metrics = {'H2O_MSE_train': mse_train, 'H2O_RMSE_train': rmse_train}

        print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
        print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")

    h2o.shutdown(prompt=False)

    return metrics
Example #7
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

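    # Fit and export a TPOT pipeline only if no model file exists for this configuration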
    if result_model_filename not in os.listdir(current_file_path):
        # TODO: change hyperparameters to the actual ones from the config variable
        model = fit_tpot(train_data,
                         models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(
            output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

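    # Score the test data with the task-specific prediction helper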
    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target
    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
    elif task == TaskTypesEnum.classification:
        predicted = predict_tpot_class(imported_model, predict_data)
    else:
        print('Incorrect ML task type')
        raise NotImplementedError()

    print(f'BEST_model: {imported_model}')

    return true_target, predicted
Example #8
from benchmark.benchmark_model_types import BenchmarkModelTypesEnum
from benchmark.benchmark_utils import get_models_hyperparameters, get_scoring_case_data_paths, \
    save_metrics_result_file
from benchmark.executor import CaseExecutor, ExecutionParams
from core.repository.tasks import TaskTypesEnum

if __name__ == '__main__':
    train_file, test_file = get_scoring_case_data_paths()

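    # Run the selected benchmark models on the scoring case and collect the requested metrics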
    result_metrics = CaseExecutor(
        params=ExecutionParams(train_file=train_file,
                               test_file=test_file,
                               task=TaskTypesEnum.classification,
                               target_name='default',
                               case_label='scoring'),
        models=[
            BenchmarkModelTypesEnum.baseline, BenchmarkModelTypesEnum.tpot,
            BenchmarkModelTypesEnum.fedot
        ],
        metric_list=['roc_auc', 'f1']).execute()

    result_metrics['hyperparameters'] = get_models_hyperparameters()

    save_metrics_result_file(result_metrics, file_name='scoring_metrics')
Example #9
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type}_{cur_lead_time}_{metric}'
    loaded_model = load_fedot_model(saved_model_name)

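    # Compose and fit a new FEDOT chain only when no previously saved model is available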
    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # search for the models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        metric_function = MetricsRepository().metric_by_id(metric)

        composer_requirements = GPComposerRequirements(
            primary=available_model_types,
            secondary=available_model_types,
            max_arity=3,
            max_depth=3,
            pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8,
            mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time))

        # Create GP-based composer
        composer = GPComposer()

        # the optimal chain is generated by composition - the most time-consuming step
        chain_evo_composed = composer.compose_chain(
            data=dataset_to_compose,
            initial_chain=None,
            composer_requirements=composer_requirements,
            metrics=metric_function,
            is_visualise=False)
        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50)
        chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)
        save_fedot_model(chain_evo_composed, saved_model_name)
    else:
        chain_evo_composed = loaded_model

    evo_predicted = chain_evo_composed.predict(dataset_to_validate)

    return dataset_to_validate.target, evo_predicted.predict
Example #10
        print('Please create a non-empty CSV file with datasets')

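    # Fall back to the full list of PMLB classification and regression datasets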
    if len(dataset) == 0:
        dataset = classification_dataset_names + regression_dataset_names

    for name_of_dataset in dataset:
        pmlb_data = fetch_data(name_of_dataset)
        num_classes, _ = imbalance_metrics(pmlb_data['target'].tolist())
        problem_class, metric_names = _problem_and_metric_for_dataset(
            name_of_dataset, num_classes)
        if not problem_class or not metric_names:
            print('Incorrect dataset')
            continue

        train_file, test_file = get_penn_case_data_paths(name_of_dataset)
        config_models_data = get_models_hyperparameters()
        case_name = f'penn_ml_{name_of_dataset}'

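        # Benchmark the selected models on this PMLB dataset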
        try:
            result_metrics = CaseExecutor(params=ExecutionParams(
                train_file=train_file,
                test_file=test_file,
                task=problem_class,
                target_name='target',
                case_label=case_name),
                                          models=[
                                              BenchmarkModelTypesEnum.tpot,
                                              BenchmarkModelTypesEnum.baseline,
                                              BenchmarkModelTypesEnum.fedot
                                          ],
                                          metric_list=metric_names).execute()