Beispiel #1
0
def get_composed_pipeline(dataset_to_compose, task, metric_function):
    """Build a GP-based composer for ``task`` and compose a pipeline.

    :param dataset_to_compose: data passed to ``compose_pipeline``
    :param task: task used both to look up the available models and to
        create the composer builder
    :param metric_function: metric handed to the builder via ``with_metrics``
    :return: whatever ``composer.compose_pipeline`` returns
    """
    # Models the framework offers for this task; the same list is used for
    # both primary and secondary nodes.
    model_types = get_operations_for_task(task=task, mode='model')

    # Requirements of the GP search.
    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types,
        max_arity=3, max_depth=3,
        pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8)

    # Optimiser parameters: steady-state genetic scheme.
    steady_state = GeneticSchemeTypesEnum.steady_state
    gp_parameters = GPGraphOptimiserParameters(genetic_scheme_type=steady_state)

    # Configure the builder step by step, then create the composer.
    builder = GPComposerBuilder(task=task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(metric_function)
    builder = builder.with_optimiser_parameters(gp_parameters)
    composer = builder.build()

    # The composition itself (flagged by the original author as the most
    # time-consuming step).
    return composer.compose_pipeline(data=dataset_to_compose,
                                     is_visualise=True)
Beispiel #2
0
def test_gp_composer_with_start_depth(data_fixture, request):
    """Compose with ``start_depth=2`` and check the recorded graph depths.

    Asserts that every individual of the first recorded generation has a
    graph depth of at most 3 and that the optimiser keeps ``max_depth == 5``.

    :param data_fixture: name of the fixture that provides the dataset
    :param request: pytest ``request`` object used to resolve the fixture
    """
    # Fixed seeds for both random generators used by the test.
    random.seed(1)
    np.random.seed(1)

    dataset_to_compose = request.getfixturevalue(data_fixture)
    model_types = ['xgboost', 'knn']

    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types,
        max_arity=2, max_depth=5,
        pop_size=5, num_of_generations=1,
        crossover_prob=0.4, mutation_prob=0.5,
        start_depth=2)
    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state)

    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification))
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(ClassificationMetricsEnum.ROCAUC)
    builder = builder.with_optimiser_parameters(gp_parameters)
    composer = builder.build()

    composer.compose_pipeline(data=dataset_to_compose, is_visualise=True)

    first_generation = composer.history.individuals[0]
    assert all(ind.graph.depth <= 3 for ind in first_generation)
    assert composer.optimiser.max_depth == 5
Beispiel #3
0
def run_custom_example(
        timeout: datetime.timedelta = datetime.timedelta(minutes=0.2)):
    """Optimise a custom graph over ten nodes 'V1'..'V10' and show the result.

    The data is read from ``examples/data/custom_encoded.csv`` under the
    project root and bound to ``custom_metric`` as its ``data`` argument.

    :param timeout: passed to the composer requirements as ``timeout``
    """
    csv_path = os.path.join(fedot_project_root(), 'examples', 'data',
                            'custom_encoded.csv')
    data = pd.read_csv(csv_path)

    # Node labels 'V1' ... 'V10'.
    nodes_types = [f'V{index}' for index in range(1, 11)]
    constraint_rules = [has_no_self_cycled_nodes, has_no_cycle,
                        _has_no_duplicates]

    # Initial graph: one unconnected node per label.
    unconnected_nodes = [
        CustomGraphNode(nodes_from=None, content=label)
        for label in nodes_types
    ]
    initial_graph = CustomGraphModel(nodes=unconnected_nodes)

    requirements = GPComposerRequirements(
        primary=nodes_types, secondary=nodes_types,
        max_arity=10, max_depth=10,
        pop_size=5, num_of_generations=5,
        crossover_prob=0.8, mutation_prob=0.9,
        timeout=timeout)

    # Only the custom mutation is used; crossover and regularization are
    # set to their ``none`` variants.
    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
        mutation_types=[custom_mutation],
        crossover_types=[CrossoverTypesEnum.none],
        regularization_type=RegularizationTypesEnum.none)

    adapter = DirectAdapter(base_graph_class=CustomGraphModel,
                            base_node_class=CustomGraphNode)
    generation_params = GraphGenerationParams(
        adapter=adapter, rules_for_constraint=constraint_rules)

    optimizer = GPGraphOptimiser(
        graph_generation_params=generation_params,
        metrics=[],
        parameters=gp_parameters,
        requirements=requirements,
        initial_graph=initial_graph,
        log=default_log(logger_name='Bayesian', verbose_level=1))

    objective = partial(custom_metric, data=data)
    best_graph = optimizer.optimise(objective)
    best_graph.show()
Beispiel #4
0
def test_multi_objective_composer(data_fixture, request):
    """Compose with two metrics and NSGA-II selection, then validate results.

    Asserts that the composer holds a list of more than one metric, that the
    composition result is a list, that the optimiser parameters report
    ``multi_objective`` and that every composed pipeline scores a ROC AUC
    above 0.6 on the (same) dataset it was refitted on.

    :param data_fixture: name of the fixture that provides the dataset
    :param request: pytest ``request`` object used to resolve the fixture
    """
    random.seed(1)
    np.random.seed(1)

    # The same dataset is used for composing and for validation.
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data

    model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)
    metrics = [ClassificationMetricsEnum.ROCAUC,
               ComplexityMetricsEnum.node_num]

    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types,
        max_arity=2, max_depth=2,
        pop_size=2, num_of_generations=1,
        crossover_prob=0.4, mutation_prob=0.5)
    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
        selection_types=[SelectionTypesEnum.nsga2])

    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification))
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(metrics)
    builder = builder.with_optimiser_parameters(gp_parameters)
    composer = builder.build()

    pipelines_evo_composed = composer.compose_pipeline(data=dataset_to_compose)

    def _refit_and_score(pipeline):
        # Refit the pipeline from scratch and score its predictions.
        pipeline.fit_from_scratch(input_data=dataset_to_compose)
        prediction = pipeline.predict(dataset_to_validate)
        return roc_auc(y_true=dataset_to_validate.target,
                       y_score=prediction.predict)

    pipelines_roc_auc = [
        _refit_and_score(pipeline) for pipeline in pipelines_evo_composed
    ]

    assert type(composer.metrics) is list and len(composer.metrics) > 1
    assert type(pipelines_evo_composed) is list
    assert composer.optimiser.parameters.multi_objective
    assert all([score > 0.6 for score in pipelines_roc_auc])
Beispiel #5
0
def test_custom_graph_opt():
    """Optimise a custom graph over node types 'A'..'D' and check the result.

    Asserts that the optimiser returns a ``CustomModel`` whose first node is
    a ``CustomNode`` and that one of its nodes has the string form
    'custom_A'.
    """
    nodes_types = ['A', 'B', 'C', 'D']

    requirements = GPComposerRequirements(
        primary=nodes_types, secondary=nodes_types,
        max_arity=3, max_depth=3,
        pop_size=5, num_of_generations=5,
        crossover_prob=0.8, mutation_prob=0.9)

    enabled_mutations = [
        MutationTypesEnum.simple,
        MutationTypesEnum.reduce,
        MutationTypesEnum.growth,
        MutationTypesEnum.local_growth,
    ]
    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
        mutation_types=enabled_mutations,
        regularization_type=RegularizationTypesEnum.none)

    # The only constraint rule is the self-cycle check.
    generation_params = GraphGenerationParams(
        adapter=DirectAdapter(CustomModel, CustomNode),
        rules_for_constraint=[has_no_self_cycled_nodes])

    optimizer = GPGraphOptimiser(
        graph_generation_params=generation_params,
        metrics=[],
        parameters=gp_parameters,
        requirements=requirements,
        initial_graph=None)

    best_graph = optimizer.optimise(custom_metric)

    assert best_graph is not None
    assert isinstance(best_graph, CustomModel)
    assert isinstance(best_graph.nodes[0], CustomNode)

    node_names = [str(node) for node in best_graph.nodes]
    assert 'custom_A' in node_names
Beispiel #6
0
def test_parameter_free_composer_build_pipeline_correct(data_fixture, request):
    """Compose with the parameter-free genetic scheme and validate the result.

    Asserts that the mean number of individuals per recorded generation
    differs from the size of the first generation, and that the composed
    pipeline reaches a ROC AUC above 0.6 after being refitted.

    :param data_fixture: name of the fixture that provides the dataset
    :param request: pytest ``request`` object used to resolve the fixture
    """
    random.seed(1)
    np.random.seed(1)

    # The same dataset is used for composing and for validation.
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data

    model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)

    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types,
        max_arity=2, max_depth=2,
        pop_size=2, num_of_generations=4,
        crossover_prob=0.4, mutation_prob=0.5)
    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification))
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(ClassificationMetricsEnum.ROCAUC)
    builder = builder.with_optimiser_parameters(gp_parameters)
    gp_composer = builder.build()

    composed = gp_composer.compose_pipeline(data=dataset_to_compose)
    composed.fit_from_scratch(input_data=dataset_to_compose)
    prediction = composed.predict(dataset_to_validate)
    validation_roc_auc = roc_auc(y_true=dataset_to_validate.target,
                                 y_score=prediction.predict)

    # Mean number of individuals per recorded generation.
    generations = gp_composer.history.individuals
    generation_sizes = [len(generation) for generation in generations]
    population_len = sum(generation_sizes) / len(generations)

    assert population_len != len(generations[0])
    assert validation_roc_auc > 0.6
Beispiel #7
0
def test_gp_composer_saving_info_from_process(data_fixture, request):
    """Check that evaluating a pipeline with the composer metric fills the cache.

    After composing with a cache enabled, a fresh pipeline is evaluated via
    ``composer.composer_metric``. The test asserts that the number of cache
    entries grew and that the pipeline has ``computation_time`` and
    ``fitted_on_data`` set.

    :param data_fixture: name of the fixture that provides the dataset
    :param request: pytest ``request`` object used to resolve the fixture
    """
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    model_types = ['xgboost', 'knn']
    quality_metric = ClassificationMetricsEnum.ROCAUC

    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types,
        max_arity=2, max_depth=2,
        pop_size=2, num_of_generations=1,
        crossover_prob=0.4, mutation_prob=0.5,
        start_depth=2,
        max_pipeline_fit_time=datetime.timedelta(minutes=5))
    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state)

    builder = GPComposerBuilder(task=Task(TaskTypesEnum.classification))
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(quality_metric)
    builder = builder.with_optimiser_parameters(gp_parameters)
    builder = builder.with_cache()
    composer = builder.build()

    def _cache_size():
        # Number of entries currently stored in the composer's cache file.
        with shelve.open(composer.cache.db_path) as cache:
            return len(cache.dict)

    # Only the test part of the split is used below.
    split_ratio = sample_split_ratio_for_tasks[data.task.task_type]
    _, test_data = train_test_data_setup(data, split_ratio)

    composer.compose_pipeline(data=dataset_to_compose, is_visualise=True)
    global_cache_len_before = _cache_size()

    new_pipeline = pipeline_first()
    composer.composer_metric([quality_metric], dataset_to_compose, test_data,
                             new_pipeline)
    global_cache_len_after = _cache_size()

    assert global_cache_len_before < global_cache_len_after
    assert new_pipeline.computation_time is not None
    assert new_pipeline.fitted_on_data is not None
Beispiel #8
0
def run_ts_forecasting_problem(forecast_length=50,
                               with_visualisation=True,
                               cv_folds=None) -> None:
    """Run a time series forecasting example with pipeline composing.

    The series is read from the ``sea_height`` column of the metocean CSV
    file and its last ``forecast_length`` values are held out as the test
    part. A pre-defined pipeline is evaluated first; it is then given to the
    composer as the initial pipeline and the composed pipeline is evaluated
    in the same way. Finally the composed pipeline's structure is printed.

    :param forecast_length: length of the forecast
    :param with_visualisation: whether the plots should be shown
    :param cv_folds: passed to the composer requirements as ``cv_folds``
        (number of cross validation folds to use, or None)
    """
    # NOTE: the path is relative, so it is resolved against the current
    # working directory.
    file_path = '../cases/data/metocean/metocean_data_test.csv'
    time_series = np.array(pd.read_csv(file_path)['sea_height'])

    # Train/test split: the tail of the series is the test part.
    train_part = time_series[:-forecast_length]
    test_part = time_series[-forecast_length:]

    train_input, predict_input, task = prepare_train_test_input(
        train_part, forecast_length)

    def _forecast(pipeline):
        # Fit the pipeline on the train input and predict the horizon.
        return fit_predict_for_pipeline(pipeline=pipeline,
                                        train_input=train_input,
                                        predict_input=predict_input)

    def _report(predicted):
        # Show the validation metric (and plots, if enabled).
        display_validation_metric(predicted=predicted,
                                  real=test_part,
                                  actual_values=time_series,
                                  is_visualise=with_visualisation)

    # Baseline check with the pre-defined pipeline.
    init_pipeline = get_source_pipeline()
    _report(_forecast(init_pipeline))

    primary_operations, secondary_operations = get_available_operations()

    requirements = GPComposerRequirements(
        primary=primary_operations, secondary=secondary_operations,
        max_arity=3, max_depth=8,
        pop_size=10, num_of_generations=10,
        crossover_prob=0.8, mutation_prob=0.8,
        timeout=datetime.timedelta(minutes=10),
        cv_folds=cv_folds,
        validation_blocks=3)

    gp_parameters = GPGraphOptimiserParameters(mutation_types=[
        parameter_change_mutation,
        MutationTypesEnum.simple,
        MutationTypesEnum.reduce,
    ])

    rmse_metric = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    builder = GPComposerBuilder(task=task)
    builder = builder.with_optimiser_parameters(gp_parameters)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(rmse_metric)
    builder = builder.with_initial_pipeline(init_pipeline)
    composer = builder.build()

    obtained_pipeline = composer.compose_pipeline(data=train_input,
                                                  is_visualise=False)

    # Visualise the obtained pipeline if requested.
    if with_visualisation:
        obtained_pipeline.show()

    # Same evaluation as for the baseline, now for the composed pipeline.
    _report(_forecast(obtained_pipeline))

    obtained_pipeline.print_structure()
0
 def __init__(self, task: Task):
     """Set up the builder for ``task``.

     Creates a ``GPComposer`` and default ``GPGraphOptimiserParameters``,
     stores the task and then calls ``set_default_composer_params()``.

     :param task: task stored on the builder as ``self.task``
     """
     self._composer = GPComposer()
     self.optimiser_parameters = GPGraphOptimiserParameters()
     self.task = task
     # Called last, after the attributes above have been assigned.
     self.set_default_composer_params()
Beispiel #10
0
def run_credit_scoring_problem(train_file_path, test_file_path,
                               timeout: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False):
    """Compose pipelines for a classification task with two metrics.

    The composer is built with ROC AUC and node count as metrics, the
    parameter-free genetic scheme and SPEA2 selection. Every composed
    pipeline is tuned, fitted and scored on the validation data.

    :param train_file_path: CSV file used for composing and fitting
    :param test_file_path: CSV file used for validation
    :param timeout: passed to the composer requirements as ``timeout``
    :param is_visualise: if true, ``results_visualization`` is called with
        the composed pipelines and the composer history
    :return: the largest validation ROC AUC among the composed pipelines
    """
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # Models the framework offers for this task; the same list is used for
    # both primary and secondary nodes.
    model_types = get_operations_for_task(task=task, mode='model')

    # Metrics used during composition: quality first, then complexity.
    metrics = [ClassificationMetricsEnum.ROCAUC,
               ComplexityMetricsEnum.node_num]

    requirements = GPComposerRequirements(
        primary=model_types,
        secondary=model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        timeout=timeout,
        start_depth=2)

    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free,
        selection_types=[SelectionTypesEnum.spea2])

    builder = GPComposerBuilder(task=task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(metrics)
    builder = builder.with_optimiser_parameters(gp_parameters)
    composer = builder.build()

    # NOTE(review): ``is_visualise=True`` is hard-coded here and does not
    # follow this function's ``is_visualise`` argument - confirm intended.
    pipelines_evo_composed = composer.compose_pipeline(
        data=dataset_to_compose, is_visualise=True)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        results_visualization(composed_pipelines=pipelines_evo_composed,
                              history=composer.history)

    pipelines_roc_auc = []
    for pipeline_num, composed in enumerate(pipelines_evo_composed):
        composed.fine_tune_primary_nodes(input_data=dataset_to_compose,
                                         iterations=50)
        composed.fit(input_data=dataset_to_compose)

        # Quality of this composed pipeline on the validation data.
        roc_on_valid_evo_composed = calculate_validation_metric(
            composed, dataset_to_validate)
        pipelines_roc_auc.append(roc_on_valid_evo_composed)

        # The pipeline number is reported only when there are several.
        if len(pipelines_evo_composed) > 1:
            message = f'Composed ROC AUC of pipeline {pipeline_num + 1} is {round(roc_on_valid_evo_composed, 3)}'
        else:
            message = f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}'
        print(message)

    return max(pipelines_roc_auc)
Beispiel #11
0
def run_multi_modal_case(files_path,
                         is_visualise=False,
                         timeout=datetime.timedelta(minutes=2)):
    """Compose a pipeline for a multi-modal classification case.

    An initial multi-modal pipeline is generated from the prepared data and
    given to the composer as the initial pipeline. The composed pipeline is
    fitted, used for prediction and scored against ``test_num``.

    :param files_path: path passed to ``prepare_multi_modal_data``
    :param is_visualise: if true, the composed pipeline is shown
    :param timeout: passed to the composer requirements as ``timeout``
    :return: the value returned by ``calculate_validation_metric``
    """
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    (train_num, test_num,
     train_img, test_img,
     train_text, test_text) = prepare_multi_modal_data(files_path, task,
                                                       images_size)

    pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(
        images_size, train_num, test_num, train_img, test_img, train_text,
        test_text)

    # Models the framework offers for this task; the same list is used for
    # both primary and secondary nodes.
    model_types = get_operations_for_task(task=task, mode='model')

    requirements = GPComposerRequirements(
        primary=model_types, secondary=model_types,
        max_arity=3, max_depth=3,
        pop_size=5, num_of_generations=5,
        crossover_prob=0.8, mutation_prob=0.8,
        timeout=timeout)

    gp_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    logger = default_log('FEDOT logger', verbose_level=4)

    # The multi-modal template (with data sources) is passed to the composer
    # as the initial assumption.
    builder = GPComposerBuilder(task=task)
    builder = builder.with_requirements(requirements)
    builder = builder.with_metrics(ClassificationMetricsEnum.ROCAUC_penalty)
    builder = builder.with_optimiser_parameters(gp_parameters)
    builder = builder.with_logger(logger=logger)
    builder = builder.with_initial_pipeline(pipeline)
    builder = builder.with_cache('multi_modal_opt.cache')
    composer = builder.build()

    # NOTE(review): ``is_visualise=True`` is hard-coded here and does not
    # follow this function's ``is_visualise`` argument - confirm intended.
    composed = composer.compose_pipeline(data=fit_data, is_visualise=True)
    composed.fit(input_data=fit_data)

    if is_visualise:
        composed.show()

    prediction = composed.predict(predict_data)
    err = calculate_validation_metric(prediction, test_num)

    print(f'ROC AUC for validation sample is {err}')

    return err
Beispiel #12
0
 def __init__(self, task: Task):
     """Set up the builder for ``task`` with a fixed-structure composer.

     Runs the parent constructor first, then installs a
     ``FixedStructureComposer`` and optimiser parameters whose only
     mutation type is ``MutationTypesEnum.simple``.

     :param task: task forwarded to the parent constructor
     """
     super().__init__(task=task)
     self._composer = FixedStructureComposer()
     self.optimiser_parameters = GPGraphOptimiserParameters(
         mutation_types=[MutationTypesEnum.simple])