Code Example #1
def filter_operations_by_preset(task: Task, preset: str):
    """ Function filter operations by preset, remove "heavy" operations and save
    appropriate ones
    """
    excluded_models_dict = {'light': ['mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean'],
                            'light_tun': ['mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean']}

    # Get data operations and models
    available_operations = get_operations_for_task(task, mode='all')
    available_data_operation = get_operations_for_task(task, mode='data_operation')

    # Exclude "heavy" operations if necessary
    if preset in excluded_models_dict.keys():
        excluded_operations = excluded_models_dict[preset]
    available_operations = [op for op in available_operations if op not in excluded_operations]

    # Save only "light" operations
    if preset in ['ultra_light', 'ultra_light_tun']:
        light_models = ['dt', 'dtreg', 'logit', 'linear', 'lasso', 'ridge', 'knn', 'ar']
        included_operations = light_models + available_data_operation
    available_operations = [op for op in available_operations if op in included_operations]

    if preset == 'gpu':
        repository = OperationTypesRepository().assign_repo('model', 'gpu_models_repository.json')
        available_operations = repository.suitable_operation(task_type=task.task_type)
    return available_operations
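
For context, a minimal usage sketch of the function above; the import path is the one FEDOT exposes for task definitions, but treat it as an assumption for the exact version these snippets come from:

from fedot.core.repository.tasks import Task, TaskTypesEnum  # assumed import path

task = Task(TaskTypesEnum.classification)
light_operations = filter_operations_by_preset(task, 'light')
print(light_operations)  # every available operation except the excluded 'heavy' ones
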
Code Example #2
def filter_operations_by_preset(task, preset: str):
    """ Function filter operations by preset, remove "heavy" operations and save
    appropriate ones
    """
    excluded_models_dict = {
        'light': ['mlp', 'svc', 'arima', 'exog', 'text_clean'],
        'light_tun': ['mlp', 'svc', 'arima', 'exog', 'text_clean']
    }

    # Get data operations and models
    available_operations = get_operations_for_task(task, mode='all')
    available_data_operation = get_operations_for_task(task,
                                                       mode='data_operations')

    # Exclude "heavy" operations if necessary
    if preset in excluded_models_dict.keys():
        excluded_operations = excluded_models_dict[preset]
        available_operations = [
            op for op in available_operations if op not in excluded_operations
        ]

    # Save only "light" operations
    if preset in ['ultra_light', 'ultra_light_tun']:
        light_models = [
            'dt', 'dtreg', 'logit', 'linear', 'lasso', 'ridge', 'knn', 'ar'
        ]
        included_operations = light_models + available_data_operation
        available_operations = [
            op for op in available_operations if op in included_operations
        ]

    return available_operations
Code Example #3
def get_composed_chain(dataset_to_compose, task, metric_function):
    # find the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        allow_single_operations=False)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metric_function).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    return chain_evo_composed
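
A hedged usage sketch: InputData.from_csv and ClassificationMetricsEnum appear in other examples in this listing, while the CSV path is a hypothetical placeholder:

task = Task(TaskTypesEnum.classification)
dataset_to_compose = InputData.from_csv('train_data.csv', task=task)  # hypothetical file
metric_function = ClassificationMetricsEnum.ROCAUC_penalty
chain = get_composed_chain(dataset_to_compose, task, metric_function)
chain.fit(input_data=dataset_to_compose)  # fit the composed chain, as in Code Example #9
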
Code Example #4
def test_presets_classification():
    task = Task(TaskTypesEnum.classification)

    class_operations = get_operations_for_task(task, mode='all')

    operations_for_light_preset = filter_operations_by_preset(task, 'light')
    operations_for_ultra_light_preset = filter_operations_by_preset(task, 'ultra_light')
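    # 'light' removes heavy classifiers such as mlp and svc, while 'ultra_light'
    # keeps only a fixed list of light models, hence the strict ordering below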

    assert len(operations_for_ultra_light_preset) < len(operations_for_light_preset) < len(class_operations)
    assert {'dt', 'logit', 'knn'} <= set(operations_for_ultra_light_preset)
Code Example #5
def test_presets_regression():
    task = Task(TaskTypesEnum.regression)

    regr_operations = get_operations_for_task(task, mode='all')

    operations_for_light_preset = filter_operations_by_preset(task, 'light')
    operations_for_ultra_light_preset = filter_operations_by_preset(task, 'ultra_light')
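    # none of the operations excluded by the 'light' preset apply to regression,
    # so 'light' keeps the full regression set and only 'ultra_light' shrinks it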

    assert len(operations_for_ultra_light_preset) < len(operations_for_light_preset) == len(regr_operations)
    assert {'dtreg', 'lasso', 'ridge', 'linear'} <= set(operations_for_ultra_light_preset)
Code Example #6
def _divide_operations(available_operations, task):
    """ Function divide operations for primary and secondary """

    if task.task_type == TaskTypesEnum.ts_forecasting:
        ts_data_operations = get_operations_for_task(task=task,
                                                     mode='data_operation',
                                                     tags=["ts_specific"])
        # Remove exog data operation from the list
        ts_data_operations.remove('exog_ts_data_source')

        primary_operations = ts_data_operations
        secondary_operations = available_operations
    else:
        primary_operations = available_operations
        secondary_operations = available_operations
    return primary_operations, secondary_operations
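
A usage sketch under the same assumptions as above (Task and TaskTypesEnum as in the other examples in this listing):

ts_task = Task(TaskTypesEnum.ts_forecasting)
operations = get_operations_for_task(task=ts_task, mode='all')
primary, secondary = _divide_operations(operations, ts_task)
# primary now holds only ts-specific data operations (minus 'exog_ts_data_source'),
# while secondary keeps the full operation list
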
Code Example #7
    def set_default_composer_params(self):
        """ Method set metrics and composer requirements """
        if not self._composer.composer_requirements:
            # Get all available operations for task
            operations = get_operations_for_task(task=self.task, mode='all')

            # Set protected attributes to composer
            self._composer.composer_requirements = GPComposerRequirements(
                primary=operations, secondary=operations)
        if not self._composer.metrics:
            metric_function = ClassificationMetricsEnum.ROCAUC_penalty
            if self.task.task_type in (TaskTypesEnum.regression,
                                       TaskTypesEnum.ts_forecasting):
                metric_function = RegressionMetricsEnum.RMSE

            # Set metric
            self._composer.metrics = [metric_function]
Code Example #8
def is_pipeline_contains_ts_operations(pipeline: 'Pipeline'):
    """ Function checks is the model contains operations for time series
    forecasting """
    # Get time series specific operations with tag "ts_specific"
    ts_operations = get_operations_for_task(task=Task(
        TaskTypesEnum.ts_forecasting),
                                            tags=["ts_specific"],
                                            mode='all')

    # Collect the operations used in the pipeline under consideration
    operations_in_pipeline = [node.operation.operation_type for node in pipeline.nodes]

    return len(set(ts_operations) & set(operations_in_pipeline)) > 0
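
A minimal check sketch; the node and pipeline import paths below match FEDOT's pipeline module, but should be treated as assumptions for this snippet's version:

from fedot.core.pipelines.node import PrimaryNode, SecondaryNode  # assumed import path
from fedot.core.pipelines.pipeline import Pipeline  # assumed import path

node_lagged = PrimaryNode('lagged')  # 'lagged' is a ts-specific operation (see Code Example #14)
node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
assert is_pipeline_contains_ts_operations(Pipeline(node_ridge))
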
Code Example #9
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False,
        with_tuning=False,
        cache_path=None):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # find the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice of the metric for the chain quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters).with_logger(logger=logger)

    if cache_path:
        builder = builder.with_cache(cache_path)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=True)

    if with_tuning:
        # TODO Add tuning
        raise NotImplementedError('Tuning is not supported')

    chain_evo_composed.fit(input_data=dataset_to_compose)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        visualiser = ChainVisualiser()

        composer.log.debug('History visualization started')
        visualiser.visualise_history(composer.history)
        composer.log.debug('History visualization finished')

        composer.log.debug('Best chain visualization started')
        visualiser.visualise(chain_evo_composed)
        composer.log.debug('Best chain visualization finished')

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
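
A hedged invocation example; both CSV paths are hypothetical placeholders:

roc_auc = run_credit_scoring_problem('scoring_train.csv',  # hypothetical file
                                     'scoring_test.csv',   # hypothetical file
                                     is_visualise=False)
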
Code Example #10
def compose_fedot_model(train_data: Union[InputData, MultiModalData],
                        task: Task,
                        logger: Log,
                        max_depth: int,
                        max_arity: int,
                        pop_size: int,
                        num_of_generations: int,
                        available_operations: list = None,
                        composer_metric=None,
                        timeout: float = 5,
                        with_tuning=False,
                        tuner_metric=None,
                        cv_folds: Optional[int] = None,
                        validation_blocks: Optional[int] = None,
                        initial_pipeline=None
                        ):
    """ Function for composing FEDOT pipeline """

    metric_function = _obtain_metric(task, composer_metric)

    if available_operations is None:
        available_operations = get_operations_for_task(task, mode='model')

    logger.message(f'Composition started. Parameters tuning: {with_tuning}. '
                   f'Set of candidate models: {available_operations}. Composing time limit: {timeout} min')

    primary_operations, secondary_operations = _divide_operations(available_operations,
                                                                  task)

    timeout_for_composing = timeout / 2 if with_tuning else timeout
    # the choice and initialisation of the GP composer
    composer_requirements = \
        GPComposerRequirements(primary=primary_operations,
                               secondary=secondary_operations,
                               max_arity=max_arity,
                               max_depth=max_depth,
                               pop_size=pop_size,
                               num_of_generations=num_of_generations,
                               cv_folds=cv_folds,
                               validation_blocks=validation_blocks,
                               timeout=datetime.timedelta(minutes=timeout_for_composing))

    optimizer_parameters = GPGraphOptimiserParameters(genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free)

    # Create GP-based composer
    builder = _get_gp_composer_builder(task=task,
                                       metric_function=metric_function,
                                       composer_requirements=composer_requirements,
                                       optimizer_parameters=optimizer_parameters,
                                       data=train_data,
                                       initial_pipeline=initial_pipeline,
                                       logger=logger)
    gp_composer = builder.build()

    logger.message('Pipeline composition started')
    pipeline_gp_composed = gp_composer.compose_pipeline(data=train_data)

    pipeline_for_return = pipeline_gp_composed

    if isinstance(pipeline_gp_composed, list):
        for pipeline in pipeline_gp_composed:
            pipeline.log = logger
        pipeline_for_return = pipeline_gp_composed[0]
        best_candidates = gp_composer.optimiser.archive
    else:
        best_candidates = [pipeline_gp_composed]
        pipeline_gp_composed.log = logger

    if with_tuning:
        logger.message('Hyperparameters tuning started')

        if tuner_metric is None:
            logger.message('Default loss function was set')
            # Default metric for tuner
            tune_metrics = TunerMetricByTask(task.task_type)
            tuner_loss, loss_params = tune_metrics.get_metric_and_params(train_data)
        else:
            # Get metric and parameters by name
            tuner_loss, loss_params = tuner_metric_by_name(metric_name=tuner_metric,
                                                           train_data=train_data,
                                                           task=task)

        iterations = 20 if timeout is None else 1000
        timeout_for_tuning = timeout / 2

        # Tune all nodes in the pipeline

        vb_number = composer_requirements.validation_blocks
        folds = composer_requirements.cv_folds
        if train_data.task.task_type != TaskTypesEnum.ts_forecasting:
            # TODO remove after implementation of CV for class/regr
            logger.warn('Cross-validation is not supported for tuning of classification and '
                        'regression pipelines: hold-out validation used instead')
            folds = None
        pipeline_for_return = pipeline_for_return.fine_tune_all_nodes(loss_function=tuner_loss,
                                                                      loss_params=loss_params,
                                                                      input_data=train_data,
                                                                      iterations=iterations,
                                                                      timeout=timeout_for_tuning,
                                                                      cv_folds=folds,
                                                                      validation_blocks=vb_number)

    logger.message('Model composition finished')

    history = gp_composer.optimiser.history

    return pipeline_for_return, best_candidates, history
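
A usage sketch; Task, InputData and default_log appear elsewhere in this listing, and the CSV path is a hypothetical placeholder:

task = Task(TaskTypesEnum.classification)
train_data = InputData.from_csv('train_data.csv', task=task)  # hypothetical file
logger = default_log('FEDOT logger', verbose_level=4)
pipeline, best_candidates, history = compose_fedot_model(
    train_data=train_data, task=task, logger=logger,
    max_depth=3, max_arity=3, pop_size=20, num_of_generations=20,
    timeout=2)  # two-minute composing budget
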
Code Example #11
def compose_fedot_model(train_data: InputData,
                        task: Task,
                        logger: Log,
                        max_depth: int,
                        max_arity: int,
                        pop_size: int,
                        num_of_generations: int,
                        available_operations: list = None,
                        composer_metric=None,
                        learning_time: float = 5,
                        with_tuning=False,
                        tuner_metric=None):
    """ Function for composing FEDOT chain model """

    metric_function = _obtain_metric(task, composer_metric)

    if available_operations is None:
        available_operations = get_operations_for_task(task, mode='models')

    logger.message(
        f'Composition started. Parameters tuning: {with_tuning}. '
        f'Set of candidate models: {available_operations}. Composing time limit: {learning_time} min'
    )

    primary_operations, secondary_operations = _divide_operations(
        available_operations, task)

    learning_time_for_composing = learning_time / 2 if with_tuning else learning_time
    # the choice and initialisation of the GP composer
    composer_requirements = \
        GPComposerRequirements(primary=primary_operations,
                               secondary=secondary_operations,
                               max_arity=max_arity,
                               max_depth=max_depth,
                               pop_size=pop_size,
                               num_of_generations=num_of_generations,
                               max_lead_time=datetime.timedelta(minutes=learning_time_for_composing),
                               allow_single_operations=False)

    optimizer_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=GeneticSchemeTypesEnum.parameter_free,
        mutation_types=[
            MutationTypesEnum.parameter_change, MutationTypesEnum.simple,
            MutationTypesEnum.reduce, MutationTypesEnum.growth,
            MutationTypesEnum.local_growth
        ],
        crossover_types=[
            CrossoverTypesEnum.one_point, CrossoverTypesEnum.subtree
        ])

    # Create GP-based composer
    builder = _get_gp_composer_builder(
        task=task,
        metric_function=metric_function,
        composer_requirements=composer_requirements,
        optimizer_parameters=optimizer_parameters,
        logger=logger)
    gp_composer = builder.build()

    logger.message('Model composition started')
    chain_gp_composed = gp_composer.compose_chain(data=train_data)

    chain_for_return = chain_gp_composed

    if isinstance(chain_gp_composed, list):
        for chain in chain_gp_composed:
            chain.log = logger
        chain_for_return = chain_gp_composed[0]
        best_candidates = gp_composer.optimiser.archive
    else:
        best_candidates = [chain_gp_composed]
        chain_gp_composed.log = logger

    if with_tuning:
        logger.message('Hyperparameters tuning started')

        if tuner_metric is None:
            logger.message('Default loss function was set')
            # Default metric for tuner
            tune_metrics = TunerMetricByTask(task.task_type)
            tuner_loss, loss_params = tune_metrics.get_metric_and_params(
                train_data)
        else:
            # Get metric and parameters by name
            tuner_loss, loss_params = tuner_metric_by_name(
                metric_name=tuner_metric, train_data=train_data, task=task)

        iterations = 20 if learning_time is None else 1000
        learning_time_for_tuning = learning_time / 2

        # Tune all nodes in the chain
        chain_for_return.fine_tune_all_nodes(
            loss_function=tuner_loss,
            loss_params=loss_params,
            input_data=train_data,
            iterations=iterations,
            max_lead_time=learning_time_for_tuning)

    logger.message('Model composition finished')

    return chain_for_return, best_candidates
Code Example #12
def run_credit_scoring_problem(
        train_file_path,
        test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # find the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types = get_operations_for_task(task=task, mode='models')

    # the choice of the metric for the chain quality assessment during composition
    quality_metric = ClassificationMetricsEnum.ROCAUC
    complexity_metric = ComplexityMetricsEnum.node_num
    metrics = [quality_metric, complexity_metric]
    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time,
        start_depth=2,
        allow_single_operations=False)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPChainOptimiserParameters(
        genetic_scheme_type=scheme_type,
        selection_types=[SelectionTypesEnum.spea2])

    # Create builder for composer and set composer params
    builder = GPComposerBuilder(
        task=task).with_requirements(composer_requirements).with_metrics(
            metrics).with_optimiser_parameters(optimiser_parameters)

    # Create GP-based composer
    composer = builder.build()

    # the optimal chain generation by composition - the most time-consuming task
    chains_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                 is_visualise=True)

    composer.history.write_composer_history_to_csv()

    if is_visualise:
        results_visualization(composed_chains=chains_evo_composed,
                              history=composer.history)

    chains_roc_auc = []
    for chain_num, chain_evo_composed in enumerate(chains_evo_composed):

        chain_evo_composed.fine_tune_primary_nodes(
            input_data=dataset_to_compose, iterations=50)

        chain_evo_composed.fit(input_data=dataset_to_compose)

        # the quality assessment for the obtained composite models
        roc_on_valid_evo_composed = calculate_validation_metric(
            chain_evo_composed, dataset_to_validate)

        chains_roc_auc.append(roc_on_valid_evo_composed)
        if len(chains_evo_composed) > 1:
            print(
                f'Composed ROC AUC of chain {chain_num + 1} is {round(roc_on_valid_evo_composed, 3)}'
            )

        else:
            print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return max(chains_roc_auc)
Code Example #13
    def __init__(self, task=None):
        self.models = get_operations_for_task(task, mode='model')
        self.data_operations = get_operations_for_task(task, mode='data_operation')
        super().__init__(task)
Code Example #14
def has_no_data_flow_conflicts_in_ts_pipeline(pipeline: 'Pipeline'):
    """ Function checks the correctness of connection between nodes """

    task = Task(TaskTypesEnum.ts_forecasting)
    if not is_pipeline_contains_ts_operations(pipeline):
        return True
    models = get_operations_for_task(task=task, mode='model')
    # Preprocessing not only for time series
    non_ts_data_operations = get_operations_for_task(
        task=task, mode='data_operation', forbidden_tags=["ts_specific"])
    ts_data_operations = get_operations_for_task(task=task,
                                                 mode='data_operation',
                                                 tags=["ts_specific"])
    # Remove lagged transformation
    ts_data_operations.remove('lagged')
    ts_data_operations.remove('exog_ts_data_source')

    # Dictionary as {'current operation in the node': 'parent operations list'}
    # TODO refactor
    wrong_connections = {
        'lagged': models + non_ts_data_operations + ['lagged'],
        'ar': models + non_ts_data_operations + ['lagged'],
        'arima': models + non_ts_data_operations + ['lagged'],
        'ridge': ts_data_operations,
        'linear': ts_data_operations,
        'lasso': ts_data_operations,
        'dtreg': ts_data_operations,
        'knnreg': ts_data_operations,
        'scaling': ts_data_operations,
        'xgbreg': ts_data_operations,
        'adareg': ts_data_operations,
        'gbr': ts_data_operations,
        'treg': ts_data_operations,
        'rfr': ts_data_operations,
        'svr': ts_data_operations,
        'sgdr': ts_data_operations,
        'normalization': ts_data_operations,
        'simple_imputation': ts_data_operations,
        'pca': ts_data_operations,
        'kernel_pca': ts_data_operations,
        'poly_features': ts_data_operations,
        'ransac_lin_reg': ts_data_operations,
        'ransac_non_lin_reg': ts_data_operations,
        'rfe_lin_reg': ts_data_operations,
        'rfe_non_lin_reg': ts_data_operations
    }

    for node in pipeline.nodes:
        # Operation name in the current node
        current_operation = node.operation.operation_type
        parent_nodes = node.nodes_from

        if parent_nodes is not None:
            # The node has at least one parent; check each connection
            for parent in parent_nodes:
                parent_operation = parent.operation.operation_type

                forbidden_parents = wrong_connections.get(current_operation)
                if forbidden_parents is not None:
                    __check_connection(parent_operation, forbidden_parents)

    return True
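
A sketch of a connection the rules above allow ('lagged' feeding 'ridge' is not in the forbidden map), using the same assumed pipeline imports as in Code Example #8:

node_lagged = PrimaryNode('lagged')
node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
assert has_no_data_flow_conflicts_in_ts_pipeline(Pipeline(node_ridge))
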
Code Example #15
def run_multi_modal_case(files_path,
                         is_visualise=False,
                         timeout=datetime.timedelta(minutes=2)):
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    train_num, test_num, train_img, test_img, train_text, test_text = prepare_multi_modal_data(
        files_path, task, images_size)

    pipeline, fit_data, predict_data = generate_initial_pipeline_and_data(
        images_size, train_num, test_num, train_img, test_img, train_text,
        test_text)

    # find the models provided by the framework that can be used as nodes in a pipeline for the selected task
    available_model_types = get_operations_for_task(task=task, mode='model')

    # the choice of the metric for the pipeline quality assessment during composition
    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=3,
        max_depth=3,
        pop_size=5,
        num_of_generations=5,
        crossover_prob=0.8,
        mutation_prob=0.8,
        timeout=timeout)

    # GP optimiser parameters choice
    scheme_type = GeneticSchemeTypesEnum.parameter_free
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=scheme_type)

    # Create builder for composer and set composer params
    logger = default_log('FEDOT logger', verbose_level=4)

    # the multi-modal template (with data sources) is passed as the initial assumption for the composer
    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function).with_optimiser_parameters(optimiser_parameters).with_logger(logger=logger). \
        with_initial_pipeline(pipeline).with_cache('multi_modal_opt.cache')

    # Create GP-based composer
    composer = builder.build()

    # the optimal pipeline generation by composition - the most time-consuming task
    pipeline_evo_composed = composer.compose_pipeline(data=fit_data,
                                                      is_visualise=True)

    pipeline_evo_composed.fit(input_data=fit_data)

    if is_visualise:
        pipeline_evo_composed.show()

    prediction = pipeline_evo_composed.predict(predict_data)

    err = calculate_validation_metric(prediction, test_num)

    print(f'ROC AUC for validation sample is {err}')

    return err
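
A hedged invocation; the dataset directory is a hypothetical placeholder, and datetime is imported as the snippet itself assumes:

error = run_multi_modal_case('cases/data/multi_modal',  # hypothetical path
                             is_visualise=False,
                             timeout=datetime.timedelta(minutes=2))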