コード例 #1
0
def operations_for_task(task_name: str):
    """ Function filter operations by task and returns dictionary with names of
    models and data operations

    :param task_name: name of available task type

    :return dict_with_operations: dictionary with operations
        - models: appropriate models for task
        - data operations: appropriate data operations for task
    """

    task = _get_task_by_name(task_name)

    # Separate repositories: one for models, one for data operations
    repo_for_models = OperationTypesRepository()
    repo_for_data_operations = OperationTypesRepository(
        operation_type='data_operation')

    suitable_models, _ = repo_for_models.suitable_operation(task_type=task)
    suitable_data_operations, _ = repo_for_data_operations.suitable_operation(
        task_type=task)

    # Assemble the answer keyed by operation category
    return {
        'model': suitable_models,
        'data operation': suitable_data_operations
    }
コード例 #2
0
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    """ Compose and fit a classification chain for the given training file.

    :param train_file_path: path to the CSV file with training data
    :param cur_lead_time: time budget for the composing process
    :return: fitted chain found by the GP composer
    """
    classification_task = Task(task_type=TaskTypesEnum.classification)
    data_for_composing = InputData.from_csv(train_file_path,
                                            task=classification_task)

    # The search of the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    repository = OperationTypesRepository()
    model_types, _ = repository.suitable_operation(
        task_type=classification_task.task_type, tags=['simple'])

    quality_metric = ClassificationMetricsEnum.ROCAUC_penalty

    requirements = GPComposerRequirements(primary=model_types,
                                          secondary=model_types,
                                          max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer, that allow to find
    # the optimal structure of the composite model
    gp_composer = GPComposerBuilder(classification_task) \
        .with_requirements(requirements) \
        .with_metrics(quality_metric) \
        .build()

    # Run the search of best suitable model and fit the winner
    composed_chain = gp_composer.compose_chain(data=data_for_composing,
                                               is_visualise=False)
    composed_chain.fit(input_data=data_for_composing)

    return composed_chain
コード例 #3
0
def test_composer_with_cv_optimization_correct():
    """ Composer with cross-validation yields a valid Pipeline whose
    ROC AUC on the hold-out sample is positive """
    classification_task = Task(task_type=TaskTypesEnum.classification)
    train_data, validation_data = get_data(classification_task)

    repository = OperationTypesRepository()
    model_types, _ = repository.suitable_operation(
        task_type=classification_task.task_type, tags=['simple'])

    quality_metrics = [ClassificationMetricsEnum.ROCAUC_penalty,
                       ClassificationMetricsEnum.accuracy,
                       ClassificationMetricsEnum.logloss]

    requirements = GPComposerRequirements(primary=model_types,
                                          secondary=model_types,
                                          timeout=timedelta(minutes=1),
                                          num_of_generations=2,
                                          cv_folds=3)

    composer = GPComposerBuilder(classification_task) \
        .with_requirements(requirements) \
        .with_metrics(quality_metrics) \
        .build()

    composed_pipeline = composer.compose_pipeline(data=train_data,
                                                  is_visualise=False)[0]

    assert isinstance(composed_pipeline, Pipeline)

    composed_pipeline.fit(input_data=train_data)
    prediction = composed_pipeline.predict(validation_data)
    roc_auc_value = roc_auc(y_score=prediction.predict,
                            y_true=validation_data.target)

    assert roc_auc_value > 0
コード例 #4
0
def filter_operations_by_preset(task: Task, preset: str):
    """ Function filter operations by preset, remove "heavy" operations and save
    appropriate ones

    :param task: task to filter operations for
    :param preset: preset name ('light', 'light_tun', 'ultra_light',
        'ultra_light_tun' or 'gpu'); unknown presets leave the list unchanged
    :return: list with names of available operations
    """
    excluded_models_dict = {'light': ['mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean'],
                            'light_tun': ['mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean']}

    # Get data operations and models
    available_operations = get_operations_for_task(task, mode='all')
    available_data_operation = get_operations_for_task(task, mode='data_operation')

    # Exclude "heavy" operations if necessary
    if preset in excluded_models_dict:
        excluded_operations = set(excluded_models_dict[preset])
        available_operations = [op for op in available_operations
                                if op not in excluded_operations]

    # Save only "light" operations
    if preset in ['ultra_light', 'ultra_light_tun']:
        light_models = ['dt', 'dtreg', 'logit', 'linear', 'lasso', 'ridge', 'knn', 'ar']
        included_operations = set(light_models + available_data_operation)
        available_operations = [op for op in available_operations
                                if op in included_operations]

    if preset == 'gpu':
        repository = OperationTypesRepository().assign_repo('model', 'gpu_models_repository.json')
        # suitable_operation returns a pair - unpack it so the function
        # returns a plain list of names for this preset as well
        available_operations, _ = repository.suitable_operation(task_type=task.task_type)
    return available_operations
コード例 #5
0
    def fit(self, train_data: InputData):
        """
        This method is used for operation training with the data provided
        :param InputData train_data: data used for operation training
        :return: trained Sklearn operation
        """

        # Silence noisy RuntimeWarnings raised during model fitting
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        # Instantiate the wrapped implementation, forwarding tuned
        # parameters when they were supplied
        if self.params_for_fit:
            operation_implementation = self.operation_impl(
                **self.params_for_fit)
        else:
            operation_implementation = self.operation_impl()

        # If model doesn't support multi-output and current task is ts_forecasting
        current_task = train_data.task.task_type
        models_repo = OperationTypesRepository()
        # Models tagged 'non_multi' cannot handle multi-output targets
        non_multi_models, _ = models_repo.suitable_operation(
            task_type=current_task, tags=['non_multi'])
        is_model_not_support_multi = self.operation_type in non_multi_models
        if is_model_not_support_multi and current_task == TaskTypesEnum.ts_forecasting:
            # Manually wrap the regressor into multi-output model
            operation_implementation = convert_to_multivariate_model(
                operation_implementation, train_data)
        else:
            operation_implementation.fit(train_data.features,
                                         train_data.target)
        return operation_implementation
コード例 #6
0
    def fit(self, train_data: InputData):
        """
        This method is used for operation training with the data provided
        :param InputData train_data: data used for operation training
        :return: trained cuML operation
        :raises NotImplementedError: for non-multi-output models on ts_forecasting
        """
        # Silence noisy RuntimeWarnings raised during model fitting
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        # Instantiate the wrapped implementation, forwarding tuned
        # parameters when they were supplied
        if self.params_for_fit:
            operation_implementation = self.operation_impl(
                **self.params_for_fit)
        else:
            operation_implementation = self.operation_impl()

        # If model doesn't support multi-output and current task is ts_forecasting
        current_task = train_data.task.task_type
        models_repo = OperationTypesRepository()
        # Models tagged 'non_multi' cannot handle multi-output targets
        non_multi_models, _ = models_repo.suitable_operation(
            task_type=current_task, tags=['non_multi'])
        is_model_not_support_multi = self.operation_type in non_multi_models
        # Convert inputs to GPU-resident cuDF containers as float32
        features = cudf.DataFrame(train_data.features.astype('float32'))
        target = cudf.Series(train_data.target.flatten().astype('float32'))

        if is_model_not_support_multi and current_task == TaskTypesEnum.ts_forecasting:
            raise NotImplementedError('Not supported for GPU yet')
            # TODO Manually wrap the regressor into multi-output model
        else:
            operation_implementation.fit(features, target)
        return operation_implementation
コード例 #7
0
def test_cv_min_kfolds_raise():
    """ Requirements with fewer CV folds than the minimum
    must raise ValueError """
    classification_task = Task(task_type=TaskTypesEnum.classification)
    repository = OperationTypesRepository()
    model_types, _ = repository.suitable_operation(
        task_type=classification_task.task_type, tags=['simple'])

    with pytest.raises(ValueError):
        GPComposerRequirements(primary=model_types,
                               secondary=model_types,
                               cv_folds=1)
コード例 #8
0
def test_cv_ts_and_cluster_raise():
    """ Composing with cross-validation on a clustering task
    must raise NotImplementedError """
    clustering_task = Task(task_type=TaskTypesEnum.clustering)
    train_data, _ = get_data(clustering_task)
    quality_metric = ClusteringMetricsEnum.silhouette

    repository = OperationTypesRepository()
    model_types, _ = repository.suitable_operation(
        task_type=clustering_task.task_type)
    requirements = GPComposerRequirements(primary=model_types,
                                          secondary=model_types,
                                          cv_folds=4)
    composer = GPComposerBuilder(clustering_task) \
        .with_requirements(requirements) \
        .with_metrics(quality_metric) \
        .build()

    with pytest.raises(NotImplementedError):
        composer.compose_pipeline(data=train_data, is_visualise=False)
コード例 #9
0
def has_no_conflicts_with_data_flow(pipeline: 'Pipeline'):
    """ Check if the pipeline contains incorrect connections between nodes """
    data_operations_repo = OperationTypesRepository(operation_type='data_operation')
    forbidden_combination, _ = data_operations_repo.suitable_operation()
    forbidden_combination = set(forbidden_combination)

    for node in pipeline.nodes:
        parents = node.nodes_from

        # Only nodes with several parents can form a conflict
        if parents is None or len(parents) <= 1:
            continue

        distinct_operations = {parent.operation.operation_type
                               for parent in parents}

        # All parents run the same operation - check whether combining
        # identical operations of this type is forbidden
        if len(distinct_operations) == 1:
            (operation_name,) = distinct_operations
            if operation_name in forbidden_combination:
                raise ValueError(
                    f'{ERROR_PREFIX} Pipeline has incorrect subgraph with identical data operations'
                )
    return True