Example #1
def test_pca_model_removes_redundant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redundant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    _, train_predicted = pca.fit(data=train_data)

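    # PCA is expected to compress away the redundant columns, so the
    # transformed output must be narrower than the 100-feature input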
    assert train_predicted.shape[1] < data.features.shape[1]
Example #2
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    kmeans = Model(model_type='kmeans')

    _, train_predicted = kmeans.fit(data=train_data)

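    # the fitted k-means model is expected to split the training data
    # into exactly two clusters, labelled 0 and 1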
    assert all(np.unique(train_predicted) == [0, 1])
Example #3
def test_random_forest_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    random_forest = Model(model_type='rf')

    _, train_predicted = random_forest.fit(data=train_data)

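    # an untuned random forest should fit its own training set almost perfectly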
    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Example #4
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type='logit')

    _, train_predicted = log_reg.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
Example #5
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

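    # fine-tuning is stochastic, so collect the tuned ROC AUC over several runs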
    roc_on_test_tuned_list = []
    for _ in range(3):
        knn_for_tune = Model(model_type='knn')
        model, _ = knn_for_tune.fine_tune(data=train_data,
                                          iterations=10,
                                          max_lead_time=timedelta(minutes=1))

        test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                    data=test_data)

        roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                    y_score=test_predicted_tuned)

        roc_on_test_tuned_list.append(roc_on_test_tuned)

    roc_threshold = 0.6
    # at least one tuned run should match or beat the baseline score,
    # and the baseline itself must clear the threshold; the original
    # `.any() >= ...` chained comparison collapsed to a bool before comparing
    assert (np.array(roc_on_test_tuned_list) >= roc_on_test).any()
    assert roc_on_test > roc_threshold
Example #6
def test_scoring_logreg_tune_correct(data_fixture, request):
    train_data, test_data = request.getfixturevalue(data_fixture)

    train_data.features = ScalingWithImputation().fit(
        train_data.features).apply(train_data.features)
    test_data.features = ScalingWithImputation().fit(test_data.features).apply(
        test_data.features)

    logreg = Model(model_type='logit')

    model, _ = logreg.fit(train_data)
    test_predicted = logreg.predict(fitted_model=model, data=test_data)

    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    logreg_for_tune = Model(model_type='logit')

    model_tuned, _ = logreg_for_tune.fine_tune(
        data=train_data, iterations=50, max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = logreg_for_tune.predict(fitted_model=model_tuned,
                                                   data=test_data)

    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.6

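    # rounding to two decimals tolerates small stochastic differences
    # between the tuned and the baseline scores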
    assert round(test_roc_auc_tuned, 2) >= round(test_roc_auc,
                                                 2) > roc_threshold
Example #7
    def __init__(self, nodes_from: Optional[List['Node']],
                 model_type: Union[str, 'Model'],
                 manual_preprocessing_func: Optional[Callable] = None,
                 log=None):
        self.nodes_from = nodes_from
        self.cache = FittedModelCache(self)
        self.manual_preprocessing_func = manual_preprocessing_func
        self.log = log if log else default_log(__name__)

        if not isinstance(model_type, str):
            self.model = model_type
        else:
            self.model = Model(model_type=model_type)
Example #8
def test_node_factory_log_reg_correct(data_setup):
    model_type = 'logit'
    node = PrimaryNode(model_type=model_type)

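    # a PrimaryNode built from a type string should wrap the same Model class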
    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
Example #9
def test_arima_tune_correct():
    data = get_synthetic_ts_data_period()
    train_data, test_data = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    model, _ = arima_for_tune.fine_tune(data=train_data,
                                        iterations=5,
                                        max_lead_time=timedelta(minutes=0.1))

    test_predicted_tuned = arima_for_tune.predict(fitted_model=model,
                                                  data=test_data)

    rmse_on_test_tuned = mse(y_true=test_data.target,
                             y_pred=test_predicted_tuned,
                             squared=False)

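    # a useful forecast must beat the naive baseline of predicting the mean,
    # whose RMSE equals the standard deviation of the target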
    rmse_threshold = np.std(test_data.target)

    assert rmse_on_test_tuned < rmse_threshold
Example #10
def test_classification_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    knn_for_tune = Model(model_type='knn')
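    # manually overriding the hyperparameters should change the predictions
    # relative to the model fitted with default parameters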
    knn_for_tune.params = {'n_neighbors': 1}
    model, _ = knn_for_tune.fit(data=train_data)

    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
Example #11
def test_logger_write_logs_correctly():
    test_file_path = str(os.path.dirname(__file__))
    test_log_file = os.path.join(test_file_path, 'test_log.log')
    test_log = default_log('test_log', log_file=test_log_file)

    # Model data preparation
    file = os.path.join('../data', 'advanced_classification.csv')
    data = InputData.from_csv(os.path.join(test_file_path, file))
    train_data, test_data = train_test_data_setup(data=data)

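    # 'knnreg' is a regression model, so fitting it on classification data
    # is expected to fail and leave an error message in the log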
    try:
        knn = Model(model_type='knnreg', log=test_log)
        model, _ = knn.fit(data=train_data)
    except Exception:
        print('Captured error')

    content = []
    if os.path.exists(test_log_file):
        with open(test_log_file, 'r') as log_file:
            content = log_file.readlines()

    release_log(logger=test_log, log_file=test_log_file)
    assert 'Can not find evaluation strategy' in content[0]
Example #12
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
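    """Instantiate a Model for every template in the chain and, unless
    skip_fit is set, fit each one on freshly generated synthetic data."""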
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {
                'informative': features_amount,
                'redundant': 0,
                'repeated': 0,
                'clusters_per_class': 1
            }
            features, target = synthetic_dataset(
                samples_amount=samples,
                features_amount=features_amount,
                classes_amount=classes,
                features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features,
                               target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
Example #13
def test_pca_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    model, _ = pca.fit(data=train_data)
    test_predicted = pca.predict(fitted_model=model, data=test_data)

    pca_for_tune = Model(model_type='pca_data_model')

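    # a different solver and explained-variance thresholds should change the
    # number of retained components, and therefore the predictions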
    pca_for_tune.params = {
        'svd_solver': 'randomized',
        'iterated_power': 'auto',
        'dim_reduction_expl_thr': 0.7,
        'dim_reduction_min_expl': 0.001
    }

    model, _ = pca_for_tune.fit(data=train_data)
    test_predicted_tuned = pca_for_tune.predict(fitted_model=model,
                                                data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
Example #14
def test_rf_class_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(
        data.features)
    train_data, test_data = train_test_data_setup(data=data)

    rf = Model(model_type='rf')

    model, _ = rf.fit(train_data)
    test_predicted = rf.predict(fitted_model=model, data=test_data)

    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    model_tuned, _ = rf.fine_tune(data=train_data,
                                  iterations=12,
                                  max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = rf.predict(fitted_model=model_tuned, data=test_data)

    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)
    roc_threshold = 0.7

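    # tuning must not degrade the baseline score, and the tuned model
    # must also clear the absolute threshold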
    assert test_roc_auc_tuned >= test_roc_auc
    assert test_roc_auc_tuned > roc_threshold
Example #15
class Node(ABC):
    """
    Base class for Node definition in Chain structure

    :param nodes_from: parent nodes from which the information comes
    :param model_type: str type of the model defined in model repository
    :param manual_preprocessing_func: optional function for data preprocessing.
    If not defined, one of the available preprocessing strategies is used. \
    See the `preprocessors <https://github.com/nccr-itmo/FEDOT/blob/master/core/models/preprocessing.py>`__
    :param log: Log object to record messages
    """

    def __init__(self, nodes_from: Optional[List['Node']],
                 model_type: Union[str, 'Model'],
                 manual_preprocessing_func: Optional[Callable] = None,
                 log=None):
        self.nodes_from = nodes_from
        self.cache = FittedModelCache(self)
        self.manual_preprocessing_func = manual_preprocessing_func
        self.log = log if log else default_log(__name__)

        if not isinstance(model_type, str):
            self.model = model_type
        else:
            self.model = Model(model_type=model_type)

    @property
    def descriptive_id(self):
        return self._descriptive_id_recursive(visited_nodes=[])

    def _descriptive_id_recursive(self, visited_nodes):
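        """Build an identifier of the subtree rooted at this node; the parent
        order is normalized by sorting and cycles are reported as 'ID_CYCLED'."""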
        node_label = self.model.description
        if self.manual_preprocessing_func:
            node_label = f'{node_label}_custom_preprocessing={self.manual_preprocessing_func.__name__}'
        full_path = ''
        if self in visited_nodes:
            return 'ID_CYCLED'
        visited_nodes.append(self)
        if self.nodes_from:
            previous_items = []
            for parent_node in self.nodes_from:
                previous_items.append(f'{parent_node._descriptive_id_recursive(copy(visited_nodes))};')
            previous_items.sort()
            previous_items_str = ';'.join(previous_items)

            full_path += f'({previous_items_str})'
        full_path += f'/{node_label}'
        return full_path

    @property
    def model_tags(self) -> List[str]:
        return self.model.metadata.tags

    def output_from_prediction(self, input_data, prediction):
        return OutputData(idx=input_data.idx,
                          features=input_data.features,
                          predict=prediction, task=input_data.task,
                          data_type=self.model.output_datatype(input_data.data_type))

    def _transform(self, input_data: InputData):
        transformed_data = transformation_function_for_data(
            input_data_type=input_data.data_type,
            required_data_types=self.model.metadata.input_types)(input_data)
        return transformed_data

    def _preprocess(self, data: InputData):
        preprocessing_func = preprocessing_func_for_data(data, self)

        if not self.cache.actual_cached_state:
            # if fitted preprocessor not found in cache
            preprocessing_strategy = \
                preprocessing_func().fit(data.features)
        else:
            # if fitted preprocessor already exists
            preprocessing_strategy = self.cache.actual_cached_state.preprocessor

        data.features = preprocessing_strategy.apply(data.features)

        return data, preprocessing_strategy

    def fit(self, input_data: InputData, verbose=False) -> OutputData:
        """
        Run training process in the node

        :param input_data: data used for model training
        :param verbose: flag used for status printing to console, default False
        """
        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        if not self.cache.actual_cached_state:
            if verbose:
                print('Cache is not up to date')

            cached_model, model_predict = self.model.fit(data=preprocessed_data)
            self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                          model=cached_model))
        else:
            if verbose:
                print('Model was obtained from cache')

            model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                               data=preprocessed_data)

        return self.output_from_prediction(input_data, model_predict)

    def predict(self, input_data: InputData, output_mode: str = 'default', verbose=False) -> OutputData:
        """
        Run prediction process in the node

        :param input_data: data used for prediction
        :param output_mode: desired output for models (e.g. labels, probs, full_probs)
        :param verbose: flag used for status printing to console, default False
        """
        transformed = self._transform(input_data)
        preprocessed_data, _ = self._preprocess(transformed)

        if not self.cache.actual_cached_state:
            raise ValueError('Model must be fitted before predict')

        model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                           data=preprocessed_data, output_mode=output_mode)

        return self.output_from_prediction(input_data, model_predict)

    def fine_tune(self, input_data: InputData,
                  max_lead_time: timedelta = timedelta(minutes=5), iterations: int = 30):
        """
        Run the process of hyperparameter optimization for the node

        :param input_data: data used for tuning
        :param iterations: max number of iterations
        :param max_lead_time: max time available for tuning process
        """

        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        fitted_model, _ = self.model.fine_tune(preprocessed_data,
                                               max_lead_time=max_lead_time,
                                               iterations=iterations)

        self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                      model=fitted_model))

    def __str__(self):
        model = f'{self.model}'
        return model

    @property
    def ordered_subnodes_hierarchy(self) -> List['Node']:
        nodes = [self]
        if self.nodes_from:
            for parent in self.nodes_from:
                nodes += parent.ordered_subnodes_hierarchy
        return nodes

    @property
    def custom_params(self) -> dict:
        return self.model.params

    @custom_params.setter
    def custom_params(self, params):
        if params:
            self.model.params = params

    # Two subclass constructors, listed here without their class headers; they
    # apparently target an older Node signature that accepted a ready `model`.
    # The first variant has parent nodes, the second is a root (primary) node.
    def __init__(self, nodes_from: Optional[List['Node']],
                 model_type: ModelTypesIdsEnum):
        model = Model(model_type=model_type)
        nodes_from = [] if nodes_from is None else nodes_from
        super().__init__(nodes_from=nodes_from, model=model)

    def __init__(self, model_type: ModelTypesIdsEnum):
        model = Model(model_type=model_type)
        super().__init__(nodes_from=None, model=model)