Example #1
def test_hoeffding_tree_regressor_coverage():
    max_samples = 1000
    max_size_mb = 2

    stream = RegressionGenerator(
        n_samples=max_samples, n_features=10, n_informative=7, n_targets=1,
        random_state=42
    )
    X, y = stream.next_sample(max_samples)

    # Cover memory management
    tree = HoeffdingTreeRegressor(
        leaf_prediction='mean', grace_period=100,
        memory_estimate_period=100, max_byte_size=max_size_mb*2**20
    )
    tree.partial_fit(X, y)

    # A tree without memory management enabled reaches over 3 MB in size
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    # Purposeful typo in leaf_prediction: the model should fall back to a valid default
    tree = HoeffdingTreeRegressor(
        leaf_prediction='percptron', grace_period=100,
        memory_estimate_period=100, max_byte_size=max_size_mb*2**20
    )
    # Invalid split_criterion
    tree.split_criterion = 'VR'

    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    tree.reset()
    assert tree._estimator_type == 'regressor'
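
The assertions above compare a size reported in megabytes against a limit passed to the tree in bytes. A minimal sketch of that conversion, assuming calculate_object_size takes a unit argument ('byte', 'kB', 'MB') as the examples suggest:

from skmultiflow.utils import calculate_object_size

max_size_mb = 2
limit_bytes = max_size_mb * 2**20            # max_byte_size is given in bytes
size_mb = calculate_object_size(tree, 'MB')  # the same tree measured in MB
assert size_mb <= max_size_mb                # equivalent to a byte-level check against limit_bytes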
Example #2
def test_stacked_single_target_hoeffding_tree_regressor_coverage():
    max_samples = 1000
    max_size_mb = 2

    stream = RegressionGenerator(
        n_samples=max_samples, n_features=10, n_informative=7, n_targets=3,
        random_state=42
    )
    X, y = stream.next_sample(max_samples)

    # Will generate a warning concerning the invalid leaf prediction option
    tree = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='mean', grace_period=200,
        memory_estimate_period=100, max_byte_size=max_size_mb*2**20
    )

    # Trying to predict without fitting
    tree.predict(X[0])

    tree.partial_fit(X, y)

    # A tree without memory management enabled reaches over 3 MB in size
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    tree = StackedSingleTargetHoeffdingTreeRegressor(
        leaf_prediction='adaptive', grace_period=200,
        memory_estimate_period=100, max_byte_size=max_size_mb*2**20,
        learning_ratio_const=False
    )
    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb
Example #3
    def _estimate_model_byte_size(self):
        """ Calculate the size of the model and trigger the tracker function if the
        actual model size exceeds the max size set in the configuration."""
        learning_nodes = self._find_learning_nodes()
        total_active_size = 0
        total_inactive_size = 0
        for found_node in learning_nodes:
            if not found_node.node.is_leaf():  # Safety check for non-trivial tree structures
                continue
            if isinstance(found_node.node, ActiveLeaf):
                total_active_size += calculate_object_size(found_node.node)
            else:
                total_inactive_size += calculate_object_size(found_node.node)
        if total_active_size > 0:
            self._active_leaf_byte_size_estimate = total_active_size / self._active_leaf_node_cnt
        if total_inactive_size > 0:
            self._inactive_leaf_byte_size_estimate = \
                total_inactive_size / self._inactive_leaf_node_cnt
        actual_model_size = calculate_object_size(self)
        estimated_model_size = (
            self._active_leaf_node_cnt * self._active_leaf_byte_size_estimate
            + self._inactive_leaf_node_cnt * self._inactive_leaf_byte_size_estimate)
        self._byte_size_estimate_overhead_fraction = actual_model_size / estimated_model_size
        if actual_model_size > self.max_byte_size:
            self._enforce_tracker_limit()
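
The method above estimates model size from per-leaf averages and corrects for everything that is not a leaf through an overhead fraction. A toy calculation with invented numbers to illustrate the bookkeeping:

# All values below are hypothetical, for illustration only
active_cnt, inactive_cnt = 40, 10
active_avg, inactive_avg = 1200.0, 300.0    # average bytes per leaf, as computed in the loop above

estimated = active_cnt * active_avg + inactive_cnt * inactive_avg  # 51000 bytes attributed to leaves
actual = 63750                              # what calculate_object_size(self) might report
overhead_fraction = actual / estimated      # 1.25: internal nodes and other bookkeeping add 25 %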
Example #4
def test_hoeffding_tree_coverage():
    # Cover memory management
    max_samples = 5000
    max_size_kb = 50
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=10,
                                 n_cat_features=2,
                                 n_num_features=5,
                                 n_categories_per_cat_feature=5,
                                 max_tree_depth=15,
                                 min_leaf_depth=3,
                                 fraction_leaves_per_level=0.15)

    nominal_attr_idx = [x for x in range(5, stream.n_features)]
    # Unconstrained model has over 72 kB
    learner = HoeffdingTreeClassifier(nominal_attributes=nominal_attr_idx,
                                      leaf_prediction='mc',
                                      memory_estimate_period=100,
                                      max_byte_size=max_size_kb * 2**10)

    X, y = stream.next_sample(max_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()
Example #5
def test_extremely_fast_decision_tree_coverage():
    # Cover memory management
    max_size_kb = 20
    stream = SEAGenerator(random_state=1, noise_percentage=0.05)
    X, y = get_next_n_samples(stream, 5000)

    # Unconstrained model has over 50 kB
    learner = ExtremelyFastDecisionTreeClassifier(leaf_prediction='mc',
                                                  memory_estimate_period=200,
                                                  max_byte_size=max_size_kb * 2**10,
                                                  min_samples_reevaluate=2500)

    learner.partial_fit(X, y, classes=[0, 1])
    assert calculate_object_size(learner, 'kB') <= max_size_kb

    learner.reset()

    # Cover nominal attribute observer
    stream = RandomTreeGenerator(tree_random_state=23,
                                 sample_random_state=12,
                                 n_classes=2,
                                 n_cat_features=2,
                                 n_categories_per_cat_feature=4,
                                 n_num_features=1,
                                 max_tree_depth=30,
                                 min_leaf_depth=10,
                                 fraction_leaves_per_level=0.45)
    X, y = get_next_n_samples(stream, 5000)
    learner = ExtremelyFastDecisionTreeClassifier(
        leaf_prediction='nba', nominal_attributes=[i for i in range(1, 9)])
    learner.partial_fit(X, y, classes=[0, 1])
Example #6
    def train():
        # Pre training the classifier
        X, y = stream.next_sample(stats["pretrain_size"])
        do_pretraining = X.shape[0] > 0
        if ensemble:
            if isinstance(model, list):
                if do_pretraining:
                    logging.info("Pre-training models in ensemble...")
                    for m in model:
                        m.partial_fit(X, y, classes=stream.target_values[0])
                    model_pretrained = ensemble(model, stream)
                else:
                    model_pretrained = ensemble(model, stream)
            elif type(ensemble(model, stream)).__name__ == 'OzaBaggingMLClassifier':
                model_pretrained = ensemble(model, stream)
                if do_pretraining:
                    logging.info("Pre-training oza...")
                    model_pretrained.partial_fit(
                        X, y, classes=stream.target_values[0])
            else:
                if do_pretraining:
                    logging.info("Pre-training model in ensemble...")
                    model.partial_fit(X, y, classes=stream.target_values[0])
                model_pretrained = ensemble(model, stream)
        else:
            if do_pretraining:
                logging.info("Pre-training model...")
                model.partial_fit(X, y, classes=stream.target_values[0])
            model_pretrained = model

        # Keep track of the iteration count; true labels and predictions are
        # accumulated in the enclosing lists to later compute the classifier's
        # hamming score
        iterations = 0

        logging.info("Training...")
        while stream.has_more_samples():
            X, y = stream.next_sample(stats["batch_size"])
            y_pred = model_pretrained.predict(X)
            model_pretrained.partial_fit(X, y, classes=stream.target_values[0])
            predictions.extend(y_pred)
            true_labels.extend(y)
            if iterations % log_every_iterations == 0:
                logging.info("%s / %s trained samples.",
                             (iterations + 1) * stats["batch_size"],
                             stats["train_size"])
            iterations += 1
        end_time = time.time()
        logging.info("All samples trained successfully")
        stats["success"] = True
        stats["error"] = False
        stats["end_time"] = end_time
        stats["time_seconds"] = end_time - stats["start_time"]
        stats["model_size_kb"] = calculate_object_size(model_pretrained, "kB")
Example #7
    def measure_byte_size(self):
        """ Calculate the size of the tree.

        Returns
        -------
        int
            Size of the tree in bytes.

        """
        return calculate_object_size(self)
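
Assumed usage on a Hoeffding tree estimator that exposes this method; the stream is only a placeholder to produce training data:

from skmultiflow.data import SEAGenerator
from skmultiflow.trees import HoeffdingTreeClassifier

stream = SEAGenerator(random_state=1)
X, y = stream.next_sample(200)

tree = HoeffdingTreeClassifier()
tree.partial_fit(X, y, classes=[0, 1])
print(tree.measure_byte_size())    # same value as calculate_object_size(tree)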
Example #8
    def get_model_measurements(self):
        """Collect metrics corresponding to the current status of the model.

        Returns
        -------
        dict
            A dictionary containing the measurements of the model.
        """
        size = calculate_object_size(self)
        measurements = {'Number of rules: ': len(self.rule_set), 'model_size in bytes': size}
        return measurements
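
A hedged usage sketch, assuming a fitted rule-based learner that defines rule_set (e.g. VeryFastDecisionRulesClassifier); the data stream is a placeholder:

from skmultiflow.data import SEAGenerator
from skmultiflow.rules import VeryFastDecisionRulesClassifier

stream = SEAGenerator(random_state=1)
X, y = stream.next_sample(500)

learner = VeryFastDecisionRulesClassifier()
learner.partial_fit(X, y, classes=[0, 1])
for name, value in learner.get_model_measurements().items():
    print(name, value)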
Example #9
def test_isoup_tree_coverage():
    max_samples = 1000
    max_size_mb = 2

    stream = RegressionGenerator(n_samples=max_samples,
                                 n_features=10,
                                 n_informative=7,
                                 n_targets=3,
                                 random_state=42)

    # Cover memory management
    tree = iSOUPTreeRegressor(leaf_prediction='mean',
                              grace_period=200,
                              memory_estimate_period=100,
                              max_byte_size=max_size_mb * 2**20)
    # Invalid split_criterion
    tree.split_criterion = 'ICVR'

    X, y = stream.next_sample(max_samples)
    tree.partial_fit(X, y)

    # A tree without memory management enabled reaches over 3 MB in size
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    # Memory management in a tree with perceptron leaves (purposefully invalid casing in leaf_prediction)
    tree = iSOUPTreeRegressor(leaf_prediction='PERCEPTRON',
                              grace_period=200,
                              memory_estimate_period=100,
                              max_byte_size=max_size_mb * 2**20)
    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb

    # Memory management in a tree with adaptive leaves
    tree = iSOUPTreeRegressor(leaf_prediction='adaptive',
                              grace_period=200,
                              memory_estimate_period=100,
                              max_byte_size=max_size_mb * 2**20)

    tree.partial_fit(X, y)
    assert calculate_object_size(tree, 'MB') <= max_size_mb
Example #10
def test_label_combination_hoeffding_tree_coverage():
    # Cover memory management
    max_samples = 10000
    max_size_kb = 50
    stream = MultilabelGenerator(n_samples=10000,
                                 n_features=15,
                                 n_targets=3,
                                 n_labels=4,
                                 random_state=112)

    # Unconstrained model has over 62 kB
    learner = LabelCombinationHoeffdingTreeClassifier(
        n_labels=3,
        leaf_prediction='mc',
        memory_estimate_period=200,
        max_byte_size=max_size_kb * 2**10)

    X, y = stream.next_sample(max_samples)
    learner.partial_fit(X, y)

    assert calculate_object_size(learner, 'kB') <= max_size_kb
Example #11
    def measure_model_size(self, unit='byte'):
        return calculate_object_size(self, unit)
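
This is a thin wrapper around calculate_object_size; assumed usage on any estimator exposing the method:

print(model.measure_model_size())        # size in bytes, the default unit
print(model.measure_model_size('kB'))    # the same object reported in kilobytes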
Example #12
    def _update_metrics(self):
        """ Updates the metrics of interest. This function updates the evaluation data buffer
        which is used to track performance during evaluation.

        The content of the buffer depends on the evaluation task type and metrics selected.

        If more than one model/learner is evaluated at once, data is stored as lists inside
        the buffer.

        """
        shift = 0
        if self._method == 'prequential':
            shift = -self.batch_size  # Adjust index due to training after testing
        sample_id = self.global_sample_count + shift

        for metric in self.metrics:
            values = [[], []]
            if metric == constants.ACCURACY:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].accuracy_score())
                    values[1].append(
                        self.current_eval_measurements[i].accuracy_score())

            elif metric == constants.KAPPA:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].kappa_score())
                    values[1].append(
                        self.current_eval_measurements[i].kappa_score())

            elif metric == constants.KAPPA_T:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].kappa_t_score())
                    values[1].append(
                        self.current_eval_measurements[i].kappa_t_score())

            elif metric == constants.KAPPA_M:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].kappa_m_score())
                    values[1].append(
                        self.current_eval_measurements[i].kappa_m_score())

            elif metric == constants.HAMMING_SCORE:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].hamming_score())
                    values[1].append(
                        self.current_eval_measurements[i].hamming_score())

            elif metric == constants.HAMMING_LOSS:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].hamming_loss_score())
                    values[1].append(
                        self.current_eval_measurements[i].hamming_loss_score())

            elif metric == constants.EXACT_MATCH:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].exact_match_score())
                    values[1].append(
                        self.current_eval_measurements[i].exact_match_score())

            elif metric == constants.J_INDEX:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].jaccard_score())
                    values[1].append(
                        self.current_eval_measurements[i].jaccard_score())

            elif metric == constants.MSE:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].get_mean_square_error())
                    values[1].append(
                        self.current_eval_measurements[i].get_mean_square_error())

            elif metric == constants.MAE:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].get_average_error())
                    values[1].append(
                        self.current_eval_measurements[i].get_average_error())

            elif metric == constants.AMSE:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].get_average_mean_square_error())
                    values[1].append(
                        self.current_eval_measurements[i].get_average_mean_square_error())

            elif metric == constants.AMAE:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].get_average_absolute_error())
                    values[1].append(
                        self.current_eval_measurements[i].get_average_absolute_error())

            elif metric == constants.ARMSE:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].get_average_root_mean_square_error())
                    values[1].append(
                        self.current_eval_measurements[i].get_average_root_mean_square_error())

            elif metric == constants.F1_SCORE:
                for i in range(self.n_models):
                    values[0].append(self.mean_eval_measurements[i].f1_score())
                    values[1].append(
                        self.current_eval_measurements[i].f1_score())

            elif metric == constants.PRECISION:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].precision_score())
                    values[1].append(
                        self.current_eval_measurements[i].precision_score())

            elif metric == constants.RECALL:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].recall_score())
                    values[1].append(
                        self.current_eval_measurements[i].recall_score())

            elif metric == constants.GMEAN:
                for i in range(self.n_models):
                    values[0].append(
                        self.mean_eval_measurements[i].geometric_mean_score())
                    values[1].append(
                        self.current_eval_measurements[i].geometric_mean_score())

            elif metric == constants.TRUE_VS_PREDICTED:
                y_true = -1
                y_pred = []
                for i in range(self.n_models):
                    t, p = self.mean_eval_measurements[i].get_last()
                    y_true = t  # We only need to keep one true value
                    y_pred.append(p)
                values[0] = y_true
                for i in range(self.n_models):
                    values[1].append(y_pred[i])

            elif metric == constants.DATA_POINTS:
                target_values = self.stream.target_values
                features = {}  # Dictionary containing feature values, using index as key

                # Only track one model (the first) by default
                y_pred, p = self.mean_eval_measurements[0].get_last()

                X = self.stream.current_sample_x
                idx_1 = 0  # TODO let the user choose the feature indices of interest
                idx_2 = 1
                features[idx_1] = X[0][idx_1]
                features[idx_2] = X[0][idx_2]

                values = [None, None, None]
                values[0] = features
                values[1] = target_values
                values[2] = y_pred

            elif metric == constants.RUNNING_TIME:
                values = [[], [], []]
                for i in range(self.n_models):
                    values[0].append(
                        self.running_time_measurements[i].get_current_training_time())
                    values[1].append(
                        self.running_time_measurements[i].get_current_testing_time())
                    values[2].append(
                        self.running_time_measurements[i].get_current_total_running_time())

            elif metric == constants.MODEL_SIZE:
                values = []
                for i in range(self.n_models):
                    values.append(calculate_object_size(self.model[i], 'kB'))

            else:
                raise ValueError('Unknown metric {}'.format(metric))

            # Update buffer
            if metric == constants.TRUE_VS_PREDICTED:
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id=constants.Y_TRUE,
                                              value=values[0])
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id=constants.Y_PRED,
                                              value=values[1])
            elif metric == constants.DATA_POINTS:
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='X',
                                              value=values[0])
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='target_values',
                                              value=values[1])
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='predictions',
                                              value=values[2])
            elif metric == constants.RUNNING_TIME:
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='training_time',
                                              value=values[0])
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='testing_time',
                                              value=values[1])
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='total_running_time',
                                              value=values[2])
            elif metric == constants.MODEL_SIZE:
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id='model_size',
                                              value=values)
            else:
                # Default case, 'mean' and 'current' performance
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id=constants.MEAN,
                                              value=values[0])
                self._data_buffer.update_data(sample_id=sample_id,
                                              metric_id=metric,
                                              data_id=constants.CURRENT,
                                              value=values[1])

        # sample_id already includes the prequential shift computed above
        self._update_outputs(sample_id)
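
Every scalar metric in the chain above follows the same pattern: call one method on each mean and each current measurement object. A hypothetical table-driven sketch of that dispatch, with method names taken from the code above:

# Sketch only; a refactor idea, not part of the library
SCALAR_METRIC_METHODS = {
    constants.ACCURACY: 'accuracy_score',
    constants.KAPPA: 'kappa_score',
    constants.MSE: 'get_mean_square_error',
    constants.MAE: 'get_average_error',
    # ... remaining scalar metrics follow the same pattern
}

def _scalar_metric_values(self, metric):
    method = SCALAR_METRIC_METHODS[metric]
    mean = [getattr(m, method)() for m in self.mean_eval_measurements]
    current = [getattr(m, method)() for m in self.current_eval_measurements]
    return [mean, current]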