Example #1
def test_evaluate_train_evaluate():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    # models
    clfs = []

    # fit samples
    num_samples = 16
    for i in range(3):
        samples = [h.sample() for _ in range(num_samples)]
        labels = [np.sum(sample) for sample in samples]
        x, y = samples, labels
        x, y = dataset.encode_dataset(x, y)
        model = xgb_utils.train_single_model(x, y)
        clfs.append(model)

    # test samples
    num_samples = 100
    samples = [h.sample() for _ in range(num_samples)]
    ex2, _ = dataset.encode_dataset(samples, None)

    preds = xgb_utils.evaluate_models(ex2, clfs)
    count = np.sum(preds)

    print(count)
    assert preds.shape == (num_samples,)
    assert count > 0
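
This test and the one below rely on imports and a `get_hyperparameter_list()` helper defined elsewhere in the test module. A minimal sketch of the imports these snippets assume, with module paths guessed from the pyshac package layout (treat the exact paths as assumptions):

import numpy as np

# Assumed pyshac module layout; adjust the import paths to the actual package.
from pyshac.config import hyperparameters as hp
from pyshac.config import data
from pyshac.utils import xgb_utils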
Example #2
def test_evaluate_single_sample():
    params = get_hyperparameter_list()
    h = hp.HyperParameterList(params)

    dataset = data.Dataset(h)

    # models
    clfs = []

    # fit samples
    num_samples = 16
    for i in range(3):
        samples = [h.sample() for _ in range(num_samples)]
        labels = [np.sum(sample) for sample in samples]
        x, y = samples, labels
        x, y = dataset.encode_dataset(x, y)
        model = xgb_utils.train_single_model(x, y)
        clfs.append(model)

    # single sample test
    sample = h.sample()
    ex2, _ = dataset.encode_dataset([sample])

    assert ex2.shape == (1, 3)

    pred = xgb_utils.evaluate_models(ex2, clfs)
    assert pred.shape == (1,)
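
The `ex2.shape == (1, 3)` assertion implies that `get_hyperparameter_list()` returns three hyperparameters, so each encoded sample is a vector of length 3. A hypothetical helper that would satisfy both tests; the parameter names, value ranges, and hyperparameter classes are assumptions based on pyshac's hyperparameter API:

def get_hyperparameter_list():
    # Three hyperparameters, so encode_dataset() produces arrays of shape (num_samples, 3).
    h1 = hp.DiscreteHyperParameter('h1', [0, 1, 2])
    h2 = hp.DiscreteHyperParameter('h2', [3, 4, 5, 6])
    h3 = hp.UniformContinuousHyperParameter('h3', 7, 10)
    return [h1, h2, h3]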
Example #3
    def _sample_parameters(self, relax_checks=False, max_classifiers=None):
        """
        Samples the underlying hyper parameters, checks if the sample passes through all
        of the classifiers, and only then submits it for evaluation.

        This is a very expensive process for a large number of classifiers, since on
        average it requires `2 ^ num_classifiers` samples for a single candidate to pass
        the cascade of classifier tests. At 18 classifiers, this is roughly 260,000
        samples, and may be several times more than this if outliers exist.

        This also modifies the numpy random state before it begins sampling. This is
        primarily because it is pointless to have multiple processors sampling using the
        same random seed.

        # Arguments:
            relax_checks (bool): If set, allows samples which do not pass all of the
                classifier checks. Can be useful when a large number of models are
                present and the remaining search space is not big enough for a sample
                to pass through all checks.
            max_classifiers (int | None): Number of classifiers to use for sampling.
                If set to None, will use all classifiers.

        # Returns:
            List of sampled parameter values.
        """
        # Re-seed the global numpy RNG so that parallel workers do not sample
        # with an identical random seed.
        np.random.seed(None)

        # If there are no classifiers, simply sample and pass through.
        if len(self.classifiers) == 0:
            sample = self.parameters.sample()
        else:
            # get the first sample and encode it.
            sample = self.parameters.sample()
            sample, _ = self.dataset.encode_dataset([sample],
                                                    objective=self.objective)

            # Limiting the number of classifiers makes `predict` faster.
            # Not used during training.
            if max_classifiers is None:
                available_clfs = self.classifiers
            else:
                available_clfs = self.classifiers[:max_classifiers]

            # compute the average number of samples needed for a single sample to
            # pass through the cascade of classifiers
            average_num = 2**len(available_clfs)
            max_checks = int(average_num)

            counter = 0
            checks_relaxation_counter = 0
            total_count = 0

            clf_count = len(available_clfs)

            # keep sampling and testing until a sample passes all checks
            while not xgb_utils.evaluate_models(sample, available_clfs,
                                                checks_relaxation_counter):
                sample = self.parameters.sample()
                sample, _ = self.dataset.encode_dataset(
                    [sample], objective=self.objective)
                counter += 1

                # notify users at intervals which are multiples of the average number
                # of samples.
                if counter >= max_checks:
                    total_count += max_checks

                    # If checks are relaxed, require approval from fewer classifiers in the next round
                    if relax_checks:
                        checks_relaxation_counter += 1

                        warnings.warn(
                            "Relaxing check to pass %d classifiers only" %
                            (clf_count - checks_relaxation_counter))

                    else:
                        # Otherwise, simply notify user that we could not find a sample
                        warnings.warn(
                            "Could not find a sample after %d checks. "
                            "You should consider using `relax_checks` to reduce "
                            "this constraint or wait it out." % (total_count))

                    counter = 0
                else:
                    counter += 1

            sample = sample[0].tolist()
            sample = self.dataset.decode_dataset([sample])
            sample = sample[0].tolist()

        return sample
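
The `2 ^ num_classifiers` estimate in the docstring comes from treating each classifier as accepting roughly half of the remaining search space: a candidate passes `k` independent checks with probability about `(1/2)^k`, so the expected number of draws before one passes the whole cascade is about `2^k` (for `k = 18`, roughly 260,000). A small self-contained simulation of that expectation, purely illustrative and not part of the library:

import numpy as np

def expected_draws_to_pass(num_classifiers, trials=2000, seed=0):
    # Assume each classifier independently accepts a candidate with probability 0.5,
    # so a draw survives the whole cascade with probability 0.5 ** num_classifiers.
    rng = np.random.RandomState(seed)
    pass_prob = 0.5 ** num_classifiers
    # The number of draws until the first success is geometric with mean 1 / pass_prob.
    draws = rng.geometric(pass_prob, size=trials)
    return draws.mean()

# With 5 classifiers the empirical mean is close to 2 ** 5 = 32 draws.
print(expected_draws_to_pass(5))

This is also why `relax_checks` helps: requiring one fewer classifier approval roughly halves the expected number of draws.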