Example #1
def test_clamp(lower, value, upper):
    lower, upper = sorted((lower, upper))

    clamped = clamp(lower, value, upper)

    assert lower <= clamped <= upper

    if lower <= value <= upper:
        assert value == clamped
    if lower > value:
        assert clamped == lower
    if value > upper:
        assert clamped == upper
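
The property test above fully pins down clamp's contract: the result always lies in [lower, upper], equals value whenever value is already in range, and otherwise snaps to the bound that was violated. In Hypothesis such a test would normally be driven by a decorator along the lines of @given(st.integers(), st.integers(), st.integers()), which the snippet does not show. A minimal sketch of an implementation satisfying these properties, assuming the real clamp used by the engine code below behaves the same way:

def clamp(lower, value, upper):
    # Limit value to the inclusive range [lower, upper].
    # Assumes lower <= upper, which the test guarantees by sorting the bounds.
    return max(lower, min(value, upper))

# Worked checks mirroring the three assertions in the test:
assert clamp(0, 5, 10) == 5    # in range: returned unchanged
assert clamp(0, -3, 10) == 0   # below range: snaps to lower
assert clamp(0, 42, 10) == 10  # above range: snaps to upper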
Example #2
    def generate_new_examples(self):
        if Phase.generate not in self.settings.phases:
            return
        if self.interesting_examples:
            # The example database has failing examples from a previous run,
            # so we'd rather report that they're still failing ASAP than take
            # the time to look for additional failures.
            return

        self.debug("Generating new examples")

        zero_data = self.cached_test_function(hbytes(BUFFER_SIZE))
        if zero_data.status > Status.OVERRUN:
            self.__data_cache.pin(zero_data.buffer)

        if zero_data.status == Status.OVERRUN or (
            zero_data.status == Status.VALID and len(zero_data.buffer) * 2 > BUFFER_SIZE
        ):
            fail_health_check(
                self.settings,
                "The smallest natural example for your test is extremely "
                "large. This makes it difficult for Hypothesis to generate "
                "good examples, especially when trying to reduce failing ones "
                "at the end. Consider reducing the size of your data if it is "
                "of a fixed size. You could also fix this by improving how "
                "your data shrinks (see https://hypothesis.readthedocs.io/en/"
                "latest/data.html#shrinking for details), or by introducing "
                "default values inside your strategy. e.g. could you replace "
                "some arguments with their defaults by using "
                "one_of(none(), some_complex_strategy)?",
                HealthCheck.large_base_example,
            )

        self.health_check_state = HealthCheckState()

        def should_generate_more():
            # End the generation phase where we would have ended it if no bugs had
            # been found.  This reproduces the exit logic in `self.test_function`,
            # but with the important distinction that this clause will move on to
            # the shrinking phase having found one or more bugs, while the other
            # will exit having found zero bugs.
            if (
                self.valid_examples >= self.settings.max_examples
                or self.call_count >= max(self.settings.max_examples * 10, 1000)
                or (
                    self.best_examples_of_observed_targets
                    and self.valid_examples * 2 >= self.settings.max_examples
                    and self.should_optimise
                )
            ):  # pragma: no cover
                return False

            # If we haven't found a bug, keep looking. If we hit any limits on
            # the number of tests to run, that will raise an exception and stop
            # the run.
            if not self.interesting_examples:
                return True
            # If we've found a bug and won't report more than one, stop looking.
            elif not self.settings.report_multiple_bugs:
                return False
            assert self.first_bug_found_at <= self.last_bug_found_at <= self.call_count
            # Otherwise, keep searching for between ten and 'a heuristic' calls.
            # We cap 'calls after first bug' so errors are reported reasonably
            # soon even for tests that are allowed to run for a very long time,
            # or sooner if the latest half of our test effort has been fruitless.
            return self.call_count < MIN_TEST_CALLS or self.call_count < min(
                self.first_bug_found_at + 1000, self.last_bug_found_at * 2
            )

        # We attempt to use the size of the minimal generated test case starting
        # from a given novel prefix as a guideline to generate smaller test
        # cases for an initial period, by restricting ourselves to test cases
        # that are not much larger than it.
        #
        # Calculating the actual minimal generated test case is hard, so we
        # take a best guess that zero extending a prefix produces the minimal
        # test case starting with that prefix (this is true for our built in
        # strategies). This is only a reasonable thing to do if the resulting
        # test case is valid. If we regularly run into situations where it is
        # not valid then this strategy is a waste of time, so we want to
        # abandon it early. In order to do this we track how many times in a
        # row it has failed to work, and abort small test case generation when
        # it has failed too many times in a row.
        consecutive_zero_extend_is_invalid = 0

        while should_generate_more():
            prefix = self.generate_novel_prefix()
            assert len(prefix) <= BUFFER_SIZE

            # We control growth during initial example generation, for two
            # reasons:
            #
            # * It gives us an opportunity to find small examples early, which
            #   gives us a fast path for easy-to-find bugs.
            # * It avoids low probability events where we might end up
            #   generating very large examples during health checks, which
            #   on slower machines can trigger HealthCheck.too_slow.
            #
            # The heuristic we use is that we attempt to estimate the smallest
            # extension of this prefix, and limit the size to no more than
            # an order of magnitude larger than that. If we fail to estimate
            # the size accurately, we skip over this prefix and try again.
            #
            # We need to tune the example size based on the initial prefix,
            # because any fixed size might be too small, and any size based
            # on the strategy in general can fall afoul of strategies that
            # have very different sizes for different prefixes.
            small_example_cap = clamp(10, self.settings.max_examples // 10, 50)

            if (
                self.valid_examples <= small_example_cap
                and self.call_count <= 5 * small_example_cap
                and not self.interesting_examples
                and consecutive_zero_extend_is_invalid < 5
            ):
                minimal_example = self.cached_test_function(
                    prefix + hbytes(BUFFER_SIZE - len(prefix))
                )

                if minimal_example.status < Status.VALID:
                    consecutive_zero_extend_is_invalid += 1
                    continue

                consecutive_zero_extend_is_invalid = 0

                minimal_extension = len(minimal_example.buffer) - len(prefix)

                max_length = min(len(prefix) + minimal_extension * 10, BUFFER_SIZE)

                # We could end up in a situation where, even though the prefix was
                # novel when we generated it, not all possible continuations of it
                # are still novel, because we have since tried zero-extending it.
                # In order to avoid making redundant test calls, we rerun it in
                # simulation mode first. If this has a predictable result, then we
                # don't bother running the test function for real here. If, however,
                # we encounter some novel behaviour, we try again with the real test
                # function, starting from the new novel prefix that has been
                # discovered.
                try:
                    trial_data = self.new_conjecture_data(
                        prefix=prefix, max_length=max_length
                    )
                    self.tree.simulate_test_function(trial_data)
                    continue
                except PreviouslyUnseenBehaviour:
                    pass

                # If the simulation entered part of the tree that has been killed,
                # we don't want to run this.
                if trial_data.observer.killed:
                    continue

                # We might have hit the cap on number of examples we should
                # run when calculating the minimal example.
                if not should_generate_more():
                    break

                prefix = trial_data.buffer
            else:
                max_length = BUFFER_SIZE

            data = self.new_conjecture_data(prefix=prefix, max_length=max_length)

            self.test_function(data)

            # A thing that is often useful but rarely happens by accident is
            # to generate the same value at multiple different points in the
            # test case.
            #
            # Rather than make this the responsibility of individual strategies
            # we implement a small mutator that just takes parts of the test
            # case with the same label and tries replacing one of them with a
            # copy of the other and tries running it. If we've made a good
            # guess about what to put where, this will run a similar generated
            # test case with more duplication.
            if (
                # An OVERRUN doesn't have enough information about the test
                # case to mutate, so we just skip those.
                data.status >= Status.INVALID
                # This has a tendency to trigger some weird edge cases during
                # generation so we don't let it run until we're done with the
                # health checks.
                and self.health_check_state is None
            ):
                initial_calls = self.call_count
                failed_mutations = 0
                while (
                    should_generate_more()
                    # We implement fairly conservative checks for how long
                    # we should run mutation for, as it's generally not obvious
                    # how helpful it is for any given test case.
                    and self.call_count <= initial_calls + 5
                    and failed_mutations <= 5
                ):
                    groups = defaultdict(list)
                    for ex in data.examples:
                        groups[ex.label, ex.depth].append(ex)

                    groups = [v for v in groups.values() if len(v) > 1]

                    if not groups:
                        break

                    group = self.random.choice(groups)

                    ex1, ex2 = sorted(
                        self.random.sample(group, 2), key=lambda i: i.index
                    )
                    assert ex1.end <= ex2.start

                    replacements = [data.buffer[e.start : e.end] for e in [ex1, ex2]]

                    replacement = self.random.choice(replacements)

                    try:
                        # We attempt to replace both of the examples with
                        # whichever choice we made. Note that this might end
                        # up messing up and getting the example boundaries
                        # wrong - labels matching are only a best guess as to
                        # whether the two are equivalent - but it doesn't
                        # really matter. It may not achieve the desired result
                        # but it's still a perfectly acceptable choice sequence to try.
                        new_data = self.cached_test_function(
                            data.buffer[: ex1.start]
                            + replacement
                            + data.buffer[ex1.end : ex2.start]
                            + replacement
                            + data.buffer[ex2.end :],
                            # We set error_on_discard so that we don't end up
                            # entering parts of the tree we consider redundant
                            # and not worth exploring.
                            error_on_discard=True,
                            extend=BUFFER_SIZE,
                        )
                    except ContainsDiscard:
                        failed_mutations += 1
                        continue

                    if (
                        new_data.status >= data.status
                        and data.buffer != new_data.buffer
                        and all(
                            k in new_data.target_observations
                            and new_data.target_observations[k] >= v
                            for k, v in data.target_observations.items()
                        )
                    ):
                        data = new_data
                        failed_mutations = 0
                    else:
                        failed_mutations += 1
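
The mutation loop at the end of Example #2 performs its duplication purely by byte-buffer surgery: the bytes of one labelled example are written over both regions. The helper below is a self-contained restatement of just that splice; only the slicing expression comes from the snippet, while the function name and index arguments are hypothetical.

def splice_duplicate(buffer, ex1_start, ex1_end, ex2_start, ex2_end, replacement):
    # Write `replacement` over both example regions, mirroring the
    # data.buffer[: ex1.start] + ... expression in the snippet above.
    assert ex1_end <= ex2_start  # the two regions must not overlap
    return (
        buffer[:ex1_start]
        + replacement
        + buffer[ex1_end:ex2_start]
        + replacement
        + buffer[ex2_end:]
    )

# For example, duplicating the bytes of the second region into the first:
buf = bytes([1, 2, 3, 4, 5, 6])
assert splice_duplicate(buf, 0, 2, 4, 6, buf[4:6]) == bytes([5, 6, 3, 4, 5, 6])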
Example #3
    def generate_new_examples(self):
        if Phase.generate not in self.settings.phases:
            return
        if self.interesting_examples:
            # The example database has failing examples from a previous run,
            # so we'd rather report that they're still failing ASAP than take
            # the time to look for additional failures.
            return

        zero_data = self.cached_test_function(hbytes(BUFFER_SIZE))
        if zero_data.status > Status.OVERRUN:
            self.__data_cache.pin(zero_data.buffer)

        self.optimise_all(zero_data)

        if zero_data.status == Status.OVERRUN or (
                zero_data.status == Status.VALID
                and len(zero_data.buffer) * 2 > BUFFER_SIZE):
            fail_health_check(
                self.settings,
                "The smallest natural example for your test is extremely "
                "large. This makes it difficult for Hypothesis to generate "
                "good examples, especially when trying to reduce failing ones "
                "at the end. Consider reducing the size of your data if it is "
                "of a fixed size. You could also fix this by improving how "
                "your data shrinks (see https://hypothesis.readthedocs.io/en/"
                "latest/data.html#shrinking for details), or by introducing "
                "default values inside your strategy. e.g. could you replace "
                "some arguments with their defaults by using "
                "one_of(none(), some_complex_strategy)?",
                HealthCheck.large_base_example,
            )

        self.health_check_state = HealthCheckState()

        def should_generate_more():
            # If we haven't found a bug, keep looking.  We check this before
            # doing anything else as it's by far the most common case.
            if not self.interesting_examples:
                return True
            # If we've found a bug and won't report more than one, stop looking.
            elif not self.settings.report_multiple_bugs:
                return False
            assert self.first_bug_found_at <= self.last_bug_found_at <= self.call_count
            # End the generation phase where we would have ended it if no bugs had
            # been found.  This reproduces the exit logic in `self.test_function`,
            # but with the important distinction that this clause will move on to
            # the shrinking phase having found one or more bugs, while the other
            # will exit having found zero bugs.
            if (self.valid_examples >= self.settings.max_examples
                    or self.call_count >= max(self.settings.max_examples * 10,
                                              1000)):  # pragma: no cover
                return False
            # Otherwise, keep searching for between ten and 'a heuristic' calls.
            # We cap 'calls after first bug' so errors are reported reasonably
            # soon even for tests that are allowed to run for a very long time,
            # or sooner if the latest half of our test effort has been fruitless.
            return self.call_count < MIN_TEST_CALLS or self.call_count < min(
                self.first_bug_found_at + 1000, self.last_bug_found_at * 2)

        # GenerationParameters are a set of decisions we make that are global
        # to the whole test case, used to bias the data generation in various
        # ways. This is an approach very very loosely inspired by the paper
        # "Swarm testing." by Groce et al. in that it induces deliberate
        # correlation between otherwise independent decisions made during the
        # generation process.
        #
        # More importantly the generation is designed to make certain scenarios
        # more likely (e.g. small examples, duplicated values), which can help
        # or hurt in terms of finding interesting things. Whenever the result
        # of our generation is a bad test case, for whatever definition of
        # "bad" we like (currently, invalid or too large), we ditch the
        # parameter early. This allows us to potentially generate good test
        # cases significantly more often than we otherwise would, by selecting
        # for parameters that make them more likely.
        parameter = GenerationParameters(self.random)
        count = 0

        # We attempt to use the size of the minimal generated test case starting
        # from a given novel prefix as a guideline to generate smaller test
        # cases for an initial period, by restricting ourselves to test cases
        # that are not much larger than it.
        #
        # Calculating the actual minimal generated test case is hard, so we
        # take a best guess that zero extending a prefix produces the minimal
        # test case starting with that prefix (this is true for our built in
        # strategies). This is only a reasonable thing to do if the resulting
        # test case is valid. If we regularly run into situations where it is
        # not valid then this strategy is a waste of time, so we want to
        # abandon it early. In order to do this we track how many times in a
        # row it has failed to work, and abort small test case generation when
        # it has failed too many times in a row.
        consecutive_zero_extend_is_invalid = 0

        while should_generate_more():
            prefix = self.generate_novel_prefix()
            assert len(prefix) <= BUFFER_SIZE

            # We control growth during initial example generation, for two
            # reasons:
            #
            # * It gives us an opportunity to find small examples early, which
            #   gives us a fast path for easy-to-find bugs.
            # * It avoids low probability events where we might end up
            #   generating very large examples during health checks, which
            #   on slower machines can trigger HealthCheck.too_slow.
            #
            # The heuristic we use is that we attempt to estimate the smallest
            # extension of this prefix, and limit the size to no more than
            # an order of magnitude larger than that. If we fail to estimate
            # the size accurately, we skip over this prefix and try again.
            #
            # We need to tune the example size based on the initial prefix,
            # because any fixed size might be too small, and any size based
            # on the strategy in general can fall afoul of strategies that
            # have very different sizes for different prefixes.
            small_example_cap = clamp(10, self.settings.max_examples // 10, 50)

            if (self.valid_examples <= small_example_cap
                    and self.call_count <= 5 * small_example_cap
                    and not self.interesting_examples
                    and consecutive_zero_extend_is_invalid < 5):
                minimal_example = self.cached_test_function(
                    prefix + hbytes(BUFFER_SIZE - len(prefix)))

                if minimal_example.status < Status.VALID:
                    consecutive_zero_extend_is_invalid += 1
                    continue

                consecutive_zero_extend_is_invalid = 0

                minimal_extension = len(minimal_example.buffer) - len(prefix)

                max_length = min(
                    len(prefix) + minimal_extension * 10, BUFFER_SIZE)

                # We could end up in a situation where, even though the prefix was
                # novel when we generated it, not all possible continuations of it
                # are still novel, because we have since tried zero-extending it.
                # In order to avoid making redundant test calls, we rerun it in
                # simulation mode first. If this has a predictable result, then we
                # don't bother running the test function for real here. If, however,
                # we encounter some novel behaviour, we try again with the real test
                # function, starting from the new novel prefix that has been
                # discovered.
                try:
                    trial_data = self.new_conjecture_data(
                        prefix=prefix,
                        parameter=parameter,
                        max_length=max_length)
                    self.tree.simulate_test_function(trial_data)
                    continue
                except PreviouslyUnseenBehaviour:
                    pass

                # If the simulation entered part of the tree that has been killed,
                # we don't want to run this.
                if trial_data.observer.killed:
                    continue

                # We might have hit the cap on number of examples we should
                # run when calculating the minimal example.
                if not should_generate_more():
                    break

                prefix = trial_data.buffer
            else:
                max_length = BUFFER_SIZE

            data = self.new_conjecture_data(prefix=prefix,
                                            parameter=parameter,
                                            max_length=max_length)

            self.test_function(data)

            self.optimise_all(data)

            count += 1
            if (data.status < Status.VALID
                    or len(data.buffer) * 2 >= BUFFER_SIZE or count > 5):
                count = 0
                parameter = GenerationParameters(self.random)
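
The "between ten and 'a heuristic' calls" stopping rule in should_generate_more (Examples #2 and #3) is easiest to read as a pure function of the call counters. The sketch below restates only that final return statement; the function name is illustrative, and MIN_TEST_CALLS is assumed to be 10 here to match the "ten" in the comment.

def keep_searching_after_bug(
    call_count, first_bug_found_at, last_bug_found_at, min_test_calls=10
):
    # Mirrors the final return of should_generate_more for the case where at
    # least one bug has been found and report_multiple_bugs is enabled.
    return call_count < min_test_calls or call_count < min(
        first_bug_found_at + 1000, last_bug_found_at * 2
    )

# With the first bug found at call 30 and the last at call 40, generation
# continues up to call 79 (bounded by last_bug_found_at * 2 == 80), well
# before the first_bug_found_at + 1000 cap:
assert keep_searching_after_bug(79, 30, 40)
assert not keep_searching_after_bug(80, 30, 40)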
Example #4
    def generate_new_examples(self):
        if Phase.generate not in self.settings.phases:
            return
        if self.interesting_examples:
            # The example database has failing examples from a previous run,
            # so we'd rather report that they're still failing ASAP than take
            # the time to look for additional failures.
            return

        self.debug("Generating new examples")

        assert self.should_generate_more()
        zero_data = self.cached_test_function(bytes(BUFFER_SIZE))
        if zero_data.status > Status.OVERRUN:
            self.__data_cache.pin(zero_data.buffer)

        if zero_data.status == Status.OVERRUN or (
            zero_data.status == Status.VALID and len(zero_data.buffer) * 2 > BUFFER_SIZE
        ):
            fail_health_check(
                self.settings,
                "The smallest natural example for your test is extremely "
                "large. This makes it difficult for Hypothesis to generate "
                "good examples, especially when trying to reduce failing ones "
                "at the end. Consider reducing the size of your data if it is "
                "of a fixed size. You could also fix this by improving how "
                "your data shrinks (see https://hypothesis.readthedocs.io/en/"
                "latest/data.html#shrinking for details), or by introducing "
                "default values inside your strategy. e.g. could you replace "
                "some arguments with their defaults by using "
                "one_of(none(), some_complex_strategy)?",
                HealthCheck.large_base_example,
            )

        self.health_check_state = HealthCheckState()

        # We attempt to use the size of the minimal generated test case starting
        # from a given novel prefix as a guideline to generate smaller test
        # cases for an initial period, by restricting ourselves to test cases
        # that are not much larger than it.
        #
        # Calculating the actual minimal generated test case is hard, so we
        # take a best guess that zero extending a prefix produces the minimal
        # test case starting with that prefix (this is true for our built in
        # strategies). This is only a reasonable thing to do if the resulting
        # test case is valid. If we regularly run into situations where it is
        # not valid then this strategy is a waste of time, so we want to
        # abandon it early. In order to do this we track how many times in a
        # row it has failed to work, and abort small test case generation when
        # it has failed too many times in a row.
        consecutive_zero_extend_is_invalid = 0

        # We control growth during initial example generation, for two
        # reasons:
        #
        # * It gives us an opportunity to find small examples early, which
        #   gives us a fast path for easy-to-find bugs.
        # * It avoids low probability events where we might end up
        #   generating very large examples during health checks, which
        #   on slower machines can trigger HealthCheck.too_slow.
        #
        # The heuristic we use is that we attempt to estimate the smallest
        # extension of this prefix, and limit the size to no more than
        # an order of magnitude larger than that. If we fail to estimate
        # the size accurately, we skip over this prefix and try again.
        #
        # We need to tune the example size based on the initial prefix,
        # because any fixed size might be too small, and any size based
        # on the strategy in general can fall afoul of strategies that
        # have very different sizes for different prefixes.
        small_example_cap = clamp(10, self.settings.max_examples // 10, 50)

        optimise_at = max(self.settings.max_examples // 2, small_example_cap + 1)
        ran_optimisations = False

        while self.should_generate_more():
            prefix = self.generate_novel_prefix()
            assert len(prefix) <= BUFFER_SIZE
            if (
                self.valid_examples <= small_example_cap
                and self.call_count <= 5 * small_example_cap
                and not self.interesting_examples
                and consecutive_zero_extend_is_invalid < 5
            ):
                minimal_example = self.cached_test_function(
                    prefix + bytes(BUFFER_SIZE - len(prefix))
                )

                if minimal_example.status < Status.VALID:
                    consecutive_zero_extend_is_invalid += 1
                    continue

                consecutive_zero_extend_is_invalid = 0

                minimal_extension = len(minimal_example.buffer) - len(prefix)

                max_length = min(len(prefix) + minimal_extension * 10, BUFFER_SIZE)

                # We could end up in a situation where, even though the prefix was
                # novel when we generated it, not all possible continuations of it
                # are still novel, because we have since tried zero-extending it.
                # In order to avoid making redundant test calls, we rerun it in
                # simulation mode first. If this has a predictable result, then we
                # don't bother running the test function for real here. If, however,
                # we encounter some novel behaviour, we try again with the real test
                # function, starting from the new novel prefix that has been
                # discovered.
                try:
                    trial_data = self.new_conjecture_data(
                        prefix=prefix, max_length=max_length
                    )
                    self.tree.simulate_test_function(trial_data)
                    continue
                except PreviouslyUnseenBehaviour:
                    pass

                # If the simulation entered part of the tree that has been killed,
                # we don't want to run this.
                if trial_data.observer.killed:
                    continue

                # We might have hit the cap on number of examples we should
                # run when calculating the minimal example.
                if not self.should_generate_more():
                    break

                prefix = trial_data.buffer
            else:
                max_length = BUFFER_SIZE

            data = self.new_conjecture_data(prefix=prefix, max_length=max_length)

            self.test_function(data)

            self.generate_mutations_from(data)

            # Although the optimisations are logically a distinct phase, we
            # actually normally run them as part of example generation. The
            # reason for this is that we cannot guarantee that optimisation
            # actually exhausts our budget: it might finish running and we
            # might discover that we could still run many more test cases
            # if we wanted to.
            if (
                self.valid_examples >= max(small_example_cap, optimise_at)
                and not ran_optimisations
            ):
                ran_optimisations = True
                self.optimise_targets()
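
The thresholds in Example #4 are plain arithmetic on settings.max_examples, so their scaling can be checked directly. The values below are worked examples computed from the expressions in the snippet, reusing the clamp sketch from Example #1; they are illustrations rather than documented constants.

def clamp(lower, value, upper):
    return max(lower, min(value, upper))

for max_examples in (20, 100, 1000):
    small_example_cap = clamp(10, max_examples // 10, 50)
    optimise_at = max(max_examples // 2, small_example_cap + 1)
    print(max_examples, small_example_cap, optimise_at)

# max_examples=20   -> small_example_cap=10, optimise_at=11
# max_examples=100  -> small_example_cap=10, optimise_at=50
# max_examples=1000 -> small_example_cap=50, optimise_at=500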