def test_clamp(lower, value, upper):
    lower, upper = sorted((lower, upper))

    clamped = clamp(lower, value, upper)

    assert lower <= clamped <= upper

    if lower <= value <= upper:
        assert value == clamped
    if lower > value:
        assert clamped == lower
    if value > upper:
        assert clamped == upper
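# The `clamp` helper exercised by test_clamp is not defined in this snippet.
# A minimal sketch consistent with the properties asserted above (an
# assumption for illustration, not necessarily the real implementation) is:


def clamp(lower, value, upper):
    """Return value bounded so that lower <= result <= upper.

    Assumes lower <= upper, which test_clamp guarantees by sorting its bounds.
    """
    return max(lower, min(value, upper))


# For example, clamp(10, 3, 50) == 10, clamp(10, 25, 50) == 25, and
# clamp(10, 120, 50) == 50.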
def generate_new_examples(self):
    if Phase.generate not in self.settings.phases:
        return

    if self.interesting_examples:
        # The example database has failing examples from a previous run,
        # so we'd rather report that they're still failing ASAP than take
        # the time to look for additional failures.
        return

    self.debug("Generating new examples")

    zero_data = self.cached_test_function(hbytes(BUFFER_SIZE))
    if zero_data.status > Status.OVERRUN:
        self.__data_cache.pin(zero_data.buffer)

    if zero_data.status == Status.OVERRUN or (
        zero_data.status == Status.VALID and len(zero_data.buffer) * 2 > BUFFER_SIZE
    ):
        fail_health_check(
            self.settings,
            "The smallest natural example for your test is extremely "
            "large. This makes it difficult for Hypothesis to generate "
            "good examples, especially when trying to reduce failing ones "
            "at the end. Consider reducing the size of your data if it is "
            "of a fixed size. You could also fix this by improving how "
            "your data shrinks (see https://hypothesis.readthedocs.io/en/"
            "latest/data.html#shrinking for details), or by introducing "
            "default values inside your strategy. e.g. could you replace "
            "some arguments with their defaults by using "
            "one_of(none(), some_complex_strategy)?",
            HealthCheck.large_base_example,
        )

    self.health_check_state = HealthCheckState()

    def should_generate_more():
        # End the generation phase where we would have ended it if no bugs had
        # been found. This reproduces the exit logic in `self.test_function`,
        # but with the important distinction that this clause will move on to
        # the shrinking phase having found one or more bugs, while the other
        # will exit having found zero bugs.
        if (
            self.valid_examples >= self.settings.max_examples
            or self.call_count >= max(self.settings.max_examples * 10, 1000)
            or (
                self.best_examples_of_observed_targets
                and self.valid_examples * 2 >= self.settings.max_examples
                and self.should_optimise
            )
        ):  # pragma: no cover
            return False

        # If we haven't found a bug, keep looking - if we hit any limits on
        # the number of tests to run that will raise an exception and stop
        # the run.
        if not self.interesting_examples:
            return True
        # If we've found a bug and won't report more than one, stop looking.
        elif not self.settings.report_multiple_bugs:
            return False

        assert self.first_bug_found_at <= self.last_bug_found_at <= self.call_count

        # Otherwise, keep searching for between ten and 'a heuristic' calls.
        # We cap 'calls after first bug' so errors are reported reasonably
        # soon even for tests that are allowed to run for a very long time,
        # or sooner if the latest half of our test effort has been fruitless.
        return self.call_count < MIN_TEST_CALLS or self.call_count < min(
            self.first_bug_found_at + 1000, self.last_bug_found_at * 2
        )

    # We attempt to use the size of the minimal generated test case starting
    # from a given novel prefix as a guideline to generate smaller test
    # cases for an initial period, by restricting ourselves to test cases
    # that are not much larger than it.
    #
    # Calculating the actual minimal generated test case is hard, so we
    # take a best guess that zero extending a prefix produces the minimal
    # test case starting with that prefix (this is true for our built in
    # strategies). This is only a reasonable thing to do if the resulting
    # test case is valid. If we regularly run into situations where it is
    # not valid then this strategy is a waste of time, so we want to
    # abandon it early.
    # In order to do this we track how many times in a row it has failed
    # to work, and abort small test case generation when it has failed too
    # many times in a row.
    consecutive_zero_extend_is_invalid = 0

    while should_generate_more():
        prefix = self.generate_novel_prefix()
        assert len(prefix) <= BUFFER_SIZE

        # We control growth during initial example generation, for two
        # reasons:
        #
        # * It gives us an opportunity to find small examples early, which
        #   gives us a fast path for easy to find bugs.
        # * It avoids low probability events where we might end up
        #   generating very large examples during health checks, which
        #   on slower machines can trigger HealthCheck.too_slow.
        #
        # The heuristic we use is that we attempt to estimate the smallest
        # extension of this prefix, and limit the size to no more than
        # an order of magnitude larger than that. If we fail to estimate
        # the size accurately, we skip over this prefix and try again.
        #
        # We need to tune the example size based on the initial prefix,
        # because any fixed size might be too small, and any size based
        # on the strategy in general can fall afoul of strategies that
        # have very different sizes for different prefixes.
        small_example_cap = clamp(10, self.settings.max_examples // 10, 50)
        if (
            self.valid_examples <= small_example_cap
            and self.call_count <= 5 * small_example_cap
            and not self.interesting_examples
            and consecutive_zero_extend_is_invalid < 5
        ):
            minimal_example = self.cached_test_function(
                prefix + hbytes(BUFFER_SIZE - len(prefix))
            )

            if minimal_example.status < Status.VALID:
                consecutive_zero_extend_is_invalid += 1
                continue

            consecutive_zero_extend_is_invalid = 0

            minimal_extension = len(minimal_example.buffer) - len(prefix)

            max_length = min(len(prefix) + minimal_extension * 10, BUFFER_SIZE)

            # We could end up in a situation where even though the prefix was
            # novel when we generated it, because we've now tried zero extending
            # it not all possible continuations of it will be novel. In order to
            # avoid making redundant test calls, we rerun it in simulation mode
            # first. If this has a predictable result, then we don't bother
            # running the test function for real here. If however we encounter
            # some novel behaviour, we try again with the real test function,
            # starting from the new novel prefix that it has discovered.
            try:
                trial_data = self.new_conjecture_data(
                    prefix=prefix, max_length=max_length
                )
                self.tree.simulate_test_function(trial_data)
                continue
            except PreviouslyUnseenBehaviour:
                pass

            # If the simulation entered part of the tree that has been killed,
            # we don't want to run this.
            if trial_data.observer.killed:
                continue

            # We might have hit the cap on number of examples we should
            # run when calculating the minimal example.
            if not should_generate_more():
                break

            prefix = trial_data.buffer
        else:
            max_length = BUFFER_SIZE

        data = self.new_conjecture_data(prefix=prefix, max_length=max_length)

        self.test_function(data)

        # A thing that is often useful but rarely happens by accident is
        # to generate the same value at multiple different points in the
        # test case.
        #
        # Rather than make this the responsibility of individual strategies
        # we implement a small mutator that just takes parts of the test
        # case with the same label and tries replacing one of them with a
        # copy of the other and tries running it. If we've made a good
        # guess about what to put where, this will run a similar generated
        # test case with more duplication.
        if (
            # An OVERRUN doesn't have enough information about the test
            # case to mutate, so we just skip those.
            data.status >= Status.INVALID
            # This has a tendency to trigger some weird edge cases during
            # generation so we don't let it run until we're done with the
            # health checks.
            and self.health_check_state is None
        ):
            initial_calls = self.call_count
            failed_mutations = 0
            while (
                should_generate_more()
                # We implement fairly conservative checks for how long we
                # should run mutation for, as it's generally not obvious
                # how helpful it is for any given test case.
                and self.call_count <= initial_calls + 5
                and failed_mutations <= 5
            ):
                groups = defaultdict(list)
                for ex in data.examples:
                    groups[ex.label, ex.depth].append(ex)

                groups = [v for v in groups.values() if len(v) > 1]

                if not groups:
                    break

                group = self.random.choice(groups)

                ex1, ex2 = sorted(
                    self.random.sample(group, 2), key=lambda i: i.index
                )
                assert ex1.end <= ex2.start

                replacements = [data.buffer[e.start : e.end] for e in [ex1, ex2]]

                replacement = self.random.choice(replacements)

                try:
                    # We attempt to replace both of the examples with
                    # whichever choice we made. Note that this might end
                    # up messing up and getting the example boundaries
                    # wrong - matching labels are only a best guess as to
                    # whether the two are equivalent - but it doesn't
                    # really matter. It may not achieve the desired result,
                    # but it's still a perfectly acceptable choice sequence
                    # to try.
                    new_data = self.cached_test_function(
                        data.buffer[: ex1.start]
                        + replacement
                        + data.buffer[ex1.end : ex2.start]
                        + replacement
                        + data.buffer[ex2.end :],
                        # We set error_on_discard so that we don't end up
                        # entering parts of the tree we consider redundant
                        # and not worth exploring.
                        error_on_discard=True,
                        extend=BUFFER_SIZE,
                    )
                except ContainsDiscard:
                    failed_mutations += 1
                    continue

                if (
                    new_data.status >= data.status
                    and data.buffer != new_data.buffer
                    and all(
                        k in new_data.target_observations
                        and new_data.target_observations[k] >= v
                        for k, v in data.target_observations.items()
                    )
                ):
                    data = new_data
                    failed_mutations = 0
                else:
                    failed_mutations += 1
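# A standalone sketch of the splice performed by the mutator above, on a
# plain byte string rather than real ConjectureData examples. The helper name,
# spans, and buffer here are invented for illustration; only the slicing
# arithmetic mirrors the engine code.


def splice_duplicate(buffer, span1, span2, replacement):
    """Write `replacement` into both (start, end) spans of `buffer`.

    Assumes span1 ends at or before span2 starts, matching the
    `ex1.end <= ex2.start` assertion in the mutator.
    """
    (s1, e1), (s2, e2) = span1, span2
    assert e1 <= s2
    return buffer[:s1] + replacement + buffer[e1:s2] + replacement + buffer[e2:]


# e.g. duplicating the first span's bytes over the second span as well:
assert splice_duplicate(b"ab-cd-ef", (0, 2), (3, 5), b"ab") == b"ab-ab-ef"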
def generate_new_examples(self):
    if Phase.generate not in self.settings.phases:
        return

    if self.interesting_examples:
        # The example database has failing examples from a previous run,
        # so we'd rather report that they're still failing ASAP than take
        # the time to look for additional failures.
        return

    zero_data = self.cached_test_function(hbytes(BUFFER_SIZE))
    if zero_data.status > Status.OVERRUN:
        self.__data_cache.pin(zero_data.buffer)

    self.optimise_all(zero_data)

    if zero_data.status == Status.OVERRUN or (
        zero_data.status == Status.VALID and len(zero_data.buffer) * 2 > BUFFER_SIZE
    ):
        fail_health_check(
            self.settings,
            "The smallest natural example for your test is extremely "
            "large. This makes it difficult for Hypothesis to generate "
            "good examples, especially when trying to reduce failing ones "
            "at the end. Consider reducing the size of your data if it is "
            "of a fixed size. You could also fix this by improving how "
            "your data shrinks (see https://hypothesis.readthedocs.io/en/"
            "latest/data.html#shrinking for details), or by introducing "
            "default values inside your strategy. e.g. could you replace "
            "some arguments with their defaults by using "
            "one_of(none(), some_complex_strategy)?",
            HealthCheck.large_base_example,
        )

    self.health_check_state = HealthCheckState()

    def should_generate_more():
        # If we haven't found a bug, keep looking. We check this before
        # doing anything else as it's by far the most common case.
        if not self.interesting_examples:
            return True
        # If we've found a bug and won't report more than one, stop looking.
        elif not self.settings.report_multiple_bugs:
            return False

        assert self.first_bug_found_at <= self.last_bug_found_at <= self.call_count

        # End the generation phase where we would have ended it if no bugs had
        # been found. This reproduces the exit logic in `self.test_function`,
        # but with the important distinction that this clause will move on to
        # the shrinking phase having found one or more bugs, while the other
        # will exit having found zero bugs.
        if (
            self.valid_examples >= self.settings.max_examples
            or self.call_count >= max(self.settings.max_examples * 10, 1000)
        ):  # pragma: no cover
            return False

        # Otherwise, keep searching for between ten and 'a heuristic' calls.
        # We cap 'calls after first bug' so errors are reported reasonably
        # soon even for tests that are allowed to run for a very long time,
        # or sooner if the latest half of our test effort has been fruitless.
        return self.call_count < MIN_TEST_CALLS or self.call_count < min(
            self.first_bug_found_at + 1000, self.last_bug_found_at * 2
        )

    # GenerationParameters are a set of decisions we make that are global
    # to the whole test case, used to bias the data generation in various
    # ways. This is an approach very very loosely inspired by the paper
    # "Swarm testing." by Groce et al. in that it induces deliberate
    # correlation between otherwise independent decisions made during the
    # generation process.
    #
    # More importantly the generation is designed to make certain scenarios
    # more likely (e.g. small examples, duplicated values), which can help
    # or hurt in terms of finding interesting things. Whenever the result
    # of our generation is a bad test case, for whatever definition of
    # "bad" we like (currently, invalid or too large), we ditch the
    # parameter early. This allows us to potentially generate good test
    # cases significantly more often than we otherwise would, by selecting
    # for parameters that make them more likely.
    parameter = GenerationParameters(self.random)
    count = 0

    # We attempt to use the size of the minimal generated test case starting
    # from a given novel prefix as a guideline to generate smaller test
    # cases for an initial period, by restricting ourselves to test cases
    # that are not much larger than it.
    #
    # Calculating the actual minimal generated test case is hard, so we
    # take a best guess that zero extending a prefix produces the minimal
    # test case starting with that prefix (this is true for our built in
    # strategies). This is only a reasonable thing to do if the resulting
    # test case is valid. If we regularly run into situations where it is
    # not valid then this strategy is a waste of time, so we want to
    # abandon it early. In order to do this we track how many times in a
    # row it has failed to work, and abort small test case generation when
    # it has failed too many times in a row.
    consecutive_zero_extend_is_invalid = 0

    while should_generate_more():
        prefix = self.generate_novel_prefix()
        assert len(prefix) <= BUFFER_SIZE

        # We control growth during initial example generation, for two
        # reasons:
        #
        # * It gives us an opportunity to find small examples early, which
        #   gives us a fast path for easy to find bugs.
        # * It avoids low probability events where we might end up
        #   generating very large examples during health checks, which
        #   on slower machines can trigger HealthCheck.too_slow.
        #
        # The heuristic we use is that we attempt to estimate the smallest
        # extension of this prefix, and limit the size to no more than
        # an order of magnitude larger than that. If we fail to estimate
        # the size accurately, we skip over this prefix and try again.
        #
        # We need to tune the example size based on the initial prefix,
        # because any fixed size might be too small, and any size based
        # on the strategy in general can fall afoul of strategies that
        # have very different sizes for different prefixes.
        small_example_cap = clamp(10, self.settings.max_examples // 10, 50)
        if (
            self.valid_examples <= small_example_cap
            and self.call_count <= 5 * small_example_cap
            and not self.interesting_examples
            and consecutive_zero_extend_is_invalid < 5
        ):
            minimal_example = self.cached_test_function(
                prefix + hbytes(BUFFER_SIZE - len(prefix))
            )

            if minimal_example.status < Status.VALID:
                consecutive_zero_extend_is_invalid += 1
                continue

            consecutive_zero_extend_is_invalid = 0

            minimal_extension = len(minimal_example.buffer) - len(prefix)

            max_length = min(len(prefix) + minimal_extension * 10, BUFFER_SIZE)

            # We could end up in a situation where even though the prefix was
            # novel when we generated it, because we've now tried zero extending
            # it not all possible continuations of it will be novel. In order to
            # avoid making redundant test calls, we rerun it in simulation mode
            # first. If this has a predictable result, then we don't bother
            # running the test function for real here. If however we encounter
            # some novel behaviour, we try again with the real test function,
            # starting from the new novel prefix that it has discovered.
            try:
                trial_data = self.new_conjecture_data(
                    prefix=prefix, parameter=parameter, max_length=max_length
                )
                self.tree.simulate_test_function(trial_data)
                continue
            except PreviouslyUnseenBehaviour:
                pass

            # If the simulation entered part of the tree that has been killed,
            # we don't want to run this.
            if trial_data.observer.killed:
                continue

            # We might have hit the cap on number of examples we should
            # run when calculating the minimal example.
            if not should_generate_more():
                break

            prefix = trial_data.buffer
        else:
            max_length = BUFFER_SIZE

        data = self.new_conjecture_data(
            prefix=prefix, parameter=parameter, max_length=max_length
        )

        self.test_function(data)

        self.optimise_all(data)

        count += 1

        if (
            data.status < Status.VALID
            or len(data.buffer) * 2 >= BUFFER_SIZE
            or count > 5
        ):
            count = 0
            parameter = GenerationParameters(self.random)
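# A small worked example of the 'calls after first bug' cap used by
# should_generate_more in both versions above. The numbers are invented for
# illustration, and MIN_TEST_CALLS is assumed to be 10 here, as the "between
# ten and 'a heuristic' calls" comment suggests.

MIN_TEST_CALLS = 10  # assumed value for this sketch only


def keep_generating(call_count, first_bug_found_at, last_bug_found_at):
    """Mirror the final return of should_generate_more once a bug is known."""
    return call_count < MIN_TEST_CALLS or call_count < min(
        first_bug_found_at + 1000, last_bug_found_at * 2
    )


# First bug at call 50, most recent bug at call 300: generation continues
# until call 600 (= 300 * 2), well before the 1050 (= 50 + 1000) cap.
assert keep_generating(599, first_bug_found_at=50, last_bug_found_at=300)
assert not keep_generating(600, first_bug_found_at=50, last_bug_found_at=300)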
def generate_new_examples(self):
    if Phase.generate not in self.settings.phases:
        return

    if self.interesting_examples:
        # The example database has failing examples from a previous run,
        # so we'd rather report that they're still failing ASAP than take
        # the time to look for additional failures.
        return

    self.debug("Generating new examples")

    assert self.should_generate_more()
    zero_data = self.cached_test_function(bytes(BUFFER_SIZE))
    if zero_data.status > Status.OVERRUN:
        self.__data_cache.pin(zero_data.buffer)

    if zero_data.status == Status.OVERRUN or (
        zero_data.status == Status.VALID and len(zero_data.buffer) * 2 > BUFFER_SIZE
    ):
        fail_health_check(
            self.settings,
            "The smallest natural example for your test is extremely "
            "large. This makes it difficult for Hypothesis to generate "
            "good examples, especially when trying to reduce failing ones "
            "at the end. Consider reducing the size of your data if it is "
            "of a fixed size. You could also fix this by improving how "
            "your data shrinks (see https://hypothesis.readthedocs.io/en/"
            "latest/data.html#shrinking for details), or by introducing "
            "default values inside your strategy. e.g. could you replace "
            "some arguments with their defaults by using "
            "one_of(none(), some_complex_strategy)?",
            HealthCheck.large_base_example,
        )

    self.health_check_state = HealthCheckState()

    # We attempt to use the size of the minimal generated test case starting
    # from a given novel prefix as a guideline to generate smaller test
    # cases for an initial period, by restricting ourselves to test cases
    # that are not much larger than it.
    #
    # Calculating the actual minimal generated test case is hard, so we
    # take a best guess that zero extending a prefix produces the minimal
    # test case starting with that prefix (this is true for our built in
    # strategies). This is only a reasonable thing to do if the resulting
    # test case is valid. If we regularly run into situations where it is
    # not valid then this strategy is a waste of time, so we want to
    # abandon it early. In order to do this we track how many times in a
    # row it has failed to work, and abort small test case generation when
    # it has failed too many times in a row.
    consecutive_zero_extend_is_invalid = 0

    # We control growth during initial example generation, for two
    # reasons:
    #
    # * It gives us an opportunity to find small examples early, which
    #   gives us a fast path for easy to find bugs.
    # * It avoids low probability events where we might end up
    #   generating very large examples during health checks, which
    #   on slower machines can trigger HealthCheck.too_slow.
    #
    # The heuristic we use is that we attempt to estimate the smallest
    # extension of this prefix, and limit the size to no more than
    # an order of magnitude larger than that. If we fail to estimate
    # the size accurately, we skip over this prefix and try again.
    #
    # We need to tune the example size based on the initial prefix,
    # because any fixed size might be too small, and any size based
    # on the strategy in general can fall afoul of strategies that
    # have very different sizes for different prefixes.
    small_example_cap = clamp(10, self.settings.max_examples // 10, 50)

    optimise_at = max(self.settings.max_examples // 2, small_example_cap + 1)
    ran_optimisations = False

    while self.should_generate_more():
        prefix = self.generate_novel_prefix()
        assert len(prefix) <= BUFFER_SIZE

        if (
            self.valid_examples <= small_example_cap
            and self.call_count <= 5 * small_example_cap
            and not self.interesting_examples
            and consecutive_zero_extend_is_invalid < 5
        ):
            minimal_example = self.cached_test_function(
                prefix + bytes(BUFFER_SIZE - len(prefix))
            )

            if minimal_example.status < Status.VALID:
                consecutive_zero_extend_is_invalid += 1
                continue

            consecutive_zero_extend_is_invalid = 0

            minimal_extension = len(minimal_example.buffer) - len(prefix)

            max_length = min(len(prefix) + minimal_extension * 10, BUFFER_SIZE)

            # We could end up in a situation where even though the prefix was
            # novel when we generated it, because we've now tried zero extending
            # it not all possible continuations of it will be novel. In order to
            # avoid making redundant test calls, we rerun it in simulation mode
            # first. If this has a predictable result, then we don't bother
            # running the test function for real here. If however we encounter
            # some novel behaviour, we try again with the real test function,
            # starting from the new novel prefix that it has discovered.
            try:
                trial_data = self.new_conjecture_data(
                    prefix=prefix, max_length=max_length
                )
                self.tree.simulate_test_function(trial_data)
                continue
            except PreviouslyUnseenBehaviour:
                pass

            # If the simulation entered part of the tree that has been killed,
            # we don't want to run this.
            if trial_data.observer.killed:
                continue

            # We might have hit the cap on number of examples we should
            # run when calculating the minimal example.
            if not self.should_generate_more():
                break

            prefix = trial_data.buffer
        else:
            max_length = BUFFER_SIZE

        data = self.new_conjecture_data(prefix=prefix, max_length=max_length)

        self.test_function(data)

        self.generate_mutations_from(data)

        # Although the optimisations are logically a distinct phase, we
        # actually normally run them as part of example generation. The
        # reason for this is that we cannot guarantee that optimisation
        # actually exhausts our budget: It might finish running and we
        # discover that actually we still could run a bunch more test cases
        # if we want.
        if (
            self.valid_examples >= max(small_example_cap, optimise_at)
            and not ran_optimisations
        ):
            ran_optimisations = True
            self.optimise_targets()
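# A quick illustration of the caps computed above for a few representative
# max_examples settings (values chosen for illustration only; clamp is assumed
# to behave as in the sketch after test_clamp):
#
#   max_examples = 100   -> small_example_cap = clamp(10, 10, 50)  = 10,  optimise_at = 50
#   max_examples = 1000  -> small_example_cap = clamp(10, 100, 50) = 50,  optimise_at = 500
#   max_examples = 20    -> small_example_cap = clamp(10, 2, 50)   = 10,  optimise_at = 11
#
# So target optimisation kicks in after roughly half the example budget has
# produced valid test cases, but never before the small-example period ends.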