def choose(self, l):
    """
    Pick a 1-based index into l at random, weighted by the entries of l.

    Given the list l = [l_1, l_2, ..., l_m], return an index i between 1 and m
    at random with probability l_i / sum(l).

    :param l: non-empty sequence of non-negative weights (they should be counts),
        at least one of which is non-zero.
    :return: the value produced by self._random.choice over the indices 1..len(l).
    :raises AssertionError: if l is empty, contains a negative weight, or is all zeros.
    """
    # An identity test against a literal (`l is not []`) is always true because the
    # literal builds a brand-new list, so emptiness has to be checked by length.
    assert len(l) > 0, "Cannot choose from an empty list of weights."
    # We do not expect negative weights - they should be counts.
    assert all(w >= 0 for w in l)
    # If they are all 0, we can't make a probability distribution.
    assert not all(w == 0 for w in l)

    probs = cfghelpers.to_prob_dist(l)
    indices = _inclusive_range(1, len(l))
    return self._random.choice(indices, p=probs)
def _generate_form_skeleton(self):
    """
    Build a random skeleton for the HTML form.

    The skeleton fixes how many fields the form has and, for each field, its
    type, its name ("input-1", "input-2", ...) and - for the types listed in
    FormField.FORM_FIELD_TYPES_WITH_OPTIONS - a generated set of options.

    :return: a FormSkeleton wrapping the list of generated FormField objects.
    """
    # A plain weighted choice over field types is enough for now. A CFG would only
    # start to pay off for richer layouts (tabbed, multi-page, hierarchical forms).

    def weight_of(kind):
        # Types without a configured weight default to a weight of 1.
        return self.config["form_skeleton_type_weights"].get(kind, 1)

    # How many fields the form will have, drawn between the configured bounds.
    field_count = self._random.randint(
        self.config["form_skeleton_min_fields"],
        self.config["form_skeleton_max_fields"])

    # Every type we may pick from, and the probability of picking each one.
    candidate_types = FormField.FORM_FIELD_TYPES + FormField.FORM_FIELD_TEMPLATE_TYPES
    type_probs = cfghelpers.to_prob_dist([weight_of(kind) for kind in candidate_types])

    skeleton_fields = []
    for position in range(1, field_count + 1):
        # Weighted draw of the field type.
        chosen_type = self._np_random.choice(candidate_types, p=type_probs)

        # Names are simply numbered from 1.
        name = "input-{}".format(position)

        # Only some field types carry a list of options.
        if chosen_type in FormField.FORM_FIELD_TYPES_WITH_OPTIONS:
            choices = self._generate_options()
        else:
            choices = None

        skeleton_fields.append(FormField(chosen_type, name, choices))

    return FormSkeleton(skeleton_fields)
def _build_random_valid_distribution(self, start_idx, mean, std_dev):
    """
    A helper method for _choose_random_valid_length which builds the distribution of lengths to sample from.

    Builds a discrete distribution which approximates a normal distribution with the given mean and std_dev, but
    which has a limited range and gives zero weight to lengths for which no string can be generated. Each
    candidate length x gets the normal probability mass of the bucket [x - 0.5, x + 0.5]; weights at invalid
    lengths are zeroed and the remainder is re-scaled by cfghelpers.to_prob_dist.

    :param start_idx: passed through to self.f to look up the productions available at each length.
    :param mean: the target length; must be > 0.
    :param std_dev: the spread of lengths; must be >= 0. A value of 0 puts all the weight on the bucket
        containing mean.
    :return: a pair of lists (ls, ps) of the lengths [just _inclusive_range(min_length, max_length)] and their
        probabilities.
    :raises AssertionError: if mean <= 0 or std_dev < 0.
    :raises CFGSampler.GenerationError: if no length in the considered range has any productions.
    """
    assert mean > 0
    assert std_dev >= 0

    # Work out the allowed length range to consider: two standard deviations either side of the mean, but always
    # at least one either side of the mean, and never below zero.
    min_length = max(min(int(round(mean - 2*std_dev)), mean - 1), 0)
    max_length = max(int(round(mean + 2*std_dev)), mean + 1)
    allowed_lengths = _inclusive_range(min_length, max_length)

    # Check which lengths in this range have some possible productions. A set keeps the membership tests below
    # constant-time instead of scanning a list once per length.
    has_productions = {l for l in allowed_lengths if sum(self.f(start_idx, l)) > 0}
    if not has_productions:
        raise CFGSampler.GenerationError("This CFG cannot generate any strings of lengths {}..{}.".format(min_length, max_length))

    # Get the probabilities for each bucket.
    if std_dev == 0:
        # scipy returns NaN for scale=0, so handle the degenerate case directly: the distribution is a point
        # mass at mean, and only the bucket containing mean gets any weight.
        normal_probabilities = [1.0 if x - 0.5 < mean <= x + 0.5 else 0.0 for x in allowed_lengths]
    else:
        # Probability mass of each bucket from the normal distribution CDF.
        normal_probabilities = [
            scipy.stats.norm.cdf(x + 0.5, loc=mean, scale=std_dev)
            - scipy.stats.norm.cdf(x - 0.5, loc=mean, scale=std_dev)
            for x in allowed_lengths
        ]

    # Remove those which we cannot generate a valid string for.
    filtered_weights = [p if x in has_productions else 0 for x, p in zip(allowed_lengths, normal_probabilities)]

    # Re-scale to a probability distribution.
    result_dist = cfghelpers.to_prob_dist(filtered_weights)
    return allowed_lengths, result_dist
def test_to_prob_dist_all_zero(self):
    """Weights that are all zero must make to_prob_dist fail an assertion."""
    self.assertRaises(AssertionError, cfghelpers.to_prob_dist, [0, 0, 0])
def test_to_prob_dist_some_zero(self):
    """Zero weights are kept in place and map to probability 0; the rest are divided by the total (9)."""
    weights = [1, 0, 5, 3, 0]
    expected = [w / 9.0 for w in weights]
    self.assertEqual(cfghelpers.to_prob_dist(weights), expected)
def test_to_prob_dist_empty(self):
    """An empty weight list must make to_prob_dist fail an assertion."""
    self.assertRaises(AssertionError, cfghelpers.to_prob_dist, [])
def test_to_prob_dist_single(self):
    """A single weight normalises to a probability of exactly 1."""
    result = cfghelpers.to_prob_dist([5])
    self.assertEqual(result, [1.0])
def test_to_prob_dist_typical(self):
    """Each weight in a typical list is divided by the total (13)."""
    weights = [1, 3, 5, 3, 1]
    expected = [w / 13.0 for w in weights]
    self.assertEqual(cfghelpers.to_prob_dist(weights), expected)