예제 #1
0
 def choose(self, l):
     """
     Pick a 1-based index into the weight list l at random.

     Given the list l = [l_1, l_2, ..., l_m], return an index i between 1 and m with probability
     l_i / sum(l).

     l must be non-empty, must not contain negative weights and must have at least one non-zero weight;
     an AssertionError is raised otherwise.
     """
     # `l is not []` would compare identity against a brand-new list and so always be true; check the length instead.
     assert len(l) > 0, "Cannot choose from an empty list of weights."
     # We do not expect negative weights - they should be counts.
     assert all(w >= 0 for w in l), "Negative weights are not allowed - they should be counts."
     # If they are all 0, we can't make a probability distribution.
     assert any(w != 0 for w in l), "All-zero weights cannot form a probability distribution."

     probs = cfghelpers.to_prob_dist(l)
     # Indices are 1-based, so the candidates are 1..m rather than 0..m-1.
     indices = _inclusive_range(1, len(l))
     return self._random.choice(indices, p=probs)
    def _generate_form_skeleton(self):
        """
        Build and return a randomly generated FormSkeleton for the HTML form.

        The skeleton records how many fields there are and, for each one, its type, its name and
        (for types listed in FormField.FORM_FIELD_TYPES_WITH_OPTIONS) a generated set of options.
        """
        # A plain random selection of field types is sufficient for now, so no CFG is involved. A CFG could
        # become worthwhile if more sophisticated interfaces (tabbed, multi-page or hierarchical forms) are added.

        # Draw the number of fields from the configured bounds.
        lower_bound = self.config["form_skeleton_min_fields"]
        upper_bound = self.config["form_skeleton_max_fields"]
        field_count = self._random.randint(lower_bound, upper_bound)

        # Every known field type is a candidate; a type with no configured weight gets weight 1.
        candidate_types = FormField.FORM_FIELD_TYPES + FormField.FORM_FIELD_TEMPLATE_TYPES
        type_weights = [
            self.config["form_skeleton_type_weights"].get(candidate, 1)
            for candidate in candidate_types
        ]
        type_probs = cfghelpers.to_prob_dist(type_weights)

        skeleton_fields = []
        for position in range(field_count):
            # Pick this field's type according to the weights.
            chosen_type = self._np_random.choice(candidate_types, p=type_probs)

            # Field names are numbered starting from 1.
            name = "input-{}".format(position + 1)

            # Only some field types carry options; the rest get None.
            if chosen_type in FormField.FORM_FIELD_TYPES_WITH_OPTIONS:
                choices = self._generate_options()
            else:
                choices = None

            skeleton_fields.append(FormField(chosen_type, name, choices))

        return FormSkeleton(skeleton_fields)
예제 #3
0
 def _build_random_valid_distribution(self, start_idx, mean, std_dev):
     """
     A helper method for _choose_random_valid_length which builds the distribution opn lengths to sample from.
     Returns a pair of lists (ls, ps) of the lengths [just _inclusive_range(min_length, max_length)] and their
     probabilities.
     """
     assert mean > 0
     assert std_dev >= 0
     
     # Build a discrete distribution which approximates a normal distribution with the given mean and std_dev, but
     # which also has a limited range and removes those values where we cannot generate a string of that length.
     # This can be done by using the normal distribution to find the probability that the random length would fall
     # into each discrete bucket (ignoring values which are outside the allowed range) and removing the values at
     # invalid lengths. The remaining weights can be re-scaled to give a finite, discrete probability distribution
     # which we can sample with numpy.random.choice (strictly self._random.choice).
     
     # Work out the allowed length range to consider.
     min_length = max(min(int(round(mean - 2*std_dev)), mean - 1), 0)
     max_length = max(int(round(mean + 2*std_dev)), mean + 1)
     allowed_lengths = _inclusive_range(min_length, max_length)
     
     # Check which lengths in this range have some possible productions.
     has_productions = [l for l in allowed_lengths if sum(self.f(start_idx, l)) > 0]
     if len(has_productions) == 0:
         raise CFGSampler.GenerationError("This CFG cannot generate any strings of lengths {}..{}.".format(min_length, max_length))
     
     # Get the probabilities for each bucket from a normal distribution CDF.
     normal_probabilities = [scipy.stats.norm.cdf(x+0.5, loc=mean, scale=std_dev) - scipy.stats.norm.cdf(x-0.5, loc=mean, scale=std_dev) for x in allowed_lengths]
     
     # Remove those which we cannot generate a valid string for
     filtered_weights = [p if x in has_productions else 0 for x, p in zip(allowed_lengths, normal_probabilities)]
     
     # Re-scale to a probability distribution.
     result_dist = cfghelpers.to_prob_dist(filtered_weights)
     
     return allowed_lengths, result_dist
 def test_to_prob_dist_all_zero(self):
     # Weights that are all zero cannot be normalised, so the helper's assertion must fire.
     self.assertRaises(AssertionError, cfghelpers.to_prob_dist, [0, 0, 0])
 def test_to_prob_dist_some_zero(self):
     # Zero weights are fine as long as some weight is positive; they simply map to probability 0.
     weights = [1, 0, 5, 3, 0]
     total = 9.0  # sum of the weights above
     expected = [w / total for w in weights]
     self.assertEqual(cfghelpers.to_prob_dist(weights), expected)
 def test_to_prob_dist_empty(self):
     # An empty weight list has nothing to normalise, so the helper's assertion must fire.
     self.assertRaises(AssertionError, cfghelpers.to_prob_dist, [])
 def test_to_prob_dist_single(self):
     # A lone weight takes the whole probability mass, whatever its magnitude.
     actual = cfghelpers.to_prob_dist([5])
     self.assertEqual(actual, [1.0])
 def test_to_prob_dist_typical(self):
     # Each weight is divided by the total of all weights.
     weights = [1, 3, 5, 3, 1]
     total = 13.0  # sum of the weights above
     expected = [w / total for w in weights]
     self.assertEqual(cfghelpers.to_prob_dist(weights), expected)