def make_distribution_holder(options, logger, pums_dict, names_dict,
                             zipcode_dict, address_dict, text_engine):
    """
    Take learned  distributions of demographic data, add the 'fixed' 
    (not-learned) distributions, and embed everything into a 
    DistributionHolder object.
    """

    dist_dict = {}
    # Tells the data-generator the order in which to call these distributions
    # Note: Age and fingerprint are special cases, not included here.

    var_names = [sv.VARS.to_string(x) for x in sv.VAR_GENERATION_ORDER]

    dist_dict.update(pums_dict)

    dist_dict.update(names_dict)

    dist_dict.update(zipcode_dict)

    dist_dict.update(address_dict)

    text_dist_dict = make_text_distributions(text_engine)

    dist_dict.update(text_dist_dict)

    dist_dict[sv.VARS.SSN] = bespoke_distributions.SSNDistribution()

    # Note: we used to try to generate fingerprints in Python.
    # We're not doing that at the moment, but we're leaving this here
    # in case we do so again.
    #
    dist_dict[sv.VARS.FINGERPRINT] = \
        bespoke_distributions.FingerprintDistribution()

    dist_dict[sv.VARS.DOB] = \
        bespoke_distributions.DOBDistribution(dist_dict[sv.VARS.AGE])

    dist_dict[sv.VARS.LAST_UPDATED] = \
        bespoke_distributions.LastUpdatedDistribution(dist_dict[sv.VARS.DOB])

    dist_dict[sv.VARS.FOO] = \
        bespoke_distributions.FooDistribution()

    dist_dict[sv.VARS.XML] = \
        make_xml_distribution(pums_dict, names_dict)

    dh = distribution_holder.DistributionHolder(sv.VAR_GENERATION_ORDER,
                                                var_names, dist_dict)

    return dh
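

# Illustrative sketch, not called anywhere: this is an assumption about how a
# consumer of the DistributionHolder uses sv.VAR_GENERATION_ORDER, namely by
# walking the variables in that order and drawing a value from each
# distribution. The no-argument generate() call is modeled on
# FooDistribution.generate() in the tests below; some distributions (e.g.
# DOB, which is built from AGE) may well need arguments in practice.
def _example_generation_sketch(dist_dict):
    record = {}
    for var in sv.VAR_GENERATION_ORDER:
        record[var] = dist_dict[var].generate()
    return record
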
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
        # Set up initialization values
        sub_cat = 'foo-range'
        self._foo_dist = bespoke_distributions.FooDistribution()
        fields = [sv.sql_name_to_enum('foo')]
        dists = [self._foo_dist]
        other_fields = ['no_queries', 'r_lower', 'r_upper', 'r_exp_lower',
                        'r_exp_upper', 'type']
        other_cols = [[2, 1, 100, 21, 21, 'range'],
                      [2, 1, 100, 32, 32, 'range'],
                      [2, 1, 200, 21, 21, 'greater'],
                      [2, 1, 200, 25, 25, 'greater']]
        self.generator = frqg.FooRangeQueryGenerator('P2', sub_cat, ["LL"],
                                                     dists, fields, 50000, 100,
                                                     other_fields, other_cols)
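
        # Note: each row of other_cols supplies one value per entry of
        # other_fields, in order. For example, the first row
        # [2, 1, 100, 21, 21, 'range'] presumably describes 2 queries of
        # type 'range' over r_lower = 1, r_upper = 100, with
        # r_exp_lower = r_exp_upper = 21.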
    def test_generated_greater_than(self):
        '''
        Greater than: tests, at a much smaller scale (10^4 generated values
        of foo), whether generate_greater_than can produce ranges for all
        sane combinations of record set sizes.

        It does this by generating the range and counting how many of the
        generated foos fall in the range. It then compares that count to
        the desired record set size to see whether the two are
        'close enough', meaning within a factor of 10.
        '''
        db_size = 10**4
        RECORD_SET_SIZES = [10, 500, 1000]

        def close_enough(actual_density, desired_density):
            'checks whether the two densities are within a factor of ten of each other'
            density_ratio = actual_density / desired_density
            upper_bound = 10
            lower_bound = 0.1
            return ((density_ratio >= lower_bound)
                    and (density_ratio <= upper_bound))
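        # For example, with record_set_size = 10: a count_between of 40 gives a
        # density ratio of 4.0 and is accepted; a count of 1000 gives a ratio
        # of 100 and is rejected.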

        #generate a bunch of foos to test
        foo_dist = bespoke_distributions.FooDistribution()
        foos = []
        for _ in xrange(db_size):
            x = foo_dist.generate()
            foos.append(x)

        #keep track of what combos work and what don't
        failed_combos = []
        passed_combos = []

        for record_set_size in RECORD_SET_SIZES:
            spar_random.seed(int(time.time() + 1))
            try:
                minim = foo_dist.generate_greater_than(record_set_size, db_size)
            except bespoke_distributions.FooInputs:
                #foo distribution says that it is impossible to generate
                #a range for that record size, let's add it to failed combos
                failed_combos.append((db_size, record_set_size, 0, 0, 0))
                continue
            # Check whether what was generated actually works, with one
            # exception: if count_between is zero for a record set size of 10,
            # we still treat it as a success so the unit test stays stable.
            range_size = minim.bit_length()
            count_between = len([foo for foo in foos if foo >= minim])
            if close_enough(count_between, record_set_size) or \
               (record_set_size == 10 and count_between == 0):
                passed_combos.append((db_size, record_set_size, range_size,
                                      minim, count_between))
            else:
                failed_combos.append((db_size, record_set_size, range_size,
                                      minim, count_between))
        # Create the failure message, if needed
        fail_msg = ''
        if len(failed_combos) != 0:
            (db, rss, rs, _, cb) = failed_combos[0]
            fail_msg = "Runs into the law of small numbers; if this fails more"\
                       " than 1 time in 20 then things have gotten fishy. "\
                       "The generated ranges did not support all of the desired "\
                       "combinations. For example, database size: %d, record set "\
                       "size: %d, and range size: %d had %d records returned."\
                       " There are %d other unmatched sets."\
                       % (db, rss, rs, cb, len(failed_combos) - 1)
        # Allow 0 failures: we only generate a few of these in practice,
        # and they need to match most of the time.
        for x in failed_combos:
            print "failed: ", x
        for x in passed_combos:
            print "passed: ", x
        self.assertTrue(len(failed_combos) == 0, fail_msg)
    def test_generated_pdf(self):
        '''
        Tests, at a much smaller scale (10^4 generated values of foo),
        whether generate_pdf can generate values for equality queries with
        specific record set sizes.

        It does this by generating a value and counting how many of the
        generated foos are equal to that value, then comparing that count to
        the desired record set size to see how many matched.
        '''
        db_size = 10**4
        RECORD_SET_SIZES = [(1, 10), (11, 100)]

        def close_enough(actual_density, desired_density):
            'will indicate if the densities are within a factor of 2'
            density_ratio = actual_density / desired_density
            upper_bound = 2
            lower_bound = 0.5
            return ((density_ratio >= lower_bound)
                    and (density_ratio <= upper_bound))
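        # Note: close_enough is not actually used below; this test compares the
        # raw count directly against the [minim, maxim] bounds.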

        #generate a bunch of foos to test against
        foo_dist = bespoke_distributions.FooDistribution()
        foos = []
        for _ in xrange(db_size):
            x = foo_dist.generate()
            foos.append(x)
        #keep track of what combinations have failed and what have passed
        failed_combos = []
        passed_combos = []
        for (minim, maxim) in RECORD_SET_SIZES:
            for _ in xrange(10):
                try:
                    value = foo_dist.generate_pdf(minim / 10**4, maxim / 10**4)
                except bespoke_distributions.FooInputs:
                    # According to the foo distribution no value is possible
                    # for that combination; add it to the failed combinations
                    failed_combos.append((minim, maxim, 0, 0))
                    continue
                # Count how many generated foos equal the value; if the count
                # falls outside [minim, maxim] (and is nonzero), add it to the
                # failed combos
                count = len([foo for foo in foos if foo == value])
                if (count < minim or count > maxim) and count != 0:
                    failed_combos.append((minim, maxim, value, count))
                else:
                    passed_combos.append((minim, maxim, value, count))
        # Create the failure message, if needed
        fail_msg = ''
        if len(failed_combos) != 0:
            (minim, maxim, _, _) = failed_combos[0]
            fail_msg = "Runs into the law of small numbers; if this fails more"\
                       " than 1 time in 20 then things have gotten fishy. "\
                       "The generated values did not support all of the desired "\
                       "combinations. For example, min_result_set: %d, and "\
                       "max_result_set: %d" % (minim, maxim)
        # Allow 0 failures: we generate a lot of these in practice,
        # and they just need to match most of the time.
        for x in failed_combos:
            print "failed: ", x
        for x in passed_combos:
            print "passed: ", x
        self.assertTrue(len(failed_combos) == 0, fail_msg)
    def test_generated_two_sided(self):
        '''
        Tests, at a much smaller scale (10^4 generated values of foo),
        whether generate_two_sided can generate ranges for all sane
        combinations of record set sizes and range sizes. From experience,
        ranges go up to 2^40, and the largest record set size must be a
        factor of 10^2 smaller in order to consistently generate across
        all ranges.

        It does this by generating the range and counting how many of the
        generated foos fall in the range. It then compares the measured
        density to the desired density to see whether the two are
        'close enough', meaning within a factor of 10.
        '''
        db_size = 10**4
        RECORD_SET_SIZES = [5, 10, 50, 100]
        RANGE_SIZES = [2**(x + 1) for x in xrange(0, 50, 10)]

        def close_enough(actual_density, desired_density):
            'will indicate if the densities are within a factor of ten'
            density_ratio = actual_density / desired_density
            upper_bound = 10
            lower_bound = .1
            return ((density_ratio >= lower_bound)
                    and (density_ratio <= upper_bound))

        #generate a bunch of foos to test against
        foo_dist = bespoke_distributions.FooDistribution()
        foos = []
        for _ in xrange(db_size):
            x = foo_dist.generate()
            foos.append(x)
        #keep track of what combinations have failed and what have passed
        failed_combos = []
        passed_combos = []
        for record_set_size in RECORD_SET_SIZES:
            for range_size in RANGE_SIZES:
                try:
                    (minim, maxim) = foo_dist.generate_two_sided(
                        record_set_size, range_size, db_size)
                except bespoke_distributions.FooInputs:
                    #According to foo no range is possible for that combination
                    #let's go ahead and add to failed combinations
                    failed_combos.append(
                        (db_size, record_set_size, range_size, 0, 0, 0))
                    continue
                #calculate and compare the measured density to our theoretical
                #density, if it is too far away, add to the failed combos
                count_between = len(
                    [foo for foo in foos if minim <= foo <= maxim])
                measured_density = count_between / (maxim - minim)

                desired_density = record_set_size / range_size
                if not close_enough(measured_density, desired_density):
                    failed_combos.append((db_size, record_set_size,
                                          range_size.bit_length(),
                                          minim, maxim, count_between))
                else:
                    passed_combos.append((db_size, record_set_size,
                                          range_size.bit_length(),
                                          minim, maxim, count_between))
        # Create the failure message, if needed
        fail_msg = ''
        if len(failed_combos) != 0:
            (db, rss, rs, _, _, cb) = failed_combos[0]
            fail_msg = "Runs into the law of small numbers; if this fails more"\
                       " than 1 time in 20 then things have gotten fishy. "\
                       "The generated ranges did not support all of the desired "\
                       "combinations. For example, database size: %d, record set "\
                       "size: %d, and range size: %d had %d records returned."\
                       " There are %d other unmatched sets."\
                       % (db, rss, rs, cb, len(failed_combos) - 1)
        # Allow 0 failures: we generate a lot of these in practice,
        # and they just need to match most of the time.
        for x in failed_combos:
            print "failed: ", x
        for x in passed_combos:
            print "passed: ", x
        self.assertTrue(len(failed_combos) == 0, fail_msg)
    def test_generate(self):
        """
        Test that this distribution makes it possible for the query-generator to
        generate the spectrum of queries we'd like to test. That is: for every
        combination of
        
        * database size,
        
        * range-width (arithmetic difference between the range's upper and
        lower bounds) and
        
        * matching record-set size,
        
        ensure that there is some bit-length such that:
        
        * the number of integers of that bit-length is at least as large as
        range-width, and
        
        * we expect that, if we generate 'db-size' values from this
        distribution, the count of numbers generated of that bit-length is
        roughly:

           (record-set-size / range-size) * (number of integers of that bit-length)

        where 'roughly' means within a factor given by CLOSE_ENOUGH. Also,
        test that the number of 64-bit numbers generated is small enough
        to allow grotesque one-sided range queries.
        """

        DB_SIZES = [10**x for x in xrange(5, 10)]
        RECORD_SET_SIZES = [1, 10, 100, 10**3, 10**4, 10**5]
        RANGE_SIZES = [2**(x + 1) for x in xrange(0, 50, 10)]

        monte_carlo_iterations = 100000

        # At 1,000,000 iterations, I can get these down to 0.4 and 10**-15,
        # respectively. But to keep the test running fast with a low
        # probability of breaking, I'm going to set them high.
        CLOSE_ENOUGH = 0.5
        MAX_DENSITY_64 = 10**-14

        foo_dist = bespoke_distributions.FooDistribution()

        # Count the number of times we see a number of a given bit-length
        counts = collections.Counter()
        for _ in xrange(monte_carlo_iterations):
            x = foo_dist.generate()
            bit_len = x.bit_length()
            counts[bit_len] += 1

        # From counts, compute the probability of getting a number of a
        # given bit-length
        probs = {}
        for bit_len, count in counts.items():
            probs[bit_len] = count / monte_carlo_iterations

        for db_size in DB_SIZES:
            # Given a db-size, how 'dense' do we expect the generated numbers
            # to be in a given bit-length?
            density = {}
            for bit_len in xrange(65):
                if bit_len == 0:
                    bin_size = 1
                else:
                    bin_size = (2**bit_len) - (2**(bit_len - 1))
                prob = probs.get(bit_len, 0)
                density[bit_len] = prob * db_size / bin_size
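
            # For example, at bit_len = 3 the integers of that bit-length are
            # 4..7, so bin_size = 4 = 2**2, and density[3] is the expected
            # number of generated values per integer in that bin.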

            # First, let's check one-sided range queries. To allow the
            # kinds of one-sided queries we want, we need the density of
            # 64-bit numbers to be less than MAX_DENSITY_64
            density_64 = density[64]
            fail_msg = "64-bit integers too dense (%.2e) for DB size %.0e " \
                % (density_64, db_size)
            fail_msg += self.seed_msg
            self.assertLessEqual(density_64, MAX_DENSITY_64, fail_msg)

            # For a given range-size, record-set-size combination, look for
            #
            # * a bit-length such that (2**bit-length > range-size)
            #
            # * and density[bit-len] is close to
            #    (record_set_size / range-size)
            #
            # But what do we mean by 'close'? This:

            def close_enough(actual_density, desired_density):
                density_ratio = actual_density / desired_density
                upper_bound = 1 + CLOSE_ENOUGH
                lower_bound = 1 - CLOSE_ENOUGH
                return ((density_ratio >= lower_bound)
                        and (density_ratio <= upper_bound))
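            # With CLOSE_ENOUGH = 0.5 this accepts density ratios anywhere in
            # [0.5, 1.5].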

            for record_set_size in RECORD_SET_SIZES:
                for range_size in RANGE_SIZES:

                    # Some combinations are not going to be possible.
                    # Skip them.

                    if ((record_set_size >= 10**4) and (db_size <= 10**6)):
                        continue
                    if ((record_set_size == 1) and (db_size == 10**9)):
                        continue
                    # Okay, now we know we're dealing with a possible case.
                    # Find the desired density and the minimum bit-length.
                    # Then scan those bit-lengths and look for an actual
                    # density which is close enough to the desired one.

                    desired_density = record_set_size / range_size

                    min_bit_length = range_size.bit_length() + 1

                    close_enoughs = \
                        [close_enough(density[bit_len], desired_density)
                         for bit_len in xrange(min_bit_length, 65)]

                    any_close_enough = any(close_enoughs)

                    fail_msg = \
                        "No density close to desired for record-set %d, "\
                        "range %d, DB size %.0e. " % \
                        (record_set_size, range_size, db_size)
                    fail_msg += self.seed_msg

                    self.assertTrue(any_close_enough, fail_msg)