Exemple #1
0
def count_ways_to_obtain_largest_subpopulation(n, m):
    """Tally, per largest part, the ways of splitting n across m parts.

    Inputs
      :n: total number (e.g., total number of highest scoring results)
      :m: number of non-negative integers to sum to n (e.g., number of
          workers)

    Output
      :ways: defaultdict(int) keyed by the maximum value of a multiset.
             Each value accumulates, over every multiset yielded by
             Multiset.uniq_msets(n, m) that shares that maximum, the
             product multinomial_coeff(grp) * number_of_arrangements(grp).

    Implementation
        The totals are plain sums keyed by max(grp), so the order in
        which uniq_msets() yields its groups does not affect the result.

    """
    multiset = Multiset(n)
    tally = defaultdict(int)
    for parts in multiset.uniq_msets(n, m):
        largest = max(parts)
        coeff = multiset.multinomial_coeff(parts)
        arrangements = multiset.number_of_arrangements(parts)
        tally[largest] += coeff * arrangements
    return tally
Exemple #2
0
def run_example():
    """Demonstrate sample outputs.

    Prints a short example (n=20, m=4) via print_cumulative_prob(), then
    a longer example (n=100, m=10) that reports the single probability
    yielded by compute_probabilities() at count == 21.

    Every message is built first and passed to print() as one
    parenthesised argument, so the output is the same on Python 2 and
    the function also parses on Python 3.

    ::

        >> run_example()    # ADD > to re-activate doctest (runs in ~20 sec)
        Short example, involving 108 multisets
        Printing the probability of missing 1 or more results from the top 20
        results, given 4 workers, as a function of the number of top results
        requested per worker.
        Probability of  5 or more of top 20 from one of 4 sets is 1.0000e+00.
        Probability of  6 or more of top 20 from one of 4 sets is 9.8933e-01.
        Probability of  7 or more of top 20 from one of 4 sets is 7.5516e-01.
        Probability of  8 or more of top 20 from one of 4 sets is 3.9874e-01.
        Probability of  9 or more of top 20 from one of 4 sets is 1.6346e-01.
        Probability of 10 or more of top 20 from one of 4 sets is 5.5457e-02.
        Probability of 11 or more of top 20 from one of 4 sets is 1.5769e-02.
        Probability of 12 or more of top 20 from one of 4 sets is 3.7416e-03.
        Probability of 13 or more of top 20 from one of 4 sets is 7.3482e-04.
        Probability of 14 or more of top 20 from one of 4 sets is 1.1805e-04.
        Probability of 15 or more of top 20 from one of 4 sets is 1.5252e-05.
        Probability of 16 or more of top 20 from one of 4 sets is 1.5461e-06.
        Probability of 17 or more of top 20 from one of 4 sets is 1.1842e-07.
        Probability of 18 or more of top 20 from one of 4 sets is 6.4429e-09.
        Probability of 19 or more of top 20 from one of 4 sets is 2.2192e-10.
        Probability of 20 or more of top 20 from one of 4 sets is 3.6380e-12.
        computing longer example, involving 6292069 multisets ...
        Longer example
        Chance of omitting documents from top 100 when returning 20 results
        from each of 10 workers is 8.0721981476e-03

    """

    mset = Multiset()

    # --- short example ---------------------------------------------------
    n1, m1 = 20, 4
    short_template = (
        'Short example, involving %d multisets\n'
        'Printing the probability of missing 1 or more results from the '
        'top %d\n'
        'results, given %d workers, as a function of the number of top '
        'results\n'
        'requested per worker.')
    short_header = short_template % (
        mset.num_uniq_msets(total=n1, length=m1), n1, m1)
    print(short_header)
    print_cumulative_prob(n=n1, m=m1)

    # --- longer example --------------------------------------------------
    n2, m2 = 100, 10
    num_docs = 20
    num_ms = mset.num_uniq_msets(total=n2, length=m2)
    progress = 'computing longer example, involving %d multisets ...' % num_ms
    print(progress)

    long_template = (
        'Longer example\nChance of omitting documents '
        'from top %d when returning %d results\nfrom each of '
        '%d workers is %0.10e')
    # add one because result is omitted only when set size exceeds request
    threshold = num_docs + 1
    for stats in compute_probabilities(n=n2, m=m2, t=threshold):
        if stats['count'] == threshold:
            summary = long_template % (n2, num_docs, m2, stats['p'])
            print(summary)
Exemple #3
0
def compute_probabilities(n, m, t=()):
    """Compute probability that a result is missed.

    Generator: yields one stats dict per (count, ways) pair produced by
    Multiset(n).num_ways(n, m), optionally stopping early at threshold t.

    Inputs
      :n: total number (e.g., total number of highest scoring results)
      :m: number of non-negative integers to sum to n (e.g., number of
          workers, each returning an integer number of results)
      :t: optional threshold to short-circuit computation
          * integer t is the maximum number of results to return per worker
          * any value rejected by is_nonneg_int() (including the default
            ``()``) means "no threshold": every pair is yielded

    Output
      :stats: dict (a fresh one per yield) containing fields:
          * count is the number of results returned per worker
          * n is the total number of highest scoring results
          * m is the number of workers
          * p is the cumulative probability that a result is missed,
            computed as (m**n minus the ``ways`` of all previously
            yielded counts) / m**n

    Notes
        Iteration stops after yielding the entry whose count equals t, or
        without yielding as soon as a count greater than t is seen.
        NOTE(review): this presumes num_ways() yields counts in
        increasing order -- confirm against Multiset.num_ways.

    """

    # Use None for "no threshold" instead of comparing ints against the
    # ``()`` sentinel: int < tuple is only defined (and always true) under
    # Python 2's mixed-type ordering and raises TypeError on Python 3.
    limit = t if is_nonneg_int(t) else None

    total = m ** n
    denominator = float(total)
    remaining = total  # outcomes not yet accounted for by smaller counts
    mset = Multiset(n)
    for cnt, ways in mset.num_ways(n, m):
        if limit is not None and cnt > limit:
            # Past the threshold: finish with a plain return. Raising
            # StopIteration inside a generator becomes RuntimeError
            # under PEP 479.
            return
        yield {'n': n, 'm': m, 'count': cnt, 'p': remaining / denominator}
        if limit is not None and cnt == limit:
            return
        remaining -= ways
Exemple #4
0
 def setUp(self):
     """Store a default-constructed Multiset on the instance as self.mset.

     NOTE(review): presumably a unittest-style fixture hook run before
     each test -- the enclosing class is not visible here; confirm.
     """
     self.mset = Multiset()