def count_ways_to_obtain_largest_subpopulation(n, m):
    """Tally, per largest part, the ways to split n among m parts.

    Inputs
        :n: total number (e.g., total number of highest scoring results)
        :m: number of non-negative integers to sum to n (e.g., number of
            workers)

    Output
        :ways: defaultdict(int) keyed by the maximum value of a multiset.
               Each value accumulates, over every multiset yielded by
               Multiset.uniq_msets(n, m) that shares that maximum, the
               product of the multiset's multinomial coefficient and its
               number of arrangements.

    Implementation
        Totals are accumulated per key, so the result does not depend on
        the order in which Multiset.uniq_msets() produces the multisets
        (it returns tuples in lexicographical order).
    """
    multisets = Multiset(n)
    totals = defaultdict(int)
    for parts in multisets.uniq_msets(n, m):
        largest = max(parts)
        orderings = multisets.multinomial_coeff(parts)
        layouts = multisets.number_of_arrangements(parts)
        totals[largest] += orderings * layouts
    return totals
def run_example():
    """Demonstrate sample outputs.

    Prints a short example (top 20 results, 4 workers) through
    print_cumulative_prob, then a longer example (top 100 results,
    10 workers, 20 results requested per worker) using
    compute_probabilities.

    Every print uses the single-argument parenthesised form, which
    produces the same output under the Python 2 print statement and the
    Python 3 print function.

    ::

        >> run_example() # ADD > to re-activate doctest (runs in ~20 sec)
        Short example, involving 108 multisets
        Printing the probability of missing 1 or more results from the top
        20 results, given 4 workers, as a function of the number of top
        results requested per worker.
        Probability of 5 or more of top 20 from one of 4 sets is 1.0000e+00.
        Probability of 6 or more of top 20 from one of 4 sets is 9.8933e-01.
        Probability of 7 or more of top 20 from one of 4 sets is 7.5516e-01.
        Probability of 8 or more of top 20 from one of 4 sets is 3.9874e-01.
        Probability of 9 or more of top 20 from one of 4 sets is 1.6346e-01.
        Probability of 10 or more of top 20 from one of 4 sets is 5.5457e-02.
        Probability of 11 or more of top 20 from one of 4 sets is 1.5769e-02.
        Probability of 12 or more of top 20 from one of 4 sets is 3.7416e-03.
        Probability of 13 or more of top 20 from one of 4 sets is 7.3482e-04.
        Probability of 14 or more of top 20 from one of 4 sets is 1.1805e-04.
        Probability of 15 or more of top 20 from one of 4 sets is 1.5252e-05.
        Probability of 16 or more of top 20 from one of 4 sets is 1.5461e-06.
        Probability of 17 or more of top 20 from one of 4 sets is 1.1842e-07.
        Probability of 18 or more of top 20 from one of 4 sets is 6.4429e-09.
        Probability of 19 or more of top 20 from one of 4 sets is 2.2192e-10.
        Probability of 20 or more of top 20 from one of 4 sets is 3.6380e-12.
        computing longer example, involving 6292069 multisets ...
        Longer example
        Chance of omitting documents from top 100 when returning 20 results
        from each of 10 workers is 8.0721981476e-03
    """
    mset = Multiset()

    # Short example: n1 top results spread over m1 workers.
    n1, m1 = 20, 4
    print("""Short example, involving %d multisets Printing the probability of missing 1 or more results from the top %d results, given %d workers, as a function of the number of top results requested per worker.""" % (mset.num_uniq_msets(total=n1, length=m1), n1, m1))
    print_cumulative_prob(n=n1, m=m1)

    # Longer example: n2 top results, m2 workers, num_docs requested each.
    n2, m2 = 100, 10
    num_docs = 20
    num_ms = mset.num_uniq_msets(total=n2, length=m2)
    print('computing longer example, involving %d multisets ...' % num_ms)
    # add one because result is omitted only when set size exceeds request
    for stats in compute_probabilities(n=n2, m=m2, t=num_docs + 1):
        # Only the entry whose count equals the threshold is reported.
        if stats['count'] == num_docs + 1:
            print(' '.join(['Longer example\nChance of omitting documents',
                            'from top %d when returning %d results\nfrom each of',
                            '%d workers is %0.10e'])
                  % (n2, num_docs, m2, stats['p']))
def compute_probabilities(n, m, t=()):
    """Compute probability that a result is missed.

    Generator: yields one stats dict per (count, ways) pair produced by
    Multiset.num_ways(n, m), stopping once the threshold t is reached.

    Inputs
        :n: total number (e.g., total number of highest scoring results)
        :m: number of non-negative integers to sum to n (e.g., number of
            workers, each returning an integer number of results)
        :t: optional threshold to short-circuit computation
            * integer t is the maximum number of results to return per
              worker
            * any value rejected by is_nonneg_int (including the default)
              means no threshold: every pair is reported

    Output
        :stats: dict (a fresh copy on every yield) containing fields:
            * count is the number of results returned per worker
            * n is the total number of highest scoring results
            * m is the number of workers
            * p is the cumulative probability that a result is missed

    NOTE: float(m ** n) raises OverflowError once m ** n exceeds the
    largest representable float.
    """
    # "No threshold" is an explicit infinity. The former sentinel () relied
    # on Python 2 ordering an int below a tuple, which is a TypeError on
    # Python 3.
    limit = t if is_nonneg_int(t) else float('inf')

    numerator = m ** n
    denominator = float(numerator)
    stats = {'n': n, 'm': m, 'count': 0, 'p': 0}
    mset = Multiset(n)
    # NOTE(review): the early exits assume num_ways yields counts in
    # increasing order -- confirm against Multiset.num_ways.
    for cnt, ways in mset.num_ways(n, m):
        if cnt > limit:
            # Past the threshold: finish with a plain return. Raising
            # StopIteration inside a generator becomes RuntimeError under
            # PEP 479.
            return
        stats['count'] = cnt
        stats['p'] = numerator / denominator
        yield stats.copy()
        if cnt == limit:
            return
        # Remove the ways counted for this cnt before the next entry.
        numerator -= ways
def setUp(self):
    """Bind a default-constructed Multiset to self.mset."""
    # NOTE(review): presumably the unittest per-test fixture hook; the
    # enclosing class is outside this view -- confirm.
    self.mset = Multiset()