def _compute_upper_bound_tight(self, tree, n_parts, n_examples, n_features):
    """
    Optimized implementation of Algorithm 1 of Appendix D of the paper.

    Returns an upper bound on the number of ways ``tree`` can split
    ``n_examples`` examples described by ``n_features`` features into
    ``n_parts`` parts.  Results are memoized in ``self.pfub_table``, keyed
    first by tree and then by ``(n_parts, n_examples, n_features)``.

    Trivial cases are answered without touching the table:
      * 0 when there are more parts than examples or than leaves,
      * 1 when ``n_parts == n_examples``, ``n_parts == 1`` or
        ``n_examples == 1``,
      * ``stirling(n_examples, n_parts)`` when there are no more examples
        than leaves.
    Otherwise the recursive sum is evaluated and capped by
    ``stirling(n_examples, n_parts)``.
    """
    c, m, l = n_parts, n_examples, n_features

    # Trivial cases.
    if c > m or c > tree.n_leaves:
        return 0
    if c == m or c == 1 or m == 1:
        return 1
    if m <= tree.n_leaves:
        return stirling(m, c)

    # Modification 1: Check first in the table if value is already computed.
    table = self.pfub_table
    if tree not in table:
        table[tree] = {}
    memo = table[tree]
    key = (c, m, l)

    if key not in memo:
        left = tree.left_subtree
        right = tree.right_subtree
        total = 0

        # k is the number of examples sent to the left subtree; each side
        # must receive at least as many examples as it has leaves.
        for k in range(left.n_leaves, m - right.n_leaves + 1):
            n_splits = min(2 * l, binom(m, k))
            if c == 2:
                # Modification 2: Since c = 2 is the most common use case,
                # the sum over a and b is written out explicitly.
                n_left = self._compute_upper_bound_tight(left, 2, k, l)
                n_right = self._compute_upper_bound_tight(right, 2, m - k, l)
                total += n_splits * (
                    1 + 2 * n_left + 2 * n_right + 2 * n_left * n_right
                )
            else:
                total += n_splits * sum(
                    sum(
                        binom(a, c - b)
                        * binom(b, c - a)
                        * factorial(a + b - c)
                        * self._compute_upper_bound_tight(left, a, k, l)
                        * self._compute_upper_bound_tight(right, b, m - k, l)
                        for b in range(max(1, c - a), c + 1)
                    )
                    for a in range(1, c + 1)
                )

        # Identical subtrees: the accumulated count is halved.
        if left == right:
            total /= 2

        # Modification 3: Add value to look up table.
        memo[key] = min(total, stirling(n_examples, n_parts))

    return memo[key]
def _compute_upper_bound_loose(self, tree, n_parts, n_examples, n_features):
    """
    Looser but faster implementation of Algorithm 1 of Appendix D of the
    paper. The corresponding equation can be found at the end of section 6.2.

    Instead of summing over every admissible split size, a single term is
    evaluated with each subtree given the largest number of examples it may
    receive, and the result is multiplied by ``n_examples - tree.n_leaves``.
    Values are memoized in ``self.pfub_table`` (keyed by tree, then by
    ``(n_parts, n_examples, n_features)``) and capped by
    ``stirling(n_examples, n_parts)``.
    """
    c, m, l = n_parts, n_examples, n_features

    # Trivial cases.
    if c > m or c > tree.n_leaves:
        return 0
    if c == m or c == 1 or m == 1:
        return 1
    if m <= tree.n_leaves:
        return stirling(m, c)

    table = self.pfub_table
    if tree not in table:
        table[tree] = {}
    memo = table[tree]
    key = (c, m, l)

    if key not in memo:
        left = tree.left_subtree
        right = tree.right_subtree
        # Largest number of examples each side can receive while leaving at
        # least one example per leaf for the other side.
        k_left = m - right.n_leaves
        k_right = m - left.n_leaves

        if c == 2:
            n_left = self._compute_upper_bound_loose(left, 2, k_left, l)
            n_right = self._compute_upper_bound_loose(right, 2, k_right, l)
            total = 2 * l * (
                1 + 2 * n_left + 2 * n_right + 2 * n_left * n_right
            )
        else:
            total = 2 * l * sum(
                sum(
                    binom(a, c - b)
                    * binom(b, c - a)
                    * factorial(a + b - c)
                    * self._compute_upper_bound_loose(left, a, k_left, l)
                    * self._compute_upper_bound_loose(right, b, k_right, l)
                    for b in range(max(1, c - a), c + 1)
                )
                for a in range(1, c + 1)
            )

        total *= m - tree.n_leaves

        # Identical subtrees: the accumulated count is halved.
        if left == right:
            total /= 2

        memo[key] = min(total, stirling(n_examples, n_parts))

    return memo[key]
def _resample_auxiliary_transition_atom_mh(alpha, beta, n, m_curr, use_approximation=True):
    """
    Resample an auxiliary transition value with one Metropolis-Hastings step.

    A candidate is drawn uniformly from ``1..n`` and accepted with
    probability ``min(1, p(candidate) / p(m_curr))``.  Rejections keep the
    current value, which can slow convergence (the values are less dynamic)
    but keeps each step cheap.

    :param alpha: first factor of the weight ``alpha * beta``
    :param beta: second factor of the weight ``alpha * beta``
    :param n: upper end of the proposal range; values below 1 are treated
        as 1
    :param m_curr: current value; if it exceeds ``n`` the proposal is
        returned unconditionally
    :param use_approximation: when true and ``n > 10``, use a closed-form
        approximation of the log probability ratio instead of evaluating
        Stirling numbers of the first kind
    :return: the proposed value if accepted, otherwise ``m_curr``
    """
    # propose new m
    n = max(n, 1)
    m_proposed = random.choice(range(1, n + 1))
    if m_curr > n:
        return m_proposed

    # find relative probabilities (as a log ratio)
    if not (use_approximation and n > 10):
        weight = alpha * beta
        p_curr = float(stirling(n, m_curr, kind=1)) * (weight**m_curr)
        p_proposed = float(stirling(n, m_proposed, kind=1)) * (weight**m_proposed)
        logp_diff = np.log(p_proposed) - np.log(p_curr)
    else:
        step = m_proposed - m_curr
        logp_diff = (
            (m_proposed - 0.5) * np.log(m_proposed)
            - (m_curr - 0.5) * np.log(m_curr)
            + step * np.log(alpha * beta * np.exp(1))
            + step * np.log(0.57721 + np.log(n - 1))
        )

    # use MH variable to decide whether to accept m_proposed
    with catch_warnings(record=True) as caught_warnings:
        acceptance = min(1, np.exp(logp_diff))
        accepted = bool(np.random.binomial(n=1, p=acceptance))
        # any warning raised while computing the acceptance forces acceptance
        if caught_warnings:
            accepted = True

    if accepted:
        return m_proposed
    return m_curr
def _resample_auxiliary_transition_atom_complete(alpha, beta, n, use_approximation=True):
    """
    Resample an auxiliary transition value by inverse-CDF sampling.

    Probabilities of ``m = 1, 2, ...`` are accumulated until they exceed a
    uniform draw, which avoids the slowdown in convergence caused by
    Metropolis-Hastings rejections but is more computationally costly.

    :param alpha: first factor of the weight ``scale = alpha * beta``
        (assumed positive, since its logarithm is taken)
    :param beta: second factor of the weight ``scale = alpha * beta``
    :param n: largest value that can be returned; for ``n <= 1`` the
        result is always 1
    :param use_approximation: if false, start with exact probabilities based
        on Stirling numbers of the first kind and fall back to the
        approximation after the first ``RecursionError``/``OverflowError``;
        if true, use the approximation throughout
    :return: the sampled value, an integer in ``[1, max(n, 1)]``
    """
    # The uniform draw always happens first, so the random stream consumed by
    # this function does not depend on which branch is taken below.
    p_required = np.random.uniform(0, 1)

    # With at most one admissible value there is nothing to sample; this also
    # avoids log(0) in the approximation when n == 1.
    if n <= 1:
        return 1

    m = 0
    p_cumulative = 0
    scale = alpha * beta

    if not use_approximation:
        # use precise probabilities
        try:
            # gammaln avoids the overflow of gamma(scale + n) for large n,
            # which would otherwise turn every probability into zero.
            logp_constant = special.gammaln(scale) - special.gammaln(scale + n)
            # The parentheses matter: the loop must stop at m == n even if
            # the cumulative probability is still zero because of underflow.
            while (p_cumulative == 0 or p_cumulative < p_required) and m < n:
                # accumulate probability
                m += 1
                logp_accept = (
                    m * np.log(scale)
                    + np.log(stirling(n, m, kind=1))
                    + logp_constant
                )
                p_cumulative += np.exp(logp_accept)
        # after one failure use only the approximation
        except (RecursionError, OverflowError):
            # the failed step incremented m without adding its probability
            m -= 1

    while p_cumulative < p_required and m < n:
        # problems with stirling recursion (large n & m), use approximation
        # instead; the magic number is the Euler constant
        m += 1
        logp_accept = (
            m
            + (m + scale - 0.5) * np.log(scale)
            + (m - 1) * np.log(0.57721 + np.log(n - 1))
            - (m - 0.5) * np.log(m)
            - scale * np.log(scale + n)
            - scale
        )
        p_cumulative += np.exp(logp_accept)

    # the loops stop once m is sufficiently large (or reaches n)
    return max(m, 1)
def test_nC_nP_nT():
    """Cross-check nP, nC, nT and stirling against brute-force enumeration.

    Random 7-letter strings are used as multisets; every count returned by
    ``nP``/``nC``/``nT`` is compared with the length of the corresponding
    enumeration from ``sympy.utilities.iterables``.  Fixed tables of Stirling
    numbers and a few coverage-only assertions follow.  A failing randomized
    check prints the offending string and size, then raises ``ValueError``.
    """
    from sympy.utilities.iterables import (
        multiset_permutations,
        multiset_combinations,
        multiset_partitions,
        partitions,
        subsets,
        permutations,
    )
    from sympy.functions.combinatorial.numbers import (
        nP,
        nC,
        nT,
        stirling,
        _multiset_histogram,
        _AOP_product,
    )
    from sympy.combinatorics.permutations import Permutation
    from sympy.core.numbers import oo
    from random import choice

    c = string.ascii_lowercase

    # permutations of a random multiset
    for _ in range(100):
        s = ''.join(choice(c) for i in range(7))
        u = len(s) == len(set(s))  # True when all letters are distinct
        try:
            tot = 0
            for i in range(8):
                check = nP(s, i)
                tot += check
                assert len(list(multiset_permutations(s, i))) == check
                if u:
                    assert nP(len(s), i) == check
            assert nP(s) == tot
        except AssertionError:
            print(s, i, 'failed perm test')
            raise ValueError()

    # combinations of a random multiset
    for _ in range(100):
        s = ''.join(choice(c) for i in range(7))
        u = len(s) == len(set(s))
        try:
            tot = 0
            for i in range(8):
                check = nC(s, i)
                tot += check
                assert len(list(multiset_combinations(s, i))) == check
                if u:
                    assert nC(len(s), i) == check
            assert nC(s) == tot
            if u:
                assert nC(len(s)) == tot
        except AssertionError:
            print(s, i, 'failed combo test')
            raise ValueError()

    # integer partitions
    for i in range(1, 10):
        tot = 0
        for j in range(1, i + 2):
            check = nT(i, j)
            tot += check
            assert sum(1 for p in partitions(i, j, size=True)
                       if p[0] == j) == check
        assert nT(i) == tot

    # set partitions of distinct items
    for i in range(1, 10):
        tot = 0
        for j in range(1, i + 2):
            check = nT(range(i), j)
            tot += check
            assert len(list(multiset_partitions(list(range(i)), j))) == check
        assert nT(range(i)) == tot

    # partitions of a random multiset
    for _ in range(100):
        s = ''.join(choice(c) for i in range(7))
        u = len(s) == len(set(s))
        try:
            tot = 0
            for i in range(1, 8):
                check = nT(s, i)
                tot += check
                assert len(list(multiset_partitions(s, i))) == check
                if u:
                    assert nT(range(len(s)), i) == check
            if u:
                assert nT(range(len(s))) == tot
            assert nT(s) == tot
        except AssertionError:
            print(s, i, 'failed partition test')
            raise ValueError()

    # tests for Stirling numbers of the first kind that are not tested in the
    # above
    assert [stirling(9, i, kind=1) for i in range(11)] == [
        0, 40320, 109584, 118124, 67284, 22449, 4536, 546, 36, 1, 0]
    perms = list(permutations(range(4)))
    assert [sum(1 for p in perms if Permutation(p).cycles == i)
            for i in range(5)] == [0, 6, 11, 6, 1] == [
            stirling(4, i, kind=1) for i in range(5)]
    # http://oeis.org/A008275
    assert [stirling(n, k, signed=1)
            for n in range(10) for k in range(1, n + 1)] == [
        1,
        -1, 1,
        2, -3, 1,
        -6, 11, -6, 1,
        24, -50, 35, -10, 1,
        -120, 274, -225, 85, -15, 1,
        720, -1764, 1624, -735, 175, -21, 1,
        -5040, 13068, -13132, 6769, -1960, 322, -28, 1,
        40320, -109584, 118124, -67284, 22449, -4536, 546, -36, 1]
    # https://en.wikipedia.org/wiki/Stirling_numbers_of_the_first_kind
    assert [stirling(n, k, kind=1)
            for n in range(10) for k in range(n + 1)] == [
        1,
        0, 1,
        0, 1, 1,
        0, 2, 3, 1,
        0, 6, 11, 6, 1,
        0, 24, 50, 35, 10, 1,
        0, 120, 274, 225, 85, 15, 1,
        0, 720, 1764, 1624, 735, 175, 21, 1,
        0, 5040, 13068, 13132, 6769, 1960, 322, 28, 1,
        0, 40320, 109584, 118124, 67284, 22449, 4536, 546, 36, 1]
    # https://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind
    assert [stirling(n, k, kind=2)
            for n in range(10) for k in range(n + 1)] == [
        1,
        0, 1,
        0, 1, 1,
        0, 1, 3, 1,
        0, 1, 7, 6, 1,
        0, 1, 15, 25, 10, 1,
        0, 1, 31, 90, 65, 15, 1,
        0, 1, 63, 301, 350, 140, 21, 1,
        0, 1, 127, 966, 1701, 1050, 266, 28, 1,
        0, 1, 255, 3025, 7770, 6951, 2646, 462, 36, 1]
    # k > n gives zero for BOTH kinds (previously kind=1 was compared with
    # itself, leaving kind=2 unchecked)
    assert stirling(3, 4, kind=1) == stirling(3, 4, kind=2) == 0
    raises(ValueError, lambda: stirling(-2, 2))

    def delta(p):
        # smallest pairwise distance inside a part; singletons never violate
        if len(p) == 1:
            return oo
        return min(abs(i[0] - i[1]) for i in subsets(p, 2))
    parts = multiset_partitions(range(5), 3)
    d = 2
    assert (sum(1 for p in parts if all(delta(i) >= d for i in p)) ==
            stirling(5, 3, d=d) == 7)

    # other coverage tests
    assert nC('abb', 2) == nC('aab', 2) == 2
    assert nP(3, 3, replacement=True) == nP('aabc', 3, replacement=True) == 27
    assert nP(3, 4) == 0
    assert nP('aabc', 5) == 0
    assert nC(4, 2, replacement=True) == nC('abcdd', 2, replacement=True) == \
        len(list(multiset_combinations('aabbccdd', 2))) == 10
    assert nC('abcdd') == sum(nC('abcdd', i) for i in range(6)) == 24
    assert nC(list('abcdd'), 4) == 4
    assert nT('aaaa') == nT(4) == len(list(partitions(4))) == 5
    assert nT('aaab') == len(list(multiset_partitions('aaab'))) == 7
    assert nC('aabb' * 3, 3) == 4  # aaa, bbb, abb, baa
    assert dict(_AOP_product((4, 1, 1, 1))) == {
        0: 1, 1: 4, 2: 7, 3: 8, 4: 8, 5: 7, 6: 4, 7: 1}
    # the following was the first t that showed a problem in a previous form of
    # the function, so it's not as random as it may appear
    t = (3, 9, 4, 6, 6, 5, 5, 2, 10, 4)
    assert sum(_AOP_product(t)[i] for i in range(55)) == 58212000
    raises(ValueError, lambda: _multiset_histogram({1: 'a'}))
def test_nC_nP_nT():
    """Validate nP, nC, nT and stirling against explicit enumeration.

    Counts for random 7-letter multisets are compared with the number of
    items produced by the matching iterator in ``sympy.utilities.iterables``;
    known tables of Stirling numbers and several coverage assertions are
    checked afterwards.  When a randomized check fails, the string and size
    involved are printed and ``ValueError`` is raised.
    """
    from sympy.utilities.iterables import (
        multiset_permutations, multiset_combinations, multiset_partitions,
        partitions, subsets, permutations)
    from sympy.functions.combinatorial.numbers import (
        nP, nC, nT, stirling, _multiset_histogram, _AOP_product)
    from sympy.combinatorics.permutations import Permutation
    from sympy.core.numbers import oo
    from random import choice

    c = string.ascii_lowercase

    # nP versus multiset_permutations
    for _ in range(100):
        s = ''.join(choice(c) for i in range(7))
        u = len(s) == len(set(s))  # all letters distinct?
        try:
            tot = 0
            for i in range(8):
                check = nP(s, i)
                tot += check
                assert len(list(multiset_permutations(s, i))) == check
                if u:
                    assert nP(len(s), i) == check
            assert nP(s) == tot
        except AssertionError:
            print(s, i, 'failed perm test')
            raise ValueError()

    # nC versus multiset_combinations
    for _ in range(100):
        s = ''.join(choice(c) for i in range(7))
        u = len(s) == len(set(s))
        try:
            tot = 0
            for i in range(8):
                check = nC(s, i)
                tot += check
                assert len(list(multiset_combinations(s, i))) == check
                if u:
                    assert nC(len(s), i) == check
            assert nC(s) == tot
            if u:
                assert nC(len(s)) == tot
        except AssertionError:
            print(s, i, 'failed combo test')
            raise ValueError()

    # nT on integers versus partitions
    for i in range(1, 10):
        tot = 0
        for j in range(1, i + 2):
            check = nT(i, j)
            tot += check
            assert sum(1 for p in partitions(i, j, size=True)
                       if p[0] == j) == check
        assert nT(i) == tot

    # nT on distinct items versus multiset_partitions
    for i in range(1, 10):
        tot = 0
        for j in range(1, i + 2):
            check = nT(range(i), j)
            tot += check
            assert len(list(multiset_partitions(range(i), j))) == check
        assert nT(range(i)) == tot

    # nT on random multisets versus multiset_partitions
    for _ in range(100):
        s = ''.join(choice(c) for i in range(7))
        u = len(s) == len(set(s))
        try:
            tot = 0
            for i in range(1, 8):
                check = nT(s, i)
                tot += check
                assert len(list(multiset_partitions(s, i))) == check
                if u:
                    assert nT(range(len(s)), i) == check
            if u:
                assert nT(range(len(s))) == tot
            assert nT(s) == tot
        except AssertionError:
            print(s, i, 'failed partition test')
            raise ValueError()

    # tests for Stirling numbers of the first kind that are not tested in the
    # above
    assert [stirling(9, i, kind=1) for i in range(11)] == [
        0, 40320, 109584, 118124, 67284, 22449, 4536, 546, 36, 1, 0]
    perms = list(permutations(range(4)))
    assert [sum(1 for p in perms if Permutation(p).cycles == i)
            for i in range(5)] == [0, 6, 11, 6, 1] == [
            stirling(4, i, kind=1) for i in range(5)]
    # http://oeis.org/A008275
    assert [stirling(n, k, signed=1)
            for n in range(10) for k in range(1, n + 1)] == [
        1,
        -1, 1,
        2, -3, 1,
        -6, 11, -6, 1,
        24, -50, 35, -10, 1,
        -120, 274, -225, 85, -15, 1,
        720, -1764, 1624, -735, 175, -21, 1,
        -5040, 13068, -13132, 6769, -1960, 322, -28, 1,
        40320, -109584, 118124, -67284, 22449, -4536, 546, -36, 1]
    # http://en.wikipedia.org/wiki/Stirling_numbers_of_the_first_kind
    assert [stirling(n, k, kind=1)
            for n in range(10) for k in range(n+1)] == [
        1,
        0, 1,
        0, 1, 1,
        0, 2, 3, 1,
        0, 6, 11, 6, 1,
        0, 24, 50, 35, 10, 1,
        0, 120, 274, 225, 85, 15, 1,
        0, 720, 1764, 1624, 735, 175, 21, 1,
        0, 5040, 13068, 13132, 6769, 1960, 322, 28, 1,
        0, 40320, 109584, 118124, 67284, 22449, 4536, 546, 36, 1]
    # http://en.wikipedia.org/wiki/Stirling_numbers_of_the_second_kind
    assert [stirling(n, k, kind=2)
            for n in range(10) for k in range(n+1)] == [
        1,
        0, 1,
        0, 1, 1,
        0, 1, 3, 1,
        0, 1, 7, 6, 1,
        0, 1, 15, 25, 10, 1,
        0, 1, 31, 90, 65, 15, 1,
        0, 1, 63, 301, 350, 140, 21, 1,
        0, 1, 127, 966, 1701, 1050, 266, 28, 1,
        0, 1, 255, 3025, 7770, 6951, 2646, 462, 36, 1]
    # both kinds must vanish when k > n (the earlier form compared the
    # kind=1 call with itself and never exercised kind=2)
    assert stirling(3, 4, kind=1) == stirling(3, 4, kind=2) == 0
    raises(ValueError, lambda: stirling(-2, 2))

    def delta(p):
        # minimum pairwise distance within a part; a singleton imposes none
        if len(p) == 1:
            return oo
        return min(abs(i[0] - i[1]) for i in subsets(p, 2))
    parts = multiset_partitions(range(5), 3)
    d = 2
    assert (sum(1 for p in parts if all(delta(i) >= d for i in p)) ==
            stirling(5, 3, d=d) == 7)

    # other coverage tests
    assert nC('abb', 2) == nC('aab', 2) == 2
    assert nP(3, 3, replacement=True) == nP('aabc', 3, replacement=True) == 27
    assert nP(3, 4) == 0
    assert nP('aabc', 5) == 0
    assert nC(4, 2, replacement=True) == nC('abcdd', 2, replacement=True) == \
        len(list(multiset_combinations('aabbccdd', 2))) == 10
    assert nC('abcdd') == sum(nC('abcdd', i) for i in range(6)) == 24
    assert nC(list('abcdd'), 4) == 4
    assert nT('aaaa') == nT(4) == len(list(partitions(4))) == 5
    assert nT('aaab') == len(list(multiset_partitions('aaab'))) == 7
    assert nC('aabb'*3, 3) == 4  # aaa, bbb, abb, baa
    assert dict(_AOP_product((4, 1, 1, 1))) == {
        0: 1, 1: 4, 2: 7, 3: 8, 4: 8, 5: 7, 6: 4, 7: 1}
    # the following was the first t that showed a problem in a previous form of
    # the function, so it's not as random as it may appear
    t = (3, 9, 4, 6, 6, 5, 5, 2, 10, 4)
    assert sum(_AOP_product(t)[i] for i in range(55)) == 58212000
    raises(ValueError, lambda: _multiset_histogram({1: 'a'}))
def test_F8():
    """Signed Stirling number for n=5, k=2 must equal -50."""
    # if signed, then kind=1
    expected = -50
    result = stirling(5, 2, signed=True)
    assert result == expected
def test_F8():
    """Check that ``stirling(5, 2, signed=True)`` evaluates to -50."""
    # if signed, then kind=1
    signed_value = stirling(5, 2, signed=True)
    assert signed_value == -50
class MSampler(object):
    """Sampler for the integer matrix ``m`` that mirrors ``doc_topic_counts``.

    ``m`` has one entry per (j, k) cell of the z-sampler's
    ``doc_topic_counts`` matrix.  The original comments describe an entry as
    the number of tables in ``j`` serving dish ``k``.
    """

    # Log Stirling numbers of the first kind: looked up in the preloaded
    # matrix when it is available, otherwise computed on the fly with sympy.
    if STIRLING_LOADED:
        def stirling_mat(self, x, y):
            return _stirling_mat[x, y]
    else:
        lgg.error(
            'stirling.npy file not found, using sympy instead (MMSB_CGS model will be 100 time slower !)'
        )

        def stirling_mat(self, x, y):
            return np.asarray(
                [float(sympy.log(stirling(x, i, kind=1)).evalf()) for i in y])

    def __init__(self, zsampler):
        self.zsampler = zsampler
        self.get_log_alpha_beta = zsampler.get_log_alpha_beta
        self.count_k_by_j = zsampler.doc_topic_counts

        # We don't know the preconfiguration of tables, so every cell starts
        # at one.
        self.m = np.ones(self.count_k_by_j.shape, dtype=int)
        self.m_dotk = self.m.sum(axis=0)

    def sample(self):
        """Resample every entry of ``m`` and return the updated matrix."""
        self._update_m()
        lgg.debug('Sample m...')

        for (j, k), count in np.ndenumerate(self.count_k_by_j):
            if count > 0:
                # Sample number of tables in j serving dish k
                # (categorical indices are zero-based, hence the +1).
                self.m[j, k] = categorical(self.prob_jk(j, k)) + 1
            else:
                self.m[j, k] = 0

        self.m_dotk = self.m.sum(0)
        self.purge_empty_tables()
        return self.m

    def _update_m(self):
        """Align the columns of ``m`` with the z-sampler's current topics."""
        # Remove tables associated with purged topics; highest index first
        # so the remaining indices stay valid.
        for purged in sorted(self.zsampler.last_purged_topics, reverse=True):
            self.m = np.delete(self.m, purged, axis=1)

        # Passed by reference, but why not...
        self.count_k_by_j = self.zsampler.doc_topic_counts

        # Add empty tables for newly created topics.
        n_topics = self.count_k_by_j.shape[1]
        new_k = n_topics - self.m.shape[1]
        if new_k > 0:
            lgg.debug('msampler: %d new topics' % (new_k))
            n_rows = self.m.shape[0]
            padding = np.zeros((n_rows, new_k), dtype=int)
            self.m = np.hstack((self.m, padding))

    # Removes empty table.
    def purge_empty_tables(self):
        # cant be.
        pass

    def prob_jk(self, j, k):
        """Return normalized parameters over candidate values of ``m[j, k]``."""
        # -1 because table of current sample topic jk, is not conditioned on
        njdotk = self.count_k_by_j[j, k]
        if njdotk == 1:
            return np.ones(1)

        candidates = np.arange(1, njdotk)  # +1-1
        log_ab = self.get_log_alpha_beta(k)
        ab = np.exp(log_ab)
        normalizer = gammaln(ab) - gammaln(ab + njdotk)
        log_stir = self.stirling_mat(njdotk, candidates)

        params = normalizer + log_stir + candidates * log_ab
        return lognormalize(params)
def test_stirling_matrices(self):
    """
    Test the calculation of Stirling numbers of first kind.

    Compares the matrix helpers under test with reference values from
    ``stirling(..., kind=1)``, and the normalized/lite variants with each
    other.
    """
    def reference_row(row_index, width):
        # One row of reference values, columns 0..width-1.
        return array(
            [int(stirling(row_index, k, kind=1)) for k in xrange(width)])

    # Test the unnormalized Stirling numbers
    for K in xrange(3, 21):
        expected = array([reference_row(k0, K + 1) for k0 in xrange(K + 1)])
        actual = get_stirling_numbers(K)
        self.assertTrue(allclose(expected, actual))

    # Test the normalized Stirling numbers using sympy
    for K in xrange(3, 21):
        rows = []
        for k0 in xrange(K + 1):
            row = reference_row(k0, K + 1)
            # each row is scaled by its own maximum
            rows.append(row / float(row.max()))
        expected = array(rows)
        actual = get_normalized_stirling_numbers(K)
        self.assertTrue(allclose(expected, actual))

    # Test the normalized Stirling numbers using hand-made numpy function
    for K in (5, 10, 20, 30, 40, 50, 100, 200, 1000):
        expected = stirlingnumbers(K)
        actual = get_normalized_stirling_numbers(K)
        self.assertTrue(allclose(expected, actual))

    # Test the lite versions of the Python stirling matrix calculators.
    n = 100  # number of columns in the matrix (less one)
    k = 10   # number of elements in the I index array
    for _ in xrange(100):
        I = sorted(permutation(n)[:k])
        for lite in (stirlingnumbers2, stirlingnumbers3):
            self.assertTrue(allclose(stirlingnumbers(max(I))[I], lite(I)))

        S = get_normalized_stirling_numbers2(I, max(I), len(I))
        for lite in (stirlingnumbers3, stirlingnumbers2):
            self.assertTrue(allclose(S, lite(I)))
        self.assertTrue(allclose(S, stirlingnumbers(max(I))[I]))
def apply(self, m, n, evaluation):
    "%(name)s[n_Integer, m_Integer]"
    # NOTE(review): the docstring above is kept verbatim; it looks like a
    # pattern string consumed at runtime rather than documentation - confirm.
    # Returns the Stirling number of the second kind for the integer values
    # of n and m, wrapped in Integer.
    return Integer(stirling(n.get_int_value(), m.get_int_value(), kind=2))
def _stirling_table_dishe(self, n, m):
    """Return the log of ``stirling(n, m, kind=self.kind)``, or ``np.inf``
    when ``m > n``.

    NOTE(review): the ``m > n`` case returns +inf although the Stirling
    number is presumably zero there (log would be -inf) - confirm this is
    what callers expect.
    """
    return (
        np.inf
        if m > n
        else sym.log(stirling(n, m, kind=self.kind)).evalf()
    )