def test_cross_validator_with_default_indices():
    n_samples = 4
    n_unique_labels = 4
    n_folds = 2
    p = 2
    n_iter = 10  # (the default value)
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_folds)
    skf = StratifiedKFold(n_folds)
    lolo = LeaveOneLabelOut()
    lopo = LeavePLabelOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = number of unique folds = 2
    n_splits = [n_samples, comb(n_samples, p), n_folds, n_folds,
                n_unique_labels, comb(n_unique_labels, p), n_iter, 2]

    for i, cv in enumerate([loo, lpo, kf, skf, lolo, lopo, ss, ps]):
        # Test if get_n_splits works correctly
        assert_equal(n_splits[i], cv.get_n_splits(X, y, labels))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, labels)),
                                list(cv.split(X_1d, y, labels)))

        # Test that train, test indices returned are integers
        for train, test in cv.split(X, y, labels):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')
def findF(k, p, q, x, side):
    # print k, p, q, x
    if k in Fdict:
        if p in Fdict[k]:
            if q in Fdict[k][p]:
                if x in Fdict[k][p][q]:
                    if side in Fdict[k][p][q][x]:
                        return Fdict[k][p][q][x][side]
    if q > 0:
        if side == LEFT:  # left
            ret = 0
            for l in xrange(min(p*q+1, x+1)):
                ret += findF(k, p, 0, x-l, LEFT)*sc.comb(p*q, l)
        elif side == RIGHT:  # right
            ret = 0
            for l in xrange(min(p*q+1, x+1)):
                if l == 0:
                    ret += 0  # findF(k, p, 0, x-l, LEFT) requires at least one edge from left to right.
                else:
                    # external nodes can only connect the "root" of the right tree
                    ret += findF(k, p, 0, x-l, LEFT)*sc.comb(q, l)
                    # ret += findF(k, p, 0, x-l, LEFT)*sc.comb(q, 1)*sc.comb((p-1)*q, l-1)
    else:
        if k == 0:
            if p == 1:
                if q == 0:
                    if x == 0:
                        ret = 1
                    else:
                        ret = 0
                else:
                    print "Error 1"
            else:
                print "Error 0"
        else:
            ret = 0
            for i in xrange(k):
                tmp = 0
                for j in xrange(k-i-1, x-i):
                    tmp += findF(k-i-1, k-i, i+1, j, LEFT)*findF(i, i+1, k-i, x-j, RIGHT)
                ret += tmp*sc.comb(k-1, i)
    if k not in Fdict:
        Fdict[k] = {}
    if p not in Fdict[k]:
        Fdict[k][p] = {}
    if q not in Fdict[k][p]:
        Fdict[k][p][q] = {}
    if x not in Fdict[k][p][q]:
        Fdict[k][p][q][x] = {}
    Fdict[k][p][q][x][side] = ret
    return ret
def solution3():
    '''
    >>> solution3()
    137846528820
    '''
    from scipy.misc import comb
    print comb(40, 20, exact=True)
def eval_gof( self, data): probs = self.eval(data.levels) expected = probs * data.Ntrials # Compute various goodness of fit statistics LL = -2*sum((data.Ncorr*np.log(expected/(data.Ncorr+np.finfo(float).eps)) +(data.Ntrials-data.Ncorr)*np.log((data.Ntrials-expected)/(data.Ntrials-data.Ncorr+np.finfo(float).eps)))); X2 = sum((data.Ncorr-expected)**2./expected/(1.-probs)) # Adding eps to avoid log(0) Nans self.prinsNLL = -sum( data.Ncorr*np.log(probs+np.finfo(float).eps)+(data.Ntrials-data.Ncorr)*np.log(1.-probs+np.finfo(float).eps) ) # Treutwein/Strasburger 1999 Eq 6 (likelihood of the data) L_ts = 2**(sum( data.Ntrials )) LL_ts = 0.0 #L_ts = 1.0 for level in np.arange( len(data.levels) ): # TODO: is right to use observed data or function values?: next two lines can chg to try fitted thisN = data.Ntrials[level] thisCorr = data.Ncorr[level] L_ts *= misc.comb( thisN, thisCorr ) * (probs[level]**thisCorr) * (1.0 - probs[level])**(thisN-thisCorr) LL_ts += np.log(misc.comb( thisN, thisCorr )) + thisCorr*np.log(probs[level]) +np.log(1.0 - probs[level])*(thisN-thisCorr) #TODO: This is how Prins' clamps the lapse. Parameterize. if (self.params[self.PARAM_UPPER] < 0) or (self.params[self.PARAM_UPPER] > 0.05): self.prinsNLL=np.inf return probs,LL,X2,L_ts,LL_ts,self.prinsNLL
def centroid(self):
    '''Find the centroid x and y from these coefficients'''
    xcen = 0.0
    ycen = 0.0
    fluxtot = self.total_flux()
    for i1 in range(self.n1):
        if i1 % 2 == 0: continue  # consider odd i1
        for i2 in range(self.n2):
            if i2 % 2 != 0: continue  # consider even i2
            xcen = xcen + np.power(i1+1, 0.5)*np.power(2, 0.5*(2-i1-i2)) * \
                np.power(comb(i1+1, (i1+1)/2)*comb(i2, i2/2), 0.5)*self.coeff[i1, i2]
    for i1 in range(self.n1):
        if i1 % 2 != 0: continue  # consider even i1
        for i2 in range(self.n2):
            if i2 % 2 == 0: continue  # consider odd i2
            ycen = ycen + np.power(i2+1, 0.5)*np.power(2, 0.5*(2-i2-i1)) * \
                np.power(comb(i2+1, (i2+1)/2)*comb(i1, i1/2), 0.5)*self.coeff[i1, i2]
    xcen = xcen*np.sqrt(pi)*self.beta*self.beta/fluxtot
    ycen = ycen*np.sqrt(pi)*self.beta*self.beta/fluxtot
    return xcen, ycen
def get_motifspace_size(q, n):
    """Return the size of the motif search space, following the equation given in
    Section 3.1 of the paper."""
    return reduce(
        lambda x, y: x + (int(sc.comb(q, y, exact=True)) * 4 ** (q - y)),
        [i for i in range(1, n + 1)],
        int(sc.comb(q, 0, exact=True)) * 4 ** (q - 0),
    )
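# Hedged sanity check for the snippet above (not from the original source): the
# reduce() expression is just sum_{i=0}^{n} C(q, i) * 4**(q - i).  It assumes `sc`
# is scipy.misc (scipy.special in newer SciPy) and that `reduce` is available
# (a builtin on Python 2, functools.reduce on Python 3).
import scipy.misc as sc

def motifspace_size_loop(q, n):
    return sum(int(sc.comb(q, i, exact=True)) * 4 ** (q - i) for i in range(n + 1))

# e.g. motifspace_size_loop(8, 3) should equal get_motifspace_size(8, 3)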
def brive(N, replace_zeros=True): """ The brive estimator Parameters ---------- N : np.array, int Counts vector replace_zeros: bool Replaces zeros with uniform prior Returns ------- pvals """ N = N.astype(np.int) n = sum(N) pvals = np.zeros(len(N), dtype=np.float64) for i in range(len(N)): if N[i]==0 or N[i]==1: continue trials = [comb(t-1, N[i]-1) / (t * (comb(n, N[i]))) for t in range(N[i], n+1)] pvals[i] = (float(N[i]-1)) * sum(trials) if replace_zeros: m = sum(pvals) if 0 < m < 1 and (pvals==0).sum() > 0: pvals[pvals==0] = (1 - m) / (pvals==0).sum() return pvals
def pdf(self, x, k, n, p): '''distribution of success runs of length k or more Parameters ---------- x : float count of runs of length n k : int length of runs n : int total number of observations or trials p : float probability of success in each Bernoulli trial Returns ------- pdf : float probability that x runs of length of k are observed Notes ----- not yet vectorized References ---------- Muselli 1996, theorem 3 ''' q = 1-p m = np.arange(x, (n+1)//(k+1)+1)[:,None] terms = (-1)**(m-x) * comb(m, x) * p**(m*k) * q**(m-1) \ * (comb(n - m*k, m - 1) + q * comb(n - m*k, m)) return terms.sum(0)
def daub(p): """ The coefficients for the FIR low-pass filter producing Daubechies wavelets. p>=1 gives the order of the zero at f=1/2. There are 2p filter coefficients. Parameters ---------- p : int Order of the zero at f=1/2, can have values from 1 to 34. """ sqrt = np.sqrt if p < 1: raise ValueError("p must be at least 1.") if p==1: c = 1/sqrt(2) return np.array([c,c]) elif p==2: f = sqrt(2)/8 c = sqrt(3) return f*np.array([1+c,3+c,3-c,1-c]) elif p==3: tmp = 12*sqrt(10) z1 = 1.5 + sqrt(15+tmp)/6 - 1j*(sqrt(15)+sqrt(tmp-15))/6 z1c = np.conj(z1) f = sqrt(2)/8 d0 = np.real((1-z1)*(1-z1c)) a0 = np.real(z1*z1c) a1 = 2*np.real(z1) return f/d0*np.array([a0, 3*a0-a1, 3*a0-3*a1+1, a0-3*a1+3, 3-a1, 1]) elif p<35: # construct polynomial and factor it if p<35: P = [comb(p-1+k,k,exact=1) for k in range(p)][::-1] yj = np.roots(P) else: # try different polynomial --- needs work P = [comb(p-1+k,k,exact=1)/4.0**k for k in range(p)][::-1] yj = np.roots(P) / 4 # for each root, compute two z roots, select the one with |z|>1 # Build up final polynomial c = np.poly1d([1,1])**p q = np.poly1d([1]) for k in range(p-1): yval = yj[k] part = 2*sqrt(yval*(yval-1)) const = 1-2*yval z1 = const + part if (abs(z1)) < 1: z1 = const - part q = q * [1,-z1] q = c * np.real(q) # Normalize result q = q / np.sum(q) * sqrt(2) return q.c[::-1] else: raise ValueError("Polynomial factorization does not work " "well for p too large.")
def ARI(trueLab, predLab):
    """
    Compute the adjusted Rand index, which ranges in [-1, 1]: random assignments
    score close to 0 and a perfect match scores 1.
    :param trueLab: ground truth labels
    :param predLab: predicted labels
    :return: adjusted rand index
    """
    n = len(trueLab)
    trueLab = np.array(trueLab)
    predLab = np.array(predLab)

    trueCluster = dict(zip(set(trueLab), [np.where(trueLab == x)[0] for x in set(trueLab)]))
    predCluster = dict(zip(set(predLab), [np.where(predLab == x)[0] for x in set(predLab)]))
    trueIdx = list(trueCluster.values())
    predIdx = list(predCluster.values())

    nTrue = len(trueCluster)
    nPred = len(predCluster)

    cTable = np.zeros((nTrue, nPred))
    for i in range(nTrue):
        for j in range(nPred):
            cTable[i, j] = len(np.intersect1d(trueIdx[i], predIdx[j]))

    a = comb(np.sum(cTable, axis=1), 2).sum()
    b = comb(np.sum(cTable, axis=0), 2).sum()
    c = comb(n, 2)

    return (comb(cTable, 2).sum() - (a * b) / c) / (0.5 * (a + b) - (a * b) / c)
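# Hedged cross-check (not part of the original snippet): on a small labelling the
# hand-rolled ARI above should agree with scikit-learn's reference implementation,
# assuming sklearn is installed and `comb`/`np` are bound as in the snippet.
from sklearn.metrics import adjusted_rand_score

true_lab = [0, 0, 0, 1, 1, 1, 2, 2]
pred_lab = [0, 0, 1, 1, 1, 2, 2, 2]
print(ARI(true_lab, pred_lab))                  # implementation above
print(adjusted_rand_score(true_lab, pred_lab))  # scikit-learn reference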
def rare(y, size):
    notabs = ~np.isnan(y)
    t = y[notabs]
    N = np.sum(t)
    diff = N - t
    rare_calc = np.sum(1 - comb(diff, size) / comb(N, size))
    return rare_calc
def estimate(E, r, f, lp): if f <= r: return 0.0 if E - f < lp: return 1.0 probs = [(1 - (1.0 * scal(comb(E - f, lp)) / scal(comb(e, lp)))) for e in range(E, E - (r+ 1), -1)] return reduce(lambda x, y: x * y, probs, 1.0)
def bilinear(b, a, fs=1.0): """Return a digital filter from an analog filter using the bilinear transform. The bilinear transform substitutes ``(z-1) / (z+1``) for ``s``. """ #This function has been copied out of scipy fs =float(fs) a,b = map(num.atleast_1d,(a,b)) D = len(a) - 1 N = len(b) - 1 artype = float M = max([N,D]) Np = M Dp = M bprime = num.zeros(Np+1,artype) aprime = num.zeros(Dp+1,artype) for j in range(Np+1): val = 0.0 for i in range(N+1): for k in range(i+1): for l in range(M-i+1): if k+l == j: val += comb(i,k)*comb(M-i,l)*b[N-i]*pow(2*fs,i)*(-1)**k bprime[j] = num.real(val) for j in range(Dp+1): val = 0.0 for i in range(D+1): for k in range(i+1): for l in range(M-i+1): if k+l == j: val += comb(i,k)*comb(M-i,l)*a[D-i]*pow(2*fs,i)*(-1)**k aprime[j] = num.real(val) #return aprime, bprime return normalize(bprime, aprime)
def multiple_alignment(word_list): '''Returns the multiple alignment of a given list of words.''' from itertools import product from operator import add, mul from scipy.misc import comb # There are some issues scoring the first symbols, so force a match here and remove it from the alignment later. word_list = ['$'+word for word in word_list] # Initialize scoring and backtrack dictionaries, along with the indices and base score. S, backtrack = {}, {} perm_list = list(product([0, -1], repeat=len(word_list)))[1:] base_score = -1*comb(len(word_list), 2, exact=True) for index in product(*map(xrange,map(lambda s: len(s) + 1, word_list))): # We forced a match with the first symbols, so the zero-shell should lead to the zero index. if reduce(mul, index) == 0: # Since we forced a match with the first symbol, we want to force starting point to be the zero index. if sum(index) == 0: # All symbols match. S[index] = 0 else: # Make it smaller than the lowest possible score. S[index] = 2*base_score*reduce(add, map(len, word_list)) else: # Use previous scores to determine the best score for the current index. previous_scores = [S[tuple(map(add, index, perm))] for perm in perm_list] current_index_scores = [] for perm in perm_list: chars = [word_list[i][index[i]-1] if perm_value == -1 else '-' for i, perm_value in enumerate(perm)] current_index_scores.append(base_score + sum([comb(chars.count(ch), 2, exact=True) for ch in set(chars)])) scores = map(add, previous_scores, current_index_scores) backtrack[index], S[index] = max(enumerate(scores), key=lambda p: p[1]) # Initialize the alignment and indicies. alignment = word_list current_index = map(len, word_list) # Get the max score. # Note: The forced match at start of each word does not change the max score, as matched symbols have a score of zero. max_score = S[tuple(current_index)] # Quick lambda function to insert indels. insert_indel = lambda word, i: word[:i] + '-' + word[i:] # Insert indels to get the alignment. while reduce(mul, current_index) != 0: for i, perm_value in enumerate(perm_list[backtrack[tuple(current_index)]]): if perm_value == 0: alignment[i] = insert_indel(alignment[i], current_index[i]) else: current_index[i] -= 1 # Note: We don't need to prepend any indels because we forced a match at the start of all words. # Remove the forced match from all alignments to recover the correct alignment. return [str(max_score)] + [str(aligned[1:].seq) for aligned in alignment]
def rectangles_numbers(M, N):
    '''
    A square is a special case of a rectangle: a rectangle is defined as a
    quadrilateral whose four angles are all 90 degrees. To count only the
    rectangles that are not squares, compute all rectangles and then subtract
    the number of squares.
    '''
    horizontal = sm.comb(M+1, 2, exact=True)
    vertical = sm.comb(N+1, 2, exact=True)
    return horizontal * vertical
def hyperg(k, N, m, n, verbose=False): """ k = intersection; m = population 1; n = population 2; N = total population. """ exact=1 if verbose and k%100==0: printnow("k: %s" % k) #return comb(m,k,exact) * comb(N-m,n-k,exact)/ comb(N,n,exact) return comb(m,k) * comb(N-m,n-k)/ comb(N,n)
def hypergeometricown(self, N, K, n, k): """ N= total number of genes in population K= number of GOA n= select a sample (top 50, bottom half, etc.) k= number of successes in the sample """ return comb(K, k) * comb(N-K, n-k) / comb(N, n)
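# Hedged cross-check: the ratio of binomial coefficients above is the hypergeometric
# pmf, so for concrete numbers it should match scipy.stats.hypergeom.pmf(k, N, K, n)
# up to floating-point rounding.  The values below are illustrative assumptions.
from scipy import stats
from scipy.misc import comb  # scipy.special.comb in newer SciPy

N, K, n, k = 1000, 80, 50, 6
print(comb(K, k) * comb(N - K, n - k) / comb(N, n))
print(stats.hypergeom.pmf(k, N, K, n))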
def runs_prob_odd(self, r): n0, n1 = self.n0, self.n1 k = (r+1)//2 tmp0 = comb(n0-1, k-1) tmp1 = comb(n1-1, k-2) tmp3 = comb(n0-1, k-2) tmp4 = comb(n1-1, k-1) return (tmp0 * tmp1 + tmp3 * tmp4) / self.comball
def elm_comb(rho, N, m): offered_traffic = rho / (1 + rho) numerator = comb(N-1, m, exact=True)*pow(offered_traffic, m)*pow(1-offered_traffic, N-m) denominator = 0 for i in range(0, m+1): denominator += comb(N-1, i, exact=True)*pow(offered_traffic, i)*pow(1-offered_traffic, N-i) return numerator / denominator
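# Hedged usage sketch: elm_comb appears to implement an Engset-style blocking ratio,
# with rho the per-source offered load, N the number of sources and m the number of
# busy servers.  The arguments below are illustrative assumptions, not values from
# the original code.
blocking = elm_comb(rho=0.2, N=10, m=3)
print(blocking)  # a probability between 0 and 1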
def mv_hypergeometric(x, m): """ x : number of draws for each category. m : size of each category. """ x = np.asarray(x) m = np.asarray(m) return log(comb(m, x).prod() / comb(m.sum(), x.sum()))
def prob(N, k, n, m): part1 = comb(N - 2, k) part2 = comb(N - 2 - k, n - k - 1) part3 = comb(N - n - 1, m - k - 1) part4 = comb(N - 2, n - 1) part5 = comb(N - 2, m - 1) p = (part1 * part2 * part3) / (part4 * part5) return p
def colorcount(n): if n > 1: return comb(7, n, exact=True) * ( comb(19, n - 1, exact=True) - n * sum(comb(i, n - 2, exact=True) for i in range(0, 9)) ) # elif n==2: # return comb(7,n,exact=True) else: return 0
def vlj(l, j, m_a, p, q):
    vlj = ((-1) ** p) * (m_a + l + 2 * j)  # prefactor (-1)**p alternates sign with p
    vlj *= comb(m_a + j + l - 1, l - 1)
    vlj *= comb(j + l - 1, l - 1)
    vlj *= comb(l - 1, p - j)
    vlj /= comb(q + l + j, l)
    return vlj
def peter(n): if n >= 23: return peter(45 - n) else: count = 0 for k in range((n-9)/4+1): j = n-9-4*k count += misc.comb(9,k)*misc.comb(9+j-1,j)*((-1)**(k+j*2)) return count/(4**9)
def compute_data_points(e, c): ue = np.linspace(c, max_ue, max_ue - c + 1) num_chosen = [x for x in range(c, e)] + [e]*(max_ue - e + 1) all_comb = comb(ue, num_chosen) good_comb = comb([x-c for x in ue], [x-c for x in num_chosen]) expected = all_comb / good_comb return ue, expected
def lamp_test(p_type, p_mut, total): significance = 0.05 tests = 2503339*3 sigma = float(significance)/float(tests) function = misc.comb(p_mut, p_type)/misc.comb(total, p_type) if sigma < function: return False else: return True
def gevrey_tanh(T, n, sigma=sigma_tanh, K=K_tanh): """ Provide the flat output y(t) = phi(t), with the gevrey-order 1+1/sigma, and the derivatives up to order n. :param t: [0, ... , t_end] (numpy array) :param n: (integer) :param sigma: (float) :param K: (float) :return: np.array([[phi], ... ,[phi^(n)]]) """ t_init = t = np.linspace(0., T, int(0.5*10**(2+np.log10(T)))) # pop t = np.delete(t, 0, 0) t = np.delete(t, -1, 0) # main tau = t/T a = dict() a[0] = K*(4*tau*(1-tau))**(1-sigma)/(2*(sigma-1)) a[1] = (2*tau - 1)*(sigma-1)/(tau*(1-tau))*a[0] for k in xrange(2, n+2): a[k] = (tau*(1-tau))**-1 * ((sigma-2+k)*(2*tau-1)*a[k-1]+(k-1)*(2*sigma-4+k)*a[k-2]) yy = dict() yy[0] = np.tanh(a[1]) if n > 0: yy[1] = a[2]*(1-yy[0]**2) z = dict() z[0] = (1-yy[0]**2) for i in xrange(2, n+1): sum_yy = np.zeros(len(t)) for k in xrange(i): if k == 0: sum_z = np.zeros(len(t)) for j in xrange(i): sum_z += -sm.comb(i-1, j)*yy[j]*yy[i-1-j] z[i-1] = sum_z sum_yy += sm.comb(i-1, k)*a[k+2]*z[i-1-k] yy[i] = sum_yy # push phi = np.nan*np.zeros((n+1, len(t)+2)) for i in xrange(n+1): phi_temp = 0.5*yy[i] if i == 0: phi_temp += 0.5 phi_temp = np.insert(phi_temp, 0, [0.], axis=0) phi[i, :] = np.append(phi_temp, [1.]) else: phi_temp = np.insert(phi_temp, 0, [0.], axis=0) # attention divide by T^i phi[i, :] = np.append(phi_temp, [0.])/T**i return phi, t_init
def serial_perm_equal_params(n): ev = n / math.factorial(n) ev2 = n**2 / math.factorial(n) for i in range(n-2, 0, -1): pmatch = np.prod([1/k for k in range(n, n-i, -1)]) pfail = 1/(n-i) ev += i * comb(n, i) * pmatch * pfail ev2 += i**2 * comb(n, i) * pmatch * pfail var = ev2 - (ev)**2 return ev, var
def central_geom_moment(self, p, q):
    m = np.arange(0, p + 1)
    n = np.arange(0, q + 1)
    x_0 = self.centroid()['x']
    y_0 = self.centroid()['y']
    M = self.geom_moments(p, q)
    # return (comb(p,m,exact=False)*(-x_0)**(p-m)).dot(M.dot((comb(q,n,exact=False)*(-y_0)**(q-n))))
    return (M.dot((comb(q, n, exact=False) * (-y_0) ** (q - n)))).dot(comb(p, m, exact=False) * (-x_0) ** (p - m))
def comb(n, k): res = np.rint(spm.comb(n, k, False)).astype(int) if np.all(res >= 0) and np.all(res < _MAX_INT_FLOAT): return res elif isinstance(n, abc.Iterable) or isinstance(k, abc.Iterable): broad = np.broadcast(np.asarray(n), np.asarray(k)) res = np.empty(broad.shape, dtype=object) res.flat = [spm.comb(n_, k_, True) for n_, k_ in broad] return res else: return spm.comb(n, k, True)
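# Hedged usage sketch for the wrapper above.  It relies on module-level names that are
# assumed to exist elsewhere (spm = scipy.misc, _MAX_INT_FLOAT, abc, np).  Small inputs
# come back as an ordinary integer array; inputs whose result would not survive the
# float round-trip fall back to scipy's exact integer path.
import numpy as np

print(comb(10, np.arange(4)))  # -> [  1  10  45 120]
print(comb(60, 30))            # exact Python int, beyond 53-bit float precision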
def c_b(B): # 2**(B-2) s = 0 for i in range(B-1): s += sc_m.comb(B-2, i, exact=True) return s
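# Hedged check of the closed form noted in the comment above: by the binomial theorem
# sum_{i=0}^{B-2} C(B-2, i) == 2**(B-2), so c_b(B) should equal 2**(B-2) exactly
# (exact=True keeps everything in integer arithmetic).
for B in range(2, 12):
    assert c_b(B) == 2 ** (B - 2)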
print dta[[ 'AVYRSEXP', 'AVSALK', 'PERSPENK', 'PTRATIO', 'PCTAF', 'PCTCHRT', 'PCTYRRND' ]].head(10) formula = 'NABOVE + NBELOW ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT ' formula += '+ PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF' ##### Aside: Binomial distribution # Toss a six-sided die 5 times, what's the probability of exactly 2 fours? stats.binom(5, 1. / 6).pmf(2) from scipy.misc import comb comb(5, 2) * (1 / 6.)**2 * (5 / 6.)**3 from statsmodels.formula.api import glm glm_mod = glm(formula, dta, family=sm.families.Binomial()).fit() print glm_mod.summary() # The number of trials glm_mod.model.data.orig_endog.sum(1) glm_mod.fittedvalues * glm_mod.model.data.orig_endog.sum(1) # First differences: We hold all explanatory variables constant at their means and manipulate the percentage of low income households to assess its impact # on the response variables:
def bernstein_poly(i, n, t): """ The Bernstein polynomial of n, i as a function of t """ return comb(n, i) * ( t**(n-i) ) * (1 - t)**i
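# Hedged sanity check: whatever index convention is intended (the expression above is
# C(n, i) * t**(n-i) * (1-t)**i), the n+1 basis polynomials still sum to one for every
# t by the binomial theorem.  Assumes `comb` and numpy are bound as elsewhere here.
import numpy as np

t = np.linspace(0.0, 1.0, 5)
total = sum(bernstein_poly(i, 4, t) for i in range(5))
assert np.allclose(total, 1.0)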
def multivariate_lagrange(points, n): """ Given the list of values *points*, construct a multidimensional (with dimension equal to the length of each of the given points) polynomial of degree *n* using the method developed in the following reference: Kamron Saniee, A Simple Expression for Multivariate Lagrange Interpolation, SIAM, 2007. The method has a restriction: if p is the number of *point*s and m is one less than the problem domain (i.e., m = `len(points[0]) - 1`) then p = *n* + m choose *n*. """ # check dimensions p = len(points) m = len(points[0]) - 1 if not comb(n + m, n, exact=True) == p: raise ValueError('dimension mismatch') # setup symbols and build expression z_vec = SYM.Matrix([points[i][-1] for i in range(p)]) coef = SYM.symbols(' '.join(['a{}'.format(i) for i in range(1, p + 1)])) var = SYM.symbols(' '.join(['x{}'.format(i) for i in range(1, m + 1)])) # handle trivial case, n = 0 if n == 0: return SYM.Float(points[0][-1]), var, 1., [1.] z_terms = [SYM.Poly.from_dict({x: 1}, var) for x in poly_power_seq(m, n)] z = SYM.Poly.from_dict(dict(zip(poly_power_seq(m, n), coef)), var) # build M matrix M_rows = [] for point_i in points: z_i = z(*point_i[:-1]) M_rows.append([float(z_i.coeff(x)) for x in coef]) M = NP.array(M_rows) delta = NP.linalg.det(M) # compute delta_i delta_i_list = [] B = NP.ones(p - 1) C = NP.array(z_terms[:-1]) D = M[-1, -1] for i in range(p): # using block matrix property of determinants which assumes # that the matrix A below is non-singular (the typical case) # --- the code fails safe to calculating the symbolic # determinant when A is singular (which will be slow for large # problems) A = NP.vstack((M[ :i, :-1], M[i+1:, :-1])) try: b = NP.linalg.solve(A, B) # using row interchange property delta_i_list.append((-1)**(p-1-i) * NP.linalg.det(A) * (D - NP.dot(C, b))) except NP.linalg.linalg.LinAlgError: logger.warning('singular matrix encountered (this will happen when, e.g., points contain many zeros) --- resorting to symbolic determinant calculation which will be slow for large problems') row_i = SYM.Matrix([z_terms]) M_i = SYM.Matrix(M_rows) M_i.row_del(i) M_i = M_i.row_insert(p, row_i) # using row interchange property delta_i_list.append(M_i.det() * (-1)**(p-1-i)) f = lagrange_interpolator(points, delta, delta_i_list) return SYM.Poly(f.simplify()), var, delta, delta_i_list
#!/usr/bin/env python3.6
"""
PROBLEM: 053
AUTHOR: Dirk Meijer
STATUS: done
EXPLANATION: scipy combinatorics
"""
from Euler.tictoc import tic, toc
from Euler.eprint import eprint
from scipy.misc import comb

if __name__ == "__main__":
    tic()
    S = 0
    for n in range(23, 101):
        for k in range(1, n + 1):
            S += comb(n, k) > 1e6
    print(S)
    toc()
    exit()
def pmf(n, k, p): return comb(n, k) * p**k * (1 - p)**(n - k)
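# Hedged cross-check: the expression above is the binomial pmf, so it should agree with
# scipy.stats.binom.pmf(k, n, p) up to floating-point rounding.
from scipy import stats

print(pmf(10, 3, 0.25))
print(stats.binom.pmf(3, 10, 0.25))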
] # window specifies the starting and ending time of the period that the data user is interested in # step 3: pre-sanitize the database sanitized_profile_baseline = util.sanitize_data( day_profile, distance_metric='euclidean', anonymity_level=anonymity_level, rep_mode=rep_mode) loss_generic_metric = pe.get_information_loss( data_gt=day_profile, data_sanitized=sanitized_profile_baseline.round(), window=window) print("information loss with generic metric %s" % loss_generic_metric) df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample( frac=1) subsample_size_max = int(comb(len(df_subsampled_from), 2)) print('total number of pairs is %s' % subsample_size_max) # step 4: sample a subset of pre-sanitized database and form the data points into pairs subsample_size = int(round(subsample_size_max)) sp = Subsampling(data=df_subsampled_from) data_pair = sp.uniform_sampling(subsample_size=subsample_size) # User receives the data pairs and label the similarity sim = Similarity(data=data_pair) sim.extract_interested_attribute(interest=interest, window=window) similarity_label, class_label = sim.label_via_silhouette_analysis( range_n_clusters=range(2, 8)) # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs # lam_vec is a set of candidate lambda's for weighting the l1-norm penalty in the metric learning optimization problem.
def daub(p): """ The coefficients for the FIR low-pass filter producing Daubechies wavelets. p>=1 gives the order of the zero at f=1/2. There are 2p filter coefficients. Parameters ---------- p : int Order of the zero at f=1/2, can have values from 1 to 34. Returns ------- daub : ndarray Return """ sqrt = np.sqrt if p < 1: raise ValueError("p must be at least 1.") if p == 1: c = 1 / sqrt(2) return np.array([c, c]) elif p == 2: f = sqrt(2) / 8 c = sqrt(3) return f * np.array([1 + c, 3 + c, 3 - c, 1 - c]) elif p == 3: tmp = 12 * sqrt(10) z1 = 1.5 + sqrt(15 + tmp) / 6 - 1j * (sqrt(15) + sqrt(tmp - 15)) / 6 z1c = np.conj(z1) f = sqrt(2) / 8 d0 = np.real((1 - z1) * (1 - z1c)) a0 = np.real(z1 * z1c) a1 = 2 * np.real(z1) return f / d0 * np.array([a0, 3 * a0 - a1, 3 * a0 - 3 * a1 + 1, a0 - 3 * a1 + 3, 3 - a1, 1]) elif p < 35: # construct polynomial and factor it if p < 35: P = [comb(p - 1 + k, k, exact=1) for k in range(p)][::-1] yj = np.roots(P) else: # try different polynomial --- needs work P = [comb(p - 1 + k, k, exact=1) / 4.0**k for k in range(p)][::-1] yj = np.roots(P) / 4 # for each root, compute two z roots, select the one with |z|>1 # Build up final polynomial c = np.poly1d([1, 1])**p q = np.poly1d([1]) for k in range(p - 1): yval = yj[k] part = 2 * sqrt(yval * (yval - 1)) const = 1 - 2 * yval z1 = const + part if (abs(z1)) < 1: z1 = const - part q = q * [1, -z1] q = c * np.real(q) # Normalize result q = q / np.sum(q) * sqrt(2) return q.c[::-1] else: raise ValueError("Polynomial factorization does not work " "well for p too large.")
def bp_easyline(n): return comb(2 * n, n, exact = True)
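# Hedged cross-check: comb(2n, n, exact=True) counts the monotone lattice paths across
# an n-by-n grid, so a small dynamic-programming count should agree with bp_easyline.
def count_paths_dp(n):
    grid = [[1] * (n + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            grid[i][j] = grid[i - 1][j] + grid[i][j - 1]
    return grid[n][n]

assert bp_easyline(4) == count_paths_dp(4) == 70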
def pascal(n, kind='symmetric', exact=True): """ Returns the n x n Pascal matrix. The Pascal matrix is a matrix containing the binomial coefficients as its elements. .. versionadded:: 0.11.0 Parameters ---------- n : int The size of the matrix to create; that is, the result is an n x n matrix. kind : str, optional Must be one of 'symmetric', 'lower', or 'upper'. Default is 'symmetric'. exact : bool, optional If `exact` is True, the result is either an array of type numpy.uint64 (if n <= 35) or an object array of Python long integers. If `exact` is False, the coefficients in the matrix are computed using `scipy.misc.comb` with `exact=False`. The result will be a floating point array, and the values in the array will not be the exact coefficients, but this version is much faster than `exact=True`. Returns ------- p : (n, n) ndarray The Pascal matrix. Notes ----- See http://en.wikipedia.org/wiki/Pascal_matrix for more information about Pascal matrices. Examples -------- >>> from scipy.linalg import pascal >>> pascal(4) array([[ 1, 1, 1, 1], [ 1, 2, 3, 4], [ 1, 3, 6, 10], [ 1, 4, 10, 20]], dtype=uint64) >>> pascal(4, kind='lower') array([[1, 0, 0, 0], [1, 1, 0, 0], [1, 2, 1, 0], [1, 3, 3, 1]], dtype=uint64) >>> pascal(50)[-1, -1] 25477612258980856902730428600L >>> from scipy.misc import comb >>> comb(98, 49, exact=True) 25477612258980856902730428600L """ if kind not in ['symmetric', 'lower', 'upper']: raise ValueError("kind must be 'symmetric', 'lower', or 'upper'") if exact: if n > 35: L_n = np.empty((n, n), dtype=object) L_n.fill(0) else: L_n = np.zeros((n, n), dtype=np.uint64) for i in range(n): for j in range(i + 1): L_n[i, j] = comb(i, j, exact=True) else: L_n = comb(*np.ogrid[:n, :n]) if kind is 'lower': p = L_n elif kind is 'upper': p = L_n.T else: p = np.dot(L_n, L_n.T) return p
def invhilbert(n, exact=False): """ Compute the inverse of the Hilbert matrix of order `n`. The entries in the inverse of a Hilbert matrix are integers. When `n` is greater than 14, some entries in the inverse exceed the upper limit of 64 bit integers. The `exact` argument provides two options for dealing with these large integers. Parameters ---------- n : int The order of the Hilbert matrix. exact : bool If False, the data type of the array that is returned is np.float64, and the array is an approximation of the inverse. If True, the array is the exact integer inverse array. To represent the exact inverse when n > 14, the returned array is an object array of long integers. For n <= 14, the exact inverse is returned as an array with data type np.int64. Returns ------- invh : (n, n) ndarray The data type of the array is np.float64 if `exact` is False. If `exact` is True, the data type is either np.int64 (for n <= 14) or object (for n > 14). In the latter case, the objects in the array will be long integers. See Also -------- hilbert : Create a Hilbert matrix. Notes ----- .. versionadded:: 0.10.0 Examples -------- >>> from scipy.linalg import invhilbert >>> invhilbert(4) array([[ 16., -120., 240., -140.], [ -120., 1200., -2700., 1680.], [ 240., -2700., 6480., -4200.], [ -140., 1680., -4200., 2800.]]) >>> invhilbert(4, exact=True) array([[ 16, -120, 240, -140], [ -120, 1200, -2700, 1680], [ 240, -2700, 6480, -4200], [ -140, 1680, -4200, 2800]], dtype=int64) >>> invhilbert(16)[7,7] 4.2475099528537506e+19 >>> invhilbert(16, exact=True)[7,7] 42475099528537378560L """ if exact: if n > 14: dtype = object else: dtype = np.int64 else: dtype = np.float64 invh = np.empty((n, n), dtype=dtype) for i in xrange(n): for j in xrange(0, i + 1): s = i + j invh[i, j] = ((-1) ** s * (s + 1) * comb(n + i, n - j - 1, exact) * comb(n + j, n - i - 1, exact) * comb(s, i, exact) ** 2) if i != j: invh[j, i] = invh[i, j] return invh
Rosalind #: 090
URL: http://rosalind.info/problems/wfmd/
'''
from scipy.misc import comb

with open('data/rosalind_wfmd.txt') as input_data:
    N, m, g, k = [int(num) for num in input_data.read().strip().split()]

# Determine the probability of a given number of recessive alleles in the first generation.
# Use a binomial random variable with the given parameters.
# Note: We omit the 0th term throughout the problem, as it has no contribution to the desired probability.
# For future problems, start the ranges at 0 if the 0 term ever becomes necessary.
p_rec = 1.0 - (m / (2.0 * N))
p = [
    comb(2 * N, i) * ((p_rec)**i) * (1.0 - p_rec)**(2 * N - i)
    for i in range(1, 2 * N + 1)
]

# Determine the probability of a given number of recessive alleles in the 2nd to k-th generations.
# Use the total law of probability, along with the probabilities from the previous generation.
# i.e., P(1 Rec) = P(1 Rec | 0 Rec in previous gen) + P(1 Rec | 1 Rec in previous gen) + ... + P(1 Rec | 2N Rec in previous gen)
# Notice that the conditional probabilities are binomial terms, similar to the first generation calculations.
for gen in range(2, g + 1):
    temp_p = []
    for j in range(1, 2 * N + 1):
        temp_term = [
            comb(2 * N, j) * ((x / (2.0 * N))**j) * (1.0 - (x / (2.0 * N)))**(2 * N - j)
            for x in range(1, 2 * N + 1)
        ]
        temp_p.append(sum([temp_term[i] * p[i]
def run(self, niter): for i in xrange(niter): # sample z w/ limit on size # random permute dimensions for d in np.random.permutation(range(self.xdim)): # sample z_d # initialize final_z = self.z.copy() zd_old = self.z[d] self.z[d] = -1; a_size = np.sum(self.z == zd_old) max_log_prob_perturbed = np.log(a_size + self.alpha[zd_old]) \ - self.gp.nll + helper.gumbel() # find all possible category assignments # if z[d] is alone, the possible other category assignment is other_cat = np.unique(self.z) other_cat = other_cat[np.logical_and(other_cat != zd_old, other_cat != -1)] # otherwise, need to remove z[d] and add one additional category if a_size > 0 and other_cat.size + 1 < self.n_add: for a in xrange(self.n_add): if (a not in other_cat) and (a != zd_old): other_cat = np.append(other_cat, [a]) break # start sampling for a in np.random.permutation(other_cat): a_size = np.sum(self.z == a) if a_size < self.dim_limit: self.z[d] = a gp = self.get_gp() log_prob = np.log(a_size + self.alpha[a]) - gp.nll + helper.gumbel() if log_prob > max_log_prob_perturbed: max_log_prob_perturbed = log_prob self.gp = gp final_z = self.z.copy() self.z = final_z # end of sample z_d # sample k_d # initialize final_k = self.k.copy() kd_old = self.k[d] beta_post = lambda x: comb(self.beta[0]+x-1., x)/((1./self.beta[1]+1.)**x) max_log_prob_perturbed = beta_post(kd_old) - self.gp.nll + helper.gumbel() # define range for k_d? current k_d \pm 10 other_k = np.arange(-5, 5) + kd_old other_k = other_k[np.logical_and(other_k >= 2, other_k != kd_old)] # start sampling for b in np.random.permutation(other_k): self.k[d] = b gp = self.get_gp() log_prob = beta_post(b) - gp.nll + helper.gumbel() if log_prob > max_log_prob_perturbed: max_log_prob_perturbed = log_prob self.gp = gp final_k = self.k.copy() self.k = final_k return self.gp, self.z, self.k
def myComb(a, b): return comb(a, b, exact=True)
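# Hedged usage note: with exact=True the wrapper returns an arbitrary-precision integer,
# which matters once the coefficient no longer fits a float exactly (assumes `comb` is
# scipy's comb, as elsewhere in this collection).
print(myComb(10, 5))    # 252
print(myComb(100, 50))  # 100891344545564193334812497256, exact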
NXSUBAPS = numpy.array([7] * NWFS) NSUBAPS = numpy.array([36] * NWFS) SUBAPDIAM = numpy.array([telConfig["WFS"]["subapDiam"]] * NWFS) GSALT = numpy.array([0] * NWFS) GSTYPE = numpy.array([1] * NWFS) PUPILSHIFT = numpy.array(([1, 1], [0, 0])) PUPILMAG = numpy.array([NXSUBAPS[0]] * NWFS) PUPILROT = numpy.array([0] * NWFS) PUPILROT[0] = 0 OBS = 0.285 NCPU = 1 PART = 0 PUPIL_MASK = telConfig["WFS"]["pupilMask"] waveL = 500e-9 gam = numpy.array([waveL] * NWFS) combs = int(comb(GSPOS.shape[1], 2, exact=True)) selector = numpy.array((range(GSPOS.shape[0]))) selector = numpy.array((list(itertools.combinations(selector, 2)))) NLAYERS = 2 r0 = numpy.array([0.1] * NLAYERS) L0 = numpy.array([25.] * NLAYERS) LAYERHEIGHTS = numpy.array([0., 9281.91628112]) fitL0 = True offsets = True params = CovarianceMatrix(NWFS, PUPIL_MASK, TEL_DIAM, SUBAPDIAM, GSALT, GSPOS, gam, NLAYERS, LAYERHEIGHTS, offsets, fitL0, L0, PUPILSHIFT, PUPILROT, True) s = time.time() nmat = params.make_covariance_matrix(r0, L0, PUPILSHIFT, PUPILROT)
def test_big(self): p = pascal(50) assert_equal(p[-1, -1], comb(98, 49, exact=True))
h3k27ac_cell_type_names = ["neuron", "microglia", "glia"] marker_names = ["NeuN+", "Pu.1+", "NeuN-/Pu.1-"] brain_regions = [ "hpc", "dlpfc", "allbr", "hpc_female_controls", "hpc_female_cases", "hpc_male_controls", "hpc_male_cases" ] clusters = [ "exPFC1", "exPFC2", "exCA1", "exCA3", "GABA1", "GABA2", "exDG", "MG", "ODC1", "ODC2", "OPC", "ASC1", "ASC2", "NSC", "END" ] numClusters = len(clusters) totGreaterThanZeroTests = len(clusters) * len(h3k27ac_cell_type_names) * len( brain_regions) totCellTypeTests = misc.comb(len(h3k27ac_cell_type_names), 2) * len(clusters) * len(brain_regions) padj_threshold = 0.05 padj_stringent = 0.01 logfc_cutoff = 0.5 numColsPlot = 5 numRowsPlot = 3 h1 = 0.2 h2 = 0.05 prefix = "/habib_markers_analysis_disttss_filter/" suffix = "l2fc.txt" tStatMatrix = dict() pValMatrix = dict() l2fcMeansMatrix = dict() plotColor = dict()
def evaluation_total_usage(n): """ In the demo, we will showcase an example of special purpose publication. The data user wants the published energy database to maximally retain the information about peak-time energy usage """ # Initialization of some useful classes util = Utilities() pe = PerformanceEvaluation() # step 1: get the database to be published day_profile = pd.read_pickle('dataset/dataframe_all_energy.pkl') day_profile = day_profile.fillna(0) day_profile = day_profile.iloc[ 0:90, 0:: 4] # subsample the database to improve the speed for demonstration purpose day_profile.index = range(len(day_profile.index)) rep_mode = 'mean' anonymity_level = n # desired anonymity level # step 2: data user specifies his/her interest. In the example, the data user is interested in preserving the # information of the cumulative energy use during peak time. In this case, he/she would also need to specify the # starting and ending time of the peak usage time interest = 'window-usage' window = [17, 21] sanitized_profile_best = util.sanitize_data( day_profile, distance_metric='self-defined', anonymity_level=anonymity_level, rep_mode=rep_mode, mode=interest, window=window) # step 3: pre-sanitize the database sanitized_profile_baseline = util.sanitize_data( day_profile, distance_metric='euclidean', anonymity_level=anonymity_level, rep_mode=rep_mode) loss_best_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_best, mode=interest, window=window) loss_generic_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_baseline, mode=interest, window=window) # print("information loss with learned metric %s" % loss_generic_metric) df_subsampled_from = sanitized_profile_baseline.drop_duplicates().sample( frac=1) subsample_size_max = int(comb(len(df_subsampled_from), 2)) print('total number of pairs is %s' % subsample_size_max) # step 4: sample a subset of pre-sanitized database and form the data points into pairs subsample_size = int(round(subsample_size_max)) sp = Subsampling(data=df_subsampled_from) data_pair, data_pair_all_index = sp.uniform_sampling( subsample_size=subsample_size, seed=None) # User receives the data pairs and label the similarity sim = Similarity(data=data_pair) sim.extract_interested_attribute(interest='statistics', stat_type=interest, window=window) similarity_label, data_subsample = sim.label_via_silhouette_analysis( range_n_clusters=range(2, 8)) # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs lm = Linear_Metric() lm.train(data_pair, similarity_label) dm = Deep_Metric() dm.train(data_pair, similarity_label) # step 6: the original database is privatized using the learned metric sanitized_profile_deep = util.sanitize_data( day_profile, distance_metric="deep", anonymity_level=anonymity_level, rep_mode=rep_mode, deep_model=dm, window=window) sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep", anonymity_level=anonymity_level, rep_mode=rep_mode, deep_model=lm, window=window) # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database loss_learned_metric_deep = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_deep.round(), mode=interest, window=window) loss_learned_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile, mode=interest, window=window) print('anonymity level %s' % anonymity_level) print("sampled size %s" % subsample_size) 
print("information loss with best metric %s" % loss_best_metric) print("information loss with generic metric %s" % loss_generic_metric) print("information loss with learned metric %s" % loss_learned_metric) print("information loss with learned metric deep %s" % (loss_learned_metric_deep)) return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size
def ensemble_error(n_classifier, error): k_start = int(math.ceil(n_classifier / 2.)) probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k) for k in range(k_start, n_classifier + 1)] return sum(probs)
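# Hedged usage sketch: with 11 base classifiers that each err 25% of the time, the
# majority-vote error given by the binomial sum above drops well below the base rate.
print(ensemble_error(n_classifier=11, error=0.25))  # roughly 0.034, versus 0.25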
def W(self, n1, n2, n3): """ Calculates an instance of W from Fluke. et. al. """ return np.sqrt(comb(n1, n1 / 2) * comb(n2, n2 / 2) * comb(n3, n3 / 2))
b = np.zeros(n) head = np.where(a > tt) tail = np.where(a < tt) b[head] = 1 b[tail] = 0 nh = np.size(head) nt = np.size(tail) x = np.arange(n) plt.figure('T=' + str(tt)) plt.plot(x, b, '.k') plt.ylim(-0.1, 2.) plt.figure('histogram') plt.hist(b, bins=2) pdata = misc.comb(n, nt) * (tt**nt) * ((1. - tt)**nh) pdatanr = 1. ptnr = 1. pt = pdata * ptnr plt.plot(tt, pt) nn = 501 t = np.zeros(nn) pt = np.zeros(nn) plt.figure('T vs P(T)') for ii in np.arange(nn): t0 = 1. / (nn - 1) * ii pdata = (t0**nt) * ((1. - t0)**nh) ptnr = 1.
def background_noise(unlabel_intensity, na, parent_atoms, parent_label, daughter_atoms, daughter_label): noise = unlabel_intensity * math.pow(na, parent_label)\ * comb(parent_atoms - daughter_atoms, parent_label - daughter_label)\ * comb(daughter_atoms, daughter_label) return noise
# rosalind_eval import numpy as np from scipy.misc import comb f = open('rosalind_indc.txt', 'r') t = f.readlines() n = 2 * np.int(t[0].rstrip()) out = np.log10(np.array([comb(n, i, exact=1) * .5**n for i in range(n+1)]).cumsum()[::-1])[1:] out.tofile('rosalind_indc_sub.txt', sep=' ')
def prob(self, k): assert isinstance(k, int), "event must occur an integer number of times" return comb(self.n, k) * (self.p**k) * ((1 - self.p)**(self.n - k))
def gen_dice_pdf(N,diceLim): P = diceLim/6 return [comb(N,i)* P**i * (1-P)**(N-i) for i in range(N+1)]
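# Hedged sanity check: the list above is a binomial pmf with success probability
# diceLim/6, so its entries should sum to one.  Pass diceLim as a float to avoid
# integer division under Python 2.
probs = gen_dice_pdf(10, 2.0)
assert abs(sum(probs) - 1.0) < 1e-9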
pdatanr = 1. ptnr = 1. pt = pdata * ptnr plt.plot(t, pt) #plt.show() n = 256 nn = 501 t = np.zeros(nn) pt = np.zeros(nn) plt.figure('T vs P(T)') for ii in np.arange(nn): t0 = 1. / (nn - 1) * ii pdata = (t0**nt) * ((1. - t0)**nh) ptnr = 1. t[ii] = t0 pt[ii] = pdata * ptnr print(t0, pdata, t0**nt, (1. - t0)**nh, nt, nh, misc.comb(n, nt)) plt.plot(t, pt, '-') plt.figure('normalized to max') mp = np.max(pt) pt = 1. / mp * pt plt.plot(t, pt, '-') print(pt[0:10]) plt.show()
def evaluation_total_usage(n, df_subsampled_from, day_profile): interest = 'window-usage' window = [17, 21] anonymity_level = n rep_mode = 'mean' subsample_size_max = int(comb(len(df_subsampled_from), 2)) print('total number of pairs is %s' % subsample_size_max) # step 4: sample a subset of pre-sanitized database and form the data points into pairs subsample_size = int(round(subsample_size_max)) sp = Subsampling(data=df_subsampled_from) data_pair, data_pair_all_index = sp.uniform_sampling( subsample_size=subsample_size, seed=None) # User receives the data pairs and label the similarity sim = Similarity(data=data_pair) sim.extract_interested_attribute(interest='statistics', stat_type=interest, window=window) similarity_label, data_subsample = sim.label_via_silhouette_analysis( range_n_clusters=range(2, 8)) # step 5: PAD learns a distance metric that represents the interest of the user from the labeled data pairs lm = Linear_Metric() lm.train(data_pair, similarity_label) dm = Deep_Metric() dm.train(data_pair, similarity_label) # step 6: the original database is privatized using the learned metric sanitized_profile_deep = util.sanitize_data( day_profile, distance_metric="deep", anonymity_level=anonymity_level, rep_mode=rep_mode, deep_model=dm, window=window) sanitized_profile = util.sanitize_data(day_profile, distance_metric="deep", anonymity_level=anonymity_level, rep_mode=rep_mode, deep_model=lm, window=window) # (optionally for evaluation purpose) Evaluating the information loss of the sanitized database loss_learned_metric_deep = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile_deep.round(), mode=interest, window=window) loss_learned_metric = pe.get_statistics_loss( data_gt=day_profile, data_sanitized=sanitized_profile, mode=interest, window=window) print('anonymity level %s' % anonymity_level) print("sampled size %s" % subsample_size) print("information loss with best metric %s" % loss_best_metric) print("information loss with generic metric %s" % loss_generic_metric) print("information loss with learned metric %s" % loss_learned_metric) print("information loss with learned metric deep %s" % (loss_learned_metric_deep)) return (sanitized_profile_best, sanitized_profile_baseline, sanitized_profile, sanitized_profile_deep), (loss_best_metric, loss_generic_metric, loss_learned_metric, loss_learned_metric_deep), subsample_size
def wignerd(j,m,n=0,approx_lim=10): ''' Wigner "small d" matrix. (Euler z-y-z convention) example: j = 2 m = 1 n = 0 beta = linspace(0,pi,100) wd210 = wignerd(j,m,n)(beta) some conditions have to be met: j >= 0 -j <= m <= j -j <= n <= j The approx_lim determines at what point bessel functions are used. Default is when: j > m+10 and j > n+10 for integer l and n=0, we can use the spherical harmonics. If in addition m=0, we can use the ordinary legendre polynomials. ''' if (j < 0) or (abs(m) > j) or (abs(n) > j): raise ValueError("wignerd(j = {0}, m = {1}, n = {2}) value error.".format(j,m,n) \ + " Valid range for parameters: j>=0, -j<=m,n<=j.") if (j > (m + approx_lim)) and (j > (n + approx_lim)): #print('bessel (approximation)') return lambda beta: jv(m-n, j*beta) if (floor(j) == j) and (n == 0): if m == 0: #print('legendre (exact)') return lambda beta: legendre(j)(cos(beta)) elif False: #print('spherical harmonics (exact)') a = sqrt(4.*pi / (2.*j + 1.)) return lambda beta: a * conjugate(sph_harm(m,j,beta,0.)) jmn_terms = { j+n : (m-n,m-n), j-n : (n-m,0.), j+m : (n-m,0.), j-m : (m-n,m-n), } k = min(jmn_terms) a, lmb = jmn_terms[k] b = 2.*j - 2.*k - a if (a < 0) or (b < 0): raise ValueError("wignerd(j = {0}, m = {1}, n = {2}) value error.".format(j,m,n) \ + " Encountered negative values in (a,b) = ({0},{1})".format(a,b)) coeff = power(-1.,lmb) * sqrt(comb(2.*j-k,k+a)) * (1./sqrt(comb(k+b,b))) #print('jacobi (exact)') return lambda beta: coeff \ * power(sin(0.5*beta),a) \ * power(cos(0.5*beta),b) \ * jacobi(k,a,b)(cos(beta))
def comb2(n): # the exact version is faster for k == 2: use it by default globally in # this module instead of the float approximate variant return comb(n, 2, exact=1)
def adjusted_rand_score(labels_true, labels_pred): """Rand index adjusted for chance The Rand Index computes a similarity measure between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings. The raw RI score is then "adjusted for chance" into the ARI score using the following scheme:: ARI = (RI - Expected_RI) / (max(RI) - Expected_RI) The adjusted Rand index is thus ensured to have a value close to 0.0 for random labeling independently of the number of clusters and samples and exactly 1.0 when the clusterings are identical (up to a permutation). ARI is a symmetric measure:: adjusted_rand_score(a, b) == adjusted_rand_score(b, a) Parameters ---------- labels_true : int array, shape = [n_samples] Ground truth class labels to be used as a reference labels_pred : array, shape = [n_samples] Cluster labels to evaluate Returns ------- ari: float Similarity score between -1.0 and 1.0. Random labelings have an ARI close to 0.0. 1.0 stands for perfect match. Examples -------- Perfectly maching labelings have a score of 1 even >>> from sklearn.metrics.cluster import adjusted_rand_score >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 1]) 1.0 >>> adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]) 1.0 Labelings that assign all classes members to the same clusters are complete be not always pure, hence penalized:: >>> adjusted_rand_score([0, 0, 1, 2], [0, 0, 1, 1]) # doctest: +ELLIPSIS 0.57... ARI is symmetric, so labelings that have pure clusters with members coming from the same classes but unnecessary splits are penalized:: >>> adjusted_rand_score([0, 0, 1, 1], [0, 0, 1, 2]) # doctest: +ELLIPSIS 0.57... If classes members are completely split across different clusters, the assignment is totally incomplete, hence the ARI is very low:: >>> adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3]) 0.0 References ---------- .. [Hubert1985] `L. Hubert and P. Arabie, Comparing Partitions, Journal of Classification 1985` http://www.springerlink.com/content/x64124718341j1j0/ .. [wk] http://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index See also -------- adjusted_mutual_info_score: Adjusted Mutual Information """ labels_true, labels_pred = check_clusterings(labels_true, labels_pred) n_samples = labels_true.shape[0] classes = np.unique(labels_true) clusters = np.unique(labels_pred) # Special limit cases: no clustering since the data is not split; # or trivial clustering where each document is assigned a unique cluster. # These are perfect matches hence return 1.0. if (classes.shape[0] == clusters.shape[0] == 1 or classes.shape[0] == clusters.shape[0] == 0 or classes.shape[0] == clusters.shape[0] == len(labels_true)): return 1.0 contingency = contingency_matrix(labels_true, labels_pred) # Compute the ARI using the contingency data sum_comb_c = sum(comb2(n_c) for n_c in contingency.sum(axis=1)) sum_comb_k = sum(comb2(n_k) for n_k in contingency.sum(axis=0)) sum_comb = sum(comb2(n_ij) for n_ij in contingency.flatten()) prod_comb = (sum_comb_c * sum_comb_k) / float(comb(n_samples, 2)) mean_comb = (sum_comb_k + sum_comb_c) / 2. return ((sum_comb - prod_comb) / (mean_comb - prod_comb))