def bezier_matrix(degree):
    """Build the square coefficient matrix associated with a given degree.

    The result has shape ``(degree + 1, degree + 1)``. For every pair
    ``(i, j)`` with ``i + j <= degree`` the entry is

        choose(degree, j) * choose(degree - j, degree - i - j)
            * (-1) ** (degree - i - j)

    and every other entry is zero.

    Args:
        degree: non-negative integer degree.

    Returns:
        numpy.ndarray: the ``(degree + 1) x (degree + 1)`` matrix.
    """
    size = degree + 1
    coeffs = np.zeros((size, size))
    for row in range(size):
        # Columns beyond ``degree - row`` would violate row + col <= degree
        # and therefore stay at zero.
        for col in range(size - row):
            remaining = degree - row - col
            coeffs[row, col] = (choose(degree, col)
                                * choose(degree - col, remaining)
                                * ((-1) ** remaining))
    return coeffs
def bezier1d(points):
    """Return evaluators for the x and y coordinates of a Bezier curve.

    The curve uses ``points`` as control points; its order is
    ``len(points) - 1``. Each returned callable takes a parameter ``t``
    (scalar or numpy array) and sums the Bernstein-weighted coordinate of
    every control point.

    Args:
        points: sequence of control points; element ``[0]`` of each point
            is used for x and element ``[1]`` for y.

    Returns:
        tuple: ``(bez_x, bez_y)``, two callables of ``t``.
    """
    order = len(points) - 1

    def _coordinate(axis):
        # One evaluator per coordinate axis; both share the same weights.
        def evaluate(t):
            return sum([
                choose(order, k) * np.power(t, k) * np.power(1 - t, order - k)
                * points[k][axis]
                for k in range(order + 1)
            ])
        return evaluate

    return _coordinate(0), _coordinate(1)
def abund_log_prob(genotype, abundance, refrabund=None, mean=30.0, sd=8.0,
                   error=0.001):
    """Log probability of a k-mer abundance conditioned on a genotype.

    `genotype` is the assumed number of allele copies, one of {0, 1, 2}
    (genotypes 0/0, 0/1 and 1/1). `mean` and `sd` describe a normal
    distribution of observed abundances for k-mers with copy number 2, and
    `error` is the sequencing error rate.

    * Genotype 2 is scored with the normal log-density ``N(mean, sd)``.
    * Genotype 1 is scored with ``N(mean / 2, sd / 2)``.
    * Genotype 0 is scored with a binomial log-probability of seeing
      `abundance` erroneous k-mers out of ``mean * refrabund``; abundances
      above that value are capped to it.

    For SNVs there is a 1-to-1 correspondence between alternate and
    reference allele k-mers, so `refrabund` (the reference allele frequency
    in the reference genome) scales the expectation up when the reference is
    repetitive. Indels have no such mapping: when `refrabund` is falsy it is
    treated as 1 and the error rate is multiplied by 0.01.

    Returns:
        The log probability, or None when `genotype` is not 0, 1 or 2.
    """
    if genotype == 0:
        if not refrabund:
            # INDEL mode: no reference k-mer to scale by, lower fixed rate.
            refrabund = 1
            error *= 0.01
        expected = mean * refrabund
        capped = min(abundance, expected)
        log_coeff = log(choose(expected, capped, exact=True))
        return (
            log_coeff
            + (capped * log(error))
            + ((expected - capped) * log(1.0 - error))
        )
    if genotype == 1:
        return scipy.stats.norm.logpdf(abundance, mean / 2, sd / 2)
    if genotype == 2:
        return scipy.stats.norm.logpdf(abundance, mean, sd)
    return None
def _chao_7d(x, n, f1, p1, q):
    """Evaluate ``(A + B) ** (1 / (1 - q))`` from observed values `x`.

    NOTE(review): the name suggests equation 7d of a Chao-style diversity
    estimator; confirm against the reference used by the callers.

    ``A`` sums, over every distinct value ``z`` in `x` (weighted by how often
    it occurs), the series

        sum_{k=0}^{n-z} choose(k - q, k)
            * exp(lchoose(n - k - 1, z - 1) - lchoose(n, z))

    ``B`` is zero when ``f1 == 0`` or ``p1 == 1``; otherwise it is
    ``f1 / n * (1 - p1) ** (1 - n)`` multiplied by
    ``p1 ** (q - 1) - sum_{r=0}^{n-1} choose(q - 1, r) * (p1 - 1) ** r``.

    Args:
        x: observed values; distinct values and their counts are used.
        n: integer upper limit used in both series.
        f1, p1: scalars controlling the ``B`` correction term.
        q: order parameter; ``q == 1`` divides by zero in the exponent.
    """
    data, counts = np.unique(x, return_counts=True)
    log_denoms = stats.lchoose(n, data)
    term = np.zeros(data.shape[0])
    for idx, z in enumerate(data):
        k = np.arange(n - z + 1)
        # Ratio of binomial coefficients, evaluated in log space.
        weights = np.exp(stats.lchoose(n - k - 1, z - 1) - log_denoms[idx])
        term[idx] = np.sum(choose(k - q, k) * weights)
    A = np.sum(counts * term)

    B = 0
    if not (f1 == 0 or p1 == 1):
        r = np.arange(n)
        B = f1 / n * (1 - p1)**(1. - n)
        B *= (p1**(q - 1)) - np.sum(choose(q - 1, r) * (p1 - 1) ** r)
    return (A + B)**(1 / (1 - q))
def _get_vacancies(self):
    '''Create copies of the seed cell with atoms removed (vacancies).

    The number of vacancies per cell is ``int(num_atoms * self.vac_per_atom)``
    where ``num_atoms`` is the number of positions in ``self.atoms``. The
    numpy global random generator is seeded with ``self.ran_seed`` before
    any selection so that runs are reproducible.

    When the number of possible vacancy combinations exceeds 1000 the index
    sets come from ``self._get_random_choice``; otherwise they come from
    ``self._get_combinations``.

    Attributes read:
        atoms: seed atoms object; must provide ``get_positions()``,
            ``copy()`` and item deletion.
        vac_per_atom (float < 1): vacancies per atom in the cell
            (e.g. 0.1 is 1 vacancy in every 10 atoms).
        ran_seed (hashable): seed for the random number generator.

    Returns:
        tuple: ``(atom_seed, select_atoms)`` where ``atom_seed`` is an
        ``AtomsList`` with one vacancy-containing copy of ``self.atoms`` per
        entry of ``select_atoms``, and ``select_atoms`` is the list of index
        selections that were deleted from each copy.
    '''
    atom_count = int(len(self.atoms.get_positions()))
    vacancy_count = int(atom_count * self.vac_per_atom)
    np.random.seed(self.ran_seed)  # fixed seed for reproducibility

    # Too many combinations to enumerate: sample; otherwise enumerate.
    if choose(atom_count, vacancy_count) > 1000:
        picker = self._get_random_choice
    else:
        picker = self._get_combinations
    select_atoms = picker([], atom_count, vacancy_count)

    atom_seed = AtomsList()
    for removed in select_atoms:
        reduced = self.atoms.copy()
        del reduced[removed]
        atom_seed.append(reduced)
    return atom_seed, select_atoms
def _get_combinations(self, select_atoms, num_atoms, num_vac):
    '''Pick vacancy index sets by shuffling all possible combinations.

    This approach allows simple random iteration over every possible
    vacancy combination for small cells; callers are expected to use it
    only when ``num_atoms choose num_vac`` is small (less than 1000).

    The indices ``0 .. C(num_atoms, num_vac) - 1`` are shuffled with the
    numpy global random generator, the slice
    ``[self.min_index, self.min_index + self.nconfigs)`` of that shuffled
    order is taken, and the combination at each chosen index (in
    ``itertools.combinations`` order) is appended to `select_atoms`.

    Args:
        select_atoms(list): list to extend; modified in place.
        num_atoms(int): number of atoms present in the cell.
        num_vac(int): number of vacancies to include in each config.

    Returns:
        select_atoms(list): the same list; every appended entry is a
        one-element list holding a tuple of atom indices to remove.
    '''
    # A list is required here: np.random.shuffle works in place and a
    # Python 3 ``range`` object does not support item assignment.
    order = list(range(choose(num_atoms, num_vac, exact=True)))
    np.random.shuffle(order)  # shuffle all possible options
    chosen = list(
        islice(order, self.min_index, self.min_index + self.nconfigs))
    if chosen:
        # Enumerate the combinations once instead of restarting the
        # generator for every chosen index.
        all_combos = list(combinations(range(num_atoms), num_vac))
        select_atoms.extend([all_combos[idx]] for idx in chosen)
    return select_atoms
def cdf(self, a, b=None):
    """Computes P(X < `a`) for X distributed like this Gaussian.

    If `b` is also specified, this function will compute P(`a` < X < `b`).

    For multivariate Gaussians, this function performs inclusion-exclusion on (2 ^ N) CDF
    results, which computes the hypercubic intersection between CDF(upper limit) and
    CDF(lower limit). In the 2D case between points (a, b) and (c, d), where a < c and b < d,
    this works out to CDF(c, d) - CDF(a, d) - CDF(c, b) + CDF(a, b).

    When `a` is a batch of limits (as decided by the class's vectorization check), the CDF is
    evaluated row by row. pandas inputs return a `pd.Series` labelled with `a.index`; other
    batches return a numpy array. An interval with any `b - a <= 0` yields 0.

    NOTE: Ensure that the Gaussian covariance is positive semi-definite.

    TODO: Figure out how to use `mvnormcdf` from `statsmodels` for more efficient multivariate
    CDF intervals via sampling. Although their code looks sus...

    >>> Gaussian(1, 9).cdf(4)
    0.841344746068543
    >>> Gaussian(1, 9).cdf([4, 1])
    array([0.84134475, 0.5       ])
    >>> Gaussian(0, 1).cdf(-1, 1)
    0.6826894921370861
    >>> Gaussian(0, 1).cdf([-1, -2], [1, 2])
    array([0.68268949, 0.95449974])

    Output is slightly non-deterministic:
    >>> Gaussian(pd.Series([0, 0, 0]), pd.DataFrame([ \
            [ 2, -1,  0], \
            [-1,  2, -1], \
            [ 0, -1,  2]  \
        ])).cdf([ \
            [0, 0, 0], \
            [-4, -2, -3] \
        ], [ \
            [1, 1, 1], \
            [1, 2, 4] \
        ]).round(3)
    array([0.017, 0.644])

    >>> Gaussian(pd.Series([0, 2], index=['a', 'b']), [1, 1]) \
            .cdf(pd.Series([2, 0], index=['b', 'a']))
    0.25
    """
    # Consider pre-computing and storing this distribution on the Gaussian.
    distribution = multivariate_normal(self.mean,
                                       self.covariance,
                                       allow_singular=True)

    if self.__should_vectorize(a):
        if b is None:
            # Placeholder upper limits: an object array filled with None.
            # NOTE(review): for multi-column rows the recursive call receives
            # a row of None values rather than None itself — confirm that
            # one-sided batched multivariate calls behave as intended.
            b = np.empty(np.shape(a), dtype=object)

        if isinstance(a, pd.DataFrame):
            b = pd.DataFrame(b)
            b.columns, b.index = a.columns, a.index
            # `.iloc` is positional, so iterate positions rather than index
            # labels; labels are restored on the returned Series below.
            result = np.array(
                [self.cdf(a.iloc[i], b.iloc[i]) for i in range(len(a))])
        else:
            result = np.array(
                [self.cdf(a[i], b[i]) for i in range(len(a))])

        # Restore row labels as necessary.
        if isinstance(a, (pd.DataFrame, pd.Series)):
            return pd.Series(result, index=a.index)
        return result

    elif b is None:
        # Sort `a` labels to match `mean` indexing.
        if self.__has_similar_labels(a):
            a = a[self.__mean.index]
        return distribution.cdf(a)

    # Multivariate intervals require a non-degenerate hypercube.
    elif np.any(np.array(b) - np.array(a) <= 0):
        return 0

    # Apply inclusion-exclusion (see function header) to compute multivariate intervals.
    else:
        # Sort `a` labels to match `mean` indexing.
        if self.__has_similar_labels(a):
            a = a[self.__mean.index]

        # Sort `b` labels to match `mean` indexing.
        if self.__has_similar_labels(b):
            b = b[self.__mean.index]

        num_vars = len(self.__mean)

        # The (2 ^ N) bit vectors of length N, sorted ascending by the number of ones, e.g.
        # [[0, 0], [0, 1], [1, 0], [1, 1]] for 2 variables. A set bit selects the LOWER limit
        # (`a`) for that variable; a clear bit selects the UPPER limit (`b`).
        lower_picks = np.array(
            sorted(itertools.product([0, 1], repeat=num_vars),
                   key=sum)).astype(bool)

        result = 0
        for picks in lower_picks:
            # If `picks` is e.g. 1001, the CDF limit takes the first and fourth variables from
            # `a` and the second and third variables from `b`.
            corner = np.where(picks, a, b)
            # Standard inclusion-exclusion sign: (-1) ^ (number of lower limits used).
            sign = -1 if picks.sum() % 2 else 1
            result += sign * distribution.cdf(corner)

        return result
def gp(d_alpha, verbose=False):
    """Recursively compute the invariant indexed by ``d_alpha = (d, alpha)``.

    Results are memoised in the module-level ``gp_computed_values`` dict,
    keyed by ``(d, canonical_alpha)``.

    Evaluation order:
      1. If `alpha` has a negative entry: return 1 only for ``d == 0`` and
         ``alpha == (-1,)``, otherwise 0.
      2. With ``n = 3*d - 1 - sum(alpha)``: return 0 when ``n < 0``.
      3. Canonicalise `alpha` and return the cached value if present.
      4. Base case ``d == 1`` with ``sum(alpha) == 0`` returns 1.
      5. Return (and cache) 0 when ``sum(e*(e-1)) > (d-1)*(d-2)``, i.e.
         when the invariant vanishes by the adjunction formula.
      6. If ``n >= 3`` sum over ``decompositions(d_alpha)``; otherwise, if
         `alpha` is non-empty, decrement ``alpha[0]`` and apply the second
         recursion, which must divide exactly by ``d**2 * alpha[0]``.

    Args:
        d_alpha: pair ``(d, alpha)`` of a degree and a sequence of integers.
        verbose: print a line every time the recursion is actually applied.

    Raises:
        Exception: if the divisibility expected in the second branch fails,
            or if neither branch of the recursion applies.
    """
    global gp_computed_values

    d, alpha = d_alpha
    alpha_sum = np.sum(alpha)

    if len(alpha) > 0 and np.min(alpha) < 0:
        is_special = d == 0 and len(alpha) == 1 and alpha_sum == -1
        return 1 if is_special else 0

    if 3*d - 1 - alpha_sum < 0:
        return 0

    alpha = make_canonical_alpha(alpha)
    d_alpha = (d, alpha)
    if d_alpha in gp_computed_values:
        return gp_computed_values[d_alpha]

    alpha_sum = np.sum(alpha)
    # The number of extra point constraints needed to give the invariant
    # corresponding to d_alpha index 0.
    n = 3*d - 1 - alpha_sum

    if d == 1 and alpha_sum == 0:
        return 1

    if verbose:
        print('Applying GP recursion for d = %s, alpha = %s' %(d,str(alpha)))

    # Adjunction formula: the invariant is automatically zero here.
    if np.sum([elt*(elt-1) for elt in alpha]) > (d-1)*(d-2):
        gp_computed_values[d_alpha] = 0
        return 0

    if n >= 3:
        total = 0
        for part1, part2 in decompositions(d_alpha):
            d1, alpha1 = part1
            d2, alpha2 = part2
            n1 = 3*d1 - 1 - np.sum(alpha1)
            total += (gp(part1, verbose) * gp(part2, verbose)
                      * (d1*d2 - np.dot(alpha1, alpha2))
                      * (d1*d2*choose(n-3, n1-1) - d1**2*choose(n-3, n1)))
        gp_computed_values[d_alpha] = total
        return total

    if not len(alpha) > 0:
        raise Exception('Error! Was not able to apply either branch of the recursion...')

    a = alpha[0]
    d_alpha_decr = (d, tuple([alpha[0] - 1] + list(alpha[1:])))
    total = (d**2 - (a-1)**2) * gp(d_alpha_decr, verbose)
    for part1, part2 in decompositions(d_alpha_decr):
        d1, alpha1 = part1
        d2, alpha2 = part2
        n1 = 3*d1 - 1 - np.sum(alpha1)
        lead1 = alpha1[0]
        lead2 = alpha2[0]
        total += (gp(part1, verbose) * gp(part2, verbose)
                  * (d1*d2 - np.dot(alpha1, alpha2))
                  * (d1*d2*lead1*lead2 - d1**2*lead2**2)
                  * choose(n, n1))

    divisor = d**2*a
    if total % divisor == 0:
        total = total // divisor
        gp_computed_values[d_alpha] = total
        return total
    raise Exception('Error! Expected divisibility in second branch of the recursion does not hold...')
r = 3
"""

""" rows and cols of square grid """
# For every grid side length k in 4..12: build the square hypergraph on
# m = k**2 nodes, run `num_trials` random trials, and store the sorted finite
# C2 values on disk (checkpointed every 100 trials and once more at the end).
ks = range(4, 12 + 1)
for i, k in enumerate(ks):
    print("k=%d" % k)
    # SQUARE
    m = k**2
    n = m // 2 #(3*m)//4
    H = square_hypergraph(m, k)
    r = 2 # since SIP satisfied by (row,col) pair
    # CIRCLE
    #k = 3 # choose something less than m
    #H = cyclic_hypergraph(m, k)
    #r = k
    # NOTE(review): N_per_support and N are computed but not referenced again
    # in the visible part of this script.
    N_per_support = int((k - 1) * choose(m, k, exact=True)) + 1
    N = len(H) * N_per_support
    num_trials = 1000
    c2s = np.zeros(num_trials)
    for t in range(num_trials):
        print("Trial %d" % t, end='\r')
        # Random n x m Gaussian matrix, rescaled so every column has unit norm.
        A = np.random.randn(n, m)
        A = np.dot(A, np.diag(1. / np.linalg.norm(A, axis=0))) # normalize
        c2s[t] = C2(A, H, r)
        if t % 100 == 0:
            # Checkpoint. Trials not yet run are still 0 in `c2s` and pass the
            # `< np.inf` filter, so intermediate files include those zeros.
            np.save("./c2s_k=%d_2x" % k, np.sort(c2s[c2s < np.inf]))
    # Final save for this k, keeping only the finite values.
    np.save("./c2s_k=%d_2x" % k, np.sort(c2s[c2s < np.inf]))
def hypergeometric(n, k, N, K):
    """Hypergeometric probability of an overlap of exactly `k`.

    Given a population of `N` items, a fixed subset of `n` of them and a
    uniformly random subset of `K` of them, return the probability that the
    two subsets share exactly `k` items:

        C(n, k) * C(N - n, K - k) / C(N, K)

    The denominator is ``C(N, K)`` because, by Vandermonde's identity, the
    numerator summed over all `k` equals ``C(N, K)``; this makes the values
    sum to 1 over `k`. The expression is symmetric in `n` and `K`.

    Args:
        n: size of the first subset.
        k: size of the overlap whose probability is wanted.
        N: population size.
        K: size of the second subset.

    Returns:
        The probability as returned by the arithmetic on ``choose`` values.
    """
    return choose(n, k) * choose(N - n, K - k) / choose(N, K)
def binomialDist(x, n, p):
    """Binomial probability of `x` successes in `n` trials.

    Args:
        x: number of successes.
        n: number of trials.
        p: probability of success in a single trial.

    Returns:
        ``choose(n, x) * p**x * (1 - p)**(n - x)``.
    """
    arrangements = choose(n, x)
    success_part = p**x
    failure_part = (1 - p)**(n - x)
    return arrangements * success_part * failure_part
def bij(t, i, n):
    """Evaluate the i-th Bernstein basis polynomial of degree `n` at `t`.

    Returns ``choose(n, i) * t**i * (1 - t)**(n - i)``.
    """
    binomial_coefficient = choose(n, i)
    return binomial_coefficient * (t**i) * ((1 - t)**(n - i))
def likelihood(n, h, p):
    '''Returns the probability of h heads in n trials, with probability of
    heads being p.

    The binomial coefficient is computed exactly (integer arithmetic).
    '''
    ways = choose(n, h, exact=True)
    heads_part = p**h
    tails_part = (1 - p)**(n - h)
    return ways * heads_part * tails_part
def projection_fun(x, i, n, N, s=0, u=1e-8):
    """Binomial sampling weight at `x` times ``afs_inf_sites(x, N, s, u)``.

    The weight is ``choose(n, i) * x**i * (1 - x)**(n - i)``.
    NOTE(review): presumably the integrand for projecting a frequency
    spectrum onto a sample of size `n` with `i` copies — confirm against
    the caller.

    Args:
        x: value(s) at which to evaluate; scalars or numpy arrays.
        i: exponent of `x` and lower argument of the binomial coefficient.
        n: upper argument of the binomial coefficient.
        N, s, u: forwarded unchanged to ``afs_inf_sites``.
    """
    # this can likely be done better in log space
    binomial_weight = choose(n, i) * np.power(x, i) * np.power(1 - x, n - i)
    return binomial_weight * afs_inf_sites(x, N, s, u)