def get_log_likelihood(self, observation):
    """
    Score a four-entry observation under this model.
    The result is the Poisson log pmf of the total count,
    given self.expected_coverage, plus the multinomial log pmf
    of the individual counts, given self.distribution.
    @param observation: a vector of four integers
    @return: the log likelihood of the observation
    @raise ValueError: if the observation does not have four entries
    """
    if len(observation) != 4:
        raise ValueError(
                'expected the observation to be a vector of four integers')
    coverage = sum(observation)
    log_terms = (
            StatsUtil.poisson_log_pmf(coverage, self.expected_coverage),
            StatsUtil.multinomial_log_pmf(self.distribution, observation),
            )
    return sum(log_terms)
def __call__(self, X):
    """
    Objective function for the model with additive selection (h = 0.5).
    @param X: array holding two encoded parameters; the first is passed
    through expit to give the mutation rate and the second through exp
    to give the fitness ratio
    @return: negative log likelihood
    """
    encoded_mutation_rate, encoded_fitness_ratio = X.tolist()
    # decode the parameters
    mutation_rate = StatsUtil.expit(encoded_mutation_rate)
    fitness_ratio = math.exp(encoded_fitness_ratio)
    dominance = 0.5
    # the finite sample distribution implied by the parameters
    sample_distn = get_sample_distn(
            self.N, self.n, mutation_rate, fitness_ratio, dominance)
    log_likelihood = StatsUtil.multinomial_log_pmf(
            sample_distn, self.observed_counts)
    return -log_likelihood
def __call__(self, X):
    """
    Objective function for the model with additive selection (h = 0.5).
    @param X: array holding two encoded parameters; the first is passed
    through expit to give the mutation rate and the second through exp
    to give the fitness ratio
    @return: negative log likelihood
    """
    encoded_mutation_rate, encoded_fitness_ratio = X.tolist()
    # decode the parameters
    mutation_rate = StatsUtil.expit(encoded_mutation_rate)
    fitness_ratio = math.exp(encoded_fitness_ratio)
    dominance = 0.5
    # the finite sample distribution implied by the parameters
    sample_distn = get_sample_distn(
            self.N, self.n, mutation_rate, fitness_ratio, dominance)
    log_likelihood = StatsUtil.multinomial_log_pmf(
            sample_distn, self.observed_counts)
    return -log_likelihood
def get_transition_matrix_slow(N_diploid, k, mutation, fit):
    """
    Build a transition matrix over fixed and two-allele population states.
    Mutation probabilities are away from a fixed state.
    Row and column i, for i < k, are used as the state in which
    allele i is fixed; this relies on the ordering produced by gen_states.
    @param N_diploid: diploid population size
    @param k: number of alleles e.g. 4 for A,C,G,T
    @param mutation: k by k matrix of per-generation mutation probabilities
    @param fit: sequence of k fitness values
    @return: a transition matrix
    """
    N = N_diploid * 2
    states = [tuple(s) for s in gen_states(N, k)]
    nstates = len(states)
    s_to_i = dict((s, i) for i, s in enumerate(states))

    def state_index(i, count_i, j, count_j):
        # Index of the state with the given counts of alleles i and j
        # and zero copies of every other allele.
        state = [0] * k
        state[i] = count_i
        state[j] = count_j
        return s_to_i[tuple(state)]

    P = np.zeros((nstates, nstates))
    # Add rows corresponding to transitions from population states
    # for which an allele is currently fixed in the population.
    for i in range(k):
        P[i, i] = mutation[i, i]
        for j in range(k):
            if i != j:
                P[i, state_index(i, N - 1, j, 1)] = mutation[i, j]
    # Add rows corresponding to transitions from polymorphic population states.
    for i, j in combinations(range(k), 2):
        # Force true division so that integer fitness values are not
        # floored under Python 2; the value does not depend on h.
        s = 1 - float(fit[j]) / fit[i]
        for h in range(1, N):
            index = state_index(i, h, j, N - h)
            # pi is used below as the per-child binomial success
            # probability for the count of allele i.
            pi, pj = wrightfisher.genic_diallelic(1.0, 1.0 - s, h, N - h)
            # Add entries corresponding to fixation of an allele.
            P[index, i] = math.exp(StatsUtil.binomial_log_pmf(N, N, pi))
            P[index, j] = math.exp(StatsUtil.binomial_log_pmf(0, N, pi))
            # Add entries corresponding to transitions to polymorphic states.
            for hsink in range(1, N):
                sink_index = state_index(i, hsink, j, N - hsink)
                logp = StatsUtil.binomial_log_pmf(hsink, N, pi)
                P[index, sink_index] = math.exp(logp)
    return P
def get_transition_matrix_slow(N_diploid, k, mutation, fit):
    """
    Build a transition matrix over fixed and two-allele population states.
    Mutation probabilities are away from a fixed state.
    Row and column i, for i < k, are used as the state in which
    allele i is fixed; this relies on the ordering produced by gen_states.
    @param N_diploid: diploid population size
    @param k: number of alleles e.g. 4 for A,C,G,T
    @param mutation: k by k matrix of per-generation mutation probabilities
    @param fit: sequence of k fitness values
    @return: a transition matrix
    """
    N = N_diploid * 2
    states = [tuple(s) for s in gen_states(N, k)]
    nstates = len(states)
    s_to_i = dict((s, i) for i, s in enumerate(states))

    def state_index(i, count_i, j, count_j):
        # Index of the state with the given counts of alleles i and j
        # and zero copies of every other allele.
        state = [0] * k
        state[i] = count_i
        state[j] = count_j
        return s_to_i[tuple(state)]

    P = np.zeros((nstates, nstates))
    # Add rows corresponding to transitions from population states
    # for which an allele is currently fixed in the population.
    for i in range(k):
        P[i, i] = mutation[i, i]
        for j in range(k):
            if i != j:
                P[i, state_index(i, N - 1, j, 1)] = mutation[i, j]
    # Add rows corresponding to transitions from polymorphic population states.
    for i, j in combinations(range(k), 2):
        # Force true division so that integer fitness values are not
        # floored under Python 2; the value does not depend on h.
        s = 1 - float(fit[j]) / fit[i]
        for h in range(1, N):
            index = state_index(i, h, j, N - h)
            # pi is used below as the per-child binomial success
            # probability for the count of allele i.
            pi, pj = wrightfisher.genic_diallelic(1.0, 1.0 - s, h, N - h)
            # Add entries corresponding to fixation of an allele.
            P[index, i] = math.exp(StatsUtil.binomial_log_pmf(N, N, pi))
            P[index, j] = math.exp(StatsUtil.binomial_log_pmf(0, N, pi))
            # Add entries corresponding to transitions to polymorphic states.
            for hsink in range(1, N):
                sink_index = state_index(i, hsink, j, N - hsink)
                logp = StatsUtil.binomial_log_pmf(hsink, N, pi)
                P[index, sink_index] = math.exp(logp)
    return P
def get_expected_transitions_binomial(prandom, nstates, nsteps):
    """
    This function is for transition matrices
    defined by their size and a single parameter.
    Use binomial coefficients to compute transition expectations.
    @param prandom: the probability of randomization at each step
    @param nstates: the number of states in the chain
    @param nsteps: one fewer than the length of the sequence
    @return: (expected_t_same, expected_t_different)
    """
    # handle corner cases
    if not nsteps:
        return 0.0, float('nan')
    if nsteps == 1:
        return 0.0, 1.0
    if not prandom:
        return 0.0, float('nan')
    # Divide by a float so that an integer prandom (e.g. 1)
    # is not floored to zero under Python 2.
    nstates_f = float(nstates)
    # per-step probability of staying put versus changing state
    p_notrans = prandom / nstates_f + (1 - prandom)
    p_any_trans = 1.0 - p_notrans
    # probability that the two endpoints are in the same state
    prandom_total = 1 - (1 - prandom)**nsteps
    p_notrans_total = prandom_total / nstates_f + (1 - prandom_total)
    # accumulate the unnormalized expectations
    e_same = 0
    e_different = 0
    for ntrans in range(nsteps + 1):
        log_p_ntrans = StatsUtil.binomial_log_pmf(ntrans, nsteps, p_any_trans)
        p_ntrans = math.exp(log_p_ntrans)
        # probability of ending where the chain started
        # after exactly ntrans changes of state
        p_same = (1 - (1 - nstates)**(1 - ntrans)) / nstates_f
        e_same += p_same * p_ntrans * ntrans
        e_different += (1 - p_same) * p_ntrans * ntrans
    # condition on the endpoint relationship
    e_same /= p_notrans_total
    e_different /= (1 - p_notrans_total)
    return e_same, e_different
def get_expected_transitions_binomial(prandom, nstates, nsteps):
    """
    This function is for transition matrices
    defined by their size and a single parameter.
    Use binomial coefficients to compute transition expectations.
    @param prandom: the probability of randomization at each step
    @param nstates: the number of states in the chain
    @param nsteps: one fewer than the length of the sequence
    @return: (expected_t_same, expected_t_different)
    """
    # handle corner cases
    if not nsteps:
        return 0.0, float('nan')
    if nsteps == 1:
        return 0.0, 1.0
    if not prandom:
        return 0.0, float('nan')
    # Divide by a float so that an integer prandom (e.g. 1)
    # is not floored to zero under Python 2.
    nstates_f = float(nstates)
    # per-step probability of staying put versus changing state
    p_notrans = prandom / nstates_f + (1 - prandom)
    p_any_trans = 1.0 - p_notrans
    # probability that the two endpoints are in the same state
    prandom_total = 1 - (1 - prandom)**nsteps
    p_notrans_total = prandom_total / nstates_f + (1 - prandom_total)
    # accumulate the unnormalized expectations
    e_same = 0
    e_different = 0
    for ntrans in range(nsteps + 1):
        log_p_ntrans = StatsUtil.binomial_log_pmf(ntrans, nsteps, p_any_trans)
        p_ntrans = math.exp(log_p_ntrans)
        # probability of ending where the chain started
        # after exactly ntrans changes of state
        p_same = (1 - (1 - nstates)**(1 - ntrans)) / nstates_f
        e_same += p_same * p_ntrans * ntrans
        e_different += (1 - p_same) * p_ntrans * ntrans
    # condition on the endpoint relationship
    e_same /= p_notrans_total
    e_different /= (1 - p_notrans_total)
    return e_same, e_different
def get_response_content(fs):
    """
    Report the probability that allele B is eventually fixed.
    The population of fs.nB + fs.nb alleles is resampled binomially
    each generation, and the fixation probabilities for every starting
    count are obtained from one linear solve.
    @param fs: object with integer attributes nB and nb
    @return: the text of the report
    @raise ValueError: if the state space is too large for the solver
    """
    npop = fs.nB + fs.nb
    nstates = npop + 1
    # A dense linear solve takes about n^3 effort,
    # so refuse state spaces that are too large.
    if nstates ** 3 > 1e6:
        raise ValueError('sorry this population size is too large')
    # Pure binomial transition matrix:
    # no mutation or selection or recombination.
    P = np.zeros((nstates, nstates))
    for nB_initial in range(nstates):
        p_B = nB_initial / float(npop)
        for nB_final in range(nstates):
            P[nB_initial, nB_final] = math.exp(
                    StatsUtil.binomial_log_pmf(nB_final, npop, p_B))
    # Write the problem as Ax=b for a generic linear solver,
    # pinning the two absorbing states to their known answers.
    A = P - np.eye(nstates)
    A[0, 0] = 1.0
    A[npop, npop] = 1.0
    b = np.zeros(nstates)
    b[npop] = 1.0
    x = linalg.solve(A, b)
    # Write the solution.
    out = StringIO()
    print >> out, 'probability of eventual fixation (as opposed to extinction)'
    print >> out, 'of allele B in the population:'
    print >> out, x[fs.nB]
    return out.getvalue()
def get_two_allele_distribution(N_big, N_small, f0, f1, f_subsample): """ Assumes small genic selection. Assumes small mutation. The mutational bias does not affect the distribution. @param N_big: total number of alleles in the population @param N_small: number of alleles sampled from the population @param f0: fitness of allele 0 @param f1: fitness of allele 1 @param f_subsample: subsampling function @return: distribution over all non-fixed population states """ # construct a transition matrix nstates = N_big + 1 P = np.zeros((nstates, nstates)) for i in range(nstates): p0, p1 = wrightfisher.genic_diallelic(f0, f1, i, N_big - i) if i == 0: P[i, 1] = 1.0 elif i == N_big: P[i, N_big - 1] = 1.0 else: for j in range(nstates): logp = StatsUtil.binomial_log_pmf(j, N_big, p0) P[i, j] = math.exp(logp) # find the stationary distribution v = MatrixUtil.get_stationary_distribution(P) MatrixUtil.assert_distribution(v) if not np.allclose(v, np.dot(v, P)): raise ValueError('expected a left eigenvector with eigenvalue 1') # return the stationary distribution conditional on dimorphism print v distn = f_subsample(v, N_small) return distn[1:-1] / np.sum(distn[1:-1])
def __call__(self, X):
    """
    Objective function over three encoded parameters.
    @param X: three encoded parameters
    @return: negative log likelihood
    """
    # decode the params into a finite distribution
    sample_distn = params_to_distn(self.M_large, self.M_small, X)
    log_likelihood = StatsUtil.multinomial_log_pmf(
            sample_distn, self.observed_counts)
    return -log_likelihood
def params_to_distn(M_large, M_small, X):
    """
    Decode three optimizer parameters and return the implied distribution.
    @param M_large: forwarded to get_sample_distn as its first argument
    @param M_small: forwarded to get_sample_distn as its second argument
    @param X: array of three encoded parameters; the first two are
    decoded with exp and the third with expit
    @return: the value returned by get_sample_distn
    """
    log_mut_ratio, log_fit_ratio, logit_h = X.tolist()
    # decode the parameters
    mut_ratio = math.exp(log_mut_ratio)
    fit_ratio = math.exp(log_fit_ratio)
    dominance = StatsUtil.expit(logit_h)
    # get the distribution implied by the parameters
    return get_sample_distn(
            M_large, M_small, mut_ratio, fit_ratio, dominance)
def create_mutation_transition_matrix(npop, mutation_ab, mutation_ba):
    """
    The states are indexed by the number of mutants.
    Each row combines two independent binomial draws:
    the number of mutants that revert and the number of
    wild-type individuals that mutate.
    @param npop: total population size
    @param mutation_ab: wild-type to mutant transition probability
    @param mutation_ba: mutant to wild-type transition probability
    @return: a transition matrix
    """
    StatsUtil.assert_probability(mutation_ab)
    StatsUtil.assert_probability(mutation_ba)
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        n_wild = npop - a
        # The wild-to-mutant log probabilities depend only on the row,
        # so compute them once instead of once per reverting count.
        ab_log_ps = [
                StatsUtil.binomial_log_pmf(n_wild_to_mut, n_wild, mutation_ab)
                for n_wild_to_mut in range(n_wild + 1)]
        for n_mut_to_wild in range(a + 1):
            ba_log_p = StatsUtil.binomial_log_pmf(
                    n_mut_to_wild, a, mutation_ba)
            for n_wild_to_mut, ab_log_p in enumerate(ab_log_ps):
                # several (revert, mutate) pairs reach the same sink state
                b = a + n_wild_to_mut - n_mut_to_wild
                P[a, b] += math.exp(ba_log_p + ab_log_p)
    return P
def create_mutation_transition_matrix(npop, mutation_ab, mutation_ba):
    """
    The states are indexed by the number of mutants.
    Each row combines two independent binomial draws:
    the number of mutants that revert and the number of
    wild-type individuals that mutate.
    @param npop: total population size
    @param mutation_ab: wild-type to mutant transition probability
    @param mutation_ba: mutant to wild-type transition probability
    @return: a transition matrix
    """
    StatsUtil.assert_probability(mutation_ab)
    StatsUtil.assert_probability(mutation_ba)
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        n_wild = npop - a
        # The wild-to-mutant log probabilities depend only on the row,
        # so compute them once instead of once per reverting count.
        ab_log_ps = [
                StatsUtil.binomial_log_pmf(n_wild_to_mut, n_wild, mutation_ab)
                for n_wild_to_mut in range(n_wild + 1)]
        for n_mut_to_wild in range(a + 1):
            ba_log_p = StatsUtil.binomial_log_pmf(
                    n_mut_to_wild, a, mutation_ba)
            for n_wild_to_mut, ab_log_p in enumerate(ab_log_ps):
                # several (revert, mutate) pairs reach the same sink state
                b = a + n_wild_to_mut - n_mut_to_wild
                P[a, b] += math.exp(ba_log_p + ab_log_p)
    return P
def __call__(self, X):
    """
    Objective function in which only the fitness ratio is free;
    the mutation ratio is held at 1.0 and h at 0.5.
    @param X: array holding the single encoded parameter,
    decoded with exp to give the fitness ratio
    @return: negative log likelihood
    """
    (log_fit_ratio,) = X.tolist()
    fit_ratio = math.exp(log_fit_ratio)
    fixed_mut_ratio = 1.0
    fixed_h = 0.5
    # the finite sample distribution implied by the parameters
    sample_distn = get_sample_distn(
            self.M_large, self.M_small, fixed_mut_ratio, fit_ratio, fixed_h)
    log_likelihood = StatsUtil.multinomial_log_pmf(
            sample_distn, self.observed_counts)
    return -log_likelihood
def __call__(self, X):
    """
    Objective function in which only the fitness ratio is free;
    the mutation ratio is held at 1.0 and h at 0.5.
    @param X: array holding the single encoded parameter,
    decoded with exp to give the fitness ratio
    @return: negative log likelihood
    """
    (log_fit_ratio,) = X.tolist()
    fit_ratio = math.exp(log_fit_ratio)
    fixed_mut_ratio = 1.0
    fixed_h = 0.5
    # the finite sample distribution implied by the parameters
    sample_distn = get_sample_distn(
            self.M_large, self.M_small, fixed_mut_ratio, fit_ratio, fixed_h)
    log_likelihood = StatsUtil.multinomial_log_pmf(
            sample_distn, self.observed_counts)
    return -log_likelihood
def UpdateStats(stats, t0, curr_lp, K, z, c, steps, gt_z, map_z, verbose):
    """Append one snapshot of the sampler state to the stats dictionary.

    stats    -- dict of lists with keys 'lp', 'K', 'z', 'c', 'times', 'NMI'
    t0       -- clock reading taken when sampling started
    curr_lp  -- current log probability
    K, z, c  -- current number of parcels, parcel labels and link vector
    steps    -- number of sampling steps taken so far (used for logging)
    gt_z     -- ground-truth labels; when empty, no NMI is recorded
    map_z    -- best labelling so far, compared against gt_z
    verbose  -- when true, print a one-line progress message

    Returns the same stats dictionary, updated in place.
    """
    stats['lp'].append(curr_lp)
    stats['K'].append(K)
    stats['z'].append(z)
    # Store a copy: a caller that keeps updating c in place would
    # otherwise turn every recorded snapshot into the final state.
    stats['c'].append(np.array(c))
    # time.clock() was removed in Python 3.8. Prefer it where it still
    # exists so the reading stays comparable with a t0 taken from
    # time.clock(), and fall back to perf_counter() otherwise.
    clock = getattr(time, 'clock', None) or time.perf_counter
    curr_time = clock() - t0
    stats['times'].append(curr_time)
    if verbose:
        print('Step: ' + str(steps) + ' Time: ' + str(curr_time) +
              ' LP: ' + str(curr_lp) + ' K: ' + str(K))
    if gt_z.size > 0:
        stats['NMI'].append(StatsUtil.NMI(gt_z, map_z))
    return stats
def __call__(self, X):
    """
    Objective function over six mutation and selection parameters.
    Any negative value among the first three parameters is rejected
    by returning infinity.
    @param X: six params defining mutation and selection
    @return: negative log likelihood
    """
    # the number of alleles is hardcoded
    nalleles = 4
    params = X.tolist()
    theta, ka, kb, g0, g1, g2 = params
    if theta < 0 or ka < 0 or kb < 0:
        return float('inf')
    mutation, fitnesses = kaizeng.params_to_mutation_fitness(self.N, params)
    # stationary distribution of the implied transition matrix
    transition_matrix = kaizeng.get_transition_matrix(
            self.N, nalleles, mutation, fitnesses)
    stationary = MatrixUtil.get_stationary_distribution(transition_matrix)
    log_likelihood = StatsUtil.multinomial_log_pmf(
            stationary, self.observed_counts)
    return -log_likelihood
def get_transition_matrix(npop, sAA, sAa, random_mating=True):
    """
    Note that sab is 0 by convention.
    @param npop: constant Wright-Fisher population
    @param sAA: a selection value
    @param sAa: a selection value
    @param random_mating: when True (the default, and the only behavior
    that was previously reachable) the fitnesses reweight the child
    genotype distribution; when False the fitnesses reweight the
    parent composition before the parents are paired
    @return: a transition matrix
    @raise ValueError: if the enumerated compositions disagree
    with the expected state space size
    """
    fitnesses = 1.0 + np.array([sAA, sAa, 0])
    compositions = list(gen_population_compositions(npop))
    nstates = get_state_space_size(npop)
    if nstates != len(compositions):
        raise ValueError('internal error regarding state space size')

    def mate(single_parent_distn):
        # Child genotype distribution when both parents are drawn
        # independently from the given genotype distribution.
        parent_distn = np.outer(single_parent_distn, single_parent_distn)
        child_distn = np.zeros(3)
        for i in range(3):
            for j in range(3):
                child_distn += parent_distn[i, j] * get_child_distn(i, j)
        return child_distn

    P = np.zeros((nstates, nstates))
    for parent_index, parent_composition_tuple in enumerate(compositions):
        parent_compo = np.array(parent_composition_tuple)
        if random_mating:
            child_distn = mate(parent_compo / float(np.sum(parent_compo)))
            # selection acts on the children, followed by renormalization
            child_distn *= fitnesses
            child_distn /= np.sum(child_distn)
        else:
            # selection acts on the parents before they are paired
            total = np.dot(fitnesses, parent_compo)
            child_distn = mate((fitnesses * parent_compo) / total)
        for child_index, child_composition_tuple in enumerate(compositions):
            P[parent_index, child_index] = math.exp(
                    StatsUtil.multinomial_log_pmf(
                        child_distn, child_composition_tuple))
    return P
def LearnSynthForDataset(synth):
    """Compare the ddCRP parcellation with Ward clustering on one dataset.

    synth -- object providing D (connectivity), adj_list and z (the
             labels that both parcellations are scored against)

    Returns (WC, DC, DC_K): the NMI of the Ward clustering, the last
    recorded NMI of the ddCRP run, and the last recorded number of
    ddCRP clusters.
    """
    # Hyperparameters
    alpha = 10
    kappa = 0.0001
    nu = 1
    sigsq = 0.01
    pass_limit = 30

    # Normalize connectivity to zero mean, unit var
    D = NormalizeConn(synth.D)

    # Compute our ddCRP-based parcellation
    Z = WardClustering.ClusterTree(D, synth.adj_list)
    _unused, dd_stats = initdd.InitializeAndRun(
        Z, D, synth.adj_list, range(1, 21),
        alpha, kappa, nu, sigsq, pass_limit, synth.z, 0)
    DC = dd_stats['NMI'][-1]
    DC_K = dd_stats['K'][-1]

    # Ward Clustering, using number of clusters discovered from our method
    ward_labels = WardClustering.Cluster(Z, DC_K)
    WC = StatsUtil.NMI(synth.z, ward_labels)
    return (WC, DC, DC_K)
def get_transition_matrix(npop, sAA, sAa, random_mating=True):
    """
    Note that sab is 0 by convention.
    @param npop: constant Wright-Fisher population
    @param sAA: a selection value
    @param sAa: a selection value
    @param random_mating: when True (the default, and the only behavior
    that was previously reachable) the fitnesses reweight the child
    genotype distribution; when False the fitnesses reweight the
    parent composition before the parents are paired
    @return: a transition matrix
    @raise ValueError: if the enumerated compositions disagree
    with the expected state space size
    """
    fitnesses = 1.0 + np.array([sAA, sAa, 0])
    compositions = list(gen_population_compositions(npop))
    nstates = get_state_space_size(npop)
    if nstates != len(compositions):
        raise ValueError("internal error regarding state space size")

    def mate(single_parent_distn):
        # Child genotype distribution when both parents are drawn
        # independently from the given genotype distribution.
        parent_distn = np.outer(single_parent_distn, single_parent_distn)
        child_distn = np.zeros(3)
        for i in range(3):
            for j in range(3):
                child_distn += parent_distn[i, j] * get_child_distn(i, j)
        return child_distn

    P = np.zeros((nstates, nstates))
    for parent_index, parent_composition_tuple in enumerate(compositions):
        parent_compo = np.array(parent_composition_tuple)
        if random_mating:
            child_distn = mate(parent_compo / float(np.sum(parent_compo)))
            # selection acts on the children, followed by renormalization
            child_distn *= fitnesses
            child_distn /= np.sum(child_distn)
        else:
            # selection acts on the parents before they are paired
            total = np.dot(fitnesses, parent_compo)
            child_distn = mate((fitnesses * parent_compo) / total)
        for child_index, child_composition_tuple in enumerate(compositions):
            P[parent_index, child_index] = math.exp(
                    StatsUtil.multinomial_log_pmf(
                        child_distn, child_composition_tuple))
    return P
def get_hobolth_eceo(R, v, a, b, T, nmax):
    """
    The eceo means endpoint conditioned expected occupancy.
    Most of the function arguments are the same as those
    of the more verbosely named function.
    @param R: square matrix; its largest negated diagonal entry is
    used as the rate mu (presumably a rate matrix)
    @param v: only its length is used, as the number of states
    @param a: row index of the initial state
    @param b: column index of the final state
    @param T: elapsed time multiplying R and mu
    @param nmax: truncation of an infinite summation
    @return: an array with one entry per state
    """
    nstates = len(v)
    accum = np.zeros(nstates)
    mu = np.max(-np.diag(R))
    X = np.eye(nstates) + R / mu
    # Every power of X up to nmax is reused many times below,
    # so compute each one exactly once.
    powers = [np.linalg.matrix_power(X, i) for i in range(nmax + 1)]
    for n in range(nmax + 1):
        # float() prevents floor division under Python 2 when T is an int
        coeff = (T / float(n + 1)) * math.exp(
                StatsUtil.poisson_log_pmf(n, mu * T))
        for alpha in range(nstates):
            conditional_sum = 0
            for i in range(n + 1):
                prefix = powers[i][a, alpha]
                suffix = powers[n - i][alpha, b]
                conditional_sum += prefix * suffix
            accum[alpha] += coeff * conditional_sum
    # condition on the endpoints
    return accum / scipy.linalg.expm(R * T)[a, b]
def create_drift_selection_transition_matrix(npop, selection_ratio):
    """
    The states are indexed by the number of mutants.
    @param npop: total population size
    @param selection_ratio: a value larger than unity means mutants are fitter
    @return: a transition matrix
    """
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        # Compute the i.i.d. probability of picking a mutant.
        # The float() conversion forces true division; with an integer
        # selection_ratio Python 2 would floor this to 0 for a < npop.
        mutant_weight = selection_ratio * a
        p = float(mutant_weight) / (mutant_weight + (npop - a))
        for b in range(nstates):
            # Binomial probability of b mutants in the next generation,
            # with npop trials and p probability of success per trial:
            # (n choose k) p^k (1-p)^(n-k)
            P[a, b] = math.exp(StatsUtil.binomial_log_pmf(b, npop, p))
    return P
def get_hobolth_eceo(R, v, a, b, T, nmax):
    """
    The eceo means endpoint conditioned expected occupancy.
    Most of the function arguments are the same as those
    of the more verbosely named function.
    @param R: square matrix; its largest negated diagonal entry is
    used as the rate mu (presumably a rate matrix)
    @param v: only its length is used, as the number of states
    @param a: row index of the initial state
    @param b: column index of the final state
    @param T: elapsed time multiplying R and mu
    @param nmax: truncation of an infinite summation
    @return: an array with one entry per state
    """
    nstates = len(v)
    accum = np.zeros(nstates)
    mu = np.max(-np.diag(R))
    X = np.eye(nstates) + R / mu
    # Every power of X up to nmax is reused many times below,
    # so compute each one exactly once.
    powers = [np.linalg.matrix_power(X, i) for i in range(nmax + 1)]
    for n in range(nmax + 1):
        # float() prevents floor division under Python 2 when T is an int
        coeff = (T / float(n + 1)) * math.exp(
                StatsUtil.poisson_log_pmf(n, mu * T))
        for alpha in range(nstates):
            conditional_sum = 0
            for i in range(n + 1):
                prefix = powers[i][a, alpha]
                suffix = powers[n - i][alpha, b]
                conditional_sum += prefix * suffix
            accum[alpha] += coeff * conditional_sum
    # condition on the endpoints
    return accum / scipy.linalg.expm(R * T)[a, b]
def create_drift_selection_transition_matrix(npop, selection_ratio):
    """
    The states are indexed by the number of mutants.
    @param npop: total population size
    @param selection_ratio: a value larger than unity means mutants are fitter
    @return: a transition matrix
    """
    nstates = npop + 1
    P = np.zeros((nstates, nstates))
    for a in range(nstates):
        # Compute the i.i.d. probability of picking a mutant.
        # The float() conversion forces true division; with an integer
        # selection_ratio Python 2 would floor this to 0 for a < npop.
        mutant_weight = selection_ratio * a
        p = float(mutant_weight) / (mutant_weight + (npop - a))
        for b in range(nstates):
            # Binomial probability of b mutants in the next generation,
            # with npop trials and p probability of success per trial:
            # (n choose k) p^k (1-p)^(n-k)
            P[a, b] = math.exp(StatsUtil.binomial_log_pmf(b, npop, p))
    return P
def get_sample_distn(N, n, mutation_rate, fitness_ratio, h):
    """
    Compute the stationary allele distribution of a finite sample.
    @param N: haploid pop size
    @param n: allele sample size
    @param mutation_rate: mutation rate
    @param fitness_ratio: fitness ratio
    @param h: dominance parameter 0.5 when additive
    @return: a distribution over n+1 mono-/di-morphic sample states
    """
    selection = 1.0 - fitness_ratio
    # the engine result is exponentiated to give the transition matrix
    log_P = wfengine.create_diallelic_recessive(N // 2, selection, h)
    P = np.exp(log_P)
    MatrixUtil.assert_transition_matrix(P)
    # allow mutation out of the fixed states,
    # which are the first and last rows
    stay = 1.0 - mutation_rate
    escapes = (
            (0, 0, stay),
            (0, 1, mutation_rate),
            (-1, -1, stay),
            (-1, -2, mutation_rate),
            )
    for row, col, prob in escapes:
        P[row, col] = prob
    MatrixUtil.assert_transition_matrix(P)
    # population stationary distribution, then the sample distribution
    v_large = MatrixUtil.get_stationary_distribution(P)
    return StatsUtil.subsample_pmf_without_replacement(v_large, n)
def get_response_content(fs):
    """
    Time the pieces of a four-allele likelihood calculation, then fit it.
    All model settings are hardcoded; fs is not read.
    Timing messages are printed to stdout, while the fitted parameter
    vectors are written to the returned text.
    @param fs: unused in this function
    @return: text holding the array of fitted parameter vectors
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    nsamples = 1
    arr = []
    # hardcoded problem size and true parameter values
    nsites = 50000
    N = 15*2
    k = 4
    params = (0.002, 1, 1, 0, 0, 0)
    #params = (0.008, 1, 1, 0.5, 1, 1.5)
    mutation, fitnesses = kaizeng.params_to_mutation_fitness(N, params)
    # time the construction of the transition matrix
    tm = time.time()
    P = kaizeng.get_transition_matrix(N, k, mutation, fitnesses)
    print 'time to construct transition matrix:', time.time() - tm
    # time the stationary distribution
    tm = time.time()
    v = MatrixUtil.get_stationary_distribution(P)
    print 'time to get stationary distribution:', time.time() - tm
    # time one multinomial draw; this draw also advances the numpy RNG
    tm = time.time()
    counts = np.random.multinomial(nsites, v)
    print 'time to sample multinomial counts:', time.time() - tm
    # time one likelihood evaluation; logp itself is not used afterwards
    tm = time.time()
    logp = StatsUtil.multinomial_log_pmf(v, counts)
    print 'time to get multinomial log pmf:', time.time() - tm
    # fit the parameters to fresh samples, starting from the true values
    for i in range(nsamples):
        counts = np.random.multinomial(nsites, v)
        X0 = np.array(params)
        g = G(N, counts)
        Xopt = optimize.fmin(g, X0)
        arr.append(Xopt)
    print >> out, np.array(arr)
    return out.getvalue()
def get_response_content(fs):
    """
    Time the pieces of a four-allele likelihood calculation, then fit it.
    All model settings are hardcoded; fs is not read.
    Timing messages are printed to stdout, while the fitted parameter
    vectors are written to the returned text.
    @param fs: unused in this function
    @return: text holding the array of fitted parameter vectors
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    nsamples = 1
    arr = []
    # hardcoded problem size and true parameter values
    nsites = 50000
    N = 15 * 2
    k = 4
    params = (0.002, 1, 1, 0, 0, 0)
    #params = (0.008, 1, 1, 0.5, 1, 1.5)
    mutation, fitnesses = kaizeng.params_to_mutation_fitness(N, params)
    # time the construction of the transition matrix
    tm = time.time()
    P = kaizeng.get_transition_matrix(N, k, mutation, fitnesses)
    print 'time to construct transition matrix:', time.time() - tm
    # time the stationary distribution
    tm = time.time()
    v = MatrixUtil.get_stationary_distribution(P)
    print 'time to get stationary distribution:', time.time() - tm
    # time one multinomial draw; this draw also advances the numpy RNG
    tm = time.time()
    counts = np.random.multinomial(nsites, v)
    print 'time to sample multinomial counts:', time.time() - tm
    # time one likelihood evaluation; logp itself is not used afterwards
    tm = time.time()
    logp = StatsUtil.multinomial_log_pmf(v, counts)
    print 'time to get multinomial log pmf:', time.time() - tm
    # fit the parameters to fresh samples, starting from the true values
    for i in range(nsamples):
        counts = np.random.multinomial(nsites, v)
        X0 = np.array(params)
        g = G(N, counts)
        Xopt = optimize.fmin(g, X0)
        arr.append(Xopt)
    print >> out, np.array(arr)
    return out.getvalue()
def LogProbWC(D, Z, sizes, alpha, kappa, nu, sigsq):
    """Score cuts of a cluster tree under the ddCRP model.

    D      -- data matrix passed through to ddCRP.FullProbabilityddCRP
    Z      -- linkage matrix accepted by scipy's fcluster
    sizes  -- sequence of maximum cluster counts to evaluate
    alpha, kappa, nu, sigsq -- model hyperparameters

    Returns an array with one log probability per entry of sizes.
    """
    hyp = ddCRP.ComputeCachedLikelihoodTerms(kappa, nu, sigsq)
    logp = np.zeros(len(sizes))
    for idx, n_clusters in enumerate(sizes):
        labels = cluster.hierarchy.fcluster(
            Z, t=n_clusters, criterion='maxclust')
        # Group element indices by label: sort by label and split
        # wherever the sorted label changes.
        order = np.argsort(labels)
        boundaries = np.flatnonzero(np.diff(np.sort(labels))) + 1
        parcels = np.split(order, boundaries)

        # Formally we should construct a spanning tree within each cluster so
        # that we can evaluate the probability. However, the only property of
        # the "c" links that impacts the probability directly is the number of
        # self-connections. So we simply add the correct number of self-
        # connections (equal to the number of parcels) and leave the rest
        # set to zero
        c = np.zeros(len(labels))
        c[:n_clusters] = np.arange(n_clusters)
        sym = StatsUtil.CheckSymApprox(D)
        logp[idx] = ddCRP.FullProbabilityddCRP(
            D, c, parcels, alpha, hyp, sym)
    return logp
def get_response_content(fs):
    """
    Simulate allele counts, refit the model, and report a likelihood ratio.
    Counts are sampled from the distribution implied by the user-supplied
    parameters. The model is then fitted twice with optimize.fmin: once
    with h free and once with h fixed at 0.5. The report ends with twice
    the difference of the two negative log likelihoods.
    @param fs: object with attributes N_diploid, nsites, nalleles,
    mutation_rate, Ns and h
    @return: the text of the report
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # extract user-supplied parameters
    N_diploid = fs.N_diploid
    nsites = fs.nsites
    nalleles = fs.nalleles
    mutation_rate = fs.mutation_rate
    Ns = fs.Ns
    h = fs.h
    # derive the haploid population size and the fitness ratio
    N_hap = 2 * N_diploid
    N = N_hap
    n = nalleles
    s = fs.Ns / float(N)
    fitness_ratio = 1 - s
    v_small = get_sample_distn(N, n, mutation_rate, fitness_ratio, h)
    # sample from this distribution
    counts = np.random.multinomial(nsites, v_small)
    # score the sample under the true parameters
    negloglik = -StatsUtil.multinomial_log_pmf(v_small, counts)
    #
    print >> out, 'actual mutation rate:', mutation_rate
    print >> out, 'actual fitness ratio:', fitness_ratio
    print >> out, 'actual h:', h
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small
    print >> out, 'negative log likelihood:', negloglik
    print >> out
    #
    # try to estimate the parameters
    # the search starts at the encoded true values
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        StatsUtil.logit(h),
        ], dtype=float)
    g = G(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    # decode the optimum back to the natural parameters
    a, b, c = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = StatsUtil.expit(c)
    v_small_hat = get_sample_distn(N, n, mutation_rate_hat, fitness_ratio_hat, h_hat)
    #
    negloglik_alt = g(Xopt)
    #
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_alt
    print >> out
    #
    # constrain to additive selection
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        ], dtype=float)
    g = G_additive(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    a, b = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = 0.5
    v_small_hat = get_sample_distn(N, n, mutation_rate_hat, fitness_ratio_hat, h_hat)
    #
    negloglik_null = g(Xopt)
    #
    print >> out, '-- inference assuming additive selection (h = 0.5) --'
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_null
    print >> out
    # likelihood ratio statistic comparing the two fits
    D = 2 * (negloglik_null - negloglik_alt)
    print >> out, 'likelihood ratio test statistic:', D
    print >> out, 'chi squared 1-df 0.05 significance threshold:', 3.84
    print >> out
    # the string below is disabled code kept by the original author
    """
    # constrain to additive selection and equal expected mutation
    X0 = np.zeros(1)
    g = G_fit_only(M_large, M_small, counts)
    Xopt = optimize.fmin(g, X0)
    a, = Xopt.tolist()
    mut_ratio_hat = 1.0
    fit_ratio_hat = math.exp(a)
    h_hat = 0.5
    v_small_hat = get_sample_distn(
            M_large, M_small, mut_ratio_hat, fit_ratio_hat, h_hat)
    print >> out, '-- inference assuming additive selection and equal mut --'
    print >> out, 'estim. mut_ratio:', mut_ratio_hat
    print >> out, 'estim. fit_ratio:', fit_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out
    """
    #
    return out.getvalue()
def get_log_likelihood(self, obs):
    """
    Score an observation as the sum of two log likelihoods:
    that of the total count under self.coverage_distribution and the
    multinomial log pmf of the counts under self.distribution.
    @param obs: a sequence of counts
    @return: the log likelihood of the observation
    """
    coverage = sum(obs)
    log_terms = (
            self.coverage_distribution.get_log_likelihood(coverage),
            StatsUtil.multinomial_log_pmf(self.distribution, obs),
            )
    return sum(log_terms)
def ddCRP(D, adj_list, init_c, gt_z, num_passes, alpha, kappa, nu, sigsq,
          stats_interval, verbose):
    """Resample the link of every element for a number of passes.

    D              -- data matrix passed to the likelihood helpers
    adj_list       -- adj_list[i] holds the neighbors element i may link to
    init_c         -- initial link vector; when empty, links are drawn
                      uniformly from each element's neighbors and itself
    gt_z           -- ground-truth labels forwarded to UpdateStats
    num_passes     -- number of passes over all elements
    alpha, kappa, nu, sigsq -- model hyperparameters
    stats_interval -- record statistics every this many steps
    verbose        -- forwarded to UpdateStats

    Returns (map_z, stats): the labelling with the highest log
    probability seen at the start of any step, and the statistics.
    A non-empty init_c is updated in place.
    """
    map_z = np.zeros(np.shape(D)[0])
    stats = {'times': [], 'lp': [], 'NMI': [], 'K': [], 'z': [], 'c': []}
    hyp = ComputeCachedLikelihoodTerms(kappa, nu, sigsq)
    num_el = len(adj_list)

    # Generate random initialization if not specified
    if init_c.size == 0:
        # Integer dtype because the links are used as array indices.
        c = np.zeros(num_el, dtype=int)
        for i in range(num_el):
            # Candidates are the neighbors plus a self-link; the index is
            # drawn from the half-open range [0, len(candidates)).
            candidates = list(adj_list[i]) + [i]
            c[i] = candidates[np.random.randint(len(candidates))]
    else:
        c = init_c

    # Initialize spatial connection matrix
    G = sparse.coo_matrix((np.ones(num_el), (np.arange(num_el), c)),
                          shape=(num_el, num_el))
    K, z, parcels = ConnectedComp(G)

    sym = StatsUtil.CheckSymApprox(D)
    curr_lp = FullProbabilityddCRP(D, c, parcels, alpha, hyp, sym)

    max_lp = -float('inf')
    steps = 0
    # time.clock() was removed in Python 3.8. Prefer it where it still
    # exists so that t0 stays comparable with readings taken from
    # time.clock() elsewhere, and fall back to perf_counter() otherwise.
    clock = getattr(time, 'clock', None) or time.perf_counter
    t0 = clock()
    for curr_pass in range(num_passes):
        order = np.random.permutation(num_el)  # Visit elements randomly
        for i in order:
            # NOTE(review): the state reached by the very last step is
            # never compared against max_lp - confirm this is intended.
            if curr_lp > max_lp:
                max_lp = curr_lp
                map_z = z

            if steps % stats_interval == 0:
                stats = UpdateStats(stats, t0, curr_lp, K, z, c, steps,
                                    gt_z, map_z, verbose)

            # Compute change in log-prob when removing the edge c_i
            CooModifyRow(G, i, -1)
            if c[i] == i:
                # Removing self-loop, parcellation won't change
                rem_delta_lp, z_rem, parcels_rem = -mt.log(alpha), z, parcels
            else:
                K_rem, z_rem, parcels_rem = ConnectedComp(G)
                if K_rem != K:
                    # We split a cluster, compute change in likelihood
                    rem_delta_lp = -LikelihoodDiff(D, parcels_rem,
                                                   z_rem[i], z_rem[c[i]],
                                                   hyp, sym)
                else:
                    rem_delta_lp = 0

            # Compute change in log-prob for each possible edge c_i;
            # the last entry of lp is the self-link.
            adj_list_i = adj_list[i]
            lp = np.zeros((len(adj_list_i) + 1))
            lp[len(adj_list_i)] = mt.log(alpha)
            cached_merge = -1 * np.ones(len(adj_list_i), dtype=np.int32)
            for n_ind in range(len(adj_list_i)):
                n = adj_list_i[n_ind]
                if z_rem[n] == z_rem[c[i]]:
                    # Just undoing edge removal
                    lp[n_ind] = -rem_delta_lp - (c[i] == i) * mt.log(alpha)
                elif z_rem[n] != z_rem[i]:
                    # Proposing merge
                    # First check cache to see if this is already computed
                    prev_lp = np.flatnonzero(cached_merge == z_rem[n])
                    if prev_lp.size > 0:
                        lp[n_ind] = lp[prev_lp[0]]
                    else:
                        # This is a novel merge, compute change in likelihood
                        lp[n_ind] = LikelihoodDiff(D, parcels_rem,
                                                   z_rem[i], z_rem[n],
                                                   hyp, sym)
                        cached_merge[n_ind] = z_rem[n]

            # Pick new edge proportional to probability
            new_neighbor = ChooseFromLP(lp)
            if new_neighbor < len(adj_list_i):
                c[i] = adj_list_i[new_neighbor]
            else:
                c[i] = i

            # Update likelihood and parcellation
            curr_lp = curr_lp + rem_delta_lp + lp[new_neighbor]
            CooModifyRow(G, i, c[i])
            K, z, parcels = ConnectedComp(G)
            steps = steps + 1

    stats = UpdateStats(stats, t0, curr_lp, K, z, c, steps,
                        gt_z, map_z, verbose)
    return (map_z, stats)
def get_response_content(fs):
    """
    Simulate allele counts, refit the model, and report a likelihood ratio.
    Counts are sampled from the distribution implied by the user-supplied
    parameters. The model is then fitted twice with optimize.fmin: once
    with h free and once with h fixed at 0.5. The report ends with twice
    the difference of the two negative log likelihoods.
    @param fs: object with attributes N_diploid, nsites, nalleles,
    mutation_rate, Ns and h
    @return: the text of the report
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # extract user-supplied parameters
    N_diploid = fs.N_diploid
    nsites = fs.nsites
    nalleles = fs.nalleles
    mutation_rate = fs.mutation_rate
    Ns = fs.Ns
    h = fs.h
    # derive the haploid population size and the fitness ratio
    N_hap = 2 * N_diploid
    N = N_hap
    n = nalleles
    s = fs.Ns / float(N)
    fitness_ratio = 1 - s
    v_small = get_sample_distn(N, n, mutation_rate, fitness_ratio, h)
    # sample from this distribution
    counts = np.random.multinomial(nsites, v_small)
    # score the sample under the true parameters
    negloglik = -StatsUtil.multinomial_log_pmf(
            v_small, counts)
    #
    print >> out, 'actual mutation rate:', mutation_rate
    print >> out, 'actual fitness ratio:', fitness_ratio
    print >> out, 'actual h:', h
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small
    print >> out, 'negative log likelihood:', negloglik
    print >> out
    #
    # try to estimate the parameters
    # the search starts at the encoded true values
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        StatsUtil.logit(h),
        ], dtype=float)
    g = G(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    # decode the optimum back to the natural parameters
    a, b, c = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = StatsUtil.expit(c)
    v_small_hat = get_sample_distn(
            N, n, mutation_rate_hat, fitness_ratio_hat, h_hat)
    #
    negloglik_alt = g(Xopt)
    #
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_alt
    print >> out
    #
    # constrain to additive selection
    X0 = np.array([
        StatsUtil.logit(mutation_rate),
        math.log(fitness_ratio),
        ], dtype=float)
    g = G_additive(N, n, counts)
    Xopt = optimize.fmin(g, X0)
    a, b = Xopt.tolist()
    mutation_rate_hat = StatsUtil.expit(a)
    fitness_ratio_hat = math.exp(b)
    h_hat = 0.5
    v_small_hat = get_sample_distn(
            N, n, mutation_rate_hat, fitness_ratio_hat, h_hat)
    #
    negloglik_null = g(Xopt)
    #
    print >> out, '-- inference assuming additive selection (h = 0.5) --'
    print >> out, 'estim. mutation rate:', mutation_rate_hat
    print >> out, 'estim. fitness ratio:', fitness_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out, 'negative log likelihood:', negloglik_null
    print >> out
    # likelihood ratio statistic comparing the two fits
    D = 2*(negloglik_null - negloglik_alt)
    print >> out, 'likelihood ratio test statistic:', D
    print >> out, 'chi squared 1-df 0.05 significance threshold:', 3.84
    print >> out
    # the string below is disabled code kept by the original author
    """
    # constrain to additive selection and equal expected mutation
    X0 = np.zeros(1)
    g = G_fit_only(M_large, M_small, counts)
    Xopt = optimize.fmin(g, X0)
    a, = Xopt.tolist()
    mut_ratio_hat = 1.0
    fit_ratio_hat = math.exp(a)
    h_hat = 0.5
    v_small_hat = get_sample_distn(
            M_large, M_small, mut_ratio_hat, fit_ratio_hat, h_hat)
    print >> out, '-- inference assuming additive selection and equal mut --'
    print >> out, 'estim. mut_ratio:', mut_ratio_hat
    print >> out, 'estim. fit_ratio:', fit_ratio_hat
    print >> out, 'estim. h:', h_hat
    print >> out, 'implied finite polymorphic diallelic distribution:'
    print >> out, v_small_hat
    print >> out
    """
    #
    return out.getvalue()
def ClusterTree(D, adj_list):
    """
    Build a merge tree by repeatedly joining the closest adjacent clusters.

    Rows of D (or of [D, D^T] when StatsUtil.CheckSymApprox(D) is false)
    are the points; the dissimilarity is the squared euclidean distance
    between rows. Only pairs flagged as connected may be merged. If no
    connected pair is left before the tree is complete, every pair of
    remaining clusters is made connected so that the tree can be finished.

    NOTE(review): the distance update has the form of the Lance-Williams
    recurrence for Ward linkage, and Z is laid out like a scipy linkage
    matrix -- confirm both against the callers.

    @param D: 2d array with one row per element
    @param adj_list: sequence where adj_list[i] indexes the neighbors of i
    @return: array Z with one row per merge, holding the two merged
    cluster labels, the square root of their merge dissimilarity,
    and the number of points in the new cluster
    """
    if StatsUtil.CheckSymApprox(D):
        X = D
    else:
        X = np.concatenate((D, D.transpose()), axis=1)

    # Compute squared euclidean distance Y between rows
    Qx = np.tile(np.linalg.norm(X, axis=1)**2, (X.shape[0], 1))
    Y = Qx + Qx.transpose() - 2*np.dot(X, X.transpose())
    Y = spatial.distance.squareform(Y, checks=False)
    Y[Y < 0] = 0  # Correct for numerical errors in very similar rows

    # Construct adjacency matrix
    N = len(adj_list)
    A = np.zeros([N, N], dtype=bool)
    for i in range(N):
        A[i, adj_list[i]] = True
    connected = spatial.distance.squareform(A).astype(bool)

    # Initialize all data structures
    valid_clusts = np.ones(N, dtype=bool)  # which clusters still remain
    col_limits = np.cumsum(
        np.concatenate((np.array([N-2]), np.arange(N-2, 0, -1))))

    # During updating clusters, cluster index is constantly changing, R is
    # a index vector mapping the original index to the current (row, column)
    # index in Y. C denotes how many points are contained in each cluster.
    # The int() cast keeps m usable as an array size and range bound even
    # where ceil returns a float.
    m = int(mt.ceil(mt.sqrt(2*Y.shape[0])))
    C = np.zeros(2*m-1)
    C[0:m] = 1
    R = np.arange(m)
    all_inds = np.arange(Y.shape[0])
    # pairs of adjacent clusters that can be merged
    conn_inds = all_inds[connected]
    Z = np.zeros([m-1, 4])

    for s in range(m-1):
        if conn_inds.size == 0:
            # The graph was disconnected (e.g. two hemispheres)
            # Just add all connections to finish up cluster tree
            # Keep connected boolean and conn_inds integer: they are
            # combined with & and ~ and used to index Y further down.
            connected = np.zeros(len(connected), dtype=bool)
            conn_inds = np.array([], dtype=int)
            valid_clust_inds = np.flatnonzero(valid_clusts)
            for i in valid_clust_inds:
                # Work on a copy so that valid_clusts itself is untouched;
                # masking i in place would clear every flag by loop end.
                U = valid_clusts.copy()
                U[i] = False
                new_conns = PdistInds(i, N, U)
                connected[new_conns] = True
                conn_inds = np.concatenate((conn_inds, new_conns))
            # Each pair is reached from both of its members.
            conn_inds = np.unique(conn_inds)

        # Find closest pair of clusters
        v = np.amin(Y[conn_inds])
        k = conn_inds[np.argmin(Y[conn_inds])]
        j = np.where(k <= col_limits)[0][0]
        i = N - (col_limits[j] - k) - 1
        Z[s, 0:3] = np.array([R[i], R[j], v])  # Add row to output linkage

        # Update Y with this new cluster i containing old clusters i and j
        # U is the same array as valid_clusts (not a copy). Entry i is
        # switched back on below, so the net change is only entry j.
        U = valid_clusts
        U[np.array([i, j])] = 0
        I = PdistInds(i, N, U)
        J = PdistInds(j, N, U)
        Y[I] = (
            (C[R[U]]+C[R[i]])*Y[I]
            + (C[R[U]]+C[R[j]])*Y[J]
            - C[R[U]]*v)/(C[R[i]]+C[R[j]]+C[R[U]])

        # Add j's connections to new cluster i
        new_conns = connected[J] & ~connected[I]
        connected[I] = connected[I] | new_conns
        conn_inds = np.sort(np.concatenate((conn_inds, I[new_conns])))

        # Remove all of j's connections from conn_inds and connected
        U[i] = 1
        J = PdistInds(j, N, U)
        conn_inds = conn_inds[
            np.in1d(conn_inds, J, assume_unique=True, invert=True)]
        connected[J] = np.zeros(len(J))
        valid_clusts[j] = 0

        # update m, N, R
        C[m+s] = C[R[i]] + C[R[j]]
        Z[s, 3] = C[m+s]
        R[i] = m+s

    Z[:, 2] = np.sqrt(Z[:, 2])
    return Z
def get_log_likelihood(self, observation):
    """
    Each of the four counts is scored by StatsUtil.geometric_log_pmf.
    The probability parameter is 1/(mu+1), where mu is one quarter
    of the expected coverage.
    @param observation: a vector of four counts
    @return: the sum of the four log pmf values
    """
    if len(observation) != 4:
        raise ValueError('expected the observation to be a vector of four integers')
    mean_per_count = self.expected_coverage / 4.0
    success_prob = 1 / (mean_per_count + 1)
    log_terms = [
        StatsUtil.geometric_log_pmf(count, success_prob)
        for count in observation]
    return sum(log_terms)
def get_likelihood(self, obs):
    """
    @param obs: the observation passed to StatsUtil.poisson_log_pmf
    @return: exp of the log pmf of obs given self.expectation
    """
    log_likelihood = StatsUtil.poisson_log_pmf(obs, self.expectation)
    return math.exp(log_likelihood)