def get_heuristics(M, R):
    """
    Build a short text report comparing two rate matrices by four heuristics.
    The heuristics do not depend on time or on the information variant.
    Each heuristic is reduced to a sign that is handed to _heuristic_helper:
    the entropy signs are sign(value for M - value for R) and
    the rate signs are sign(value for R - value for M).
    So a positive sign always means that the heuristic favors M.
    @param M: pure mutation rate matrix
    @param R: mutation-selection balance rate matrix
    @return: multiline string
    """
    matrices = (M, R)
    # Stationary distributions from the default method,
    # cross-checked against the nonspectral method.
    M_distn, R_distn = [mrate.R_to_distn(Q) for Q in matrices]
    alt_distns = [mrate.R_to_distn_nonspectral(Q) for Q in matrices]
    for distn, alt_distn in zip((M_distn, R_distn), alt_distns):
        if not np.allclose(distn, alt_distn):
            raise ValueError(
                    'internal stationary distribution calculation error')

    def shannon(distn):
        # Shannon entropy in nats
        return -sum(p * math.log(p) for p in distn)

    def logical(distn):
        # logical entropy: one minus the sum of squared probabilities
        return 1 - sum(p * p for p in distn)

    # entropy heuristics: mutation minus balance
    shannon_sign = np.sign(shannon(M_distn) - shannon(R_distn))
    logical_sign = np.sign(logical(M_distn) - logical(R_distn))
    # rate heuristics: balance minus mutation
    M_expected = mrate.Q_to_expected_rate(M)
    R_expected = mrate.Q_to_expected_rate(R)
    expected_sign = np.sign(R_expected - M_expected)
    M_spectral = 1 / mrate.R_to_relaxation_time(M)
    R_spectral = 1 / mrate.R_to_relaxation_time(R)
    spectral_sign = np.sign(R_spectral - M_spectral)
    # Assemble the report, one three-line paragraph per heuristic.
    conclusion = 'suggests more information about divergence time.'
    report = (
            ('Greater Shannon entropy of the stationary distribution',
                shannon_sign),
            ('Greater logical entropy of the stationary distribution',
                logical_sign),
            ('Smaller expected rate', expected_sign),
            ('Smaller spectral rate', spectral_sign),
            )
    lines = []
    for premise, sign in report:
        lines.append(premise + ' ' + conclusion)
        lines.append('%s' % (_heuristic_helper(sign),))
        lines.append('')
    return '\n'.join(lines).strip()
def get_rate_matrix_summary(Q):
    """
    Describe a rate matrix as a multiline string.
    The report has a heading followed by a value for the matrix itself,
    its detailed balance error, its stationary distribution,
    its expected rate, its relaxation time,
    and the product of the expected rate and the relaxation time.
    @param Q: a rate matrix
    @return: multiline string without trailing whitespace
    """
    stationary = mrate.R_to_distn(Q)
    expected_rate = mrate.Q_to_expected_rate(Q)
    relaxation_time = mrate.R_to_relaxation_time(Q)
    sections = [
            ('rate matrix:', Q),
            ('this should be near zero for detailed balance:',
                get_detailed_balance_error(Q)),
            ('computed stationary distribution:', stationary),
            ('expected rate:', expected_rate),
            ('relaxation time', relaxation_time),
            ('(expected rate) * (relaxation time):',
                expected_rate * relaxation_time),
            ]
    # each section is a heading line, a value, and a blank separator line
    paragraphs = ['%s\n%s\n\n' % section for section in sections]
    return ''.join(paragraphs).rstrip()
def __init__(self, Q):
    """
    Store a rate matrix together with a few derived quantities.
    The key time points are computed by ctmcmitaylor.get_key_time_points
    from lam, p and N, and are stored as
    time_to_uniformity and time_to_usefulness.
    @param Q: a rate matrix
    """
    relaxation_time = mrate.R_to_relaxation_time(Q)
    # smallest stationary probability
    min_prob = min(mrate.R_to_distn(Q))
    nstates = len(Q)
    # negative reciprocal of the relaxation time
    lam = -1 / relaxation_time
    self.Q = Q
    self.relaxation_time = relaxation_time
    self.p = min_prob
    self.N = nstates
    self.lam = lam
    self.time_to_uniformity, self.time_to_usefulness = (
            ctmcmitaylor.get_key_time_points(lam, min_prob, nstates))
def __init__(self, M, t):
    """
    Remember the mutation process and cache two of its properties.
    The cached properties are the stationary distribution (self.v)
    and the relaxation time (self.r_mut) of M.
    @param M: mutation matrix
    @param t: the distance to go in the requested direction
    """
    self.M, self.t = M, t
    # stationary distribution of the mutation process
    self.v = mrate.R_to_distn(M)
    # relaxation time of the mutation process
    self.r_mut = mrate.R_to_relaxation_time(M)
def __call__(self, X):
    """
    Evaluate the objective function for a candidate direction.
    The mutation stationary distribution is moved a fraction self.t
    of the way towards the distribution defined by X,
    and a rate matrix with this new stationary distribution
    is built from self.M using mrate.to_gtr_halpern_bruno.
    @param X: a vector to be converted into a finite distribution
    @return: mutation relaxation time minus the new relaxation time
    """
    target_distn = X_to_distn(X)
    blended = (1 - self.t) * self.v + self.t * target_distn
    R = mrate.to_gtr_halpern_bruno(self.M, blended)
    if np.allclose(blended, mrate.R_to_distn(R)):
        # The caller minimizes this difference,
        # so a longer relaxation time of R gives a better value.
        return self.r_mut - mrate.R_to_relaxation_time(R)
    raise ValueError('stationary distribution error')
def get_response_content(fs):
    """
    Try to increase the relaxation time of a random rate matrix.
    A random initial matrix is built and then, for up to 20 rounds,
    its stationary distribution is nudged by a step of size t
    in the direction found by a simplex search over MyOpt.
    The search stops early when the optimizer reports a positive optimum.
    @param fs: an object whose nstates attribute is the number of states
    @return: the log of the search as a multiline string
    """
    np.set_printoptions(linewidth=200)
    chunks = []

    def emit(*items):
        # append one space-separated output line
        chunks.append(' '.join(['%s' % (item,) for item in items]) + '\n')

    nstates = fs.nstates
    t = 0.001
    # sample the initial mutation rate matrix
    S = sample_symmetric_rate_matrix(nstates)
    v = sample_distribution(nstates)
    M = mrate.to_gtr_halpern_bruno(S, v)
    if not np.allclose(v, mrate.R_to_distn(M)):
        raise ValueError('stationary distribution error')
    emit('t:', t)
    emit()
    emit('initial GTR matrix:')
    emit(M)
    emit()
    # Repeatedly apply Halpern-Bruno selection,
    # each time starting from the matrix of the previous round.
    R = M
    v_current = v
    for _round in range(20):
        # report the current distribution and relaxation time
        emit(v_current)
        emit(mrate.R_to_relaxation_time(R))
        emit()
        objective = MyOpt(R, t)
        start = [1.0] * (nstates - 1)
        result = scipy.optimize.fmin(
                objective, start, disp=0, full_output=1, ftol=0.000001)
        xopt, fopt, _niters, _funcalls, _warnflag = result
        if fopt > 0:
            emit('failed to increase relaxation time')
            emit()
            break
        # step towards the distribution found by the optimizer
        v_next = (1 - t) * v_current + t * X_to_distn(xopt)
        emit(v_next - v_current)
        emit()
        # rebuild the rate matrix and verify its stationary distribution
        R = mrate.to_gtr_halpern_bruno(R, v_next)
        if not np.allclose(v_next, mrate.R_to_distn(R)):
            raise ValueError('stationary distribution error')
        v_current = v_next
    emit('final rate matrix:')
    emit(R)
    emit()
    return ''.join(chunks)
def get_statistic_ratios(Q_mut, Q_sels):
    """
    Compare each mutation-selection matrix to the mutation matrix.
    The first returned list has the expected rate ratios.
    The third returned list has the ratios of
    reciprocal relaxation times.
    The second returned list is, for each selection matrix,
    the entry of the third list divided by the entry of the first list.
    @param Q_mut: mutation rate matrix
    @param Q_sels: mutations-selection balance rate matrices
    @return: ER_ratios, NSR_ratios, ER_NSR_ratios
    """
    # expected rates relative to the mutation process
    expected_mut = mrate.Q_to_expected_rate(Q_mut)
    expected_sels = [mrate.Q_to_expected_rate(Q) for Q in Q_sels]
    ER_ratios = [expected / expected_mut for expected in expected_sels]
    # reciprocal relaxation times relative to the mutation process
    spectral_mut = 1 / mrate.R_to_relaxation_time(Q_mut)
    spectral_sels = [1 / mrate.R_to_relaxation_time(Q) for Q in Q_sels]
    ER_NSR_ratios = [spectral / spectral_mut for spectral in spectral_sels]
    # quotient of the two kinds of ratios
    NSR_ratios = [
            spectral_ratio / expected_ratio
            for spectral_ratio, expected_ratio
            in zip(ER_NSR_ratios, ER_ratios)]
    return ER_ratios, NSR_ratios, ER_NSR_ratios
def test_large_variance(self): n = 4 v = sample_distribution(n) S = sample_symmetric_rate_matrix(n) R = mrate.to_gtr_halpern_bruno(S, v) """ a = .1 b = .2 c = .7 R = np.array([ [-(b+c), b, c], [a, -(a+c), c], [a, b, -(a+b)]]) """ t = 2.0 dt = 0.0000001 rtime = mrate.R_to_relaxation_time(R) var_a = get_ml_variance(R, t) var_b = get_ml_variance(R, t + dt) var_slope = (var_b - var_a) / dt deriv_ratio = get_p_id_deriv_ratio(R, t) clever_ratio = get_ml_variance_ratio(R, t) print 'time:', t print 'variance:', var_a print 'variance slope:', var_slope print 'var_slope / var_a:', var_slope / var_a print 'var_slope / var_a [clever]:', clever_ratio print 'log variance:', math.log(var_a) print 'relaxation time:', rtime print '2 / relaxation_time:', 2 / rtime print "p_id(t)'' / p_id(t)':", deriv_ratio print print '--- new attempt ---' print 'mutual information:', ctmcmi.get_mutual_information(R, t) print 'reciprocal of MI:', 1.0 / ctmcmi.get_mutual_information(R, t) print 'asymptotic variance:', get_asymptotic_variance(R, t) print 'asymptotic variance (ver. 2):', get_asymptotic_variance_b(R, t) print 'asymptotic variance (ver. 3):', get_asymptotic_variance_c(R, t) print 'AV approx (ver. 4):', get_asymptotic_variance_d(R, t) print 'AV approx (ver. 5):', get_asymptotic_variance_e(R, t) print print '--- another thing ---' fi_slow = get_fisher_info_known_distn(R, v, t) fi_fast = get_fisher_info_known_distn_fast(R, v, t) print 'slow asymptotic variance:', 1 / fi_slow print 'fast asymptotic variance:', 1 / fi_fast print
def get_rate_matrix_summary(Q):
    """
    Describe a rate matrix and some of its connectivity statistics.
    The report has the matrix itself, the reciprocal of its
    relaxation time, its local Cheeger ratio bound,
    and its Cheeger constant.
    The Cheeger constant is computed only when len(Q) is less than 16;
    otherwise a placeholder message is reported instead.
    @param Q: a rate matrix
    @return: multiline string without trailing whitespace
    """
    relaxation_time = mrate.R_to_relaxation_time(Q)
    cheeger_bound = get_local_cheeger_ratio_bound(Q)
    if len(Q) < 16:
        cheeger_text = str(get_real_cheeger(Q))
    else:
        cheeger_text = 'takes too long to compute'
    sections = [
            ('rate matrix:', Q),
            ('algebraic connectivity (all hypothesized to be <= 2)',
                1 / relaxation_time),
            ('local cheeger ratio bound (all hypothesized to be <= 1):',
                cheeger_bound),
            ('actual Cheeger constant (all hypothesized to be <= 1):',
                cheeger_text),
            ]
    # each section is a heading line, a value, and a blank separator line
    paragraphs = ['%s\n%s\n\n' % section for section in sections]
    return ''.join(paragraphs).rstrip()
def __call__(self):
    """
    Sample a random mutation process and check it against a criterion.
    The criterion currently in force compares the index selected by maxind
    for abs(fiedler / v) with the one for abs(fiedler / sqrt(v));
    a mismatch counts as a counterexample.
    The relaxation time comparison is computed but is not used
    to decide the result.
    When a counterexample is found, the sampled quantities are
    stored on self before returning.
    @return: True if a counterexample is found
    """
    n = self.nstates
    # sample a fairly generic GTR mutation rate matrix
    S = sample_symmetric_rate_matrix(n)
    v = sample_distribution(n)
    M = mrate.to_gtr_halpern_bruno(S, v)
    # find the states with extreme entries in the fiedler-like eigenvector
    r_recip, fiedler = mrate._R_to_eigenpair(M)
    r_mut = 1 / r_recip
    entries = [(fiedler[i], i) for i in range(n)]
    state_min = min(entries)[1]
    state_max = max(entries)[1]
    # move the stationary distribution towards a 50/50 distribution
    # on the two extreme states
    v_target = np.zeros(n)
    v_target[state_min] = 0.5
    v_target[state_max] = 0.5
    v_new = (1 - self.t) * v + self.t * v_target
    R = mrate.to_gtr_halpern_bruno(M, v_new)
    r_sel = mrate.R_to_relaxation_time(R)
    # decide whether this sample is a counterexample
    index_plain = maxind(np.abs(fiedler / v))
    index_sqrt = maxind(np.abs(fiedler / np.sqrt(v)))
    if not (index_plain != index_sqrt):
        return False
    self.M = M
    self.fiedler = fiedler
    self.r_mut = r_mut
    self.r_sel = r_sel
    self.v = v
    self.v_new = v_new
    self.v_target = v_target
    # this is computed last because it may read the attributes set above
    self.opt_target = self._get_opt_target()
    return True
def get_time_point_summary(Q_mut, Q_sels, t):
    """
    Compute summary statistics for a single time point.
    The returned list has the following 22 entries, in this order:
    t,
    the mutual information (MI) of the mutation process,
    the max, high, mean, low and min MI of the selection processes,
    five correlations, five sign agreement proportions,
    and five informativenesses.
    The last three groups have one entry for each of five functions
    which are evaluated on every selection matrix:
    three MI approximations from ctmcmi,
    exp(-2*t / relaxation time), and exp(-t * expected rate).
    The high and low entries are the values n_extreme places from
    the top and from the bottom of the sorted selection MI values,
    where n_extreme is a twentieth of the number of matrices
    but is at least one.
    @param Q_mut: the mutation rate matrix
    @param Q_sels: nonempty sequence of mutation-selection rate matrices
    @param t: the time point under consideration
    @return: a sequence of statistics
    """
    # First compute the mutual information for mut and mut-sel.
    nsels = len(Q_sels)
    mi_mut = ctmcmi.get_mutual_information(Q_mut, t)
    mi_sels = [ctmcmi.get_mutual_information(Q, t) for Q in Q_sels]
    mi_signs = [1 if mi_sel > mi_mut else -1 for mi_sel in mi_sels]
    # Now compute some other functions
    v0 = [ctmcmi.get_mutual_information_small_approx_c(Q, t) for Q in Q_sels]
    v1 = [ctmcmi.get_mutual_information_small_approx(Q, t) for Q in Q_sels]
    v2 = [ctmcmi.get_mutual_information_approx_c(Q, t) for Q in Q_sels]
    v3 = [math.exp(-2*t/mrate.R_to_relaxation_time(Q)) for Q in Q_sels]
    v4 = [math.exp(-t*mrate.Q_to_expected_rate(Q)) for Q in Q_sels]
    vectors = (v0, v1, v2, v3, v4)
    # NOTE(review): every function value is compared to the mutual
    # information of the mutation process itself, not to the value of
    # the same function on Q_mut -- confirm that this is intended.
    sign_vectors = [
            [1 if value > mi_mut else -1 for value in v] for v in vectors]
    statistics = [t, mi_mut]
    # add the mutual information statistics
    sorted_mi = sorted(mi_sels)
    # With fewer than 20 matrices the quotient is zero.
    # An offset of zero would make sorted_mi[-0] the minimum and
    # sorted_mi[-1] the maximum, swapping the high and low statistics,
    # so the offset is clamped to at least one.
    n_extreme = max(1, nsels // 20)
    statistics.append(sorted_mi[-1])
    statistics.append(sorted_mi[-n_extreme])
    statistics.append(sum(sorted_mi) / nsels)
    statistics.append(sorted_mi[n_extreme-1])
    statistics.append(sorted_mi[0])
    # add the correlations
    for v in vectors:
        statistics.append(scipy.stats.pearsonr(v, mi_sels)[0])
    # add the sign proportions
    for v_signs in sign_vectors:
        nagree = sum(1 for a, b in zip(mi_signs, v_signs) if a == b)
        statistics.append(float(nagree) / nsels)
    # Add the informativenesses.
    # Each one is the mutual information between the signs of a function
    # and the signs of the true mutual information differences,
    # estimated from the empirical joint counts.
    total = float(nsels)
    for v_signs in sign_vectors:
        informativeness = 0
        for pair in ((1, 1), (1, -1), (-1, 1), (-1, -1)):
            joint_count = sum(1 for x in zip(v_signs, mi_signs) if x == pair)
            # an empty cell contributes nothing to the sum
            if not joint_count:
                continue
            v_value, m_value = pair
            v_marginal_count = v_signs.count(v_value)
            m_marginal_count = mi_signs.count(m_value)
            joint_prob = joint_count / total
            a = math.log(joint_prob)
            b = math.log(v_marginal_count / total)
            c = math.log(m_marginal_count / total)
            informativeness += joint_prob * (a - b - c)
        statistics.append(informativeness)
    return statistics
def sample_row():
    """
    Sample one row of a data table about a random pair of processes.
    A four state mutation process and a mutation-selection balance
    process derived from it are sampled together with a time
    and an information type.
    The row has 16 definition entries
    (six exchangeabilities, four mutation stationary probabilities,
    four balance stationary probabilities, the information type and
    the time), then 12 summary statistics, and finally a label
    which tells which process has more information of the sampled type.
    @return: a list of row entries
    """
    n = 4
    # Sample the exchangeabilities into the strict lower triangle, row by row.
    S = np.zeros((n, n))
    lower_indices = [(i, j) for i in range(1, n) for j in range(i)]
    for i, j in lower_indices:
        S[i, j] = random.expovariate(1)
    # sample the mutation stationary distribution
    mdistn = np.array([random.expovariate(1) for _ in range(n)])
    mdistn /= np.sum(mdistn)
    # sample the mutation selection balance stationary distribution
    bdistn = np.array([random.expovariate(1) for _ in range(n)])
    bdistn /= np.sum(bdistn)
    # sample the time and the info type
    t = random.expovariate(1)
    infotype = random.choice(('infotype.mi', 'infotype.fi'))
    # Build the mutation rate matrix from the symmetrized exchangeabilities,
    # with the diagonal chosen so that the rows sum to zero,
    # and then derive the balance rate matrix from it.
    S = S + S.T
    M = S * mdistn
    M -= np.diag(np.sum(M, axis=1))
    R = mrate.to_gtr_halpern_bruno(M, bdistn)
    # entropies of the two stationary distributions
    shannon_mut = -sum(p * log(p) for p in mdistn)
    shannon_bal = -sum(p * log(p) for p in bdistn)
    logical_mut = 1.0 - sum(p * p for p in mdistn)
    logical_bal = 1.0 - sum(p * p for p in bdistn)
    # rates of the two processes
    expected_mut = mrate.Q_to_expected_rate(M)
    expected_bal = mrate.Q_to_expected_rate(R)
    spectral_mut = 1 / mrate.R_to_relaxation_time(M)
    spectral_bal = 1 / mrate.R_to_relaxation_time(R)
    # information content of the two processes at the sampled time
    mi_mut = ctmcmi.get_mutual_information(M, t)
    mi_bal = ctmcmi.get_mutual_information(R, t)
    fi_mut = divtime.get_fisher_information(M, t)
    fi_bal = divtime.get_fisher_information(R, t)
    # every summary statistic is a balance value minus a mutation value
    summary_entries = [
            shannon_bal - shannon_mut,
            logical_bal - logical_mut,
            log(shannon_bal) - log(shannon_mut),
            log(logical_bal) - log(logical_mut),
            expected_bal - expected_mut,
            spectral_bal - spectral_mut,
            log(expected_bal) - log(expected_mut),
            log(spectral_bal) - log(spectral_mut),
            mi_bal - mi_mut,
            fi_bal - fi_mut,
            math.log(mi_bal) - math.log(mi_mut),
            math.log(fi_bal) - math.log(fi_mut),
            ]
    # the definition entries are the sampled inputs
    definition_entries = [S[i, j] for i, j in lower_indices]
    definition_entries.extend(mdistn)
    definition_entries.extend(bdistn)
    definition_entries.extend([infotype, t])
    # The label compares the information of the sampled type;
    # the infotype is always one of the two values handled here.
    if infotype == 'infotype.mi':
        info_mut, info_bal = mi_mut, mi_bal
    else:
        info_mut, info_bal = fi_mut, fi_bal
    if info_mut > info_bal:
        label = 'mut.is.better'
    elif info_mut < info_bal:
        label = 'bal.is.better'
    else:
        label = 'indistinguishable'
    return definition_entries + summary_entries + [label]