def profile_sig2(discrete=True, k=4):
    """Collect singular-value statistics of split sub-blocks on random trees.

    For 10 repetitions, each tree size ``m`` in ``[50, 100]`` and each sample
    count ``n`` in ``[1_000, 10_000]``, a random reference tree is built, data
    is generated down the tree from random root data, and a leaf-by-leaf
    matrix is computed from the observations.  Every nonterminal's
    ``taxa_set`` is recorded as a real split, together with two perturbed
    splits per nonterminal obtained by XOR-ing it with the ``taxa_set`` of a
    randomly chosen nonterminal.  For each split ``A`` the sub-block
    ``M[A, ~A]`` is decomposed with an SVD.

    Progress is printed once per ``(rep, m, n)`` combination.

    Args:
        discrete: If true, use ``random_discrete_tree`` with the symmetric
            transition and ``similarity_matrix``; otherwise use
            ``random_gaussian_tree`` with the linear transition and
            ``np.corrcoef``.
        k: Passed to ``random_discrete_tree`` and used as ``a`` in
            ``np.random.choice`` when drawing root data.  Unused when
            ``discrete`` is false.

    Returns:
        A list of dicts, one per split whose sub-block has more than one row
        and more than one column, with keys ``"|A|"``, ``"|A|/|T|"``, ``"n"``,
        ``"m"``, ``"mn"``, ``"s1"``, ``"s2"``, ``"s23"``, ``"frob"`` and
        ``"real"``.
    """
    n_reps = 10
    leaf_counts = [50, 100]
    sample_counts = [1_000, 10_000]
    total_iters = n_reps * len(leaf_counts) * len(sample_counts)

    records = []
    finished = 0
    for _ in range(n_reps):
        for m in leaf_counts:
            if discrete:
                ref_tree = random_discrete_tree(m, 1, k)
                trans = NoahClade.NoahClade.gen_symmetric_transition
            else:
                ref_tree = random_gaussian_tree(m, 1, std_bounds=(0.1, 0.3))
                trans = NoahClade.NoahClade.gen_linear_transition
            # ref_tree.root.ascii()

            # The same tree is reused for every sample count; only its data
            # is regenerated.
            for n in sample_counts:
                if discrete:
                    root_data = np.random.choice(a=k, size=n)
                    noise = {"proba_bounds": (0.75, 0.95)}
                else:
                    root_data = np.random.uniform(0, 1, n)
                    noise = {"std_bounds": (0.1, 0.3)}
                ref_tree.root.gen_subtree_data(root_data, trans, **noise)

                obs, labels = ref_tree.root.observe()
                ref_tree.root.reset_taxasets(labels)
                sim = similarity_matrix(obs) if discrete else np.corrcoef(obs)

                internal = ref_tree.get_nonterminals()
                candidates = []
                for node in internal:
                    # The split induced by the node itself is a real one ...
                    candidates.append((node.taxa_set, True))
                    # ... followed by two perturbed copies labelled as fake.
                    candidates.extend(
                        (node.taxa_set ^ random.choice(internal).taxa_set, False)
                        for _ in range(2)
                    )

                for mask, is_real in candidates:
                    block = sim[np.ix_(mask, ~mask)]
                    n_rows, n_cols = block.shape
                    # At least two singular values are needed for s2.
                    if n_rows <= 1 or n_cols <= 1:
                        continue
                    sing = np.linalg.svd(block, compute_uv=False)
                    # Size of the smaller side of the split (the original
                    # also experimented with plain ``A.sum()``).
                    smaller_side = min(mask.sum(), (~mask).sum())
                    records.append({
                        "|A|": smaller_side,
                        "|A|/|T|": smaller_side / m,
                        "n": n,
                        "m": m,
                        "mn": "{0},{1}".format(m, n),
                        "s1": sing[0],
                        "s2": sing[1],
                        "s23": sing[1:].sum(),
                        "frob": np.linalg.norm(block),
                        "real": is_real,
                    })

                finished += 1
                print("{0} / {1}".format(finished, total_iters))
    return records
# NOTE(review): exploratory scratch statements.  Most names used here (T_xA,
# T_Ay, Pxy, P_A, Pxy_matrix, P_A_inv, Pxy_norm, n, m, k, obs, ref_tree, ...)
# are not defined in the visible part of the file, so this section presumably
# relies on interactive session state -- confirm before running top to bottom.

# Bare expressions: each is evaluated and its value discarded.
T_xA*T_Ay
Pxy
T_Ax*P_A*T_Ay
P_Ax*P_Ay/P_A
P_Ax*P_Ay/(0.25)**4
# how are these different???
n*Pxy_matrix
n*P_Ax_matrix.dot(P_A_inv).dot(P_Ay_matrix.T)

# Print the top-left 5x5 corner of three similarity computations so they can
# be compared by eye.
print("="*30)
print(exact_similarity(ref_tree)[:5,:5])
print("+"*20)
print(similarity_matrix(obs)[:5,:5])
print("+"*20)
print(factored_similarity(ref_tree)[:5,:5])

# The results of the next two expressions are not stored or printed.
np.allclose(factored_similarity2(ref_tree), factored_similarity(ref_tree))
normalized_similarity_matrix(obs) - exact_similarity(ref_tree)

# Distance between rows 0 and 1 of obs; pdist returns a single entry for two
# rows.  NOTE(review): pdist is presumably scipy.spatial.distance.pdist.
hamming = pdist(obs[(0,1),:], metric='hamming')[0]
# NOTE(review): looks like a closed-form prediction for a k-state model --
# confirm the formula against its source.
expected_det = (1 - hamming*k/(k-1))**(k-1)
t = -np.log(Pxy_norm)/(k*(k-1)) # this is txy

# For each of the m rows of obs: product of (count / n) over the distinct
# values found in that row.
Pxs = [(np.unique(obs[leaf,:], return_counts=True)[1]/n).prod() for leaf in range(m)]
np.outer(Pxs,Pxs)
# exp(mean(log(.))) is the geometric mean of Pxs.
np.exp(np.log(Pxs).mean())
(1/4)**4
# NOTE(review): `data` and `data_sub` are not defined in the visible part of
# the file; their columns match the dict keys produced by profile_sig2.
# s1 against relative split size, restricted to rows whose 'real' flag is False.
sns.regplot(x='|A|/|T|', y='s1', data=data_sub[~data_sub['real']], scatter_kws={"alpha":.3})
# One panel per "mn" value, coloured by the 'real' flag.
sns.relplot(x="|A|/|T|", y="s1", col="mn", hue='real', data=data, alpha=.3, col_wrap=2)

# %%
# How does distance on the tree affect similarity?
from itertools import combinations, product

# Discrete variant.
ref_tree = random_discrete_tree(64, 10_000, 4)
#trans = NoahClade.NoahClade.gen_symmetric_transition
#root_data = np.random.choice(a=4, size=n)
#ref_tree.root.gen_subtree_data(root_data, trans, proba_bounds=(0.75, 0.95))
ref_tree.root.ascii()
obs, labels = ref_tree.root.observe()
M = similarity_matrix(obs)

# Gaussian variant.  The tree must be bound to ref_tree: previously the
# return value was discarded, so the observations (and therefore the
# correlation matrix below) still came from the discrete tree above.
ref_tree = random_gaussian_tree(64, 10_000, std_bounds=(0.1, 0.3))
obs, labels = ref_tree.root.observe()
M = np.corrcoef(obs)

# One record per unordered pair of leaves: tree distance vs. matrix entry.
RRROW = []
for left_ix, right_ix in combinations(range(len(labels)), 2):
    dist = ref_tree.distance(labels[left_ix], labels[right_ix])
    RRROW.append({"dist":dist, "sim": M[left_ix, right_ix]})
sns.regplot(x='dist', y='sim', data=pd.DataFrame(RRROW))

# %%
def moves_away(split, good_splits):
    """Return the smallest relative difference between ``split`` and any good split.

    For every ``good`` in ``good_splits`` the value
    ``len(split ^ good) / len(split)`` is computed, and the minimum is
    returned.

    Args:
        split: Object supporting ``^`` and ``len``.  NOTE(review): presumably
            a ``set``/``frozenset`` of taxa -- confirm against callers.
        good_splits: Iterable of objects that can be combined with ``split``
            via ``^``.

    Returns:
        The minimum ratio over ``good_splits``.

    Raises:
        ValueError: If ``good_splits`` is empty (``min`` of an empty iterable).
        ZeroDivisionError: If ``len(split)`` is 0 and ``good_splits`` is not
            empty.
    """
    return min(len(split ^ good)/len(split) for good in good_splits)
def NJ_logdet(observations, labels=None, classes=None):
    """Run ``NJ`` on negative-log similarities computed from ``observations``.

    The matrix returned by ``similarity_matrix`` is floored at ``1e-20``
    before the logarithm is taken, so entries at or below zero do not produce
    ``-inf``/``nan`` values.

    Args:
        observations: Forwarded to ``similarity_matrix``.
        labels: Forwarded to ``NJ`` as ``labels``.
        classes: Forwarded to ``similarity_matrix`` as ``classes``.

    Returns:
        Whatever ``NJ`` returns for the resulting matrix.
    """
    sim = similarity_matrix(observations, classes=classes)
    floored = np.clip(sim, a_min=1e-20, a_max=None)
    log_distances = -np.log(floored)
    return NJ(log_distances, labels=labels)