def profile_sig2(discrete=True, k=4, reps=10, tree_sizes=(50, 100),
                 sample_sizes=(1_000, 10_000)):
    """Profile singular values of similarity sub-matrices for real and fake splits.

    For each repetition and each tree size ``m`` a random reference tree is
    drawn.  For each sample size ``n`` fresh root data is pushed down that
    tree, the leaves are observed, and a leaf-by-leaf matrix ``M`` is built
    (``similarity_matrix(obs)`` in discrete mode, ``np.corrcoef(obs)``
    otherwise).  Every candidate split ``A`` then contributes one row with
    statistics of the block ``M[A, ~A]``.

    Candidate splits are the ``taxa_set`` of every non-terminal node (tagged
    ``real=True``) plus, for each such node, two masks formed by XOR-ing its
    ``taxa_set`` with that of a randomly chosen non-terminal (tagged
    ``real=False``).  The fake masks are not checked against the real ones.
    NOTE(review): ``taxa_set`` is used with ``^``, ``~``, ``.sum()`` and
    ``np.ix_``, so it is presumably a boolean numpy mask over leaves - confirm.

    Args:
        discrete: If true, use ``random_discrete_tree`` with the symmetric
            transition; otherwise ``random_gaussian_tree`` with the linear
            transition.
        k: Passed to ``random_discrete_tree`` and used as ``a=k`` when
            sampling discrete root data.  Unused in the Gaussian mode.
        reps: Number of repetitions of the whole sweep.
        tree_sizes: Values of ``m`` handed to the tree generator as its
            first argument.
        sample_sizes: Values of ``n``, the number of root samples drawn.

    Returns:
        A list of dicts, one per evaluated split, with keys ``"|A|"``
        (size of the smaller side), ``"|A|/|T|"`` (that size divided by
        ``m``), ``"n"``, ``"m"``, ``"mn"`` (the string ``"m,n"``), ``"s1"``
        and ``"s2"`` (two largest singular values), ``"s23"`` (sum of all
        singular values except the largest), ``"frob"`` (Frobenius norm of
        the block) and ``"real"``.
    """
    rows = []
    total_iters = reps * len(tree_sizes) * len(sample_sizes)
    done = 0
    for _rep in range(reps):
        for m in tree_sizes:
            # One reference tree per (repetition, m); it is reused for every n.
            if discrete:
                ref_tree = random_discrete_tree(m, 1, k)
                trans = NoahClade.NoahClade.gen_symmetric_transition
            else:
                ref_tree = random_gaussian_tree(m, 1, std_bounds=(0.1, 0.3))
                trans = NoahClade.NoahClade.gen_linear_transition

            for n in sample_sizes:
                # Regenerate the data on the same tree with n root samples.
                if discrete:
                    root_data = np.random.choice(a=k, size=n)
                    ref_tree.root.gen_subtree_data(root_data, trans, proba_bounds=(0.75, 0.95))
                else:
                    root_data = np.random.uniform(0, 1, n)
                    ref_tree.root.gen_subtree_data(root_data, trans, std_bounds=(0.1, 0.3))
                obs, labels = ref_tree.root.observe()
                ref_tree.root.reset_taxasets(labels)
                M = similarity_matrix(obs) if discrete else np.corrcoef(obs)
                non_terms = ref_tree.get_nonterminals()

                splits = []
                for node in non_terms:
                    # The node's own split is a real one ...
                    splits.append((node.taxa_set, True))
                    # ... and two XOR-perturbed masks serve as fake ones.
                    for _ in range(2):
                        other = random.choice(non_terms)
                        splits.append((node.taxa_set ^ other.taxa_set, False))

                for A, real in splits:
                    M_A = M[np.ix_(A, ~A)]
                    # A second singular value only exists when both sides of
                    # the split contain at least two entries.
                    if M_A.shape[0] <= 1 or M_A.shape[1] <= 1:
                        continue
                    s = np.linalg.svd(M_A, compute_uv=False)
                    size = min(A.sum(), (~A).sum())
                    rows.append({
                        "|A|": size,
                        "|A|/|T|": size / m,
                        "n": n,
                        "m": m,
                        "mn": f"{m},{n}",
                        "s1": s[0],
                        "s2": s[1],
                        "s23": s[1:].sum(),
                        "frob": np.linalg.norm(M_A),
                        "real": real,
                    })

                done += 1
                print(f"{done} / {total_iters}")
    return rows
# Regression of the top singular value on relative split size, restricted to
# the rows of `data_sub` whose 'real' flag is False.
sns.regplot(
    x="|A|/|T|",
    y="s1",
    data=data_sub[~data_sub["real"]],
    scatter_kws={"alpha": 0.3},
)

# Same axes for all of `data`: one panel per "mn" value (two panels per row),
# points coloured by the 'real' flag.
sns.relplot(
    x="|A|/|T|",
    y="s1",
    col="mn",
    hue="real",
    data=data,
    alpha=0.3,
    col_wrap=2,
)
# %%


# How does distance on the tree affect similarity?
from itertools import combinations, product

# Discrete setting: random discrete tree, leaves compared via similarity_matrix.
ref_tree = random_discrete_tree(64, 10_000, 4)
#trans = NoahClade.NoahClade.gen_symmetric_transition
#root_data = np.random.choice(a=4, size=n)
#ref_tree.root.gen_subtree_data(root_data, trans, proba_bounds=(0.75, 0.95))
ref_tree.root.ascii()
obs, labels = ref_tree.root.observe()
M = similarity_matrix(obs)

# Gaussian setting: leaves compared via their correlation matrix.
# The generated tree must be bound to `ref_tree`; otherwise the observations
# and distances below would still come from the discrete tree above.
# NOTE: when the cell is run top to bottom, this setting replaces the discrete
# one, so the plot below shows the Gaussian tree.
ref_tree = random_gaussian_tree(64, 10_000, std_bounds=(0.1, 0.3))
obs, labels = ref_tree.root.observe()
M = np.corrcoef(obs)

# One record per unordered pair of leaves: tree distance vs. matrix entry.
RRROW = []
for left_ix, right_ix in combinations(range(len(labels)), 2):
    dist = ref_tree.distance(labels[left_ix], labels[right_ix])
    RRROW.append({"dist": dist, "sim": M[left_ix, right_ix]})
sns.regplot(x='dist', y='sim', data=pd.DataFrame(RRROW))

# %%
def moves_away(split, good_splits):
    """Return the smallest relative symmetric difference to any good split.

    For each ``good`` in ``good_splits`` the value
    ``len(split ^ good) / len(split)`` is computed and the minimum is
    returned.  ``split`` and the good splits are presumably sets (they must
    support ``^`` and ``len``).

    Raises:
        ValueError: if ``good_splits`` is empty.
        ZeroDivisionError: if ``split`` is empty and ``good_splits`` is not.
    """
    def relative_difference(good):
        mismatched = split ^ good
        return len(mismatched) / len(split)

    return min(map(relative_difference, good_splits))
# Example #4
# 0
def NJ_logdet(observations, labels=None, classes=None):
    """Run ``NJ`` on the negative log of the similarity matrix.

    The similarity matrix comes from ``similarity_matrix(observations,
    classes=classes)``.  Its entries are floored at ``1e-20`` before the
    logarithm is taken, and the resulting matrix is handed to ``NJ`` together
    with ``labels``.  Whatever ``NJ`` returns is returned unchanged.
    """
    sim = similarity_matrix(observations, classes=classes)
    # Floor only (no upper bound) so the log never sees zero or negative values.
    floored = np.clip(sim, a_min=1e-20, a_max=None)
    distances = -np.log(floored)
    return NJ(distances, labels=labels)