def test_to_igraph():
    """Check that the igraph export matches the SuchLinkedTrees adjacency matrix.

    Builds a SuchLinkedTrees object from the gopher/lice trees and their
    link table, exports it with ``to_igraph()``, and compares the two
    adjacency matrices for shape and element-wise equality.
    """
    T1 = SuchTree(gopher_tree)
    T2 = SuchTree(lice_tree)
    links = pd.read_csv(gl_links, index_col=0)
    SLT = SuchLinkedTrees(T1, T2, links)
    g = SLT.to_igraph()

    # igraph returns an unweighted adjacency matrix, so convert the
    # SuchLinkedTrees weighted adjacency matrix to an unweighted form.
    saj = numpy.ceil(SLT.adjacency())

    # igraph's own Matrix class does not implement the standard numpy
    # interface, so rebuild it as a plain array row by row.
    iaj = numpy.array(list(map(list, g.get_adjacency())))

    # matrices must be the same shape
    assert saj.shape == iaj.shape

    # All matrix elements must be equal; ndarray.all() replaces the
    # hand-written reduce() fold over the flattened comparison.
    assert (saj == iaj).all()
def test_link_identities():
    """Check that the link list maps back to the leaf-name pairs that were set.

    Two small trees are written to temporary Newick files, a 4x4 link
    table is filled from a fixed list of (host, guest) name pairs, and the
    resulting ``linklist`` is translated back to names and compared.
    """
    with tempfile.NamedTemporaryFile() as host_file:
        host_file.file.write(b'(A:1,(B:1,(C:1,D:1)E:1)F:1)G:1;')
        # Close the underlying file object (not the wrapper) before the
        # tree is loaded by name.
        host_file.file.close()
        T1 = SuchTree(host_file.name)

    with tempfile.NamedTemporaryFile() as guest_file:
        guest_file.file.write(b'((a:1,b:1)e:1,(c:1,d:1)f:1)g:1;')
        guest_file.file.close()
        T2 = SuchTree(guest_file.name)

    ll = (('A', 'a'), ('B', 'c'), ('B', 'd'), ('C', 'd'), ('D', 'd'))

    empty = numpy.zeros((4, 4), dtype=int)
    host_names = list(T1.leafs.keys())
    guest_names = list(T2.leafs.keys())
    links = pd.DataFrame(empty, index=host_names, columns=guest_names)
    for host, guest in ll:
        links.at[host, guest] = 1

    SLT = SuchLinkedTrees(T1, T2, links)

    # Reverse lookups: leaf id -> leaf name for each tree.
    t1_names = dict(zip(T1.leafs.values(), T1.leafs.keys()))
    t2_names = dict(zip(T2.leafs.values(), T2.leafs.keys()))

    # Each link list row is unpacked as (i, j); j is looked up in the
    # first tree and i in the second.
    observed = set()
    for i, j in SLT.linklist.tolist():
        observed.add((t1_names[j], t2_names[i]))

    assert set(ll) == observed
def test_distance():
    """Compare ``SuchTree.distance`` against the reference distance matrix.

    Each line of ``SuchTree/tests/test.matrix`` is split into three
    whitespace-separated fields: two names and the expected distance.
    """
    T = SuchTree(test_tree)
    # Use a context manager so the reference file is closed even when an
    # assertion fails part-way through.
    with open('SuchTree/tests/test.matrix') as matrix_file:
        for line in matrix_file:
            a, b, d1 = line.split()
            d1 = float(d1)
            d2 = T.distance(a, b)
            assert d1 == approx(d2, 0.001)
def test_get_children():
    """Check ``SuchTree.get_children`` against the reference tree ``dpt``.

    Nodes that carry a taxon are expected to report ``(-1, -1)``; all other
    nodes are expected to report the labels of their two children.
    """
    T = SuchTree(test_tree)
    for node in dpt.inorder_node_iter():
        if node.taxon:
            left, right = -1, -1
        else:
            # Unpacking enforces that there are exactly two children.
            left, right = [child.label for child in node.child_nodes()]

        L, R = T.get_children(node.label)
        assert L == left
        assert R == right
def test_distances_by_name():
    """Compare ``SuchTree.distances_by_name`` against the reference matrix.

    Each line of ``SuchTree/tests/test.matrix`` is split into two names and
    an expected distance; all name pairs are queried in a single call.
    """
    T = SuchTree(test_tree)
    ids = []
    d1 = []
    # Use a context manager so the reference file is always closed.
    with open('SuchTree/tests/test.matrix') as matrix_file:
        for line in matrix_file:
            a, b, d = line.split()
            d1.append(float(d))
            ids.append((a, b))

    result = T.distances_by_name(ids)
    for D1, D2 in zip(d1, result):
        assert D1 == approx(D2, 0.001)
def test_distances():
    """Compare ``SuchTree.distances`` (by leaf id) against the reference matrix.

    Names read from ``SuchTree/tests/test.matrix`` are translated to ids
    through ``T.leafs`` and passed as an int64 array of id pairs.
    """
    T = SuchTree(test_tree)
    ids = []
    d1 = []
    # Use a context manager so the reference file is always closed.
    with open('SuchTree/tests/test.matrix') as matrix_file:
        for line in matrix_file:
            a, b, d = line.split()
            d1.append(float(d))
            ids.append((T.leafs[a], T.leafs[b]))

    result = T.distances(numpy.array(ids, dtype=numpy.int64))
    for D1, D2 in zip(d1, result):
        assert D1 == approx(D2, 0.001)
def test_init_both_trees_by_file():
    """Check that SuchLinkedTrees can be built from two tree file arguments.

    The link table is an N x N frame of random integers in 0..3, labelled
    on both axes with the leaf names of the test tree.
    """
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(test_tree, test_tree, links)
    assert isinstance(SLT, SuchLinkedTrees)
def test_row_names():
    """Check that ``row_names`` equals the leaf names of the tree, in order."""
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)
    assert SLT.row_names == list(T.leafs.keys())
def test_get_column_leafs():
    """Check ``get_column_leafs`` (by column index) against the link table.

    For every column, the leaf ids of the rows holding a non-zero link must
    equal the set returned by ``SLT.get_column_leafs(n)``.
    """
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Convert to booleans once; the original re-ran the deprecated
    # applymap(bool) over the whole frame for every column.
    has_link = links.astype(bool)
    for n, colname in enumerate(links.columns):
        s = has_link[colname]
        leafs1 = {T.leafs[name] for name in s[s].index}
        leafs2 = set(SLT.get_column_leafs(n))
        assert leafs1 == leafs2
def test_col_ids():
    """Check that ``col_ids`` matches the tree's leaf ids, in order."""
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)

    col_ids = SLT.col_ids
    leaf_ids = T.leafs.values()
    # Length is checked first because zip() would silently truncate.
    assert len(col_ids) == len(leaf_ids)
    for i, j in zip(col_ids, leaf_ids):
        assert i == j
def test_linkmatrix_property():
    """Check every ``linkmatrix`` cell against the (row-shuffled) link table.

    The rows of the input table are labelled in shuffled order, so the
    comparison is done by name: the cell at (row name, column name) in the
    table, as a boolean, must equal the linkmatrix entry at the positions
    of those names in ``SLT.row_names`` / ``SLT.col_names``.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Fetch the property once and enumerate the name lists instead of
    # calling list.index() and transposing the frame for every cell.
    linkmatrix = SLT.linkmatrix
    slt_rows = SLT.row_names
    for col_id, col in enumerate(SLT.col_names):
        for row_id, row in enumerate(slt_rows):
            assert bool(links.at[row, col]) == linkmatrix[row_id][col_id]
def test_get_column_links():
    """Check ``get_column_links`` against the (row-shuffled) link table.

    For each column index, the returned vector is compared entry by entry,
    in ``SLT.row_names`` order, with the boolean form of the table column.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Convert to booleans once; the original re-ran the deprecated
    # applymap(bool) over the whole frame for every column.
    has_link = links.astype(bool)
    for n, colname in enumerate(links.columns):
        s = has_link[colname]
        c = SLT.get_column_links(n)
        for m, rowname in enumerate(SLT.row_names):
            assert s[rowname] == c[m]
def test_is_ancestor():
    """Check ``is_ancestor`` between the root and each of its descendant nodes.

    Summed over ``get_descendant_nodes(T.root)``, ``is_ancestor(root, x)``
    must total ``T.length - 1`` and ``is_ancestor(x, root)`` must total
    ``1 - T.length``.
    """
    T = SuchTree(test_tree)

    root_over_node = sum(
        T.is_ancestor(T.root, node)
        for node in T.get_descendant_nodes(T.root))
    assert T.length - 1 == root_over_node

    node_over_root = sum(
        T.is_ancestor(node, T.root)
        for node in T.get_descendant_nodes(T.root))
    assert 1 - T.length == node_over_root
def test_linklist_property():
    """Check that ``linklist`` holds exactly the non-zero cells of the table.

    The table is unstacked into (column name, row name) keys; keys with a
    positive value are translated to (TreeB leaf id, TreeA leaf id) pairs
    and compared, as a set, with the rows of ``SLT.linklist``.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    cells = links.unstack()
    A = {(SLT.TreeB.leafs[col], SLT.TreeA.leafs[row])
         for col, row in cells[cells > 0].index}
    B = {(x[0], x[1]) for x in SLT.linklist}
    assert A == B
def test_get_column_leafs_by_name_as_row_ids():
    """Check ``get_column_leafs(name, as_row_ids=True)`` against the link table.

    For every column name, the linked row names are mapped to leaf ids and
    then to their positions within ``SLT.col_ids``; that set must equal the
    ids returned by the method.
    """
    T = SuchTree(test_tree)
    leaf_names = list(T.leafs.keys())
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=leaf_names,
                         index=leaf_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Both are loop-invariant: build the id list and the boolean table once
    # rather than on every column.
    col_ids = list(SLT.col_ids)
    has_link = links.astype(bool)
    for colname in links.columns:
        s = has_link[colname]
        leafs1 = {col_ids.index(T.leafs[name]) for name in s[s].index}
        leafs2 = set(SLT.get_column_leafs(colname, as_row_ids=True))
        assert leafs1 == leafs2
def test_adjacency():
    """Print adjacency-matrix entries next to node-to-parent distances.

    For every non-root node, looks up the node and its parent in the id
    list returned alongside the adjacency matrix and prints the matrix
    entry together with ``T.distance(node, parent)``.
    """
    # NOTE(review): this test only prints and contains no assertion, so it
    # can only fail by raising. Confirm whether aj[i, j] was meant to be
    # asserted against the distance.
    T = SuchTree(test_tree)
    # Relies on the order of values in the mapping returned by adjacency().
    aj, leaf_ids = T.adjacency(T.root).values()
    leaf_ids = list(leaf_ids)

    all_nodes = chain(T.leafs.values(), list(T.get_internal_nodes()))
    for node in all_nodes:
        # the root node has no parent edge, so skip it
        if node == T.root:
            continue
        parent = T.get_parent(node)
        distance = T.distance(node, parent)
        i = leaf_ids.index(node)
        j = leaf_ids.index(parent)
        print(node, parent, ':', i, j, ' :: ', aj[i, j], distance)
def test_subset_b():
    """Check that ``subset_b(1)`` restricts the link list to that clade's columns.

    The expected links are computed from the table columns named by
    ``TreeB.get_leafs(1)`` before the subset is applied, then compared with
    ``SLT.linklist`` afterwards.
    """
    T = SuchTree(test_tree)
    row_names = list(T.leafs.keys())
    numpy.random.shuffle(row_names)
    # randint's upper bound is exclusive, so 4 keeps values in 0..3 as the
    # deprecated random_integers(0, 3) did.
    links = pd.DataFrame(numpy.random.randint(0, 4, size=(N, N)),
                         columns=list(T.leafs.keys()),
                         index=row_names)
    SLT = SuchLinkedTrees(T, T, links)

    # Reverse lookup: leaf id -> leaf name for TreeB.
    sfeal = dict(zip(SLT.TreeB.leafs.values(), SLT.TreeB.leafs.keys()))
    subset_columns = [sfeal[leaf] for leaf in SLT.TreeB.get_leafs(1)]
    cells = links[subset_columns].unstack()

    # Apply the subset only after the expected columns have been captured.
    SLT.subset_b(1)

    A = {(SLT.TreeB.leafs[col], SLT.TreeA.leafs[row])
         for col, row in cells[cells > 0].index}
    B = {(x[0], x[1]) for x in SLT.linklist}
    assert A == B
def test_hierarchy():
    """Check that every internal node's leaf set is contained in the root's."""
    T = SuchTree(test_tree)
    root_leafs = set(T.get_leafs(T.root))
    for internal_node in T.get_internal_nodes():
        clade_leafs = set(T.get_leafs(internal_node))
        assert clade_leafs.issubset(root_leafs)
def test_get_leafs():
    """Check that ``get_leafs`` at the root returns every id in ``T.leafs``."""
    T = SuchTree(test_tree)
    from_root = set(T.get_leafs(T.root))
    expected = set(T.leafs.values())
    assert from_root == expected
def test_get_distance_to_root():
    """Compare root distances with the reference tree ``dpt`` for every leaf."""
    T = SuchTree(test_tree)
    for leaf in dpt.leaf_node_iter():
        observed = T.get_distance_to_root(leaf.label)
        expected = leaf.distance_from_root()
        assert observed == approx(expected, 0.001)
def test_init():
    """Check that constructing from the test tree yields a SuchTree."""
    tree = SuchTree(test_tree)
    assert type(tree) is SuchTree
def simtree(prefix,
            birth_rate=0.3,
            death_rate=0.1,
            min_host_leafs=8,
            max_host_leafs=64,
            min_guest_leafs=4,
            max_guest_leafs=128,
            duplication_rate=0.2,
            loss_rate=0.1,
            switch_rate=0.05,
            k=2.0,
            theta=0.5):
    """Simulate a host/guest tree pair with JPrIME and write summary files.

    Time interval is always 1.0 units, and GuestTreeGen stops after
    1000 attempts.

    The function runs HostTreeGen, BranchRelaxer, GuestTreeGen and
    BranchRelaxer again through ``java -jar jprime.jar``, loads the two
    relaxed trees, builds the link matrix from the guest leaf names, and
    writes the following files under ``prefix``: ``links.csv``,
    ``adjacency.png``, ``cophylo.pdf`` (via R), ``eigenvalues.csv``,
    ``densities.txt``, ``distances.txt``, ``correlation.png``,
    ``moments.csv`` and ``data.csv``.

    Parameters:
        prefix: output directory; created if it does not exist.
        birth_rate, death_rate: passed to HostTreeGen.
        min_host_leafs, max_host_leafs: ``-min``/``-max`` for HostTreeGen.
        min_guest_leafs, max_guest_leafs: ``-min``/``-max`` for GuestTreeGen.
        duplication_rate, loss_rate, switch_rate: passed to GuestTreeGen.
        k, theta: ``IIDGamma`` arguments passed to BranchRelaxer.

    Raises:
        JPrIMEError: if any JPrIME invocation exits with a non-zero status.
    """
    max_guest_attempts = 1000

    def path(name):
        # Every input and output lives directly under the prefix directory.
        return prefix + '/' + name

    def run_jprime(args, failure_message):
        # Run one JPrIME tool; a non-zero exit status is an error.
        status = subprocess.call(['java'] + java_ops +
                                 ['-jar', 'jprime.jar'] + args)
        if status != 0:
            raise JPrIMEError(failure_message)

    # make output directory
    if not exists(prefix):
        mkdir(prefix)

    # build the host tree
    run_jprime([
        'HostTreeGen', '-bi', '-min',
        str(min_host_leafs), '-max',
        str(max_host_leafs), '1.0',
        str(birth_rate),
        str(death_rate),
        path('host')
    ], 'HostTreeGen failed.')
    run_jprime([
        'BranchRelaxer', '-o',
        path('host.relaxed.tree'),
        path('host.pruned.tree'), 'IIDGamma',
        str(k),
        str(theta)
    ], 'BranchRelaxer failed on host tree.')

    # build the guest tree
    run_jprime([
        'GuestTreeGen', '--max-attempts',
        str(max_guest_attempts), '-min',
        str(min_guest_leafs), '-max',
        str(max_guest_leafs),
        path('host.pruned.tree'),
        str(duplication_rate),
        str(loss_rate),
        str(switch_rate),
        path('guest')
    ], 'GuestTreeGen failed.')
    run_jprime([
        'BranchRelaxer', '-o',
        path('guest.relaxed.tree'),
        path('guest.pruned.tree'), 'IIDGamma',
        str(k),
        str(theta)
    ], 'BranchRelaxer failed on guest tree.')

    # load the trees
    T1 = SuchTree(path('host.relaxed.tree'))
    T2 = SuchTree(path('guest.relaxed.tree'))

    # populate the link matrix using the leaf names
    link_matrix = zeros((T1.n_leafs, T2.n_leafs), dtype=int)
    # Materialise the key views: dict views have no .index() on Python 3.
    hostnames = list(T1.leafs.keys())
    guestnames = list(T2.leafs.keys())
    for j, guest_leaf in enumerate(guestnames):
        # Guest leaf names are split as "<guest>_<host>"; only the host
        # part is needed to locate the row.
        _guest, host = guest_leaf.split('_')
        link_matrix[hostnames.index(host), j] = 1

    links = pandas.DataFrame(link_matrix,
                             index=hostnames,
                             columns=guestnames)
    links.to_csv(path('links.csv'))

    # initialize the SuchLinkedTrees object
    SLT = SuchLinkedTrees(T1, T2, links)

    # plot the adjacency matrix
    aj = SLT.adjacency()
    lp_plot = seaborn.heatmap(aj.T,
                              cmap='viridis',
                              vmin=0,
                              vmax=1,
                              cbar=False,
                              square=True,
                              xticklabels=False,
                              yticklabels=False)
    lp_plot.invert_yaxis()
    fig = lp_plot.get_figure()
    # NOTE(review): `size` does not look like a documented savefig option;
    # kept as in the original -- confirm against the matplotlib in use.
    fig.savefig(path('adjacency.png'), size=6)
    fig.clf()

    # plot cophylogeny using R
    # NOTE(review): presumably the R packages providing read.tree,
    # graph_from_incidence_matrix and cophylo are loaded elsewhere.
    r_code = '''
    tr1 <- read.tree( "HOST_TREE" )
    tr2 <- read.tree( "GUEST_TREE" )
    links <- read.csv( "LINKS", row.names=1, stringsAsFactors = F )
    im <- graph_from_incidence_matrix( as.matrix( links ) )
    assoc <- as_edgelist( im )
    obj <- cophylo( tr1, tr2, assoc=assoc )
    pdf( "OUTFILE", width = 10, height = 12 )
    plot( obj )
    dev.off()
    '''
    r_code = r_code.replace('HOST_TREE', path('host.relaxed.tree'))
    r_code = r_code.replace('GUEST_TREE', path('guest.relaxed.tree'))
    r_code = r_code.replace('LINKS', path('links.csv'))
    r_code = r_code.replace('OUTFILE', path('cophylo.pdf'))
    robjects.r(r_code)

    # calculate spectral densities
    lambdas = SLT.spectrum()
    a_lambd = eigvalsh(SLT.TreeA.laplacian()['laplacian'])
    b_lambd = eigvalsh(SLT.TreeB.laplacian()['laplacian'])
    with open(path('eigenvalues.csv'), 'w') as f:
        f.write('graph ' + ','.join(map(str, lambdas)) + '\n')
        f.write('TreeA ' + ','.join(map(str, a_lambd)) + '\n')
        f.write('TreeB ' + ','.join(map(str, b_lambd)) + '\n')

    bandwidth = 0.4
    X = linspace(-0.5, 1.5, 200)
    # Each spectrum is scaled by its largest eigenvalue before the density
    # is estimated.
    density = gaussian_kde(lambdas / max(lambdas),
                           bw_method=bandwidth).pdf(X)
    a_dnsty = gaussian_kde(a_lambd / max(a_lambd),
                           bw_method=bandwidth).pdf(X)
    b_dnsty = gaussian_kde(b_lambd / max(b_lambd),
                           bw_method=bandwidth).pdf(X)
    with open(path('densities.txt'), 'w') as f:
        f.write('graph ' + ','.join(map(str, density)) + '\n')
        f.write('TreeA ' + ','.join(map(str, a_dnsty)) + '\n')
        f.write('TreeB ' + ','.join(map(str, b_dnsty)) + '\n')

    # calculate Hommola correlation
    d = SLT.linked_distances()
    r, p = pearsonr(d['TreeA'], d['TreeB'])
    with open(path('distances.txt'), 'w') as f:
        f.write('TreeA ' + ','.join(map(str, d['TreeA'])) + '\n')
        f.write('TreeB ' + ','.join(map(str, d['TreeB'])) + '\n')

    # save jointplot of patristic distances
    # NOTE(review): positional x/y and `size=` are kept from the original;
    # confirm they are accepted by the seaborn version in use.
    jp = seaborn.jointplot(d['TreeA'], d['TreeB'], size=6)
    jp.savefig(path('correlation.png'))
    jp.fig.clf()

    # output moment data
    moments = {}
    moments['eigengap'] = lambdas[-1] - lambdas[-2]
    moments['skew'] = skew(density)
    moments['kurtosis'] = kurtosis(density)
    moments['treedist'] = pdd(a_dnsty, b_dnsty)
    moments['occupancy'] = ((2.0 * SLT.n_links) /
                            (SLT.TreeA.n_leafs + SLT.TreeB.n_leafs))
    moments['squareness'] = float(SLT.TreeA.n_leafs) / SLT.TreeB.n_leafs
    moments['r'] = r
    moments['p'] = p
    with open(path('moments.csv'), 'w') as f:
        f.write(','.join(moments.keys()) + '\n')
        f.write(','.join(map(str, moments.values())))

    # output simulation parameters
    data = {}
    data['prefix'] = prefix
    data['host_leafs'] = T1.n_leafs
    data['guest_leafs'] = T2.n_leafs
    data['links'] = SLT.n_links
    data['birth_rate'] = birth_rate
    data['death_rate'] = death_rate
    data['min_host_leafs'] = min_host_leafs
    data['max_host_leafs'] = max_host_leafs
    data['min_guest_leafs'] = min_guest_leafs
    data['max_guest_leafs'] = max_guest_leafs
    data['duplication_rate'] = duplication_rate
    data['loss_rate'] = loss_rate
    data['switch_rate'] = switch_rate
    data['k'] = k
    data['theta'] = theta
    with open(path('data.csv'), 'w') as f:
        f.write(','.join(data.keys()) + '\n')
        f.write(','.join(map(str, data.values())))
def test_get_descendant_nodes():
    """Check that the root's descendant nodes are its leafs plus all internal nodes."""
    T = SuchTree(test_tree)
    descendants = set(T.get_descendant_nodes(T.root))
    leaf_nodes = set(T.get_leafs(T.root))
    internal_nodes = set(T.get_internal_nodes())
    assert descendants == leaf_nodes.union(internal_nodes)