def extendedMain(pres_matrix, tree_file, fmt='newick', g=1, transpose=False, sep='\t'):
    """
    Compute per-protein gain/loss events from a presence/absence matrix.

    matrix should be rows -> proteins; columns -> strains; else use transpose.
    First row/column should be headers.

    Parameters:
        pres_matrix: path of the delimited presence/absence file.
        tree_file:   passed to Bio.Phylo.read together with `fmt`.
        fmt:         tree format handed to Bio.Phylo.read.
        g:           accepted for interface compatibility; not used here.
        transpose:   if true, the matrix is transposed and the row/column
                     labels are swapped before processing.
        sep:         field separator of the matrix file.

    Returns:
        (LUCA_genes, counts) where LUCA_genes is the list of proteins whose
        'gains' collection contains the first non-terminal clade of the tree,
        and counts maps every clade (terminals, then non-terminals) to
        {'gains': set_of_proteins, 'losses': set_of_proteins}.

    NOTE(review): the whole first row is used as the column labels while the
    data rows drop their first field, so the header must carry exactly one
    label per data column (with the default tab separator an empty leading
    corner cell is removed by strip()).
    """
    import numpy as np
    from Bio.Phylo import read

    # Close the file deterministically and ignore blank lines, which would
    # otherwise turn into ragged one-field rows.
    with open(pres_matrix) as handle:
        table = [line.strip().split(sep) for line in handle if line.strip()]

    header, body = table[0], table[1:]
    colnames = np.array(header)
    rownames = np.array([row[0] for row in body])
    pres_matrix = np.array([row[1:] for row in body])
    if transpose:
        pres_matrix, colnames, rownames = pres_matrix.T, rownames, colnames
    proteins, strains = rownames, colnames

    prot_dict = {}
    tree = read(tree_file, fmt)

    # compute parsimony events for each protein
    for i, prot in enumerate(proteins):
        # Explicit list: np.array(map(...)) wraps a lazy map object on
        # Python 3 and the boolean mask below would then be meaningless.
        pres_abs = np.array([int(value) for value in pres_matrix[i]])
        pres_list = set(strains[pres_abs == 1])
        container, root = main(pres_list, tree)
        # presumably propagate() returns a mapping with 'gains' and 'losses'
        # collections of clades -- that is how it is consumed below.
        prot_dict[prot] = container[root].propagate(container)

    # get luca genes
    root = tree.get_nonterminals()[0]
    # Collect the dict key itself; the previous code emitted the stale loop
    # variable, i.e. the last protein repeated once per match.
    LUCA_genes = [name for name, events in prot_dict.items()
                  if root in events['gains']]

    # get a count of gain/losses for each clade
    clades = tree.get_terminals() + tree.get_nonterminals()
    counts = {}
    for c in clades:
        counts[c] = {
            'gains': {p for p, events in prot_dict.items()
                      if c in events['gains']},
            'losses': {p for p, events in prot_dict.items()
                       if c in events['losses']},
        }
    return LUCA_genes, counts
def main():
    """Write the FASTA records of a tree's leaves to stdout in leaf order.

    sys.argv[1] is read as a newick tree and sys.argv[2] is indexed as a
    FASTA file. A tree that is not flagged as rooted is rooted at its
    midpoint, then the tree is ladderized with reverse=True and the record
    named after each terminal is written out in traversal order.

    NOTE(review): `read`, `index` and `write` come from the enclosing module
    (presumably Bio.Phylo.read and Bio.SeqIO.index / Bio.SeqIO.write --
    confirm against the file's imports).
    """
    phylogeny = read(sys.argv[1], 'newick')
    records = index(sys.argv[2], 'fasta')

    needs_rooting = not phylogeny.rooted
    if needs_rooting:
        phylogeny.root_at_midpoint()
    phylogeny.ladderize(reverse=True)

    leaf_names = [tip.name for tip in phylogeny.get_terminals()]
    for leaf_name in leaf_names:
        write(records[leaf_name], sys.stdout, 'fasta')
def main(presence_list, tree_file, fmt='newick', protein='protein', g=1):
    """
    presence list is the list/set of taxa in which the protein is present

    Parameters:
        presence_list: taxa in which the protein is present; handed to
                       phyleticWalk unchanged.
        tree_file:     either an already-parsed tree (anything exposing a
                       `trace` attribute) or something Bio.Phylo.read can
                       load using `fmt`.
        fmt:           tree format for Bio.Phylo.read.
        protein, g:    accepted for interface compatibility; not used here.

    Returns:
        (visited, clade): the module-level `visited` dict after the walk and
        the first non-terminal clade of the tree, where the walk started.

    Side effect: rebinds the module-level name `visited` to a fresh dict
    before walking (presumably phyleticWalk fills it in -- confirm against
    its definition).
    """
    # The global declaration must precede any assignment to the name:
    # assigning first is a SyntaxError on Python 3 (a warning on Python 2).
    global visited
    from Bio.Phylo import read

    # Duck-type the argument instead of a bare try/except, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if hasattr(tree_file, 'trace'):
        t = tree_file
    else:
        t = read(tree_file, fmt)

    visited = {}
    clade = t.get_nonterminals()[0]
    phyleticWalk(clade, presence_list)
    return visited, clade
def load_notung_nhx(filename):
    """load reconciled gene tree from NHX formatted file

    returns networkx graph object
    strips information from the comment field and converts into node
    properties

    The file is parsed as newick, forced to be rooted and converted with
    to_networkx. Every node gets a string id (its enumeration index), a
    'name' property (str() of the original node) and one property for each
    ``key=value`` pair found in the node's comment. Edges keep their
    original attributes except 'weight' and gain a 'distance' attribute.
    Finally nodes are relabelled in place by their 'name' property.

    NOTE(review): `read` and `to_networkx` come from the enclosing module
    (presumably Bio.Phylo) -- confirm against the file's imports.
    """
    with open(filename, 'r') as f:
        tree = read(f, format='newick')
    tree.rooted = True
    tree = to_networkx(tree)

    node_translator = {}
    for node in tree.nodes():
        node_translator[node] = str(len(node_translator))

    graph = nx.DiGraph()
    for node in tree.nodes():
        properties = {'name': str(node)}
        # A node without any comment would carry None, which re.findall
        # rejects with a TypeError; treat it as "no properties".
        for match in re.findall(r'[^:]*\=[^:]*', node.comment or ''):
            pieces = match.split('=')
            properties[pieces[0]] = pieces[1]
        graph.add_node(node_translator[node], **properties)

    for source, target in tree.edges():
        # tree[source][target] is the edge attribute dict in every networkx
        # release, unlike the 1.x-only `tree.edge` attribute.
        graph.add_edge(node_translator[source], node_translator[target],
                       distance=source.distance(target),
                       **tree[source][target])

    for _, _, edge_data in graph.edges(data=True):
        # Tolerate edges that never had a 'weight' attribute.
        edge_data.pop('weight', None)

    # nodes(data=True) exposes the live attribute dicts in networkx 1.x and
    # 2.x alike, so edits made through this mapping reach the graph.
    node_attrs = dict(graph.nodes(data=True))

    # follow convention by renaming the root node to 'X0'
    # (list() because topological_sort returns a generator in networkx >= 2)
    root = list(nx.topological_sort(graph))[0]
    node_attrs[root]['name'] = 'X0'

    # rename lost genes, so all nodes have unique names
    for n, attrs in node_attrs.items():
        if 'lost' in attrs['name'].lower():
            attrs['name'] = n + attrs['name']

    # build dictionary to replace node objects with the name of each node
    new_node_names = {n: attrs['name'] for n, attrs in node_attrs.items()}
    nx.relabel_nodes(graph, new_node_names, copy=False)
    return graph