Exemple #1
0
def extendedMain(pres_matrix,tree_file,fmt='newick',g=1,transpose=False,sep='\t'):
    """Summarise per-clade gain/loss events for every protein of a presence matrix.

    The matrix file is expected to have proteins as rows and strains as
    columns (pass ``transpose=True`` for the opposite layout).  The first
    row and the first column hold the labels.

    Args:
        pres_matrix: path of the delimited presence/absence file; cells are
            parsed as integers and a value of 1 means "present".
        tree_file: tree source handed to ``Bio.Phylo.read``.
        fmt: tree format name handed to ``Bio.Phylo.read``.
        g: unused here; kept so existing callers keep working.
        transpose: set when rows are strains and columns are proteins.
        sep: field separator of the matrix file.

    Returns:
        A ``(LUCA_genes, counts)`` tuple.  ``LUCA_genes`` lists the proteins
        whose 'gains' contain the first non-terminal clade of the tree.
        ``counts`` maps every clade (terminals and non-terminals) to
        ``{'gains': set_of_proteins, 'losses': set_of_proteins}``.

    Raises:
        IndexError: if the matrix file contains no non-blank line.
    """
    import numpy as np
    from Bio.Phylo import read

    # Close the file deterministically and ignore blank lines, which would
    # otherwise become ragged, label-less rows.
    with open(pres_matrix) as handle:
        rows = [line.strip().split(sep) for line in handle if line.strip()]

    header, body = rows[0], rows[1:]
    # A header that still carries its corner cell is as long as a data row
    # (label + values); drop that cell so labels line up with the columns.
    if body and len(header) == len(body[0]):
        header = header[1:]

    values = np.array([row[1:] for row in body])
    colnames = np.array(header)
    rownames = np.array([row[0] for row in body])
    if transpose:
        values, colnames, rownames = values.T, rownames, colnames
    proteins, strains = rownames, colnames

    # Convert once for the whole matrix; works on Python 2 and 3 alike
    # (np.array(map(...)) yields a useless 0-d object array on Python 3).
    presence = values.astype(int)

    tree = read(tree_file, fmt)

    # Compute parsimony events for each protein.  `main` is the sibling
    # helper of this module that returns (node container, root clade);
    # the root entry's propagate() is expected to yield a mapping with
    # 'gains' and 'losses' -- NOTE(review): confirm against its definition.
    prot_dict = {}
    for i, prot in enumerate(proteins):
        pres_list = set(strains[presence[i] == 1])
        container, root = main(pres_list, tree)
        prot_dict[prot] = container[root].propagate(container)

    # Proteins gained at the first non-terminal clade are the LUCA genes.
    # The key of each entry is collected (the original appended the stale
    # loop variable, repeating the last protein for every hit).
    root = tree.get_nonterminals()[0]
    LUCA_genes = [name for name, events in prot_dict.items() if root in events['gains']]

    # Collect, for each clade, which proteins were gained / lost there.
    clades = tree.get_terminals() + tree.get_nonterminals()
    counts = {}
    for clade in clades:
        counts[clade] = {
            'gains': set(p for p, ev in prot_dict.items() if clade in ev['gains']),
            'losses': set(p for p, ev in prot_dict.items() if clade in ev['losses']),
        }
    return LUCA_genes, counts
Exemple #2
0
def main():
    """Print FASTA records to stdout in the leaf order of a ladderized tree.

    ``sys.argv[1]`` is read as a Newick tree and ``sys.argv[2]`` is indexed
    as FASTA, using the module-level ``read`` / ``index`` / ``write``
    helpers (presumably Bio.Phylo.read, Bio.SeqIO.index and
    Bio.SeqIO.write -- confirm against the file's imports).  An unrooted
    tree is rooted at its midpoint before being ladderized.
    """
    phylogeny = read(sys.argv[1], 'newick')
    records = index(sys.argv[2], 'fasta')

    already_rooted = phylogeny.rooted
    if not already_rooted:
        phylogeny.root_at_midpoint()
    phylogeny.ladderize(reverse=True)

    # Lazy lookup keeps the original interleaving: each record is fetched
    # by leaf name and written before the next leaf is touched.
    ordered = (records[tip.name] for tip in phylogeny.get_terminals())
    for record in ordered:
        write(record, sys.stdout, 'fasta')
Exemple #3
0
def main(presence_list,tree_file,fmt='newick',protein='protein',g=1):
    """Walk the tree from its first non-terminal clade for one presence set.

    Args:
        presence_list: list/set of taxa in which the protein is present.
        tree_file: either an already parsed tree (recognised by its
            ``trace`` attribute) or a source handed to ``Bio.Phylo.read``.
        fmt: tree format name, used only when ``tree_file`` must be parsed.
        protein: unused here; kept so existing callers keep working.
        g: unused here; kept so existing callers keep working.

    Returns:
        ``(visited, clade)``: the module-level ``visited`` dict, reset to
        empty before the walk, and the clade the walk started from.
    """
    # The declaration has to come before any assignment to the name:
    # Python 3 rejects `global` after the binding with a SyntaxError.
    global visited
    from Bio.Phylo import read

    # Duck typing: parsed trees expose `trace`; anything else is parsed.
    # hasattr replaces the former bare `except:`, which hid every error.
    if hasattr(tree_file, 'trace'):
        t = tree_file
    else:
        t = read(tree_file, fmt)

    # Fresh container for this walk; phyleticWalk (defined elsewhere in
    # this module) is expected to fill it -- NOTE(review): confirm.
    visited = {}
    clade = t.get_nonterminals()[0]
    phyleticWalk(clade, presence_list)
    return visited, clade
Exemple #4
0
def load_notung_nhx(filename):
    """Load a reconciled gene tree from an NHX formatted file.

    The tree is parsed as Newick, converted with ``to_networkx`` and copied
    into a fresh ``nx.DiGraph``.  ``key=value`` fields found in each node's
    comment become node attributes next to a ``name`` attribute; every edge
    carries a ``distance`` attribute plus the converter's edge attributes
    except ``weight``.  Nodes are finally relabelled by their name.

    Args:
        filename: path of the NHX file to read.

    Returns:
        The ``nx.DiGraph`` whose node labels are the node names; the first
        node in topological order is named ``'X0'`` and names containing
        "lost" (any case) are prefixed with the node's numeric id.
    """

    with open(filename, 'r') as f:
        tree = read(f, format='newick')

    # Mark the tree as rooted before conversion (presumably so that
    # to_networkx yields a directed graph -- confirm in the Biopython docs).
    tree.rooted = True

    tree = to_networkx(tree)

    # Stable string ids '0', '1', ... in the converter's node order.
    node_translator = {}
    for position, node in enumerate(tree.nodes()):
        node_translator[node] = str(position)

    graph = nx.DiGraph()

    # Local copy of each node's name, so later steps need neither
    # `graph.node` nor `graph.nodes[...]` (their availability differs
    # between networkx 1.x and 2.x).
    names = {}

    for node in tree.nodes():
        new_node = node_translator[node]

        properties = {'name': str(node)}
        # A node without an NHX comment has comment None; treat it as empty
        # instead of letting re.findall raise TypeError.
        for match in re.findall(r'[^:]*\=[^:]*', node.comment or ''):
            fields = match.split('=')
            properties[fields[0]] = fields[1]

        graph.add_node(new_node, **properties)
        names[new_node] = properties['name']

    for source, target in tree.edges():
        # tree[u][v] is the edge attribute dict on every networkx version.
        # 'weight' is left out here rather than popped afterwards.
        edge_attrs = {key: value
                      for key, value in tree[source][target].items()
                      if key != 'weight'}
        graph.add_edge(node_translator[source], node_translator[target],
                       distance=source.distance(target),
                       **edge_attrs)

    # follow convention by renaming the root node to 'X0'
    # (topological_sort is a list on networkx 1.x and a generator on 2.x)
    root = next(iter(nx.topological_sort(graph)))
    names[root] = 'X0'

    # rename lost genes, so all nodes have unique names
    for n in graph.nodes():
        if 'lost' in names[n].lower():
            names[n] = n + names[n]

    # add_node on an existing node only updates its attributes; this keeps
    # the stored 'name' attribute in step with the final names.
    for n, final_name in names.items():
        graph.add_node(n, name=final_name)

    # replace the numeric ids with the name of each node
    nx.relabel_nodes(graph, names, copy=False)

    return graph