def load_tree(self, consFile=None, **args):
    """Annotate the first tree of ``consFile`` and summarise every node.

    ``parse_trees(**args)`` supplies ``n_tree`` (used as the denominator of
    the support fraction) and ``n_branch`` (a mapping looked up by node
    barcode; presumably the trees that contain each branch -- confirm
    against ``parse_trees``).

    Each node of the tree receives three attributes:

    * ``id``      -- for leaves, the value stored under the taxon label in
      ``self.data['taxa']``; internal nodes are numbered in post-order
      starting at ``len(self.data['taxa'])``.
    * ``barcode`` -- ``2 ** id`` for leaves, the sum of the children's
      barcodes for internal nodes.
    * ``age``     -- root-to-tip distance for the seed node, parent age
      minus edge length for every other node.

    Args:
        consFile: Path to a NEXUS or Newick tree file. Only the first tree
            in the file is used.
        **args: Forwarded unchanged to ``parse_trees``.

    Returns:
        A list with one row per node in pre-order:
        ``[barcode, support_fraction, n_branch_entry, age, edge_length]``.

    Raises:
        ValueError: if ``consFile`` does not contain any tree.
    """
    n_tree, n_branch = parse_trees(**args)

    # Only the first line is inspected to choose the parser; the file is
    # read again by Tree.yield_from_files below.
    with open(consFile) as fin:
        is_nexus = fin.readline().upper().startswith('#NEXUS')
    schema = 'nexus' if is_nexus else 'newick'

    tre = next(iter(Tree.yield_from_files([consFile], schema=schema)), None)
    if tre is None:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError('No tree found in {0}'.format(consFile))

    taxa = self.data['taxa']
    internal_id = n_taxa = len(taxa)
    # dtype='object' keeps the powers of two as arbitrary-precision ints.
    digit_code = np.power(2, np.arange(n_taxa, dtype='object'))
    for node in tre.postorder_node_iter():
        if node.is_leaf():
            node.id = taxa[node.taxon.label]
            node.barcode = digit_code[node.id]
        else:
            node.id, internal_id = internal_id, internal_id + 1
            node.barcode = sum(c.barcode for c in node.child_nodes())

    tre.seed_node.age = tre.seed_node.distance_from_tip()
    for node in tre.preorder_node_iter():
        if node.parent_node:
            node.age = node.parent_node.age - node.edge_length

    # Force true division: with two ints, ``/`` floors on Python 2 and the
    # support fraction would collapse to 0 or 1.
    total = float(n_tree)
    summary = []
    for node in tre.preorder_node_iter():
        supporting = n_branch.get(node.barcode, [])
        summary.append([
            node.barcode,
            len(supporting) / total,
            supporting,
            node.age,
            node.edge_length,
        ])
    return summary
def main(folder=None, seed=None):
    """Randomly drop leaves from the gene trees of one folder and save them.

    Reads every ``g_trees*.trees`` Newick file under ``args.sd/folder``,
    asks a helper which leaves to remove from each tree, prunes them and
    writes the surviving trees to ``args.sd/folder/args.o``.

    The removal strategy is selected by the module-level ``args.mk``:

    * ``"random"``       -- ``remove_taxa_prov`` is called with ``args.pr``.
    * ``"byindividual"`` -- one value per leaf label is drawn once (from the
      first tree that has leaves) with ``truncated_normal`` and reused for
      all later trees through ``remove_taxa_tagprobs``.
    * anything else      -- a message is printed and an empty tree list is
      written.

    Args:
        folder: Sub-directory of ``args.sd`` holding the gene tree files.
        seed: Seed for the ``numpy.random.RandomState`` passed to the helpers.
    """
    # Format first, then print. ``print(fmt) % values`` only works while
    # ``print`` is a statement; as a function call it raises TypeError.
    print("Folder %s, seed %s" % (folder, seed))
    r = numpy.random.RandomState(seed)
    gene_trees = TreeList()
    taxa = dendropy.TaxonNamespace()
    treefiles = glob.glob(args.sd + "/" + folder + "/g_trees*.trees")
    tree_yielder = Tree.yield_from_files(
        files=treefiles,
        schema="newick",
        rooting="default-rooted",
        preserve_underscores=True,
        taxon_namespace=taxa
    )

    # Modify gene trees
    if args.mk == "random":
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            nodes = remove_taxa_prov(r, onodes, args.pr)
            # Keep the tree only if the helper returned fewer than
            # len(onodes) - 3 entries; otherwise the whole tree is treated
            # as missing (it would have 3 leaves or less).
            if len(nodes) < len(onodes) - 3:
                gtree.prune_taxa(nodes, update_bipartitions=False,
                                 suppress_unifurcations=True)
                gene_trees.append(gtree)
    elif args.mk == "byindividual":
        tagProbs = None
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            if not tagProbs:
                tagProbs = {}
                # One probability for each leaf, keyed by its taxon label.
                probs = truncated_normal(r, n=len(onodes), mean=args.pr,
                                         sd=args.ist, min=args.itmin,
                                         max=args.itmax)
                for leafi, leaf in enumerate(onodes):
                    tagProbs[leaf.taxon.label] = probs[leafi]
            nodes = remove_taxa_tagprobs(r, onodes, tagProbs)
            # Same retention rule as the "random" branch.
            if len(nodes) < len(onodes) - 3:
                gtree.prune_taxa(nodes, update_bipartitions=False,
                                 suppress_unifurcations=True)
                gene_trees.append(gtree)
    else:
        print("Yet unsupported option")

    # Write gene trees
    gene_trees.write(path=args.sd + "/" + folder + "/" + args.o,
                     schema="newick")
def read_trees(control_treeFile, tree_burnin=0, tree_sampleFreq=1,
               tree_maxNum=10, trait_ignoreMissing=True, **args):
    """Load a sample of trees and pack their node annotations into arrays.

    Trees are read from ``control_treeFile`` (NEXUS if the first line starts
    with ``#NEXUS``, Newick otherwise). After skipping ``tree_burnin`` trees,
    every ``tree_sampleFreq``-th tree is kept until ``tree_maxNum`` trees
    have been collected.

    Trait names and widths are discovered from the annotations of the FIRST
    kept tree only. A trait whose (first) value parses as ``float`` is
    stored as numeric; any other trait is treated as categorical and finally
    expanded to a one-hot array.

    Side effects on the returned trees: every visited node gets ``node.id``
    (its position in ``tre.nodes()``), leaves with an edge shorter than 1e-8
    are clamped to 1e-8, internal nodes with such an edge are removed and
    their children re-attached to the grandparent, and each kept node's
    annotations are replaced by a single ``id`` annotation.

    Args:
        control_treeFile: Path to the tree file.
        tree_burnin: Number of leading trees to skip.
        tree_sampleFreq: Keep one tree every this many trees after burn-in.
        tree_maxNum: Stop once this many trees have been kept.
        trait_ignoreMissing: If true, the categorical values ``'0'``, ``'-'``
            and ``''`` are coded as -1 (and so get no one-hot column);
            otherwise only ``''`` is.
        **args: Ignored.

    Returns:
        ``(data_trees, data_traits)`` where ``data_traits`` maps a trait name
        to ``[array, categories, [name]]``. Numeric arrays have shape
        ``[1, n_tree, n_node, width]`` and ``categories`` is ``None``;
        categorical arrays are one-hot with the category index on the last
        axis and ``categories`` maps value -> index.

    Raises:
        ValueError: if no tree was selected from the file.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    schema = 'newick'
    with open(control_treeFile) as fin:
        if fin.readline().upper().startswith('#NEXUS'):
            schema = 'nexus'

    data_trees = []
    tree_source = Tree.yield_from_files([control_treeFile], schema=schema)
    for tree_idx, tre in enumerate(tree_source):
        if (tree_idx >= tree_burnin
                and (tree_idx - tree_burnin) % tree_sampleFreq == 0):
            data_trees.append(tre)
            if len(data_trees) >= tree_maxNum:
                break
    if not data_trees:
        raise ValueError(
            'No tree selected from {0}'.format(control_treeFile))

    # Array sizes come from the first tree, before any node is collapsed.
    n_tree, n_node = len(data_trees), len(data_trees[0].nodes())
    data_traits = {
        'branch.length': [np.zeros(shape=[1, n_tree, n_node, 1]),
                          None, ['branch.length']],
    }

    # Discover traits (name, width, numeric vs. categorical).
    for node in data_trees[0].preorder_node_iter():
        for annotation in node.annotations:
            name, value = annotation.name, annotation.value
            if name in data_traits:
                continue
            is_list = isinstance(value, list)
            width = len(value) if is_list else 1
            try:
                float(value[0] if is_list else value)
                numeric = True
            except (TypeError, ValueError, IndexError):
                # Not parseable as a number (or an empty list): categorical.
                numeric = False
            if numeric:
                data_traits[name] = [
                    np.zeros(shape=[1, n_tree, n_node, width]), None, [name]]
            else:
                if trait_ignoreMissing:
                    categories = {'0': -1, '-': -1, '': -1}
                else:
                    categories = {'': -1}
                data_traits[name] = [
                    np.zeros(shape=[1, n_tree, n_node, width], dtype=int),
                    categories, [name]]

    # Fill the arrays tree by tree.
    for tid, tre in enumerate(data_trees):
        # tre.nodes() is materialised once, so nid keeps counting nodes that
        # are removed below; their array slots simply stay at zero.
        for nid, node in enumerate(tre.nodes()):
            node.id = nid
            # Test the parent first so a root without an edge length is
            # never compared against a float.
            if node.parent_node is not None and node.edge_length < 1e-8:
                if node.is_leaf():
                    node.edge_length = 1e-8
                else:
                    # Collapse the near-zero internal edge.
                    parent = node.parent_node
                    for child in node.child_nodes():
                        child._set_parent_node(parent)
                    parent.remove_child(node)
                    parent.set_child_nodes(
                        parent.child_nodes() + node.child_nodes())
                    continue
            data_traits['branch.length'][0][0, tid, nid, 0] = \
                node.edge_length if node.parent_node else 0.0
            for annotation in node.annotations:
                if annotation.name not in data_traits:
                    continue
                k, v = annotation.name, annotation.value
                if isinstance(v, string_types):
                    v = [v]
                values, categories = data_traits[k][0], data_traits[k][1]
                if categories is None:
                    values[0, tid, nid] = [float(vv) for vv in v]
                else:
                    for vv in v:
                        if vv not in categories:
                            # New category: next free non-negative code.
                            categories[vv] = max(categories.values()) + 1
                    values[0, tid, nid] = [categories[vv] for vv in v]
            node.annotations.clear()
            node.annotations.add_new('id', node.id)

    # One-hot encode categorical traits; codes < 0 get no column set.
    for key, states in data_traits.items():
        if states[1] is None:
            continue
        s = states[0]
        new_state = np.zeros(
            [s.shape[0], s.shape[1], s.shape[2], np.max(s) + 1], dtype=int)
        axis = np.where(s >= 0)[:3]
        new_state[axis[0], axis[1], axis[2], s[s >= 0]] = 1
        states[0] = new_state
    return data_trees, data_traits
def load_tree(self, treefile, burnIn=0, sampleFreq=1, maxNum=10,
              ignoreMissing=False, **args):
    """Read trees with their node annotations and wrap them as ``MetaTree``.

    The file is parsed as NEXUS when its first line starts with ``#NEXUS``
    and as Newick otherwise. Trees with index ``0..maxNum`` are scanned
    (no limit when ``maxNum <= 0``); of those, every ``sampleFreq``-th tree
    from index ``burnIn`` onwards is kept.

    Every node is given ``id`` (leaves: rank of the taxon label in sorted
    order; internal nodes: numbered per tree in post-order after the
    leaves), ``barcode`` (``2 ** id`` for leaves, sum of children for
    internal nodes) and ``age`` (root-to-tip distance at the seed node,
    parent age minus edge length elsewhere). Near-zero edges (< 1e-8) are
    then clamped on leaves and collapsed on internal nodes, and each node's
    annotations are replaced by a single ``id`` annotation.

    Each ``MetaTree`` gets its ``traits`` mapping filled with
    ``[data, names, categories]`` entries:

    * ``'branch'``          -- ``data`` of shape ``[2, n_node, 1]`` holding
      edge length (row 0) and age (row 1).
    * continuous traits     -- ``data`` of shape ``[1, n_node, width]``,
      NaN where a node has no value.
    * discrete traits       -- identical components are merged and ``data``
      is one-hot with shape ``[n_unique_components, n_node, n_categories]``
      (NaN where a node has no value); ``names`` pairs each original
      component name with the merged row it maps to.

    Args:
        treefile: Path to the tree file.
        burnIn: Index of the first tree eligible for sampling.
        sampleFreq: Keep one tree every this many trees after ``burnIn``.
        maxNum: Highest tree index scanned; ``<= 0`` disables the limit.
        ignoreMissing: If true, the discrete values ``"-"``, ``""`` and
            ``"0"`` are coded as -1, i.e. treated like an absent value.
        **args: Ignored.

    Returns:
        ``(taxa, trees, n_node)``: the label -> id mapping, the list of
        ``MetaTree`` objects and the largest node count over all trees.
    """
    # read trees (including traits when possible)
    with open(treefile) as fin:
        is_nexus = fin.readline().upper().startswith('#NEXUS')
    schema = 'nexus' if is_nexus else 'newick'

    data_trees = []
    tree_source = Tree.yield_from_files([treefile], schema=schema)
    for tree_idx, tre in enumerate(tree_source):
        if maxNum > 0 and tree_idx > maxNum:
            break
        if tree_idx >= burnIn:
            if not tre.label:
                tre.label = str(tree_idx)
            if (tree_idx - burnIn) % sampleFreq == 0:
                data_trees.append(tre)

    # find all tips and number them by sorted label
    labels = set()
    for tre in data_trees:
        for taxon in tre.taxon_namespace:
            labels.add(taxon.label)
    taxa = {label: idx for idx, label in enumerate(sorted(labels))}

    # load in metadata trait types
    n_taxa, n_node = len(taxa), 0
    # dtype='object' keeps the powers of two as arbitrary-precision ints.
    digit_code = np.power(2, np.arange(n_taxa, dtype='object'))
    trait_categories = {
        'branch.length': [1, 'continuous', None],
        'branch.age': [1, 'continuous', None],
    }
    for tre in data_trees:
        internal_id = n_taxa
        for node in tre.postorder_node_iter():
            for annotation in node.annotations:
                n, v = annotation.name, annotation.value
                if n not in trait_categories:
                    # The first value seen decides width and kind.
                    if isinstance(v, list):
                        width, probe = len(v), v[0]
                    else:
                        width, probe = 1, v
                    if isnum(probe):
                        trait_categories[n] = [width, 'continuous', None]
                    else:
                        trait_categories[n] = [width, 'discrete', {}]
                if trait_categories[n][1] == 'discrete':
                    # Collect every distinct value; codes are assigned later.
                    seen = trait_categories[n][2]
                    for vv in (v if isinstance(v, list) else [v]):
                        seen[vv] = 1
            if node.is_leaf():
                node.id = taxa[node.taxon.label]
                node.barcode = digit_code[node.id]
            else:
                node.id, internal_id = internal_id, internal_id + 1
                node.barcode = sum(c.barcode for c in node.child_nodes())
        if internal_id > n_node:
            n_node = internal_id
        tre.seed_node.age = tre.seed_node.distance_from_tip()
        for node in tre.preorder_node_iter():
            if node.parent_node:
                node.age = node.parent_node.age - node.edge_length

    # convert traits into discrete characters
    for tc in trait_categories.values():
        if tc[1] != 'discrete':
            continue
        if ignoreMissing:
            # Must happen before numbering so these values are filtered out.
            tc[2].update({"-": -1, "": -1, "0": -1})
        observed = sorted(k for k, v in tc[2].items() if v > 0)
        tc[2].update({k: code for code, k in enumerate(observed)})

    # read traits' values
    trees = []
    for tre in data_trees:
        for node in tre.nodes():
            # Test the parent first so a root without an edge length is
            # never compared against a float.
            if node.parent_node is not None and node.edge_length < 1e-8:
                if node.is_leaf():
                    node.edge_length = 1e-8
                else:
                    # Collapse the near-zero internal edge.
                    parent = node.parent_node
                    for child in node.child_nodes():
                        child._set_parent_node(parent)
                    parent.remove_child(node)
                    parent.set_child_nodes(
                        parent.child_nodes() + node.child_nodes())

        mt = MetaTree(tre)
        trees.append(mt)
        mt.traits['branch'] = [
            np.empty([2, n_node, 1]), [['length', 0], ['age', 1]], None
        ]
        for node in tre.postorder_node_iter():
            mt.traits['branch'][0][:, node.id, 0] = [node.edge_length,
                                                     node.age]
            for annotation in node.annotations:
                k, v = annotation.name, annotation.value
                tck = trait_categories[k]
                if tck[1] == 'continuous':
                    if k not in mt.traits:
                        mt.traits[k] = [
                            np.empty([1, n_node, tck[0]]),
                            [[k + ':0', 0]], None
                        ]
                        mt.traits[k][0].fill(np.nan)
                    mt.traits[k][0][0, node.id, :] = v
                else:
                    if k not in mt.traits:
                        mt.traits[k] = [
                            np.zeros([tck[0], n_node], dtype=int),
                            [['{0}:{1}'.format(k, i), i]
                             for i in np.arange(tck[0])],
                            tck[2]
                        ]
                        mt.traits[k][0].fill(-1)
                    mt.traits[k][0][:, node.id] = \
                        np.vectorize(tck[2].get)(v)
            node.annotations.clear()
            node.annotations.add_new('id', node.id)

        for k, v in mt.traits.items():
            if v[2] is None:
                continue
            # Sort the component rows so identical rows become adjacent,
            # then keep the first row of every run.
            ids = np.lexsort(v[0].T)
            d = v[0][ids]
            uniq_ids = np.concatenate([[1], np.sum(d[:-1] != d[1:], 1)])
            d = d[uniq_ids > 0]
            data = np.ones(
                [d.shape[0], d.shape[1], max(v[2].values()) + 1],
                dtype=float)
            data.fill(np.nan)
            axis = np.where(d >= 0)
            data[axis[0], axis[1], :] = 0
            data[axis[0], axis[1], d[d >= 0]] = 1
            v[0] = data
            # Merged-row index of each SORTED row ...
            merged_of_sorted = np.cumsum(uniq_ids > 0) - 1
            # ... mapped back to the ORIGINAL row order. ``argsort`` yields
            # the inverse of the sort permutation; indexing with ``ids``
            # itself would apply the permutation a second time.
            merged_of_original = merged_of_sorted[np.argsort(ids)]
            v[1] = [[name, i]
                    for (name, oi), i in zip(v[1], merged_of_original)]
    return taxa, trees, n_node