Exemple #1
0
        def load_tree(self, consFile=None, **args):
            """Load the first tree of ``consFile`` and tabulate per-node support.

            The schema is ``'nexus'`` when the first line of the file starts
            with ``#NEXUS`` (case-insensitive) and ``'newick'`` otherwise.
            Every node of the tree receives three attributes:

            * ``id`` -- for leaves, the value stored under the taxon label in
              ``self.data['taxa']``; internal nodes are numbered in post-order
              starting at ``len(self.data['taxa'])``.
            * ``barcode`` -- ``2 ** id`` for leaves, the sum of the children's
              barcodes for internal nodes (arbitrary-precision Python ints).
            * ``age`` -- ``distance_from_tip()`` for the seed node, and the
              parent's age minus the node's edge length elsewhere.

            Args:
                consFile: path of the tree file to read.
                **args: forwarded unchanged to ``parse_trees``.
                    NOTE(review): ``parse_trees`` presumably returns the number
                    of sampled trees and a mapping from clade barcode to the
                    matching branches -- confirm against its definition.

            Returns:
                A list with one ``[barcode, support, branches, age,
                edge_length]`` entry per node, in pre-order, where ``branches``
                is ``n_branch.get(barcode, [])`` and ``support`` is
                ``len(branches) / n_tree`` as a float.

            Raises:
                ValueError: if ``consFile`` does not contain any tree.
            """
            n_tree, n_branch = parse_trees(**args)

            with open(consFile) as fin:
                first_line = fin.readline()
            schema = 'nexus' if first_line.upper().startswith('#NEXUS') else 'newick'

            # Only the first tree is used.  Fetch it explicitly so that an
            # empty file yields a clear error instead of a NameError later on.
            tre = next(iter(Tree.yield_from_files([consFile], schema=schema)), None)
            if tre is None:
                raise ValueError('no tree found in {0!r}'.format(consFile))

            taxa = self.data['taxa']
            internal_id = n_taxa = len(taxa)
            # dtype='object' keeps the powers of two as Python ints, so the
            # barcodes do not overflow for large numbers of taxa.
            digit_code = np.power(2, np.arange(n_taxa, dtype='object'))

            # Post-order guarantees children carry a barcode before their parent.
            for node in tre.postorder_node_iter():
                if node.is_leaf():
                    node.id = taxa[node.taxon.label]
                    node.barcode = digit_code[node.id]
                else:
                    node.id, internal_id = internal_id, internal_id + 1
                    node.barcode = sum(c.barcode for c in node.child_nodes())

            # Pre-order guarantees the parent's age is set before the child's.
            tre.seed_node.age = tre.seed_node.distance_from_tip()
            for node in tre.preorder_node_iter():
                if node.parent_node:
                    node.age = node.parent_node.age - node.edge_length

            rows = []
            for n in tre.preorder_node_iter():
                branches = n_branch.get(n.barcode, [])
                # float() forces true division; with two ints Python 2 would
                # otherwise truncate the support fraction.
                support = float(len(branches)) / n_tree
                rows.append([n.barcode, support, branches, n.age, n.edge_length])
            return rows
def main(folder=None, seed=None):
    """Randomly drop leaves from the gene trees of ``folder`` and save the result.

    Reads every ``g_trees*.trees`` Newick file under ``args.sd/folder``,
    removes a random subset of leaves from each tree according to the
    strategy named by the module-level ``args.mk``, and writes the retained
    trees to ``args.sd/folder/args.o`` in Newick format.

    Strategies:
        ``"random"``: the leaves to remove are chosen by
            ``remove_taxa_prov(r, leaves, args.pr)``.
        ``"byindividual"``: one removal probability per leaf label is drawn
            once via ``truncated_normal`` (using ``args.pr``, ``args.ist``,
            ``args.itmin``, ``args.itmax``) from the first tree that yields a
            non-empty table, and reused for all later trees through
            ``remove_taxa_tagprobs``.
        anything else: a message is printed and an empty tree list is written.

    A tree is kept only when ``len(removed) < len(leaves) - 3``; other trees
    are skipped entirely.

    Args:
        folder: name of the sub-directory of ``args.sd`` to process.
        seed: seed handed to ``numpy.random.RandomState``.
    """
    # Format first, then print.  Applying ``%`` to the result of ``print(...)``
    # only works with the Python 2 print statement.
    print("Folder %s, seed %s" % (folder, seed))
    r = numpy.random.RandomState(seed)
    gene_trees = TreeList()
    taxa = dendropy.TaxonNamespace()
    treefiles = glob.glob(args.sd + "/" + folder + "/g_trees*.trees")
    tree_yielder = Tree.yield_from_files(
        files=treefiles,
        schema="newick",
        rooting="default-rooted",
        preserve_underscores=True,
        taxon_namespace=taxa)

    def prune_and_keep(gtree, onodes, nodes):
        # Keep the tree only when enough leaves survive; otherwise the whole
        # tree is treated as missing and dropped.
        if len(nodes) < len(onodes) - 3:
            gtree.prune_taxa(nodes, update_bipartitions=False,
                             suppress_unifurcations=True)
            gene_trees.append(gtree)

    # Modify gene trees
    if args.mk == "random":
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            prune_and_keep(gtree, onodes, remove_taxa_prov(r, onodes, args.pr))
    elif args.mk == "byindividual":
        tagProbs = None
        for gtree in tree_yielder:
            onodes = gtree.leaf_nodes()
            if not tagProbs:
                # One probability per leaf, keyed by the leaf's taxon label.
                probs = truncated_normal(r, n=len(onodes), mean=args.pr,
                                         sd=args.ist, min=args.itmin,
                                         max=args.itmax)
                tagProbs = {leaf.taxon.label: prob
                            for leaf, prob in zip(onodes, probs)}
            prune_and_keep(gtree, onodes,
                           remove_taxa_tagprobs(r, onodes, tagProbs))
    else:
        print("Yet unsupported option")

    # Write gene trees
    gene_trees.write(path=args.sd + "/" + folder + "/" + args.o, schema="newick")
Exemple #3
0
def read_trees(control_treeFile, tree_burnin=0, tree_sampleFreq=1, tree_maxNum=10, trait_ignoreMissing=True, **args):
    """Sample trees from a file and collect their node annotations as arrays.

    The schema is ``'nexus'`` when the first line of ``control_treeFile``
    starts with ``#NEXUS`` (case-insensitive), otherwise ``'newick'``.

    Args:
        control_treeFile: path of the tree file.
        tree_burnin: trees whose index is below this value are ignored.
        tree_sampleFreq: after burn-in, every ``tree_sampleFreq``-th tree is kept.
        tree_maxNum: reading stops once this many trees have been kept.
        trait_ignoreMissing: when true, the discrete values ``'0'``, ``'-'``
            and ``''`` are coded as ``-1``; otherwise only ``''`` is.
        **args: accepted and ignored.

    Returns:
        ``(data_trees, data_traits)``.  ``data_trees`` is the list of kept
        trees; every node has an integer ``id`` and its annotations replaced
        by a single ``id`` annotation.  ``data_traits`` maps a trait name to
        ``[values, categories, [name]]``:

        * continuous traits: ``values`` is a float array of shape
          ``(1, n_tree, n_node, width)`` and ``categories`` is ``None``;
        * discrete traits: ``categories`` maps each observed value to its
          code and ``values`` is a 0/1 int array whose last axis is indexed
          by category code.

        Trait names and types are taken from the annotations of the first
        kept tree only, and ``n_node`` is the node count of that tree.

    Raises:
        IndexError: if no tree was kept.
    """
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str

    data_trees = []
    schema = 'newick'
    with open(control_treeFile) as fin:
        if fin.readline().upper().startswith('#NEXUS'):
            schema = 'nexus'
    for tree_idx, tre in enumerate(Tree.yield_from_files([control_treeFile], schema=schema)):
        if tree_idx >= tree_burnin:
            if (tree_idx - tree_burnin) % tree_sampleFreq == 0:
                data_trees.append(tre)
            if len(data_trees) >= tree_maxNum:
                break

    n_tree, n_node = len(data_trees), len(data_trees[0].nodes())
    data_traits = {'branch.length': [np.zeros(shape=[1, n_tree, n_node, 1]), None, ['branch.length']]}

    def missing_codes():
        # A fresh dict per trait: each discrete trait grows its own table.
        return {'0': -1, '-': -1, '': -1} if trait_ignoreMissing else {'': -1}

    # Infer each trait's width and type from the first kept tree.
    for node in data_trees[0].preorder_node_iter():
        for annotation in node.annotations:
            name, value = annotation.name, annotation.value
            if name in data_traits:
                continue
            is_list = isinstance(value, list)
            width = len(value) if is_list else 1
            try:
                float(value[0] if is_list else value)
            except (TypeError, ValueError, IndexError):
                # Not convertible to a number: treat the trait as discrete.
                data_traits[name] = [np.zeros(shape=[1, n_tree, n_node, width], dtype=int),
                                     missing_codes(), [name]]
            else:
                data_traits[name] = [np.zeros(shape=[1, n_tree, n_node, width]), None, [name]]

    for tid, tre in enumerate(data_trees):
        for nid, node in enumerate(tre.nodes()):
            node.id = nid
            # A missing edge length is handled like a too-short one, which is
            # what Python 2's ``None < 1e-8`` ordering did implicitly.
            too_short = node.edge_length is None or node.edge_length < 1e-8
            if too_short and node.parent_node is not None:
                if node.is_leaf():
                    node.edge_length = 1e-8
                else:
                    # Collapse the near-zero internal branch: re-attach the
                    # children to the grandparent and skip this node.
                    parent = node.parent_node
                    for child in node.child_nodes():
                        child._set_parent_node(parent)
                    parent.remove_child(node)
                    parent.set_child_nodes(parent.child_nodes() + node.child_nodes())
                    continue
            data_traits['branch.length'][0][0, tid, nid, 0] = node.edge_length if node.parent_node else 0.0

            for annotation in node.annotations:
                if annotation.name in data_traits:
                    k, v = annotation.name, annotation.value
                    if isinstance(v, string_types):
                        v = [v]
                    values, categories = data_traits[k][0], data_traits[k][1]
                    if categories is None:
                        values[0, tid, nid] = [float(vv) for vv in v]
                    else:
                        # Unseen values get the next free non-negative code.
                        for vv in v:
                            if vv not in categories:
                                categories[vv] = max(categories.values()) + 1
                        values[0, tid, nid] = [categories[vv] for vv in v]
            node.annotations.clear()
            node.annotations.add_new('id', node.id)

    # Expand discrete codes into indicator arrays; negative (missing) codes
    # leave their row all zero.
    for key, states in data_traits.items():
        if states[1] is not None:
            s = states[0]
            new_state = np.zeros([s.shape[0], s.shape[1], s.shape[2], np.max(s) + 1], dtype=int)
            axis = np.where(s >= 0)[:3]
            new_state[axis[0], axis[1], axis[2], s[s >= 0]] = 1
            data_traits[key][0] = new_state
    return data_trees, data_traits
Exemple #4
0
    def load_tree(self,
                  treefile,
                  burnIn=0,
                  sampleFreq=1,
                  maxNum=10,
                  ignoreMissing=False,
                  **args):  # sumtrees file ASTRID
        """Read trees from ``treefile`` and wrap each sampled tree in a ``MetaTree``.

        Args:
            treefile: path of the tree file.  The schema is ``'nexus'`` when
                its first line starts with ``#NEXUS`` (case-insensitive),
                otherwise ``'newick'``.
            burnIn: trees whose index is below this value are skipped.
            sampleFreq: after burn-in, every ``sampleFreq``-th tree is kept.
            maxNum: when positive, reading stops as soon as the tree index
                exceeds ``maxNum``.
            ignoreMissing: when true, the discrete values ``"-"``, ``""`` and
                ``"0"`` are coded as ``-1``.
            **args: not used in this method.

        Returns:
            ``(taxa, trees, n_node)``: ``taxa`` maps each taxon label to its
            rank in sorted label order, ``trees`` is the list of ``MetaTree``
            objects, and ``n_node`` is the largest node-id counter reached
            over all sampled trees.

        NOTE: uses ``dict.iteritems``, which exists only in Python 2.
        """
        # read trees (including traits when possible)
        data_trees = []
        with open(treefile) as fin:
            schema = 'nexus' if fin.readline().upper().startswith(
                '#NEXUS') else 'newick'

        for id, tre in enumerate(
                Tree.yield_from_files([treefile], schema=schema)):
            # NOTE(review): this compares the tree index, not the number of
            # trees kept so far, against maxNum -- confirm that is intended.
            if maxNum > 0 and id > maxNum: break
            if id >= burnIn:
                if not tre.label:
                    tre.label = str(id)
                if (id - burnIn) % sampleFreq == 0:
                    data_trees.append(tre)
        # find all tips
        taxa = {}
        for tre in data_trees:
            for taxon in tre.taxon_namespace:
                taxa[taxon.label] = 1
        # replace the placeholder values with each label's sorted rank
        for id, taxon in enumerate(sorted(taxa.keys())):
            taxa[taxon] = id
        # load in metadata trait types
        n_taxa, n_tree, n_node = len(taxa), len(data_trees), 0
        # dtype='object' keeps the powers of two as Python ints
        digit_code = np.power(2, np.arange(n_taxa, dtype='object'))
        # trait name -> [width, 'continuous' | 'discrete', value table or None]
        trait_categories = {
            'branch.length': [1, 'continuous', None],
            'branch.age': [1, 'continuous', None]
        }
        for tre in data_trees:
            internal_id = n_taxa
            for node in tre.postorder_node_iter():
                for annotation in node.annotations:
                    n, v = annotation.name, annotation.value
                    # the first occurrence of a trait fixes its width and type
                    if annotation.name not in trait_categories:
                        if isinstance(v, list):
                            trait_categories[n] = [
                                len(v), 'continuous', None
                            ] if isnum(v[0]) else [len(v), 'discrete', {}]
                        else:
                            trait_categories[n] = [
                                1, 'continuous', None
                            ] if isnum(v) else [1, 'discrete', {}]
                    # record every value observed for a discrete trait
                    if trait_categories[n][1] == 'discrete':
                        if isinstance(v, list):
                            for vv in v:
                                trait_categories[n][2][vv] = 1
                        else:
                            trait_categories[n][2][v] = 1
                # leaves take their taxon index; internal nodes are numbered
                # from n_taxa upwards in post-order
                if node.is_leaf():
                    node.id = taxa[node.taxon.label]
                    node.barcode = digit_code[node.id]
                else:
                    node.id, internal_id = internal_id, internal_id + 1
                    node.barcode = sum([c.barcode for c in node.child_nodes()])
            if internal_id > n_node: n_node = internal_id
            # root age is its distance from a tip; each child's age is the
            # parent's age minus the child's edge length
            tre.seed_node.age = tre.seed_node.distance_from_tip()
            for node in tre.preorder_node_iter():
                if node.parent_node:
                    node.age = node.parent_node.age - node.edge_length

        # convert traits into discrete characters
        for cc, tc in trait_categories.iteritems():
            if tc[1] == 'discrete':
                if ignoreMissing:
                    tc[2].update({"-": -1, "": -1, "0": -1})
                # values still marked positive are numbered 0.. in sorted
                # order; values set to -1 above keep that code
                tc[2].update(
                    dict([[k, id] for id, k in enumerate(
                        sorted([k for k, v in tc[2].iteritems() if v > 0]))]))
        # read traits' values
        trees = []
        for tre in data_trees:
            for node in tre.nodes():
                # near-zero branches: leaves are given a length of 1e-8,
                # internal nodes are removed and their children re-attached
                # to the parent
                if node.edge_length < 1e-8 and node.parent_node is not None:
                    if node.is_leaf():
                        node.edge_length = 1e-8
                    else:
                        parent = node.parent_node
                        for child in node.child_nodes():
                            child._set_parent_node(parent)
                        parent.remove_child(node)
                        parent.set_child_nodes(parent.child_nodes() +
                                               node.child_nodes())
            trees.append(MetaTree(tre))
            mt = trees[-1]
            # row 0 holds edge lengths, row 1 holds ages, indexed by node id
            mt.traits['branch'] = [
                np.empty([2, n_node, 1]), [['length', 0], ['age', 1]], None
            ]
            for node in tre.postorder_node_iter():
                mt.traits['branch'][0][:, node.id,
                                       0] = [node.edge_length, node.age]
                for annotation in node.annotations:
                    k, v = annotation.name, annotation.value

                    tck = trait_categories[k]
                    if tck[1] == 'continuous':
                        # created on first use and pre-filled with NaN
                        if k not in mt.traits:
                            mt.traits[k] = [
                                np.empty([1, n_node, tck[0]]), [[k + ':0', 0]],
                                None
                            ]
                            mt.traits[k][0].fill(np.nan)
                        mt.traits[k][0][0, node.id, :] = v
                    else:
                        # created on first use and pre-filled with -1; one row
                        # per trait component, one column per node id
                        if k not in mt.traits:
                            mt.traits[k] = [
                                np.zeros([tck[0], n_node], dtype=int),
                                [['{0}:{1}'.format(k, id), id]
                                 for id in np.arange(tck[0])], tck[2]
                            ]
                            mt.traits[k][0].fill(-1)

                        # translate the raw value(s) into category codes
                        mt.traits[k][0][:,
                                        node.id] = np.vectorize(tck[2].get)(v)
                # the annotations are replaced by a single 'id' annotation
                node.annotations.clear()
                node.annotations.add_new('id', node.id)
            for k, v in mt.traits.iteritems():
                # only discrete traits carry a value table in v[2]
                if v[2] is not None:
                    # sort the rows of the code matrix, then keep only rows
                    # that differ from the preceding sorted row
                    ids = np.lexsort(v[0].T)
                    d = v[0][ids]
                    uniq_ids = np.concatenate([[1],
                                               np.sum(d[:-1] != d[1:], 1)])
                    d = d[uniq_ids > 0]

                    # indicator array over category codes: NaN where the code
                    # is negative, otherwise 0 with a 1 at the code's index
                    data = np.ones(
                        [d.shape[0], d.shape[1],
                         np.max(v[2].values()) + 1],
                        dtype=float)
                    data.fill(np.nan)
                    axis = np.where(d >= 0)
                    data[axis[0], axis[1], :] = 0
                    data[axis[0], axis[1], d[d >= 0]] = 1
                    v[0] = data
                    # dd[j] is the deduplicated row number of sorted row j
                    dd = []
                    mat_id = -1
                    for i in uniq_ids:
                        if i > 0:
                            mat_id += 1
                        dd.append(mat_id)
                    # NOTE(review): the new row numbers are indexed with the
                    # sort permutation `ids`; confirm this maps back to the
                    # original row order (an inverse permutation may be needed)
                    v[1] = [[name, i]
                            for (name, oi), i in zip(v[1],
                                                     np.array(dd)[ids])]
        return taxa, trees, n_node