def naive_monte_carlo(data, depth=15, log_bad_trees=True):
    """Rejection-sample random trees until one reproduces ``data``.

    This is an infinite generator. Each iteration draws a random tree and
    simulates observed data on it. A draw is rejected (``None`` is yielded)
    when the tree has the wrong number of leaves or when the multiset of
    simulated leaf data differs from the multiset of values in ``data``.
    On success the leaves are renamed to the keys of ``data`` whose values
    they reproduce, and the tree is yielded.

    Args:
        data: Mapping from language name to its observed data (any iterable
            that can be turned into a tuple).
        depth: Depth passed on to ``random_tree``.
        log_bad_trees: When true, print a diagnostic for every rejected tree.

    Yields:
        ``None`` for each rejected sample, otherwise the accepted tree with
        relabelled leaves.

    Raises:
        RuntimeError: If a leaf cannot be matched to an unused entry of
            ``data`` even though the multisets agreed.
    """
    # Normalise values to tuples once so they are hashable and comparable
    # regardless of the concrete sequence type used by the caller.
    wanted = {name: tuple(values) for name, values in data.items()}
    data_as_counter = collections.Counter(wanted.values())

    while True:
        tree = random_tree(depth, root=newick.Node('s'))
        if len(tree.get_leaves()) != len(data):
            if log_bad_trees:
                print("Wrong number of leaves: {:}".format(tree.newick))
            yield None
            continue

        s_data = random_observed_data(tree)
        s_data_as_counter = collections.Counter(
            tuple(x) for x in s_data.values())
        if data_as_counter != s_data_as_counter:
            # Honour the logging switch here as well; previously this
            # message was printed unconditionally.
            if log_bad_trees:
                print("Not generating the right data: {:}".format(
                    tree.newick))
            yield None
            continue

        # Give every simulated leaf the name of a not-yet-used language
        # carrying the same data.
        used = set()
        for s_l in tree.get_leaves():
            simulated = tuple(s_data[s_l.name])
            for name, values in wanted.items():
                if name not in used and values == simulated:
                    s_l.name = name
                    used.add(name)
                    break
            else:
                raise RuntimeError(
                    "No unused language matches the data simulated for a leaf")
        yield tree
def random_tree(depth=15, split_on="🂱🂡🃁🃑", root=None):
    """Grow a random binary tree below ``root`` by drawing cards.

    For every remaining unit of depth the branch leading to ``root`` is
    lengthened by one and a card is drawn. If the card is one of
    ``split_on``, two children (named after the parent plus ``"l"`` /
    ``"r"``) are attached and grown recursively with the remaining depth;
    otherwise the same node simply keeps growing.

    Args:
        depth: Number of growth steps still available.
        split_on: Collection of cards that trigger a split.
        root: Node to grow; a fresh node named ``'0'`` is created if omitted.

    Returns:
        The (possibly newly created) ``root`` node.
    """
    if root is None:
        root = newick.Node('0')

    steps_left = depth
    while steps_left > 0:
        root.length += 1
        steps_left -= 1
        if draw_card() not in split_on:
            # No split: keep extending the same branch.
            continue
        # Split: left subtree is fully grown before the right one is created.
        for suffix in ("l", "r"):
            child = newick.Node(root.name + suffix)
            root.add_descendant(child)
            random_tree(steps_left, split_on, root=child)
        break
    return root
def get_newick_node(self, label_name):
    """Convert this node and its subtree into ``newick.Node`` objects.

    Args:
        label_name: Key looked up in ``self.dico`` to obtain the node label.

    Returns:
        A ``newick.Node`` labelled with ``str(self.dico[label_name])``, or
        ``"-"`` when the key is absent, whose descendants are the converted
        children of this node, in order.
    """
    label = str(self.dico[label_name]) if label_name in self.dico else "-"
    result = newick.Node(label)
    for child in self.children:
        # Use add_descendant instead of appending to ``descendants`` directly
        # so the child node is also linked back to its parent.
        result.add_descendant(child.get_newick_node(label_name))
    return result
def create_balanced_random_tree(taxa, branch_length=random.random):
    """Generate a random tree with roughly balanced node heights.

    Every taxon becomes a leaf whose branch length is drawn from
    ``branch_length``. The two lowest subtrees are then repeatedly joined
    under a new parent, which gets a freshly drawn branch length, until a
    single tree remains.

    Args:
        taxa: Iterable of leaf names.
        branch_length: Zero-argument callable returning a branch length.
            Defaults to ``random.random``. (The default used to be the
            *result* of ``random.random()``, a float, which made every call
            relying on it fail with ``TypeError``.)

    Returns:
        The root ``newick.Node`` of the generated tree.

    Raises:
        IndexError: If ``taxa`` is empty.
    """
    nodes = [
        newick.Node(name=taxon,
                    length=branch_length(),
                    length_parser=float,
                    length_formatter="{:f}".format)
        for taxon in taxa
    ]
    nodes.sort(key=lambda node: node.length)
    # ``heights`` runs parallel to ``nodes`` and is kept in ascending order.
    heights = [node.length for node in nodes]

    while len(nodes) > 1:
        # Take the two lowest nodes
        node0, node1 = nodes[0], nodes[1]
        height = (heights[0] + heights[1]) / 2
        # Keep the rest
        nodes = nodes[2:]
        heights = heights[2:]

        # Stick a new node on top of those lowest nodes; its height is the
        # mean height of the children plus its own branch length.
        new_branch_length = branch_length()
        height += new_branch_length
        tree = newick.Node(length=new_branch_length,
                           length_parser=float,
                           length_formatter="{:f}".format)
        tree.add_descendant(node0)
        tree.add_descendant(node1)

        # Put the new subtree in the right place in height order
        i = bisect.bisect(heights, height)
        heights.insert(i, height)
        nodes.insert(i, tree)
    return nodes[0]
def create_random_tree(taxa, branch_length=lambda: random.random()):
    """Generate a random tree topology.

    This is a re-implementation of the random tree generator from lingpy.
    The taxa are shuffled and turned into leaves; then the last two clades
    are repeatedly joined under a new parent and the clade list is
    reshuffled, until only one clade is left.

    Args:
        taxa: Iterable of taxa; each is converted with ``str`` for its name.
        branch_length: Zero-argument callable giving a branch length, which
            is stored on the node as a string.

    Returns:
        The root ``newick.Node`` of the generated tree.
    """
    shuffled = list(taxa)
    random.shuffle(shuffled)
    clades = [
        newick.Node(str(taxon), length=str(branch_length()))
        for taxon in shuffled
    ]

    while len(clades) > 1:
        last = clades.pop()
        second_to_last = clades.pop()
        joined = newick.Node.create(
            length=str(branch_length()),
            descendants=[last, second_to_last])
        clades.insert(0, joined)
        random.shuffle(clades)
    return clades[0]
def get_distances(file):
    """Compute pairwise leaf distances for the tree on the first line of a file.

    The first line of ``file`` is parsed as a Newick-like string with
    alphabetic leaf names and integer branch lengths. Leaves are recognised
    either as a pair ``A:1,B:2`` or as a single trailing sibling ``,A:1)``;
    anything else between a recognised leaf pair and the next ``)`` is
    skipped.

    Args:
        file: Path of the file whose first line holds the tree.

    Returns:
        Dict mapping ``"species1,species2"`` to the stringified result of
        ``leafnode1.summed_distance_to(leafnode2)`` for every ordered pair of
        leaves, including each leaf with itself.

    Raises:
        ValueError: If the tree string cannot be parsed (for example it is
            empty or lacks the terminating ``;``). Such input previously
            made the parser loop forever.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(file) as handle:
        tree = handle.readline().strip()

    branch_re = re.compile(r":(\d+)")
    branch_tail_re = re.compile(r"[:\d]+")
    single_leaf_re = re.compile(r",([A-Za-z]+):(\d+)\)")
    double_leaf_re = re.compile(r"([A-Za-z]+):(\d+),([A-Za-z]+):(\d+)")

    def open_inner_node():
        # Create an inner node below the innermost open node, if any.
        node = newick.Node()
        if ancestral_nodes:
            node.parent = ancestral_nodes[-1]
        ancestral_nodes.append(node)

    def new_leaf(name, distance):
        leaf = newick.Node()
        leaf.parent = ancestral_nodes[-1]
        leaf.distance_to_parent = distance
        leaves[name] = leaf

    def skip_to_closing_paren(text):
        position = text.find(")")
        if position == -1:
            raise ValueError(
                "Unbalanced tree, missing ')' in: {!r}".format(text))
        return text[position:]

    ancestral_nodes = []  # stack of currently open inner nodes
    leaves = {}
    while True:
        # END OF TREE: semicolon
        if tree.startswith(";"):
            break

        # START INNER NODE
        if tree.startswith("("):
            tree = tree[1:]
            open_inner_node()
            continue

        # END INNER NODE
        if tree.startswith(")"):
            tree = tree[1:]
            branch = branch_re.match(tree)
            if branch:
                ancestral_nodes[-1].distance_to_parent = branch.group(1)
            # Drop the branch-length annotation following the ')'.
            tail = branch_tail_re.match(tree)
            if tail:
                tree = tree[tail.end():]
            ancestral_nodes.pop(-1)
            continue

        # OUTER NODE SINGLE
        single = single_leaf_re.match(tree)
        if single:
            new_leaf(*single.groups())
            # Leave the ')' that ended the match for the next iteration.
            tree = tree[single.end() - 1:]
            continue

        # OUTER NODE DOUBLE
        double = double_leaf_re.match(tree)
        if double:
            name1, distance1, name2, distance2 = double.groups()
            new_leaf(name1, distance1)
            new_leaf(name2, distance2)
            tree = skip_to_closing_paren(tree)
            continue

        # INTERNAL INNER NODE
        if tree.startswith(",("):
            tree = tree[2:]
            open_inner_node()
            continue

        if tree.startswith(","):
            tree = tree[1:]
            continue

        # Nothing consumed any input: fail instead of spinning forever.
        raise ValueError("Cannot parse tree at: {!r}".format(tree))

    distances = {}
    # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
    for species1, leafnode1 in leaves.items():
        for species2, leafnode2 in leaves.items():
            distances[species1 + "," + species2] = str(
                leafnode1.summed_distance_to(leafnode2))
    return distances
# Per concept, count how often each logged index value occurs. Every concept
# gets a vector of six integer counters, addressed by the logged value.
# NOTE(review): `scipy.zeros` is a NumPy alias that newer SciPy releases may
# no longer provide — confirm against the pinned SciPy version.
c = collections.defaultdict(lambda: scipy.zeros(6, int))
with Path("../beastling/indexes.log").open() as indices:
    for line in csv.DictReader(indices, delimiter="\t"):
        # NOTE: `id` shadows the builtin of the same name within this loop.
        for id, index in line.items():
            if id == "Sample":
                # The "Sample" column is skipped; it is not a concept column.
                continue
            # Column headers end in ":<concept>"; only that last part is used.
            _, concept = id.rsplit(":", 1)
            c[concept][int(index)] += 1


def distance(d1, d2):
    """Return the chi-square statistic between two count vectors.

    Both vectors are incremented by one before the test, so no entry passed
    to ``scipy.stats.chisquare`` is zero.
    """
    return (scipy.stats.chisquare(d1 + 1, d2 + 1).statistic)


# One leaf node per concept, mapped to that concept's count vector.
nodes = {newick.Node(n): d for n, d in c.items()}
# Nodes that have been removed from `nodes`, with their count vectors.
old_nodes = {}
# Distance for every unordered pair of leaf nodes.
distances = {(n1, n2): distance(d1, d2)
             for (n1, d1), (n2, d2) in itertools.combinations(nodes.items(), 2)}
while len(nodes) > 1:
    # Pick the closest remaining pair and retire both of its nodes.
    argmin = min(distances, key=distances.get)
    d0 = nodes.pop(argmin[0])
    d1 = nodes.pop(argmin[1])
    old_nodes[argmin[0]] = d0
    old_nodes[argmin[1]] = d1
    d = distances.pop(argmin)
    # Each of the two nodes gets half of their mutual distance as its length.
    argmin[0].length = d / 2
    argmin[1].length = d / 2
    # NOTE(review): as visible here the loop body ends right after creating
    # `n`; nothing adds a merged node back to `nodes` or updates `distances`,
    # so the rest of the loop is presumably outside this excerpt — confirm.
    n = newick.Node(None)
def as_newick(self, seq_id_to_metadata: Dict[msa.SequenceID, graph.SequenceMetadata] = None, separate_leaves=False) -> str:
    """Returns Affinity Tree in Newick format.

    Every node is emitted as ``<id>:<length>[&&NHX:...]``, where the NHX
    block carries a display name and the node's ``mincomp`` (plus ``group``
    and ``seqid`` for single-sequence nodes when metadata is supplied).

    Args:
        seq_id_to_metadata: Dictionary mapping sequence IDs to the metadata
            of that sequence. The keys ``"name"`` and ``"group"`` are read
            from it for nodes holding exactly one sequence; a missing key
            yields an empty string, and an empty name falls back to the
            sequence ID.
        separate_leaves: A switch to control whether tree leaves that have
            multiple sequences assigned should get one appended singleton
            child leaf per sequence.

    Returns:
        A string with the Affinity Tree converted to newick format
        (https://en.wikipedia.org/wiki/Newick_format), wrapped in one extra
        pair of parentheses. If the tree has no nodes, an empty string is
        returned.
    """

    def _get_sequence_attr_if_exists(seq_metadata: graph.SequenceMetadata, attr: str) -> str:
        """Returns the stringified value for key attr, or "" if it is absent."""
        if attr in seq_metadata:
            return str(seq_metadata[attr])
        else:
            return ""

    def _newick_nhx(newick_node: newick.Node) -> str:
        """Converts newick tree to newick string with NHX annotations."""
        node_label = newick_node.name or ''
        if newick_node._length:
            # Linear search for the affinity node this newick node was built
            # from; node names are the stringified affinity node ids.
            for cn in sorted_nodes:
                if str(cn.id_) == newick_node.name:
                    if seq_id_to_metadata:
                        if len(cn.sequences) == 1:
                            name = _get_sequence_attr_if_exists(
                                seq_id_to_metadata[cn.sequences[0]], "name")
                            if name == "":
                                name = cn.sequences[0]
                            group = _get_sequence_attr_if_exists(
                                seq_id_to_metadata[cn.sequences[0]], "group")
                            seqid = cn.sequences[0]
                            metadata = f"[&&NHX:name={name}:group={group}:seqid={seqid}:mincomp={cn.mincomp}]"
                        elif len(cn.sequences) == 0:
                            name = f"EmptyAffinityNode {cn.id_}"
                            metadata = f"[&&NHX:name={name}:mincomp={cn.mincomp}]"
                        else:
                            name = f"AffinityNode {cn.id_}"
                            metadata = f"[&&NHX:name={name}:mincomp={cn.mincomp}]"
                    else:
                        if len(cn.sequences) == 1:
                            name = cn.sequences[0]
                        elif len(cn.sequences) == 0:
                            name = f"EmptyAffinityNode {cn.id_}"
                        else:
                            name = f"AffinityNode {cn.id_}"
                        mincomp = cn.mincomp
                        metadata = f"[&&NHX:name={name}:mincomp={mincomp}]"
            # NOTE(review): any failure here (e.g. `metadata` never assigned
            # because no node matched, or `_length` not being a string) is
            # swallowed: "metadata" is printed and the label is left without
            # length and annotation.
            try:
                node_label += ':' + newick_node._length + metadata
            except Exception:
                print("metadata")
        descendants = ','.join(
            [_newick_nhx(n) for n in newick_node.descendants])
        if descendants:
            descendants = '(' + descendants + ')'
        return descendants + node_label

    if not self.nodes:
        return ""
    sorted_nodes = sorted(self.nodes, key=lambda x: x.id_)
    # Ids of nodes that temporarily received synthetic children below; their
    # `children` lists are reset before returning.
    remove_children = []
    if separate_leaves:
        new_leaves_count = 0
        for node in self.nodes:
            if len(node.children) == 0 and len(node.sequences) > 1:
                for seq_id in node.sequences:
                    # Synthetic ids continue after the existing nodes, so the
                    # new node's id equals its position in `sorted_nodes`
                    # (assumes existing ids are 0..len(self.nodes)-1 — TODO
                    # confirm).
                    affinity_node_id = len(self.nodes) + new_leaves_count
                    node.children.append(affinity_node_id)
                    remove_children.append(node.id_)
                    sorted_nodes.append(
                        AffinityNode(id_=AffinityNodeID(affinity_node_id),
                                     parent=node.id_,
                                     children=[],
                                     sequences=[seq_id],
                                     mincomp=graph.Compatibility(1.0)))
                    new_leaves_count += 1
    # Depth-first walk using an explicit stack of (parent label, node) pairs,
    # starting from the node with the smallest id.
    nodes_to_process = [(None, sorted_nodes[0])]
    newick_tree = None
    while nodes_to_process:
        n = nodes_to_process.pop()
        node_parent_label = n[0]
        node = n[1]
        label = str(node.id_)
        if node.parent is None:
            length = "1"
        else:
            # `sorted_nodes` is indexed by node id here.
            parent_minComp = sorted_nodes[
                node.parent].mincomp.base_value().value
            # Algebraically this is the node's mincomp minus its parent's.
            length = str((1 - parent_minComp) -
                         (1 - node.mincomp.base_value().value))
        newick_node = newick.Node(name=label, length=length)
        if newick_tree is None:
            # The first node processed becomes the root of the newick tree.
            newick_tree = newick_node
        else:
            parent_node = newick_tree.get_node(node_parent_label)
            parent_node.add_descendant(newick_node)
        for child in node.children:
            nodes_to_process.append((label, sorted_nodes[child]))
    # Undo the temporary mutation of self.nodes made for separate_leaves;
    # the affected nodes had no children before.
    for node in self.nodes:
        if node.id_ in remove_children:
            node.children = []
    return "(" + _newick_nhx(newick_tree) + ")"
def tree_data(req, species_query, experiment_count=lambda s: s.count_experiments):
    """Build the taxonomic tree and per-node data for a species query.

    The species are ordered by the rank sort keys and grouped kingdom >
    phylum > class > order > family > genus. One unnamed newick tree is
    created per kingdom; every lower rank becomes a node named by joining
    the rank names from phylum downwards with ``_``, and every species
    becomes a leaf named ``<name with underscores>{__id__<id>}``.

    Args:
        req: Request object, passed through to ``species_node`` and
            ``coverage_data``.
        species_query: Query yielding ``Species`` objects; must support
            ``count()``, ``order_by()`` and ``options()``.
        experiment_count: Callable giving the experiment count for a species.

    Returns:
        Dict with the keys ``count_leafs``, ``newick``, ``colormap`` (by
        family), ``colormap2`` (by class), ``node_data`` and ``edgecolors``.
    """
    count_leafs = species_query.count()
    ordered_species = species_query.order_by(
        Species.kingdom,
        Species.phylum_sortkey,
        Species.klass_sortkey,
        Species.order_sortkey,
        Species.family_sortkey,
        Species.genus_sortkey,
        Species.sortkey,
    ).options(
        joinedload(common.Language.valuesets).joinedload(
            common.ValueSet.values))

    node_data = {}
    kingdom_trees = []
    family_counts = collections.Counter()
    klass_counts = collections.Counter()
    # Nested dict phylum -> class -> order -> family -> genus -> #species.
    coverage = {}
    # (node id, rank label, sub-rank label) for every non-leaf rank node.
    rank_nodes = []

    for _kingdom, in_kingdom in itertools.groupby(
            ordered_species, lambda sp: sp.kingdom):
        kingdom_node = newick.Node()
        for phylum, in_phylum in itertools.groupby(
                in_kingdom, lambda sp: sp.phylum):
            phylum_id = '_'.join((phylum, ))
            rank_nodes.append((phylum_id, 'Phylum', 'classes'))
            phylum_cov = coverage.setdefault(phylum, {})
            phylum_node = newick.Node(phylum_id)
            for klass, in_klass in itertools.groupby(
                    in_phylum, lambda sp: sp.klass):
                klass_id = '_'.join((phylum, klass))
                rank_nodes.append((klass_id, 'Class', 'orders'))
                klass_cov = phylum_cov.setdefault(klass, {})
                klass_node = newick.Node(klass_id)
                for order, in_order in itertools.groupby(
                        in_klass, lambda sp: sp.order):
                    order_id = '_'.join((phylum, klass, order))
                    rank_nodes.append((order_id, 'Order', 'families'))
                    order_cov = klass_cov.setdefault(order, {})
                    order_node = newick.Node(order_id)
                    for family, in_family in itertools.groupby(
                            in_order, lambda sp: sp.family):
                        family_id = '_'.join((phylum, klass, order, family))
                        rank_nodes.append((family_id, 'Family', 'genera'))
                        family_cov = order_cov.setdefault(family, {})
                        family_node = newick.Node(family_id)
                        for genus, in_genus in itertools.groupby(
                                in_family, lambda sp: sp.genus):
                            genus_id = '_'.join(
                                (phylum, klass, order, family, genus))
                            rank_nodes.append((genus_id, 'Genus', 'species'))
                            # Materialise the group: it is iterated several
                            # times below.
                            members = list(in_genus)
                            family_cov[genus] = len(members)
                            family_counts.update(sp.family for sp in members)
                            klass_counts.update(sp.klass for sp in members)
                            leaves = [
                                newick.Node(
                                    '%s{__id__%s}'
                                    % (sp.name.replace(' ', '_'), sp.id))
                                for sp in members
                            ]
                            genus_node = newick.Node.create(
                                name=genus_id, descendants=leaves)
                            for sp in members:
                                node_data[sp.id] = species_node(
                                    sp, req, experiment_count(sp))
                            family_node.add_descendant(genus_node)
                        order_node.add_descendant(family_node)
                    klass_node.add_descendant(order_node)
                phylum_node.add_descendant(klass_node)
            kingdom_node.add_descendant(phylum_node)
        kingdom_trees.append(kingdom_node)

    for node_id, rank, subranks in rank_nodes:
        node_data[node_id] = coverage_data(
            req, node_id, rank, subranks, coverage)

    newick_string = newick.dumps(kingdom_trees)

    family_colors = {}
    for (family, _count), hexcolor in zip(
            family_counts.most_common(),
            color.qualitative_colors(len(family_counts))):
        family_colors[family] = (
            hexcolor, svg.data_url(svg.icon(hexcolor.replace('#', 'c'))))

    # NOTE(review): the class palette is sized by the number of families,
    # not the number of classes; kept as in the original — confirm intent.
    klass_colors = {}
    for (klass, _count), hexcolor in zip(
            klass_counts.most_common(),
            color.qualitative_colors(len(family_counts), set='tol')):
        klass_colors[klass] = (
            hexcolor, svg.data_url(svg.icon(hexcolor.replace('#', 's'))))

    return {
        'count_leafs': count_leafs,
        'newick': newick_string,
        'colormap': family_colors,
        'colormap2': klass_colors,
        'node_data': node_data,
        'edgecolors': {
            klass: entry[0] for klass, entry in klass_colors.items()},
    }