Esempio n. 1
0
def naive_monte_carlo(data, depth=15, log_bad_trees=True):
    """Endlessly sample random trees, yielding those that reproduce ``data``.

    This is an infinite generator. Every round draws a candidate tree with
    ``random_tree`` and simulates observations on it with
    ``random_observed_data``. A round is accepted when the multiset of
    simulated observations equals the multiset of observations in ``data``;
    the leaves of the accepted tree are then renamed to keys of ``data``
    whose values match the simulated value of the leaf.

    Args:
        data: Mapping whose values are the observed data per key. Values are
            compared both as tuples (multiset check) and directly with
            ``==`` against the simulated values (leaf assignment).
        depth: Forwarded to ``random_tree`` as its ``depth`` argument.
        log_bad_trees: If true, a candidate with the wrong number of leaves
            is printed and ``None`` is yielded for that round.

    Yields:
        ``None`` for a rejected round, otherwise the accepted tree with its
        leaves renamed.

    Raises:
        RuntimeError: If a leaf of an accepted tree cannot be paired with an
            unused key of ``data``.
    """
    data_as_counter = collections.Counter(tuple(x) for x in data.values())
    tree = None
    while True:
        suggested_tree = random_tree(depth, root=newick.Node('s'))
        if len(suggested_tree.get_leaves()) != len(data):
            if log_bad_trees:
                print("Wrong number of leaves: {:}".format(
                    suggested_tree.newick))
                yield None
                continue
            # NOTE(review): with log_bad_trees=False the round is not
            # reported; instead the most recent well-sized tree (if any) is
            # re-used below. Confirm that this difference is intended.
        else:
            tree = suggested_tree
        if tree is None:
            # No well-sized tree has been seen yet: draw again.
            continue
        s_data = random_observed_data(tree)
        s_data_as_counter = collections.Counter(
            tuple(x) for x in s_data.values())
        if data_as_counter != s_data_as_counter:
            print("Not generating the right data: {:}".format(tree.newick))
            yield None
            continue
        # Give every leaf the name of a not-yet-used key of ``data`` whose
        # value equals the value simulated for that leaf.
        used = set()
        for s_l in tree.get_leaves():
            for l, v in data.items():
                if l not in used and v == s_data[s_l.name]:
                    s_l.name = l
                    used.add(l)
                    break
            else:
                raise RuntimeError
        yield tree
Esempio n. 2
0
def random_tree(depth=15, split_on="🂱🂡🃁🃑", root=None):
    """Grow a random binary tree below ``root`` and return ``root``.

    For each of at most ``depth`` steps the branch leading to the current
    node is lengthened by one and a card is drawn with ``draw_card``. When
    the drawn card is contained in ``split_on`` the node receives two
    children, named after the node with an ``"l"`` / ``"r"`` suffix, and
    each child is grown with the remaining depth. Otherwise the same node
    simply continues with the next step.

    Args:
        depth: Maximum number of steps to perform from this node down.
        split_on: Container of card values that trigger a split.
        root: Node to grow; a fresh node named ``'0'`` is created if omitted.

    Returns:
        The (possibly newly created) ``root`` node.
    """
    if root is None:
        root = newick.Node('0')
    while depth > 0:
        root.length += 1
        depth -= 1
        if draw_card() in split_on:
            # Left subtree is completed before the right child is created.
            for suffix in ("l", "r"):
                child = newick.Node(root.name + suffix)
                root.add_descendant(child)
                random_tree(depth, split_on, root=child)
            break
    return root
Esempio n. 3
0
 def get_newick_node(self, label_name):
     """Convert this node and everything below it into ``newick.Node`` objects.

     The label of each created node is ``str(self.dico[label_name])`` when
     ``label_name`` is a key of ``self.dico`` and ``"-"`` otherwise. Child
     conversions are appended directly to the ``descendants`` list of the
     new node.
     """
     label = str(self.dico[label_name]) if label_name in self.dico else "-"
     node = newick.Node(label)
     for kid in self.children:
         node.descendants.append(kid.get_newick_node(label_name))
     return node
Esempio n. 4
0
def create_balanced_random_tree(taxa, branch_length=random.random):
    """Generate a random tree with roughly balanced node heights.

    Every taxon becomes a leaf with a branch length drawn from
    ``branch_length``. The two lowest nodes are then repeatedly joined
    under a new parent until a single root remains.

    Args:
        taxa: Iterable of leaf names.
        branch_length: Zero-argument callable returning a branch length.
            It is called once per leaf and once per internal node.
            (The default used to be the *result* of ``random.random()``,
            a float, which made the function fail when called without an
            explicit ``branch_length``.)

    Returns:
        The root ``newick.Node`` of the generated tree.

    Raises:
        IndexError: If ``taxa`` is empty.
    """
    nodes = []
    for taxon in taxa:
        nodes.append(
            newick.Node(name=taxon,
                        length=branch_length(),
                        length_parser=float,
                        length_formatter="{:f}".format))

    # ``nodes`` and ``heights`` are kept in parallel, sorted by height.
    nodes.sort(key=lambda x: x.length)
    heights = [node.length for node in nodes]
    while len(nodes) > 1:
        # Take the two lowest nodes
        node0 = nodes[0]
        node1 = nodes[1]
        height = (heights[0] + heights[1]) / 2
        # Keep the rest
        nodes = nodes[2:]
        heights = heights[2:]
        # Stick a new node on top of those lowest nodes
        new_branch_length = branch_length()
        height += new_branch_length
        tree = newick.Node(length=new_branch_length,
                           length_parser=float,
                           length_formatter="{:f}".format)
        tree.add_descendant(node0)
        tree.add_descendant(node1)
        # Put the new subtree in the right place in height order
        i = bisect.bisect(heights, height)
        heights.insert(i, height)
        nodes.insert(i, tree)
    return nodes[0]
Esempio n. 5
0
def create_random_tree(taxa, branch_length=lambda: random.random()):
    """Generate a random tree topology over ``taxa``.

    This is a re-implementation of the random tree generator from lingpy.
    The taxa are shuffled and turned into leaves. Then the last two clades
    of the working list are repeatedly joined under a new parent and the
    list is reshuffled, until one clade is left.

    Args:
        taxa: Iterable of taxa; each is converted with ``str`` for its
            leaf name.
        branch_length: Zero-argument callable; its result is stored, as a
            string, as the length of every created node.

    Returns:
        The single remaining clade, i.e. the root of the tree.
    """
    shuffled_taxa = list(taxa)
    random.shuffle(shuffled_taxa)

    clades = [
        newick.Node(str(taxon), length=str(branch_length()))
        for taxon in shuffled_taxa
    ]
    while len(clades) > 1:
        last = clades.pop()
        second_to_last = clades.pop()
        parent = newick.Node.create(length=str(branch_length()),
                                    descendants=[last, second_to_last])
        clades.insert(0, parent)
        random.shuffle(clades)
    return clades[0]
Esempio n. 6
0
def get_distances(file):
    """Compute pairwise leaf distances for the tree on the first line of a file.

    Only the first line of ``file`` is read. It is scanned left to right
    with a small hand-written parser that understands a restricted Newick
    dialect: leaf names consist of ASCII letters only, and branch lengths
    are unsigned integers (kept as strings on the nodes).

    Args:
        file: Path of the file holding the tree.

    Returns:
        A dict mapping ``"<species1>,<species2>"`` to
        ``str(leaf1.summed_distance_to(leaf2))`` for every ordered pair of
        leaves, including each leaf paired with itself.

    Raises:
        ValueError: If the parser reaches text it cannot interpret (this
            includes an empty line or a missing terminating ``;``).
            Previously such input made the function loop forever.
    """
    with open(file) as handle:
        tree = handle.readline().strip()
    ancestral_nodes = []  # stack of the currently open inner nodes
    leaves = {}
    while True:
        # END OF TREE: semicolon
        if tree.startswith(";"):
            break

        # START INNER NODE
        if tree.startswith("("):
            tree = tree[1:]
            n = newick.Node()
            if ancestral_nodes:
                n.parent = ancestral_nodes[-1]
            ancestral_nodes.append(n)
            continue

        # END INNER NODE
        if tree.startswith(")"):
            tree = tree[1:]
            length_match = re.match(r":(\d+)", tree)
            if length_match:
                ancestral_nodes[-1].distance_to_parent = length_match.group(1)
                # Drop the whole leading run of ':' and digit characters.
                tree = tree[re.match(r"[:\d]+", tree).end():]
            ancestral_nodes.pop(-1)
            continue

        # OUTER NODE SINGLE: ",name:length" directly followed by ")"
        single = re.match(r",([A-Za-z]+):(\d+)\)", tree)
        if single:
            name, length = single.groups()
            n1 = newick.Node()
            n1.parent = ancestral_nodes[-1]
            n1.distance_to_parent = length
            leaves[name] = n1
            # Continue at the closing parenthesis.
            tree = tree[tree.index(")"):]
            continue

        # OUTER NODE DOUBLE: "name:length,name:length"
        double = re.match(r"([A-Za-z]+):(\d+),([A-Za-z]+):(\d+)", tree)
        if double:
            name1, length1, name2, length2 = double.groups()
            n1 = newick.Node()
            n1.parent = ancestral_nodes[-1]
            n1.distance_to_parent = length1
            n2 = newick.Node()
            n2.parent = ancestral_nodes[-1]
            n2.distance_to_parent = length2
            leaves[name1] = n1
            leaves[name2] = n2
            # Everything up to the next ")" is skipped, as before.
            closing = tree.find(")")
            if closing < 0:
                raise ValueError(
                    "unbalanced tree: no ')' after leaves %r and %r"
                    % (name1, name2))
            tree = tree[closing:]
            continue

        # INTERNAL INNER NODE
        if tree.startswith(",("):
            tree = tree[2:]
            n = newick.Node()
            if ancestral_nodes:
                n.parent = ancestral_nodes[-1]
            ancestral_nodes.append(n)
            continue
        if tree.startswith(","):
            tree = tree[1:]
            continue

        # Nothing matched and nothing was consumed: stop instead of spinning.
        raise ValueError("cannot parse tree at: %r" % tree[:40])

    distances = {}
    # ``items()`` works on Python 2 and 3 alike (``iteritems`` does not).
    for species1, leafnode1 in leaves.items():
        for species2, leafnode2 in leaves.items():
            distances[species1 + "," + species2] = str(
                leafnode1.summed_distance_to(leafnode2))
    return distances
Esempio n. 7
0
# Per-concept histogram: c[concept][k] counts how many times the index value
# k was logged for that concept. Every histogram has six integer slots.
# NOTE(review): `scipy.zeros` is an alias of `numpy.zeros` that newer SciPy
# releases deprecate - confirm the installed version still provides it.
c = collections.defaultdict(lambda: scipy.zeros(6, int))

# Tally the tab-separated log. Every column except "Sample" is expected to be
# named "<prefix>:<concept>" and to hold an integer index.
with Path("../beastling/indexes.log").open() as indices:
    for line in csv.DictReader(indices, delimiter="\t"):
        for id, index in line.items():
            if id == "Sample":
                continue
            # Only the part after the last ":" identifies the concept.
            _, concept = id.rsplit(":", 1)
            c[concept][int(index)] += 1


def distance(d1, d2):
    """Return the chi-square statistic between two count vectors.

    Both vectors are shifted by one before the comparison, so no expected
    frequency passed to ``scipy.stats.chisquare`` is zero when the inputs
    are non-negative counts. ``d1`` supplies the observed and ``d2`` the
    expected frequencies.
    """
    observed = d1 + 1
    expected = d2 + 1
    outcome = scipy.stats.chisquare(observed, expected)
    return outcome.statistic


# One leaf node per concept, mapped to that concept's histogram.
nodes = {newick.Node(n): d for n, d in c.items()}
# Nodes that have already been merged, with the histogram they carried.
old_nodes = {}
# Chi-square distance for every unordered pair of leaf nodes.
distances = {(n1, n2): distance(d1, d2)
             for (n1, d1), (n2,
                            d2) in itertools.combinations(nodes.items(), 2)}

# Agglomerative step: repeatedly take the closest pair out of `nodes`.
# NOTE(review): the visible loop body ends right after creating `n`; nothing
# shown here adds `n` to `nodes` or updates `distances` for it. The snippet
# looks truncated - confirm against the complete source.
while len(nodes) > 1:
    # Pair of nodes with the smallest recorded distance.
    argmin = min(distances, key=distances.get)
    d0 = nodes.pop(argmin[0])
    d1 = nodes.pop(argmin[1])
    old_nodes[argmin[0]] = d0
    old_nodes[argmin[1]] = d1
    d = distances.pop(argmin)
    # Each of the two merged nodes gets half of their distance as length.
    argmin[0].length = d / 2
    argmin[1].length = d / 2
    n = newick.Node(None)
Esempio n. 8
0
    def as_newick(self,
                  seq_id_to_metadata: Dict[msa.SequenceID,
                                           graph.SequenceMetadata] = None,
                  separate_leaves=False) -> str:
        """Returns Affinity Tree in Newick format.

        Args:
            seq_id_to_metadata: Dictionary of sequence IDs to the metadata
                of that sequence. Its "name" and "group" entries, when
                present, are written into the NHX annotation of leaves
                holding exactly one sequence. For example:
                                {SequenceID('KM0123'): {'name': 'cat'},
                                SequenceID('ZX124'): {'name': 'dog'}}
            separate_leaves: A switch to control if tree leaves having
                multiple sequences assigned should get one additional
                singleton child leaf per assigned sequence.

        Returns:
            A string with the Affinity Tree converted to newick format.
            https://en.wikipedia.org/wiki/Newick_format
            If the tree has no nodes, an empty string is returned.

        Notes:
            The node with the lowest ``id_`` is used as the root, and node
            ids are used as positions in the id-sorted node list.
            With ``separate_leaves`` the ``children`` of the affected nodes
            in ``self.nodes`` are extended temporarily; they are reset to an
            empty list before this method returns, also when building the
            tree fails.
        """
        def _get_sequence_attr_if_exists(seq_metadata: graph.SequenceMetadata,
                                         attr: str) -> str:
            """Returns the value stored under attr as a string, or ""."""

            if attr in seq_metadata:
                return str(seq_metadata[attr])
            return ""

        def _nhx_metadata(cn) -> str:
            """Builds the NHX annotation for a single affinity node."""

            if len(cn.sequences) == 1:
                seqid = cn.sequences[0]
                if seq_id_to_metadata:
                    seq_metadata = seq_id_to_metadata[seqid]
                    name = _get_sequence_attr_if_exists(seq_metadata, "name")
                    if name == "":
                        # Fall back to the sequence ID as display name.
                        name = seqid
                    group = _get_sequence_attr_if_exists(seq_metadata,
                                                         "group")
                    return f"[&&NHX:name={name}:group={group}:seqid={seqid}:mincomp={cn.mincomp}]"
                name = seqid
            elif len(cn.sequences) == 0:
                name = f"EmptyAffinityNode {cn.id_}"
            else:
                name = f"AffinityNode {cn.id_}"
            return f"[&&NHX:name={name}:mincomp={cn.mincomp}]"

        def _newick_nhx(newick_node: newick.Node) -> str:
            """Converts newick tree to newick string"""

            node_label = newick_node.name or ''
            if newick_node._length:
                cn = nodes_by_label.get(newick_node.name)
                if cn is None:
                    # No affinity node carries this label: keep the historic
                    # behaviour of reporting it and omitting the length.
                    print("metadata")
                else:
                    node_label += (':' + newick_node._length +
                                   _nhx_metadata(cn))
            descendants = ','.join(
                _newick_nhx(n) for n in newick_node.descendants)
            if descendants:
                descendants = '(' + descendants + ')'
            return descendants + node_label

        if not self.nodes:
            return ""

        sorted_nodes = sorted(self.nodes, key=lambda x: x.id_)
        # IDs of the leaves that temporarily receive singleton children.
        expanded_leaf_ids = set()
        try:
            if separate_leaves:
                new_leaves_count = 0
                for node in self.nodes:
                    if len(node.children) == 0 and len(node.sequences) > 1:
                        expanded_leaf_ids.add(node.id_)
                        for seq_id in node.sequences:
                            # New IDs continue after the existing ones so
                            # they equal the position in sorted_nodes.
                            affinity_node_id = (len(self.nodes) +
                                                new_leaves_count)
                            node.children.append(affinity_node_id)
                            sorted_nodes.append(
                                AffinityNode(
                                    id_=AffinityNodeID(affinity_node_id),
                                    parent=node.id_,
                                    children=[],
                                    sequences=[seq_id],
                                    mincomp=graph.Compatibility(1.0)))
                            new_leaves_count += 1

            # Depth-first construction of the newick tree from the root.
            nodes_to_process = [(None, sorted_nodes[0])]
            newick_tree = None
            while nodes_to_process:
                node_parent_label, node = nodes_to_process.pop()

                label = str(node.id_)
                if node.parent is None:
                    length = "1"
                else:
                    parent_minComp = sorted_nodes[
                        node.parent].mincomp.base_value().value
                    length = str((1 - parent_minComp) -
                                 (1 - node.mincomp.base_value().value))

                newick_node = newick.Node(name=label, length=length)

                if newick_tree is None:
                    newick_tree = newick_node
                else:
                    parent_node = newick_tree.get_node(node_parent_label)
                    parent_node.add_descendant(newick_node)

                for child in node.children:
                    nodes_to_process.append((label, sorted_nodes[child]))
        finally:
            # Undo the temporary children even if building the tree failed.
            for node in self.nodes:
                if node.id_ in expanded_leaf_ids:
                    node.children = []

        # Label -> affinity node lookup used while serialising; replaces a
        # linear scan of sorted_nodes for every newick node.
        nodes_by_label = {str(cn.id_): cn for cn in sorted_nodes}
        return "(" + _newick_nhx(newick_tree) + ")"
Esempio n. 9
0
def tree_data(req,
              species_query,
              experiment_count=lambda s: s.count_experiments):
    """Build the taxonomy tree and per-node payload for ``species_query``.

    The species are fetched in taxonomic sort order and grouped level by
    level (kingdom, phylum, class, order, family, genus). One tree is built
    per kingdom group; inner node names are the taxon names from phylum
    downwards joined with ``_``, and species leaves are named
    ``<name with underscores>{__id__<id>}``.

    Args:
        req: Passed through to ``species_node`` and ``coverage_data``.
        species_query: Query object providing ``count()`` and ``order_by()``.
        experiment_count: Callable mapping a species to the value handed to
            ``species_node`` as its third argument.

    Returns:
        A dict with the keys ``count_leafs``, ``newick``, ``colormap``
        (by family), ``colormap2`` (by class), ``node_data`` and
        ``edgecolors`` (the colour of each ``colormap2`` entry).
    """
    node_data = {}
    ntrees = []
    colormap = collections.Counter()   # number of species per family
    colormap2 = collections.Counter()  # number of species per class
    count_leafs = species_query.count()
    species = species_query.order_by(
        Species.kingdom, Species.phylum_sortkey, Species.klass_sortkey,
        Species.order_sortkey, Species.family_sortkey, Species.genus_sortkey,
        Species.sortkey).options(
            joinedload(common.Language.valuesets).joinedload(
                common.ValueSet.values))
    coverage = {}  # phylum -> class -> order -> family -> genus -> #species
    nodes = []     # (node id, rank, name of the rank below) per inner node

    ngenus = 0
    for kingdom, phyla in itertools.groupby(species, lambda s: s.kingdom):
        kingdom_node = newick.Node()
        for phylum, classes in itertools.groupby(phyla, lambda s: s.phylum):
            nid = '_'.join((phylum, ))
            nodes.append((nid, 'Phylum', 'classes'))
            phylum_coverage = coverage.setdefault(phylum, {})

            phylum_node = newick.Node(nid)
            for klass, orders in itertools.groupby(classes,
                                                   lambda s: s.klass):
                nid = '_'.join((phylum, klass))
                nodes.append((nid, 'Class', 'orders'))
                class_coverage = phylum_coverage.setdefault(klass, {})

                class_node = newick.Node(nid)
                for order, families in itertools.groupby(
                        orders, lambda s: s.order):
                    nid = '_'.join((phylum, klass, order))
                    nodes.append((nid, 'Order', 'families'))
                    order_coverage = class_coverage.setdefault(order, {})

                    order_node = newick.Node(nid)
                    for family, genera in itertools.groupby(
                            families, lambda s: s.family):
                        nid = '_'.join((phylum, klass, order, family))
                        nodes.append((nid, 'Family', 'genera'))
                        family_coverage = order_coverage.setdefault(
                            family, {})

                        family_node = newick.Node(nid)
                        for genus, members in itertools.groupby(
                                genera, lambda s: s.genus):
                            ngenus += 1
                            nid = '_'.join(
                                (phylum, klass, order, family, genus))
                            nodes.append((nid, 'Genus', 'species'))

                            # Materialise the group: it is iterated
                            # several times below.
                            members = list(members)
                            family_coverage[genus] = len(members)

                            colormap.update([s.family for s in members])
                            colormap2.update([s.klass for s in members])
                            leaves = [
                                newick.Node(
                                    '%s{__id__%s}' %
                                    (s.name.replace(' ', '_'), s.id))
                                for s in members
                            ]
                            genus_node = newick.Node.create(
                                name=nid, descendants=leaves)
                            node_data.update({
                                s.id: species_node(s, req, experiment_count(s))
                                for s in members
                            })
                            family_node.add_descendant(genus_node)
                        order_node.add_descendant(family_node)
                    class_node.add_descendant(order_node)
                phylum_node.add_descendant(class_node)
            kingdom_node.add_descendant(phylum_node)
        ntrees.append(kingdom_node)

    node_data.update({
        nid: coverage_data(req, nid, rank, subranks, coverage)
        for nid, rank, subranks in nodes
    })

    newick_string = newick.dumps(ntrees)
    family_colors = {
        k[0]: (v, svg.data_url(svg.icon(v.replace('#', 'c'))))
        for k, v in zip(colormap.most_common(),
                        color.qualitative_colors(len(colormap)))
    }
    # NOTE(review): the palette for the class colours is sized with
    # len(colormap), not len(colormap2) - kept as is, confirm intent.
    class_colors = {
        k[0]: (v, svg.data_url(svg.icon(v.replace('#', 's'))))
        for k, v in zip(colormap2.most_common(),
                        color.qualitative_colors(len(colormap), set='tol'))
    }
    res = {
        'count_leafs': count_leafs,
        'newick': newick_string,
        'colormap': family_colors,
        'colormap2': class_colors,
        'node_data': node_data,
    }
    res['edgecolors'] = {
        name: entry[0] for name, entry in res['colormap2'].items()
    }
    return res