コード例 #1
0
ファイル: formats.py プロジェクト: saunack/tskit
def _get_upgrade_provenance(root):
    """
    Build the provenance record describing an upgrade of the given HDF5 file.

    :param root: Object exposing ``attrs["format_version"]`` as an iterable
        whose items can be converted with ``int``.
    :return: The JSON-serialised provenance dict for an ``"upgrade"`` command,
        encoded to bytes.
    """
    # TODO add more parameters here like filename, etc.
    source_version = [int(part) for part in root.attrs["format_version"]]
    record = provenance.get_provenance_dict(
        {"command": "upgrade", "source_version": source_version}
    )
    return json.dumps(record).encode()
コード例 #2
0
ファイル: formats.py プロジェクト: saunack/tskit
def _get_v2_provenance(command, attrs):
    """
    Returns the V2 tree provenance attributes reformatted as a provenance record.

    :param command: Value stored under the ``"command"`` key of the parameters
        passed to ``provenance.get_provenance_dict``.
    :param attrs: Mapping providing ``"environment"`` and ``"parameters"``
        entries whose string form is expected to be JSON.
    :return: The JSON-serialised provenance dict, encoded to bytes.
    """
    environment = {}
    parameters = {}
    # Try to get the provenance strings. Malformed JSON should not prevent us
    # from finishing the conversion, so on a parse failure we log a warning
    # and carry on with the empty default.
    try:
        environment = json.loads(str(attrs["environment"]))
    except ValueError:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling.
        logging.warning("Failed to convert environment provenance")
    try:
        parameters = json.loads(str(attrs["parameters"]))
    except ValueError:
        logging.warning("Failed to convert parameters provenance")
    parameters["command"] = command
    provenance_dict = provenance.get_provenance_dict(parameters)
    # Override the generated version/environment with the values recorded in
    # the original file (falling back to a placeholder version string).
    provenance_dict["version"] = environment.get("msprime_version",
                                                 "Unknown_version")
    provenance_dict["environment"] = environment
    return json.dumps(provenance_dict).encode()
コード例 #3
0
ファイル: tsutil.py プロジェクト: daniel-goldstein/tskit
def add_provenance(provenance_table, method_name):
    """
    Append a provenance row for the ``tsutil`` method called ``method_name``.

    The row is the JSON dump of the provenance dict generated for the command
    ``tsutil.<method_name>``. Nothing is returned.
    """
    command = "tsutil.{}".format(method_name)
    record = provenance.get_provenance_dict({"command": command})
    provenance_table.add_row(json.dumps(record))
コード例 #4
0
ファイル: tsutil.py プロジェクト: jeromekelleher/msprime
def add_provenance(provenance_table, method_name):
    """
    Record in ``provenance_table`` that ``tsutil.<method_name>`` was run.

    A provenance dict is generated for that command, serialised to JSON and
    added as a new row of the table. Nothing is returned.
    """
    parameters = {"command": "tsutil.{}".format(method_name)}
    encoded = json.dumps(provenance.get_provenance_dict(parameters))
    provenance_table.add_row(encoded)
コード例 #5
0
ファイル: tsutil.py プロジェクト: saunack/tskit
def add_provenance(provenance_table, method_name):
    """
    Add a JSON provenance record for ``tsutil.<method_name>`` to the table.

    The record is built by ``provenance.get_provenance_dict`` from a single
    ``"command"`` parameter. Nothing is returned.
    """
    parameters = {"command": f"tsutil.{method_name}"}
    record = provenance.get_provenance_dict(parameters)
    provenance_table.add_row(json.dumps(record))
コード例 #6
0
def randomly_split_polytomies(
    self,
    *,
    epsilon=None,
    squash_edges=True,
    record_provenance=True,
    random_seed=None,
):
    """
    Modifies the table collection in place, adding extra nodes and edges
    so that any node with greater than 2 children (i.e. a multifurcation
    or "polytomy") is resolved into successive bifurcations. This is identical
    to :meth:`TreeSequence.randomly_split_polytomies` but acts *in place* to
    alter the data in this :class:`TableCollection`. Please see
    :meth:`TreeSequence.randomly_split_polytomies` for a fuller description,
    and details of parameters.

    As implemented here:

    :param epsilon: Upper bound on the time gap between successive newly
        created nodes below a polytomy's parent. Defaults to 1e-10 if None.
    :param squash_edges: If True, call ``self.edges.squash()`` on the result
        and sort again.
    :param record_provenance: If True, add a provenance row whose command is
        ``"randomly_split_polytomies"``.
    :param random_seed: Seed passed to ``np.random.default_rng``.
    """
    if epsilon is None:
        epsilon = 1e-10
    rng = np.random.default_rng(seed=random_seed)

    def resolve_polytomy(parent_node_id, child_ids, new_nodes_by_time_desc):
        """
        For a polytomy and list of child node ids, return a list of (child, parent)
        tuples, describing a bifurcating tree, rooted at parent_node_id, where the
        new_nodes_by_time_desc have been used to break polytomies. All possible
        topologies should be equiprobable.

        Note that each returned item is a two-element list ``[child, parent]``
        and that exactly ``len(child_ids) - 2`` new nodes must be supplied.
        """
        nonlocal rng
        assert len(child_ids) == len(new_nodes_by_time_desc) + 2
        # Polytomies broken by sequentially splicing onto edges, so an initial edge
        # is required. This will always remain above the top node & is removed later
        edges = [
            [child_ids[0], None],
        ]
        # We know beforehand how many random ints are needed: generate them all now.
        # The exclusive upper bounds are 1, 3, 5, ...: each splice adds two edges,
        # so the k-th choice (from 0) is drawn from the 2k + 1 edges present then.
        edge_choice = rng.integers(0, np.arange(1, len(child_ids) * 2 - 1, 2))
        # Temporary parent labels for the splices; these are re-mapped to IDs in
        # time order further down.
        tmp_new_node_lab = [parent_node_id] + new_nodes_by_time_desc
        assert len(edge_choice) == len(child_ids) - 1
        for node_lab, child_id, target_edge_id in zip(tmp_new_node_lab,
                                                      child_ids[1:],
                                                      edge_choice):
            target_edge = edges[target_edge_id]
            # Insert in the right place, to keep edges in parent time order.
            # The chosen edge is split by node_lab: two new edges hang the new
            # child and the old child below node_lab, and the chosen edge itself
            # (now shifted right in the list) becomes the edge above node_lab.
            edges.insert(target_edge_id, [child_id, node_lab])
            edges.insert(target_edge_id, [target_edge[0], node_lab])
            target_edge[0] = node_lab
        top_edge = edges.pop()  # remove the edge above the top node
        assert top_edge[1] is None

        # Re-map the internal nodes IDs so they are used in time order
        real_node = iter(new_nodes_by_time_desc)
        # Children map to themselves; only the temporary internal labels change
        node_map = {c: c for c in child_ids}
        node_map[edges[-1][1]] = parent_node_id  # last edge == oldest parent
        for e in reversed(edges):
            # Reversing along the edges, parents are in inverse time order
            for idx in (1, 0):  # look at parent (1) then child (0)
                if e[idx] not in node_map:
                    node_map[e[idx]] = next(real_node)
                e[idx] = node_map[e[idx]]
        assert len(
            node_map) == len(new_nodes_by_time_desc) + len(child_ids) + 1
        return edges

    edge_table = self.edges
    node_table = self.nodes
    # Store existing left, so we can change it if the edge is split
    existing_edges_left = edge_table.left
    # Keep other edge arrays etc. for fast read access. All of these are read
    # before the edge table is cleared below and are indexed by original edge id.
    existing_edges_right = edge_table.right
    existing_edges_parent = edge_table.parent
    existing_edges_child = edge_table.child
    existing_node_time = node_table.time

    # We can save a lot of effort if we don't need to check the time of mutations.
    # We definitely don't need to check on the first iteration, as there is no
    # previous tree then (prev_tree is None).
    check_mutations = np.any(
        np.logical_not(tskit.is_unknown_time(self.mutations.time)))
    # The tree sequence is built from the unmodified tables; it drives the
    # edge_diffs loop and provides the trees used when checking mutations.
    ts = self.tree_sequence()
    tree_iter = ts.trees()  # Only needed to check mutations

    # The edge table is rebuilt from scratch in the loop below
    edge_table.clear()

    edges_from_node = collections.defaultdict(
        set)  # Active descendant edge ids
    # Parents whose child edges changed at the current position
    nodes_changed = set()

    for interval, e_out, e_in in ts.edge_diffs(include_terminal=True):
        pos = interval[0]
        # The tree immediately to the left of pos (none at the first position)
        prev_tree = None if pos == 0 else next(tree_iter)

        for edge in itertools.chain(e_out, e_in):
            if edge.parent != tskit.NULL:
                nodes_changed.add(edge.parent)

        # Maps node id -> largest known mutation time above that node in prev_tree
        oldest_mutation_for_node = {}
        if check_mutations and prev_tree is not None:
            # It would also help if mutations were sorted such that all mutations
            # above the same node appeared consecutively, with oldest first.
            for site in prev_tree.sites():
                for mutation in site.mutations:
                    if not tskit.is_unknown_time(mutation.time):
                        if mutation.node in oldest_mutation_for_node:
                            oldest_mutation_for_node[mutation.node] = max(
                                oldest_mutation_for_node[mutation.node],
                                mutation.time)
                        else:
                            oldest_mutation_for_node[
                                mutation.node] = mutation.time
        # First pass: write out the edges that were active to the left of pos
        # for every changed parent. edges_from_node still describes the
        # previous tree here, as it is only updated after this loop.
        for parent_node in nodes_changed:
            child_edge_ids = edges_from_node[parent_node]
            if len(child_edge_ids) >= 3:
                # We have a previous polytomy to break
                parent_time = existing_node_time[parent_node]
                new_nodes = []
                child_ids = existing_edges_child[list(child_edge_ids)]
                left = None
                # Oldest time among the children (and any timed mutations above
                # them); new nodes must be placed above this.
                max_time = 0
                # Split existing edges
                for edge_id, child_id in zip(child_edge_ids, child_ids):
                    max_time = max(max_time, existing_node_time[child_id])
                    if check_mutations and child_id in oldest_mutation_for_node:
                        max_time = max(max_time,
                                       oldest_mutation_for_node[child_id])
                    # All child edges of a polytomy are expected to share the
                    # same left coordinate (ensured by the chopping step below)
                    if left is None:
                        left = existing_edges_left[edge_id]
                    else:
                        assert left == existing_edges_left[edge_id]
                    if existing_edges_right[edge_id] > interval[0]:
                        # make sure we carry on the edge after this polytomy
                        existing_edges_left[edge_id] = pos
                # Use epsilon as the time step unless it is too large, in which
                # case shrink the step so the new nodes (at most
                # len(child_ids) - 2 steps below the parent) stay above max_time
                dt = min((parent_time - max_time) / (len(child_ids) * 2),
                         epsilon)
                # Break this N-degree polytomy. This requires N-2 extra nodes to be
                # introduced: create them here in order of decreasing time
                new_nodes = [
                    node_table.add_row(time=parent_time - (i * dt))
                    for i in range(1,
                                   len(child_ids) - 1)
                ]
                # print("New nodes:", new_nodes, node_table.time[new_nodes])
                for new_edge in resolve_polytomy(parent_node, child_ids,
                                                 new_nodes):
                    edge_table.add_row(
                        left=left,
                        right=pos,
                        child=new_edge[0],
                        parent=new_edge[1],
                    )
                    # print("new_edge: left={}, right={}, child={}, parent={}"
                    #    .format(left, pos, new_edge[0], new_edge[1]))
            else:
                # Previous node was not a polytomy - just add the edges_out
                for edge_id in child_edge_ids:
                    if existing_edges_right[edge_id] == pos:  # is an out edge
                        edge_table.add_row(
                            left=existing_edges_left[edge_id],
                            right=pos,
                            parent=parent_node,
                            child=existing_edges_child[edge_id],
                        )

        # Second pass: update the active edge sets to describe the tree that
        # starts at pos
        for edge in e_out:
            if edge.parent != tskit.NULL:
                # print("REMOVE", edge.id)
                edges_from_node[edge.parent].remove(edge.id)
        for edge in e_in:
            if edge.parent != tskit.NULL:
                # print("ADD", edge.id)
                edges_from_node[edge.parent].add(edge.id)

        # Chop if we have created a polytomy: the polytomy itself will be resolved
        # at a future iteration, when any edges move into or out of the polytomy.
        # This loop also empties nodes_changed ready for the next position.
        while nodes_changed:
            node = nodes_changed.pop()
            edge_ids = edges_from_node[node]
            # print("Looking at", node)

            if len(edge_ids) == 0:
                # Drop empty entries so the final emptiness assertion can hold
                del edges_from_node[node]
            # if this node has changed *to* a polytomy, we need to cut all of the
            # child edges that were previously present by adding the previous
            # segment and left-truncating
            elif len(edge_ids) >= 3:
                for edge_id in edge_ids:
                    if existing_edges_left[edge_id] < interval[0]:
                        self.edges.add_row(
                            left=existing_edges_left[edge_id],
                            right=interval[0],
                            parent=existing_edges_parent[edge_id],
                            child=existing_edges_child[edge_id],
                        )
                    existing_edges_left[edge_id] = interval[0]
    # Every edge should have been removed by the terminal edge diff
    assert len(edges_from_node) == 0
    self.sort()

    if squash_edges:
        self.edges.squash()
        self.sort()  # Bug: https://github.com/tskit-dev/tskit/issues/808

    if record_provenance:
        parameters = {"command": "randomly_split_polytomies"}
        self.provenances.add_row(
            record=json.dumps(provenance.get_provenance_dict(parameters)))