Example #1
    def read(self):
        roots = self.create_graph()
        graph = Graph(roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame.from_dict(data=list(self.name_to_dict.values()))
        index = ["node"]
        dataframe.set_index(index, inplace=True)
        dataframe.sort_index(inplace=True)

        return hatchet.graphframe.GraphFrame(graph, dataframe, ["time"], ["time (inc)"])
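
Every reader in this collection ends the same way: a Graph of Node objects plus a pandas DataFrame indexed by "node", wrapped together in a GraphFrame. A minimal inspection sketch, using only the GraphFrame.from_lists constructor that the test examples below exercise (hedged: from_lists appears to assign each node a default unit "time" metric in hatchet's tests):

from hatchet import GraphFrame

gf = GraphFrame.from_lists(("a", ("b", "c")))   # chain a -> b -> c
print(len(gf.graph))    # 3 nodes in the call graph
print(gf.dataframe)     # one metric row per node, indexed by "node"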
Example #2
def test_invalid_constructor():
    # bad Graph
    with pytest.raises(ValueError):
        GraphFrame(None, None)

    # bad dataframe
    with pytest.raises(ValueError):
        GraphFrame(Graph([]), None)

    # dataframe has no "node" index
    with pytest.raises(ValueError):
        GraphFrame(Graph([]), pd.DataFrame())
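
For contrast, a sketch of a call that should satisfy all three checks: a non-empty Graph and a dataframe carrying a "node" index (hedged: module paths follow hatchet's layout as referenced in these examples, and the metric value is made up):

import pandas as pd
from hatchet.frame import Frame
from hatchet.graph import Graph
from hatchet.graphframe import GraphFrame
from hatchet.node import Node

root = Node(Frame(name="main"))
graph = Graph([root])
graph.enumerate_traverse()

# the "node" index is what the third pytest.raises above is asserting on
dataframe = pd.DataFrame({"node": [root], "name": ["main"], "time": [1.0]})
dataframe.set_index("node", inplace=True)

gf = GraphFrame(graph, dataframe, ["time"], [])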
Example #3
def test_trees_are_trees():
    g = Graph.from_lists(("a", ))
    assert g.is_tree()

    g = Graph.from_lists(("a", ("b", ("c"))))
    assert g.is_tree()

    g = Graph.from_lists(("a", "b", "c"))
    assert g.is_tree()

    g = Graph.from_lists(("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"),
                          ("d", "e", "f", "g")))
    assert g.is_tree()
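
The tuple convention these tests rely on: the first element of each tuple is the parent, every following element is one of its children, and nesting adds depth. A small illustrative sketch:

# a has children b and d; b has child c
g = Graph.from_lists(("a", ("b", "c"), "d"))
assert g.is_tree()
assert len(g) == 4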
Example #4
    def read(self):
        """Read the caliper records to extract the calling context tree."""
        if isinstance(self.filename_or_caliperreader, str):
            if self.filename_ext != ".cali":
                raise ValueError("from_caliperreader() needs a .cali file")
            else:
                cali_file = self.filename_or_caliperreader
                self.filename_or_caliperreader = cr.CaliperReader()
                self.filename_or_caliperreader.read(cali_file)

        with self.timer.phase("graph construction"):
            list_roots = self.create_graph()

        # create a graph object once all the nodes have been added
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame(data=self.node_dicts)

        indices = ["node"]
        if "rank" in dataframe.columns:
            indices.append("rank")
        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

        # change column names
        for idx, item in enumerate(dataframe.columns):
            # make other columns consistent with other readers
            if item == "mpi.rank":
                dataframe.columns.values[idx] = "rank"
            if item == "module#cali.sampler.pc":
                dataframe.columns.values[idx] = "module"

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        metadata = self.filename_or_caliperreader.globals

        return hatchet.graphframe.GraphFrame(graph,
                                             dataframe,
                                             exc_metrics,
                                             inc_metrics,
                                             metadata=metadata)
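
This reader backs the from_caliperreader() entry point named in its own ValueError. A hedged invocation sketch (the .cali filename is hypothetical; the metadata attribute mirrors the metadata= keyword passed above):

import hatchet

# accepts either a .cali file path or an already-populated
# caliperreader.CaliperReader, per the isinstance check above
gf = hatchet.GraphFrame.from_caliperreader("example-profile.cali")
print(gf.metadata)   # the Caliper globals collected by the reader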
Example #5
    def read(self):
        list_roots = []
        node_dicts = []
        frame_to_node_dict = {}
        frame = None

        # start with creating a node_dict for each root
        for i in range(len(self.graph_dict)):
            frame = Frame(self.graph_dict[i]["frame"])
            graph_root = Node(frame, None)

            # depending on the node type, the name may not be in the frame
            node_name = self.graph_dict[i]["frame"].get("name")
            if not node_name:
                node_name = self.graph_dict[i]["name"]

            node_dict = dict({
                "node": graph_root,
                "name": node_name
            }, **self.graph_dict[i]["metrics"])
            node_dicts.append(node_dict)

            list_roots.append(graph_root)
            frame_to_node_dict[frame] = graph_root

            # call recursively on all children of root
            if "children" in self.graph_dict[i]:
                for child in self.graph_dict[i]["children"]:
                    self.parse_node_literal(frame_to_node_dict, node_dicts,
                                            child, graph_root)

        graph = Graph(list_roots)
        graph.enumerate_traverse()

        exc_metrics = []
        inc_metrics = []
        # "i" is left over from the roots loop above, so this assumes every
        # root carries the same set of metric keys
        for key in self.graph_dict[i]["metrics"].keys():
            if "(inc)" in key:
                inc_metrics.append(key)
            else:
                exc_metrics.append(key)

        dataframe = pd.DataFrame(data=node_dicts)
        dataframe.set_index(["node"], inplace=True)
        dataframe.sort_index(inplace=True)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
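
Reading the code backwards gives the literal input format this reader expects: a list of root dicts, each with a "frame", a "metrics" dict whose "(inc)" key names mark inclusive metrics, and optional "children". A hedged sketch with made-up values:

graph_dict = [
    {
        "frame": {"name": "main", "type": "function"},
        "metrics": {"time": 5.0, "time (inc)": 10.0},
        "children": [
            {
                "frame": {"name": "solve", "type": "function"},
                "metrics": {"time": 5.0, "time (inc)": 5.0},
            }
        ],
    }
]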
Example #6
def test_copy():
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))
    g = Graph.from_lists(("e", "f", diamond_subdag),
                         ("g", diamond_subdag, "h"))

    assert g.copy() == g
Example #7
def test_filter_squash_diamond():
    r"""Test that diamond edges are collapsed when squashing.

    Ensure we can handle the most basic DAG.

            a
           / \      remove bc     a
          b   c    ---------->    |
           \ /                    d
            d

    """
    d = Node(Frame(name="d"))
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", d), ("c", d))),
        lambda row: row["node"].frame["name"] not in ("b", "c"),
        Graph.from_lists(("a", "d")),
        [2, 1],  # a, d
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("a", ("b", d), ("c", d))),
        lambda row: row["node"].frame["name"] not in ("b", "c"),
        2,  # a, d
    )
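
The check_* helpers used throughout these tests wrap GraphFrame's public filter/squash pair. A hedged sketch of the equivalent direct calls for this diamond (assuming the same Node/Frame/GraphFrame imports the tests use, and that filter() takes the row predicate while squash() rewires the survivors, which is what the assertions describe):

d = Node(Frame(name="d"))
gf = GraphFrame.from_lists(("a", ("b", d), ("c", d)))

# drop b and c, then collapse the dangling diamond to a -> d
filtered = gf.filter(lambda row: row["node"].frame["name"] not in ("b", "c"))
squashed = filtered.squash()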
Example #8
def test_filter_squash_with_rootless_merge():
    r"""Test squash on a simple tree with several rootless node merges.

               a
          ___/ | \___     remove abcd
         b     c     d   ------------>  e f g
        /|\   /|\   /|\
       e f g e f g e f g

    Note that here, because b and d have been removed, a will have only
    one child called c, which will contain merged (summed) data from the
    original c rows.

    """
    check_filter_squash(
        GraphFrame.from_lists(
            ("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"), ("d", "e", "f", "g"))
        ),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c", "d"),
        Graph.from_lists(["e"], ["f"], ["g"]),
        [3, 3, 3],  # e, f, g
    )

    check_filter_no_squash(
        GraphFrame.from_lists(
            ("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"), ("d", "e", "f", "g"))
        ),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c", "d"),
        9,  # e, f, g, e, f, g, e, f, g
    )
Example #9
def test_filter_squash_bunny():
    r"""Test squash on a complicated "bunny" shaped graph.

    This has multiple roots as well as multiple parents that themselves
    have parents.

          e   g
         / \ / \
        f   a   h    remove abc     e   g
           / \      ----------->   / \ / \
          b   c                   f   d   h
           \ /
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    new_d = Node(Frame(name="d"))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c"),
        Graph.from_lists(("e", new_d, "f"), ("g", new_d, "h")),
        [3, 1, 1, 3, 1],  # e, d, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "b", "c"),
        5,  # e, d, f, g, h
    )
Example #10
def test_filter_squash_bunny_to_goat():
    r"""Test squash on a "bunny" shaped graph:

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and c.

          e   g                     e   g
         / \ / \                   /|\ /|\
        f   a   h    remove ac    f | b | h
           / \      ---------->     | | |
          b   c                      \|/
           \ /                        d
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))

    new_d = Node(Frame(name="d"))
    new_b = Node.from_lists(("b", new_d))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        Graph.from_lists(("e", new_b, new_d, "f"), ("g", new_b, new_d, "h")),
        [4, 2, 1, 1, 4, 1],  # e, b, d, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        6,  # e, b, d, f, g, h
    )
Example #11
def test_filter_squash_bunny_to_goat_with_merge():
    r"""Test squash on a "bunny" shaped graph:

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and c.

          e   g
         / \ / \
        f   a   h    remove ac      e   g
           / \      ---------->    / \ / \
          b   c                   f   b   h
           \ /
            b

    """
    b = Node(Frame(name="b"))
    diamond = Node.from_lists(("a", ("b", b), ("c", b)))

    new_b = Node(Frame(name="b"))

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        Graph.from_lists(("e", new_b, "f"), ("g", new_b, "h")),
        [4, 2, 1, 4, 1],  # e, b, f, g, h
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        lambda row: row["node"].frame["name"] not in ("a", "c"),
        5,  # e, b, f, g, h
    )
Example #12
def test_filter_squash_with_merge():
    r"""Test squash with a simple node merge.

          a
         / \      remove bd     a
        b   d    ---------->    |
       /      \                 c
      c        c

    Note that here, because b and d have been removed, a will have only
    one child called c, which will contain merged (summed) data from the
    original c rows.

    """
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "c"))),
        lambda row: row["node"].frame["name"] in ("a", "c"),
        Graph.from_lists(("a", "c")),
        [3, 2],  # a, c
    )

    check_filter_no_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "c"))),
        lambda row: row["node"].frame["name"] in ("a", "c"),
        3,  # a, c, c
    )
Example #13
def test_traverse_paths():
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag),
                         ("g", diamond_subdag, "h"))
    assert list(
        g.traverse(attrs="name")) == ["e", "a", "b", "d", "c", "f", "g", "h"]
Example #14
def test_union_dag():
    # make graphs g1, g2, and g3, where you know g3 is the union of g1 and g2
    c = Node.from_lists(("c", "d"))
    g1 = Graph.from_lists(("a", ("b", c), ("e", c, "f")))

    d = Node(Frame(name="d"))
    g2 = Graph.from_lists(("a", ("b", ("c", d)), ("e", d, "f")))

    d2 = Node(Frame(name="d"))
    c2 = Node.from_lists(("c", d2))
    g3 = Graph.from_lists(("a", ("b", c2), ("e", c2, d2, "f")))

    assert g1 != g2

    g4 = g1.union(g2)

    assert g4 == g3
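
A hedged reading of union() from these assertions: it builds a new graph containing every edge of either input, merging nodes that sit at the same position with the same frame. A smaller sketch of the same behavior:

g1 = Graph.from_lists(("a", "b"))
g2 = Graph.from_lists(("a", "c"))

g = g1.union(g2)
assert len(g) == 3   # the two "a" roots merge, keeping children b and c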
Example #15
def test_from_lists():
    """Ensure we can traverse roots in correct order without repeating a
       shared subdag.
    """
    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert list(g.traverse(attrs="name")) == ["e", "a", "b", "d", "c", "f", "g", "h"]
Example #16
def _reconstruct_graph(df, rel_dict):
    node_list = sorted(list(df.index.to_frame()["node"]))
    for i in range(len(df)):
        node = _get_node_from_df_iloc(df, i)
        if len(node.children) == 0:
            node.children = [
                node_list[nid] for nid in rel_dict[node]["children"]
            ]
        if len(node.parents) == 0:
            node.parents = [
                node_list[nid] for nid in rel_dict[node]["parents"]
            ]
    roots = [node for node in node_list if len(node.parents) == 0]
    return Graph(roots)
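
A hedged sketch of the relationship dictionary this helper consumes: keyed by Node, with "children" and "parents" given as positions (nids) into the sorted node list built on the first line (the node objects below are illustrative):

from hatchet.frame import Frame
from hatchet.node import Node

node_a, node_b, node_c = (Node(Frame(name=n)) for n in "abc")

# positions 0, 1, 2 index the sorted node list; this encodes a -> b, a -> c
rel_dict = {
    node_a: {"children": [1, 2], "parents": []},
    node_b: {"children": [], "parents": [0]},
    node_c: {"children": [], "parents": [0]},
}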
Example #17
def test_filter_squash_different_roots():
    r"""Test squash on a simple tree with one root but make multiple roots.

          a
         / \      remove a     b  d
        b   d    --------->   /    \
       /      \              c      e
      c        e

    """
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "e"))),
        lambda row: row["node"].frame["name"] != "a",
        Graph.from_lists(("b", "c"), ("d", "e")),
        [2, 1, 2, 1],  # b, c, d, e
    )
Example #18
def test_filter_squash():
    r"""Test squash on a simple tree with one root.

          a
         / \      remove bd     a
        b   d    ---------->   / \
       /      \               c   e
      c        e

    """
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "e"))),
        lambda row: row["node"].frame["name"] in ("a", "c", "e"),
        Graph.from_lists(("a", "c", "e")),
        [3, 1, 1],  # a, c, e
    )
Example #19
def test_dag_is_not_tree():
    g = Graph.from_lists(("b", "c"), ("d", "e"))
    assert not g.is_tree()

    d = Node(Frame(name="d"))
    diamond_subdag = Node.from_lists(("a", ("b", d), ("c", d)))
    g = Graph([diamond_subdag])
    assert not g.is_tree()

    g = Graph.from_lists(("e", "f", diamond_subdag), ("g", diamond_subdag, "h"))
    assert not g.is_tree()
Example #20
    def create_graph(self):
        def parse_node_literal(child_dict, hparent):
            """Create node_dict for one node and then call the function
            recursively on all children."""

            hnode = Node(
                Frame({
                    "name": child_dict["function"],
                    "type": "function"
                }), hparent)

            child_node_dict = {
                "node": hnode,
                "name": child_dict["function"],
                "file": child_dict["file_path_short"],
                "line": child_dict["line_no"],
                "time": child_dict["time"],
                "time (inc)": child_dict["time"],
                "is_application_code": child_dict["is_application_code"],
            }

            hparent.add_child(hnode)
            self.node_dicts.append(child_node_dict)

            if "children" in child_dict:
                for child in child_dict["children"]:
                    # Pyinstrument's time metric actually stores inclusive time.
                    # To calculate exclusive time, we subtract the children's time
                    # from the parent's time.
                    child_node_dict["time"] -= child["time"]
                    parse_node_literal(child, hnode)

        # start with creating a node_dict for each root
        graph_root = Node(
            Frame({
                "name": self.graph_dict["root_frame"]["function"],
                "type": "function"
            }),
            None,
        )

        node_dict = {
            "node": graph_root,
            "name": self.graph_dict["root_frame"]["function"],
            "file": self.graph_dict["root_frame"]["file_path_short"],
            "line": self.graph_dict["root_frame"]["line_no"],
            "time": self.graph_dict["root_frame"]["time"],
            "time (inc)": self.graph_dict["root_frame"]["time"],
            "is_application_code": self.graph_dict["root_frame"]["is_application_code"],
        }

        self.node_dicts.append(node_dict)
        self.list_roots.append(graph_root)

        # call recursively on all children of root
        if "children" in self.graph_dict["root_frame"]:
            for child in self.graph_dict["root_frame"]["children"]:
                # Pyinstrument's time metric actually stores inclusive time.
                # To calculate exclusive time, we subtract the children's time
                # from the parent's time.
                node_dict["time"] -= child["time"]
                parse_node_literal(child, graph_root)

        graph = Graph(self.list_roots)
        graph.enumerate_traverse()

        return graph
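
The repeated subtraction in both loops above deserves spelling out: Pyinstrument records inclusive time, so a node's exclusive time is its inclusive time minus the sum of its children's inclusive times. A standalone arithmetic sketch:

# root measured at 10.0s inclusive; its children account for 4.0s + 3.5s
root_time = 10.0
for child_time in (4.0, 3.5):
    root_time -= child_time   # same per-child subtraction as above

assert root_time == 2.5       # time spent in the root's own code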
Example #21
def test_len_chain():
    graph = Graph.from_lists(("a", "b", "c", "d", "e"))
    assert len(graph) == 5
Example #22
def test_len_diamond():
    d = Node(Frame(name="d"))
    graph = Graph.from_lists(("a", ("b", d), ("c", d)))
    assert len(graph) == 4
Example #23
def test_len_tree():
    graph = Graph.from_lists(("a", ("b", "d"), ("c", "d")))
    assert len(graph) == 5
Example #24
    def read(self):
        """Read the TAU profile file to extract the calling context tree."""
        # Add all nodes and roots.
        roots = self.create_graph()
        # Create a graph object once all nodes have been added.
        graph = Graph(roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame.from_dict(data=self.node_dicts)

        indices = []
        # Set indices according to rank/thread numbers.
        if self.multiple_ranks and self.multiple_threads:
            indices = ["node", "rank", "thread"]
        elif self.multiple_ranks:
            dataframe.drop(columns=["thread"], inplace=True)
            indices = ["node", "rank"]
        elif self.multiple_threads:
            dataframe.drop(columns=["rank"], inplace=True)
            indices = ["node", "thread"]
        else:
            indices = ["node"]

        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

        # Fill in the missing ranks.
        # After unstacking and iterating over rows, some ranks will hold
        # "NaN" values. Find the first rank that has a valid (non-NaN) value
        # and use it for the other rows/ranks of the multiindex.
        # TODO: iterrows() is not the best way to iterate over rows.
        if self.multiple_ranks or self.multiple_threads:
            dataframe = dataframe.unstack()
            for idx, row in dataframe.iterrows():
                # There is always a valid name for an index.
                # Take that valid name and assign to other ranks/rows.
                name = row["name"][row["name"].first_valid_index()]
                dataframe.loc[idx, "name"] = name

                # Sometimes there is no file information.
                if row["file"].first_valid_index() is not None:
                    file = row["file"][row["file"].first_valid_index()]
                    dataframe.loc[idx, "file"] = file

                # Sometimes there is no module information.
                if row["module"].first_valid_index() is not None:
                    module = row["module"][row["module"].first_valid_index()]
                    dataframe.loc[idx, "module"] = module

                # Fill the rest with 0
                dataframe.fillna(0, inplace=True)

            # Stack the dataframe
            dataframe = dataframe.stack()

        default_metric = "time (inc)"

        return hatchet.graphframe.GraphFrame(graph, dataframe,
                                             self.exc_metrics,
                                             self.inc_metrics, default_metric)
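
The unstack/fill/stack dance is easiest to see on a toy frame; a hedged pandas-only sketch (column and index names are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"name": ["main", np.nan, "solve", np.nan],
     "time": [1.0, np.nan, 2.0, np.nan]},
    index=pd.MultiIndex.from_product([["n0", "n1"], [0, 1]],
                                     names=["node", "rank"]),
)

wide = df.unstack()      # one row per node, one column per (field, rank)
for idx, row in wide.iterrows():
    # copy the first valid name across all ranks of this node
    wide.loc[idx, "name"] = row["name"][row["name"].first_valid_index()]
wide.fillna(0, inplace=True)   # remaining metric gaps become 0
df = wide.stack()              # back to the (node, rank) MultiIndex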
Example #25
    def read(self):
        """Read the caliper JSON file to extract the calling context tree."""
        with self.timer.phase("read json"):
            self.read_json_sections()

        with self.timer.phase("graph construction"):
            list_roots = self.create_graph()

        # create a dataframe of metrics from the data section
        self.df_json_data = pd.DataFrame(self.json_data,
                                         columns=self.json_cols)

        # map non-numeric columns to their mappings in the nodes section
        for idx, item in enumerate(self.json_cols_mdata):
            if item["is_value"] is False and self.json_cols[
                    idx] != self.nid_col_name:
                if self.json_cols[idx] == "sourceloc#cali.sampler.pc":
                    # split source file and line number into two columns
                    self.df_json_data["file"] = self.df_json_data[
                        self.json_cols[idx]].apply(
                            lambda x: re.match(r"(.*):(\d+)", self.json_nodes[
                                x]["label"]).group(1))
                    self.df_json_data["line"] = self.df_json_data[
                        self.json_cols[idx]].apply(
                            lambda x: re.match(r"(.*):(\d+)", self.json_nodes[
                                x]["label"]).group(2))
                    self.df_json_data.drop(self.json_cols[idx],
                                           axis=1,
                                           inplace=True)
                    sourceloc_idx = idx
                else:
                    self.df_json_data[self.json_cols[idx]] = self.df_json_data[
                        self.json_cols[idx]].apply(
                            lambda x: self.json_nodes[x]["label"])

        # since we split sourceloc, we should update json_cols and
        # json_cols_mdata
        if "sourceloc#cali.sampler.pc" in self.json_cols:
            self.json_cols.pop(sourceloc_idx)
            self.json_cols_mdata.pop(sourceloc_idx)
            self.json_cols.append("file")
            self.json_cols.append("line")
            self.json_cols_mdata.append({"is_value": False})
            self.json_cols_mdata.append({"is_value": False})

        max_nid = self.df_json_data[self.nid_col_name].max()

        if "line" in self.df_json_data.columns:
            # split nodes that have multiple file:line numbers to have a child
            # each with a unique file:line number
            unique_nodes = self.df_json_data.groupby(self.nid_col_name)
            df_concat = [self.df_json_data]

            for nid, super_node in unique_nodes:
                line_groups = super_node.groupby("line")
                # only need to do something if there are more than one
                # file:line number entries for the node
                if len(line_groups.size()) > 1:
                    sn_hnode = self.idx_to_node[nid]["node"]

                    for line, line_group in line_groups:
                        # create the node label
                        file_path = (line_group.head(1))["file"].item()
                        file_name = os.path.basename(file_path)
                        node_label = file_name + ":" + line

                        # create a new hatchet node
                        max_nid += 1
                        idx = max_nid
                        hnode = Node(
                            Frame({
                                "type": "statement",
                                "file": file_path,
                                "line": line
                            }),
                            sn_hnode,
                        )
                        sn_hnode.add_child(hnode)

                        node_dict = {
                            self.nid_col_name: idx,
                            "name": node_label,
                            "node": hnode,
                        }
                        self.idx_to_node[idx] = node_dict

                        # change nid of the original node to new node in place
                        for index, row in line_group.iterrows():
                            self.df_json_data.loc[index, "nid"] = max_nid

                    # add new row for original node
                    node_copy = super_node.head(1).copy()
                    for cols in self.metric_columns:
                        node_copy[cols] = 0
                    df_concat.append(node_copy)

            # concatenate all the newly created dataframes with
            # self.df_json_data
            self.df_fixed_data = pd.concat(df_concat)
        else:
            self.df_fixed_data = self.df_json_data

        # create a dataframe with all nodes in the call graph
        self.df_nodes = pd.DataFrame.from_dict(
            data=list(self.idx_to_node.values()))

        # add missing intermediate nodes to the df_fixed_data dataframe
        if "rank" in self.json_cols:
            self.num_ranks = self.df_fixed_data["rank"].max() + 1
            rank_list = range(0, self.num_ranks)

        # create a standard dict to be used for filling all missing rows
        default_metric_dict = {}
        for idx, item in enumerate(self.json_cols_mdata):
            if self.json_cols[idx] != self.nid_col_name:
                if item["is_value"] is True:
                    default_metric_dict[self.json_cols[idx]] = 0
                else:
                    default_metric_dict[self.json_cols[idx]] = None

        # create a list of dicts, one dict for each missing row
        missing_nodes = []
        for iteridx, row in self.df_nodes.iterrows():
            # check if df_nodes row exists in df_fixed_data
            metric_rows = self.df_fixed_data.loc[self.df_fixed_data[
                self.nid_col_name] == row[self.nid_col_name]]
            if "rank" not in self.json_cols:
                if metric_rows.empty:
                    # add a single row
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    missing_nodes.append(node_dict)
            else:
                if metric_rows.empty:
                    # add a row per MPI rank
                    for rank in rank_list:
                        node_dict = dict(default_metric_dict)
                        node_dict[self.nid_col_name] = row[self.nid_col_name]
                        node_dict["rank"] = rank
                        missing_nodes.append(node_dict)
                elif len(metric_rows) < self.num_ranks:
                    # add a row for each missing MPI rank
                    present_ranks = metric_rows["rank"].values
                    missing_ranks = [
                        x for x in rank_list if x not in present_ranks
                    ]
                    for rank in missing_ranks:
                        node_dict = dict(default_metric_dict)
                        node_dict[self.nid_col_name] = row[self.nid_col_name]
                        node_dict["rank"] = rank
                        missing_nodes.append(node_dict)

        self.df_missing = pd.DataFrame.from_dict(data=missing_nodes)
        self.df_metrics = pd.concat([self.df_fixed_data, self.df_missing])

        # create a graph object once all the nodes have been added
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        # merge the metrics and node dataframes on the idx column
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics,
                                 self.df_nodes,
                                 on=self.nid_col_name)
            # set the index to be a MultiIndex
            indices = ["node"]
            if "rank" in self.json_cols:
                indices.append("rank")
            dataframe.set_index(indices, inplace=True)
            dataframe.sort_index(inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
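
The sourceloc handling above splits labels like "solver.c:42" into separate file and line columns; the same transformation in isolation (label data made up):

import re

import pandas as pd

labels = pd.Series(["solver.c:42", "main.c:7"])

file_col = labels.apply(lambda s: re.match(r"(.*):(\d+)", s).group(1))
line_col = labels.apply(lambda s: re.match(r"(.*):(\d+)", s).group(2))

assert list(file_col) == ["solver.c", "main.c"]
assert list(line_col) == ["42", "7"]   # line numbers stay strings here too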
Example #26
    def read(self):
        """Read the experiment.xml file to extract the calling context tree and create
        a dataframe out of it. Then merge the two dataframes to create the final
        dataframe.

        Return:
            (GraphFrame): new GraphFrame with HPCToolkit data.
        """
        with self.timer.phase("fill tables"):
            self.fill_tables()

        with self.timer.phase("read metric db"):
            self.read_all_metricdb_files()

        list_roots = []

        # parse the ElementTree to generate a calling context tree
        for root in self.callpath_profile.findall("PF"):
            global src_file

            nid = int(root.get("i"))
            src_file = root.get("f")

            # start with the root and create the callpath and node for the root
            # also a corresponding node_dict to be inserted into the dataframe
            graph_root = Node(
                Frame({
                    "type": "function",
                    "name": self.procedure_names[root.get("n")]
                }),
                None,
            )
            node_dict = self.create_node_dict(
                nid,
                graph_root,
                self.procedure_names[root.get("n")],
                "PF",
                self.src_files[src_file],
                int(root.get("l")),
                self.load_modules[root.get("lm")],
            )

            self.node_dicts.append(node_dict)
            list_roots.append(graph_root)

            # start graph construction at the root
            with self.timer.phase("graph construction"):
                self.parse_xml_children(root, graph_root)

            # put updated metrics back in dataframe
            for i, column in enumerate(self.metric_columns):
                if "(inc)" not in column and "(I)" not in column:
                    self.df_metrics[column] = self.np_metrics.T[i]

        with self.timer.phase("graph construction"):
            graph = Graph(list_roots)
            graph.enumerate_traverse()

        # create a dataframe for all the nodes in the graph
        self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

        # merge the metrics and node dataframes
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")

            # set the index to be a MultiIndex
            if self.num_threads_per_rank > 1:
                indices = ["node", "rank", "thread"]
            # if number of threads per rank is 1, do not make thread an index
            elif self.num_threads_per_rank == 1:
                indices = ["node", "rank"]
            dataframe.set_index(indices, inplace=True)
            dataframe.sort_index(inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column or "(I)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
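
The closing merge is a pattern several readers here share: metric rows and node rows both carry a numeric "nid" key, and pd.merge attaches the per-node columns to every metric row before the MultiIndex is set. A toy version ("name" stands in for the "node" objects):

import pandas as pd

df_metrics = pd.DataFrame({"nid": [0, 0, 1, 1], "rank": [0, 1, 0, 1],
                           "time": [1.0, 2.0, 3.0, 4.0]})
df_nodes = pd.DataFrame({"nid": [0, 1], "name": ["main", "solve"]})

dataframe = pd.merge(df_metrics, df_nodes, on="nid")   # name attached to all 4 rows
dataframe.set_index(["name", "rank"], inplace=True)
dataframe.sort_index(inplace=True)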
Example #27
    def read(self):
        # print(self.dict)

        # filter regions
        if len(self.filter) > 0:
            for rank, rank_value in list(self.dict["ranks"].items()):
                for thread, thread_value in list(
                        rank_value["threads"].items()):
                    for region, data in list(thread_value["regions"].items()):
                        if any(map(data["name"].__contains__,
                                   self.filter)) is True:
                            del self.dict["ranks"][str(rank)]["threads"][str(
                                thread)]["regions"][str(region)]

        # add default metrics 'cycles' and 'real_time_nsec' to inc_metrics
        self.inc_metrics.append("cycles")
        self.inc_metrics.append("real_time_nsec")

        # determine thread with the largest number of regions to create the graph
        max_regions = 1
        graph_rank = 0
        graph_thread = 0

        rank_cnt = 0
        thread_cnt = 0
        for rank, rank_value in iter(self.dict["ranks"].items()):
            rank_cnt += 1
            for thread, thread_value in iter(rank_value["threads"].items()):
                thread_cnt += 1
                if len(thread_value["regions"]) > max_regions:
                    max_regions = len(thread_value["regions"])
                    graph_rank = int(rank)
                    graph_thread = int(thread)

        # create graph
        list_roots = []
        node_dicts = []
        self.__create_graph(graph_rank, graph_thread, list_roots, node_dicts)

        # fill up node dictionaries for all remaining ranks and threads
        for rank, rank_value in iter(self.dict["ranks"].items()):
            for thread, thread_value in iter(rank_value["threads"].items()):
                if int(rank) != graph_rank or int(thread) != graph_thread:
                    node_graph_id = -1
                    for data in iter(thread_value["regions"].values()):
                        # print(data["name"])

                        node_graph_id += 1
                        if node_graph_id >= len(self.node_graph_dict):
                            self.__print_error_and_exit(
                                data["name"], rank, thread)

                        # find matching regions
                        found_match = False
                        while found_match is False:
                            if self.node_graph_dict[node_graph_id][0] == data[
                                    "name"]:
                                found_match = True
                            else:
                                # create a tuple of zero values
                                zero_metrics = self.__get_zero_metrics()
                                node_dict = dict({
                                    "name": self.node_graph_dict[node_graph_id][0],
                                    "node": self.node_graph_dict[node_graph_id][1],
                                    "rank": int(rank),
                                    "thread": int(thread),
                                    **zero_metrics,
                                })
                                node_dicts.append(node_dict)

                                # set index to the next region
                                node_graph_id += 1
                                if node_graph_id >= len(self.node_graph_dict):
                                    self.__print_error_and_exit(
                                        data["name"], rank, thread)

                        if found_match is True:
                            # we found a match
                            contain_read_events = [0]
                            metrics = self.__get_metrics(
                                data, contain_read_events)

                            node_dict = dict({
                                "name": self.node_graph_dict[node_graph_id][0],
                                "node": self.node_graph_dict[node_graph_id][1],
                                "rank": int(rank),
                                "thread": int(thread),
                                **metrics,
                            })
                            node_dicts.append(node_dict)
                            # check if we have to add read events
                            if contain_read_events[0] == 1:

                                # check how many read calls are used
                                read_num = len(data["cycles"])

                                for i in range(1, read_num):
                                    node_name_read = "read_" + str(i)

                                    read_metrics = self.__get_read_metrics(
                                        data, node_name_read)
                                    node_dict = dict({
                                        "name": node_name_read,
                                        "node": self.__find_read_node(
                                            self.node_graph_dict[node_graph_id][1],
                                            node_name_read,
                                        ),
                                        "rank": int(rank),
                                        "thread": int(thread),
                                        **read_metrics,
                                    })
                                    node_dicts.append(node_dict)

        # setup data for hatchet graphframe
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame(data=node_dicts)

        # check graph indices
        if rank_cnt > 1 and thread_cnt > 1:
            indices = ["node", "rank", "thread"]
        elif rank_cnt > 1:
            dataframe.drop(columns=["thread"], inplace=True)
            indices = ["node", "rank"]
        elif thread_cnt > 1:
            dataframe.drop(columns=["rank"], inplace=True)
            indices = ["node", "thread"]
        else:
            dataframe.drop(columns=["rank", "thread"], inplace=True)
            indices = ["node"]

        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

        default_metric = "real_time_nsec"

        return hatchet.graphframe.GraphFrame(graph, dataframe, [],
                                             self.inc_metrics, default_metric)
Example #28
    def read(self):
        """Read the experiment.xml file to extract the calling context tree and create
        a dataframe out of it. Then merge the two dataframes to create the final
        dataframe.

        Return:
            (GraphFrame): new GraphFrame with HPCToolkit data.
        """
        with self.timer.phase("fill tables"):
            self.fill_tables()

        with self.timer.phase("read metric db"):
            self.read_all_metricdb_files()

        list_roots = []

        # parse the ElementTree to generate a calling context tree
        for root in self.callpath_profile.findall("PF"):
            nid = int(root.get("i"))

            # start with the root and create the callpath and node for the root
            # also a corresponding node_dict to be inserted into the dataframe
            node_callpath = []
            node_callpath.append(self.procedure_names[root.get("n")])
            graph_root = Node(
                Frame({
                    "type": "function",
                    "name": self.procedure_names[root.get("n")]
                }),
                None,
            )
            node_dict = self.create_node_dict(
                nid,
                graph_root,
                self.procedure_names[root.get("n")],
                "PF",
                self.src_files[root.get("f")],
                root.get("l"),
                self.load_modules[root.get("lm")],
            )

            self.node_dicts.append(node_dict)
            list_roots.append(graph_root)

            # start graph construction at the root
            with self.timer.phase("graph construction"):
                self.parse_xml_children(root, graph_root, list(node_callpath))

        # create a dataframe for all the nodes in the graph
        self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

        # merge the metrics and node dataframes
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")
            # set the index to be a MultiIndex
            indices = ["node", "rank"]
            dataframe.set_index(indices, drop=False, inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(Graph(list_roots), dataframe,
                                             exc_metrics, inc_metrics)