Example #1
    def read(self):
        list_roots = []
        node_dicts = []
        frame_to_node_dict = {}
        frame = None
        seen_nids = []
        hnid = -1

        # start with creating a node_dict for each root
        for i in range(len(self.graph_dict)):
            if "_hatchet_nid" in self.graph_dict[i]["metrics"]:
                hnid = self.graph_dict[i]["metrics"]["_hatchet_nid"]
                seen_nids.append(hnid)
            frame = Frame(self.graph_dict[i]["frame"])
            graph_root = Node(frame, None, hnid=hnid)

            # depending on the node type, the name may not be in the frame
            node_name = self.graph_dict[i]["frame"].get("name")
            if not node_name:
                node_name = self.graph_dict[i]["name"]

            node_dict = dict({
                "node": graph_root,
                "name": node_name
            }, **self.graph_dict[i]["metrics"])
            node_dicts.append(node_dict)

            list_roots.append(graph_root)
            frame_to_node_dict[frame] = graph_root

            # call recursively on all children of root
            if "children" in self.graph_dict[i]:
                for child in self.graph_dict[i]["children"]:
                    self.parse_node_literal(frame_to_node_dict, node_dicts,
                                            child, graph_root, seen_nids)

        graph = Graph(list_roots)

        # if any node is missing a _hatchet_nid (i.e., it is still -1),
        # renumber the whole graph; otherwise keep the loaded nids and
        # only compute node depths
        if -1 in [n._hatchet_nid for n in graph.traverse()]:
            graph.enumerate_traverse()
        else:
            graph.enumerate_depth()

        exc_metrics = []
        inc_metrics = []
        # assumes every node carries the same metric keys, so the metrics
        # of the last root visited above (loop index i) are representative
        for key in self.graph_dict[i]["metrics"].keys():
            if "(inc)" in key:
                inc_metrics.append(key)
            else:
                exc_metrics.append(key)

        dataframe = pd.DataFrame(data=node_dicts)
        dataframe.set_index(["node"], inplace=True)
        dataframe.sort_index(inplace=True)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
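
For context, the literal format this reader walks is a list of root dicts, each carrying a "frame", a "metrics" dict, and an optional "children" list. A minimal sketch with invented function names and metric values, assuming hatchet's literal entry point:

    import hatchet as ht

    # a two-node calling context in the literal format consumed above
    # (names and timings are invented)
    graph_dict = [
        {
            "frame": {"name": "main", "type": "function"},
            "metrics": {"time (inc)": 130.0, "time": 10.0},
            "children": [
                {
                    "frame": {"name": "solve", "type": "function"},
                    "metrics": {"time (inc)": 120.0, "time": 120.0},
                }
            ],
        }
    ]

    gf = ht.GraphFrame.from_literal(graph_dict)

Metric keys containing "(inc)" land in inc_metrics; every other key is treated as exclusive.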
Example #2
    def read(self):
        roots = self.create_graph()
        graph = Graph(roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame.from_dict(data=list(self.name_to_dict.values()))
        index = ["node"]
        dataframe.set_index(index, inplace=True)
        dataframe.sort_index(inplace=True)

        return hatchet.graphframe.GraphFrame(graph, dataframe, ["time"], ["time (inc)"])
Example #3
    def read(self):
        """Read the caliper records to extract the calling context tree."""
        if isinstance(self.filename_or_caliperreader, str):
            if self.filename_ext != ".cali":
                raise ValueError("from_caliperreader() needs a .cali file")
            else:
                cali_file = self.filename_or_caliperreader
                self.filename_or_caliperreader = cr.CaliperReader()
                self.filename_or_caliperreader.read(cali_file)

        with self.timer.phase("graph construction"):
            list_roots = self.create_graph()

        # create a graph object once all the nodes have been added
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame(data=self.node_dicts)

        indices = ["node"]
        if "rank" in dataframe.columns:
            indices.append("rank")
        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

        # change column names
        for idx, item in enumerate(dataframe.columns):
            # make other columns consistent with other readers
            if item == "mpi.rank":
                dataframe.columns.values[idx] = "rank"
            if item == "module#cali.sampler.pc":
                dataframe.columns.values[idx] = "module"

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        metadata = self.filename_or_caliperreader.globals

        return hatchet.graphframe.GraphFrame(graph,
                                             dataframe,
                                             exc_metrics,
                                             inc_metrics,
                                             metadata=metadata)
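
A hedged usage sketch for this reader, assuming hatchet's top-level from_caliperreader entry point; the file name is invented:

    import hatchet as ht

    # reads a Caliper .cali file; the reader above equally accepts an
    # already-populated caliperreader.CaliperReader instance
    gf = ht.GraphFrame.from_caliperreader("lulesh-sample.cali")
    print(gf.dataframe.head())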
Example #4
    def read(self):
        """Read the TAU profile file to extract the calling context tree."""
        # Add all nodes and roots.
        roots = self.create_graph()
        # Create a graph object once all nodes have been added.
        graph = Graph(roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame.from_dict(data=self.node_dicts)

        indices = []
        # Set indices according to rank/thread numbers.
        if self.multiple_ranks and self.multiple_threads:
            indices = ["node", "rank", "thread"]
        elif self.multiple_ranks:
            dataframe.drop(columns=["thread"], inplace=True)
            indices = ["node", "rank"]
        elif self.multiple_threads:
            dataframe.drop(columns=["rank"], inplace=True)
            indices = ["node", "thread"]
        else:
            indices = ["node"]

        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

        # Fill in missing ranks/threads.
        # After unstacking, some cells of the multiindex hold NaN. For the
        # non-metric columns (name, file, module), copy the first non-NaN
        # value across the other ranks/rows; the metrics are zero-filled
        # afterwards.
        # TODO: iterrows() is not the best way to iterate over rows.
        if self.multiple_ranks or self.multiple_threads:
            dataframe = dataframe.unstack()
            for idx, row in dataframe.iterrows():
                # There is always a valid name for an index.
                # Take that valid name and assign to other ranks/rows.
                name = row["name"][row["name"].first_valid_index()]
                dataframe.loc[idx, "name"] = name

                # Sometimes there is no file information.
                if row["file"].first_valid_index() is not None:
                    file = row["file"][row["file"].first_valid_index()]
                    dataframe.loc[idx, "file"] = file

                # Sometimes there is no module information.
                if row["module"].first_valid_index() is not None:
                    module = row["module"][row["module"].first_valid_index()]
                    dataframe.loc[idx, "module"] = module

            # Zero-fill the remaining NaN metric values once, after the
            # name/file/module columns have been propagated (this applies
            # to the whole dataframe, so it belongs outside the row loop)
            dataframe.fillna(0, inplace=True)

            # Stack the dataframe back to its original multiindex
            dataframe = dataframe.stack()

        default_metric = "time (inc)"

        return hatchet.graphframe.GraphFrame(graph, dataframe,
                                             self.exc_metrics,
                                             self.inc_metrics, default_metric)
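
The fill step above is easier to see on a toy frame. A minimal sketch with invented data showing how unstacking exposes the NaN holes and how first_valid_index() propagates names:

    import numpy as np
    import pandas as pd

    # one node, two ranks; rank 1 never saw the node, so its cells are NaN
    idx = pd.MultiIndex.from_tuples([("n0", 0), ("n0", 1)], names=["node", "rank"])
    df = pd.DataFrame({"name": ["main", np.nan], "time": [5.0, np.nan]}, index=idx)

    wide = df.unstack()  # columns become (name, 0), (name, 1), (time, 0), (time, 1)
    for i, row in wide.iterrows():
        # propagate the first non-NaN name across every rank of this node
        wide.loc[i, "name"] = row["name"][row["name"].first_valid_index()]
    wide.fillna(0, inplace=True)  # zero-fill the remaining metric holes
    df_filled = wide.stack()      # back to the (node, rank) multiindex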
Example #5
    def read(self):
        """Read the caliper JSON file to extract the calling context tree."""
        with self.timer.phase("read json"):
            self.read_json_sections()

        with self.timer.phase("graph construction"):
            list_roots = self.create_graph()

        # create a dataframe of metrics from the data section
        self.df_json_data = pd.DataFrame(self.json_data,
                                         columns=self.json_cols)

        # map non-numeric columns to their mappings in the nodes section
        for idx, item in enumerate(self.json_cols_mdata):
            if item["is_value"] is False and self.json_cols[idx] != self.nid_col_name:
                col = self.json_cols[idx]
                if col == "sourceloc#cali.sampler.pc":
                    # split source file and line number into two columns
                    self.df_json_data["file"] = self.df_json_data[col].apply(
                        lambda x: re.match(r"(.*):(\d+)",
                                           self.json_nodes[x]["label"]).group(1))
                    self.df_json_data["line"] = self.df_json_data[col].apply(
                        lambda x: re.match(r"(.*):(\d+)",
                                           self.json_nodes[x]["label"]).group(2))
                    self.df_json_data.drop(col, axis=1, inplace=True)
                    sourceloc_idx = idx
                else:
                    self.df_json_data[col] = self.df_json_data[col].apply(
                        lambda x: self.json_nodes[x]["label"])

        # since we split sourceloc, we should update json_cols and
        # json_cols_mdata
        if "sourceloc#cali.sampler.pc" in self.json_cols:
            self.json_cols.pop(sourceloc_idx)
            self.json_cols_mdata.pop(sourceloc_idx)
            self.json_cols.append("file")
            self.json_cols.append("line")
            self.json_cols_mdata.append({"is_value": False})
            self.json_cols_mdata.append({"is_value": False})

        max_nid = self.df_json_data[self.nid_col_name].max()

        if "line" in self.df_json_data.columns:
            # split nodes that have multiple file:line numbers to have a child
            # each with a unique file:line number
            unique_nodes = self.df_json_data.groupby(self.nid_col_name)
            df_concat = [self.df_json_data]

            for nid, super_node in unique_nodes:
                line_groups = super_node.groupby("line")
                # only need to act if there is more than one
                # file:line entry for the node
                if len(line_groups.size()) > 1:
                    sn_hnode = self.idx_to_node[nid]["node"]

                    for line, line_group in line_groups:
                        # create the node label
                        file_path = (line_group.head(1))["file"].item()
                        file_name = os.path.basename(file_path)
                        node_label = file_name + ":" + line

                        # create a new hatchet node
                        max_nid += 1
                        idx = max_nid
                        hnode = Node(
                            Frame({
                                "type": "statement",
                                "file": file_path,
                                "line": line
                            }),
                            sn_hnode,
                        )
                        sn_hnode.add_child(hnode)

                        node_dict = {
                            self.nid_col_name: idx,
                            "name": node_label,
                            "node": hnode,
                        }
                        self.idx_to_node[idx] = node_dict

                        # repoint the original node's rows at the new node's nid
                        for index, row in line_group.iterrows():
                            self.df_json_data.loc[index, "nid"] = max_nid

                    # add new row for original node
                    node_copy = super_node.head(1).copy()
                    for cols in self.metric_columns:
                        node_copy[cols] = 0
                    df_concat.append(node_copy)

            # concatenate all the newly created dataframes with
            # self.df_json_data
            self.df_fixed_data = pd.concat(df_concat)
        else:
            self.df_fixed_data = self.df_json_data

        # create a dataframe with all nodes in the call graph
        self.df_nodes = pd.DataFrame.from_dict(
            data=list(self.idx_to_node.values()))

        # add missing intermediate nodes to the df_fixed_data dataframe
        if "rank" in self.json_cols:
            self.num_ranks = self.df_fixed_data["rank"].max() + 1
            rank_list = range(0, self.num_ranks)

        # create a standard dict to be used for filling all missing rows
        default_metric_dict = {}
        for idx, item in enumerate(self.json_cols_mdata):
            if self.json_cols[idx] != self.nid_col_name:
                if item["is_value"] is True:
                    default_metric_dict[self.json_cols[idx]] = 0
                else:
                    default_metric_dict[self.json_cols[idx]] = None

        # create a list of dicts, one dict for each missing row
        missing_nodes = []
        for iteridx, row in self.df_nodes.iterrows():
            # check if df_nodes row exists in df_fixed_data
            metric_rows = self.df_fixed_data.loc[
                self.df_fixed_data[self.nid_col_name] == row[self.nid_col_name]]
            if "rank" not in self.json_cols:
                if metric_rows.empty:
                    # add a single row
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    missing_nodes.append(node_dict)
            else:
                if metric_rows.empty:
                    # add a row per MPI rank
                    for rank in rank_list:
                        node_dict = dict(default_metric_dict)
                        node_dict[self.nid_col_name] = row[self.nid_col_name]
                        node_dict["rank"] = rank
                        missing_nodes.append(node_dict)
                elif len(metric_rows) < self.num_ranks:
                    # add a row for each missing MPI rank
                    present_ranks = metric_rows["rank"].values
                    missing_ranks = [
                        x for x in rank_list if x not in present_ranks
                    ]
                    for rank in missing_ranks:
                        node_dict = dict(default_metric_dict)
                        node_dict[self.nid_col_name] = row[self.nid_col_name]
                        node_dict["rank"] = rank
                        missing_nodes.append(node_dict)

        self.df_missing = pd.DataFrame.from_dict(data=missing_nodes)
        self.df_metrics = pd.concat([self.df_fixed_data, self.df_missing])

        # create a graph object once all the nodes have been added
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        # merge the metrics and node dataframes on the idx column
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics,
                                 self.df_nodes,
                                 on=self.nid_col_name)
            # set the index to be a MultiIndex
            indices = ["node"]
            if "rank" in self.json_cols:
                indices.append("rank")
            dataframe.set_index(indices, inplace=True)
            dataframe.sort_index(inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
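
The sourceloc labels handled above are "path:line" strings; the regex splits them into the "file" and "line" columns, e.g. (label invented):

    import re

    # an invented label of the kind stored in json_nodes
    label = "/src/lulesh.cc:1342"
    m = re.match(r"(.*):(\d+)", label)
    print(m.group(1))  # /src/lulesh.cc -> "file" column
    print(m.group(2))  # 1342           -> "line" column (kept as a string)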
Example #6
    def read(self):
        """Read the experiment.xml file to extract the calling context tree and create
        a dataframe out of it. Then merge the two dataframes to create the final
        dataframe.

        Return:
            (GraphFrame): new GraphFrame with HPCToolkit data.
        """
        with self.timer.phase("fill tables"):
            self.fill_tables()

        with self.timer.phase("read metric db"):
            self.read_all_metricdb_files()

        list_roots = []

        # parse the ElementTree to generate a calling context tree
        for root in self.callpath_profile.findall("PF"):
            # src_file is module-level state, updated per root below and
            # declared global so the XML parsing helpers can read it
            global src_file

            nid = int(root.get("i"))
            src_file = root.get("f")

            # start with the root and create the callpath and node for the root
            # also a corresponding node_dict to be inserted into the dataframe
            graph_root = Node(
                Frame({
                    "type": "function",
                    "name": self.procedure_names[root.get("n")]
                }),
                None,
            )
            node_dict = self.create_node_dict(
                nid,
                graph_root,
                self.procedure_names[root.get("n")],
                "PF",
                self.src_files[src_file],
                int(root.get("l")),
                self.load_modules[root.get("lm")],
            )

            self.node_dicts.append(node_dict)
            list_roots.append(graph_root)

            # start graph construction at the root
            with self.timer.phase("graph construction"):
                self.parse_xml_children(root, graph_root)

            # put updated metrics back in dataframe
            for i, column in enumerate(self.metric_columns):
                if "(inc)" not in column and "(I)" not in column:
                    self.df_metrics[column] = self.np_metrics.T[i]

        with self.timer.phase("graph construction"):
            graph = Graph(list_roots)
            graph.enumerate_traverse()

        # create a dataframe for all the nodes in the graph
        self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

        # merge the metrics and node dataframes
        with self.timer.phase("data frame"):
            dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")

            # set the index to be a MultiIndex
            if self.num_threads_per_rank > 1:
                indices = ["node", "rank", "thread"]
            # if number of threads per rank is 1, do not make thread an index
            elif self.num_threads_per_rank == 1:
                indices = ["node", "rank"]
            dataframe.set_index(indices, inplace=True)
            dataframe.sort_index(inplace=True)

        # create list of exclusive and inclusive metric columns
        exc_metrics = []
        inc_metrics = []
        for column in self.metric_columns:
            if "(inc)" in column or "(I)" in column:
                inc_metrics.append(column)
            else:
                exc_metrics.append(column)

        return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics,
                                             inc_metrics)
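
A hedged usage sketch, assuming hatchet's top-level from_hpctoolkit entry point; the database directory name is invented:

    import hatchet as ht

    # the directory should be an HPCToolkit database holding
    # experiment.xml plus the metric-db files
    gf = ht.GraphFrame.from_hpctoolkit("hpctoolkit-lulesh-database")
    print(gf.tree())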
Example #7
    def create_graph(self):
        def parse_node_literal(child_dict, hparent):
            """Create node_dict for one node and then call the function
            recursively on all children."""

            hnode = Node(
                Frame({
                    "name": child_dict["function"],
                    "type": "function"
                }), hparent)

            child_node_dict = {
                "node": hnode,
                "name": child_dict["function"],
                "file": child_dict["file_path_short"],
                "line": child_dict["line_no"],
                "time": child_dict["time"],
                "time (inc)": child_dict["time"],
                "is_application_code": child_dict["is_application_code"],
            }

            hparent.add_child(hnode)
            self.node_dicts.append(child_node_dict)

            if "children" in child_dict:
                for child in child_dict["children"]:
                    # Pyinstrument's time metric actually stores inclusive time.
                    # To calculate exclusive time, we subtract the children's time
                    # from the parent's time.
                    child_node_dict["time"] -= child["time"]
                    parse_node_literal(child, hnode)

        # start with creating a node_dict for each root
        graph_root = Node(
            Frame({
                "name": self.graph_dict["root_frame"]["function"],
                "type": "function"
            }),
            None,
        )

        node_dict = {
            "node": graph_root,
            "name": self.graph_dict["root_frame"]["function"],
            "file": self.graph_dict["root_frame"]["file_path_short"],
            "line": self.graph_dict["root_frame"]["line_no"],
            "time": self.graph_dict["root_frame"]["time"],
            "time (inc)": self.graph_dict["root_frame"]["time"],
            "is_application_code": self.graph_dict["root_frame"]["is_application_code"],
        }

        self.node_dicts.append(node_dict)
        self.list_roots.append(graph_root)

        # call recursively on all children of root
        if "children" in self.graph_dict["root_frame"]:
            for child in self.graph_dict["root_frame"]["children"]:
                # Pyinstrument's time metric actually stores inclusive time.
                # To calculate exclusive time, we subtract the children's time
                # from the parent's time.
                node_dict["time"] -= child["time"]
                parse_node_literal(child, graph_root)

        graph = Graph(self.list_roots)
        graph.enumerate_traverse()

        return graph
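
The subtraction pattern above turns Pyinstrument's inclusive times into exclusive ones: a frame's exclusive time is its reported time minus the sum of its children's reported times. A worked sketch with invented numbers:

    # root reports 10.0s inclusive; its children report 6.0s and 3.0s
    root_time = 10.0
    child_times = [6.0, 3.0]

    # the reader starts "time" at the inclusive value and subtracts each
    # child as it recurses, leaving the exclusive remainder
    exclusive = root_time - sum(child_times)
    assert exclusive == 1.0  # time spent in the root frame itself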
Example #8
    def read(self):
        # filter out regions whose names contain any of the filter strings
        if len(self.filter) > 0:
            for rank, rank_value in list(self.dict["ranks"].items()):
                for thread, thread_value in list(rank_value["threads"].items()):
                    for region, data in list(thread_value["regions"].items()):
                        if any(f in data["name"] for f in self.filter):
                            del self.dict["ranks"][rank]["threads"][thread]["regions"][region]

        # add default metrics 'cycles' and 'real_time_nsec' to inc_metrics
        self.inc_metrics.append("cycles")
        self.inc_metrics.append("real_time_nsec")

        # determine thread with the largest number of regions to create the graph
        max_regions = 1
        graph_rank = 0
        graph_thread = 0

        rank_cnt = 0
        thread_cnt = 0
        for rank, rank_value in iter(self.dict["ranks"].items()):
            rank_cnt += 1
            for thread, thread_value in iter(rank_value["threads"].items()):
                thread_cnt += 1
                if len(thread_value["regions"]) > max_regions:
                    max_regions = len(thread_value["regions"])
                    graph_rank = int(rank)
                    graph_thread = int(thread)

        # create graph
        list_roots = []
        node_dicts = []
        self.__create_graph(graph_rank, graph_thread, list_roots, node_dicts)

        # fill up node dictionaries for all remaining ranks and threads
        for rank, rank_value in iter(self.dict["ranks"].items()):
            for thread, thread_value in iter(rank_value["threads"].items()):
                if int(rank) != graph_rank or int(thread) != graph_thread:
                    node_graph_id = -1
                    for data in iter(thread_value["regions"].values()):
                        node_graph_id += 1
                        if node_graph_id >= len(self.node_graph_dict):
                            self.__print_error_and_exit(
                                data["name"], rank, thread)

                        # find the graph region matching this data record
                        found_match = False
                        while found_match is False:
                            if self.node_graph_dict[node_graph_id][0] == data["name"]:
                                found_match = True
                            else:
                                # this graph region has no data on this
                                # rank/thread: record zero-valued metrics
                                zero_metrics = self.__get_zero_metrics()
                                node_dict = {
                                    "name": self.node_graph_dict[node_graph_id][0],
                                    "node": self.node_graph_dict[node_graph_id][1],
                                    "rank": int(rank),
                                    "thread": int(thread),
                                    **zero_metrics,
                                }
                                node_dicts.append(node_dict)

                                # advance to the next region in the graph
                                node_graph_id += 1
                                if node_graph_id >= len(self.node_graph_dict):
                                    self.__print_error_and_exit(
                                        data["name"], rank, thread)

                        if found_match is True:
                            # flag list that __get_metrics sets to [1] when
                            # the region contains read events
                            contain_read_events = [0]
                            metrics = self.__get_metrics(data, contain_read_events)

                            node_dict = {
                                "name": self.node_graph_dict[node_graph_id][0],
                                "node": self.node_graph_dict[node_graph_id][1],
                                "rank": int(rank),
                                "thread": int(thread),
                                **metrics,
                            }
                            node_dicts.append(node_dict)

                            # check if we have to add read events
                            if contain_read_events[0] == 1:
                                # number of read calls recorded for this region
                                read_num = len(data["cycles"])

                                for i in range(1, read_num):
                                    node_name_read = "read_" + str(i)
                                    read_metrics = self.__get_read_metrics(
                                        data, node_name_read)
                                    node_dict = {
                                        "name": node_name_read,
                                        "node": self.__find_read_node(
                                            self.node_graph_dict[node_graph_id][1],
                                            node_name_read,
                                        ),
                                        "rank": int(rank),
                                        "thread": int(thread),
                                        **read_metrics,
                                    }
                                    node_dicts.append(node_dict)

        # setup data for hatchet graphframe
        graph = Graph(list_roots)
        graph.enumerate_traverse()

        dataframe = pd.DataFrame(data=node_dicts)

        # choose the dataframe index based on how many ranks and threads exist
        if rank_cnt > 1 and thread_cnt > 1:
            indices = ["node", "rank", "thread"]
        elif rank_cnt > 1:
            dataframe.drop(columns=["thread"], inplace=True)
            indices = ["node", "rank"]
        elif thread_cnt > 1:
            dataframe.drop(columns=["rank"], inplace=True)
            indices = ["node", "thread"]
        else:
            dataframe.drop(columns=["rank", "thread"], inplace=True)
            indices = ["node"]

        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

        default_metric = "real_time_nsec"

        return hatchet.graphframe.GraphFrame(graph, dataframe, [],
                                             self.inc_metrics, default_metric)
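
The index selection at the end mirrors the other readers; a toy sketch with invented node_dicts showing the single-rank, multi-thread case:

    import pandas as pd

    # invented records for 1 rank x 2 threads; thread 1 never entered the
    # region, so its metrics were zero-filled by the alignment loop above
    node_dicts = [
        {"name": "computation", "node": "n0", "rank": 0, "thread": 0,
         "cycles": 1200, "real_time_nsec": 900},
        {"name": "computation", "node": "n0", "rank": 0, "thread": 1,
         "cycles": 0, "real_time_nsec": 0},
    ]
    dataframe = pd.DataFrame(data=node_dicts)

    # single rank, multiple threads: drop "rank" and index on (node, thread)
    dataframe.drop(columns=["rank"], inplace=True)
    dataframe.set_index(["node", "thread"], inplace=True)
    dataframe.sort_index(inplace=True)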