def read(self):
    """Build the calling context tree and wrap it in a GraphFrame.

    Returns:
        (GraphFrame): graph plus a node-indexed metrics dataframe, with
        "time" exclusive and "time (inc)" inclusive metric columns.
    """
    root_nodes = self.create_graph()
    graph = Graph(root_nodes)
    graph.enumerate_traverse()

    # one dataframe row per recorded node
    frame = pd.DataFrame.from_dict(data=list(self.name_to_dict.values()))
    frame.set_index(["node"], inplace=True)
    frame.sort_index(inplace=True)

    return hatchet.graphframe.GraphFrame(graph, frame, ["time"], ["time (inc)"])
def test_invalid_constructor():
    """GraphFrame must reject invalid graph/dataframe combinations."""
    # bad Graph
    with pytest.raises(ValueError):
        GraphFrame(None, None)

    # bad dataframe
    with pytest.raises(ValueError):
        GraphFrame(Graph([]), None)

    # dataframe has no "node" index
    with pytest.raises(ValueError):
        GraphFrame(Graph([]), pd.DataFrame())
def test_trees_are_trees():
    """Graphs built from plain nested lists should report is_tree()."""
    # single node
    assert Graph.from_lists(("a",)).is_tree()

    # chain: note ("c") is just the string "c", so this is a -> b -> c
    assert Graph.from_lists(("a", ("b", ("c")))).is_tree()

    # one root with two leaf children
    assert Graph.from_lists(("a", "b", "c")).is_tree()

    # three siblings, each with its own (distinct) e/f/g children
    wide = Graph.from_lists(
        ("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"), ("d", "e", "f", "g"))
    )
    assert wide.is_tree()
def read(self):
    """Read the caliper records to extract the calling context tree."""
    if isinstance(self.filename_or_caliperreader, str):
        if self.filename_ext != ".cali":
            raise ValueError("from_caliperreader() needs a .cali file")
        else:
            # replace the path string with a reader that has loaded the file
            cali_file = self.filename_or_caliperreader
            self.filename_or_caliperreader = cr.CaliperReader()
            self.filename_or_caliperreader.read(cali_file)

    with self.timer.phase("graph construction"):
        list_roots = self.create_graph()

    # create a graph object once all the nodes have been added
    graph = Graph(list_roots)
    graph.enumerate_traverse()

    dataframe = pd.DataFrame(data=self.node_dicts)

    # index on node, plus rank when per-rank data is present
    indices = ["node"]
    if "rank" in dataframe.columns:
        indices.append("rank")
    dataframe.set_index(indices, inplace=True)
    dataframe.sort_index(inplace=True)

    # rename caliper-specific columns so they match the other readers
    for idx, item in enumerate(dataframe.columns):
        if item == "mpi.rank":
            dataframe.columns.values[idx] = "rank"
        if item == "module#cali.sampler.pc":
            dataframe.columns.values[idx] = "module"

    # partition metric columns into exclusive and inclusive lists
    inc_metrics = [col for col in self.metric_columns if "(inc)" in col]
    exc_metrics = [col for col in self.metric_columns if "(inc)" not in col]

    metadata = self.filename_or_caliperreader.globals

    return hatchet.graphframe.GraphFrame(
        graph, dataframe, exc_metrics, inc_metrics, metadata=metadata
    )
def read(self):
    """Build a GraphFrame from the literal graph_dict list of root dicts."""
    list_roots = []
    node_dicts = []
    frame_to_node_dict = {}
    frame = None

    # create a node (and dataframe row) for each root, then recurse
    for i in range(len(self.graph_dict)):
        spec = self.graph_dict[i]
        frame = Frame(spec["frame"])
        graph_root = Node(frame, None)

        # depending on the node type, the name may not be in the frame
        node_name = spec["frame"].get("name") or spec["name"]

        node_dict = dict({"node": graph_root, "name": node_name}, **spec["metrics"])
        node_dicts.append(node_dict)
        list_roots.append(graph_root)
        frame_to_node_dict[frame] = graph_root

        # recurse over every child of this root
        if "children" in spec:
            for child in spec["children"]:
                self.parse_node_literal(
                    frame_to_node_dict, node_dicts, child, graph_root
                )

    graph = Graph(list_roots)
    graph.enumerate_traverse()

    # NOTE(review): metric names are taken from the LAST root's "metrics"
    # dict only (the loop variable leaks out of the loop above) — assumes
    # all roots share the same metric keys; confirm against callers.
    exc_metrics = []
    inc_metrics = []
    for key in spec["metrics"].keys():
        if "(inc)" in key:
            inc_metrics.append(key)
        else:
            exc_metrics.append(key)

    dataframe = pd.DataFrame(data=node_dicts)
    dataframe.set_index(["node"], inplace=True)
    dataframe.sort_index(inplace=True)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def test_copy():
    """A copied graph must compare equal to its original."""
    leaf = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", leaf), ("c", leaf)))
    g = Graph.from_lists(("e", "f", diamond), ("g", diamond, "h"))

    assert g.copy() == g
def test_filter_squash_diamond():
    r"""Test that diamond edges are collapsed when squashing.

    Ensure we can handle the most basic DAG.

            a
           / \     remove bc     a
          b   c   ---------->    |
           \ /                   d
            d

    """
    shared = Node(Frame(name="d"))
    keep = lambda row: row["node"].frame["name"] not in ("b", "c")  # noqa: E731

    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", shared), ("c", shared))),
        keep,
        Graph.from_lists(("a", "d")),
        [2, 1],  # a, d
    )
    check_filter_no_squash(
        GraphFrame.from_lists(("a", ("b", shared), ("c", shared))),
        keep,
        2,  # a, d
    )
def test_filter_squash_with_rootless_merge():
    r"""Test squash on a simple tree with several rootless node merges.

                a
           ___/ | \___      remove abcd
          b     c     d    ------------>   e f g
         /|\   /|\   /|\
        e f g e f g e f g

    Note that here, because b and d have been removed, a will have only
    one child called c, which will contain merged (summed) data from the
    original c rows.
    """
    keep = lambda row: row["node"].frame["name"] not in ("a", "b", "c", "d")  # noqa: E731

    check_filter_squash(
        GraphFrame.from_lists(
            ("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"), ("d", "e", "f", "g"))
        ),
        keep,
        Graph.from_lists(["e"], ["f"], ["g"]),
        [3, 3, 3],  # e, f, g
    )
    check_filter_no_squash(
        GraphFrame.from_lists(
            ("a", ("b", "e", "f", "g"), ("c", "e", "f", "g"), ("d", "e", "f", "g"))
        ),
        keep,
        9,  # e, f, g, e, f, g, e, f, g
    )
def test_filter_squash_bunny():
    r"""Test squash on a complicated "bunny" shaped graph.

    This has multiple roots as well as multiple parents that themselves
    have parents.

          e   g
         / \ / \
        f   a   h   remove abc     e   g
           / \     ----------->   / \ / \
          b   c                  f   d   h
           \ /
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))
    new_d = Node(Frame(name="d"))
    keep = lambda row: row["node"].frame["name"] not in ("a", "b", "c")  # noqa: E731

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        keep,
        Graph.from_lists(("e", new_d, "f"), ("g", new_d, "h")),
        [3, 1, 1, 3, 1],  # e, d, f, g, h
    )
    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        keep,
        5,  # e, d, f, g, h
    )
def test_filter_squash_bunny_to_goat():
    r"""Test squash on a "bunny" shaped graph:

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and c.

          e   g                    e   g
         / \ / \                  /|\ /|\
        f   a   h   remove ac    f | b | h
           / \     ---------->     | | |
          b   c                    \|/
           \ /                      d
            d

    """
    d = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", d), ("c", d)))
    new_d = Node(Frame(name="d"))
    new_b = Node.from_lists(("b", new_d))
    keep = lambda row: row["node"].frame["name"] not in ("a", "c")  # noqa: E731

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        keep,
        Graph.from_lists(("e", new_b, new_d, "f"), ("g", new_b, new_d, "h")),
        [4, 2, 1, 1, 4, 1],  # e, b, d, f, g, h
    )
    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        keep,
        6,  # e, b, d, f, g, h
    )
def test_filter_squash_bunny_to_goat_with_merge():
    r"""Test squash on a "bunny" shaped graph:

    This one is more complex because there are more transitive edges to
    maintain between the roots (e, g) and b and c.

          e   g
         / \ / \
        f   a   h   remove ac     e   g
           / \     ---------->   / \ / \
          b   c                 f   b   h
           \ /
            b

    """
    b = Node(Frame(name="b"))
    diamond = Node.from_lists(("a", ("b", b), ("c", b)))
    new_b = Node(Frame(name="b"))
    keep = lambda row: row["node"].frame["name"] not in ("a", "c")  # noqa: E731

    check_filter_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        keep,
        Graph.from_lists(("e", new_b, "f"), ("g", new_b, "h")),
        [4, 2, 1, 4, 1],  # e, b, f, g, h
    )
    check_filter_no_squash(
        GraphFrame.from_lists(("e", "f", diamond), ("g", diamond, "h")),
        keep,
        5,  # e, b, f, g, h
    )
def test_filter_squash_with_merge():
    r"""Test squash with a simple node merge.

            a
           / \      remove bd     a
          b   d    ---------->    |
          |   |                   c
          c   c

    Note that here, because b and d have been removed, a will have only
    one child called c, which will contain merged (summed) data from the
    original c rows.
    """
    keep = lambda row: row["node"].frame["name"] in ("a", "c")  # noqa: E731

    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "c"))),
        keep,
        Graph.from_lists(("a", "c")),
        [3, 2],  # a, c
    )
    check_filter_no_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "c"))),
        keep,
        3,  # a, c, c
    )
def test_traverse_paths():
    """Traversal visits a shared subdag only once, in pre-order."""
    leaf = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", leaf), ("c", leaf)))
    g = Graph.from_lists(("e", "f", diamond), ("g", diamond, "h"))

    names = list(g.traverse(attrs="name"))
    assert names == ["e", "a", "b", "d", "c", "f", "g", "h"]
def test_union_dag():
    """Union of two overlapping DAGs equals the known combined DAG."""
    # make graphs g1, g2, and g3, where you know g3 is the union of g1 and g2
    c = Node.from_lists(("c", "d"))
    g1 = Graph.from_lists(("a", ("b", c), ("e", c, "f")))

    d = Node(Frame(name="d"))
    g2 = Graph.from_lists(("a", ("b", ("c", d)), ("e", d, "f")))

    d2 = Node(Frame(name="d"))
    c2 = Node.from_lists(("c", d2))
    g3 = Graph.from_lists(("a", ("b", c2), ("e", c2, d2, "f")))

    assert g1 != g2

    g4 = g1.union(g2)
    assert g4 == g3
def test_from_lists():
    """Ensure we can traverse roots in correct order without repeating a
    shared subdag.
    """
    leaf = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", leaf), ("c", leaf)))
    g = Graph.from_lists(("e", "f", diamond), ("g", diamond, "h"))

    assert list(g.traverse(attrs="name")) == [
        "e", "a", "b", "d", "c", "f", "g", "h",
    ]
def _reconstruct_graph(df, rel_dict):
    """Rebuild a Graph from a node-indexed dataframe and a relationship dict.

    rel_dict maps each node to the node-id lists of its children/parents;
    only nodes with empty child/parent lists are (re)wired.
    """
    node_list = sorted(df.index.to_frame()["node"])

    for i in range(len(df)):
        node = _get_node_from_df_iloc(df, i)
        if not node.children:
            node.children = [node_list[nid] for nid in rel_dict[node]["children"]]
        if not node.parents:
            node.parents = [node_list[nid] for nid in rel_dict[node]["parents"]]

    # parentless nodes are the graph roots
    roots = [n for n in node_list if not n.parents]
    return Graph(roots)
def test_filter_squash_different_roots():
    r"""Test squash on a simple tree with one root but make multiple roots.

            a
           / \      remove a     b   d
          b   d    --------->    |   |
          |   |                  c   e
          c   e

    """
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "e"))),
        lambda row: row["node"].frame["name"] != "a",
        Graph.from_lists(("b", "c"), ("d", "e")),
        [2, 1, 2, 1],  # b, c, d, e
    )
def test_filter_squash():
    r"""Test squash on a simple tree with one root.

            a
           / \      remove bd      a
          b   d    ---------->    / \
          |   |                  c   e
          c   e

    """
    check_filter_squash(
        GraphFrame.from_lists(("a", ("b", "c"), ("d", "e"))),
        lambda row: row["node"].frame["name"] in ("a", "c", "e"),
        Graph.from_lists(("a", "c", "e")),
        [3, 1, 1],  # a, c, e
    )
def test_dag_is_not_tree():
    """Multi-root graphs and diamond DAGs must not report is_tree()."""
    # two disjoint roots
    assert not Graph.from_lists(("b", "c"), ("d", "e")).is_tree()

    # diamond: d has two parents
    leaf = Node(Frame(name="d"))
    diamond = Node.from_lists(("a", ("b", leaf), ("c", leaf)))
    assert not Graph([diamond]).is_tree()

    # diamond embedded under two separate roots
    assert not Graph.from_lists(("e", "f", diamond), ("g", diamond, "h")).is_tree()
def create_graph(self):
    """Build the call graph from pyinstrument's root_frame dict.

    Returns:
        (Graph): enumerated graph; node rows accumulate in self.node_dicts.
    """

    def parse_node_literal(child_dict, hparent):
        """Create node_dict for one node and then recurse on all children."""
        hnode = Node(
            Frame({"name": child_dict["function"], "type": "function"}), hparent
        )
        child_node_dict = {
            "node": hnode,
            "name": child_dict["function"],
            "file": child_dict["file_path_short"],
            "line": child_dict["line_no"],
            "time": child_dict["time"],
            "time (inc)": child_dict["time"],
            "is_application_code": child_dict["is_application_code"],
        }
        hparent.add_child(hnode)
        self.node_dicts.append(child_node_dict)

        if "children" in child_dict:
            for child in child_dict["children"]:
                # Pyinstrument's time metric actually stores inclusive time.
                # To calculate exclusive time, we subtract the children's time
                # from the parent's time.
                child_node_dict["time"] -= child["time"]
                parse_node_literal(child, hnode)

    # start with creating a node_dict for the root
    root_frame = self.graph_dict["root_frame"]
    graph_root = Node(
        Frame({"name": root_frame["function"], "type": "function"}), None
    )
    node_dict = {
        "node": graph_root,
        "name": root_frame["function"],
        "file": root_frame["file_path_short"],
        "line": root_frame["line_no"],
        "time": root_frame["time"],
        "time (inc)": root_frame["time"],
        "is_application_code": root_frame["is_application_code"],
    }
    self.node_dicts.append(node_dict)
    self.list_roots.append(graph_root)

    # call recursively on all children of root
    if "children" in root_frame:
        for child in root_frame["children"]:
            # Pyinstrument's time metric actually stores inclusive time.
            # To calculate exclusive time, we subtract the children's time
            # from the parent's time.
            node_dict["time"] -= child["time"]
            parse_node_literal(child, graph_root)

    graph = Graph(self.list_roots)
    graph.enumerate_traverse()
    return graph
def test_len_chain():
    """A 5-node chain has length 5."""
    chain = Graph.from_lists(("a", "b", "c", "d", "e"))
    assert len(chain) == 5
def test_len_diamond():
    """A diamond counts its shared leaf once: a, b, c, d -> 4 nodes."""
    shared = Node(Frame(name="d"))
    diamond = Graph.from_lists(("a", ("b", shared), ("c", shared)))
    assert len(diamond) == 4
def test_len_tree():
    """String leaves are distinct nodes, so both "d" children count: 5."""
    tree = Graph.from_lists(("a", ("b", "d"), ("c", "d")))
    assert len(tree) == 5
def read(self):
    """Read the TAU profile file to extract the calling context tree."""
    # Add all nodes and roots.
    roots = self.create_graph()
    # Create a graph object once all nodes have been added.
    graph = Graph(roots)
    graph.enumerate_traverse()

    dataframe = pd.DataFrame.from_dict(data=self.node_dicts)

    # Set indices according to rank/thread numbers, dropping whichever of
    # the rank/thread columns is not needed.
    if self.multiple_ranks and self.multiple_threads:
        indices = ["node", "rank", "thread"]
    elif self.multiple_ranks:
        dataframe.drop(columns=["thread"], inplace=True)
        indices = ["node", "rank"]
    elif self.multiple_threads:
        dataframe.drop(columns=["rank"], inplace=True)
        indices = ["node", "thread"]
    else:
        indices = ["node"]
    dataframe.set_index(indices, inplace=True)
    dataframe.sort_index(inplace=True)

    # Fill the missing ranks.
    # After unstacking and iterating over rows, there will be "NaN" values
    # for some ranks. Find the first rank that has a notna value and use it
    # for other rows/ranks of the multiindex.
    # TODO: iterrows() is not the best way to iterate over rows.
    if self.multiple_ranks or self.multiple_threads:
        dataframe = dataframe.unstack()
        for idx, row in dataframe.iterrows():
            # There is always a valid name for an index.
            # Take that valid name and assign to other ranks/rows.
            name = row["name"][row["name"].first_valid_index()]
            dataframe.loc[idx, "name"] = name

            # Sometimes there is no file information.
            if row["file"].first_valid_index() is not None:
                file = row["file"][row["file"].first_valid_index()]
                dataframe.loc[idx, "file"] = file

            # Sometimes there is no module information.
            if row["module"].first_valid_index() is not None:
                module = row["module"][row["module"].first_valid_index()]
                dataframe.loc[idx, "module"] = module

        # Fill the rest with 0, then restore the original shape.
        dataframe.fillna(0, inplace=True)
        dataframe = dataframe.stack()

    default_metric = "time (inc)"

    return hatchet.graphframe.GraphFrame(
        graph, dataframe, self.exc_metrics, self.inc_metrics, default_metric
    )
def read(self):
    """Read the caliper JSON file to extract the calling context tree."""
    with self.timer.phase("read json"):
        self.read_json_sections()

    with self.timer.phase("graph construction"):
        list_roots = self.create_graph()

    # create a dataframe of metrics from the data section
    self.df_json_data = pd.DataFrame(self.json_data, columns=self.json_cols)

    # map non-numeric columns to their mappings in the nodes section
    for idx, item in enumerate(self.json_cols_mdata):
        if item["is_value"] is False and self.json_cols[idx] != self.nid_col_name:
            if self.json_cols[idx] == "sourceloc#cali.sampler.pc":
                # split source file and line number into two columns
                self.df_json_data["file"] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(
                    lambda x: re.match(
                        r"(.*):(\d+)", self.json_nodes[x]["label"]
                    ).group(1)
                )
                self.df_json_data["line"] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(
                    lambda x: re.match(
                        r"(.*):(\d+)", self.json_nodes[x]["label"]
                    ).group(2)
                )
                self.df_json_data.drop(self.json_cols[idx], axis=1, inplace=True)
                sourceloc_idx = idx
            else:
                self.df_json_data[self.json_cols[idx]] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(lambda x: self.json_nodes[x]["label"])

    # since we split sourceloc, we should update json_cols and json_cols_mdata
    if "sourceloc#cali.sampler.pc" in self.json_cols:
        self.json_cols.pop(sourceloc_idx)
        self.json_cols_mdata.pop(sourceloc_idx)
        self.json_cols.append("file")
        self.json_cols.append("line")
        self.json_cols_mdata.append({"is_value": False})
        self.json_cols_mdata.append({"is_value": False})

    max_nid = self.df_json_data[self.nid_col_name].max()

    if "line" in self.df_json_data.columns:
        # split nodes that have multiple file:line numbers to have a child
        # each with a unique file:line number
        unique_nodes = self.df_json_data.groupby(self.nid_col_name)
        df_concat = [self.df_json_data]

        for nid, super_node in unique_nodes:
            line_groups = super_node.groupby("line")
            # only need to do something if there are more than one
            # file:line number entries for the node
            if len(line_groups.size()) > 1:
                sn_hnode = self.idx_to_node[nid]["node"]

                for line, line_group in line_groups:
                    # create the node label
                    file_path = (line_group.head(1))["file"].item()
                    file_name = os.path.basename(file_path)
                    node_label = file_name + ":" + line

                    # create a new hatchet node
                    max_nid += 1
                    idx = max_nid
                    hnode = Node(
                        Frame(
                            {"type": "statement", "file": file_path, "line": line}
                        ),
                        sn_hnode,
                    )
                    sn_hnode.add_child(hnode)

                    node_dict = {
                        self.nid_col_name: idx,
                        "name": node_label,
                        "node": hnode,
                    }
                    self.idx_to_node[idx] = node_dict

                    # change nid of the original node to new node in place
                    for index, row in line_group.iterrows():
                        self.df_json_data.loc[index, "nid"] = max_nid

                # add new row for original node
                node_copy = super_node.head(1).copy()
                for cols in self.metric_columns:
                    node_copy[cols] = 0
                df_concat.append(node_copy)

        # concatenate all the newly created dataframes with self.df_json_data
        self.df_fixed_data = pd.concat(df_concat)
    else:
        self.df_fixed_data = self.df_json_data

    # create a dataframe with all nodes in the call graph
    self.df_nodes = pd.DataFrame.from_dict(data=list(self.idx_to_node.values()))

    # add missing intermediate nodes to the df_fixed_data dataframe
    if "rank" in self.json_cols:
        self.num_ranks = self.df_fixed_data["rank"].max() + 1
        rank_list = range(0, self.num_ranks)

    # create a standard dict to be used for filling all missing rows
    default_metric_dict = {}
    for idx, item in enumerate(self.json_cols_mdata):
        if self.json_cols[idx] != self.nid_col_name:
            if item["is_value"] is True:
                default_metric_dict[self.json_cols[idx]] = 0
            else:
                default_metric_dict[self.json_cols[idx]] = None

    # create a list of dicts, one dict for each missing row
    missing_nodes = []
    for iteridx, row in self.df_nodes.iterrows():
        # check if df_nodes row exists in df_fixed_data
        metric_rows = self.df_fixed_data.loc[
            self.df_fixed_data[self.nid_col_name] == row[self.nid_col_name]
        ]
        if "rank" not in self.json_cols:
            if metric_rows.empty:
                # add a single row
                node_dict = dict(default_metric_dict)
                node_dict[self.nid_col_name] = row[self.nid_col_name]
                missing_nodes.append(node_dict)
        else:
            if metric_rows.empty:
                # add a row per MPI rank
                for rank in rank_list:
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    node_dict["rank"] = rank
                    missing_nodes.append(node_dict)
            elif len(metric_rows) < self.num_ranks:
                # add a row for each missing MPI rank
                present_ranks = metric_rows["rank"].values
                missing_ranks = [x for x in rank_list if x not in present_ranks]
                for rank in missing_ranks:
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    node_dict["rank"] = rank
                    missing_nodes.append(node_dict)

    self.df_missing = pd.DataFrame.from_dict(data=missing_nodes)
    self.df_metrics = pd.concat([self.df_fixed_data, self.df_missing])

    # create a graph object once all the nodes have been added
    graph = Graph(list_roots)
    graph.enumerate_traverse()

    # merge the metrics and node dataframes on the idx column
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on=self.nid_col_name)
        # set the index to be a MultiIndex
        indices = ["node"]
        if "rank" in self.json_cols:
            indices.append("rank")
        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

    # create list of exclusive and inclusive metric columns
    exc_metrics = []
    inc_metrics = []
    for column in self.metric_columns:
        if "(inc)" in column:
            inc_metrics.append(column)
        else:
            exc_metrics.append(column)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def read(self):
    """Read the experiment.xml file to extract the calling context tree
    and create a dataframe out of it. Then merge the two dataframes to
    create the final dataframe.

    Return:
        (GraphFrame): new GraphFrame with HPCToolkit data.
    """
    with self.timer.phase("fill tables"):
        self.fill_tables()

    with self.timer.phase("read metric db"):
        self.read_all_metricdb_files()

    list_roots = []

    # parse the ElementTree to generate a calling context tree
    for root in self.callpath_profile.findall("PF"):
        # NOTE(review): src_file is deliberately module-global here —
        # presumably shared with parse_xml_children; confirm before changing.
        global src_file

        nid = int(root.get("i"))
        src_file = root.get("f")

        # start with the root and create the callpath and node for the root
        # also a corresponding node_dict to be inserted into the dataframe
        graph_root = Node(
            Frame(
                {"type": "function", "name": self.procedure_names[root.get("n")]}
            ),
            None,
        )
        node_dict = self.create_node_dict(
            nid,
            graph_root,
            self.procedure_names[root.get("n")],
            "PF",
            self.src_files[src_file],
            int(root.get("l")),
            self.load_modules[root.get("lm")],
        )

        self.node_dicts.append(node_dict)
        list_roots.append(graph_root)

        # start graph construction at the root
        with self.timer.phase("graph construction"):
            self.parse_xml_children(root, graph_root)

        # put updated metrics back in dataframe
        for i, column in enumerate(self.metric_columns):
            if "(inc)" not in column and "(I)" not in column:
                self.df_metrics[column] = self.np_metrics.T[i]

    with self.timer.phase("graph construction"):
        graph = Graph(list_roots)
        graph.enumerate_traverse()

    # create a dataframe for all the nodes in the graph
    self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

    # merge the metrics and node dataframes
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")

        # set the index to be a MultiIndex
        # NOTE(review): no else branch — assumes num_threads_per_rank >= 1,
        # otherwise `indices` would be unbound; confirm invariant upstream.
        if self.num_threads_per_rank > 1:
            indices = ["node", "rank", "thread"]
        # if number of threads per rank is 1, do not make thread an index
        elif self.num_threads_per_rank == 1:
            indices = ["node", "rank"]
        dataframe.set_index(indices, inplace=True)
        dataframe.sort_index(inplace=True)

    # create list of exclusive and inclusive metric columns
    exc_metrics = []
    inc_metrics = []
    for column in self.metric_columns:
        if "(inc)" in column or "(I)" in column:
            inc_metrics.append(column)
        else:
            exc_metrics.append(column)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def read(self):
    """Build a GraphFrame from the per-rank/per-thread region dict."""
    # filter regions whose name contains any of the filter strings
    if len(self.filter) > 0:
        for rank, rank_value in list(self.dict["ranks"].items()):
            for thread, thread_value in list(rank_value["threads"].items()):
                for region, data in list(thread_value["regions"].items()):
                    if any(map(data["name"].__contains__, self.filter)) is True:
                        del self.dict["ranks"][str(rank)]["threads"][str(thread)][
                            "regions"
                        ][str(region)]

    # add default metrics 'cycles' and 'real_time_nsec' to inc_metrics
    self.inc_metrics.append("cycles")
    self.inc_metrics.append("real_time_nsec")

    # determine thread with the largest number of regions to create the graph
    max_regions = 1
    graph_rank = 0
    graph_thread = 0
    rank_cnt = 0
    thread_cnt = 0
    for rank, rank_value in iter(self.dict["ranks"].items()):
        rank_cnt += 1
        for thread, thread_value in iter(rank_value["threads"].items()):
            thread_cnt += 1
            if len(thread_value["regions"]) > max_regions:
                max_regions = len(thread_value["regions"])
                graph_rank = int(rank)
                graph_thread = int(thread)

    # create graph
    list_roots = []
    node_dicts = []
    self.__create_graph(graph_rank, graph_thread, list_roots, node_dicts)

    # fill up node dictionaries for all remaining ranks and threads
    for rank, rank_value in iter(self.dict["ranks"].items()):
        for thread, thread_value in iter(rank_value["threads"].items()):
            if int(rank) != graph_rank or int(thread) != graph_thread:
                node_graph_id = -1
                for data in iter(thread_value["regions"].values()):
                    node_graph_id += 1
                    if node_graph_id >= len(self.node_graph_dict):
                        self.__print_error_and_exit(data["name"], rank, thread)

                    # find matching regions
                    found_match = False
                    while found_match is False:
                        if self.node_graph_dict[node_graph_id][0] == data["name"]:
                            found_match = True
                        else:
                            # create a tuple of zero values
                            zero_metrics = self.__get_zero_metrics()
                            node_dict = dict(
                                {
                                    "name": self.node_graph_dict[node_graph_id][0],
                                    "node": self.node_graph_dict[node_graph_id][1],
                                    "rank": int(rank),
                                    "thread": int(thread),
                                    **zero_metrics,
                                }
                            )
                            node_dicts.append(node_dict)
                            # set index to the next region
                            node_graph_id += 1
                            if node_graph_id >= len(self.node_graph_dict):
                                self.__print_error_and_exit(
                                    data["name"], rank, thread
                                )

                    if found_match is True:
                        # we found a match
                        contain_read_events = [0]
                        metrics = self.__get_metrics(data, contain_read_events)
                        node_dict = dict(
                            {
                                "name": self.node_graph_dict[node_graph_id][0],
                                "node": self.node_graph_dict[node_graph_id][1],
                                "rank": int(rank),
                                "thread": int(thread),
                                **metrics,
                            }
                        )
                        node_dicts.append(node_dict)

                        # check if we have to add read events
                        if contain_read_events[0] == 1:
                            # check how many read calls are used
                            read_num = len(data["cycles"])
                            for i in range(1, read_num):
                                node_name_read = "read_" + str(i)
                                read_metrics = self.__get_read_metrics(
                                    data, node_name_read
                                )
                                node_dict = dict(
                                    {
                                        "name": node_name_read,
                                        "node": self.__find_read_node(
                                            self.node_graph_dict[node_graph_id][1],
                                            node_name_read,
                                        ),
                                        "rank": int(rank),
                                        "thread": int(thread),
                                        **read_metrics,
                                    }
                                )
                                node_dicts.append(node_dict)

    # setup data for hatchet graphframe
    graph = Graph(list_roots)
    graph.enumerate_traverse()

    dataframe = pd.DataFrame(data=node_dicts)

    # check graph indices; drop rank/thread columns that are not indexed
    if rank_cnt > 1 and thread_cnt > 1:
        indices = ["node", "rank", "thread"]
    elif rank_cnt > 1:
        dataframe.drop(columns=["thread"], inplace=True)
        indices = ["node", "rank"]
    elif thread_cnt > 1:
        dataframe.drop(columns=["rank"], inplace=True)
        indices = ["node", "thread"]
    else:
        dataframe.drop(columns=["rank", "thread"], inplace=True)
        indices = ["node"]
    dataframe.set_index(indices, inplace=True)
    dataframe.sort_index(inplace=True)

    default_metric = "real_time_nsec"

    return hatchet.graphframe.GraphFrame(
        graph, dataframe, [], self.inc_metrics, default_metric
    )
def read(self):
    """Read the experiment.xml file to extract the calling context tree
    and create a dataframe out of it. Then merge the two dataframes to
    create the final dataframe.

    Return:
        (GraphFrame): new GraphFrame with HPCToolkit data.
    """
    with self.timer.phase("fill tables"):
        self.fill_tables()

    with self.timer.phase("read metric db"):
        self.read_all_metricdb_files()

    list_roots = []

    # parse the ElementTree to generate a calling context tree
    for root in self.callpath_profile.findall("PF"):
        nid = int(root.get("i"))

        # start with the root and create the callpath and node for the root
        # also a corresponding node_dict to be inserted into the dataframe
        node_callpath = []
        node_callpath.append(self.procedure_names[root.get("n")])
        graph_root = Node(
            Frame(
                {"type": "function", "name": self.procedure_names[root.get("n")]}
            ),
            None,
        )
        node_dict = self.create_node_dict(
            nid,
            graph_root,
            self.procedure_names[root.get("n")],
            "PF",
            self.src_files[root.get("f")],
            root.get("l"),
            self.load_modules[root.get("lm")],
        )

        self.node_dicts.append(node_dict)
        list_roots.append(graph_root)

        # start graph construction at the root
        with self.timer.phase("graph construction"):
            self.parse_xml_children(root, graph_root, list(node_callpath))

    # create a dataframe for all the nodes in the graph
    self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

    # merge the metrics and node dataframes
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")
        # set the index to be a MultiIndex (keep columns too via drop=False)
        indices = ["node", "rank"]
        dataframe.set_index(indices, drop=False, inplace=True)

    # partition metric columns into exclusive and inclusive lists
    inc_metrics = [col for col in self.metric_columns if "(inc)" in col]
    exc_metrics = [col for col in self.metric_columns if "(inc)" not in col]

    return hatchet.graphframe.GraphFrame(
        Graph(list_roots), dataframe, exc_metrics, inc_metrics
    )