def read(self):
    """Build a GraphFrame from the literal graph dictionary in ``self.graph_dict``.

    Each entry of ``self.graph_dict`` is treated as a root: a Frame/Node pair is
    created for it, a per-node metrics dict is collected for the dataframe, and
    children are added recursively via ``self.parse_node_literal``.

    Returns:
        hatchet.graphframe.GraphFrame: graph plus node-indexed metrics dataframe.
    """
    list_roots = []
    node_dicts = []
    frame_to_node_dict = {}
    frame = None
    seen_nids = []
    # -1 means "no explicit hatchet nid supplied"; enumerate_traverse() below
    # renumbers in that case.
    hnid = -1
    # start with creating a node_dict for each root
    for i in range(len(self.graph_dict)):
        if "_hatchet_nid" in self.graph_dict[i]["metrics"]:
            hnid = self.graph_dict[i]["metrics"]["_hatchet_nid"]
            seen_nids.append(hnid)
        # NOTE(review): if a later root lacks "_hatchet_nid", hnid keeps the
        # previous root's value rather than resetting to -1 — confirm intended.
        frame = Frame(self.graph_dict[i]["frame"])
        graph_root = Node(frame, None, hnid=hnid)
        # depending on the node type, the name may not be in the frame
        node_name = self.graph_dict[i]["frame"].get("name")
        if not node_name:
            node_name = self.graph_dict[i]["name"]
        node_dict = dict({"node": graph_root, "name": node_name}, **self.graph_dict[i]["metrics"])
        node_dicts.append(node_dict)
        list_roots.append(graph_root)
        frame_to_node_dict[frame] = graph_root
        # call recursively on all children of root
        if "children" in self.graph_dict[i]:
            for child in self.graph_dict[i]["children"]:
                self.parse_node_literal(frame_to_node_dict, node_dicts, child, graph_root, seen_nids)
    graph = Graph(list_roots)
    # test if nids are already loaded; any -1 means at least one node had no
    # explicit nid, so the whole graph is renumbered by traversal order.
    if -1 in [n._hatchet_nid for n in graph.traverse()]:
        graph.enumerate_traverse()
    else:
        graph.enumerate_depth()
    exc_metrics = []
    inc_metrics = []
    # NOTE(review): this reads the LAST root's metrics (loop variable `i` is
    # reused after the loop) — assumes all roots share the same metric keys;
    # TODO confirm.
    for key in self.graph_dict[i]["metrics"].keys():
        if "(inc)" in key:
            inc_metrics.append(key)
        else:
            exc_metrics.append(key)
    dataframe = pd.DataFrame(data=node_dicts)
    dataframe.set_index(["node"], inplace=True)
    dataframe.sort_index(inplace=True)
    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def read(self):
    """Assemble the call graph and metric records into a GraphFrame.

    Returns:
        hatchet.graphframe.GraphFrame: graph with a node-indexed dataframe,
        "time" as the exclusive metric and "time (inc)" as the inclusive one.
    """
    call_graph = Graph(self.create_graph())
    call_graph.enumerate_traverse()

    # one row per node, indexed and sorted by the node object
    frame = pd.DataFrame.from_dict(data=list(self.name_to_dict.values()))
    frame.set_index(["node"], inplace=True)
    frame.sort_index(inplace=True)

    return hatchet.graphframe.GraphFrame(call_graph, frame, ["time"], ["time (inc)"])
def read(self):
    """Read the caliper records to extract the calling context tree."""
    # A plain path means the caller handed us a .cali file rather than an
    # already-constructed CaliperReader; load it ourselves.
    if isinstance(self.filename_or_caliperreader, str):
        if self.filename_ext != ".cali":
            raise ValueError("from_caliperreader() needs a .cali file")
        cali_file = self.filename_or_caliperreader
        self.filename_or_caliperreader = cr.CaliperReader()
        self.filename_or_caliperreader.read(cali_file)

    with self.timer.phase("graph construction"):
        roots = self.create_graph()

    # create a graph object once all the nodes have been added
    graph = Graph(roots)
    graph.enumerate_traverse()

    dataframe = pd.DataFrame(data=self.node_dicts)
    index_cols = ["node"] + (["rank"] if "rank" in dataframe.columns else [])
    dataframe.set_index(index_cols, inplace=True)
    dataframe.sort_index(inplace=True)

    # rename caliper-specific columns to match the other readers
    renames = {"mpi.rank": "rank", "module#cali.sampler.pc": "module"}
    for pos, col in enumerate(dataframe.columns):
        if col in renames:
            dataframe.columns.values[pos] = renames[col]

    # split metric columns into exclusive and inclusive lists
    inc_metrics = [c for c in self.metric_columns if "(inc)" in c]
    exc_metrics = [c for c in self.metric_columns if "(inc)" not in c]

    return hatchet.graphframe.GraphFrame(
        graph,
        dataframe,
        exc_metrics,
        inc_metrics,
        metadata=self.filename_or_caliperreader.globals,
    )
def read(self): """Read the TAU profile file to extract the calling context tree.""" # Add all nodes and roots. roots = self.create_graph() # Create a graph object once all nodes have been added. graph = Graph(roots) graph.enumerate_traverse() dataframe = pd.DataFrame.from_dict(data=self.node_dicts) indices = [] # Set indices according to rank/thread numbers. if self.multiple_ranks and self.multiple_threads: indices = ["node", "rank", "thread"] elif self.multiple_ranks: dataframe.drop(columns=["thread"], inplace=True) indices = ["node", "rank"] elif self.multiple_threads: dataframe.drop(columns=["rank"], inplace=True) indices = ["node", "thread"] else: indices = ["node"] dataframe.set_index(indices, inplace=True) dataframe.sort_index(inplace=True) # Fill the missing ranks # After unstacking and iterating over rows, there # will be "NaN" values for some ranks. Find the first # rank that has notna value and use it for other rows/ranks # of the multiindex. # TODO: iterrows() is not the best way to iterate over rows. if self.multiple_ranks or self.multiple_threads: dataframe = dataframe.unstack() for idx, row in dataframe.iterrows(): # There is always a valid name for an index. # Take that valid name and assign to other ranks/rows. name = row["name"][row["name"].first_valid_index()] dataframe.loc[idx, "name"] = name # Sometimes there is no file information. if row["file"].first_valid_index() is not None: file = row["file"][row["file"].first_valid_index()] dataframe.loc[idx, "file"] = file # Sometimes there is no module information. if row["module"].first_valid_index() is not None: module = row["module"][row["module"].first_valid_index()] dataframe.loc[idx, "module"] = module # Fill the rest with 0 dataframe.fillna(0, inplace=True) # Stack the dataframe dataframe = dataframe.stack() default_metric = "time (inc)" return hatchet.graphframe.GraphFrame(graph, dataframe, self.exc_metrics, self.inc_metrics, default_metric)
def read(self):
    """Read the caliper JSON file to extract the calling context tree.

    Pipeline: parse the JSON sections, build the graph, resolve non-numeric
    column values through the nodes section, split multi-file:line sample
    nodes into per-line statement children, synthesize zero/None rows for
    graph nodes missing from the data, then merge node and metric dataframes
    into a GraphFrame.
    """
    with self.timer.phase("read json"):
        self.read_json_sections()
    with self.timer.phase("graph construction"):
        list_roots = self.create_graph()

    # create a dataframe of metrics from the data section
    self.df_json_data = pd.DataFrame(self.json_data, columns=self.json_cols)

    # map non-numeric columns to their mappings in the nodes section
    for idx, item in enumerate(self.json_cols_mdata):
        if item["is_value"] is False and self.json_cols[idx] != self.nid_col_name:
            if self.json_cols[idx] == "sourceloc#cali.sampler.pc":
                # split source file and line number into two columns
                self.df_json_data["file"] = self.df_json_data[self.json_cols[idx]].apply(
                    lambda x: re.match(r"(.*):(\d+)", self.json_nodes[x]["label"]).group(1)
                )
                self.df_json_data["line"] = self.df_json_data[self.json_cols[idx]].apply(
                    lambda x: re.match(r"(.*):(\d+)", self.json_nodes[x]["label"]).group(2)
                )
                self.df_json_data.drop(self.json_cols[idx], axis=1, inplace=True)
                sourceloc_idx = idx
            else:
                # replace the node index with the node's human-readable label
                self.df_json_data[self.json_cols[idx]] = self.df_json_data[
                    self.json_cols[idx]
                ].apply(lambda x: self.json_nodes[x]["label"])

    # since we split sourceloc, we should update json_cols and
    # json_cols_mdata
    if "sourceloc#cali.sampler.pc" in self.json_cols:
        self.json_cols.pop(sourceloc_idx)
        self.json_cols_mdata.pop(sourceloc_idx)
        self.json_cols.append("file")
        self.json_cols.append("line")
        self.json_cols_mdata.append({"is_value": False})
        self.json_cols_mdata.append({"is_value": False})

    # highest existing nid; new statement nodes get ids above this
    max_nid = self.df_json_data[self.nid_col_name].max()

    if "line" in self.df_json_data.columns:
        # split nodes that have multiple file:line numbers to have a child
        # each with a unique file:line number
        unique_nodes = self.df_json_data.groupby(self.nid_col_name)
        df_concat = [self.df_json_data]
        for nid, super_node in unique_nodes:
            line_groups = super_node.groupby("line")
            # only need to do something if there are more than one
            # file:line number entries for the node
            if len(line_groups.size()) > 1:
                sn_hnode = self.idx_to_node[nid]["node"]
                for line, line_group in line_groups:
                    # create the node label
                    file_path = (line_group.head(1))["file"].item()
                    file_name = os.path.basename(file_path)
                    node_label = file_name + ":" + line
                    # create a new hatchet node
                    max_nid += 1
                    idx = max_nid
                    hnode = Node(
                        Frame({"type": "statement", "file": file_path, "line": line}),
                        sn_hnode,
                    )
                    sn_hnode.add_child(hnode)
                    node_dict = {
                        self.nid_col_name: idx,
                        "name": node_label,
                        "node": hnode,
                    }
                    self.idx_to_node[idx] = node_dict
                    # change nid of the original node to new node in place
                    # NOTE(review): hard-codes the "nid" column here while the
                    # rest of the method uses self.nid_col_name — confirm they
                    # always match.
                    for index, row in line_group.iterrows():
                        self.df_json_data.loc[index, "nid"] = max_nid
                # add new row for original node (all metrics zeroed, so the
                # parent keeps its graph presence without double-counting)
                node_copy = super_node.head(1).copy()
                for cols in self.metric_columns:
                    node_copy[cols] = 0
                df_concat.append(node_copy)
        # concatenate all the newly created dataframes with
        # self.df_json_data
        self.df_fixed_data = pd.concat(df_concat)
    else:
        self.df_fixed_data = self.df_json_data

    # create a dataframe with all nodes in the call graph
    self.df_nodes = pd.DataFrame.from_dict(data=list(self.idx_to_node.values()))

    # add missing intermediate nodes to the df_fixed_data dataframe
    if "rank" in self.json_cols:
        self.num_ranks = self.df_fixed_data["rank"].max() + 1
        rank_list = range(0, self.num_ranks)

    # create a standard dict to be used for filling all missing rows:
    # numeric ("is_value") columns default to 0, label columns to None
    default_metric_dict = {}
    for idx, item in enumerate(self.json_cols_mdata):
        if self.json_cols[idx] != self.nid_col_name:
            if item["is_value"] is True:
                default_metric_dict[self.json_cols[idx]] = 0
            else:
                default_metric_dict[self.json_cols[idx]] = None

    # create a list of dicts, one dict for each missing row
    missing_nodes = []
    for iteridx, row in self.df_nodes.iterrows():
        # check if df_nodes row exists in df_fixed_data
        metric_rows = self.df_fixed_data.loc[
            self.df_fixed_data[self.nid_col_name] == row[self.nid_col_name]
        ]
        if "rank" not in self.json_cols:
            if metric_rows.empty:
                # add a single row
                node_dict = dict(default_metric_dict)
                node_dict[self.nid_col_name] = row[self.nid_col_name]
                missing_nodes.append(node_dict)
        else:
            if metric_rows.empty:
                # add a row per MPI rank
                for rank in rank_list:
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    node_dict["rank"] = rank
                    missing_nodes.append(node_dict)
            elif len(metric_rows) < self.num_ranks:
                # add a row for each missing MPI rank
                present_ranks = metric_rows["rank"].values
                missing_ranks = [x for x in rank_list if x not in present_ranks]
                for rank in missing_ranks:
                    node_dict = dict(default_metric_dict)
                    node_dict[self.nid_col_name] = row[self.nid_col_name]
                    node_dict["rank"] = rank
                    missing_nodes.append(node_dict)

    self.df_missing = pd.DataFrame.from_dict(data=missing_nodes)
    self.df_metrics = pd.concat([self.df_fixed_data, self.df_missing])

    # create a graph object once all the nodes have been added
    graph = Graph(list_roots)
    graph.enumerate_traverse()

    # merge the metrics and node dataframes on the idx column
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on=self.nid_col_name)

    # set the index to be a MultiIndex
    indices = ["node"]
    if "rank" in self.json_cols:
        indices.append("rank")
    dataframe.set_index(indices, inplace=True)
    dataframe.sort_index(inplace=True)

    # create list of exclusive and inclusive metric columns
    exc_metrics = []
    inc_metrics = []
    for column in self.metric_columns:
        if "(inc)" in column:
            inc_metrics.append(column)
        else:
            exc_metrics.append(column)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def read(self):
    """Read the experiment.xml file to extract the calling context tree
    and create a dataframe out of it. Then merge the two dataframes to
    create the final dataframe.

    Return:
        (GraphFrame): new GraphFrame with HPCToolkit data.
    """
    with self.timer.phase("fill tables"):
        self.fill_tables()

    with self.timer.phase("read metric db"):
        self.read_all_metricdb_files()

    list_roots = []

    # parse the ElementTree to generate a calling context tree
    for root in self.callpath_profile.findall("PF"):
        # NOTE(review): module-level global shared with other parsing code
        # (presumably parse_xml_children) — verify this cross-function state
        # is still required.
        global src_file

        nid = int(root.get("i"))
        src_file = root.get("f")

        # start with the root and create the callpath and node for the root
        # also a corresponding node_dict to be inserted into the dataframe
        graph_root = Node(
            Frame({"type": "function", "name": self.procedure_names[root.get("n")]}),
            None,
        )
        node_dict = self.create_node_dict(
            nid,
            graph_root,
            self.procedure_names[root.get("n")],
            "PF",
            self.src_files[src_file],
            int(root.get("l")),
            self.load_modules[root.get("lm")],
        )

        self.node_dicts.append(node_dict)
        list_roots.append(graph_root)

        # start graph construction at the root
        with self.timer.phase("graph construction"):
            self.parse_xml_children(root, graph_root)

        # put updated metrics back in dataframe (exclusive metrics only;
        # inclusive "(inc)"/"(I)" columns are left as read from the db)
        for i, column in enumerate(self.metric_columns):
            if "(inc)" not in column and "(I)" not in column:
                self.df_metrics[column] = self.np_metrics.T[i]

    with self.timer.phase("graph construction"):
        graph = Graph(list_roots)
        graph.enumerate_traverse()

    # create a dataframe for all the nodes in the graph
    self.df_nodes = pd.DataFrame.from_dict(data=self.node_dicts)

    # merge the metrics and node dataframes
    with self.timer.phase("data frame"):
        dataframe = pd.merge(self.df_metrics, self.df_nodes, on="nid")

    # set the index to be a MultiIndex
    if self.num_threads_per_rank > 1:
        indices = ["node", "rank", "thread"]
    # if number of threads per rank is 1, do not make thread an index
    elif self.num_threads_per_rank == 1:
        indices = ["node", "rank"]
    # NOTE(review): `indices` is unbound if num_threads_per_rank < 1 —
    # presumably that cannot happen; confirm upstream guarantees it.
    dataframe.set_index(indices, inplace=True)
    dataframe.sort_index(inplace=True)

    # create list of exclusive and inclusive metric columns
    exc_metrics = []
    inc_metrics = []
    for column in self.metric_columns:
        if "(inc)" in column or "(I)" in column:
            inc_metrics.append(column)
        else:
            exc_metrics.append(column)

    return hatchet.graphframe.GraphFrame(graph, dataframe, exc_metrics, inc_metrics)
def create_graph(self):
    """Build the Hatchet graph from Pyinstrument's nested frame dicts.

    Appends one metrics record per frame to ``self.node_dicts``, collects
    roots in ``self.list_roots``, and returns the enumerated Graph.
    """

    def _walk(frame_dict, parent):
        """Create a node plus its record, then recurse into its children."""
        node = Node(
            Frame({"name": frame_dict["function"], "type": "function"}),
            parent,
        )
        record = {
            "node": node,
            "name": frame_dict["function"],
            "file": frame_dict["file_path_short"],
            "line": frame_dict["line_no"],
            "time": frame_dict["time"],
            "time (inc)": frame_dict["time"],
            "is_application_code": frame_dict["is_application_code"],
        }
        parent.add_child(node)
        self.node_dicts.append(record)

        for child in frame_dict.get("children", []):
            # Pyinstrument's time metric actually stores inclusive time.
            # Subtracting each child's time from this node's time yields
            # the exclusive time.
            record["time"] -= child["time"]
            _walk(child, node)

    # create the root node and its record
    root_frame = self.graph_dict["root_frame"]
    graph_root = Node(
        Frame({"name": root_frame["function"], "type": "function"}),
        None,
    )
    root_record = {
        "node": graph_root,
        "name": root_frame["function"],
        "file": root_frame["file_path_short"],
        "line": root_frame["line_no"],
        "time": root_frame["time"],
        "time (inc)": root_frame["time"],
        "is_application_code": root_frame["is_application_code"],
    }
    self.node_dicts.append(root_record)
    self.list_roots.append(graph_root)

    for child in root_frame.get("children", []):
        # Same inclusive-to-exclusive conversion for the root's own time.
        root_record["time"] -= child["time"]
        _walk(child, graph_root)

    graph = Graph(self.list_roots)
    graph.enumerate_traverse()
    return graph
def read(self):
    """Build a GraphFrame from the per-rank/per-thread region data in ``self.dict``.

    The graph shape is taken from the single thread that saw the most regions;
    every other rank/thread is then matched against that shape, with zero-metric
    rows inserted for regions it never entered.
    """
    # print(self.dict)
    # filter regions: drop any region whose name contains one of the
    # user-supplied filter substrings (list() copies guard the deletes)
    if len(self.filter) > 0:
        for rank, rank_value in list(self.dict["ranks"].items()):
            for thread, thread_value in list(rank_value["threads"].items()):
                for region, data in list(thread_value["regions"].items()):
                    if any(map(data["name"].__contains__, self.filter)) is True:
                        del self.dict["ranks"][str(rank)]["threads"][str(thread)]["regions"][str(region)]

    # add default metrics 'cycles' and 'real_time_nsec' to inc_metrics
    self.inc_metrics.append("cycles")
    self.inc_metrics.append("real_time_nsec")

    # determine thread with the largest number of regions to create the graph
    max_regions = 1
    graph_rank = 0
    graph_thread = 0
    rank_cnt = 0
    # NOTE(review): thread_cnt accumulates across all ranks, so it is the
    # total thread count, not threads-per-rank — confirm that is what the
    # index selection below expects.
    thread_cnt = 0
    for rank, rank_value in iter(self.dict["ranks"].items()):
        rank_cnt += 1
        for thread, thread_value in iter(rank_value["threads"].items()):
            thread_cnt += 1
            if len(thread_value["regions"]) > max_regions:
                max_regions = len(thread_value["regions"])
                graph_rank = int(rank)
                graph_thread = int(thread)

    # create graph from the chosen reference rank/thread
    list_roots = []
    node_dicts = []
    self.__create_graph(graph_rank, graph_thread, list_roots, node_dicts)

    # fill up node dictionaries for all remaining ranks and threads
    for rank, rank_value in iter(self.dict["ranks"].items()):
        for thread, thread_value in iter(rank_value["threads"].items()):
            if int(rank) != graph_rank or int(thread) != graph_thread:
                node_graph_id = -1
                for data in iter(thread_value["regions"].values()):
                    # print(data["name"])
                    node_graph_id += 1
                    if node_graph_id >= len(self.node_graph_dict):
                        self.__print_error_and_exit(data["name"], rank, thread)
                    # find matching regions: walk forward through the
                    # reference region list, emitting zero-metric rows for
                    # regions this thread skipped (assumes both lists share
                    # the reference ordering — TODO confirm)
                    found_match = False
                    while found_match is False:
                        if self.node_graph_dict[node_graph_id][0] == data["name"]:
                            found_match = True
                        else:
                            # create a tuple of zero values
                            zero_metrics = self.__get_zero_metrics()
                            node_dict = dict({
                                "name": self.node_graph_dict[node_graph_id][0],
                                "node": self.node_graph_dict[node_graph_id][1],
                                "rank": int(rank),
                                "thread": int(thread),
                                **zero_metrics,
                            })
                            node_dicts.append(node_dict)
                            # set index to the next region
                            node_graph_id += 1
                            if node_graph_id >= len(self.node_graph_dict):
                                self.__print_error_and_exit(data["name"], rank, thread)
                    if found_match is True:
                        # we found a match
                        contain_read_events = [0]
                        metrics = self.__get_metrics(data, contain_read_events)
                        node_dict = dict({
                            "name": self.node_graph_dict[node_graph_id][0],
                            "node": self.node_graph_dict[node_graph_id][1],
                            "rank": int(rank),
                            "thread": int(thread),
                            **metrics,
                        })
                        node_dicts.append(node_dict)
                        # check if we have to add read events
                        if contain_read_events[0] == 1:
                            # check how many read calls are used
                            read_num = len(data["cycles"])
                            for i in range(1, read_num):
                                node_name_read = "read_" + str(i)
                                read_metrics = self.__get_read_metrics(data, node_name_read)
                                node_dict = dict({
                                    "name": node_name_read,
                                    "node": self.__find_read_node(
                                        self.node_graph_dict[node_graph_id][1],
                                        node_name_read,
                                    ),
                                    "rank": int(rank),
                                    "thread": int(thread),
                                    **read_metrics,
                                })
                                node_dicts.append(node_dict)

    # setup data for hatchet graphframe
    graph = Graph(list_roots)
    graph.enumerate_traverse()
    dataframe = pd.DataFrame(data=node_dicts)

    # check graph indices: only keep rank/thread as index levels when the
    # run actually had more than one of them
    if rank_cnt > 1 and thread_cnt > 1:
        indices = ["node", "rank", "thread"]
    elif rank_cnt > 1:
        dataframe.drop(columns=["thread"], inplace=True)
        indices = ["node", "rank"]
    elif thread_cnt > 1:
        dataframe.drop(columns=["rank"], inplace=True)
        indices = ["node", "thread"]
    else:
        dataframe.drop(columns=["rank", "thread"], inplace=True)
        indices = ["node"]
    dataframe.set_index(indices, inplace=True)
    dataframe.sort_index(inplace=True)

    default_metric = "real_time_nsec"
    return hatchet.graphframe.GraphFrame(graph, dataframe, [], self.inc_metrics, default_metric)