def _parse_consts(self, consts):
    """
    Parse `anf_ir_pb2.NameValueProto` object, and create a const node.

    Args:
        consts (list[anf_ir_pb2.NameValueProto]): Refer to `anf_ir_pb2.NameValueProto` object.
    """
    logger.debug("Start to parse consts from proto.")
    for const in consts:
        if not const.key:
            logger.warning("Found a const with an empty key; it will not be saved.")
            continue
        node = Node(name=const.key, node_id=const.key)
        node.type = NodeTypeEnum.CONST.value
        node.add_attr({const.key: str(const.value)})

        if const.value.dtype == DataType.DT_TENSOR:
            shape = list(const.value.tensor_val.dims)
            node.output_shape.append(shape)
            if const.value.tensor_val.HasField('data_type'):
                node.elem_types.append(DataType.Name(const.value.tensor_val.data_type))
        else:
            node.elem_types.append(DataType.Name(const.value.dtype))
            # The dim is zero.
            node.output_shape.append([])

        node.output_nums = len(node.output_shape)

        self._cache_node(node)

def load(self, executor=None):
    """
    Load all valid log files. When a file is reloaded, loading continues from where it left off.

    Args:
        executor (Optional[Executor]): The executor instance.

    Returns:
        bool, True if the train job has finished loading.
    """
    logger.debug("Start to load data in ms data loader.")
    if isinstance(executor, Executor):
        return self._load(executor)

    if executor is not None:
        raise TypeError("'executor' should be an Executor instance or None.")

    with ComputingResourceManager() as mgr:
        with mgr.get_executor() as new_executor:
            while not self._load(new_executor):
                pass
            new_executor.wait_all_tasks_finish()
        return True

def samples(self):
    """Return all stored samples."""
    with self._mutex:
        if self._visual_range_up_to_date:
            return list(self._samples)

        # Calculate the visual range.
        visual_range = _VisualRange()
        max_count = 0
        for sample in self._samples:
            histogram = sample.value
            if histogram.count == 0:
                # Ignore empty tensors.
                continue
            max_count = max(histogram.count, max_count)
            visual_range.update(histogram.max, histogram.min)

        if visual_range.max == visual_range.min and not max_count:
            logger.info("Max equals min and count is zero.")

        bins = calc_histogram_bins(max_count)

        # Update the visual range of every sample.
        logger.debug(
            "Visual histogram: min %s, max %s, bins %s, max_count %s.",
            visual_range.min, visual_range.max, bins, max_count)
        for sample in self._samples:
            histogram = sample.value
            histogram.set_visual_range(visual_range.max, visual_range.min, bins)

        self._visual_range_up_to_date = True
        return list(self._samples)

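# A tiny standalone sketch (illustrative only, with plain dicts standing in for the
# histogram objects) of the shared-range computation above: fold every non-empty
# histogram's [min, max] into one visual range so all samples of a tag share the
# same axes, and track the largest count to size the bins.
def merge_visual_range(histograms):
    lows, highs, max_count = [], [], 0
    for hist in histograms:
        if hist['count'] == 0:      # ignore empty tensors, as above
            continue
        lows.append(hist['min'])
        highs.append(hist['max'])
        max_count = max(max_count, hist['count'])
    if not lows:
        return None, max_count
    return (min(lows), max(highs)), max_count
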
def _build_polymeric_nodes(self):
    """Build polymeric nodes."""
    logger.debug("Start to build polymeric nodes.")
    self._find_polymeric_nodes()

    group_count_map = {}
    for group_name, group in self._node_groups.items():
        name = group_name.split('/')[-1]
        count = group_count_map.get(name, 0)
        count += 1
        group_count_map[name] = count
        polymeric_node_name = group_name + '_{}_[{}]'.format(count, len(group))
        polymeric_node = Node(polymeric_node_name, node_id=polymeric_node_name)
        polymeric_node.node_type = NodeTypeEnum.POLYMERIC_SCOPE.value
        polymeric_node.name_scope = '/'.join(group_name.split('/')[:-1])
        polymeric_node.subnode_count = len(group)

        for name_tmp, node_tmp in group.items():
            node_tmp.polymeric_scope_name = polymeric_node_name
            self._polymeric_nodes.update({name_tmp: node_tmp})
            polymeric_node.update_input(node_tmp.input)
            polymeric_node.update_output(node_tmp.output)

        self._normal_nodes.update({polymeric_node_name: polymeric_node})

    self._update_input_output()

def _parse_parameters(self, parameter_protos):
    """
    Parse `anf_ir_pb2.ParameterProto` object, and create a parameter node.

    Args:
        parameter_protos (list[anf_ir_pb2.ParameterProto]): Refer to anf_ir_pb2.ParameterProto.
    """
    logger.debug("Start to parse parameters from proto.")
    for parameter in parameter_protos:
        if not parameter.name:
            logger.warning("Found a parameter with an empty name; it will not be saved.")
            continue
        check_invalid_character(parameter.name)
        node = Node(name=parameter.name, node_id=parameter.name)
        node.type = NodeTypeEnum.PARAMETER.value
        node.output_shape = self._get_shape_by_parse_type_proto(parameter.type)
        node.output_nums = len(node.output_shape)
        node.output_data_type = self._get_data_type_by_parse_type_proto(parameter.type, node)
        attr = dict(
            type=self._get_data_type_by_parse_type_proto(parameter.type, node),
            shape=str(self._get_shape_by_parse_type_proto(parameter.type)))
        node.add_attr(attr)

        self._cache_node(node)

        logger.debug(
            "Foreach graph proto parameters, node id: %s, node name: %s, "
            "node def name: %s", node.node_id, node.name, parameter.name)

def _load_single_file(self, file_handler):
    """
    Load data from a single log file.

    Args:
        file_handler (FileHandler): A file handler.
    """
    logger.debug("Load single summary file, file path: %s.", file_handler.file_path)
    while True:
        start_offset = file_handler.offset
        try:
            event_str = self._event_load(file_handler)
            if event_str is None:
                file_handler.reset_offset(start_offset)
                break
            event = summary_pb2.Event.FromString(event_str)
            self._event_parse(event)
        except exceptions.CRCFailedError:
            file_handler.reset_offset(start_offset)
            logger.warning("CRC check failed, ignore this file, file_path=%s, "
                           "offset=%s.", file_handler.file_path, file_handler.offset)
            break
        except (OSError, DecodeError, exceptions.MindInsightException) as ex:
            logger.warning("Failed to parse log file, ignore this file, detail: %r, "
                           "file path: %s.", str(ex), file_handler.file_path)
            break
        except Exception as ex:
            logger.exception(ex)
            raise UnknownError(str(ex))

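# A minimal standalone sketch (not the MindInsight implementation) of the
# checkpoint-and-rewind pattern used above: remember the offset before reading each
# record and rewind to it when the record turns out to be incomplete, so the next
# reload can resume exactly where this one stopped. The 8-byte little-endian length
# prefix is an assumption for illustration only.
import struct

def read_complete_records(path, start_offset=0):
    """Return (records, resume_offset); resume_offset points at the first incomplete record."""
    records = []
    with open(path, 'rb') as file:
        file.seek(start_offset)
        while True:
            record_start = file.tell()           # checkpoint before each read
            header = file.read(8)
            if len(header) < 8:
                return records, record_start     # partial header: rewind here next time
            (length,) = struct.unpack('<Q', header)
            body = file.read(length)
            if len(body) < length:
                return records, record_start     # partial body: rewind here next time
            records.append(body)
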
def _execute_loader(self, loader_id):
    """
    Load data from a data_loader. If loading fails, log the error and delete the loader.

    Args:
        loader_id (str): An ID for `Loader`.
    """
    try:
        with self._loader_pool_mutex:
            loader = self._loader_pool.get(loader_id, None)
            if loader is None:
                logger.debug("Loader %r has been deleted, will not load data.", loader_id)
                return
        loader.data_loader.load()

        # Update the loader cache status to CACHED.
        # A loader whose cache status is already CACHED keeps that status.
        loader.cache_status = CacheStatus.CACHED
    except MindInsightException as ex:
        logger.warning("Data loader %r load data failed. "
                       "Delete data_loader. Detail: %s", loader_id, ex)

        with self._loader_pool_mutex:
            self._delete_loader(loader_id)

def _execute_loader(self, loader_id):
    """
    Load data from a data_loader. If loading fails, log the error and delete the loader.

    Args:
        loader_id (str): An ID for `Loader`.
    """
    try:
        with self._loader_pool_mutex:
            loader = self._loader_pool.get(loader_id, None)
            if loader is None:
                logger.debug("Loader %r has been deleted, will not load data.", loader_id)
                return
        loader.data_loader.load()
    except MindInsightException as ex:
        logger.warning("Data loader %r load data failed. "
                       "Delete data_loader. Detail: %s", loader_id, ex)

        with self._loader_pool_mutex:
            self._delete_loader(loader_id)

def list_tensors(self, train_id, tag):
    """
    List tensors of the given train job and tag.

    If no tensor is found for the given tag, an exception is raised.

    Args:
        train_id (str): ID for train job.
        tag (str): The tag name.

    Returns:
        list, a list of `collections.namedtuple('_Tensor', ['wall_time', 'event_step', 'value'])`
            containing the data for the given tag.
    """
    loader_pool = self._get_snapshot_loader_pool()
    if not self._is_loader_in_loader_pool(train_id, loader_pool):
        raise TrainJobNotExistError("Can not find the given train job in cache.")

    data_loader = loader_pool[train_id].data_loader
    tensors = []
    try:
        events_data = data_loader.get_events_data()
        tensors = events_data.tensors(tag)
    except KeyError:
        error_msg = "Can not find any data in this train job by given tag."
        raise ParamValueError(error_msg)
    except AttributeError:
        logger.debug("Train job %r has been deleted or it has not loaded data, "
                     "and set tags to empty list.", train_id)

    return tensors

def _parse_op_nodes(self, node_protos):
    """
    Parse `anf_ir_pb2.NodeProto` object, and create a normal node.

    Args:
        node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
    """
    logger.debug("Start to parse op nodes from proto.")
    for node_proto in node_protos:
        if not node_proto.name:
            logger.warning("Found a node with an empty name; it will not be saved.")
            continue

        if not node_proto.full_name or any(
                node_proto.full_name.lower().endswith(f'[:{plugin.value.lower()}]')
                for plugin in PluginNameEnum):
            node_name = Node.create_node_name(scope=node_proto.scope,
                                              base_name=f'{node_proto.op_type}{node_proto.name}')
        else:
            node_name = node_proto.full_name

        node = Node(name=node_name, node_id=node_proto.name)
        node.full_name = node_proto.full_name
        node.type = node_proto.op_type

        self._parse_attributes(node_proto.attribute, node)
        self._parse_inputs(node_proto.input, node)

        node.output_i = node_proto.output_i
        node.scope = node_proto.scope
        node.output_shape = self._get_shape_by_parse_type_proto(node_proto.output_type)
        node.output_nums = len(node.output_shape)
        node.output_data_type = self._get_data_type_by_parse_type_proto(node_proto.output_type, node)

        self._cache_node(node)

def _load_data(self):
    """Load data once; do nothing if a load is already in progress."""
    with self._status_mutex:
        if self.status == DataManagerStatus.LOADING.value:
            logger.debug("Current status is %s, will ignore to load data.", self.status)
            return
        self.status = DataManagerStatus.LOADING.value

    with ComputingResourceManager(
            executors_cnt=1,
            max_processes_cnt=settings.MAX_PROCESSES_COUNT) as computing_resource_mgr:
        with computing_resource_mgr.get_executor() as executor:
            self._brief_cache.update_cache(executor)
            brief_cache_update = time.time()
            for _ in self._detail_cache.update_cache(executor):
                update_interval = time.time() - brief_cache_update
                logger.debug('Loading one round of detail cache taking %ss.', update_interval)
                if update_interval > 3:  # Use 3 seconds as the threshold to avoid updating too often.
                    self._brief_cache.update_cache(executor)
                    brief_cache_update += update_interval
            executor.wait_all_tasks_finish()

    with self._status_mutex:
        if not self._brief_cache.has_content() and not self._detail_cache.has_content():
            self.status = DataManagerStatus.INVALID.value
        else:
            self.status = DataManagerStatus.DONE.value

        logger.info("Load brief data end, and loader pool size is %r.",
                    self._detail_cache.loader_pool_size())

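# A small standalone sketch of the throttling pattern above (illustrative only;
# `rounds` is any iterable of work rounds and `refresh` any zero-argument callback):
# while iterating long-running detail rounds, refresh the cheaper brief cache at most
# once per `threshold` seconds so it stays fresh without refreshing on every round.
import time

def iterate_with_periodic_refresh(rounds, refresh, threshold=3):
    last_refresh = time.time()
    for _ in rounds:
        elapsed = time.time() - last_refresh
        if elapsed > threshold:
            refresh()
            last_refresh += elapsed
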
def _update_node_name_of_cache(self, node, new_name, update_parent=False):
    """
    Update the name of a node that is stored in the cache.

    Args:
        node (Node): The node that will be renamed.
        new_name (str): The new name.
        update_parent (bool): Determines whether the input and output of the parent node
            need to be updated.
    """
    logger.debug('Update node name of cache, node(%s), new name is %s.', str(node), new_name)
    origin_name = node.name
    node.name = new_name

    # Find all nodes whose input and output need to be modified.
    update_node_map = {}
    for method in ['input', 'output', 'proxy_input', 'proxy_output']:
        for target_name in getattr(node, method):
            target_node = self._get_normal_node(node_name=target_name)
            if target_node is None:
                message = f"Node should not be None, name: {target_name}, " \
                          f"{method}: {list(getattr(node, method))}."
                logger.error(message)
                continue
            update_node_map.update({target_name: target_node})

            if not update_parent:
                continue

            slash_index = target_name.find('/')
            while slash_index != -1:
                scope_name = target_name[:slash_index]
                slash_index = target_name.find('/', slash_index + 1)

                if update_node_map.get(scope_name):
                    continue

                scope_node = self._get_normal_node(node_name=scope_name)
                if scope_node is None:
                    message = f"Can not find the scope node by scope name({scope_name}), " \
                              f"may be this scope node has not been built."
                    logger.debug(message)
                    continue

                update_node_map.update({scope_name: scope_node})

    # Update the input and output of the nodes.
    for target_node in update_node_map.values():
        for method in ['input', 'output', 'proxy_input', 'proxy_output']:
            attr_temp = getattr(target_node, method).get(origin_name)
            if attr_temp is None:
                # The target node does not reference this node through this method, so skip it.
                continue

            # Delete the old attribute and add the new name to the source or destination node.
            getattr(target_node, f'delete_{method}')(origin_name)
            getattr(target_node, f'add_{method}')(new_name, attr_temp)

    # Delete the original node from the cache.
    self._delete_nodes_of_cache(node_names=[origin_name])
    self._cache_node(node)

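# A small standalone sketch (a hypothetical helper, not part of the class above) of
# what the slash-scanning loop does: for a node name like 'Default/conv1/Conv2D' it
# yields every enclosing scope name ('Default', then 'Default/conv1'), which are the
# parent scope nodes whose input/output may also need renaming.
def iter_scope_names(node_name):
    slash_index = node_name.find('/')
    while slash_index != -1:
        yield node_name[:slash_index]
        slash_index = node_name.find('/', slash_index + 1)

# list(iter_scope_names('Default/conv1/Conv2D')) == ['Default', 'Default/conv1']
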
def _find_polymeric_nodes(self):
    """Find polymeric nodes from node groups."""
    node_groups = copy.deepcopy(self._node_groups)
    for group_name, group in node_groups.items():
        if len(group) < self.MIN_POLYMERIC_NODE_COUNT:
            self._normal_nodes.update(group)
            self._node_groups.pop(group_name)
            continue

        move_node_names = []
        is_move_group = False
        for node_name, group_node in group.items():
            node_list = []
            is_in_group = False
            for dst_name in group_node.output:
                node_tmp = self._leaf_nodes[dst_name]
                node_list.append(node_tmp)

            start = time.time()
            run_count = 0
            visit_nodes = {}
            while node_list:
                # Iterate to find whether an output of a node in the group leads back into
                # the group (a loop). Example: there is a group A, and node_a is a node in
                # the group. If there is a loop through node_a, like
                # A/node_a -> B/node_b -> A/node_b, we will remove node_a from group A.
                node_tmp = node_list[0]
                node_list = node_list[1:]
                visit_nodes.update({node_tmp.name: True})
                if node_tmp in group.values():
                    is_in_group = True
                    break
                for dst_name_tmp in node_tmp.output:
                    run_count += 1
                    node_tmp = self._leaf_nodes[dst_name_tmp]
                    if visit_nodes.get(dst_name_tmp):
                        continue
                    node_list.append(node_tmp)

            logger.debug("Find group %s node end, is_in_group: %s, use time: %s, "
                         "run count: %s.", group_name, is_in_group,
                         time.time() - start, run_count)

            if is_in_group:
                move_node_names.append(node_name)

            if (len(group) - len(move_node_names)) < self.MIN_POLYMERIC_NODE_COUNT:
                is_move_group = True
                break

        if is_move_group:
            self._normal_nodes.update(group)
            self._node_groups.pop(group_name)
        else:
            for name_tmp in move_node_names:
                node_tmp = self._node_groups[group_name].pop(name_tmp)
                self._normal_nodes.update({name_tmp: node_tmp})

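# A minimal standalone sketch (illustrative only) of the breadth-first search above:
# starting from the outputs of a candidate node, walk the graph and report whether any
# path leads back into the group, which is the loop condition that disqualifies the
# node from being polymerized. Here `outputs` maps a node name to its output names.
from collections import deque

def leads_back_to_group(start_outputs, outputs, group_names):
    queue = deque(start_outputs)
    visited = set()
    while queue:
        name = queue.popleft()
        if name in visited:
            continue
        visited.add(name)
        if name in group_names:
            return True          # a path re-enters the group, so a loop exists
        queue.extend(outputs.get(name, ()))
    return False
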
def _calc_input(self, leaf_node_id_map_name, graph_proto, const_nodes_map):
    """
    Calculate the input for every leaf node.

    Args:
        leaf_node_id_map_name (dict[str, str]): Format is {'node_id': 'node_name'}.
        graph_proto (anf_ir_pb2.model_proto.graph): See anf_ir_pb2.model_proto.graph.
        const_nodes_map (dict[str, Node]): Format is {'node name': <Const node>}.
    """
    logger.debug("Start to calc input.")
    for node_def in graph_proto.node:
        node_name = leaf_node_id_map_name[node_def.name]
        node = self._leaf_nodes[node_name]
        for input_def in node_def.input:
            edge_type = EdgeTypeEnum.data
            if input_def.type == "CONTROL_EDGE":
                edge_type = EdgeTypeEnum.control

            if const_nodes_map.get(input_def.name):
                const_node = copy.deepcopy(const_nodes_map[input_def.name])
                src_name = '{}/{}'.format(node.name_scope, input_def.name)
                if not self._normal_nodes.get(src_name):
                    const_node.name = src_name
                    const_node.name_scope = node.name_scope
                    self._normal_nodes.update({src_name: const_node})
                    self._leaf_nodes.update({src_name: const_node})
                src_node = self._leaf_nodes.get(src_name)
            else:
                src_name = leaf_node_id_map_name.get(input_def.name)
                if not src_name:
                    logger.warning("The input_def name '%s' in node '%s' is invalid "
                                   "and will be ignored.", input_def.name, node_name)
                    continue
                src_node = self._leaf_nodes.get(src_name)
                if src_node is None:
                    logger.warning("The input '%s' in node '%s' is not in "
                                   "leaf nodes.", src_name, node_name)
                    continue

            input_item = {
                src_name: {
                    "shape": src_node.shape,
                    "edge_type": edge_type,
                    "scope": NodeTypeEnum.NAME_SCOPE.value
                }
            }
            node.update_input(input_item)

        if self._normal_nodes.get(node_name):
            self._normal_nodes[node_name] = node
        else:
            group_name = self._create_group_name(node.name_scope, node.node_type, node.name)
            self._node_groups[group_name][node.name] = node

def _delete_loader(self, loader_id):
    """
    Delete a loader from the loader pool by loader ID.

    Args:
        loader_id (str): ID of the loader.
    """
    if self._loader_pool.get(loader_id) is not None:
        logger.debug("Delete loader %s.", loader_id)
        self._loader_pool.pop(loader_id)

def reload_data(self):
    """
    Reload the data once.

    This function needs to be used after the `start_load_data` function.
    """
    logger.debug("Start to reload data.")
    thread = threading.Thread(target=self._load_data_in_thread,
                              name='reload_data_thread')
    thread.daemon = False
    thread.start()

def _parse_op_nodes(self, node_protos):
    """
    Parse `anf_ir_pb2.NodeProto` object, and create a normal node.

    Args:
        node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
    """
    logger.debug("Start to parse op nodes from proto.")
    for topological_index, node_proto in enumerate(node_protos):
        if not node_proto.name:
            logger.warning("Found a node with an empty name; it will not be saved.")
            continue

        if node_proto.op_type == "Load":
            # The Load operator needs to be renamed, as it has the same name as its parameter.
            node_name = Node.create_node_name(
                scope=node_proto.scope,
                base_name=f'{node_proto.op_type}-op{node_proto.name}')
            node_proto.full_name = node_name
        elif not node_proto.full_name or any(
                node_proto.full_name.lower().endswith(f'[:{plugin.value.lower()}]')
                for plugin in PluginNameEnum):
            node_name = Node.create_node_name(
                scope=node_proto.scope,
                base_name=f'{node_proto.op_type}{node_proto.name}')
        else:
            node_name = node_proto.full_name

        # The Graphviz plug-in used by the UI can't handle these special characters.
        check_invalid_character(node_name)

        node = Node(name=node_name, node_id=node_proto.name, topological_index=topological_index)
        node.full_name = node_proto.full_name
        node.type = node_proto.op_type
        if getattr(node_proto, 'source_address', None):
            node.stack = DebuggerSource.build_stack_from_source_address(node_proto.source_address)
        self._parse_attributes(node_proto.attribute, node)
        self._parse_inputs(node_proto.input, node)

        node.output_i = node_proto.output_i
        node.scope = node_proto.scope
        node.output_shape = self._get_shape_by_parse_type_proto(node_proto.output_type)
        node.output_nums = len(node.output_shape)
        node.output_data_type = self._get_data_type_by_parse_type_proto(node_proto.output_type, node)

        self._cache_node(node)

def init_module(app):
    """
    Interface to init module.

    Args:
        app (Flask): An instance of Flask.
    """
    # Just to suppress pylint warning about unused arg.
    logger.debug("App: %s", type(app))
    DATA_MANAGER.register_brief_cache_item_updater(LineageCacheItemUpdater())
    DATA_MANAGER.start_load_data(reload_interval=int(settings.RELOAD_INTERVAL),
                                 max_threads_count=int(settings.MAX_THREADS_COUNT))

def _delete_nodes_of_cache(self, node_names):
    """Delete nodes from the cache."""
    logger.debug("These nodes will be removed from the cache, node names: %s.", str(node_names))
    for name in node_names:

        if self._parameter_node_temp_cache.get(name):
            self._parameter_node_temp_cache.pop(name)
        if self._const_node_temp_cache.get(name):
            self._const_node_temp_cache.pop(name)

        node = self._get_normal_node(node_name=name)
        self._normal_node_map.pop(name)
        self._node_id_map_name.pop(node.node_id)

def _init_app_module(app):
    """
    Init app module.

    Args:
        app (Flask): An instance of Flask.
    """
    packages = find_app_package()
    for package in packages:
        try:
            app_module = import_module(package)
            app_module.init_module(app)
        except AttributeError:
            logger.debug('[%s].init_module not exists.', package)

def _event_parse(event_str, latest_file_name):
    """
    Transform `Event` data to tensor_event and update it to EventsData.

    This method is static to avoid sending unnecessary objects to other processes.

    Args:
        event_str (str): Message event string in summary proto, data read from file handler.
        latest_file_name (str): Latest file name.
    """
    plugins = {
        'scalar_value': PluginNameEnum.SCALAR,
        'image': PluginNameEnum.IMAGE,
        'histogram': PluginNameEnum.HISTOGRAM,
        'tensor': PluginNameEnum.TENSOR
    }
    logger.debug("Start to parse event string. Event string len: %s.", len(event_str))
    event = summary_pb2.Event.FromString(event_str)
    logger.debug("Deserialize event string completed.")

    ret_tensor_events = []
    if event.HasField('summary'):
        for value in event.summary.value:
            for plugin in plugins:
                if not value.HasField(plugin):
                    continue
                plugin_name_enum = plugins[plugin]
                logger.debug("Processing plugin value: %s.", plugin_name_enum)
                tensor_event_value = _SummaryParser._parse_summary_value(value, plugin)
                if tensor_event_value is None:
                    continue

                tensor_event = TensorEvent(wall_time=event.wall_time,
                                           step=event.step,
                                           tag='{}/{}'.format(value.tag, plugin_name_enum.value),
                                           plugin_name=plugin_name_enum.value,
                                           value=tensor_event_value,
                                           filename=latest_file_name)
                logger.debug("Tensor event generated, plugin is %s, tag is %s, step is %s.",
                             plugin_name_enum, value.tag, event.step)
                ret_tensor_events.append(tensor_event)
    elif event.HasField('graph_def'):
        graph = MSGraph()
        graph.build_graph(event.graph_def)
        tensor_event = TensorEvent(wall_time=event.wall_time,
                                   step=event.step,
                                   tag=latest_file_name,
                                   plugin_name=PluginNameEnum.GRAPH.value,
                                   value=graph,
                                   filename=latest_file_name)
        ret_tensor_events.append(tensor_event)

    return ret_tensor_events

def init_module(app):
    """
    Interface to init module.

    Args:
        app (Flask): An instance of Flask.
    """
    # Just to suppress pylint warning about unused arg.
    logger.debug("App: %s", type(app))
    DATA_MANAGER.register_brief_cache_item_updater(LineageCacheItemUpdater())

    # Let gunicorn load other modules first.
    time.sleep(1)
    DATA_MANAGER.start_load_data(reload_interval=settings.RELOAD_INTERVAL)

def get_stack_info_by_offset(self, pattern=None, limit=0, offset=0):
    """
    Get stack infos.

    Args:
        pattern (str): The pattern of stack infos. Default: None. If not given,
            return all stack infos.
        limit (int): The size of each page. Default: 0. If 0, there is no limitation.
        offset (int): The index of the page. Valid only when `limit` is not 0.

    Returns:
        dict, stack info objects. The format is like:
            {
                'total': int,
                'offset': int,
                'stack_infos': [{<file_path>: [{'file_path': str, 'line_no': int, 'code_line': str}]}]
            }
    """
    # Validate the params.
    self.check_int('limit', limit, min_num=0, max_num=100)
    self.check_int('offset', offset, min_num=0, max_num=len(self._stack_infos))
    validate_stack_pattern(pattern)
    if not limit and offset > 0:
        return {}
    # Get the filtered results.
    filter_res = self.get(pattern)
    if not filter_res:
        log.debug("No stack info with pattern %s", pattern)
        return {}
    merged_res = self._merge_stack_by_file_path(filter_res)
    total_size = len(merged_res)
    if not limit:
        limit = total_size
    st_index = offset * limit
    query_res = merged_res[st_index:st_index + limit]
    for stack_info in query_res:
        source_items = stack_info['items']
        stack_info['items'] = list(map(lambda x: x.to_dict(), source_items))
    return {
        'total': total_size,
        'offset': offset,
        'stack_infos': query_res
    }

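# A tiny standalone sketch of the paging arithmetic used above: `offset` is a page
# index, not an item index, so the slice starts at offset * limit. With limit == 0
# the whole result set is treated as a single page.
def paginate(items, limit=0, offset=0):
    total = len(items)
    if not limit:
        limit = total
    start = offset * limit
    return {'total': total, 'offset': offset, 'items': items[start:start + limit]}

# paginate(list(range(10)), limit=4, offset=1)['items'] == [4, 5, 6, 7]
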
def _init_app_module(app):
    """
    Init app module.

    Args:
        app (Flask): An instance of Flask.
    """
    packages = find_app_package()
    gunicorn_logger = setup_logger("gunicorn", "error")
    for package in packages:
        try:
            app_module = import_module(package)
            gunicorn_logger.info("[%s].init_module starts.", package)
            app_module.init_module(app)
            gunicorn_logger.info("[%s].init_module ends.", package)
        except AttributeError:
            logger.debug('[%s].init_module not exists.', package)

def load(self):
    """
    Load all valid log files. When a file is reloaded, loading continues from where it left off.
    """
    logger.debug("Start to load data in ms data loader.")
    filenames = self.filter_valid_files()
    if not filenames:
        logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
        raise exceptions.SummaryLogPathInvalid()
    old_filenames = list(self._valid_filenames)
    self._valid_filenames = filenames
    self._check_files_deleted(filenames, old_filenames)

    for parser in self._parser_list:
        parser.parse_files(filenames, events_data=self._events_data)

def _parse_graph_proto_parameter(self, parameter):
    """
    Parse anf_ir_pb2.model_proto.graph.parameter, and create a parameter node.

    Args:
        parameter (anf_ir_pb2.model_proto.graph.parameter): Refer to
            anf_ir_pb2.model_proto.graph.parameter.

    Returns:
        Node, a `Node` object.
    """
    node = Node(name=parameter.name, node_id=parameter.name)
    node.node_type = NodeTypeEnum.PARAMETER.value
    node.shape = self._parse_type_proto(parameter.type)
    logger.debug("Foreach graph proto parameters, node id: %s, node name: %s, "
                 "node def name: %s", node.node_id, node.name, parameter.name)
    return node

def parse_files(self, executor, filenames, events_data):
    """
    Load summary file and parse file content.

    Args:
        executor (Executor): The executor instance.
        filenames (list[str]): File name list.
        events_data (EventsData): The container of event data.

    Returns:
        bool, True if all the summary files are finished loading.
    """
    summary_files = self.filter_files(filenames)
    summary_files = self.sort_files(summary_files)
    if self._latest_filename in summary_files:
        index = summary_files.index(self._latest_filename)
        summary_files = summary_files[index:]

    for filename in summary_files:
        file_path = FileHandler.join(self._summary_dir, filename)

        if filename != self._latest_filename:
            self._summary_file_handler = FileHandler(file_path, 'rb')
            self._latest_filename = filename
            self._latest_file_size = 0

        new_size = FileHandler.file_stat(file_path).size
        if new_size == self._latest_file_size:
            continue

        try:
            if not self._load_single_file(self._summary_file_handler, executor, events_data):
                self._latest_file_size = self._summary_file_handler.offset
            else:
                self._latest_file_size = new_size
            # Wait for data in this file to be processed to avoid loading multiple files
            # at the same time.
            logger.debug("Parse summary file offset %d, file path: %s.",
                         self._latest_file_size, file_path)
            return False
        except UnknownError as ex:
            logger.warning("Parse summary file failed, detail: %r, "
                           "file path: %s.", str(ex), file_path)

    return True

def get_train_job_by_plugin(self, train_id, plugin_name):
    """
    Get a train job by train job ID.

    If the given train job does not have data for the given plugin, the tag list will be empty.

    Args:
        train_id (str): Get train job info by the given id.
        plugin_name (str): Get tags by given plugin.

    Returns:
        TypedDict('TrainJobEntity', {'id': str, 'name': str, 'tags': List[str]}),
            a train job object.
    """
    self._check_status_valid()
    self._check_train_job_exist(train_id, self._loader_pool)

    loader = self._get_loader(train_id)
    if loader is None:
        logger.warning("No valid summary log in train job %s, "
                       "or it is not in the cache.", train_id)
        return None

    name = loader.name
    data_loader = loader.data_loader

    tags = []
    try:
        events_data = data_loader.get_events_data()
        tags = events_data.list_tags_by_plugin(plugin_name)
    except KeyError:
        logger.debug("Plugin name %r does not exist "
                     "in train job %r, and set tags to empty list.", plugin_name, name)
    except AttributeError:
        logger.debug("Train job %r has been deleted or it has not loaded data, "
                     "and set tags to empty list.", name)

    result = dict(id=train_id, name=name, tags=tags)
    return result

def add_tensor_event(self, tensor_event):
    """
    Add a new tensor event to the tensors_data.

    Args:
        tensor_event (TensorEvent): Refer to `TensorEvent` object.
    """
    if not isinstance(tensor_event, TensorEvent):
        raise TypeError('Expect to get data of type `TensorEvent`.')

    tag = tensor_event.tag
    plugin_name = tensor_event.plugin_name

    if tag not in set(self._tags):
        deleted_tag = self._check_tag_out_of_spec(plugin_name)
        if deleted_tag is not None:
            if tag in self._deleted_tags:
                logger.debug("Tag is in deleted tags: %s.", tag)
                return
            self.delete_tensor_event(deleted_tag)

        self._tags.append(tag)

    with self._tags_by_plugin_mutex_lock[plugin_name]:
        if tag not in self._tags_by_plugin[plugin_name]:
            self._tags_by_plugin[plugin_name].append(tag)

    with self._reservoir_mutex_lock:
        if tag not in self._reservoir_by_tag:
            reservoir_size = self._get_reservoir_size(tensor_event.plugin_name)
            self._reservoir_by_tag[tag] = reservoir.ReservoirFactory().create_reservoir(
                plugin_name, reservoir_size)

    tensor = _Tensor(wall_time=tensor_event.wall_time,
                     step=tensor_event.step,
                     value=tensor_event.value,
                     filename=tensor_event.filename)

    if self._is_out_of_order_step(tensor_event.step, tensor_event.tag):
        self.purge_reservoir_data(tensor_event.filename, tensor_event.step,
                                  self._reservoir_by_tag[tag])

    self._reservoir_by_tag[tag].add_sample(tensor)

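# A minimal standalone sketch of the reservoir idea behind `_reservoir_by_tag`
# (illustrative only; MindInsight's reservoir classes are more involved): keep at most
# `size` samples per tag, replacing a random existing sample once the reservoir is
# full, so every step remains equally likely to be retained.
import random

class SimpleReservoir:
    def __init__(self, size):
        self._size = size
        self._samples = []
        self._seen = 0

    def add_sample(self, sample):
        self._seen += 1
        if len(self._samples) < self._size:
            self._samples.append(sample)
            return
        # Standard Algorithm R: replace an existing sample with probability size / seen.
        index = random.randint(0, self._seen - 1)
        if index < self._size:
            self._samples[index] = sample

    def samples(self):
        return list(self._samples)
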
def _load_data(self):
    """Load data once; do nothing if the status is already loading."""
    logger.info("Start to load data, reload interval: %r.", self._reload_interval)
    with self._status_mutex:
        if self.status == DataManagerStatus.LOADING.value:
            logger.debug("Current status is %s, will ignore to load data.", self.status)
            return
        self.status = DataManagerStatus.LOADING.value

    summaries_info = SummaryWatcher().list_summary_directories(self._summary_base_dir)

    basic_train_jobs = []
    for info in summaries_info:
        profiler = info['profiler']
        basic_train_jobs.append(_BasicTrainJob(
            train_id=info['relative_path'],
            abs_summary_base_dir=self._summary_base_dir,
            abs_summary_dir=os.path.realpath(
                os.path.join(self._summary_base_dir, info['relative_path'])),
            create_time=info['create_time'],
            update_time=info['update_time'],
            profiler_dir=None if profiler is None else profiler['directory'],
        ))

    self._brief_cache.update_cache(basic_train_jobs)
    self._detail_cache.update_cache(basic_train_jobs)

    if not self._brief_cache.has_content() and not self._detail_cache.has_content():
        self.status = DataManagerStatus.INVALID.value
    else:
        self.status = DataManagerStatus.DONE.value

    logger.info("Load event data end, status: %r, and loader pool size is %r.",
                self.status, self._detail_cache.loader_pool_size())
