Example #1
    def _parse_consts(self, consts):
        """
        Parse `anf_ir_pb2.NameValueProto` object, and create a const node.

        Args:
            consts (list[anf_ir_pb2.NameValueProto]): Refer to `anf_ir_pb2.NameValueProto` object.
        """
        logger.debug("Start to parse consts from proto.")
        for const in consts:
            if not const.key:
                logger.warning("Finding a const with an empty key will not save it.")
                continue
            node = Node(name=const.key, node_id=const.key)
            node.type = NodeTypeEnum.CONST.value
            node.add_attr({const.key: str(const.value)})

            if const.value.dtype == DataType.DT_TENSOR:
                shape = list(const.value.tensor_val.dims)
                node.output_shape.append(shape)
                if const.value.tensor_val.HasField('data_type'):
                    node.elem_types.append(DataType.Name(const.value.tensor_val.data_type))
            else:
                node.elem_types.append(DataType.Name(const.value.dtype))
                # dim is zero
                node.output_shape.append([])

            node.output_nums = len(node.output_shape)

            self._cache_node(node)
Example #2
    def load(self, executor=None):
        """
        Load all valid log files.

        When the file is reloaded, it will continue to load from where it left off.

        Args:
            executor (Optional[executor]): The Executor instance.

        Returns:
            bool, True if the train job is finished loading.
        """
        logger.debug("Start to load data in ms data loader.")
        if isinstance(executor, Executor):
            return self._load(executor)

        if executor is not None:
            raise TypeError(
                "'executor' should be an Executor instance or None.")

        with ComputingResourceManager() as mgr:
            with mgr.get_executor() as new_executor:
                while not self._load(new_executor):
                    pass
                new_executor.wait_all_tasks_finish()
                return True
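A brief usage sketch of the two call styles this method supports. The MSDataLoader class name and the import paths below are assumptions for illustration; only the Executor/ComputingResourceManager usage is taken from the listing above.

# Hypothetical usage sketch; import paths and the MSDataLoader name are assumed.
from mindinsight.datavisual.data_transform.ms_data_loader import MSDataLoader
from mindinsight.utils.computing_resource_mgr import ComputingResourceManager

loader = MSDataLoader('/path/to/summary_dir')

# Style 1: let load() manage its own computing resources and block until done.
loader.load()

# Style 2: pass an existing Executor and poll until loading reports completion.
with ComputingResourceManager() as mgr:
    with mgr.get_executor() as executor:
        while not loader.load(executor):
            pass
        executor.wait_all_tasks_finish()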
Example #3
    def samples(self):
        """Return all stored samples."""
        with self._mutex:
            if self._visual_range_up_to_date:
                return list(self._samples)

            # calc visual range
            visual_range = _VisualRange()
            max_count = 0
            for sample in self._samples:
                histogram = sample.value
                if histogram.count == 0:
                    # ignore empty tensor
                    continue
                max_count = max(histogram.count, max_count)
                visual_range.update(histogram.max, histogram.min)

            if visual_range.max == visual_range.min and not max_count:
                logger.info("Max equals to min. Count is zero.")

            bins = calc_histogram_bins(max_count)

            # update visual range
            logger.debug(
                "Visual histogram: min %s, max %s, bins %s, max_count %s.",
                visual_range.min, visual_range.max, bins, max_count)
            for sample in self._samples:
                histogram = sample.value
                histogram.set_visual_range(visual_range.max, visual_range.min,
                                           bins)

            self._visual_range_up_to_date = True
            return list(self._samples)
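The _VisualRange helper used above is not part of this listing; the sketch below shows one way such a running min/max accumulator could look, purely for illustration.

# Illustrative sketch only; the project's real _VisualRange may differ.
class _VisualRange:
    """Accumulate the overall [min, max] range across all histograms."""

    def __init__(self):
        self.max = 0.0
        self.min = 0.0
        self._updated = False

    def update(self, max_val, min_val):
        """Widen the range so that it also covers [min_val, max_val]."""
        if not self._updated:
            self.max, self.min = max_val, min_val
            self._updated = True
            return
        self.max = max(self.max, max_val)
        self.min = min(self.min, min_val)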
Example #4
    def _build_polymeric_nodes(self):
        """Build polymeric node."""
        logger.debug("Start to build polymeric nodes")

        self._find_polymeric_nodes()

        group_count_map = {}
        for group_name, group in self._node_groups.items():
            name = group_name.split('/')[-1]
            count = group_count_map.get(name, 0)
            count += 1
            group_count_map[name] = count
            polymeric_node_name = group_name + '_{}_[{}]'.format(
                count, len(group))
            polymeric_node = Node(polymeric_node_name,
                                  node_id=polymeric_node_name)
            polymeric_node.node_type = NodeTypeEnum.POLYMERIC_SCOPE.value
            polymeric_node.name_scope = '/'.join(group_name.split('/')[:-1])
            polymeric_node.subnode_count = len(group)

            for name_tmp, node_tmp in group.items():
                node_tmp.polymeric_scope_name = polymeric_node_name
                self._polymeric_nodes.update({name_tmp: node_tmp})
                polymeric_node.update_input(node_tmp.input)
                polymeric_node.update_output(node_tmp.output)

            self._normal_nodes.update({polymeric_node_name: polymeric_node})

        self._update_input_output()
Example #5
    def _parse_parameters(self, parameter_protos):
        """
        Parse `anf_ir_pb2.ParameterProto` object, and create a parameter node.

        Args:
            parameter_protos (list[anf_ir_pb2.ParameterProto]): Refer to anf_ir_pb2.ParameterProto.
        """
        logger.debug("Start to parse parameters from proto.")
        for parameter in parameter_protos:
            if not parameter.name:
                logger.warning(
                    "Finding a parameter with an empty name will not save it.")
                continue
            check_invalid_character(parameter.name)
            node = Node(name=parameter.name, node_id=parameter.name)
            node.type = NodeTypeEnum.PARAMETER.value
            node.output_shape = self._get_shape_by_parse_type_proto(
                parameter.type)
            node.output_nums = len(node.output_shape)
            node.output_data_type = self._get_data_type_by_parse_type_proto(
                parameter.type, node)
            attr = dict(
                type=self._get_data_type_by_parse_type_proto(
                    parameter.type, node),
                shape=str(self._get_shape_by_parse_type_proto(parameter.type)))
            node.add_attr(attr)

            self._cache_node(node)
            logger.debug(
                "Foreach graph proto parameters, node id: %s, node name: %s, "
                "node def name: %s", node.node_id, node.name, parameter.name)
Example #6
    def _load_single_file(self, file_handler):
        """
        Load data from a single log file.

        Args:
            file_handler (FileHandler): A file handler.
        """
        logger.debug("Load single summary file, file path: %s.", file_handler.file_path)
        while True:
            start_offset = file_handler.offset
            try:
                event_str = self._event_load(file_handler)
                if event_str is None:
                    file_handler.reset_offset(start_offset)
                    break

                event = summary_pb2.Event.FromString(event_str)
                self._event_parse(event)
            except exceptions.CRCFailedError:
                file_handler.reset_offset(start_offset)
                logger.warning("Check crc faild and ignore this file, file_path=%s, "
                               "offset=%s.", file_handler.file_path, file_handler.offset)
                break
            except (OSError, DecodeError, exceptions.MindInsightException) as ex:
                logger.warning("Parse log file fail, and ignore this file, detail: %r,"
                               "file path: %s.", str(ex), file_handler.file_path)
                break
            except Exception as ex:
                logger.exception(ex)
                raise UnknownError(str(ex))
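The _event_load helper called above is not shown in this listing. As a rough sketch of why reset_offset matters, reading one length-prefixed record could look like the code below; the record layout and CRC handling are assumptions here and belong to the real loader, not this sketch.

# Rough sketch only; the record layout is an assumption, not the real format.
import struct

HEADER_SIZE = 8  # assumed: unsigned 64-bit little-endian payload length

def read_one_record(file_obj):
    """Return one payload, or None if the file ends in the middle of a record."""
    header = file_obj.read(HEADER_SIZE)
    if len(header) < HEADER_SIZE:
        return None  # incomplete header: the caller resets the offset and stops
    (length,) = struct.unpack('<Q', header)
    payload = file_obj.read(length)
    if len(payload) < length:
        return None  # incomplete payload: likely a partially written file
    return payload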
Example #7
    def _execute_loader(self, loader_id):
        """
        Load data from data_loader.

        If something goes wrong during loading, log the error and delete the loader.

        Args:
            loader_id (str): An ID for `Loader`.

        """
        try:
            with self._loader_pool_mutex:
                loader = self._loader_pool.get(loader_id, None)
                if loader is None:
                    logger.debug(
                        "Loader %r has been deleted, will not load data.",
                        loader_id)
                    return

            loader.data_loader.load()

            # Update loader cache status to CACHED.
            # Loader with cache status CACHED should remain the same cache status.
            loader.cache_status = CacheStatus.CACHED

        except MindInsightException as ex:
            logger.warning(
                "Data loader %r load data failed. "
                "Delete data_loader. Detail: %s", loader_id, ex)

            with self._loader_pool_mutex:
                self._delete_loader(loader_id)
Example #8
    def _execute_loader(self, loader_id):
        """
        Load data from data_loader.

        If something goes wrong during loading, log the error and delete the loader.

        Args:
            loader_id (str): An ID for `Loader`.

        """
        try:
            with self._loader_pool_mutex:
                loader = self._loader_pool.get(loader_id, None)
                if loader is None:
                    logger.debug(
                        "Loader %r has been deleted, will not load data.",
                        loader_id)
                    return
            loader.data_loader.load()
        except MindInsightException as ex:
            logger.warning(
                "Data loader %r load data failed. "
                "Delete data_loader. Detail: %s", loader_id, ex)

            with self._loader_pool_mutex:
                self._delete_loader(loader_id)
Example #9
    def list_tensors(self, train_id, tag):
        """
        List tensors of the given train job and tag.

        If no tensor can be found for the given tag, an exception will be raised.

        Args:
            train_id (str): ID for train job.
            tag (str): The tag name.

        Returns:
            list, namedtuples of the format
                `collections.namedtuple('_Tensor', ['wall_time', 'event_step', 'value'])`,
                where the value field contains the data for the given tag.

        """
        loader_pool = self._get_snapshot_loader_pool()
        if not self._is_loader_in_loader_pool(train_id, loader_pool):
            raise TrainJobNotExistError(
                "Can not find the given train job in cache.")

        data_loader = loader_pool[train_id].data_loader

        tensors = []
        try:
            events_data = data_loader.get_events_data()
            tensors = events_data.tensors(tag)
        except KeyError:
            error_msg = "Can not find any data in this train job by given tag."
            raise ParamValueError(error_msg)
        except AttributeError:
            logger.debug(
                "Train job %r has been deleted or it has not loaded data, "
                "and set tags to empty list.", train_id)

        return tensors
Example #10
    def _parse_op_nodes(self, node_protos):
        """
        Parse `anf_ir_pb2.NodeProto` object, and create a normal node.

        Args:
            node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
        """
        logger.debug("Start to parse op nodes from proto.")
        for node_proto in node_protos:
            if not node_proto.name:
                logger.warning("Finding a node with an empty name will not save it.")
                continue

            if not node_proto.full_name or any(
                    node_proto.full_name.lower().endswith(f'[:{plugin.value.lower()}]') for plugin in PluginNameEnum):
                node_name = Node.create_node_name(scope=node_proto.scope,
                                                  base_name=f'{node_proto.op_type}{node_proto.name}')
            else:
                node_name = node_proto.full_name
            node = Node(name=node_name, node_id=node_proto.name)
            node.full_name = node_proto.full_name
            node.type = node_proto.op_type

            self._parse_attributes(node_proto.attribute, node)
            self._parse_inputs(node_proto.input, node)

            node.output_i = node_proto.output_i
            node.scope = node_proto.scope
            node.output_shape = self._get_shape_by_parse_type_proto(node_proto.output_type)
            node.output_nums = len(node.output_shape)
            node.output_data_type = self._get_data_type_by_parse_type_proto(node_proto.output_type, node)

            self._cache_node(node)
Example #11
    def _load_data(self):
        """This function will load data once and ignore it if the status is loading."""
        with self._status_mutex:
            if self.status == DataManagerStatus.LOADING.value:
                logger.debug(
                    "Current status is %s , will ignore to load data.",
                    self.status)
                return
            self.status = DataManagerStatus.LOADING.value

        with ComputingResourceManager(
                executors_cnt=1, max_processes_cnt=settings.MAX_PROCESSES_COUNT
        ) as computing_resource_mgr:
            with computing_resource_mgr.get_executor() as executor:
                self._brief_cache.update_cache(executor)
                brief_cache_update = time.time()
                for _ in self._detail_cache.update_cache(executor):
                    update_interval = time.time() - brief_cache_update
                    logger.debug(
                        'Loading one round of detail cache taking %ss.',
                        update_interval)
                    if update_interval > 3:  # Use 3 seconds as threshold to avoid updating too often
                        self._brief_cache.update_cache(executor)
                        brief_cache_update += update_interval
                executor.wait_all_tasks_finish()
            with self._status_mutex:
                if not self._brief_cache.has_content(
                ) and not self._detail_cache.has_content():
                    self.status = DataManagerStatus.INVALID.value
                else:
                    self.status = DataManagerStatus.DONE.value

                logger.info("Load brief data end, and loader pool size is %r.",
                            self._detail_cache.loader_pool_size())
Example #12
    def _update_node_name_of_cache(self, node, new_name, update_parent=False):
        """
        Update a node name which is stored in cache.

        Args:
            node (Node): The node that will be renamed.
            new_name (str): The new name.
            update_parent (bool): Determines whether the input and output of the parent node need to be updated.
        """
        logger.debug('Update node name of cache, node(%s), new name is %s.', str(node), new_name)
        origin_name = node.name
        node.name = new_name

        # Find all nodes whose inputs and outputs need to be updated
        update_node_map = {}
        for method in ['input', 'output', 'proxy_input', 'proxy_output']:
            for target_name in getattr(node, method):
                target_node = self._get_normal_node(node_name=target_name)
                if target_node is None:
                    message = f"Node should not be None, name: {target_name}, {method}: {list(getattr(node, method))}."
                    logger.error(message)
                    continue

                update_node_map.update({target_name: target_node})

                if not update_parent:
                    continue

                slash_index = target_name.find('/')
                while slash_index != -1:
                    scope_name = target_name[:slash_index]
                    slash_index = target_name.find('/', slash_index+1)

                    if update_node_map.get(scope_name):
                        continue

                    scope_node = self._get_normal_node(node_name=scope_name)
                    if scope_node is None:
                        message = f"Can not find the scope node by scope name({scope_name}), " \
                                  f"may be this scope node has not been built."
                        logger.debug(message)
                        continue

                    update_node_map.update({scope_name: scope_node})

        # Update the input and output of the nodes
        for target_node in update_node_map.values():
            for method in ['input', 'output', 'proxy_input', 'proxy_output']:
                attr_temp = getattr(target_node, method).get(origin_name)
                if attr_temp is None:
                    # This relation does not reference the renamed node, so skip it
                    continue

                # Delete the old attribute and update new name to source node or destination node.
                getattr(target_node, f'delete_{method}')(origin_name)
                getattr(target_node, f'add_{method}')(new_name, attr_temp)

        # Delete the origin node in cache.
        self._delete_nodes_of_cache(node_names=[origin_name])
        self._cache_node(node)
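The slash-scanning loop above enumerates every ancestor scope of a node name, from the outermost scope inward. A standalone sketch of just that traversal:

def ancestor_scopes(node_name):
    """List every ancestor scope of a slash-separated node name."""
    scopes = []
    slash_index = node_name.find('/')
    while slash_index != -1:
        scopes.append(node_name[:slash_index])
        slash_index = node_name.find('/', slash_index + 1)
    return scopes

# ancestor_scopes('Default/network/conv1/BiasAdd')
# -> ['Default', 'Default/network', 'Default/network/conv1']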
Example #13
    def _find_polymeric_nodes(self):
        """Find polymeric nodes from node groups."""
        node_groups = copy.deepcopy(self._node_groups)
        for group_name, group in node_groups.items():
            if len(group) < self.MIN_POLYMERIC_NODE_COUNT:
                self._normal_nodes.update(group)
                self._node_groups.pop(group_name)
                continue

            move_node_names = []
            is_move_group = False
            for node_name, group_node in group.items():
                node_list = []
                is_in_group = False
                for dst_name in group_node.output:
                    node_tmp = self._leaf_nodes[dst_name]
                    node_list.append(node_tmp)

                start = time.time()
                run_count = 0
                visit_nodes = {}
                while node_list:
                    # Walk the outputs to check whether a node in the group leads back
                    # into the group, which would form a loop after aggregation.
                    # Example: group A contains node_a. If there is a loop such as
                    # A/node_a -> B/node_b -> A/node_b, node_a is removed from group A.
                    node_tmp = node_list[0]
                    node_list = node_list[1:]
                    visit_nodes.update({node_tmp.name: True})
                    if node_tmp in group.values():
                        is_in_group = True
                        break
                    for dst_name_tmp in node_tmp.output:
                        run_count += 1
                        node_tmp = self._leaf_nodes[dst_name_tmp]
                        if visit_nodes.get(dst_name_tmp):
                            continue
                        node_list.append(node_tmp)
                logger.debug(
                    "Find group %s node end, is_in_group: %s, use time: %s, "
                    "run count: %s.", group_name, is_in_group,
                    time.time() - start, run_count)

                if is_in_group:
                    move_node_names.append(node_name)

                if (len(group) -
                        len(move_node_names)) < self.MIN_POLYMERIC_NODE_COUNT:
                    is_move_group = True
                    break

            if is_move_group:
                self._normal_nodes.update(group)
                self._node_groups.pop(group_name)
            else:
                for name_tmp in move_node_names:
                    node_tmp = self._node_groups[group_name].pop(name_tmp)
                    self._normal_nodes.update({name_tmp: node_tmp})
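The breadth-first walk above can be read in isolation: starting from a group node's outputs, it follows output edges and reports whether the walk re-enters the group, which would create a loop once the group is aggregated. A compact sketch of that check; the helper name and argument shapes are illustrative only:

from collections import deque

def leads_back_to_group(start_output_names, leaf_nodes, group):
    """Return True if following outputs from the given names re-enters `group`."""
    queue = deque(leaf_nodes[name] for name in start_output_names)
    visited = set()
    while queue:
        node = queue.popleft()
        if node.name in visited:
            continue
        visited.add(node.name)
        if node.name in group:  # `group` maps node name -> node
            return True
        queue.extend(leaf_nodes[name] for name in node.output)
    return False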
Example #14
    def _calc_input(self, leaf_node_id_map_name, graph_proto, const_nodes_map):
        """
        Calculate the input for every leaf node.

        Args:
            leaf_node_id_map_name (dict[str, str]): Format is {'node_id': 'node_name'}.
            graph_proto (anf_ir_pb2.model_proto.graph): See anf_ir_pb2.model_proto.graph.
            const_nodes_map (dict[str, Node]): Format is {'node name': <Const node>}.
        """
        logger.debug("Start to calc input.")
        for node_def in graph_proto.node:
            node_name = leaf_node_id_map_name[node_def.name]
            node = self._leaf_nodes[node_name]
            for input_def in node_def.input:
                edge_type = EdgeTypeEnum.data
                if input_def.type == "CONTROL_EDGE":
                    edge_type = EdgeTypeEnum.control

                if const_nodes_map.get(input_def.name):
                    const_node = copy.deepcopy(const_nodes_map[input_def.name])
                    src_name = '{}/{}'.format(node.name_scope, input_def.name)
                    if not self._normal_nodes.get(src_name):
                        const_node.name = src_name
                        const_node.name_scope = node.name_scope
                        self._normal_nodes.update({src_name: const_node})
                        self._leaf_nodes.update({src_name: const_node})
                    src_node = self._leaf_nodes.get(src_name)
                else:
                    src_name = leaf_node_id_map_name.get(input_def.name)
                    if not src_name:
                        logger.warning(
                            "The input_def name '%s' in node '%s' is invalid, "
                            "will be ignore.", input_def.name, node_name)
                        continue

                    src_node = self._leaf_nodes.get(src_name)
                    if src_node is None:
                        logger.warning(
                            "The input '%s' in node '%s' is not in "
                            "leaf nodes.", src_name, node_name)
                        continue

                input_item = {
                    src_name: {
                        "shape": src_node.shape,
                        "edge_type": edge_type,
                        "scope": NodeTypeEnum.NAME_SCOPE.value
                    }
                }
                node.update_input(input_item)

            if self._normal_nodes.get(node_name):
                self._normal_nodes[node_name] = node
            else:
                group_name = self._create_group_name(node.name_scope,
                                                     node.node_type, node.name)
                self._node_groups[group_name][node.name] = node
Example #15
    def _delete_loader(self, loader_id):
        """
        Delete loader from loader pool by loader id.

        Args:
            loader_id (str): ID of loader.
        """
        if self._loader_pool.get(loader_id) is not None:
            logger.debug("delete loader %s", loader_id)
            self._loader_pool.pop(loader_id)
Example #16
    def reload_data(self):
        """
        Reload the data once.

        This function needs to be used after `start_load_data` function.
        """
        logger.debug("start to reload data")
        thread = threading.Thread(target=self._load_data_in_thread,
                                  name='reload_data_thread')
        thread.daemon = False
        thread.start()
Example #17
    def _parse_op_nodes(self, node_protos):
        """
        Parse `anf_ir_pb2.NodeProto` object, and create a normal node.

        Args:
            node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
        """
        logger.debug("Start to parse op nodes from proto.")
        for topological_index, node_proto in enumerate(node_protos):
            if not node_proto.name:
                logger.warning(
                    "Finding a node with an empty name will not save it.")
                continue

            if node_proto.op_type == "Load":
                # The Load operator needs to be renamed as it has the same name as the parameter
                node_name = Node.create_node_name(
                    scope=node_proto.scope,
                    base_name=f'{node_proto.op_type}-op{node_proto.name}')
                node_proto.full_name = node_name
            elif not node_proto.full_name or any(node_proto.full_name.lower(
            ).endswith(f'[:{plugin.value.lower()}]')
                                                 for plugin in PluginNameEnum):
                node_name = Node.create_node_name(
                    scope=node_proto.scope,
                    base_name=f'{node_proto.op_type}{node_proto.name}')
            else:
                node_name = node_proto.full_name

            # The Graphviz plug-in that the UI uses cannot handle these special characters.
            check_invalid_character(node_name)

            node = Node(name=node_name,
                        node_id=node_proto.name,
                        topological_index=topological_index)
            node.full_name = node_proto.full_name
            node.type = node_proto.op_type
            if getattr(node_proto, 'source_address', None):
                node.stack = DebuggerSource.build_stack_from_source_address(
                    node_proto.source_address)
            self._parse_attributes(node_proto.attribute, node)
            self._parse_inputs(node_proto.input, node)

            node.output_i = node_proto.output_i
            node.scope = node_proto.scope
            node.output_shape = self._get_shape_by_parse_type_proto(
                node_proto.output_type)
            node.output_nums = len(node.output_shape)
            node.output_data_type = self._get_data_type_by_parse_type_proto(
                node_proto.output_type, node)

            self._cache_node(node)
Example #18
def init_module(app):
    """
    Interface to init module.

    Args:
        app (Flask): An instance of Flask.

    """
    # Just to suppress pylint warning about unused arg.
    logger.debug("App: %s", type(app))
    DATA_MANAGER.register_brief_cache_item_updater(LineageCacheItemUpdater())
    DATA_MANAGER.start_load_data(reload_interval=int(settings.RELOAD_INTERVAL),
                                 max_threads_count=int(settings.MAX_THREADS_COUNT))
Example #19
    def _delete_nodes_of_cache(self, node_names):
        """Delete node from cache."""
        logger.debug("These nodes will be removed from the cache, node names: %s.", str(node_names))
        for name in node_names:

            if self._parameter_node_temp_cache.get(name):
                self._parameter_node_temp_cache.pop(name)
            if self._const_node_temp_cache.get(name):
                self._const_node_temp_cache.pop(name)

            node = self._get_normal_node(node_name=name)
            self._normal_node_map.pop(name)
            self._node_id_map_name.pop(node.node_id)
Example #20
def _init_app_module(app):
    """
    Init app module.

    Args:
        app (Flask): An instance of Flask.
    """
    packages = find_app_package()
    for package in packages:
        try:
            app_module = import_module(package)
            app_module.init_module(app)
        except AttributeError:
            logger.debug('[%s].init_module not exists.', package)
Example #21
    def _event_parse(event_str, latest_file_name):
        """
        Transform `Event` data to tensor_event and update it to EventsData.

        This method is static to avoid sending unnecessary objects to other processes.

        Args:
            event_str (str): Message event string in summary proto, data read from file handler.
            latest_file_name (str): Latest file name.
        """

        plugins = {
            'scalar_value': PluginNameEnum.SCALAR,
            'image': PluginNameEnum.IMAGE,
            'histogram': PluginNameEnum.HISTOGRAM,
            'tensor': PluginNameEnum.TENSOR
        }
        logger.debug("Start to parse event string. Event string len: %s.",
                     len(event_str))
        event = summary_pb2.Event.FromString(event_str)
        logger.debug("Deserialize event string completed.")

        ret_tensor_events = []
        if event.HasField('summary'):
            for value in event.summary.value:
                for plugin in plugins:
                    if not value.HasField(plugin):
                        continue
                    plugin_name_enum = plugins[plugin]
                    logger.debug("Processing plugin value: %s.",
                                 plugin_name_enum)
                    tensor_event_value = _SummaryParser._parse_summary_value(
                        value, plugin)
                    if tensor_event_value is None:
                        continue

                    tensor_event = TensorEvent(
                        wall_time=event.wall_time,
                        step=event.step,
                        tag='{}/{}'.format(value.tag, plugin_name_enum.value),
                        plugin_name=plugin_name_enum.value,
                        value=tensor_event_value,
                        filename=latest_file_name)
                    logger.debug(
                        "Tensor event generated, plugin is %s, tag is %s, step is %s.",
                        plugin_name_enum, value.tag, event.step)
                    ret_tensor_events.append(tensor_event)

        elif event.HasField('graph_def'):
            graph = MSGraph()
            graph.build_graph(event.graph_def)
            tensor_event = TensorEvent(wall_time=event.wall_time,
                                       step=event.step,
                                       tag=latest_file_name,
                                       plugin_name=PluginNameEnum.GRAPH.value,
                                       value=graph,
                                       filename=latest_file_name)
            ret_tensor_events.append(tensor_event)

        return ret_tensor_events
Example #22
def init_module(app):
    """
    Interface to init module.

    Args:
        app (Flask): An instance of Flask.

    """
    # Just to suppress pylint warning about unused arg.
    logger.debug("App: %s", type(app))
    DATA_MANAGER.register_brief_cache_item_updater(LineageCacheItemUpdater())
    # Let gunicorn load other modules first.
    time.sleep(1)

    DATA_MANAGER.start_load_data(reload_interval=settings.RELOAD_INTERVAL)
Example #23
    def get_stack_info_by_offset(self, pattern=None, limit=0, offset=0):
        """
        Get stack infos.

        Args:
            pattern (str): The pattern of stack infos. Default: None. If not given, return all stack infos.
            limit (int): The size of each page. Default: 0. If 0, there is no limitation.
            offset (int): The index of the page. Valid only when `limit` is not 0.

        Returns:
            dict, stack info objects. The format is like:
                {
                    'total': int,
                    'offset': int,
                    'stack_infos': [{<file_path>: [{'file_path': str, 'line_no': int, 'code_line': str}]]
                }

        """
        # validate params
        self.check_int('limit', limit, min_num=0, max_num=100)
        self.check_int('offset',
                       offset,
                       min_num=0,
                       max_num=len(self._stack_infos))
        validate_stack_pattern(pattern)
        if not limit and offset > 0:
            return {}
        # get filter results
        filter_res = self.get(pattern)
        if not filter_res:
            log.debug("No stack info with pattern %s", pattern)
            return {}
        merged_res = self._merge_stack_by_file_path(filter_res)
        total_size = len(merged_res)
        if not limit:
            limit = total_size
        st_index = offset * limit
        query_res = merged_res[st_index:st_index + limit]
        for stack_info in query_res:
            source_items = stack_info['items']
            stack_info['items'] = list(map(lambda x: x.to_dict(),
                                           source_items))

        return {
            'total': total_size,
            'offset': offset,
            'stack_infos': query_res
        }
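The paging logic above treats `offset` as a page index rather than an element index, so the slice starts at offset * limit. A tiny self-contained sketch of the same arithmetic:

def paginate(items, limit=0, offset=0):
    """Return one page of items; limit == 0 means no limit."""
    if not limit:
        return list(items)
    start = offset * limit
    return list(items)[start:start + limit]

# paginate(range(10), limit=3, offset=2) -> [6, 7, 8]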
Example #24
def _init_app_module(app):
    """
    Init app module.

    Args:
        app (Flask): An instance of Flask.
    """
    packages = find_app_package()
    gunicorn_logger = setup_logger("gunicorn", "error")
    for package in packages:
        try:
            app_module = import_module(package)
            gunicorn_logger.info("[%s].init_module starts.", package)
            app_module.init_module(app)
            gunicorn_logger.info("[%s].init_module ends.", package)
        except AttributeError:
            logger.debug('[%s].init_module not exists.', package)
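The loop above only requires each discovered package to expose an init_module(app) callable. A hedged sketch of what such a package-level hook could look like; the blueprint name, route, and URL prefix are made up for illustration:

# Hypothetical app package; names and the URL prefix are illustrative only.
from flask import Blueprint

BLUEPRINT = Blueprint('example_backend', __name__, url_prefix='/v1/example')

@BLUEPRINT.route('/status', methods=['GET'])
def status():
    """A trivial endpoint so the blueprint has something to serve."""
    return {'status': 'ok'}

def init_module(app):
    """Hook called by _init_app_module to register this package's routes."""
    app.register_blueprint(BLUEPRINT)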
Example #25
    def load(self):
        """
        Load all valid log files.

        When the file is reloaded, it will continue to load from where it left off.
        """
        logger.debug("Start to load data in ms data loader.")
        filenames = self.filter_valid_files()
        if not filenames:
            logger.warning("No valid files can be loaded, summary_dir: %s.", self._summary_dir)
            raise exceptions.SummaryLogPathInvalid()
        old_filenames = list(self._valid_filenames)
        self._valid_filenames = filenames
        self._check_files_deleted(filenames, old_filenames)

        for parser in self._parser_list:
            parser.parse_files(filenames, events_data=self._events_data)
Example #26
    def _parse_graph_proto_parameter(self, parameter):
        """
        Parse anf_ir_pb2.model_proto.graph.parameter, and create a parameter node.

        Args:
            parameter (anf_ir_pb2.model_proto.graph.parameter): Refer to anf_ir_pb2.model_proto.graph.parameter.

        Returns:
            Node, a `Node` object.
        """
        node = Node(name=parameter.name, node_id=parameter.name)
        node.node_type = NodeTypeEnum.PARAMETER.value
        node.shape = self._parse_type_proto(parameter.type)
        logger.debug(
            "Foreach graph proto parameters, node id: %s, node name: %s, "
            "node def name: %s", node.node_id, node.name, parameter.name)
        return node
Example #27
    def parse_files(self, executor, filenames, events_data):
        """
        Load summary file and parse file content.

        Args:
            executor (Executor): The executor instance.
            filenames (list[str]): File name list.
            events_data (EventsData): The container of event data.

        Returns:
            bool, True if all the summary files are finished loading.
        """
        summary_files = self.filter_files(filenames)
        summary_files = self.sort_files(summary_files)
        if self._latest_filename in summary_files:
            index = summary_files.index(self._latest_filename)
            summary_files = summary_files[index:]

        for filename in summary_files:
            file_path = FileHandler.join(self._summary_dir, filename)

            if filename != self._latest_filename:
                self._summary_file_handler = FileHandler(file_path, 'rb')
                self._latest_filename = filename
                self._latest_file_size = 0

            new_size = FileHandler.file_stat(file_path).size
            if new_size == self._latest_file_size:
                continue

            try:
                if not self._load_single_file(self._summary_file_handler,
                                              executor, events_data):
                    self._latest_file_size = self._summary_file_handler.offset
                else:
                    self._latest_file_size = new_size
                # Wait for data in this file to be processed to avoid loading multiple files at the same time.
                logger.debug("Parse summary file offset %d, file path: %s.",
                             self._latest_file_size, file_path)
                return False
            except UnknownError as ex:
                logger.warning(
                    "Parse summary file failed, detail: %r,"
                    "file path: %s.", str(ex), file_path)
        return True
Example #28
    def get_train_job_by_plugin(self, train_id, plugin_name):
        """
        Get a train job by train job id.

        If the given train job does not have the given plugin data, the tag list will be empty.

        Args:
            train_id (str): Get train job info by the given id.
            plugin_name (str): Get tags by given plugin.

        Returns:
            TypedDict('TrainJobEntity', {'id': str, 'name': str, 'tags': List[str]}),
                a train job object.

        """
        self._check_status_valid()
        self._check_train_job_exist(train_id, self._loader_pool)

        loader = self._get_loader(train_id)
        if loader is None:
            logger.warning(
                "No valid summary log in train job %s, "
                "or it is not in the cache.", train_id)
            return None

        name = loader.name
        data_loader = loader.data_loader

        tags = []
        try:
            events_data = data_loader.get_events_data()
            tags = events_data.list_tags_by_plugin(plugin_name)
        except KeyError:
            logger.debug(
                "Plugin name %r does not exist "
                "in train job %r, and set tags to empty list.", plugin_name,
                name)
        except AttributeError:
            logger.debug(
                "Train job %r has been deleted or it has not loaded data, "
                "and set tags to empty list.", name)

        result = dict(id=train_id, name=name, tags=tags)
        return result
Example #29
    def add_tensor_event(self, tensor_event):
        """
        Add a new tensor event to the tensors_data.

        Args:
            tensor_event (TensorEvent): Refer to `TensorEvent` object.
        """
        if not isinstance(tensor_event, TensorEvent):
            raise TypeError('Expect to get data of type `TensorEvent`.')

        tag = tensor_event.tag
        plugin_name = tensor_event.plugin_name

        if tag not in set(self._tags):
            deleted_tag = self._check_tag_out_of_spec(plugin_name)
            if deleted_tag is not None:
                if tag in self._deleted_tags:
                    logger.debug("Tag is in deleted tags: %s.", tag)
                    return
                self.delete_tensor_event(deleted_tag)

            self._tags.append(tag)

        with self._tags_by_plugin_mutex_lock[plugin_name]:
            if tag not in self._tags_by_plugin[plugin_name]:
                self._tags_by_plugin[plugin_name].append(tag)

        with self._reservoir_mutex_lock:
            if tag not in self._reservoir_by_tag:
                reservoir_size = self._get_reservoir_size(
                    tensor_event.plugin_name)
                self._reservoir_by_tag[tag] = reservoir.ReservoirFactory(
                ).create_reservoir(plugin_name, reservoir_size)

        tensor = _Tensor(wall_time=tensor_event.wall_time,
                         step=tensor_event.step,
                         value=tensor_event.value,
                         filename=tensor_event.filename)

        if self._is_out_of_order_step(tensor_event.step, tensor_event.tag):
            self.purge_reservoir_data(tensor_event.filename, tensor_event.step,
                                      self._reservoir_by_tag[tag])

        self._reservoir_by_tag[tag].add_sample(tensor)
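The reservoir created by ReservoirFactory is not part of this listing. For orientation, the sketch below shows classic reservoir sampling with the same add_sample entry point; it is illustrative only and not the project's implementation, which may keep samples in step order instead of sampling uniformly.

import random

class SimpleReservoir:
    """Keep a bounded, uniformly sampled subset of an unbounded stream."""

    def __init__(self, size):
        self._size = size
        self._samples = []
        self._seen = 0

    def add_sample(self, sample):
        self._seen += 1
        if len(self._samples) < self._size:
            self._samples.append(sample)
            return
        # Keep the new sample with probability size / seen by overwriting
        # a random slot; otherwise discard it.
        index = random.randint(0, self._seen - 1)
        if index < self._size:
            self._samples[index] = sample

    def samples(self):
        return list(self._samples)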
Example #30
    def _load_data(self):
        """This function will load data once and ignore it if the status is loading."""
        logger.info("Start to load data, reload interval: %r.",
                    self._reload_interval)
        with self._status_mutex:
            if self.status == DataManagerStatus.LOADING.value:
                logger.debug(
                    "Current status is %s , will ignore to load data.",
                    self.status)
                return
            self.status = DataManagerStatus.LOADING.value

        summaries_info = SummaryWatcher().list_summary_directories(
            self._summary_base_dir)

        basic_train_jobs = []
        for info in summaries_info:
            profiler = info['profiler']
            basic_train_jobs.append(
                _BasicTrainJob(
                    train_id=info['relative_path'],
                    abs_summary_base_dir=self._summary_base_dir,
                    abs_summary_dir=os.path.realpath(
                        os.path.join(self._summary_base_dir,
                                     info['relative_path'])),
                    create_time=info['create_time'],
                    update_time=info['update_time'],
                    profiler_dir=None
                    if profiler is None else profiler['directory'],
                ))

        self._brief_cache.update_cache(basic_train_jobs)
        self._detail_cache.update_cache(basic_train_jobs)

        if not self._brief_cache.has_content(
        ) and not self._detail_cache.has_content():
            self.status = DataManagerStatus.INVALID.value
        else:
            self.status = DataManagerStatus.DONE.value

        logger.info(
            "Load event data end, status: %r, and loader pool size is %r.",
            self.status, self._detail_cache.loader_pool_size())