def SendGraph(self, request_iterator, context):
     """Send graph into DebuggerCache."""
     log.info("Received graph.")
     reply = get_ack_reply()
     if self._status == ServerStatus.MISMATCH:
         log.info(
             "MindSpore and MindInsight versions do not match; "
             "waiting for user to terminate the service."
         )
         return reply
     serial_graph = b""
     for chunk in request_iterator:
         serial_graph += chunk.buffer
     graph = GraphProto.FromString(serial_graph)
     log.debug("Deserialize the graph %s. Receive %s nodes", graph.name,
               len(graph.node))
     graph_dict = {graph.name: graph}
     self._cache_store.get_stream_handler(Streams.GRAPH).put(
         {0: graph_dict})
     self._cache_store.get_stream_handler(Streams.GRAPH).parse_stack_infos()
     self._cache_store.get_stream_handler(
         Streams.TENSOR).get_tensor_handler_by_rank_id(0).put_const_vals(
             graph.const_vals)
     self._cache_store.get_stream_handler(
         Streams.METADATA).graph_name = graph.name
     self._record_parameter_names()
     self._status = ServerStatus.RECEIVE_GRAPH
     log.debug("Send the reply for graph.")
     return reply
    def SendMultiGraphs(self, request_iterator, context):
        """Send graph into DebuggerCache."""
        log.info("Received multi_graphs.")
        reply = get_ack_reply()
        if self._status == ServerStatus.MISMATCH:
            log.info(
                "MindSpore and MindInsight versions do not match; "
                "waiting for user to terminate the service."
            )
            return reply
        serial_graph = b""
        graph_dict = {}
        for chunk in request_iterator:
            serial_graph += chunk.buffer
            if chunk.finished:
                sub_graph = GraphProto.FromString(serial_graph)
                graph_dict[sub_graph.name] = sub_graph
                log.debug("Deserialize the graph %s. Receive %s nodes",
                          sub_graph.name, len(sub_graph.node))
                serial_graph = b""
                self._cache_store.get_stream_handler(
                    Streams.TENSOR).put_const_vals(sub_graph.const_vals)

        self._cache_store.get_stream_handler(Streams.GRAPH).put(graph_dict)
        self._record_parameter_names()
        self._status = ServerStatus.RECEIVE_GRAPH
        log.debug("Send the reply for graph.")
        return reply
 def SendTensors(self, request_iterator, context):
     """Send tensors into DebuggerCache."""
     log.info("Received tensor.")
     tensor_construct = []
     tensor_stream = self._cache_store.get_stream_handler(Streams.TENSOR)
     metadata_stream = self._cache_store.get_stream_handler(
         Streams.METADATA)
     tensor_names = []
     step = metadata_stream.step
     for tensor in request_iterator:
         tensor_construct.append(tensor)
         if tensor.finished:
              update_flag = tensor_stream.put({
                  'step': step,
                  'tensor_protos': tensor_construct
              })
             if self._received_view_cmd.get(
                     'wait_for_tensor') and update_flag:
                 # update_flag is used to avoid querying empty tensors again
                 self._received_view_cmd['wait_for_tensor'] = False
                 log.debug("Set wait for tensor flag to False.")
             tensor_construct = []
             tensor_names.append(':'.join([tensor.node_name, tensor.slot]))
     reply = get_ack_reply()
     return reply
 def _deal_with_run_cmd(self, event):
     """Deal with run cmd."""
     metadata_stream = self._cache_store.get_stream_handler(
         Streams.METADATA)
     run_cmd = event.run_cmd
     # receive step command
     if run_cmd.run_level == RunLevel.STEP.value:
         # receive pause cmd
         if not run_cmd.run_steps:
             log.debug("Pause training and wait for next command.")
             self._old_run_cmd.clear()
             # update metadata state from sending to waiting
             metadata_stream.state = ServerStatus.WAITING.value
             return None
         # receive step cmd
         left_steps = run_cmd.run_steps - 1
         event.run_cmd.run_steps = 1
         if left_steps:
             left_step_count = left_steps if left_steps > 0 else -1
             self._old_run_cmd['left_step_count'] = left_step_count
     elif run_cmd.node_name:
         self._old_run_cmd['node_name'] = run_cmd.node_name
         run_cmd.node_name = ''
     # clean watchpoint hit cache
     if run_cmd.run_level == RunLevel.RECHECK.value:
         self._cache_store.get_stream_handler(
             Streams.WATCHPOINT_HIT).clean()
         log.debug("Receive RunCMD. Clean watchpoint hit cache.")
     # update metadata state from sending to running
     metadata_stream.state = ServerStatus.RUNNING.value
     return event
    def _deal_with_run_cmd(self, event):
        """Deal with run cmd."""
        run_cmd = event.run_cmd
        # receive step command
        if run_cmd.run_level == 'step':
            # receive pause cmd
            if not run_cmd.run_steps:
                log.debug("Pause training and wait for next command.")
                self._old_run_cmd.clear()
                return None
            # receive step cmd
            left_steps = run_cmd.run_steps - 1
            event.run_cmd.run_steps = 1
            if left_steps:
                left_step_count = left_steps if left_steps > 0 else -1
                self._old_run_cmd['left_step_count'] = left_step_count
        elif run_cmd.node_name:
            self._old_run_cmd['node_name'] = run_cmd.node_name
            run_cmd.node_name = ''
        # clean watchpoint hit cache
        if run_cmd.run_level == RunLevel.RECHECK.value:
            self._cache_store.get_stream_handler(
                Streams.WATCHPOINT_HIT).clean()
            log.debug("Receive RunCMD. Clean watchpoint hit cache.")

        return event
 @staticmethod
 def _add_hit_node_info(watchpoint_hit, multi_card_graph_streams, rank_id,
                        hit):
     """Add hit node info."""
     graph_stream = multi_card_graph_streams.get_graph_handler_by_rank_id(
         rank_id)
     node_full_name = hit['name']
     graph_name = graph_stream.get_graph_id_by_full_name(node_full_name)
     if not graph_name:
         log.warning("Cannot find node %s in graph. Skip it.",
                     node_full_name)
         return
     ui_node_name = graph_stream.get_node_name_by_full_name(
         node_full_name, graph_name)
     log.debug("Receive watch point hit: %s:%s", node_full_name,
               hit['slot'])
     if not ui_node_name:
         log.info("Not support to show %s on graph.", node_full_name)
         return
     watchpoint_hit.update({
         'tensor_proto': TensorProto(node_name=node_full_name,
                                     slot=str(hit['slot'])),
         'node_name': ui_node_name,
         'graph_name': graph_name
     })
    def _deal_with_set_cmd(self, event):
        """
        Deal with set cmd.

        Args:
            event (EventReply): User command event including set_cmd.
        """
        set_cmd = event.set_cmd
        set_cmd_id = set_cmd.id
        delete = set_cmd.delete
        if not delete:
            log.info("Add watchpoint by using dbg_server.")
            watch_condition = set_cmd.watch_condition
            param_list = []
            for param in watch_condition.params:
                param_list.append(
                    self._dbg_services_module.Parameter(
                        param.name, param.disabled, param.value))
            watch_nodes = set_cmd.watch_nodes
            check_nodes = self._get_check_nodes(watch_nodes)
            log.debug("Watchpoint  %s, condition: %s, watch nodes: %s",
                      set_cmd_id, watch_condition.condition, check_nodes)
            self._dbg_service.add_watchpoint(set_cmd_id,
                                             watch_condition.condition,
                                             check_nodes, param_list)
        else:
            log.info("Remove watchpoint by using dbg_server.")
            self._dbg_service.remove_watchpoint(set_cmd_id)
 def _load_graphs(self):
     """Load graphs."""
     # the format of graphs is a list of {'rank_id': int, 'graph_protos': [GraphProto]}
     log.debug("Begin to load graphs.")
     graphs = self._data_loader.load_graphs()
     device_stream = self._cache_store.get_stream_handler(Streams.DEVICE)
     graph_per_rank = {}
     for graph in graphs:
         rank_id = graph.get('rank_id')
         graph_per_rank[rank_id] = {}
         tensor_stream_per_rank = self._cache_store.get_stream_handler(Streams.TENSOR). \
             get_tensor_handler_by_rank_id(rank_id, create_if_not_exit=True)
         for graph_proto in graph.get('graph_protos'):
             graph_per_rank[rank_id][graph_proto.name] = graph_proto
             tensor_stream_per_rank.put_const_vals(graph_proto.const_vals)
     # graph_per_rank has the format Dict[<rank_id>, Dict[<graph_name>, <GraphProto>]]
     try:
         self._cache_store.get_stream_handler(
             Streams.GRAPH).put(graph_per_rank)
         self._cache_store.get_stream_handler(
             Streams.GRAPH).parse_stack_infos()
         device_stream.add_graph_name_info(graph_per_rank)
     except DebuggerParamValueError:
         log.warning("Parse graph failed. The graph file is invalid.")
         self._cache_store.get_stream_handler(Streams.GRAPH).clean()
     self._metadata_stream.state = ServerStatus.RECEIVE_GRAPH.value
     log.debug("Finish to load graphs.")
    def _save_watchpoint_hits(self, hits):
        """Save watchpoint hits."""
        multi_card_hit_streams = self._cache_store.get_stream_handler(
            Streams.WATCHPOINT_HIT)
        multi_card_graph_streams = self._cache_store.get_stream_handler(
            Streams.GRAPH)
        watchpoint_stream = self._cache_store.get_stream_handler(
            Streams.WATCHPOINT)

        watchpoint_hits = defaultdict(list)
        for hit in hits:
            log.info(
                "Received hit: "
                "name: %s, slot: %s, condition: %s, "
                "watchpoint_id: %s, "
                "error_code: %s, rank_id: %s", hit['name'], hit['slot'],
                hit['condition'], hit['watchpoint_id'], hit['error_code'],
                hit['rank_id'])
            rank_id = hit['rank_id']
            watchpoint_hit = {}
            self._add_hit_node_info(watchpoint_hit, multi_card_graph_streams,
                                    rank_id, hit)
            if not watchpoint_hit:
                continue
            self._add_hit_watchpoint_info(watchpoint_hit, watchpoint_stream,
                                          hit)
            watchpoint_hit['error_code'] = hit['error_code']
            watchpoint_hits[rank_id].append(watchpoint_hit)
        # save hit info into cache
        multi_card_hit_streams.put(watchpoint_hits)
        self._cache_store.put_data({'receive_watchpoint_hits': True})
        log.debug("Send the watchpoint hits to DataQueue.")
    def _put_tensor_into_cache(self, tensor, step):
        """
        Put tensor into cache.

        Args:
            tensor (OpTensor): The tensor value.
            step (int): The step of tensor.

        Returns:
            bool, whether the tensor was updated successfully.
        """
        cache_tensor = self._tensors.get(tensor.name)
        if cache_tensor is None:
            cache_tensor = {}
            self._tensors[tensor.name] = cache_tensor

        old_tensor = cache_tensor.get(step)
        if old_tensor and not self._is_value_diff(old_tensor.value,
                                                  tensor.value):
            log.debug("Tensor %s of step %s has no change. Ignore it.",
                      tensor.name, step)
            return False
        cache_tensor[step] = tensor
        log.debug("Put updated tensor value for %s of step %s.", tensor.name,
                  step)
        return True
    def _get_missing_tensor_info(self, tensor_name, node_type):
        """
        Get missing tensor info.

        Args:
            tensor_name (str): The full name of Tensor.
            node_type (str): The type of the relative node.

        Returns:
            list, list of missing tensor basic information.
        """
        step = self.cur_step
        missing_tensors_info = []
        # check whether the current step value is missing
        if self._is_tensor_value_missing(tensor_name, step):
            missing_tensors_info.append(
                TensorBasicInfo(full_name=tensor_name,
                                node_type=node_type,
                                iter=''))
            log.debug("Add current step view cmd for %s", tensor_name)
        # check whether the previous step value is missing
        if node_type == NodeTypeEnum.PARAMETER.value and self._is_tensor_value_missing(
                tensor_name, step - 1):
            missing_tensors_info.append(
                TensorBasicInfo(full_name=tensor_name,
                                node_type=node_type,
                                iter='prev'))
            log.debug("Add previous view cmd for %s", tensor_name)
        return missing_tensors_info
    def reset_training_step(self, step_id):
        """
        Reset the training step.

        Args:
            step_id (int): The target step_id.

        Returns:
            dict, metadata info.
        """
        metadata_stream = self._metadata_stream
        if metadata_stream.debugger_type == DebuggerServerMode.ONLINE.value:
            log.error(
                "'step_id' cannot be changed manually in the online debugger.")
            return metadata_stream.get(['state', 'enable_recheck', 'step'])
        if step_id > metadata_stream.max_step_num:
            log.error("Invalid step_id, step_id should be less than %d.",
                      metadata_stream.max_step_num)
            raise DebuggerParamValueError("Invalid step_id.")
        metadata_stream.state = ServerStatus.SENDING.value
        metadata_stream.step = step_id
        self._cache_store.get_stream_handler(Streams.TENSOR).set_step(step_id)
        self._cache_store.clean_data()
        self._cache_store.clean_command()
        metadata_stream.enable_recheck = True
        metadata_stream.state = ServerStatus.WAITING.value
        self._cache_store.get_stream_handler(Streams.WATCHPOINT).set_outdated()
        log.debug("Send the Change_training_step CMD.")
        return metadata_stream.get(['state', 'enable_recheck', 'step'])
    def continue_training(self, params):
        """
        Send RunCMD to MindSpore.

        Args:
            params (dict): The control params.

        Returns:
            dict, metadata info.
        """
        metadata_stream = self._metadata_stream
        if metadata_stream.state != ServerStatus.WAITING.value:
            log.error("MindSpore is not ready to run. Current state is: %s",
                      metadata_stream.state)
            raise DebuggerContinueError(
                "MindSpore is not ready to run or is running currently.")
        metadata_stream.state = ServerStatus.RUNNING.value
        try:
            self._validate_continue_params(params)
            event = self._construct_run_event(params)
            self._send_watchpoints()
            self._cache_store.put_command(event)
        except MindInsightException as err:
            log.error("Failed to send run event.")
            log.exception(err)
            metadata_stream.state = ServerStatus.WAITING.value
            raise DebuggerContinueError("Failed to send run command.")
        else:
            metadata_stream.enable_recheck = False
            log.debug("Send the RunCMD to command queue.")
        return metadata_stream.get(['state', 'enable_recheck'])
    def _add_tensor_value_for_tensor_history(self, tensor_history, node_name,
                                             graph_name, rank_id):
        """
        Add tensor value for tensor_history and send ViewCMD if the tensor value is missing.

        Args:
            tensor_history (list[dict]): A list of tensor info, including name and type.
            node_name (str): The UI node name.
            graph_name (str): The graph name. Default: None.
            rank_id (int): The id of rank. Default: 0.

        Returns:
            dict, the tensor info.
        """
        tensor_stream = self.cache_store.get_stream_handler(
            Streams.TENSOR).get_tensor_handler_by_rank_id(rank_id)
        cur_step = self.cache_store.get_stream_handler(Streams.METADATA).step
        missed_tensors = tensor_stream.update_tensor_history(
            tensor_history, cur_step)
        if missed_tensors:
            view_cmd = create_view_event_from_tensor_basic_info(missed_tensors)
            self.cache_store.put_command({
                'view_cmd': view_cmd,
                'node_name': node_name,
                'graph_name': graph_name,
                'rank_id': rank_id,
                'stats': True
            })
            log.debug("Send view cmd.")
    def get_tensor_history(self, node_name, graph_name=None, depth=0):
        """
        Get the tensor history of a specified node.

        Args:
            node_name (str): The debug name of the node.
            graph_name (str): The graph_name. Default: None.
            depth (int): The number of layers the user
                wants to trace. Default: 0.

        Returns:
            dict, basic tensor history, including only tensor name, tensor type, and node type.
        """
        graph_name, node_name = self._parse_node_name(node_name, graph_name)
        graph = self._get_graph(graph_name=graph_name, node_name=node_name)
        # validate node type, scope node has no tensor history
        node_type = graph.get_node_type(node_name)
        if is_scope_type(node_type):
            log.error("Scope type node has no tensor history.")
            raise DebuggerParamValueError("Invalid leaf node name.")
        # get tensor history
        tensor_history, cur_outputs_nums = graph.get_tensor_history(
            node_name, depth)
        # add the tensor type for tensor history
        self._update_tensor_history(tensor_history[0:cur_outputs_nums],
                                    'output', graph_name)
        self._update_tensor_history(tensor_history[cur_outputs_nums:], 'input',
                                    graph_name)
        log.debug("Get %d tensors in tensor history for node <%s>.",
                  len(tensor_history), node_name)
        return {'tensor_history': tensor_history}
    def SendWatchpointHits(self, request_iterator, context):
        """Send watchpoint hits info DebuggerCache."""
        log.info("Received WatchpointHits. Left run cmd %s change to emtpy.",
                 self._old_run_cmd)
        self._old_run_cmd.clear()
        if self._cache_store.get_stream_handler(
                Streams.METADATA).state == ServerStatus.RUNNING.value:
            # if the client session is running a script, all the cached command should be cleared
            # when received watchpoint_hits.
            self._cache_store.clean_command()

        # save the watchpoint_hits data
        watchpoint_hits = []
        watchpoint_stream = self._cache_store.get_stream_handler(
            Streams.WATCHPOINT)
        graph_stream = self._cache_store.get_stream_handler(Streams.GRAPH)
        for watchpoint_hit_proto in request_iterator:
            node_full_name = watchpoint_hit_proto.tensor.node_name
            graph_name = graph_stream.get_graph_id_by_full_name(node_full_name)
            if not graph_name:
                log.warning("Cannot find node %s in graph. Skip it.",
                            node_full_name)
                continue
            ui_node_name = graph_stream.get_node_name_by_full_name(
                node_full_name, graph_name)
            log.debug("Receive watch point hit: %s", watchpoint_hit_proto)
            if not ui_node_name:
                log.info("Not support to show %s on graph.", node_full_name)
                continue
            watchpoint_hit = {
                'tensor_proto': watchpoint_hit_proto.tensor,
                'watchpoint': copy.deepcopy(
                    watchpoint_stream.get_watchpoint_by_id(
                        watchpoint_hit_proto.id)),
                'node_name': ui_node_name,
                'graph_name': graph_name
            }
            hit_params = {}
            for param in watchpoint_hit_proto.watch_condition.params:
                if param.actual_value:
                    hit_params[param.name] = param.actual_value
            for param in watchpoint_hit['watchpoint'].condition['params']:
                param['actual_value'] = hit_params.get(param['name'])
            if watchpoint_hit_proto.error_code:
                watchpoint_hit['error_code'] = watchpoint_hit_proto.error_code
            watchpoint_hits.append(watchpoint_hit)
        self._received_hit = watchpoint_hits
        reply = get_ack_reply()
        return reply
    def recheck(self):
        """
        Recheck all watchpoints.

        Returns:
            dict, metadata info.
        """
        metadata_stream = self._metadata_stream
        # validate that the backend is able to recheck watchpoints
        if not metadata_stream.enable_recheck:
            log.error("Recheck is not available.")
            raise DebuggerRecheckError("Recheck is not available.")
        metadata_stream.state = ServerStatus.SENDING.value
        metadata_stream.enable_recheck = False
        # send updated watchpoint and recheck command
        try:
            event = self._construct_run_event({'level': 'recheck'})
            self._send_watchpoints()
            self._cache_store.put_command(event)
        except MindInsightException as err:
            log.error("Failed to send recheck event.")
            log.exception(err)
            metadata_stream.state = ServerStatus.WAITING.value
            metadata_stream.enable_recheck = True
            raise DebuggerContinueError("Failed to send recheck command.")
        else:
            log.debug("Send the recheck to command queue.")
        return metadata_stream.get(['state', 'enable_recheck'])
    def _get_watch_names_by_search(self, search_nodes, target_node_name):
        """
        Get watch names according to search results.

        Args:
            search_nodes (dict): Search result.
                The format is like {'nodes': [<Search Node>]}. The <Search Node> format is like
                {'name': <UI node name>, 'type': <node type>, 'nodes': [<Search Node>]}
            target_node_name (str): Node name for UI.

        Returns:
            set[str], collection of names.
        """
        names = set()
        tmp_queue = Queue()
        tmp_queue.put(search_nodes)
        while not tmp_queue.empty():
            cur_node = tmp_queue.get()
            for node in cur_node.get('nodes'):
                node_name = node.get('name')
                if not target_node_name.startswith(node_name) or is_cst_type(
                        node.get('type')):
                    continue
                if target_node_name == node_name:
                    self._add_leaf_node_collection(node, names)
                    return names
                tmp_queue.put(node)
        # the target node name is not in search nodes.
        log.debug("node %s is not in search nodes.")
        names.add(target_node_name)
        return names
    def load_step_number(self):
        """
        Load step number in the directory.

        Returns:
            dict, the total step number in each rank id. The format is like Dict[str, int].
        """
        step_num = {}
        for rank_dir in self._rank_dirs:
            rank_id, rank_path = rank_dir.rank_id, rank_dir.path
            net_path = rank_path / self._net_name
            if not net_path.is_dir():
                log.info("No net directory under rank dir: %s", str(rank_dir))
                continue
            max_step = -1
            for graph_dir in net_path.iterdir():
                if not graph_dir.name.isdigit():
                    log.info("Invalid graph dir under net dir: %s",
                             str(net_path))
                    continue
                for iteration_dir in graph_dir.iterdir():
                    iteration_id = iteration_dir.name
                    if not iteration_id.isdigit():
                        log.info("Invalid iteration dir under graph dir: %s",
                                 str(graph_dir))
                        continue
                    max_step = max(int(iteration_id), max_step)
                log.debug("Current max iteration number is %s", max_step)
            step_num[rank_id] = max_step + 1

        return step_num
    def put(self, value):
        """
        Put value into event_cache.

        Args:
            value (dict): The event to be put into cache.
        """
        if not isinstance(value, dict):
            log.error("Dict type required when put event message.")
            raise DebuggerParamValueError(
                "Dict type required when put event message.")

        with self._lock:
            log.debug(
                "Put the %d-th message into queue. %d requests are waiting.",
                self._next_idx, len(self._pending_requests))
            cur_pos = self._next_idx
            # update next pos
            self._next_idx += 1
            if self._next_idx >= self.max_limit:
                self._next_idx = 0
                self._prev_flag = self._cur_flag
                self._cur_flag = str(uuid.uuid4())
            # set next pos
            if not value.get('metadata'):
                value['metadata'] = {}
            value['metadata']['pos'] = self.next_pos
            self._event_cache[cur_pos] = value
            # feed the value for pending requests
            self.clean_pending_requests(value)
    def _get_watch_nodes_by_search(self, node_names, search_pattern,
                                   graph_name):
        """
        Get watched leaf nodes by search name.

        Args:
            node_names (list[str]): A list of node names.
            search_pattern (dict): Get watch node with search pattern.

                - name (str): The name pattern.
                - node_category (str): The node_category.
            graph_name (str): The relative graph_name of the watched node.

        Returns:
            list[NodeBasicInfo], a list of node basic infos.
        """
        search_pattern['graph_name'] = graph_name
        search_nodes = self._graph_stream.search_nodes(search_pattern)
        watch_node_names = set()
        for name in node_names:
            names = self._get_watch_names_by_search(search_nodes, name)
            watch_node_names.update(names)
        watch_node_info = self._get_node_basic_infos(watch_node_names,
                                                     graph_name=graph_name)
        log.debug("Update nodes: %s", watch_node_info)

        return watch_node_info
    def add_node(self, node_name, node_type, full_name=''):
        """
        Add watch node to watch node tree.

        Args:
            node_name (str): The node name.
            node_type (str): The node type.
            full_name (str): The full name of node.
        """
        log.debug("Add node %s with type: %s, full_name: %s", node_name,
                  node_type, full_name)
        scope_names = node_name.split('/', 1)
        if len(scope_names) == 1:
            target_node = self.get(node_name)
            if not target_node:
                self.add(node_name,
                         node_type,
                         full_name,
                         watch_status=WatchNodeTree.TOTAL_WATCH)
            else:
                target_node.update_metadata(node_type, full_name,
                                            WatchNodeTree.TOTAL_WATCH)
            return

        scope_name, sub_names = scope_names
        sub_tree = self.get(scope_name)
        if not sub_tree:
            sub_tree = self.add(scope_name, watch_status=1)
        sub_tree.add_node(sub_names, node_type, full_name)
    def get(self, filter_condition=None):
        """
        Get the watchpoints.

        Args:
            filter_condition (Union[None, int]): The filter condition. Get a watchpoint by
                id. If None, return all watchpoints. Default: None.

        Returns:
            dict, the watchpoint list.
        """
        reply = []
        if not filter_condition:
            # get watch condition list
            for watchpoint in self._watchpoints.values():
                watchpoint_info = watchpoint.get_watch_condition_info()
                reply.append(watchpoint_info)
        else:
            self.validate_watchpoint_id(filter_condition)
            reply = [self._watchpoints.get(filter_condition)]

        log.debug("get the watch points with filter_condition:%s",
                  filter_condition)

        return {'watch_points': reply}
    def _retrieve_node(self, filter_condition):
        """
        Retrieve node info.

        Args:
            filter_condition (dict): Filter condition.

                - name (str): The name of single node.
                - graph_name (str): The relative graph_name of the node.
                - single_node (bool): If False, return the sub-layer of single node. If True, return
                    the node list from root node to single node.
                - watch_point_id (int): The id of watchpoint.

        Returns:
            dict, reply with graph.
        """
        log.debug("Retrieve node %s.", filter_condition)
        # validate node name
        node_name = filter_condition.get('name')
        graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
        graph_name = graph_stream.validate_graph_name(
            filter_condition.get('graph_name'))
        if node_name:
            # validate node name
            graph_stream.get_node_type(node_name, graph_name)
        filter_condition['single_node'] = bool(
            filter_condition.get('single_node'))
        filter_condition['graph_name'] = graph_name
        reply = self._get_nodes_info(filter_condition)
        return reply
    def put(self, value):
        """
        Put value into tensor cache. Called by grpc server.

        Args:
            value (dict): The Tensor proto message.

                - step (int): The current step of tensor.
                - tensor_proto (TensorProto): The tensor proto.
                - tensor_contents (list[byte]): The list of tensor content values.

        Returns:
            bool, whether the tensor was updated successfully.
        """
        tensor_proto = value.get('tensor_proto')
        tensor_proto.ClearField('tensor_content')
        step = value.get('step', 0)
        if tensor_proto.iter and step > 0:
            log.debug("Received previous tensor.")
            step -= 1
        tensor_content = b''.join(value.get('tensor_contents'))
        tensor = OpTensor(tensor_proto, tensor_content, step)
        flag = self._put_tensor_into_cache(tensor, step)
        log.info("Put tensor %s of step: %d, into cache. Flag: %s", tensor.name, step, flag)
        return flag
 def get_condition_collections(self, train_id):
     """Get default condition_collections"""
     metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
     condition_context = ConditionContext(metadata_stream.backend,
                                          metadata_stream.step)
     log.debug("Train_id: %s, backend: %s", train_id,
               condition_context.backend)
     return self.condition_mgr.get_all_collections(condition_context)
 def _clean_parameters(self):
     """Clean parameter cache."""
     for param in self._param_names:
         if param in self._tensors:
             params = self._tensors.pop(param)
             for step in params:
                 self._memory_mgr.release((self._rank_id, param, step))
             log.debug("Clean param %s in cache.", param)
 @staticmethod
 def _is_value_diff(old_value, new_value):
     """Check whether the old and new tensor values differ."""
     log.debug("old value type: %s, new_value type: %s", type(old_value), type(new_value))
     if old_value is None and new_value is None:
         return False
     flag = old_value != new_value
     if isinstance(flag, np.ndarray):
         return flag.any()
     return flag
    def put_data(self, value):
        """
        Set updated data to data stream.

        Args:
            value (dict): The updated data.
        """
        log.debug("Set <%d> bytes data", sys.getsizeof(value))
        return self._put(Streams.DATA, value)
    def put_command(self, cmd):
        """
        Set command to command stream.

        Args:
            cmd (EventReply): The command EventReply.
        """
        log.debug("Set command %s", cmd)
        return self._put(Streams.COMMAND, {'cmd': cmd})