def load_param_into_net(net, parameter_dict):
    """
    Loads parameters into network.

    Args:
        net (Cell): Cell network.
        parameter_dict (dict): Parameter dict.

    Raises:
        TypeError: Argument is not a Cell, or parameter_dict is not a Parameter dict.
    """
    if not isinstance(net, nn.Cell):
        logger.error("Failed to combine the net and the parameters.")
        msg = ("Argument net should be a Cell, but got {}.".format(type(net)))
        raise TypeError(msg)

    if not isinstance(parameter_dict, dict):
        logger.error("Failed to combine the net and the parameters.")
        msg = ("Argument parameter_dict should be a dict, but got {}.".format(type(parameter_dict)))
        raise TypeError(msg)

    logger.info("Execute the process of loading parameters into net.")
    net.init_parameters_data()
    param_not_load = []
    for _, param in net.parameters_and_names():
        if param.name in parameter_dict:
            new_param = parameter_dict[param.name]
            if not isinstance(new_param, Parameter):
                logger.error("Failed to combine the net and the parameters.")
                msg = ("Argument parameter_dict element should be a Parameter, but got {}.".format(type(new_param)))
                raise TypeError(msg)
            param.init_data()
            _update_param(param, new_param)
        else:
            param_not_load.append(param.name)

    if param_not_load:
        _load_dismatch_prefix_params(net, parameter_dict, param_not_load)

    logger.debug("Params not matched(in net but not in parameter_dict):")
    for param_name in param_not_load:
        logger.debug("%s", param_name)
    logger.info("Load parameter into net finished, {} parameters have not been loaded.".format(len(param_not_load)))
def _fill_image_summary(tag: str, np_value, summary_image, input_format='NCHW'):
    """
    Package the image summary.

    Args:
        tag (str): Summary tag description.
        np_value (numpy.ndarray): Summary image data.
        summary_image (Tensor): The tensor of summary.
        input_format (str): Data sort order index. Default: 'NCHW'.

    Returns:
        bool, whether the image summary is packaged successfully.
    """
    logger.debug(f"Set({tag}) the image summary value")
    if np_value.ndim != 4 or np_value.shape[1] not in (1, 3):
        logger.error(f"The value is not Image, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}")
        return False

    if np_value.ndim != len(input_format):
        logger.error(f"The tensor with dim({np_value.ndim}) can't convert the format({input_format}) because the dims differ")
        return False

    # convert the tensor format
    tensor = _convert_image_format(np_value, input_format)

    # convert the tensor dtype
    # Do not assume that user passes in values in [0, 255], use data type to detect
    scale_factor = 1
    if tensor.dtype == np.uint8:
        scale_factor = 1
    elif np.max(tensor) <= 1 and np.min(tensor) >= 0:
        scale_factor = 255
    tensor = tensor.astype(np.float32)
    tensor = (tensor * scale_factor).astype(np.uint8)

    # create the image summary
    height, width, channel, image_string = _make_image(tensor)
    summary_image.height = height
    summary_image.width = width
    summary_image.colorspace = channel
    summary_image.encoded_image = image_string
    return True
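# A standalone numpy sketch (illustrative names only) of the scale heuristic
# above: float images already in [0, 1] are rescaled to [0, 255], while uint8
# data passes through unchanged.
import numpy as np

img = np.random.rand(1, 3, 4, 4).astype(np.float32)  # NCHW, values in [0, 1)
scale = 255 if img.dtype != np.uint8 and np.max(img) <= 1 and np.min(img) >= 0 else 1
img8 = (img.astype(np.float32) * scale).astype(np.uint8)
assert img8.dtype == np.uint8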
def _updata(param):
    if param in replace:
        return replace[param]
    layout = None
    set_sliced = False
    if auto_parallel_mode:
        set_sliced = True
        if param.name not in self.parameter_layout_dict:
            logger.debug("Layout dict does not contain the key %s.", param.name)
        else:
            layout = self.parameter_layout_dict[param.name]
    new_p = param.init_data(layout, set_sliced=set_sliced)
    replace[param] = new_p
    return new_p
def record(self, step, train_network=None, plugin_filter=None):
    """
    Record the summary.

    Args:
        step (int): Represents training step number.
        train_network (Cell): The network to call the callback.
        plugin_filter (Optional[Callable[[str], bool]]): The filter function, \
            which is used to filter out plugins from being written by returning False.

    Returns:
        bool, whether the record process is successful or not.

    Examples:
        >>> with SummaryRecord(log_dir="./summary_dir", file_prefix="xxx_", file_suffix="_yyy") as summary_record:
        >>>     summary_record.record(step=2)
    """
    logger.debug("SummaryRecord step is %r.", step)
    if self._closed:
        logger.error("The record writer is closed.")
        return False
    if not isinstance(step, int) or isinstance(step, bool):
        raise ValueError("`step` should be int")
    # Set the current summary of train step
    if self.network is not None and not self.has_graph:
        graph_proto = self.network.get_func_graph_proto()
        if graph_proto is None and train_network is not None:
            graph_proto = train_network.get_func_graph_proto()
        if graph_proto is None:
            logger.error("Failed to get proto for graph")
        else:
            self._event_writer.write({'graph': [{'step': step, 'value': graph_proto}]})
            self.has_graph = True
            if not _summary_tensor_cache:
                return True
    if self._mode == 'train':
        self._add_summary_tensor_data()
    if not plugin_filter:
        self._event_writer.write(self._consume_data_pool(step))
    else:
        filtered = {}
        for plugin, datalist in self._consume_data_pool(step).items():
            if plugin_filter(plugin):
                filtered[plugin] = datalist
        self._event_writer.write(filtered)
    return True
def _parse_tag_format(tag: str):
    """
    Parse the tag.

    Args:
        tag (str): Format: xxx[:Scalar] xxx[:Image] xxx[:Tensor].

    Returns:
        Tuple, (SummaryType, summary_tag).
    """
    summary_type = SummaryType.INVALID
    summary_tag = tag
    if tag is None:
        logger.error("The tag is None")
        return summary_type, summary_tag

    # search the slice
    slice_begin = FORMAT_BEGIN_SLICE
    slice_end = FORMAT_END_SLICE
    index = tag.rfind(slice_begin)
    if index == -1:
        logger.error("The tag(%s) does not have the key slice.", tag)
        return summary_type, summary_tag

    # slice the tag
    summary_tag = tag[:index]

    # check the slice end
    if tag[-1:] != slice_end:
        logger.error("The tag(%s) end format is invalid", tag)
        return summary_type, summary_tag

    # check the type
    type_str = tag[index + 2:-1]
    logger.debug("The summary_tag is = %r", summary_tag)
    logger.debug("The type_str value is = %r", type_str)
    if type_str == FORMAT_SCALAR_STR:
        summary_type = SummaryType.SCALAR
    elif type_str == FORMAT_TENSOR_STR:
        summary_type = SummaryType.TENSOR
    elif type_str == FORMAT_IMAGE_STR:
        summary_type = SummaryType.IMAGE
    else:
        logger.error("The tag(%s) type is invalid.", tag)
        summary_type = SummaryType.INVALID
    return summary_type, summary_tag
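# Usage sketch (hedged): tags follow "<name>[:<Type>]", e.g. "loss[:Scalar]".
# Assuming FORMAT_BEGIN_SLICE == "[:" and FORMAT_END_SLICE == "]" (implied by
# the two-character offset above), the parsing reduces to:
def _demo_parse(tag):
    index = tag.rfind("[:")
    if index == -1 or tag[-1:] != "]":
        return None, tag
    return tag[index + 2:-1], tag[:index]

assert _demo_parse("loss[:Scalar]") == ("Scalar", "loss")
assert _demo_parse("conv1[:Image]") == ("Image", "conv1")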
def get_namespace_symbol(self, var: str):
    """Get symbol type and namespace and symbol."""
    if var in self.closure_namespace:
        logger.debug("in closure_namespace")
        return self.closure_namespace, var
    if var in self.global_namespace:
        logger.debug("in global_namespace")
        value = self.global_namespace[var]
        if isinstance(value, type(abs)) and self.global_namespace[var] not in convert_object_map:
            error_info = f"The builtin function '{var}' is not supported in graph mode."
            return None, var, error_info
        return self.global_namespace, var

    error_info = f"The name '{var}' is not defined."
    return None, var, error_info
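# Why isinstance(value, type(abs)) detects builtins (standalone illustration):
# type(abs) is the builtin-function type, which plain Python functions are not.
assert isinstance(len, type(abs))               # builtin -> flagged
assert not isinstance(lambda: None, type(abs))  # user function -> allowed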
def load_param_into_net(net, parameter_dict):
    """
    Loads parameters into network.

    Args:
        net (Cell): Cell network.
        parameter_dict (dict): Parameter dict.

    Raises:
        TypeError: Argument is not a Cell, or parameter_dict is not a Parameter dict.
    """
    if not isinstance(net, nn.Cell):
        logger.error("Failed to combine the net and the parameters.")
        msg = ("Argument net should be a Cell, but got {}.".format(type(net)))
        raise TypeError(msg)

    if not isinstance(parameter_dict, dict):
        logger.error("Failed to combine the net and the parameters.")
        msg = ("Argument parameter_dict should be a dict, but got {}.".format(type(parameter_dict)))
        raise TypeError(msg)

    logger.info("Execute the process of loading parameters into net.")
    param_name_net_not_have = []
    for name in parameter_dict:
        b_par_dict_have_par_of_net = False
        for _, param in net.parameters_and_names():
            if name == param.name:
                b_par_dict_have_par_of_net = True
                # Layerwise-parallel parameter data loaded from a checkpoint file
                # is complete (merged) data and needs to be split.
                if param.layerwise_parallel:
                    new_param = parameter_dict[param.name]
                    _load_tensor_for_layerwise(new_param, param)
                break
        if not b_par_dict_have_par_of_net:
            param_name_net_not_have.append(name)

    param_name_param_dict_not_have = []
    for _, param in net.parameters_and_names():
        if param.name in parameter_dict:
            new_param = parameter_dict[param.name]
            if not isinstance(new_param, Parameter):
                logger.error("Failed to combine the net and the parameters.")
                msg = ("Argument parameter_dict element should be a Parameter, but got {}.".format(type(new_param)))
                raise TypeError(msg)
            _update_param(param, new_param)
        else:
            param_name_param_dict_not_have.append(param.name)

    logger.debug("Params not matched(in net but not in parameter_dict):")
    for paramname in param_name_param_dict_not_have:
        logger.debug("%s", paramname)
    logger.debug("Params not matched(in parameter_dict but not in net):")
    for paramname in param_name_net_not_have:
        logger.debug("%s", paramname)
    logger.info("Load parameter into net process finished.")
def package_summary_event(data_id, step):
    """
    Package the summary to event protobuffer.

    Args:
        data_id (Number): Summary data id.
        step (Number): The record step index.

    Returns:
        Summary, the summary event.
    """
    data_list = get_summary_data(data_id)
    if data_list is None:
        logger.error("The step(%r) does not have record data.", step)
        data_list = []  # guard: avoid iterating over None below
    del_summary_data(data_id)
    # create the event of summary
    summary_event = Event()
    summary = summary_event.summary
    for value in data_list:
        tag = value["name"]
        data = value["data"]
        summary_type = value["type"]
        # get the summary type and parse the tag
        if summary_type is SummaryType.SCALAR:
            logger.debug("Now process Scalar summary, tag = %r", tag)
            summary_value = summary.value.add()
            summary_value.tag = tag
            summary_value.scalar_value = _get_scalar_summary(tag, data)
        elif summary_type is SummaryType.TENSOR:
            logger.debug("Now process Tensor summary, tag = %r", tag)
            summary_value = summary.value.add()
            summary_value.tag = tag
            summary_tensor = summary_value.tensor
            _get_tensor_summary(tag, data, summary_tensor)
        elif summary_type is SummaryType.IMAGE:
            logger.debug("Now process Image summary, tag = %r", tag)
            summary_value = summary.value.add()
            summary_value.tag = tag
            summary_image = summary_value.image
            _get_image_summary(tag, data, summary_image, MS_IMAGE_TENSOR_FORMAT)
        elif summary_type is SummaryType.HISTOGRAM:
            logger.debug("Now process Histogram summary, tag = %r", tag)
            summary_value = summary.value.add()
            summary_value.tag = tag
            summary_histogram = summary_value.histogram
            _fill_histogram_summary(tag, data, summary_histogram)
        else:
            # The data is invalid, skip it
            logger.error("Summary type is invalid, tag = %r", tag)
            continue

    summary_event.wall_time = time.time()
    summary_event.step = int(step)
    return summary_event
def package_summary_event(data_list, step):
    """
    Package the summary to event protobuffer.

    Args:
        data_list (list): Summary data list.
        step (Number): The record step index.

    Returns:
        Summary, the summary event.
    """
    # create the event of summary
    summary_event = Event()
    summary = summary_event.summary
    summary_event.wall_time = time.time()
    summary_event.step = int(step)
    for value in data_list:
        summary_type = value["_type"]
        data = value["data"]
        tag = value["name"]
        logger.debug("Now process %r summary, tag = %r", summary_type, tag)
        summary_value = summary.value.add()
        summary_value.tag = tag
        # get the summary type and parse the tag
        if summary_type == 'Scalar':
            if not _fill_scalar_summary(tag, data, summary_value):
                del summary.value[-1]
        elif summary_type == 'Tensor':
            _fill_tensor_summary(tag, data, summary_value.tensor)
        elif summary_type == 'Image':
            if not _fill_image_summary(tag, data, summary_value.image, MS_IMAGE_TENSOR_FORMAT):
                del summary.value[-1]
        elif summary_type == 'Histogram':
            _fill_histogram_summary(tag, data, summary_value.histogram)
        else:
            # The data is invalid, skip it
            logger.error("Summary type(%r) is invalid, tag = %r", summary_type, tag)
            del summary.value[-1]
    return summary_event
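# The add-then-roll-back pattern above in isolation (plain-list stand-in for the
# protobuf repeated field): a slot is appended first and dropped with
# `del seq[-1]` when the fill step reports failure.
seq = []
seq.append("slot")
fill_succeeded = False  # stand-in for _fill_scalar_summary(...) returning False
if not fill_succeeded:
    del seq[-1]
assert not seq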
def parse(self):
    """Parse the function or method."""
    logger.debug("fn = %r", self.fn)
    tree = None
    if isinstance(self.fn, (types.FunctionType, types.MethodType)):
        original_src = inspect.getsource(self.fn)
        hexstr = hashlib.sha256(original_src.encode()).hexdigest()
        tree = Parser.ast_cache.get(hexstr)
        if not tree:
            src = dedent(original_src)
            self.col_offset = \
                len(original_src.split('\n')[0]) - len(src.split('\n')[0])
            logger.debug("get source = %s", src)
            tree = asttokens.ASTTokens(src, parse=True).tree
            Parser.ast_cache[hexstr] = tree
    else:
        logger.error("Fn type is invalid")
    return tree
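# Standalone sketch of the column-offset bookkeeping above: dedent() strips the
# common leading indentation, and the offset is the number of stripped columns.
from textwrap import dedent

original_src = "    def f():\n        return 1\n"
src = dedent(original_src)
col_offset = len(original_src.split('\n')[0]) - len(src.split('\n')[0])
assert col_offset == 4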
def get_object_key(obj):
    """Return the function key: module + name."""
    obj_key = ""
    if hasattr(obj, "__name__"):
        if hasattr(obj, "cell_init_args"):
            obj_key = "%s_ID" % (str(obj.__class__.__name__) + str(obj.__name__) + obj.cell_init_args)
        obj_id = "%s_ID%d" % (str(obj.__class__.__name__) + str(obj.__name__), id(obj))
    else:
        if hasattr(obj, "cell_init_args"):
            obj_key = "%s_ID" % (str(obj.__class__.__name__) + obj.cell_init_args)
        obj_id = "%s_ID%d" % (str(obj.__class__.__name__), id(obj))
    logger.debug("obj_key %s obj_id = %s", obj_key, obj_id)

    # methods of different instances can share the same id, so bind the instance id
    if isinstance(obj, types.MethodType):
        method_instance = obj.__self__
        instance_id = "%s_ID%d" % (str(method_instance.__class__.__name__), id(method_instance))
        obj_id = instance_id + obj_id + str(obj.__hash__())
    return obj_id, obj_key
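# Why the owning instance id is folded into the key (standalone illustration):
# bound methods of different instances wrap the same underlying function object,
# so the function alone cannot uniquely identify them.
class _Demo:
    def f(self):
        pass

_a, _b = _Demo(), _Demo()
assert _a.f.__func__ is _b.f.__func__  # same function object
assert id(_a) != id(_b)                # the instance id disambiguates the key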
def record(self, step, train_network=None):
    """
    Record the summary.

    Args:
        step (int): Represents training step number.
        train_network (Cell): The network that called the callback.

    Returns:
        bool, whether the record process is successful or not.

    Examples:
        >>> with SummaryRecord(log_dir="./summary_dir", file_prefix="xxx_", file_suffix="_yyy") as summary_record:
        >>>     summary_record.record(step=2)
    """
    logger.debug("SummaryRecord step is %r.", step)
    if self._closed:
        logger.error("The record writer is closed.")
        return False
    if not isinstance(step, int) or isinstance(step, bool):
        raise ValueError("`step` should be int")
    # Set the current summary of train step
    if self.network is not None and not self.has_graph:
        graph_proto = self.network.get_func_graph_proto()
        if graph_proto is None and train_network is not None:
            graph_proto = train_network.get_func_graph_proto()
        if graph_proto is None:
            logger.error("Failed to get proto for graph")
        else:
            self._event_writer.write({'graph': [{'step': step, 'value': graph_proto}]})
            self.has_graph = True
            if not _summary_tensor_cache:
                return True
    if self._mode == 'train':
        self._add_summary_tensor_data()
    self._event_writer.write(self._consume_data_pool(step))
    return True
def _make_directory(path: str):
    """Make directory."""
    if path is None or not isinstance(path, str) or path.strip() == "":
        logger.error("The path(%r) is invalid type.", path)
        raise TypeError("Input path is invalid type")

    path = os.path.realpath(path)
    logger.debug("The abs path is %r", path)

    if os.path.exists(path):
        real_path = path
    else:
        logger.debug("The directory(%s) doesn't exist, will create it", path)
        try:
            os.makedirs(path, exist_ok=True)
            real_path = path
        except PermissionError as e:
            logger.error("No write permission on the directory(%r), error = %r", path, e)
            raise TypeError("No write permission on the directory.")
    return real_path
def _fill_histogram_summary(tag: str, np_value: np.ndarray, summary) -> None:
    """
    Package the histogram summary.

    Args:
        tag (str): Summary tag description.
        np_value (np.ndarray): Summary data.
        summary (summary_pb2.Summary.Histogram): Summary histogram data.
    """
    logger.debug("Set(%r) the histogram summary value", tag)
    # Default bucket for tensor with no valid data.
    ma_value = np.ma.masked_invalid(np_value)
    total, valid = np_value.size, ma_value.count()
    invalids = []
    for isfn in np.isnan, np.isposinf, np.isneginf:
        if total - valid > sum(invalids):
            count = np.count_nonzero(isfn(np_value))
            invalids.append(count)
        else:
            invalids.append(0)

    summary.count = total
    summary.nan_count, summary.pos_inf_count, summary.neg_inf_count = invalids
    if not valid:
        logger.warning('There are no valid values in the ndarray(size=%d, shape=%s)', total, np_value.shape)
        # summary.{min, max, sum} are 0s by default, no need to explicitly set
    else:
        summary.min = ma_value.min()
        summary.max = ma_value.max()
        summary.sum = ma_value.sum()
        bins = _calc_histogram_bins(valid)
        range_ = summary.min, summary.max
        hists, edges = np.histogram(np_value, bins=bins, range=range_)

        for hist, edge1, edge2 in zip(hists, edges, edges[1:]):
            bucket = summary.buckets.add()
            bucket.width = edge2 - edge1
            bucket.count = hist
            bucket.left = edge1
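# Standalone numpy sketch of the same bucketing: mask NaN/Inf for the statistics,
# then histogram the raw values over the valid (min, max) range (names here are
# illustrative only).
import numpy as np

values = np.array([0.1, 0.9, np.nan, np.inf, 0.5])
masked = np.ma.masked_invalid(values)
hists, edges = np.histogram(values, bins=2, range=(masked.min(), masked.max()))
# hists[i] counts values in [edges[i], edges[i + 1]); NaN/Inf fall outside the range.
assert hists.sum() == 3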
def expand_expr_statement(self, node):
    """
    Process the expr statement and expand it.

    Returns:
        tuple, (True, expr.value, x)/(False, None, None).
    """
    if isinstance(node, ast.Expr) and hasattr(node, "value"):
        expr_value = node.value
        if isinstance(expr_value, ast.Call):
            func = expr_value.func
            if isinstance(func, ast.Attribute) and \
                    hasattr(func, "attr") and \
                    hasattr(func, "value"):
                method = func.attr
                target = func.value
                if method in parse_expr_statement_white_list:
                    logger.debug("Expand expr, target:%s, method:%s", target, method)
                    return True, expr_value, target
        return True, expr_value
    return False, None, None
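import ast

# Standalone illustration: for an expression statement such as "x.append(1)",
# the method name lives at node.value.func.attr and the call target at
# node.value.func.value, which is exactly what the walk above extracts.
node = ast.parse("x.append(1)").body[0]
assert isinstance(node, ast.Expr)
assert node.value.func.attr == "append"
assert isinstance(node.value.func.value, ast.Name) and node.value.func.value.id == "x"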
def test_cifar10():
    """
    dataset parameter
    """
    logger.info("Test dataset parameter")
    data_dir_10 = "../data/dataset/testCifar10Data"
    num_repeat = 2
    batch_size = 32
    limit_dataset = 100
    # apply dataset operations
    data1 = ds.Cifar10Dataset(data_dir_10, num_samples=limit_dataset)
    data1 = data1.repeat(num_repeat)
    data1 = data1.batch(batch_size, True)
    num_epoch = 5
    # iter1 will always assume there is a next epoch and never shutdown.
    iter1 = data1.create_tuple_iterator()
    epoch_count = 0
    sample_count = 0
    for _ in range(num_epoch):
        row_count = 0
        for _ in iter1:
            # in this example, each dictionary has keys "image" and "label"
            row_count += 1
        assert row_count == int(limit_dataset * num_repeat / batch_size)
        logger.debug("row_count: %d", row_count)
        epoch_count += 1
        sample_count += row_count

    assert epoch_count == num_epoch
    logger.debug("total epochs: %d", epoch_count)
    assert sample_count == int(limit_dataset * num_repeat / batch_size) * num_epoch
    logger.debug("total sample: %d", sample_count)
def _convert_function_arguments(fn, *args):
    """
    Process the fn default parameters.

    Args:
        fn (Function): The function to be parsed.
        args (tuple): The parameters of the function.
    """
    arguments_dict = OrderedDict()
    parse_method = None
    if isinstance(fn, (types.FunctionType, types.MethodType)):
        parse_method = fn.__name__
        index = 0
        for value in args:
            arguments_dict[f'arg{index}'] = value
            index = index + 1
        logger.debug("fn(%r) full parameters dict is: %r", fn, arguments_dict)
        converted = True
    else:
        logger.warning("Find error: fn is not a function or method")
        converted = False
    return converted, arguments_dict, parse_method
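from collections import OrderedDict

# The positional arguments end up keyed as arg0, arg1, ... (standalone
# illustration of the packing loop above).
args = (1, "x")
packed = OrderedDict((f"arg{i}", v) for i, v in enumerate(args))
assert list(packed.items()) == [("arg0", 1), ("arg1", "x")]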
def load_parameter_slice(self, params):
    """
    Replace parameters with sliced tensors by parallel strategies.

    Please refer to the usage in source code of `mindspore.common._Executor.compile`.

    Args:
        params (dict): The parameters dictionary used for init data graph.
    """
    if params is None:
        for key in self.parameters_dict():
            tensor = self.parameters_dict()[key].data
            if key not in self.parameter_layout_dict:
                logger.info("layout dict does not contain the key %s", key)
                continue
            if self.parameters_dict()[key].sliced:
                logger.debug("Param %s is already sliced.", key)
                continue
            layout = self.parameter_layout_dict[key]
            new_tensor = _load_tensor_by_layout(tensor, layout)
            self.parameters_dict()[key].set_parameter_data(new_tensor)
            self.parameters_dict()[key].sliced = True
    elif isinstance(params, OrderedDict):
        for key in params:
            tensor = params[key].data
            if key not in self.parameter_layout_dict:
                logger.info("layout dict does not contain the key %s", key)
                continue
            if params[key].sliced:
                logger.debug("Param %s is already sliced.", key)
                continue
            layout = self.parameter_layout_dict[key]
            new_tensor = _load_tensor_by_layout(tensor, layout)
            params[key].set_parameter_data(new_tensor)
            params[key].sliced = True
    else:
        raise TypeError('Parameters need OrderedDict type, but got {}'.format(type(params)))
def _load_dismatch_prefix_params(net, parameter_dict, param_not_load):
    """When some net parameters were not loaded, try to continue loading by stripping a common prefix."""
    prefix_name = ""
    longest_name = param_not_load[0]
    while prefix_name != longest_name and param_not_load:
        logger.debug("Count: {} parameters have not been loaded, try to load continue.".format(len(param_not_load)))
        longest_name = sorted(param_not_load, key=len, reverse=True)[0]

        prefix_name = longest_name
        for net_param_name in param_not_load:
            for dict_name in parameter_dict:
                if dict_name.endswith(net_param_name):
                    tmp_name = dict_name[:-len(net_param_name)]
                    prefix_name = prefix_name if len(prefix_name) < len(tmp_name) else tmp_name

        if prefix_name != longest_name:
            logger.info("Remove parameter prefix name: {}, continue to load.".format(prefix_name))
            for _, param in net.parameters_and_names():
                new_param_name = prefix_name + param.name
                if param.name in param_not_load and new_param_name in parameter_dict:
                    new_param = parameter_dict[new_param_name]
                    _update_param(param, new_param)
                    param_not_load.remove(param.name)
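# Standalone sketch of the prefix inference (illustrative names): for every
# unloaded net parameter, collect checkpoint keys that end with it and keep the
# shortest leftover prefix; the load is then retried with that prefix prepended.
ckpt_keys = ["backbone.conv1.weight", "backbone.fc.bias"]
net_names = ["conv1.weight", "fc.bias"]
candidates = [k[:-len(n)] for n in net_names for k in ckpt_keys if k.endswith(n)]
assert min(candidates, key=len) == "backbone."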
def _get_image_summary(tag: str, np_value, summary_image, input_format='NCHW'):
    """
    Package the image summary.

    Args:
        tag (str): Summary tag description.
        np_value (numpy.ndarray): Summary image data.
        summary_image (Tensor): The tensor of summary.
        input_format (str): Data sort order index. Default: 'NCHW'.

    Returns:
        Summary, return image summary content.
    """
    logger.debug("Set(%r) the image summary value", tag)
    if np_value.ndim != 4:
        logger.error("The value is not Image, tag = %r, Value = %r", tag, np_value)

    # convert the tensor format
    tensor = _convert_image_format(np_value, input_format)

    # convert the tensor dtype
    # Do not assume that user passes in values in [0, 255], use data type to detect
    scale_factor = 1
    if tensor.dtype == np.uint8:
        scale_factor = 1
    elif np.max(tensor) <= 1 and np.min(tensor) >= 0:
        scale_factor = 255
    tensor = tensor.astype(np.float32)
    tensor = (tensor * scale_factor).astype(np.uint8)

    # create the image summary
    height, width, channel, image_string = _make_image(tensor)
    summary_image.height = height
    summary_image.width = width
    summary_image.colorspace = channel
    summary_image.encoded_image = image_string
    return summary_image
def _fill_scalar_summary(tag: str, np_value, summary):
    """
    Package the scalar summary.

    Args:
        tag (str): Summary tag description.
        np_value (Object): Scalar object.

    Returns:
        bool, whether the scalar summary is packaged successfully.
    """
    logger.debug(f"Set({tag}) the scalar summary value")
    if np_value.size == 1:
        # is scalar
        summary.scalar_value = np_value.item()
        return True
    if np_value.size > 1:
        logger.warning(
            f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}")
        summary.scalar_value = next(np_value.flat).item()
        return True
    logger.error(f"There are no values inside the tensor, tag = {tag}, size = {np_value.size}")
    return False
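import numpy as np

# next(np_value.flat).item() takes the first element when a non-scalar tensor is
# handed to a scalar summary (standalone illustration):
arr = np.array([[3.0, 4.0]])
assert arr.size > 1
assert next(arr.flat).item() == 3.0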
def make_directory(path: str):
    """Make directory."""
    if path is None or not isinstance(path, str) or path.strip() == "":
        logger.error("The path(%r) is invalid type.", path)
        raise TypeError("Input path is invalid type")

    # convert the relative paths
    path = os.path.realpath(path)
    logger.debug("The abs path is %r", path)

    # check whether the path exists and has write permissions
    if os.path.exists(path):
        real_path = path
    else:
        # All exceptions need to be caught, because directory creation may hit limits (e.g. permissions)
        logger.debug("The directory(%s) doesn't exist, will create it", path)
        try:
            os.makedirs(path, exist_ok=True)
            real_path = path
        except PermissionError as e:
            logger.error("No write permission on the directory(%r), error = %r", path, e)
            raise TypeError("No write permission on the directory.")
    return real_path
def send_res(self, res, keep_format=True):
    """
    Send result to remote.

    Args:
        res: The result to send.
        keep_format (bool): If True, escape whitespace with visible markers so the
            format can be restored on the remote side; otherwise strip it.
    """
    logger.debug(f"[OUT] {str(res)}")
    if keep_format:
        res_str = str(res).replace('\n', '[LF]').replace('\r', '[CR]').replace(' ', '[SP]')
    else:
        res_str = str(res).replace('\n', '').replace('\r', '').replace(' ', '')

    tag = '[~]'  # The same as client kTAG
    # No longer written by print(tag + res_str, flush=True)
    try:
        self.fout.write(tag + res_str + "\n")
        self.fout.flush()
    except BrokenPipeError as err:
        logger.info(f"[TRACE] Write {str(err)}")
        self.exit()
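# The keep_format escaping in isolation: whitespace is replaced with visible
# markers ([LF]/[CR]/[SP]) so the remote client can restore the original layout.
res = "a b\nc"
encoded = str(res).replace('\n', '[LF]').replace('\r', '[CR]').replace(' ', '[SP]')
assert encoded == "a[SP]b[LF]c"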
def _make_directory(path):
    """Make directory."""
    real_path = None
    if path is None or not isinstance(path, str) or path.strip() == "":
        raise ValueError(f"Input path `{path}` is invalid type")

    # convert the relative paths
    path = os.path.realpath(path)
    logger.debug("The absolute path is %r", path)

    # check whether the path already exists and has write permissions
    if os.path.exists(path):
        real_path = path
    else:
        # All exceptions need to be caught, because directory creation may hit limits (e.g. permissions)
        logger.debug("The directory(%s) doesn't exist, will create it", path)
        try:
            os.makedirs(path)
            real_path = path
        except PermissionError as e:
            logger.error(f"No write permission on the directory `{path}`, error = {e}")
            raise ValueError(f"No write permission on the directory `{path}`.")
    return real_path
def resolve_symbol(namespace, symbol):
    """
    Resolve a symbol.

    Note:
        The function can't be obtained when it is a closure, so save the fn on the namespace.

    Args:
        namespace (Object): Symbol's namespace.
        symbol (str): The symbol to resolve.

    Returns:
        Object, resolve result of symbol.
    """
    # All exceptions need to be caught in this function
    try:
        resolve_ = namespace[symbol]

        # list and dict are not hashable and cannot be keys of the map, just return the result
        if isinstance(resolve_, (list, dict)):
            return resolve_

        # dataclass may not be hashable
        if getattr(resolve_, "__hash__") is None:
            return resolve_

        # If need trope the obj
        if resolve_ in convert_object_map:
            resolve_ = convert_object_map.get(resolve_)
            logger.debug("convert resolve = %r", resolve_)
            if resolve_ == NO_IMPLEMENT:
                raise NotImplementedError("not implemented for ", str(symbol))
    except Exception as e:
        if isinstance(e, NotImplementedError):
            raise e
        resolve_ = None
        logger.debug("resolve exception occurred, value = %r", e)
        logger.debug("resolve type is invalid, namespace = %s, symbol = %s", namespace.__str__(), symbol)
    if isinstance(resolve_, _MindSporeFunction):
        logger.debug("resolve class _MindSporeFunction, resolve fn instead.")
        resolve_ = resolve_.fn
    return resolve_
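# Why list/dict short-circuit before the convert_object_map lookup (standalone
# illustration): unhashable values cannot be dictionary keys, so the membership
# test `resolve_ in convert_object_map` would raise TypeError for them.
try:
    _ = [1, 2] in {}
except TypeError as err:
    assert "unhashable" in str(err)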
def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter, source_path):
    """
    Init timeline metadata, adding all collected info.

    Args:
        all_reduce_info (list[list]): The metadata of AllReduce operator.
        framework_info (dict): The framework metadata.
        aicpu_info (dict): The metadata of AI CPU operator.
        min_cycle_counter (float): The minimum cycle counter of the timeline.
        source_path (str): The source profiling data path.
    """
    if min_cycle_counter == float('inf'):
        min_cycle_counter = 0

    logger.info('Initiating timeline...')
    timeline_list = self._load_timeline_data()
    cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._device_id)
    cpu_timeline_list = cpu_timeline_generator.get_timeline_data()
    if cpu_timeline_list:
        self._clock_synchronize_to_host(timeline_list, source_path)
        timeline_list.extend(cpu_timeline_list)
        timeline_list.sort(key=lambda x: float(x[2]))
    self._timeline_summary['op_exe_times'] = len(timeline_list)

    # Add AllReduce info to timeline temp list and sort by start time.
    if all_reduce_info:
        logger.debug('AllReduce info found. Start adding info into timeline...')
        timeline_list.extend(all_reduce_info)
        timeline_list.sort(key=lambda x: float(x[2]))

    # Add AI CPU data into timeline temp list and sort by start time.
    aicpu_data = aicpu_info.get('info')
    if aicpu_data:
        timeline_list.extend(aicpu_data)
        timeline_list.sort(key=lambda x: float(x[2]))
        self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
        self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
        self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
        self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)

    # Init a dict for counting the num of streams.
    stream_count_dict = {}
    for timeline in timeline_list:
        self._parse_timeline_data(timeline, min_cycle_counter)
        # Updating the collection of streams.
        if len(timeline) == 4:
            self._update_num_of_streams(timeline, stream_count_dict)

    # Get framework metadata.
    framework_obj_list = framework_info.get('object')
    # The length of list is the number of operators.
    self._timeline_summary['num_of_ops'] += len(framework_obj_list)
    self._add_framework_info(framework_obj_list)
    logger.info('Finished adding info into timeline...')

    # Update timeline summary info
    self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
def compile(self, obj, *args, phase='predict', params=None, do_convert=True):
    """
    Compiles graph.

    Args:
        obj (Function/Cell): The function or cell instance need compile.
        args (tuple): Function or cell input arguments.
        phase (str): The name of compile phase. Default: 'predict'.
        params (OrderedDict): The parameters dictionary used for init data graph. Default: None.
        do_convert (bool): When set to True, convert ME graph to GE graph after compiling graph.

    Return:
        Str, the full phase of the cell.
        Bool, if the graph has been compiled before, return False, else return True.
    """
    obj.check_names()
    args_names, args_list = _generate_pip_args(obj, *args)
    dic = dict(zip(args_names, args_list))
    key = generate_key(phase, dic)
    self.phase_prefix = str(key[1])
    if phase == 'export':
        phase = phase + '.' + str(obj.create_time)
    else:
        phase = self.phase_prefix + phase + '.' + str(obj.create_time)
    enable_debug_runtime = context.get_context("enable_debug_runtime")
    enable_ge = context.get_context("enable_ge")
    use_vm = not enable_ge or (enable_debug_runtime and context.get_context("mode") == context.PYNATIVE_MODE)

    if phase in self.compile_cache.keys():
        logger.debug("%r graph has existed.", phase)
        return phase, False

    result = self._executor.compile(obj, args_list, phase, use_vm)
    self.compile_cache[phase] = phase
    if not result:
        raise RuntimeError("Executor compile failed.")
    graph = self._executor.get_func_graph(phase)

    if graph is None:
        logger.error("%r graph compile failed.", phase)
    if not do_convert:
        return phase, True

    if not enable_debug_runtime or enable_ge:
        if _get_parallel_mode() in ["auto_parallel", "semi_auto_parallel"]:
            obj.parameter_layout_dict = self._executor.get_parameter_layout(phase)
            obj.load_parameter_slice(params)

    # the following GE init process is not needed when use vm or ms backend
    if enable_ge:
        # decide whether to sink based on whether the inputs is virtual or not
        if args_list and isinstance(args_list[0], Tensor) and args_list[0].virtual_flag:
            _set_dataset_mode_config('sink')
        else:
            _set_dataset_mode_config('normal')

        self._build_data_graph(obj, params, phase)

        if "export" not in phase:
            init_phase = "init_subgraph" + "." + str(obj.create_time)
            _exec_init_graph(obj, init_phase)
    elif not enable_ge and "export" in phase:
        self._build_data_graph(obj, params, phase)

    return phase, True
def _fill_histogram_summary(tag: str, np_value: np.array, summary_histogram) -> None:
    """
    Package the histogram summary.

    Args:
        tag (str): Summary tag description.
        np_value (np.array): Summary data.
        summary_histogram (summary_pb2.Summary.Histogram): Summary histogram data.
    """
    logger.debug("Set(%r) the histogram summary value", tag)
    # Default bucket for tensor with no valid data.
    default_bucket_left = -0.5
    default_bucket_width = 1.0

    if np_value.size == 0:
        bucket = summary_histogram.buckets.add()
        bucket.left = default_bucket_left
        bucket.width = default_bucket_width
        bucket.count = 0

        summary_histogram.nan_count = 0
        summary_histogram.pos_inf_count = 0
        summary_histogram.neg_inf_count = 0

        summary_histogram.max = 0
        summary_histogram.min = 0
        summary_histogram.sum = 0
        summary_histogram.count = 0
        return

    summary_histogram.nan_count = np.count_nonzero(np.isnan(np_value))
    summary_histogram.pos_inf_count = np.count_nonzero(np.isposinf(np_value))
    summary_histogram.neg_inf_count = np.count_nonzero(np.isneginf(np_value))
    summary_histogram.count = np_value.size

    masked_value = np.ma.masked_invalid(np_value)
    tensor_max = masked_value.max()
    tensor_min = masked_value.min()
    tensor_sum = masked_value.sum()

    # No valid value in tensor.
    if tensor_max is np.ma.masked:
        bucket = summary_histogram.buckets.add()
        bucket.left = default_bucket_left
        bucket.width = default_bucket_width
        bucket.count = 0

        summary_histogram.max = np.nan
        summary_histogram.min = np.nan
        summary_histogram.sum = 0
        return

    bin_number = _calc_histogram_bins(masked_value.count())
    counts, edges = np.histogram(np_value, bins=bin_number, range=(tensor_min, tensor_max))

    for ind, count in enumerate(counts):
        bucket = summary_histogram.buckets.add()
        bucket.left = edges[ind]
        bucket.width = edges[ind + 1] - edges[ind]
        bucket.count = count

    summary_histogram.max = tensor_max
    summary_histogram.min = tensor_min
    summary_histogram.sum = tensor_sum
def load_param_into_net(net, parameter_dict, strict_load=False):
    """
    Loads parameters into network.

    Args:
        net (Cell): Cell network.
        parameter_dict (dict): Parameter dictionary.
        strict_load (bool): Whether to load the parameters into net strictly. If False, parameters
            in parameter_dict with a matching suffix will be loaded into net. Default: False.

    Returns:
        List, the parameter names that are not loaded into the net.

    Raises:
        TypeError: Argument is not a Cell, or parameter_dict is not a Parameter dictionary.

    Examples:
        >>> net = Net()
        >>> ckpt_file_name = "./checkpoint/LeNet5-1_32.ckpt"
        >>> param_dict = load_checkpoint(ckpt_file_name, filter_prefix="conv1")
        >>> param_not_load = load_param_into_net(net, param_dict)
        >>> print(param_not_load)
        ['conv1.weight']
    """
    if not isinstance(net, nn.Cell):
        logger.error("Failed to combine the net and the parameters.")
        msg = ("Argument net should be a Cell, but got {}.".format(type(net)))
        raise TypeError(msg)

    if not isinstance(parameter_dict, dict):
        logger.error("Failed to combine the net and the parameters.")
        msg = ("Argument parameter_dict should be a dict, but got {}.".format(type(parameter_dict)))
        raise TypeError(msg)

    strict_load = Validator.check_bool(strict_load)
    logger.info("Execute the process of loading parameters into net.")
    net.init_parameters_data()
    param_not_load = []
    for _, param in net.parameters_and_names():
        if param.name in parameter_dict:
            new_param = parameter_dict[param.name]
            if not isinstance(new_param, Parameter):
                logger.error("Failed to combine the net and the parameters.")
                msg = ("Argument parameter_dict element should be a Parameter, but got {}.".format(type(new_param)))
                raise TypeError(msg)
            _update_param(param, new_param)
        else:
            param_not_load.append(param.name)

    if param_not_load and not strict_load:
        _load_dismatch_prefix_params(net, parameter_dict, param_not_load)

    logger.debug("Params not matched(in net but not in parameter_dict):")
    for param_name in param_not_load:
        logger.debug("%s", param_name)

    logger.info("Loading parameters into net is finished.")
    if param_not_load:
        logger.warning("{} parameters in the net are not loaded.".format(len(param_not_load)))
    return param_not_load
def compile(self, obj, *args, phase='predict', do_convert=True, auto_parallel_mode=False):
    """
    Compiles graph.

    Args:
        obj (Function/Cell): The function or cell instance need compile.
        args (tuple): Function or cell input arguments.
        phase (str): The name of compile phase. Default: 'predict'.
        do_convert (bool): When set to True, convert ME graph to GE graph after compiling graph.
        auto_parallel_mode (bool): When set to True, use auto parallel mode to compile graph.

    Return:
        Str, the full phase of the cell.
        Bool, if the graph has been compiled before, return False, else return True.
    """
    from mindspore import nn
    from mindspore.ops.composite import GradOperation

    class InputsToAttrCell(nn.Cell):
        """The cell that converts non-tensor inputs to attr."""

        def __init__(self, net, args_names, non_tensor_inputs):
            super(InputsToAttrCell, self).__init__()
            self.net = net
            self.args_names = args_names
            self.non_tensor_inputs = non_tensor_inputs
            self.inputs_to_attr = True

        def construct(self, *tensor_inputs):
            real_inputs = ()
            index = 0
            for i in args_names:
                if i in self.non_tensor_inputs.keys():
                    real_inputs += (self.non_tensor_inputs[i],)
                else:
                    real_inputs += (tensor_inputs[index],)
                    index += 1
            return self.net(*real_inputs)

    args_names, args_list = _generate_pip_args(obj, *args)
    if not hasattr(obj, "inputs_to_attr"):
        dic = dict(zip(args_names, args_list))
        key = generate_key(phase, dic)
        obj.phase_prefix = str(key[1])
        if 'export' in phase:
            phase = phase + '.' + obj.phase_prefix + '.' + str(obj.create_time)
        else:
            phase = obj.phase_prefix + phase + '.' + str(obj.create_time)

        if phase in self.compile_cache.keys():
            logger.debug("%r graph has existed.", phase)
            return phase, False

    if getattr(obj, "support_non_tensor_inputs", None):
        for i in obj.__dict__.values():
            if isinstance(i, GradOperation):
                raise ValueError("Not support set 'support_non_tensor_inputs' to the 'True' for grad net, "
                                 "only support forward net.")
        attrs = {}
        inputs = []
        for key, value in dic.items():
            if not isinstance(value, (Tensor, MetaTensor)):
                attrs[key] = value
            else:
                inputs.append(value)
        if attrs:
            inputs_to_attr_cell = InputsToAttrCell(obj, args_names, attrs)
            return self.compile(inputs_to_attr_cell, *inputs, phase=phase)

    obj.check_names()
    _check_full_batch()
    self._set_dataset_mode(args_list)

    is_sink_mode = args and isinstance(args[0], Tensor) and args[0].virtual_flag
    if auto_parallel_mode and _need_to_full() and not is_sink_mode and obj.auto_parallel_compile_and_run():
        args_full = _to_full_tensor(args, _get_device_num(), _get_global_rank())
        _, args_list = _generate_pip_args(obj, *args_full)

    enable_debug_runtime = context.get_context("enable_debug_runtime")
    enable_ge = context.get_context("enable_ge")
    use_vm = not enable_ge or (enable_debug_runtime and context.get_context("mode") == context.PYNATIVE_MODE)
    result = self._executor.compile(obj, args_list, phase, use_vm)
    self.compile_cache[phase] = phase
    if not result:
        raise RuntimeError("Executor compile failed.")
    graph = self._executor.get_func_graph(phase)
    if graph is None:
        logger.error("%r graph compile failed.", phase)
    if not do_convert:
        return phase, True

    if auto_parallel_mode:
        obj.parameter_layout_dict = self._executor.get_parameter_layout(phase)
    replace = obj.init_parameters_data(auto_parallel_mode=auto_parallel_mode)
    if not enable_debug_runtime or enable_ge:
        if auto_parallel_mode:
            obj.load_parameter_slice(None)

    self._updata_param_node_default_input(phase, replace)

    # set parallel inputs in sink mode
    if auto_parallel_mode and is_sink_mode:
        obj.set_parallel_input_with_inputs(*args)

    # the following GE init process is not needed when use vm or ms backend
    if enable_ge:
        self._build_data_graph(obj, phase)
        if "export" not in phase:
            init_phase = "init_subgraph" + "." + str(obj.create_time)
            _exec_init_graph(obj, init_phase)
    elif not enable_ge and "export" in phase:
        self._build_data_graph(obj, phase)
    elif BROADCAST_PHASE not in phase and _get_parameter_broadcast():
        auto_split_param_names = []
        if auto_parallel_mode:
            auto_split_param_names = self._get_auto_split_param_names(obj.parameter_layout_dict)

        broadcast_params_dict = obj.parameters_broadcast_dict()
        if auto_split_param_names and broadcast_params_dict:
            broadcast_params_dict = OrderedDict()
            for param_name, param in obj.parameters_broadcast_dict().items():
                if param_name not in auto_split_param_names:
                    broadcast_params_dict[param_name] = param
        broadcast_phase = "_broadcast_subgraph"
        self._build_broadcast_graph(broadcast_params_dict, broadcast_phase)

    return phase, True