def record_point_info(self, point_info, output_path): """ Record point info into json. Args: point_info (dict): The point info about tag id and relative op name. output_path (str): The output path for saving point info. Returns: dict, parsed point info. """ points = { 'fp_start': point_info.get(self._fp_tag, ''), 'bp_end': point_info.get(self._bp_tag, '') } try: with open(output_path, 'w') as json_file: json.dump(points, json_file) os.chmod(output_path, stat.S_IREAD) except (IOError, OSError) as err: log.warning('Failed to save point info. %s', err) raise ProfilerIOException return points
def _load_inputs(self, *inputs): """ Slice inputs tensors by parallel strategies. Args: inputs (Function or Cell): inputs of construct method. """ parallel_inputs_run = [] if len(inputs) > self._construct_inputs_num: raise ValueError( 'Len of inputs: {} is bigger than self._construct_inputs_num: {}.' .format(len(inputs), self._construct_inputs_num)) for i, tensor in enumerate(inputs): key = self._construct_inputs_names[i] # if input is not used, self.parameter_layout_dict may not contain the key if key not in self.parameter_layout_dict: logger.warning("layout dict does not contain the key %s", key) parallel_inputs_run.append(tensor) else: layout = self.parameter_layout_dict[key] new_tensor = _load_tensor_by_layout(tensor, layout) parallel_inputs_run.append(new_tensor) return tuple(parallel_inputs_run)
def _fill_scalar_summary(tag: str, np_value, summary):
    """
    Package the scalar summary.

    Args:
        tag (str): Summary tag description.
        np_value (Object): Scalar object.
        summary (Summary): Summary message to fill.

    Returns:
        bool, True if the scalar value was filled, False if the tensor is empty.
    """
    logger.debug(f"Set({tag}) the scalar summary value")
    if np_value.size == 1:
        # is scalar
        summary.scalar_value = np_value.item()
        return True
    if np_value.size > 1:
        logger.warning(
            f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}")
        summary.scalar_value = next(np_value.flat).item()
        return True
    logger.error(f"There are no values inside the tensor, tag = {tag}, size = {np_value.size}")
    return False
def _load_dismatch_prefix_params(net, parameter_dict, param_not_load):
    """When some net parameters failed to load, strip a common prefix and try to load them again."""
    prefix_name = ""
    longest_name = param_not_load[0]
    while prefix_name != longest_name and param_not_load:
        logger.debug("Count: {} parameters have not been loaded, trying to continue loading.".format(
            len(param_not_load)))
        prefix_name = longest_name
        for net_param_name in param_not_load:
            for dict_name in parameter_dict:
                if dict_name.endswith(net_param_name):
                    prefix_name = dict_name[:-len(net_param_name)]
                    break
            if prefix_name != longest_name:
                break
        if prefix_name != longest_name:
            logger.warning("Removing parameter prefix name: {}, continuing to load.".format(prefix_name))
            for _, param in net.parameters_and_names():
                new_param_name = prefix_name + param.name
                if param.name in param_not_load and new_param_name in parameter_dict:
                    new_param = parameter_dict[new_param_name]
                    _update_param(param, new_param)
                    param_not_load.remove(param.name)
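# A small, self-contained illustration of the prefix-matching idea used above.
# find_prefix is a hypothetical helper written for this note, not MindSpore code:
# checkpoint keys may carry an extra scope prefix, and the prefix can be recovered
# by matching a checkpoint key that ends with an unloaded parameter name.
def find_prefix(param_not_load, parameter_dict):
    for net_param_name in param_not_load:
        for dict_name in parameter_dict:
            if dict_name != net_param_name and dict_name.endswith(net_param_name):
                return dict_name[:-len(net_param_name)]
    return ""

# Example: a checkpoint saved with an extra "backbone." scope.
assert find_prefix(["conv1.weight", "fc.bias"],
                   {"backbone.conv1.weight": 0, "backbone.fc.bias": 1}) == "backbone."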
def set_category_field(self, category_field): """ Set category field for reading. Note: Should be a candidate category field. Args: category_field (str): String of category field name. Returns: MSRStatus, SUCCESS or FAILED. """ logger.warning( "WARN_DEPRECATED: The usage of set_category_field is deprecated." " Please use category_field") if not category_field or not isinstance(category_field, str): raise ParamTypeError('category_fields', 'str') if category_field not in self._candidate_fields: raise MRMDefineCategoryError( "Field '{}' is not a candidate category field.".format( category_field)) return self._segment.set_category_field(category_field)
def _check_target_specific_cfgs(device, arg_key):
    """Check whether a config is suitable for a specified device."""
    device_cfgs = {
        'enable_auto_mixed_precision': ['Ascend'],
        'enable_dump': ['Ascend'],
        'save_dump_path': ['Ascend'],
        'enable_graph_kernel': ['Ascend', 'GPU'],
        'enable_reduce_precision': ['Ascend'],
        'enable_profiling': ['Ascend'],
        'profiling_options': ['Ascend'],
        'print_file_path': ['Ascend'],
        'variable_memory_max_size': ['Ascend'],
        'max_device_memory': ['GPU']
    }
    # configs not in the device_cfgs map are supposed to be suitable for all devices
    if arg_key not in device_cfgs:
        return True
    supported_devices = device_cfgs[arg_key]
    if device in supported_devices:
        return True
    logger.warning(f"Config '{arg_key}' only supports devices in {supported_devices}, current device is '{device}'"
                   ", ignore it.")
    return False
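# Usage sketch for _check_target_specific_cfgs (assumes the module logger is configured):
# configs missing from device_cfgs are accepted for every device, while device-specific
# ones are accepted only on the listed targets and otherwise warned about and dropped.
assert _check_target_specific_cfgs('Ascend', 'enable_dump')   # Ascend-only config on Ascend
assert _check_target_specific_cfgs('GPU', 'mode')             # not device-specific, always accepted
assert not _check_target_specific_cfgs('GPU', 'enable_dump')  # Ascend-only config on GPU, warning logged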
def parse(self): """ Parse the minddata pipeline files. Raises: ProfilerRawFileException: If fails to parse the raw file of minddata pipeline or the file is empty. """ with open(self._pipeline_path, 'r') as file: try: pipeline_info = json.load(file) except (json.JSONDecodeError, TypeError) as err: logger.warning(err) raise ProfilerRawFileException( 'Fail to parse minddata pipeline file.' ) if not pipeline_info: logger.warning('The minddata pipeline file is empty.') raise ProfilerRawFileException( 'The minddata pipeline file is empty.' ) self._parse_and_save(pipeline_info)
def _parse(self, source_files):
    """Parse source step trace files."""
    log.info("Start to parse step trace file.")
    event_info = {}
    self._get_step_end_tag_id(source_files)
    for source_file in source_files:
        source_file = validate_and_normalize_path(source_file)
        try:
            with open(source_file, 'rb') as handler:
                content = handler.read()
                for step_trace in self._get_next_step_trace(content, event_info):
                    if self._skip_first_step:
                        self._skip_first_step = False
                        continue
                    self._record_trace_event(step_trace)
        except (IOError, OSError) as err:
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException
    self._record_average_info()
    log.info("Finished parsing step trace file.")
def _preprocess_single_lr(self, learning_rate):
    """Check the lr value, and convert it to a float, a Tensor or a LearningRateSchedule."""
    if isinstance(learning_rate, (float, int)):
        learning_rate = float(learning_rate)
        validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
        return learning_rate
    if isinstance(learning_rate, Tensor) and learning_rate.dim() == 0:
        return learning_rate

    self.dynamic_lr = True
    if isinstance(learning_rate, Iterable):
        return Tensor(np.array(list(learning_rate)).astype(np.float32))
    if isinstance(learning_rate, Tensor):
        if learning_rate.dim() > 1:
            raise ValueError("The dim of `Tensor` type learning rate should be 0 or 1, "
                             f"but got {learning_rate.dim()}.")
        if learning_rate.dim() == 1 and learning_rate.size() < 2:
            logger.warning("If a `Tensor` type dynamic learning rate is used, please make sure that the number "
                           "of elements in the tensor passed is greater than 1.")
        return learning_rate
    if isinstance(learning_rate, LearningRateSchedule):
        return learning_rate
    raise TypeError("Learning rate should be int, float, Tensor, Iterable or LearningRateSchedule.")
def record_point_info(self, source_file, output_path):
    """
    Record point info into json.

    Args:
        source_file (str): The file path of step trace original data.
        output_path (str): The output path for saving point info.

    Returns:
        dict, parsed point info.
    """
    fp_start, bp_end = 0, 1
    try:
        with open(source_file, 'r') as f:
            lines = f.readlines()
            fp_start_name = lines[fp_start].split()[0]
            bp_end_name = lines[bp_end].split()[0]
    except (IOError, OSError) as err:
        log.warning('Failed to read %s. %s', source_file, err)
        raise ProfilerIOException
    if self._is_training_mode:
        points = {'fp_start': fp_start_name, 'bp_end': bp_end_name}
    else:
        points = {
            'fp_start': fp_start_name,
        }
    if os.path.exists(output_path):
        return points
    try:
        with open(output_path, 'w') as json_file:
            json.dump(points, json_file)
        os.chmod(output_path, stat.S_IRUSR)
    except (IOError, OSError) as err:
        log.warning('Failed to save point info. %s', err)
        raise ProfilerIOException
    return points
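# A minimal sketch (hypothetical op names) of the source-file layout the parsing above
# expects: the op name is the first whitespace-separated token on each line, line 0
# supplies the fp_start point and line 1 supplies the bp_end point.
sample_lines = [
    "Default/network/conv1-Conv2D 1",             # -> points['fp_start']
    "Gradients/network/conv1-Conv2DBackprop 2",   # -> points['bp_end']
]
assert sample_lines[0].split()[0] == "Default/network/conv1-Conv2D"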
def test_tfrecord_to_mindrecord_with_special_field_name(): """test transform tfrecord to mindrecord.""" if not tf or tf.__version__ < SupportedTensorFlowVersion: # skip the test logger.warning("Module tensorflow is not found or version wrong, \ please use pip install it / reinstall version >= {}.".format( SupportedTensorFlowVersion)) return generate_tfrecord_with_special_field_name() assert os.path.exists(os.path.join(TFRECORD_DATA_DIR, TFRECORD_FILE_NAME)) feature_dict = { "image/class/label": tf.io.FixedLenFeature([], tf.int64), "image/encoded": tf.io.FixedLenFeature([], tf.string), } if os.path.exists(MINDRECORD_FILE_NAME): os.remove(MINDRECORD_FILE_NAME) if os.path.exists(MINDRECORD_FILE_NAME + ".db"): os.remove(MINDRECORD_FILE_NAME + ".db") tfrecord_transformer = TFRecordToMR( os.path.join(TFRECORD_DATA_DIR, TFRECORD_FILE_NAME), MINDRECORD_FILE_NAME, feature_dict, ["image/encoded"]) tfrecord_transformer.transform() assert os.path.exists(MINDRECORD_FILE_NAME) assert os.path.exists(MINDRECORD_FILE_NAME + ".db") fr_mindrecord = FileReader(MINDRECORD_FILE_NAME) verify_data(tfrecord_transformer, fr_mindrecord) os.remove(MINDRECORD_FILE_NAME) os.remove(MINDRECORD_FILE_NAME + ".db") os.remove(os.path.join(TFRECORD_DATA_DIR, TFRECORD_FILE_NAME))
def _save(self): """save step trace file.""" BP_POINT, TAIL, FP_DURATION = 5, -1, -2 log.info("Start to save step trace file.") if not self._header: return try: with open(self._output_path, 'w') as file_handle: csv_writer = csv.writer(file_handle) if not self._is_training_mode: self._header[FP_DURATION] = 'fp' self._header = self._header[:BP_POINT] + self._header[ BP_POINT + 1:TAIL] csv_writer.writerow(self._header) for row_data in self._result: if not self._is_training_mode: row_data[FP_DURATION] += row_data[TAIL] row_data = row_data[:BP_POINT] + row_data[BP_POINT + 1:TAIL] csv_writer.writerow(row_data) os.chmod(self._output_path, stat.S_IRUSR) except (IOError, OSError) as err: log.warning('Failed to save step trace raw info. %s', err) raise ProfilerIOException
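# A toy illustration (illustrative column names, not necessarily the real header) of the
# slicing done above when _is_training_mode is False: rename the fp_and_bp column to fp,
# then drop the bp_point column and the tail column.
BP_POINT, TAIL, FP_DURATION = 5, -1, -2
header = ['step_num', 'start_point', 'end_point', 'total', 'fp_point',
          'bp_point', 'iteration_interval', 'fp_and_bp', 'tail']
header[FP_DURATION] = 'fp'
header = header[:BP_POINT] + header[BP_POINT + 1:TAIL]
print(header)  # ['step_num', 'start_point', 'end_point', 'total', 'fp_point', 'iteration_interval', 'fp']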
def _get_single_lr(self, learning_rate):
    """Get learning rate in Tensor type."""
    if isinstance(learning_rate, float):
        validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
        lr = Tensor(learning_rate, mstype.float32)
    elif isinstance(learning_rate, Iterable):
        lr = Tensor(np.array(list(learning_rate)).astype(np.float32))
    elif isinstance(learning_rate, Tensor):
        if learning_rate.dim() > 1:
            raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`, "
                             f"but got {learning_rate.dim()}.")
        if learning_rate.dim() == 1 and learning_rate.size() < 2:
            logger.warning("If you want to use a dynamic learning rate, please make sure that the number "
                           "of elements in the list, tuple or tensor passed is greater than 1.")
        lr = learning_rate
    else:
        raise TypeError("Learning rate should be float, Tensor or Iterable.")
    return lr
def calc_faithfulness(self, inputs: _Array, model: _Module, targets: _Label, saliency: _Array) -> np.ndarray: """ Calculate naive faithfulness. Args: inputs (_Array): sample to calculate faithfulness score model (_Module): model to explanation targets (_Label): label to explanation on. saliency (_Array): Saliency map of given inputs and targets from the explainer. Return: - faithfulness (np.ndarray): faithfulness score """ if not np.count_nonzero(saliency): log.warning( "The saliency map is zero everywhere. The correlation will be set to zero." ) correlation = 0 normalized_faithfulness = (correlation + 1) / 2 return np.array([normalized_faithfulness], np.float) reference = self._get_reference(inputs) perturbations, masks = self._perturb(inputs, saliency, reference, return_mask=True) feature_importance = _calc_feature_importance(saliency, masks) perturbations = ms.Tensor(perturbations, dtype=ms.float32) predictions = model(perturbations).asnumpy()[:, targets] faithfulness = calc_correlation(feature_importance, predictions) normalized_faithfulness = (faithfulness + 1) / 2 return np.array([normalized_faithfulness], np.float)
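# The normalization above maps a correlation in [-1, 1] onto a faithfulness score in [0, 1]:
#     normalized_faithfulness = (correlation + 1) / 2
# so correlation -1.0 -> 0.0, 0.0 -> 0.5 and 1.0 -> 1.0; the all-zero-saliency fallback
# (correlation treated as 0) therefore reports a neutral score of 0.5.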
def _verify_data(self): """Verify dataset and labels.""" next_element = next(self._dataset.create_tuple_iterator()) if len(next_element) not in [1, 2, 3]: raise ValueError("The dataset should provide [images] or [images, labels], [images, labels, bboxes]" " as columns.") if len(next_element) == 3: inputs, labels, bboxes = next_element if bboxes.shape[-1] != 4: raise ValueError("The third element of dataset should be bounding boxes with shape of " "[batch_size, num_ground_truth, 4].") else: if self._benchmarkers is not None: if any([isinstance(bench, Localization) for bench in self._benchmarkers]): raise ValueError("The dataset must provide bboxes if Localization is to be computed.") if len(next_element) == 2: inputs, labels = next_element if len(next_element) == 1: inputs = next_element[0] if len(inputs.shape) > 4 or len(inputs.shape) < 3 or inputs.shape[-3] not in [1, 3, 4]: raise ValueError( "Image shape {} is unrecognizable: the dimension of image can only be CHW or NCHW.".format( inputs.shape)) if len(inputs.shape) == 3: log.warning( "Image shape {} is 3-dimensional. All the data will be automatically unsqueezed at the 0-th" " dimension as batch data.".format(inputs.shape)) if len(next_element) > 1: if len(labels.shape) > 2 and (np.array(labels.shape[1:]) > 1).sum() > 1: raise ValueError( "Labels shape {} is unrecognizable: outputs should not have more than two dimensions" " with length greater than 1.".format(labels.shape))
def _collect_input_data(self, cb_params): """Only support to collect image data.""" if not self._collect_specified_data.get('collect_input_data'): return input_data = getattr(cb_params, 'train_dataset_element', None) if input_data is None: self._collect_specified_data['collect_input_data'] = False logger.info( "The 'train_dataset_element' in cb_params is None, maybe there is dataset sink mode." ) return if isinstance(input_data, (list, tuple)): input_data = input_data[0] try: self._record.add_value(PluginEnum.IMAGE.value, 'input_data/auto', input_data) except ValueError: logger.warning( 'The input data of network are not image, so will not collect by SummaryCollector.' ) self._collect_specified_data['collect_input_data'] = False return
def _package_parameter(key, value, message): """ Package parameters in operation. Args: key (str): Operation name. value (Union[str, bool, int, float, list, None]): Operation args. message (OperationParameter): Operation proto message. """ if isinstance(value, str): message.mapStr[key] = value elif isinstance(value, bool): message.mapBool[key] = value elif isinstance(value, int): message.mapInt[key] = value elif isinstance(value, float): message.mapDouble[key] = value elif isinstance(value, (list, tuple)) and key != "operations": if value: replace_value_list = list( map(lambda x: "" if x is None else x, value)) message.mapStrList[key].strValue.extend(replace_value_list) elif isinstance(value, dict): try: message.mapStr[key] = json.dumps(value) except TypeError as exo: logger.warning( "Transform the value of parameter %r to string failed. Detail: %s.", key, str(exo)) elif value is None: message.mapStr[key] = "None" else: logger.warning( "The parameter %r is not recorded, because its type is not supported in event package. " "Its type is %r.", key, type(value).__name__)
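# A tiny stand-in sketch (not the real OperationParameter proto) showing how the type
# dispatch above routes values into the typed maps of the message.
class _FakeOperationParameter:
    def __init__(self):
        self.mapStr, self.mapBool, self.mapInt, self.mapDouble = {}, {}, {}, {}

msg = _FakeOperationParameter()
_package_parameter("interpolation", "bilinear", msg)   # str   -> msg.mapStr
_package_parameter("shuffle", True, msg)               # bool  -> msg.mapBool (bool is checked before int)
_package_parameter("num_parallel_workers", 4, msg)     # int   -> msg.mapInt
_package_parameter("prob", 0.5, msg)                   # float -> msg.mapDouble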
def _get_imagenet_as_dict(self):
    """
    Get data from imagenet as dict.

    Yields:
        data (dict): a dict containing "file_name", "label" and "image" for one sample.
    """
    if not os.path.exists(self.map_file):
        raise IOError("map file {} does not exist".format(self.map_file))

    label_dict = {}
    with open(self.map_file) as fp:
        line = fp.readline()
        while line:
            labels = line.split(" ")
            label_dict[labels[1]] = labels[0]
            line = fp.readline()

    # get all the dirs such as n02087046, n02094114, n02109525
    dir_paths = {}
    for item in label_dict:
        real_path = os.path.join(self.image_dir, label_dict[item])
        if not os.path.isdir(real_path):
            logger.warning("{} dir does not exist".format(real_path))
            continue
        dir_paths[item] = real_path

    if not dir_paths:
        raise PathNotExistsError("no valid image dir in {}".format(self.image_dir))

    # get the filename, label and image binary as a dict
    for label in dir_paths:
        for item in os.listdir(dir_paths[label]):
            file_name = os.path.join(dir_paths[label], item)
            if not item.endswith("JPEG") and not item.endswith("jpg"):
                logger.warning("{} file does not end with JPEG/jpg, skip it.".format(file_name))
                continue
            data = {}
            data["file_name"] = str(file_name)
            data["label"] = int(label)

            # get the image data
            with open(file_name, "rb") as image_file:
                image_bytes = image_file.read()
            if not image_bytes:
                logger.warning("The image file: {} is invalid.".format(file_name))
                continue
            data["image"] = image_bytes
            yield data
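# The parsing above implies a whitespace-separated map file with one "<dir_name> <numeric_label>"
# pair per line (directory names taken from the comment in the code, labels illustrative):
#
#   n02087046 0
#   n02094114 1
#   n02109525 2
#
# label_dict is then keyed by the numeric label and maps to the directory name.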
def __init__(self, **kwargs): # get device_id and device_target self._get_devid_and_devtarget() output_path = kwargs.pop("output_path", "./data") self._output_path = validate_and_normalize_path(output_path) self._output_path = os.path.join(self._output_path, "profiler") if not os.path.exists(self._output_path): os.makedirs(self._output_path, exist_ok=True) else: logger.warning( "The target dir already exists. " "There may be some old profiling data, and they will be rewrote in the end." ) if self._device_target and self._device_target == "GPU": from mindspore._c_expression import GPUProfiler self._gpu_profiler = GPUProfiler.get_instance() self._gpu_profiler.init(self._output_path) self._gpu_profiler.step_profiling_enable(True) if kwargs: logger.warning("Params not be supported yet on GPU.") elif self._device_target and self._device_target == "Ascend": optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable") if not isinstance(optypes_not_deal, str): raise TypeError("The parameter optypes_not_deal must be str.") job_id = kwargs.pop("ascend_job_id", "") if kwargs: logger.warning("There are invalid params which don't work.") os.environ['PROFILING_MODE'] = 'true' os.environ['MINDDATA_PROFILING_DIR'] = self._output_path os.environ['DEVICE_ID'] = self._dev_id os.environ['AICPU_PROFILING_MODE'] = 'true' # use context interface to open profiling, for the new mindspore version(after 2020.5.21) context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace") self._container_path = os.path.join( self._base_profiling_container_path, self._dev_id) data_path = os.path.join(self._container_path, "data") data_path = validate_and_normalize_path(data_path) if not os.path.exists(data_path): os.makedirs(data_path, exist_ok=True) self._filt_optype_names = optypes_not_deal.split( ",") if optypes_not_deal else [] self._profiling_job_id = job_id # add job id env through user input later self._job_id_env = 0 self._start_time = int(time.time() * 10000000) logger.info("Profiling: profiling start time: %d", self._start_time)
def _collect_input_data(self, cb_params): """Only support to collect image data.""" if not self._collect_specified_data.get('collect_input_data'): return input_data = getattr(cb_params, 'train_dataset_element', None) if not isinstance(input_data, (Tensor, list, tuple)): self._collect_specified_data['collect_input_data'] = False logger.warning("The type of input data is not Tensor/list/tuple, " "so SummaryCollector will not collect input data.") return if not isinstance(input_data, Tensor) and not input_data: self._collect_specified_data['collect_input_data'] = False logger.warning( "The 'train_dataset_element' in cb_params is empty, " "so SummaryCollector will not record the input data.") if self._dataset_sink_mode and context.get_context( 'device_target') == 'Ascend': logger.warning( 'On Ascend device, SummaryCollector is not supported to record input data ' 'in dataset sink mode.') return if isinstance(input_data, (list, tuple)) and input_data: input_data = input_data[0] try: self._record.add_value(PluginEnum.IMAGE.value, 'input_data/auto', input_data) except (TypeError, ValueError): logger.warning( 'The input data of network are not image, so will not collect by SummaryCollector.' ) self._collect_specified_data['collect_input_data'] = False return
def to_tensor(self, slice_index=None, shape=None, opt_shard_group=None): """Return init_data().""" logger.warning("WARN_DEPRECATED: The usage of to_tensor is deprecated." " Please use init_data") return self.init_data(slice_index, shape, opt_shard_group)
def eval(self, valid_dataset, callbacks=None, dataset_sink_mode=True): """ Evaluation API where the iteration is controlled by python front-end. Configure to pynative mode or CPU, the evaluating process will be performed with dataset non-sink mode. Note: If dataset_sink_mode is True, data will be sent to device. If device is Ascend, features of data will be transferred one by one. The limitation of data transmission per time is 256M. Args: valid_dataset (Dataset): Dataset to evaluate the model. callbacks (Optional[list(Callback)]): List of callback objects which should be executed while training. Default: None. dataset_sink_mode (bool): Determines whether to pass the data through dataset channel. Default: True. Returns: Dict, which returns the loss value and metrics values for the model in the test mode. Examples: >>> from mindspore import Model, nn >>> >>> # For details about how to build the dataset, please refer to the tutorial >>> # document on the official website. >>> dataset = create_custom_dataset() >>> net = Net() >>> loss = nn.SoftmaxCrossEntropyWithLogits() >>> model = Model(net, loss_fn=loss, optimizer=None, metrics={'acc'}) >>> acc = model.eval(dataset, dataset_sink_mode=False) """ dataset_sink_mode = Validator.check_bool(dataset_sink_mode) _device_number_check(self._parallel_mode, self._device_number) if not self._metric_fns: raise ValueError("metric fn can not be None or empty.") if isinstance(self._eval_network, nn.GraphCell) and dataset_sink_mode is True: raise ValueError( "Sink mode is currently not supported when evaluating with a GraphCell." ) cb_params = _InternalCallbackParam() cb_params.eval_network = self._eval_network cb_params.valid_dataset = valid_dataset cb_params.batch_num = valid_dataset.get_dataset_size() cb_params.mode = "eval" cb_params.cur_step_num = 0 cb_params.list_callback = self._transform_callbacks(callbacks) cb_params.network = self._network self._clear_metrics() if context.get_context("device_target") == "CPU" and dataset_sink_mode: dataset_sink_mode = False logger.warning( "CPU cannot support dataset sink mode currently." "So the evaluating process will be performed with dataset non-sink mode." ) with _CallbackManager(callbacks) as list_callback: if dataset_sink_mode: return self._eval_dataset_sink_process(valid_dataset, list_callback, cb_params) return self._eval_process(valid_dataset, list_callback, cb_params)
def _train(self, epoch, train_dataset, callbacks=None, dataset_sink_mode=True, sink_size=-1): """ Training. Args: epoch (int): Total number of iterations on the data. train_dataset (Dataset): A training dataset iterator. If there is no loss_fn, a tuple with multiple data (data1, data2, data3, ...) will be returned and passed to the network. Otherwise, a tuple (data, label) will be returned. The data and label would be passed to the network and loss function respectively. callbacks (list): List of callback objects which should be executed while training. Default: None. dataset_sink_mode (bool): Determine whether the data should be passed through the dataset channel. Default: True. Configure pynative mode or CPU, the training process will be performed with dataset not sink. sink_size (int): Control the amount of data in each sink. Default: -1. """ epoch = Validator.check_positive_int(epoch) if self._parameter_broadcast: self._train_network.set_broadcast_flag() cb_params = _InternalCallbackParam() cb_params.train_network = self._train_network cb_params.epoch_num = epoch if dataset_sink_mode and sink_size > 0: cb_params.batch_num = sink_size else: cb_params.batch_num = train_dataset.get_dataset_size() cb_params.mode = "train" cb_params.loss_fn = self._loss_fn cb_params.optimizer = self._optimizer cb_params.parallel_mode = self._parallel_mode cb_params.device_number = self._device_number cb_params.train_dataset = train_dataset cb_params.list_callback = self._transform_callbacks(callbacks) if context.get_context("mode") == context.PYNATIVE_MODE: cb_params.list_callback.insert(0, _StepSync()) callbacks = cb_params.list_callback cb_params.train_dataset_element = None cb_params.network = self._network if _is_role_pserver() or _is_role_sched(): epoch = 1 # build callback list with _CallbackManager(callbacks) as list_callback: self._check_reuse_dataset(train_dataset) if not dataset_sink_mode: self._train_process(epoch, train_dataset, list_callback, cb_params) elif context.get_context("device_target") == "CPU": logger.warning( "The CPU cannot support dataset sink mode currently." "So the training process will be performed with dataset not sink." ) self._train_process(epoch, train_dataset, list_callback, cb_params) else: self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size)
def __init__(self, in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, has_bias, weight_init, bias_init, transposed=False): super(_Conv, self).__init__() self.in_channels = check_int_positive(in_channels) self.out_channels = check_int_positive(out_channels) self.kernel_size = kernel_size self.stride = stride self.pad_mode = pad_mode self.weight_init = weight_init self.bias_init = bias_init if isinstance(padding, int): Validator.check_integer('padding', padding, 0, Rel.GE, self.cls_name) self.padding = padding elif isinstance(padding, tuple): for pad in padding: Validator.check_integer('padding item', pad, 0, Rel.GE, self.cls_name) self.padding = padding else: raise TypeError("padding type must be int/tuple(int) cannot be {}!".format(type(padding))) self.dilation = dilation self.group = check_int_positive(group) self.has_bias = has_bias if (not isinstance(kernel_size[0], int)) or (not isinstance(kernel_size[1], int)) or \ kernel_size[0] < 1 or kernel_size[1] < 1: raise ValueError("Attr 'kernel_size' of 'Conv2D' Op passed " + str(self.kernel_size) + ", should be a int or tuple and equal to or greater than 1.") if (not isinstance(stride[0], int)) or (not isinstance(stride[1], int)) or stride[0] < 1 or stride[1] < 1: raise ValueError("Attr 'stride' of 'Conv2D' Op passed " + str(self.stride) + ", should be a int or tuple and equal to or greater than 1.") if (not isinstance(dilation[0], int)) or (not isinstance(dilation[1], int)) or \ dilation[0] < 1 or dilation[1] < 1: raise ValueError("Attr 'dilation' of 'Conv2D' Op passed " + str(self.dilation) + ", should equal to or greater than 1.") if in_channels % group != 0: raise ValueError("Attr 'in_channels' of 'Conv2D' Op must be divisible by " "attr 'group' of 'Conv2D' Op.") if out_channels % group != 0: raise ValueError("Attr 'out_channels' of 'Conv2D' Op must be divisible by " "attr 'group' of 'Conv2D' Op.") if transposed: shape = [in_channels, out_channels // group, *kernel_size] else: shape = [out_channels, in_channels // group, *kernel_size] self.weight = Parameter(initializer(self.weight_init, shape), name='weight') if check_bool(has_bias): self.bias = Parameter(initializer(self.bias_init, [out_channels]), name='bias') else: if self.bias_init != 'zeros': logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.") self.bias = None
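# Quick illustration of the weight-shape rule above (plain Python, illustrative numbers):
in_channels, out_channels, group, kernel_size = 64, 128, 1, (3, 3)
standard_shape = [out_channels, in_channels // group, *kernel_size]    # [128, 64, 3, 3]
transposed_shape = [in_channels, out_channels // group, *kernel_size]  # [64, 128, 3, 3]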
import os
from importlib import import_module
from string import punctuation

import numpy as np
import pytest

from mindspore import log as logger
from mindspore.mindrecord import FileReader
from mindspore.mindrecord import TFRecordToMR

SupportedTensorFlowVersion = '1.13.0-rc1'

try:
    tf = import_module("tensorflow")  # just used to convert tfrecord to mindrecord
except ModuleNotFoundError:
    logger.warning("tensorflow module not found.")
    tf = None

TFRECORD_DATA_DIR = "../data/mindrecord/testTFRecordData"
TFRECORD_FILE_NAME = "test.tfrecord"
MINDRECORD_FILE_NAME = "test.mindrecord"
PARTITION_NUM = 1


def cast_name(key):
    """
    Cast schema names which contain special characters to valid names.

    Here special characters means any characters in '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~
    Valid names can only contain a-z, A-Z, 0-9 and _
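# cast_name's body is not shown here; a minimal sketch consistent with its docstring
# (an assumption, not the original implementation) would replace every punctuation or
# space character with an underscore:
def cast_name_sketch(key):
    special_symbols = set('{} '.format(punctuation))
    return ''.join('_' if ch in special_symbols else ch for ch in key)

assert cast_name_sketch("image/class/label") == "image_class_label"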
def run_pretrain(args_opt):
    """pre-train bert"""
    global device_id
    global device_num
    global rank_id
    global job_id
    args_opt.device_id = device_id
    args_opt.device_num = device_num
    sync_dataset(args_opt.data_url)

    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices([29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink, args_opt.data_sink_steps,
                                               args_opt.data_dir, args_opt.schema_dir)
    if args_opt.train_steps > 0:
        new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate, power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps, weight_decay=cfg.Lamb.weight_decay, eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(),
                                             decay_steps=ds.get_dataset_size() * new_repeat_count,
                                             learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
                                             end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
                                             power=cfg.AdamWeightDecayDynamicLR.power,
                                             weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
                                             eps=cfg.AdamWeightDecayDynamicLR.eps,
                                             warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]"
                         .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    print("Enable save checkpoint: ", args_opt.enable_save_ckpt)
    print("Rank ID: ", rank_id)
    if args_opt.enable_save_ckpt == "true" and rank_id % device_num == 0:
        print("Enable save checkpoint")
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck)
        callback.append(ckpoint_cb)
    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)
    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor, scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                         scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
    model = Model(netwithgrads)
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"))
def run_squad():
    """run squad task"""
    parser = argparse.ArgumentParser(description="run classifier")
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="Device type, default is Ascend")
    parser.add_argument("--do_train", type=str, default="false",
                        help="Enable train, default is false")
    parser.add_argument("--do_eval", type=str, default="false",
                        help="Enable eval, default is false")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--epoch_num", type=int, default=1,
                        help="Epoch number, default is 1.")
    parser.add_argument("--num_class", type=int, default=2,
                        help="The number of classes, default is 2.")
    parser.add_argument("--train_data_shuffle", type=str, default="true",
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle", type=str, default="false",
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--vocab_file_path", type=str, default="",
                        help="Vocab file path")
    parser.add_argument("--eval_json_path", type=str, default="",
                        help="Evaluation json file path, can be eval.json")
    parser.add_argument("--save_finetune_checkpoint_path", type=str, default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false":
        raise ValueError("At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "":
        raise ValueError("'train_data_file_path' must be set when doing the finetune task")
    if args_opt.do_eval.lower() == "true":
        if args_opt.eval_data_file_path == "":
            raise ValueError("'eval_data_file_path' must be set when doing the evaluation task")
        if args_opt.vocab_file_path == "":
            raise ValueError("'vocab_file_path' must be set when doing the evaluation task")
        if args_opt.eval_json_path == "":
            raise ValueError("'eval_json_path' must be set when doing the evaluation task")

    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only supports fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, only GPU or Ascend is supported.")

    netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)

    if args_opt.do_train.lower() == "true":
        ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                  data_file_path=args_opt.train_data_file_path,
                                  schema_file_path=args_opt.schema_file_path,
                                  do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)
        if args_opt.do_eval.lower() == "true":
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir,
                                                           ds.get_dataset_size(), epoch_num, "squad")
    if args_opt.do_eval.lower() == "true":
        ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                  data_file_path=args_opt.eval_data_file_path,
                                  schema_file_path=args_opt.schema_file_path, is_training=False,
                                  do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, args_opt.vocab_file_path, args_opt.eval_json_path,
                load_finetune_checkpoint_path, bert_net_cfg.seq_length)
def parse_args():
    """
    parse args
    """
    parser = argparse.ArgumentParser(description='tinybert task distill')
    parser.add_argument("--device_target", type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--do_train", type=str, default="true", choices=["true", "false"],
                        help="Do train task, default is true.")
    parser.add_argument("--do_eval", type=str, default="true", choices=["true", "false"],
                        help="Do eval task, default is true.")
    parser.add_argument("--td_phase1_epoch_size", type=int, default=10,
                        help="Epoch size for td phase 1, default is 10.")
    parser.add_argument("--td_phase2_epoch_size", type=int, default=3,
                        help="Epoch size for td phase 2, default is 3.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--save_ckpt_step", type=int, default=100,
                        help="Steps between checkpoint saves, default is 100.")
    parser.add_argument("--max_ckpt_num", type=int, default=1,
                        help="Maximum number of checkpoints to keep, default is 1.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--load_teacher_ckpt_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_gd_ckpt_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_td1_ckpt_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    parser.add_argument("--task_type", type=str, default="classification", choices=["classification", "ner"],
                        help="The type of the task to train.")
    parser.add_argument("--task_name", type=str, default="", choices=["SST-2", "QNLI", "MNLI", "TNEWS", "CLUENER"],
                        help="The name of the task to train.")
    parser.add_argument("--assessment_method", type=str, default="accuracy", choices=["accuracy", "bf1", "mf1"],
                        help="assessment_method include: [accuracy, bf1, mf1], default is accuracy")
    parser.add_argument("--dataset_type", type=str, default="tfrecord",
                        help="dataset type tfrecord/mindrecord, default is tfrecord")
    args = parser.parse_args()

    if args.do_train.lower() != "true" and args.do_eval.lower() != "true":
        raise ValueError("At least one of do_train or do_eval must be true, please confirm your config")
    if args.task_name in ["SST-2", "QNLI", "MNLI", "TNEWS"] and args.task_type != "classification":
        raise ValueError(f"{args.task_name} is a classification dataset, please set --task_type=classification")
    if args.task_name in ["CLUENER"] and args.task_type != "ner":
        raise ValueError(f"{args.task_name} is a ner dataset, please set --task_type=ner")
    if args.task_name in ["SST-2", "QNLI", "MNLI"] and \
            (td_teacher_net_cfg.vocab_size != 30522 or td_student_net_cfg.vocab_size != 30522):
        logger.warning(f"{args.task_name} is an English dataset. Usually, we use 21128 for CN vocabs and 30522 for "
                       "EN vocabs according to the original paper.")
    if args.task_name in ["TNEWS", "CLUENER"] and \
            (td_teacher_net_cfg.vocab_size != 21128 or td_student_net_cfg.vocab_size != 21128):
        logger.warning(f"{args.task_name} is a Chinese dataset. Usually, we use 21128 for CN vocabs and 30522 for "
                       "EN vocabs according to the original paper.")

    return args
print("==============================================================") eval_result_print(args_opt.assessment_method, callback) print("==============================================================") if __name__ == '__main__': context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, reserve_class_name_in_scope=False) if args_opt.device_target == "Ascend": context.set_context(device_id=args_opt.device_id) enable_loss_scale = True if args_opt.device_target == "GPU": if td_student_net_cfg.compute_type != mstype.float32: logger.warning( 'Compute about the student only support float32 temporarily, run with float32.' ) td_student_net_cfg.compute_type = mstype.float32 # Backward of the network are calculated using fp32, # and the loss scale is not necessary enable_loss_scale = False if args_opt.device_target == "CPU": logger.warning( 'CPU only support float32 temporarily, run with float32.') td_teacher_net_cfg.dtype = mstype.float32 td_teacher_net_cfg.compute_type = mstype.float32 td_student_net_cfg.dtype = mstype.float32 td_student_net_cfg.compute_type = mstype.float32 enable_loss_scale = False
def check_node_type(self, node): if isinstance(node, (de.ShuffleDataset, de.RepeatDataset, de.BatchDataset)): logger.warning("Used shuffle, repeat, batch before save operator.")