def load_checkpoint(ckpt_file_name, net=None, strict_load=False, filter_prefix=None):
    """
    Loads checkpoint info from a specified file.

    Args:
        ckpt_file_name (str): Checkpoint file name.
        net (Cell): Cell network. Default: None.
        strict_load (bool): Whether to strictly load the parameters into net. If False, parameters
            in param_dict with the same suffix will be loaded into net. Default: False.
        filter_prefix (Union[str, list[str], tuple[str]]): Parameters starting with the filter_prefix
            will not be loaded. Default: None.

    Returns:
        Dict, key is parameter name, value is a Parameter.

    Raises:
        ValueError: Checkpoint file is incorrect.

    Examples:
        >>> ckpt_file_name = "./checkpoint/LeNet5-1_32.ckpt"
        >>> param_dict = load_checkpoint(ckpt_file_name, filter_prefix="conv1")
    """
    if not isinstance(ckpt_file_name, str):
        raise ValueError("The ckpt_file_name must be a string.")

    if not os.path.exists(ckpt_file_name):
        raise ValueError("The checkpoint file does not exist.")

    if ckpt_file_name[-5:] != ".ckpt":
        raise ValueError("Please input the correct checkpoint file name.")

    if os.path.getsize(ckpt_file_name) == 0:
        raise ValueError("The checkpoint file may be empty, please make sure you entered the correct file name.")

    if filter_prefix is not None:
        if not isinstance(filter_prefix, (str, list, tuple)):
            raise TypeError(f"The type of filter_prefix must be str, list[str] or tuple[str] "
                            f"when filter_prefix is not None, but got {str(type(filter_prefix))}.")
        if isinstance(filter_prefix, str):
            filter_prefix = (filter_prefix,)
        if not filter_prefix:
            raise ValueError("The filter_prefix can't be empty when filter_prefix is list or tuple.")
        for index, prefix in enumerate(filter_prefix):
            if not isinstance(prefix, str):
                raise TypeError(f"The type of filter_prefix must be str, list[str] or tuple[str], "
                                f"but got {str(type(prefix))} at index {index}.")

    logger.info("Execute the process of loading checkpoint files.")
    checkpoint_list = Checkpoint()

    try:
        with open(ckpt_file_name, "rb") as f:
            pb_content = f.read()
        checkpoint_list.ParseFromString(pb_content)
    except BaseException as e:
        logger.error("Failed to read the checkpoint file `%s`, please check whether the file is valid.",
                     ckpt_file_name)
        raise ValueError(e.__str__())

    parameter_dict = {}
    try:
        param_data_list = []
        for element_id, element in enumerate(checkpoint_list.value):
            if filter_prefix is not None and _check_param_prefix(filter_prefix, element.tag):
                continue
            data = element.tensor.tensor_content
            data_type = element.tensor.tensor_type
            np_type = tensor_to_np_type[data_type]
            ms_type = tensor_to_ms_type[data_type]
            element_data = np.frombuffer(data, np_type)
            param_data_list.append(element_data)
            if (element_id == len(checkpoint_list.value) - 1) or \
                    (element.tag != checkpoint_list.value[element_id + 1].tag):
                param_data = np.concatenate(param_data_list, axis=0)
                param_data_list.clear()
                dims = element.tensor.dims
                if dims == [0]:
                    if 'Float' in data_type:
                        param_data = float(param_data[0])
                    elif 'Int' in data_type:
                        param_data = int(param_data[0])
                    parameter_dict[element.tag] = Parameter(Tensor(param_data, ms_type), name=element.tag)
                elif dims == [1]:
                    parameter_dict[element.tag] = Parameter(Tensor(param_data, ms_type), name=element.tag)
                else:
                    param_dim = []
                    for dim in dims:
                        param_dim.append(dim)
                    param_value = param_data.reshape(param_dim)
                    parameter_dict[element.tag] = Parameter(Tensor(param_value, ms_type), name=element.tag)

        logger.info("Loading checkpoint files process is finished.")
    except BaseException as e:
        logger.error("Failed to load the checkpoint file `%s`.", ckpt_file_name)
        raise RuntimeError(e.__str__())

    if not parameter_dict:
        raise ValueError("The loaded parameter dict is empty after filtering, please check filter_prefix.")

    if net is not None:
        load_param_into_net(net, parameter_dict, strict_load)

    return parameter_dict
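# Usage sketch: a minimal, hypothetical example of filtering parameters by prefix and loading the
# rest into a network. The checkpoint path and the LeNet5 class are placeholders, not real assets.
#
#     net = LeNet5()                                      # hypothetical network
#     ckpt_file_name = "./checkpoint/LeNet5-1_32.ckpt"    # hypothetical checkpoint file
#     # Skip every parameter whose name starts with "conv1" or "fc1".
#     param_dict = load_checkpoint(ckpt_file_name, filter_prefix=["conv1", "fc1"])
#     load_param_into_net(net, param_dict, strict_load=False)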
def _ascend_analyse(self):
    """Collect and analyse Ascend performance data."""
    release()
    job_id = self._get_profiling_job_id()
    logger.info("Profiling: job id is %s ", job_id)

    source_path = os.path.join(self._output_path, job_id)
    # parse hwts.log.data.45.dev file, and get task profiling data
    hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
    hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
    source_path = validate_and_normalize_path(source_path)
    hwts_output_filename = validate_and_normalize_path(hwts_output_filename)
    hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
    hwtslog_parser.execute()

    # parse Framework file, and get the relation of op and tasks
    framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
    framework_parser.parse()
    op_task_dict = framework_parser.to_task_id_full_op_name_dict()
    if not op_task_dict:
        logger.error("Profiling: fail to parse framework files.")
        return

    # get op compute time from hwts data and framework data, write output_op_compute_time.txt
    opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
    opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
    opcompute_output_filename = validate_and_normalize_path(opcompute_output_filename)
    optime_parser = OPComputeTimeParser(hwts_output_filename, opcompute_output_filename,
                                        op_task_dict, self._output_path, self._dev_id)
    optime_parser.execute()

    # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
    output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
    output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
    output_data_preprocess_aicpu = validate_and_normalize_path(output_data_preprocess_aicpu)
    aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
    aicpu_data_parser.execute()

    # Parsing minddata AICPU profiling
    MinddataParser.execute(source_path, self._output_path, self._dev_id)

    # parse minddata pipeline operator and queue
    try:
        pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
        pipeline_parser.parse()
    except ProfilerException as err:
        logger.warning(err.message)

    # analyse op compute time info
    try:
        self._analyser_op_info()
    except ProfilerException as err:
        logger.warning(err.message)

    # analyse step trace info
    points = None
    try:
        points = self._analyse_step_trace(source_path, framework_parser)
    except ProfilerException as err:
        logger.warning(err.message)

    # analyse timeline info
    try:
        self._analyse_timeline(aicpu_data_parser, optime_parser, source_path)
    except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
        logger.warning('Fail to write timeline data: %s', err)

    # analyse memory usage info
    try:
        self._analyse_memory_usage(points)
    except (ProfilerIOException, ProfilerFileNotFoundException, ProfilerRawFileException) as err:
        logger.warning(err.message)

    os.environ['PROFILING_MODE'] = str("false")
    context.set_context(enable_profiling=False)
def test_cpp_uniform_augment(plot=False, num_ops=2):
    """
    Test UniformAugment
    """
    logger.info("Test CPP UniformAugment")

    # Original Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transforms_original = [C.Decode(), C.Resize(size=[224, 224]),
                           F.ToTensor()]

    ds_original = data_set.map(operations=transforms_original, input_columns="image")
    ds_original = ds_original.batch(512)

    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_original = np.append(images_original,
                                        np.transpose(image.asnumpy(), (0, 2, 3, 1)),
                                        axis=0)

    # UniformAugment Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]),
                     C.RandomHorizontalFlip(),
                     C.RandomVerticalFlip(),
                     C.RandomColorAdjust(),
                     C.RandomRotation(degrees=45)]

    uni_aug = C.UniformAugment(transforms=transforms_ua, num_ops=num_ops)

    transforms_all = [C.Decode(), C.Resize(size=[224, 224]),
                      uni_aug,
                      F.ToTensor()]

    ds_ua = data_set.map(operations=transforms_all, input_columns="image", num_parallel_workers=1)
    ds_ua = ds_ua.batch(512)

    for idx, (image, _) in enumerate(ds_ua):
        if idx == 0:
            images_ua = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_ua = np.append(images_ua,
                                  np.transpose(image.asnumpy(), (0, 2, 3, 1)),
                                  axis=0)

    if plot:
        visualize_list(images_original, images_ua)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_ua[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))
def load_checkpoint(ckpt_file_name, net=None):
    """
    Loads checkpoint info from a specified file.

    Args:
        ckpt_file_name (str): Checkpoint file name.
        net (Cell): Cell network. Default: None.

    Returns:
        Dict, key is parameter name, value is a Parameter.

    Raises:
        ValueError: Checkpoint file is incorrect.
    """
    if not isinstance(ckpt_file_name, str):
        raise ValueError("The ckpt_file_name must be a string.")

    if not os.path.exists(ckpt_file_name):
        raise ValueError("The checkpoint file does not exist.")

    if ckpt_file_name[-5:] != ".ckpt":
        raise ValueError("Please input the correct checkpoint file name.")

    if os.path.getsize(ckpt_file_name) == 0:
        raise ValueError("The checkpoint file may be empty, please make sure you entered the correct file name.")

    logger.info("Execute load checkpoint process.")
    checkpoint_list = Checkpoint()

    try:
        with open(ckpt_file_name, "rb") as f:
            pb_content = f.read()
        checkpoint_list.ParseFromString(pb_content)
    except BaseException as e:
        logger.error("Failed to read the checkpoint file `%s`, please check whether the file is valid.",
                     ckpt_file_name)
        raise ValueError(e.__str__())

    parameter_dict = {}
    try:
        element_id = 0
        param_data_list = []
        for element in checkpoint_list.value:
            data = element.tensor.tensor_content
            data_type = element.tensor.tensor_type
            np_type = tensor_to_np_type[data_type]
            ms_type = tensor_to_ms_type[data_type]
            element_data = np.frombuffer(data, np_type)
            param_data_list.append(element_data)
            if (element_id == len(checkpoint_list.value) - 1) or \
                    (element.tag != checkpoint_list.value[element_id + 1].tag):
                param_data = np.concatenate(param_data_list, axis=0)
                param_data_list.clear()
                dims = element.tensor.dims
                if dims == [0]:
                    if 'Float' in data_type:
                        param_data = float(param_data[0])
                    elif 'Int' in data_type:
                        param_data = int(param_data[0])
                    parameter_dict[element.tag] = Parameter(Tensor(param_data, ms_type), name=element.tag)
                elif dims == [1]:
                    parameter_dict[element.tag] = Parameter(Tensor(param_data, ms_type), name=element.tag)
                else:
                    param_dim = []
                    for dim in dims:
                        param_dim.append(dim)
                    param_value = param_data.reshape(param_dim)
                    parameter_dict[element.tag] = Parameter(Tensor(param_value, ms_type), name=element.tag)
            element_id += 1

        logger.info("Load checkpoint process finished.")
    except BaseException as e:
        logger.error("Failed to load the checkpoint file `%s`.", ckpt_file_name)
        raise RuntimeError(e.__str__())

    if net is not None:
        load_param_into_net(net, parameter_dict)

    return parameter_dict
def parse_print(print_file_name):
    """
    Loads Print data from a specified file.

    Args:
        print_file_name (str): The file name of the saved print data.

    Returns:
        List, each element of the list is a Tensor.

    Raises:
        ValueError: The print file may be empty, please make sure you entered the correct file name.
    """
    print_file_path = os.path.realpath(print_file_name)

    if os.path.getsize(print_file_path) == 0:
        raise ValueError("The print file may be empty, please make sure you entered the correct file name.")

    logger.info("Execute load print process.")
    print_list = Print()

    try:
        with open(print_file_path, "rb") as f:
            pb_content = f.read()
        print_list.ParseFromString(pb_content)
    except BaseException as e:
        logger.error("Failed to read the print file %s, please check whether the file is valid.", print_file_name)
        raise ValueError(e.__str__())

    tensor_list = []

    try:
        for print_ in print_list.value:
            # String type
            if print_.HasField("desc"):
                tensor_list.append(print_.desc)
            elif print_.HasField("tensor"):
                dims = print_.tensor.dims
                data_type = print_.tensor.tensor_type
                data = print_.tensor.tensor_content
                np_type = tensor_to_np_type[data_type]
                param_data = np.frombuffer(data, np_type)
                ms_type = tensor_to_ms_type[data_type]
                param_dim = []
                for dim in dims:
                    param_dim.append(dim)
                if param_dim:
                    param_value = param_data.reshape(param_dim)
                    tensor_list.append(Tensor(param_value, ms_type))
                # Scalar type
                else:
                    data_type_ = data_type.lower()
                    if 'float' in data_type_:
                        param_data = float(param_data[0])
                    elif 'int' in data_type_:
                        param_data = int(param_data[0])
                    elif 'bool' in data_type_:
                        param_data = bool(param_data[0])
                    tensor_list.append(Tensor(param_data, ms_type))

    except BaseException as e:
        logger.error("Failed to load the print file %s.", print_file_name)
        raise RuntimeError(e.__str__())

    return tensor_list
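# Usage sketch: assuming a network that contains an ops.Print operator and a context configured with
# print_file_path, the dump written during execution can be converted back into Python objects.
# The file name below is a placeholder.
#
#     context.set_context(print_file_path="./print_output.data")   # run the network first
#     data_list = parse_print("./print_output.data")
#     for item in data_list:
#         print(item)    # each item is a string description or a mindspore.Tensor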
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse_init()
    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    is_auto_enable_graph_kernel = _auto_enable_graph_kernel(args_opt.device_target, args_opt.enable_graph_kernel)
    _set_graph_kernel_context(args_opt.device_target, args_opt.enable_graph_kernel, is_auto_enable_graph_kernel)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    _check_compute_type(args_opt, is_auto_enable_graph_kernel)

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        accumulation_steps = args_opt.accumulation_steps
        enable_global_norm = cfg.enable_global_norm
        if accumulation_steps <= 1:
            if cfg.optimizer == 'AdamWeightDecay' and args_opt.device_target == 'GPU':
                net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(net_with_loss, optimizer=optimizer,
                                                                          scale_update_cell=update_cell)
            else:
                net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                                   scale_update_cell=update_cell)
        else:
            allreduce_post = args_opt.distribute == "false" or args_opt.allreduce_post_accumulation == "true"
            net_with_accumulation = (BertTrainAccumulationAllReducePostWithLossScaleCell if allreduce_post else
                                     BertTrainAccumulationAllReduceEachWithLossScaleCell)
            net_with_grads = net_with_accumulation(net_with_loss, optimizer=optimizer,
                                                   scale_update_cell=update_cell,
                                                   accumulation_steps=accumulation_steps,
                                                   enable_global_norm=enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer,
                                                      frequency=cfg.Thor.frequency)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
def test_net():
    x = np.random.randn(2, 5, 8).astype(np.float32)
    mask = np.random.randn(16).astype(np.uint8)
    keep_prob = 1

    ddm = Net()
    output = ddm(Tensor(x), Tensor(mask), Tensor(keep_prob))
    logger.info("***********x*********")
    logger.info(x)
    logger.info("***********mask*********")
    logger.info(mask)
    logger.info("***********keep_prob*********")
    logger.info(keep_prob)
    logger.info("***********output y*********")
    logger.info(output.asnumpy())
    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up about 10 samples
    ds1 = ds.RandomDataset(schema=schema, num_samples=10, num_parallel_workers=1)

    # cache size allows for about 4 images since each image is just a bit less than 1 MB;
    # after that we will have to spill
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():  # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        # logger.info(data["image"])
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {}".format(num_iter))
    assert num_iter == 40


if __name__ == '__main__':
    test_randomdataset_basic1()
    test_randomdataset_basic2()
    logger.info('test_randomdataset_basic Ended.\n')
def write_timeline(self, size_limit=SIZE_LIMIT_DEFAULT):
    """Write the parsed timeline data to a JSON file."""
    # Write timeline to file.
    logger.info('Writing timeline file...')
    self.write_timeline_to_json_by_limitation(size_limit)
    logger.info('Finished file writing!')
def visualize_with_bounding_boxes(orig, aug, annot_name="bbox", plot_rows=3):
    """
    Take a list of un-augmented and augmented images with "bbox" bounding boxes
    and plot the images to compare and test correct BBox augment functionality.

    :param orig: list of original images and bboxes (without aug)
    :param aug: list of augmented images and bboxes
    :param annot_name: the dict key for bboxes in data, e.g "bbox" (COCO) / "bbox" (VOC)
    :param plot_rows: number of rows on plot (rows = samples on one plot)
    :return: None
    """

    def add_bounding_boxes(ax, bboxes):
        for bbox in bboxes:
            rect = patches.Rectangle((bbox[0], bbox[1]),
                                     bbox[2] * 0.997, bbox[3] * 0.997,
                                     linewidth=1.80, edgecolor='r', facecolor='none')
            # Add the patch to the Axes
            # Params to Rectangle slightly modified to prevent drawing overflow
            ax.add_patch(rect)

    # Quick check to confirm correct input parameters
    if not isinstance(orig, list) or not isinstance(aug, list):
        return
    if len(orig) != len(aug) or not orig:
        return

    batch_size = int(len(orig) / plot_rows)  # creates batches of images to plot together
    split_point = batch_size * plot_rows

    orig, aug = np.array(orig), np.array(aug)

    if len(orig) > plot_rows:
        # Create batches of required size and add remainder to last batch
        orig = np.split(orig[:split_point], batch_size) + (
            [orig[split_point:]] if (split_point < orig.shape[0]) else [])  # check to avoid empty arrays being added
        aug = np.split(aug[:split_point], batch_size) + (
            [aug[split_point:]] if (split_point < aug.shape[0]) else [])
    else:
        orig = [orig]
        aug = [aug]

    for ix, allData in enumerate(zip(orig, aug)):
        base_ix = ix * plot_rows  # current batch starting index
        curPlot = len(allData[0])

        fig, axs = plt.subplots(curPlot, 2)
        fig.tight_layout(pad=1.5)

        for x, (dataA, dataB) in enumerate(zip(allData[0], allData[1])):
            cur_ix = base_ix + x
            # select plotting axes based on number of image rows on plot - else case when 1 row
            (axA, axB) = (axs[x, 0], axs[x, 1]) if (curPlot > 1) else (axs[0], axs[1])

            axA.imshow(dataA["image"])
            add_bounding_boxes(axA, dataA[annot_name])
            axA.title.set_text("Original" + str(cur_ix + 1))

            axB.imshow(dataB["image"])
            add_bounding_boxes(axB, dataB[annot_name])
            axB.title.set_text("Augmented" + str(cur_ix + 1))

            logger.info("Original **\n{} : {}".format(str(cur_ix + 1), dataA[annot_name]))
            logger.info("Augmented **\n{} : {}\n".format(str(cur_ix + 1), dataB[annot_name]))

        plt.show()
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=4, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="false", help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="false", help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="100", help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000,
                        help="Save checkpoint steps, default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1,
                        help="Training Steps, default is -1, meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id, save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(max_call_depth=3000)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'

        context.reset_auto_parallel_context()
        _set_bert_all_reduce_split()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
    if args_opt.enable_save_ckpt == "true" and rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                           scale_update_cell=update_cell)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads, frequency=cfg.Thor.frequency)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
def test_greater_2d_scalar0():
    a = np.random.randint(-5, 5, [8, 32]).astype(np.int32)
    b = np.random.randint(-5, 5, [8, 32]).astype(np.int32)
    out_me = me_greater(Tensor(a), Tensor(b))
    logger.info("Check me result:")
    logger.info(out_me)
SCHEMA_DIR_2 = "../data/dataset/testTFBert5Rows2/datasetSchema.json"


def test_rename():
    data1 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)
    data2 = ds.TFRecordDataset(DATA_DIR_2, SCHEMA_DIR_2, shuffle=False)

    data2 = data2.rename(input_columns=["input_ids", "segment_ids"], output_columns=["masks", "seg_ids"])

    data = ds.zip((data1, data2))
    data = data.repeat(3)

    num_iter = 0
    for i, item in enumerate(data.create_dict_iterator()):
        logger.info("item[mask] is {}".format(item["masks"]))
        assert item["masks"].all() == item["input_ids"].all()
        logger.info("item[seg_ids] is {}".format(item["seg_ids"]))
        assert item["segment_ids"].all() == item["seg_ids"].all()
        # need to consume the data in the buffer
        num_iter += 1
    logger.info("Number of data in data: {}".format(num_iter))
    assert num_iter == 15


if __name__ == '__main__':
    logger.info('===========test Rename Repeat===========')
    test_rename()
    logger.info('\n')
def util_test_random_color_adjust_op(brightness=(1, 1), contrast=(1, 1), saturation=(1, 1), hue=(0, 0), plot=False):
    """
    Util function that tests RandomColorAdjust for a specific argument
    """

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    decode_op = c_vision.Decode()
    random_adjust_op = c_vision.RandomColorAdjust(brightness=brightness, contrast=contrast,
                                                  saturation=saturation, hue=hue)

    ctrans = [decode_op,
              random_adjust_op,
              ]

    data1 = data1.map(operations=ctrans, input_columns=["image"])

    # Second dataset
    transforms = [
        py_vision.Decode(),
        py_vision.RandomColorAdjust(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue),
        py_vision.ToTensor()
    ]
    transform = mindspore.dataset.transforms.py_transforms.Compose(transforms)
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    data2 = data2.map(operations=transform, input_columns=["image"])

    num_iter = 0
    for item1, item2 in zip(data1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        num_iter += 1
        c_image = item1["image"]
        py_image = (item2["image"].transpose(1, 2, 0) * 255).astype(np.uint8)

        logger.info("shape of c_image: {}".format(c_image.shape))
        logger.info("shape of py_image: {}".format(py_image.shape))
        logger.info("dtype of c_image: {}".format(c_image.dtype))
        logger.info("dtype of py_image: {}".format(py_image.dtype))

        mse = diff_mse(c_image, py_image)
        logger.info("mse is {}".format(mse))
        logger.info("random_color_adjust_op_{}, mse: {}".format(num_iter + 1, mse))
        assert mse < 0.01
        if plot:
            visualize_image(c_image, py_image, mse)
            pass
        assert False
    except TypeError:
        pass

    try:
        data2 = data1.apply(dataset_fn)
        _ = data1.apply(dataset_fn)
        for _, _ in zip(data1.create_dict_iterator(), data2.create_dict_iterator()):
            pass
        assert False
    except ValueError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))


if __name__ == '__main__':
    logger.info("Running test_apply.py test_apply_generator_case() function")
    test_apply_generator_case()

    logger.info("Running test_apply.py test_apply_imagefolder_case() function")
    test_apply_imagefolder_case()

    logger.info("Running test_apply.py test_apply_flow_case(id) function")
    test_apply_flow_case_0()
    test_apply_flow_case_1()
    test_apply_flow_case_2()
    test_apply_flow_case_3()

    logger.info("Running test_apply.py test_apply_exception_case() function")
    test_apply_exception_case()
def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter, source_path):
    """
    Init timeline metadata, adding all collected info.

    Args:
        all_reduce_info (list[list]): The metadata of AllReduce operator.
        framework_info (dict): The framework metadata.
        aicpu_info (dict): The metadata of AI CPU operator.
        min_cycle_counter (float): The minimum cycle counter of the timeline.
        source_path (str): The source path of the profiling data.
    """
    if min_cycle_counter == float('inf'):
        min_cycle_counter = 0

    logger.info('Initiating timeline...')
    timeline_list = self._load_timeline_data()
    cpu_timeline_generator = CpuTimelineGenerator(self._profiling_dir, self._device_id)
    cpu_timeline_list = cpu_timeline_generator.get_timeline_data()
    if cpu_timeline_list:
        self._clock_synchronize_to_host(timeline_list, source_path)
        timeline_list.extend(cpu_timeline_list)
        timeline_list.sort(key=lambda x: float(x[2]))
    self._timeline_summary['op_exe_times'] = len(timeline_list)

    # Add AllReduce info to timeline temp list and sort by start time.
    if all_reduce_info:
        logger.debug('AllReduce info found. Start adding info into timeline...')
        timeline_list.extend(all_reduce_info)
        timeline_list.sort(key=lambda x: float(x[2]))

    # Add AI CPU data into timeline temp list and sort by start time.
    aicpu_data = aicpu_info.get('info')
    if aicpu_data:
        timeline_list.extend(aicpu_data)
        timeline_list.sort(key=lambda x: float(x[2]))
        self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
        self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
        self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
        self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)

    # Init a dict for counting the num of streams.
    stream_count_dict = {}
    for timeline in timeline_list:
        self._parse_timeline_data(timeline, min_cycle_counter)
        # Updating the collection of streams.
        if len(timeline) == 4:
            self._update_num_of_streams(timeline, stream_count_dict)

    # Get framework metadata.
    framework_obj_list = framework_info.get('object')
    # The length of list is the number of operators.
    self._timeline_summary['num_of_ops'] += len(framework_obj_list)
    self._add_framework_info(framework_obj_list)
    logger.info('Finished adding info into timeline...')

    # Update timeline summary info
    self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
def analyse(self):
    """
    Collect and analyse performance data, called after training or during training.

    Examples:
        >>> from mindspore.profiler import Profiler
        >>> import mindspore.context
        >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
        ...                     device_id=int(os.environ["DEVICE_ID"]))
        >>> profiler = Profiler()
        >>> model = Model()
        >>> model.train()
        >>> profiler.analyse()
    """
    if self._device_target and self._device_target == "GPU":
        self._gpu_profiler.stop()
        self._generate_timeline()

        # parse minddata pipeline operator and queue for GPU
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
            pipeline_parser.parse()
        except ProfilerException as err:
            logger.warning(err.message)

        os.environ['PROFILING_MODE'] = str("false")

    elif self._device_target and self._device_target == "Ascend":
        release()

        job_id = self._get_profiling_job_id()
        logger.info("Profiling: job id is %s ", job_id)

        source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
        # parse hwts.log.data.45.dev file, and get task profiling data
        hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
        hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
        source_path = validate_and_normalize_path(source_path)
        hwts_output_filename = validate_and_normalize_path(hwts_output_filename)
        hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
        _ = hwtslog_parser.execute()

        # parse Framework file, and get the relation of op and tasks
        framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
        framework_parser.parse()
        op_task_dict = framework_parser.to_task_id_full_op_name_dict()
        if not op_task_dict:
            logger.error("Profiling: fail to parse framework files.")
            return

        # get op compute time from hwts data and framework data, write output_op_compute_time.txt
        opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
        opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
        opcompute_output_filename = validate_and_normalize_path(opcompute_output_filename)
        optime_parser = OPComputeTimeParser(hwts_output_filename, opcompute_output_filename,
                                            op_task_dict, self._output_path, self._dev_id)
        optime_parser.execute()

        # parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
        output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
        output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
        output_data_preprocess_aicpu = validate_and_normalize_path(output_data_preprocess_aicpu)
        aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
        aicpu_data_parser.execute()

        # Parsing minddata AICPU profiling
        MinddataParser.execute(source_path, self._output_path, self._dev_id)

        # parse minddata pipeline operator and queue
        try:
            pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
            pipeline_parser.parse()
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse op compute time info
        try:
            self._analyser_op_info()
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse step trace info
        try:
            self._analyse_step_trace(source_path, framework_parser)
        except ProfilerException as err:
            logger.warning(err.message)

        # analyse timeline info
        try:
            self._analyse_timeline(aicpu_data_parser, optime_parser)
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
            logger.warning('Fail to write timeline data: %s', err)

        os.environ['PROFILING_MODE'] = str("false")
        context.set_context(enable_profiling=False)
def test_schema_simple():
    logger.info("test_schema_simple")
    ds.Schema(SCHEMA_FILE)
def save_checkpoint(save_obj, ckpt_file_name, integrated_save=True, async_save=False):
    """
    Saves checkpoint info to a specified file.

    Args:
        save_obj (nn.Cell or list): The cell object or parameters list (each element is a dictionary,
            like {"name": param_name, "data": param_data}).
        ckpt_file_name (str): Checkpoint file name. If the file name already exists, it will be overwritten.
        integrated_save (bool): Whether to perform integrated save in the automatic model parallel scenario.
            Default: True.
        async_save (bool): Whether to save the checkpoint to a file asynchronously. Default: False.

    Raises:
        TypeError: If the parameter save_obj is not nn.Cell or list type.
        RuntimeError: Failed to save the checkpoint file.
    """
    if not isinstance(save_obj, nn.Cell) and not isinstance(save_obj, list):
        raise TypeError("The parameter save_obj should be nn.Cell or list, but got {}".format(type(save_obj)))

    logger.info("Execute save checkpoint process.")

    if isinstance(save_obj, nn.Cell):
        save_obj.init_parameters_data()
        param_dict = {}
        for _, param in save_obj.parameters_and_names():
            param_dict[param.name] = param
        param_list = []
        for (key, value) in param_dict.items():
            each_param = {"name": key}
            if isinstance(value.data, Tensor):
                param_data = value.data
            else:
                param_data = Tensor(value.data)

            # in the automatic model parallel scenario, some parameters were split across all the devices,
            # which should be combined before saving
            if integrated_save and key in save_obj.parameter_layout_dict:
                param_data = _get_merged_param_data(save_obj, key, param_data)

            each_param["data"] = param_data
            param_list.append(each_param)
        save_obj = param_list

    data_list = {}
    with _ckpt_mutex:
        for param in save_obj:
            key = param["name"]
            data_list[key] = []
            if isinstance(param["data"], Parameter):
                param["data"].init_data()
            dims = []
            if param['data'].shape == ():
                dims.append(0)
            else:
                for dim in param['data'].shape:
                    dims.append(dim)
            data_list[key].append(dims)
            tensor_type = str(param["data"].dtype)
            data_list[key].append(tensor_type)
            data = param["data"].asnumpy().reshape(-1)
            data_list[key].append(data)

    if async_save:
        thr = Thread(target=_exec_save, args=(ckpt_file_name, data_list), name="asyn_save_ckpt")
        thr.start()
    else:
        _exec_save(ckpt_file_name, data_list)

    logger.info("Save checkpoint process finished.")
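# Usage sketch: two hypothetical ways of calling save_checkpoint, with a whole Cell and with an
# explicit parameter list. The network object and file names are placeholders.
#
#     save_checkpoint(net, "./lenet.ckpt")                           # save every parameter of a Cell
#
#     params = [{"name": "fc1.weight", "data": Tensor(np.ones((3, 3), np.float32))}]
#     save_checkpoint(params, "./partial.ckpt", async_save=True)     # save a hand-built list without blocking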
def test_numpy_slice_empty_output_shape():
    logger.info("running test_numpy_slice_empty_output_shape")

    dataset = de.NumpySlicesDataset([[[1, 2], [3, 4]]], column_names=["col1"])
    dataset = dataset.batch(batch_size=3, drop_remainder=True)
    assert dataset.output_shapes() == []
def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial():
    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
    cv_schema_json = {"id": {"type": "int32"},
                      "image_0": {"type": "bytes"},
                      "image_2": {"type": "bytes"},
                      "image_3": {"type": "bytes"},
                      "image_4": {"type": "bytes"},
                      "input_mask": {"type": "int32", "shape": [-1]},
                      "segments": {"type": "float32", "shape": [2, 3]}}
    writer.add_schema(cv_schema_json, "two_images_schema")
    with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader:
        img_data = file_reader.read()
    ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32)
    ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32)
    data = []
    for i in range(5):
        item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data,
                "image_4": img_data, "input_mask": ndarray_1, "segments": ndarray_2}
        data.append(item)
    writer.write_raw_data(data)
    writer.commit()
    assert os.path.exists(CV_FILE_NAME)
    assert os.path.exists(CV_FILE_NAME + ".db")

    # tutorial for MindDataset
    columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"]
    num_readers = 1
    data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers)
    assert data_set.get_dataset_size() == 5
    num_iter = 0
    for item in data_set.create_dict_iterator():
        assert len(item) == 7
        logger.info("item: {}".format(item))
        assert item["image_0"].dtype == np.uint8
        assert (item["image_0"] == item["image_2"]).all()
        assert (item["image_3"] == item["image_4"]).all()
        assert (item["image_0"] == item["image_4"]).all()
        assert item["image_2"].dtype == np.uint8
        assert item["image_3"].dtype == np.uint8
        assert item["image_4"].dtype == np.uint8
        assert item["id"].dtype == np.int32
        assert item["input_mask"].shape == (5,)
        assert item["input_mask"].dtype == np.int32
        assert item["segments"].shape == (2, 3)
        assert item["segments"].dtype == np.float32
        num_iter += 1
    assert num_iter == 5

    if os.path.exists("{}".format(CV_FILE_NAME + ".db")):
        os.remove(CV_FILE_NAME + ".db")
    if os.path.exists("{}".format(CV_FILE_NAME)):
        os.remove(CV_FILE_NAME)
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true", help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="true", help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="1", help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--accumulation_steps", type=int, default="1",
                        help="Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000,
                        help="Save checkpoint steps, default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1,
                        help="Training Steps, default is -1, meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices([29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices([30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(bert_net_cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
        logger.info("train steps: {}".format(args_opt.train_steps))

    if cfg.optimizer == 'Lamb':
        lr_schedule = BertLearningRate(learning_rate=cfg.Lamb.learning_rate,
                                       end_learning_rate=cfg.Lamb.end_learning_rate,
                                       warmup_steps=cfg.Lamb.warmup_steps,
                                       decay_steps=args_opt.train_steps,
                                       power=cfg.Lamb.power)
        params = net_with_loss.trainable_params()
        decay_params = list(filter(cfg.Lamb.decay_filter, params))
        other_params = list(filter(lambda x: not cfg.Lamb.decay_filter(x), params))
        group_params = [{'params': decay_params, 'weight_decay': cfg.Lamb.weight_decay},
                        {'params': other_params},
                        {'order_params': params}]
        optimizer = Lamb(group_params, learning_rate=lr_schedule, eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(net_with_loss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = BertLearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=cfg.AdamWeightDecay.warmup_steps,
                                       decay_steps=args_opt.train_steps,
                                       power=cfg.AdamWeightDecay.power)
        params = net_with_loss.trainable_params()
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0},
                        {'order_params': params}]
        optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps)
    else:
        raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]"
                         .format(cfg.optimizer))

    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                               scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                                       scale_update_cell=update_cell,
                                                                       accumulation_steps=accumulation_steps)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
def export(net, *inputs, file_name, file_format='AIR'):
    """
    Exports a MindSpore predict model to a file in the specified format.

    Args:
        net (Cell): MindSpore network.
        inputs (Tensor): Inputs of the `net`.
        file_name (str): File name of the model to be exported.
        file_format (str): MindSpore currently supports 'AIR', 'ONNX' and 'MINDIR' format for the exported model.

            - AIR: Ascend Intermediate Representation. An intermediate representation format of Ascend model.
              Recommended suffix for output file is '.air'.
            - ONNX: Open Neural Network eXchange. An open format built to represent machine learning models.
              Recommended suffix for output file is '.onnx'.
            - MINDIR: MindSpore Native Intermediate Representation for Anf. An intermediate representation format
              for MindSpore models. Recommended suffix for output file is '.mindir'.
    """
    logger.info("exporting model file:%s format:%s.", file_name, file_format)
    check_input_data(*inputs, data_class=Tensor)

    if file_format == 'GEIR':
        logger.warning("Format 'GEIR' is deprecated, it would be removed in future release, use 'AIR' instead.")
        file_format = 'AIR'

    supported_formats = ['AIR', 'ONNX', 'MINDIR']
    if file_format not in supported_formats:
        raise ValueError(f'Illegal file format {file_format}, it must be one of {supported_formats}')
    # switch network mode to infer when it is training
    is_training = net.training
    if is_training:
        net.set_train(mode=False)
    # export model
    net.init_parameters_data()
    if file_format == 'AIR':
        phase_name = 'export.air'
        graph_id, _ = _executor.compile(net, *inputs, phase=phase_name)
        _executor.export(file_name, graph_id)
    elif file_format == 'ONNX':  # file_format is 'ONNX'
        phase_name = 'export.onnx'
        graph_id, _ = _executor.compile(net, *inputs, phase=phase_name, do_convert=False)
        onnx_stream = _executor._get_func_graph_proto(graph_id)
        with open(file_name, 'wb') as f:
            os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR)
            f.write(onnx_stream)
    elif file_format == 'MINDIR':  # file_format is 'MINDIR'
        phase_name = 'export.mindir'
        graph_id, _ = _executor.compile(net, *inputs, phase=phase_name, do_convert=False)
        onnx_stream = _executor._get_func_graph_proto(graph_id, 'mind_ir')
        with open(file_name, 'wb') as f:
            os.chmod(file_name, stat.S_IWUSR | stat.S_IRUSR)
            f.write(onnx_stream)
    # restore network training mode
    if is_training:
        net.set_train(mode=True)
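# Usage sketch: a hypothetical export of a trained network to MINDIR. A dummy input fixes the
# graph's input shape; the LeNet5 class, checkpoint path and input shape are placeholders.
#
#     net = LeNet5()
#     load_checkpoint("./checkpoint/LeNet5-1_32.ckpt", net=net)
#     dummy_input = Tensor(np.ones([1, 1, 32, 32]).astype(np.float32))
#     export(net, dummy_input, file_name="lenet.mindir", file_format='MINDIR')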
    Test shuffle exception: buffer_size wrong type, boolean value True
    """
    logger.info("test_shuffle_exception_07")

    # apply dataset operations
    data1 = ds.TFRecordDataset(DATA_DIR)
    ds.config.set_seed(1)
    try:
        data1 = data1.shuffle(buffer_size=True)
        sum([1 for _ in data1])

    except Exception as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "buffer_size" in str(e)


if __name__ == '__main__':
    test_shuffle_01()
    test_shuffle_02()
    test_shuffle_03()
    test_shuffle_04()
    test_shuffle_05()
    test_shuffle_06()
    test_shuffle_exception_01()
    test_shuffle_exception_02()
    test_shuffle_exception_03()
    test_shuffle_exception_05()
    test_shuffle_exception_06()
    test_shuffle_exception_07()
    logger.info('\n')
def __init__(self, **kwargs):
    # get device_id and device_target
    self._get_devid_and_devtarget()
    self._get_output_path(kwargs)

    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = self._output_path

    if self._device_target:
        CPUProfiler = c_expression.CPUProfiler
        self._cpu_profiler = CPUProfiler.get_instance()
        self._cpu_profiler.init(self._output_path)
        self._cpu_profiler.step_profiling_enable(True)

    if self._device_target and self._device_target == "GPU":
        GPUProfiler = c_expression.GPUProfiler
        self._gpu_profiler = GPUProfiler.get_instance()
        self._gpu_profiler.init(self._output_path)
        self._gpu_profiler.step_profiling_enable(True)
        if GlobalComm.WORLD_COMM_GROUP == "nccl_world_group":
            self._dev_id = str(get_rank())
        os.environ['DEVICE_ID'] = self._dev_id

        if kwargs:
            logger.warning("Params are not supported yet on GPU.")
    elif self._device_target and self._device_target == "Ascend":
        optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
        if not isinstance(optypes_not_deal, str):
            raise TypeError("The parameter optypes_not_deal must be str.")
        job_dir = kwargs.pop("ascend_job_id", "")
        if job_dir:
            job_dir = validate_and_normalize_path(job_dir)
            if not os.path.exists(job_dir):
                msg = f"Invalid ascend_job_id: {job_dir}, please pass the absolute path of the JOB dir"
                logger.error(msg)
                raise ValueError(msg)
            self._output_path, _ = os.path.split(job_dir)
        if kwargs:
            logger.warning("There are invalid params which don't work.")

        os.environ['DEVICE_ID'] = self._dev_id

        fp_point = os.environ.get("PROFILING_FP_START", "")
        bp_point = os.environ.get("PROFILING_BP_END", "")

        profiling_options = {
            "output": self._output_path,
            "fp_point": fp_point,
            "bp_point": bp_point,
            "training_trace": "on",
            "task_trace": "on",
            "aic_metrics": "PipeUtilization",
            "aicpu": "on"
        }

        profiling_options = json.dumps(profiling_options)
        # Characters longer than 2048 are ignored, resulting in profiling option resolution errors
        if len(profiling_options) > 2048:
            msg = "The parameter length exceeds the limit (2048), please input valid parameters."
            logger.error(msg)
            raise ValueError(msg)
        # use context interface to open profiling, for the new mindspore version (after 2020.5.21)
        context.set_context(enable_profiling=True, profiling_options=profiling_options)

        base_profiling_container_path = os.path.join(self._output_path, "container")
        container_path = os.path.join(base_profiling_container_path, self._dev_id)
        data_path = os.path.join(container_path, "data")
        data_path = validate_and_normalize_path(data_path)
        if not os.path.exists(data_path):
            os.makedirs(data_path, exist_ok=True)

        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        # add job id env through user input later
        self._job_id_env = 0
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
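# Usage sketch: the profiler is constructed before the network runs and finalized with analyse()
# after training. The output_path keyword and the train() call below are placeholders.
#
#     profiler = Profiler(output_path="./profiler_data")   # start collecting
#     train(net)                                           # hypothetical training loop
#     profiler.analyse()                                   # parse and write the analysis results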
def test_pad_op():
    """
    Test Pad op
    """
    logger.info("test_pad_op")

    # First dataset
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    decode_op = c_vision.Decode()
    pad_op = c_vision.Pad((100, 100, 100, 100))
    ctrans = [decode_op,
              pad_op,
              ]

    data1 = data1.map(operations=ctrans, input_columns=["image"])

    # Second dataset
    transforms = [
        py_vision.Decode(),
        py_vision.Pad(100),
        py_vision.ToTensor(),
    ]
    transform = mindspore.dataset.transforms.py_transforms.Compose(transforms)
    data2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False)
    data2 = data2.map(operations=transform, input_columns=["image"])

    for item1, item2 in zip(data1.create_dict_iterator(num_epochs=1, output_numpy=True),
                            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        c_image = item1["image"]
        py_image = (item2["image"].transpose(1, 2, 0) * 255).astype(np.uint8)

        logger.info("shape of c_image: {}".format(c_image.shape))
        logger.info("shape of py_image: {}".format(py_image.shape))
        logger.info("dtype of c_image: {}".format(c_image.dtype))
        logger.info("dtype of py_image: {}".format(py_image.dtype))

        mse = diff_mse(c_image, py_image)
        logger.info("mse is {}".format(mse))
        assert mse < 0.01
def run(self):
    """
    Executes transformation from imagenet to MindRecord.

    Returns:
        SUCCESS or FAILED, whether imagenet is successfully transformed to MindRecord.
    """
    t0_total = time.time()

    imagenet_schema_json = {"label": {"type": "int32"},
                            "image": {"type": "bytes"},
                            "file_name": {"type": "string"}}

    logger.info("transformed MindRecord schema is: {}".format(imagenet_schema_json))

    # set the header size
    self.writer.set_header_size(1 << 24)

    # set the page size
    self.writer.set_page_size(1 << 26)

    # create the schema
    self.writer.add_schema(imagenet_schema_json, "imagenet_schema")

    # add the index
    self.writer.add_index(["label", "file_name"])

    imagenet_iter = self._get_imagenet_as_dict()
    batch_size = 256
    transform_count = 0
    while True:
        data_list = []
        try:
            for _ in range(batch_size):
                data_list.append(imagenet_iter.__next__())
                transform_count += 1
            self.writer.write_raw_data(data_list)
            logger.info("transformed {} record...".format(transform_count))
        except StopIteration:
            if data_list:
                self.writer.write_raw_data(data_list)
                logger.info("transformed {} record...".format(transform_count))
            break

    ret = self.writer.commit()

    t1_total = time.time()
    logger.info("--------------------------------------------")
    logger.info("END. Total time: {}".format(t1_total - t0_total))
    logger.info("--------------------------------------------")

    return ret
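# Usage sketch: once the conversion above has committed, the generated MindRecord file could be read
# back with MindDataset. The file name and column list are placeholders.
#
#     data_set = ds.MindDataset("imagenet.mindrecord0", columns_list=["label", "image", "file_name"])
#     for item in data_set.create_dict_iterator():
#         pass    # item["image"] holds the raw bytes, item["label"] the int32 label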
def write_timeline(self):
    """Write the parsed timeline data to a JSON file."""
    # Write timeline to file.
    logger.info('Writing timeline file...')
    self.write_timeline_to_json_by_limitation()
    logger.info('Finished file writing!')
def test_uniform_augment(plot=False, num_ops=2):
    """
    Test UniformAugment
    """
    logger.info("Test UniformAugment")

    # Original Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
                                                                              F.Resize((224, 224)),
                                                                              F.ToTensor()])

    ds_original = data_set.map(operations=transforms_original, input_columns="image")
    ds_original = ds_original.batch(512)

    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_original = np.append(images_original,
                                        np.transpose(image.asnumpy(), (0, 2, 3, 1)),
                                        axis=0)

    # UniformAugment Images
    data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)

    transform_list = [F.RandomRotation(45),
                      F.RandomColor(),
                      F.RandomSharpness(),
                      F.Invert(),
                      F.AutoContrast(),
                      F.Equalize()]

    transforms_ua = \
        mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
                                                            F.Resize((224, 224)),
                                                            F.UniformAugment(transforms=transform_list,
                                                                             num_ops=num_ops),
                                                            F.ToTensor()])

    ds_ua = data_set.map(operations=transforms_ua, input_columns="image")
    ds_ua = ds_ua.batch(512)

    for idx, (image, _) in enumerate(ds_ua):
        if idx == 0:
            images_ua = np.transpose(image.asnumpy(), (0, 2, 3, 1))
        else:
            images_ua = np.append(images_ua,
                                  np.transpose(image.asnumpy(), (0, 2, 3, 1)),
                                  axis=0)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_ua[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))

    if plot:
        visualize_list(images_original, images_ua)
def test_invalid_input(test_name, size, interpolation, error, error_msg):
    logger.info("Test Resize with bad input: {0}".format(test_name))
    with pytest.raises(error) as error_info:
        vision.Resize(size, interpolation)
    assert error_msg in str(error_info.value)