def train(cloud_args=None):
    """Training entry point: parse args, set up (optionally distributed) context,
    build dataset/network/optimizer and run ``model.train``.

    Args:
        cloud_args: optional overrides forwarded to ``parse_args`` (e.g. cloud jobs).
    """
    args = parse_args(cloud_args)
    # Graph mode with auto mixed precision; target device comes from parsed args.
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    # Honour an explicit DEVICE_ID from the environment only when it is numeric.
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          parameter_broadcast=True, mirror_mean=True)
    else:
        # Single-device run.
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # for dynamic loss scale can not set loss scale in momentum opt

    # select for master rank save ckpt or all rank save, compatiable for model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger: outputs land in a timestamped sub-directory of the checkpoint path.
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader (sharded by rank/group_size when distributed)
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    # load pretrain model: drop optimizer state ('moments.') and strip the
    # 'network.' wrapper prefix so keys match the bare network's parameters.
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler: per-step schedule built over steps_per_epoch * max_epoch.
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    # Ascend runs at O3 (full fp16), everything else at O2 (mixed precision).
    if args.platform == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O3")
    else:
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O2")

    # checkpoint save (only on ranks flagged above)
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
def run_pretrain():
    """Pre-train BERT: parse CLI options, configure the (optionally distributed)
    context, build the dataset/loss/optimizer and run ``model.train``.

    Raises nothing itself; argparse exits on invalid options.
    """
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"],
                        help="Run distribute, default is false.")
    # Fix: integer options previously used string defaults (default="1"); argparse
    # happens to coerce string defaults through `type`, but plain int defaults are
    # unambiguous and match the other int options below.
    parser.add_argument("--epoch_size", type=int, default=1, help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=["true", "false"],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true", choices=["true", "false"],
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--accumulation_steps", type=int, default=1,
                        help="Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, "
                                                                                "default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, "
                                                                    "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path

    # Distributed setup: on Ascend the rank is derived from device_id, on GPU it
    # comes from the communication library after init().
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
            # Each rank writes checkpoints to its own sub-directory.
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True, device_num=device_num)
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    # Gradient accumulation scales the sink/checkpoint intervals so they are
    # expressed in micro-batches rather than weight updates.
    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    # One "repeat" of model.train covers data_sink_steps steps; cap by
    # --train_steps when given, otherwise derive train_steps from the epochs.
    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
    # Only one device per 8-device group saves checkpoints.
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    # Wrap the loss cell: dynamic loss scaling (with or without accumulation)
    # or a plain one-step cell.
    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                               scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                                       scale_update_cell=update_cell,
                                                                       accumulation_steps=accumulation_steps,
                                                                       enable_global_norm=cfg.enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="",
            eval_type=None, tokenizer_file="", top_k=None, top_p=None,
            temperature=None, generate_length=None):
    """
    Do evaluation on summarization.

    Args:
        dataset: evaluation dataset providing "input_ids", "input_mask", "label_ids".
        network: GPT-2 network class (instantiated here with gpt2_net_cfg).
        metric: metric name; only "Rouge" is supported.
        load_checkpoint_path: fine-tuned checkpoint to load (required).
        eval_type: evaluation mode forwarded to modify_paramdict / the generator.
        tokenizer_file: directory prefix containing gpt2-vocab.json / gpt2-merges.txt.
        top_k, top_p, temperature, generate_length: sampling parameters for generation.

    Raises:
        ValueError: if no checkpoint is given or the metric is unsupported.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "rouge":
        print("Prepare to calculate the Rouge score ...")
        callback = Rouge()

        # Build the eval network and load the (re-keyed) fine-tuned weights.
        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)
        reorganized_param_dict = modify_paramdict(param_dict, mode=eval_type,
                                                  model_prefix="gpt2.")
        load_param_into_net(gpt2_loss, reorganized_param_dict)

        # load nn.Cell into Model and initiate tokenizer and Sample
        model = Model(gpt2_loss)
        tokenizer = Tokenizer(vocab_file=tokenizer_file + 'gpt2-vocab.json',
                              merge_file=tokenizer_file + 'gpt2-merges.txt')

        # load data and process text generation
        columns_list = ["input_ids", "input_mask", "label_ids"]

        summarization_generator = GenerateForSummarization(
            model,
            config=gpt2_net_cfg,
            tokenizer=tokenizer,
            select_sentence=3,
            eval_type=eval_type,
            topk=top_k,
            topp=float(top_p),
            temperature=float(temperature),
            generate_length=generate_length)
        num_data = 1
        print(
            "==================== [Summrization] Testing ====================")
        for data in dataset.create_dict_iterator():
            input_data = []
            for value in columns_list:
                input_data.append(data[value])
            input_ids, _, label_ids = input_data
            print(" | [ROUGE] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))
            print("input_ids shape: {}".format(input_ids.shape))
            print("label_ids shape: {}".format(label_ids.shape))
            hypothesis, ref = summarization_generator.generate_for_summarization(
                input_ids)
            # Skip batches that produced no usable reference text.
            if ref[0] == '' or ref[0] is None:
                print("Sorry ref_list is None, skip it!")
                continue
            print("REF str:\n ", ref, "\nHYPO str:\n", hypothesis, "\n")
            # Clean the generated hypotheses, then lower-case both sides
            # before feeding them to the ROUGE accumulator.
            for batch_idx in range(gpt2_net_cfg.batch_size):
                hypothesis[batch_idx] = clean_hypo(hypothesis[batch_idx])
            for batch_idx in range(gpt2_net_cfg.batch_size):
                hypothesis[batch_idx] = hypothesis[batch_idx].lower()
                ref[batch_idx] = ref[batch_idx].lower()
            callback.update(hypothesis, ref)
            num_data += 1
        print("\n\n")
        print("**********************************************************")
        eval_result_print(metric, callback)
        print("******************** Testing Finished ********************")
    else:
        raise ValueError(
            "metric method not supported in summarization, support: [Rouge]")
def maskrcnn_eval(dataset_path, ckpt_path, ann_file):
    """MaskRcnn evaluation.

    Runs the network over the eval dataset, converts per-image detections to
    COCO-style bbox/segm results and scores them with ``coco_eval``.

    Args:
        dataset_path: path to the prepared evaluation dataset.
        ckpt_path: checkpoint file to load into Mask_Rcnn_Resnet50.
        ann_file: COCO annotation JSON used for scoring.
    """
    ds = create_maskrcnn_dataset(dataset_path, batch_size=config.test_batch_size,
                                 is_training=False)

    net = Mask_Rcnn_Resnet50(config)
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    eval_iter = 0
    total = ds.get_dataset_size()
    outputs = []
    dataset_coco = COCO(ann_file)

    print("\n========================================\n")
    print("total images num: ", total)
    print("Processing, please wait a moment.")
    # Keep at most this many detections per image (highest scores win).
    max_num = 128
    for data in ds.create_dict_iterator(output_numpy=True, num_epochs=1):
        eval_iter = eval_iter + 1

        img_data = data['image']
        img_metas = data['image_shape']
        gt_bboxes = data['box']
        gt_labels = data['label']
        gt_num = data['valid_num']
        gt_mask = data["mask"]

        start = time.time()
        # run net
        output = net(Tensor(img_data), Tensor(img_metas), Tensor(gt_bboxes),
                     Tensor(gt_labels), Tensor(gt_num), Tensor(gt_mask))
        end = time.time()
        print("Iter {} cost time {}".format(eval_iter, end - start))

        # output: boxes, labels, validity mask and foreground masks per image
        all_bbox = output[0]
        all_label = output[1]
        all_mask = output[2]
        all_mask_fb = output[3]

        for j in range(config.test_batch_size):
            all_bbox_squee = np.squeeze(all_bbox.asnumpy()[j, :, :])
            all_label_squee = np.squeeze(all_label.asnumpy()[j, :, :])
            all_mask_squee = np.squeeze(all_mask.asnumpy()[j, :, :])
            all_mask_fb_squee = np.squeeze(all_mask_fb.asnumpy()[j, :, :, :])

            # Boolean-mask out invalid proposals.
            all_bboxes_tmp_mask = all_bbox_squee[all_mask_squee, :]
            all_labels_tmp_mask = all_label_squee[all_mask_squee]
            all_mask_fb_tmp_mask = all_mask_fb_squee[all_mask_squee, :, :]

            # Truncate to max_num detections, sorted by score (last bbox column).
            if all_bboxes_tmp_mask.shape[0] > max_num:
                inds = np.argsort(-all_bboxes_tmp_mask[:, -1])
                inds = inds[:max_num]
                all_bboxes_tmp_mask = all_bboxes_tmp_mask[inds]
                all_labels_tmp_mask = all_labels_tmp_mask[inds]
                all_mask_fb_tmp_mask = all_mask_fb_tmp_mask[inds]

            bbox_results = bbox2result_1image(all_bboxes_tmp_mask, all_labels_tmp_mask,
                                              config.num_classes)
            segm_results = get_seg_masks(all_mask_fb_tmp_mask, all_bboxes_tmp_mask,
                                         all_labels_tmp_mask, img_metas[j], True,
                                         config.num_classes)
            outputs.append((bbox_results, segm_results))

    eval_types = ["bbox", "segm"]
    result_files = results2json(dataset_coco, outputs, "./results.pkl")
    coco_eval(result_files, eval_types, dataset_coco, single_result=False)
device_num=device_num, rank_id=rank) dataset_size = dataset.get_dataset_size() print("Create dataset done!") net = Faster_Rcnn_Resnet50(config=config) net = net.set_train() load_path = args_opt.pre_trained if load_path != "": param_dict = load_checkpoint(load_path) for item in list(param_dict.keys()): if not item.startswith('backbone'): param_dict.pop(item) load_param_into_net(net, param_dict) loss = LossNet() lr = Tensor(dynamic_lr(config, rank_size=device_num), mstype.float32) opt = SGD(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum, weight_decay=config.weight_decay, loss_scale=config.loss_scale) net_with_loss = WithLossCell(net, loss) if args_opt.run_distribute: net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale,
def main():
    """SSD training entry point: set up the context (optionally distributed),
    build the MindRecord dataset, wrap the SSD network with loss/optimizer and
    run ``model.train``.
    """
    args_opt = get_args()
    rank = 0
    device_num = 1
    if args_opt.run_platform == "CPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.run_platform,
                            device_id=args_opt.device_id)
        if args_opt.distribute:
            device_num = args_opt.device_num
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True,
                                              device_num=device_num)
            init()
            # All-reduce fusion split points differ per backbone.
            if config.model == "ssd_resnet50_fpn":
                context.set_auto_parallel_context(all_reduce_fusion_config=[90, 183, 279])
            else:
                context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 89])
            rank = get_rank()

    mindrecord_file = create_mindrecord(args_opt.dataset, "ssd.mindrecord", True)

    if args_opt.only_create_dataset:
        return

    loss_scale = float(args_opt.loss_scale)
    if args_opt.run_platform == "CPU":
        loss_scale = 1.0

    # When create MindDataset, using the fitst mindrecord file, such as ssd.mindrecord0.
    use_multiprocessing = (args_opt.run_platform != "CPU")
    dataset = create_ssd_dataset(mindrecord_file, repeat_num=1,
                                 batch_size=args_opt.batch_size, device_num=device_num,
                                 rank=rank, use_multiprocessing=use_multiprocessing)

    dataset_size = dataset.get_dataset_size()
    print(f"Create dataset done! dataset size is {dataset_size}")
    ssd = ssd_model_build(args_opt)
    # Cast to fp16 when the config asks for it or on GPU.
    if ("use_float16" in config and config.use_float16) or args_opt.run_platform == "GPU":
        ssd.to_float(dtype.float16)
    net = SSDWithLossCell(ssd, config)

    # checkpoint
    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
    save_ckpt_path = './ckpt_' + str(rank) + '/'
    ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=save_ckpt_path, config=ckpt_config)

    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        if args_opt.filter_weight:
            filter_checkpoint_parameter_by_list(param_dict, config.checkpoint_filter_list)
        load_param_into_net(net, param_dict, True)

    # LR schedule resumes from the pre-trained epoch offset when given.
    lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
                       lr_init=config.lr_init, lr_end=config.lr_end_rate * args_opt.lr,
                       lr_max=args_opt.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=args_opt.epoch_size,
                       steps_per_epoch=dataset_size))

    if "use_global_norm" in config and config.use_global_norm:
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                          config.momentum, config.weight_decay, 1.0)
        net = TrainingWrapper(net, opt, loss_scale, True)
    else:
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                          config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

    callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
    model = Model(net)
    # Sink mode only reports once per epoch and is unsupported on CPU.
    dataset_sink_mode = False
    if args_opt.mode == "sink" and args_opt.run_platform != "CPU":
        print("In sink mode, one epoch return a loss.")
        dataset_sink_mode = True
    print("Start train SSD, the first epoch will be slower because of the graph compilation.")
    model.train(args_opt.epoch_size, dataset, callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
def run_transformer_train():
    """
    Transformer training.

    Parses CLI args, configures the Ascend (optionally distributed) context,
    loads or initialises the network weights, and trains with a dynamic LR
    schedule and optional dynamic loss scaling.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(save_graphs=True, reserve_class_name_in_scope=False,
                        enable_auto_mixed_precision=False)

    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True, parameter_broadcast=True,
                                          device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset, repeat_count = create_transformer_dataset(epoch_count=args.epoch_size,
                                                       rank_size=device_num,
                                                       rank_id=rank_id,
                                                       do_shuffle=args.do_shuffle,
                                                       enable_data_sink=args.enable_data_sink,
                                                       dataset_path=args.data_path)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    # Either restore a checkpoint, or build a fresh init dict: ones for gamma,
    # zeros for beta/bias, normal init for embeddings, generic init otherwise.
    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
    else:
        parameter_dict = {}
        params = netwithloss.trainable_params()
        for param in params:
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    parameter_dict[name] = Parameter(one_weight(value.asnumpy().shape), name=name)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    parameter_dict[name] = Parameter(zero_weight(value.asnumpy().shape), name=name)
                elif "embedding" in name:
                    parameter_dict[name] = Parameter(normal_weight(value.asnumpy().shape,
                                                                   transformer_net_cfg.hidden_size),
                                                     name=name)
                else:
                    parameter_dict[name] = Parameter(weight_variable(value.asnumpy().shape), name=name)
    load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                       keep_checkpoint_max=args.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path,
                                     config=ckpt_config)
        callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(repeat_count, dataset, callbacks=callbacks,
                dataset_sink_mode=(args.enable_data_sink == "true"))
def run_pretrain():
    """Pre-train BERT (Ascend): parse CLI options, configure the (optionally
    distributed) context, pick the optimizer from ``cfg.optimizer`` and run
    ``model.train``.

    Raises:
        ValueError: if ``cfg.optimizer`` names an unsupported optimizer.
    """
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument("--distribute", type=str, default="false",
                        help="Run distribute, default is false.")
    # Fix: integer options previously used string defaults (default="1"); plain
    # int defaults are unambiguous and consistent with the other int options.
    parser.add_argument("--epoch_size", type=int, default=1,
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--checkpoint_path", type=str, default="",
                        help="Checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000,
                        help="Save checkpoint steps, "
                             "default is 1000.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)

    # Distributed setup: rank derived from device_id within the group.
    if args_opt.distribute == "true":
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        D.init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank,
                                               args_opt.do_shuffle, args_opt.enable_data_sink,
                                               args_opt.data_sink_steps, args_opt.data_dir,
                                               args_opt.schema_dir)

    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    # Optimizer is chosen by the config; decay_steps spans the full run.
    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]"
            .format(cfg.optimizer))

    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
pad_num = int(np.ceil(cfg.embed_size / 16) * 16 - cfg.embed_size) if pad_num > 0: embedding_table = np.pad(embedding_table, [(0, 0), (0, pad_num)], 'constant') cfg.embed_size = int(np.ceil(cfg.embed_size / 16) * 16) network = SentimentNet(vocab_size=embedding_table.shape[0], embed_size=cfg.embed_size, num_hiddens=cfg.num_hiddens, num_layers=cfg.num_layers, bidirectional=cfg.bidirectional, num_classes=cfg.num_classes, weight=Tensor(embedding_table), batch_size=cfg.batch_size) # pre_trained if args.pre_trained: load_param_into_net(network, load_checkpoint(args.pre_trained)) ds_train = lstm_create_dataset(args.preprocess_path, cfg.batch_size, 1, device_num=device_num, rank=rank) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') if cfg.dynamic_lr: lr = Tensor( get_lr(global_step=cfg.global_step, lr_init=cfg.lr_init, lr_end=cfg.lr_end, lr_max=cfg.lr_max, warmup_epochs=cfg.warmup_epochs,
def train():
    """training CenterNet

    Uses the module-level ``args_opt`` for all options: sets up the context
    (optionally distributed on Ascend), builds the COCO training dataset,
    wraps the loss cell with the appropriate train-step cell and runs
    ``model.train``.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(save_graphs=False)

    ckpt_save_dir = args_opt.save_checkpoint_path
    rank = 0
    device_num = 1
    num_workers = 8
    if args_opt.device_target == "Ascend":
        context.set_context(enable_auto_mixed_precision=False)
        context.set_context(device_id=args_opt.device_id)
        if args_opt.distribute == "true":
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
            # Per-rank checkpoint sub-directory.
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                              gradients_mean=True,
                                              device_num=device_num)
            _set_parallel_all_reduce_split()
    else:
        # Non-Ascend targets: force single-device, no profiler, no data sink.
        args_opt.distribute = "false"
        args_opt.need_profiler = "false"
        args_opt.enable_data_sink = "false"

    # Start create dataset!
    # mindrecord files will be generated at args_opt.mindrecord_dir such as centernet.mindrecord0, 1, ... file_num.
    logger.info("Begin creating dataset for CenterNet")
    coco = COCOHP(dataset_config, run_mode="train", net_opt=net_config,
                  save_path=args_opt.save_result_dir)
    dataset = coco.create_train_dataset(args_opt.mindrecord_dir, args_opt.mindrecord_prefix,
                                        batch_size=train_config.batch_size, device_num=device_num,
                                        rank=rank, num_parallel_workers=num_workers,
                                        do_shuffle=args_opt.do_shuffle == 'true')
    dataset_size = dataset.get_dataset_size()
    logger.info("Create dataset done!")

    net_with_loss = CenterNetLossCell(net_config)

    args_opt.train_steps = args_opt.epoch_size * dataset_size
    logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(net_with_loss, dataset_size)

    enable_static_time = args_opt.device_target == "CPU"
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(dataset_size, enable_static_time)]
    # Only one device per 8-device group saves checkpoints.
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_centernet',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)
    # Loss scaling is only used on Ascend.
    if args_opt.device_target == "Ascend":
        net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                    sens=train_config.loss_scale_value)
    else:
        net_with_grads = CenterNetWithoutLossScaleCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(args_opt.epoch_size, dataset, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
def test(cloud_args=None):
    """
    network eval function. Get top1 and top5 ACC from classification.
    The result will be save at [./outputs] by default.

    Args:
        cloud_args: optional overrides forwarded to ``parse_args``.
    """
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        # Evaluate every checkpoint in the directory, newest epoch first
        # (sort key extracts the epoch number from names like 'prefix-12_345.ckpt').
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
        f = lambda x: -1 * int(
            os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')
            [0])
        args.models = sorted(models, key=f)
    else:
        args.models = [
            args.pretrained,
        ]

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator()
        network = DenseNet121(args.num_classes)

        # Strip optimizer state ('moments.') and the 'network.' wrapper prefix.
        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values

        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))
        network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        network.set_train(False)
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()
            gt_classes = gt_classes.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

        results = [[top1_correct],
                   [top5_correct],
                   [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            # File-based cross-rank reduction: each rank dumps its counters to
            # .npy files, then waits until every rank's files exist.
            model_md5 = model.replace('/', '')
            tmp_dir = '../cache'
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            np.save(top1_correct_npy, top1_correct)
            np.save(top5_correct_npy, top5_correct)
            np.save(img_tot_npy, img_tot)
            # NOTE(review): this is a busy-wait with no sleep; consider adding a
            # short delay per iteration to avoid pegging a CPU core.
            while True:
                rank_ok = True
                for other_rank in range(args.group_size):
                    top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) \
                            or not os.path.exists(img_tot_npy):
                        rank_ok = False
                if rank_ok:
                    break

            # Sum counters over all ranks.
            top1_correct_all = 0
            top5_correct_all = 0
            img_tot_all = 0
            for other_rank in range(args.group_size):
                top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                top1_correct_all += np.load(top1_correct_npy)
                top5_correct_all += np.load(top5_correct_npy)
                img_tot_all += np.load(img_tot_npy)
            results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
            results = np.array(results)
        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info(
            'after allreduce eval: top1_correct={}, tot={}, acc={:.2f}%'.
            format(top1_correct, img_tot, acc1))
        args.logger.info(
            'after allreduce eval: top5_correct={}, tot={}, acc={:.2f}%'.
            format(top5_correct, img_tot, acc5))
    if args.is_distributed:
        release()
def test():
    """Evaluate a trained YOLOv3-DarkNet53 checkpoint on the eval dataset.

    Builds the network (optionally quantization-aware), loads the pretrained
    checkpoint, runs inference over the whole dataset, then computes and logs
    the COCO mAP via the DetectionEngine.

    Raises:
        FileNotFoundError: if ``args.pretrained`` is not an existing file.
    """
    start_time = time.time()
    args = parse_args()

    # logger: one timestamped output directory per run
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    # BUGFIX: RANK_ID may be absent in standalone runs; default to rank 0
    # instead of crashing on int(None).
    rank_id = int(os.environ.get('RANK_ID', '0'))
    args.logger = get_logger(args.outputs_dir, rank_id)

    context.reset_auto_parallel_context()
    parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True,
                                      device_num=1)

    args.logger.info('Creating Network....')
    network = YOLOV3DarkNet53(is_training=False)

    config = ConfigYOLOV3DarkNet53()
    if args.testing_shape:
        config.test_img_shape = conver_testing_shape(args)

    # convert fusion network to quantization aware network
    if config.quantization_aware:
        quantizer = QuantizationAwareTraining(bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, False])
        network = quantizer.quantize(network)

    args.logger.info(args.pretrained)
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                # optimizer state is not needed for inference
                continue
            elif key.startswith('yolo_network.'):
                # strip the training-wrapper prefix so names match the bare net
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.pretrained))
    else:
        args.logger.info('{} not exists or not a pre-trained file'.format(
            args.pretrained))
        # BUGFIX: the original `assert FileNotFoundError(...)` asserted a
        # truthy exception instance and could never fail; raise it instead
        # (which also makes the old `exit(1)` unreachable/unnecessary).
        raise FileNotFoundError(
            '{} not exists or not a pre-trained file'.format(args.pretrained))

    data_root = args.data_root
    ann_file = args.annFile

    ds, data_size = create_yolo_dataset(data_root,
                                        ann_file,
                                        is_training=False,
                                        batch_size=args.per_batch_size,
                                        max_epoch=1,
                                        device_num=1,
                                        rank=rank_id,
                                        shuffle=False,
                                        config=config)

    args.logger.info('testing shape : {}'.format(config.test_img_shape))
    args.logger.info('totol {} images to eval'.format(data_size))

    network.set_train(False)

    # init detection engine
    detection = DetectionEngine(args)

    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
    args.logger.info('Start inference....')
    for i, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
        image = data["image"]
        image_shape = data["image_shape"]
        image_id = data["img_id"]

        prediction = network(image, input_shape)
        output_big, output_me, output_small = prediction
        output_big = output_big.asnumpy()
        output_me = output_me.asnumpy()
        output_small = output_small.asnumpy()
        image_id = image_id.asnumpy()
        image_shape = image_shape.asnumpy()

        # detection engine expects small/medium/big ordering
        detection.detect([output_small, output_me, output_big],
                         args.per_batch_size, image_shape, image_id)
        if i % 1000 == 0:
            args.logger.info('Processing... {:.2f}% '.format(
                i * args.per_batch_size / data_size * 100))

    args.logger.info('Calculating mAP...')
    detection.do_nms_for_results()
    result_file_path = detection.write_result()
    args.logger.info('result file path: {}'.format(result_file_path))
    eval_result = detection.get_eval_result()

    cost_time = time.time() - start_time
    args.logger.info('\n=============coco eval reulst=========\n' + eval_result)
    args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
def do_eval(dataset=None, network=None, metric=None, load_checkpoint_path="", eval_type=None, num_choice=None):
    """
    Do evaluation for the CBT (Children's Book Test) task.

    Args:
        dataset: the eval dataset.
        network: the network with loss (a GPT-2 CBT model class; it is
            instantiated here with ``gpt2_net_cfg``).
        metric: the evaluation method; only "accuracy" is supported.
        load_checkpoint_path: the file path which saved finetuned model checkpoint.
        eval_type: "zero-shot" (remap pretrained parameter names into the CBT
            wrapper) or "finetuned" (load the checkpoint as-is).
        num_choice: number of candidate choices per question; per-batch choice
            probabilities are regrouped into rows of this width before scoring.

    Raises:
        ValueError: if no checkpoint path is given, if ``eval_type`` is not
            one of the two supported values, or if ``metric`` is unsupported.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "accuracy":
        print("Prepare to calculate the accuracy score ...")
        gpt2_cbt = network(config=gpt2_net_cfg,
                           is_training=False,
                           use_one_hot_embeddings=False)
        gpt2_cbt.set_train(False)  # inference mode
        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            # Pretrained checkpoint keys lack the CBT wrapper prefix, so remap
            # them under 'gpt2.gpt2.'; the LM head reuses the embedding table
            # (weight tying), as the explicit assignment below shows.
            final_param_dict = {}
            for name, _ in param_dict.items():
                final_param_dict['gpt2.gpt2.' + name] = param_dict[name]
            final_param_dict['gpt2.lm_head.weight'] = param_dict[
                'gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_cbt, final_param_dict)
            print("load pretrained parameter successfully!\n")
        elif eval_type == "finetuned":
            load_param_into_net(gpt2_cbt, param_dict)
            print("load finetuned parameter successfully!\n")
        else:
            raise ValueError(
                "Evaluation type missed, eval_type should be [zero-shot, finetuned]"
            )

        model = Model(gpt2_cbt)
        callback = Accuracy()
        columns_list = ["input_ids", "input_mask", "input_length", "mc_labels"]
        print("==================== [ACC] Testing ====================")
        num_data = 1
        all_choice_prob = []  # per-batch choice probabilities for the current group
        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, input_length, mc_labels = input_data
            print("| [ACC] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))
            logits = model.predict(input_ids, input_mask)
            # choice_prob_list: one probability per sample in the batch
            choice_prob_list = calculate_choice_prob_for_cbt(
                logits=logits,
                batch_size=gpt2_net_cfg.batch_size,
                input_length=input_length,
                input_ids=input_ids)
            all_choice_prob.append(choice_prob_list)
            # Score once a whole group of num_choice candidates has been
            # accumulated. NOTE(review): assumes num_choice is a multiple of
            # the batch size and that all candidates of one question share the
            # same label — only the label from the current (last) batch is
            # used for the group; confirm against the dataset layout.
            if (num_data * gpt2_net_cfg.batch_size) % num_choice == 0:
                all_choice_prob_np = np.array(all_choice_prob)
                all_choice_prob_np = all_choice_prob_np.reshape(
                    (-1, num_choice))
                print("| all_choice_prob_np: ", all_choice_prob_np)
                print("| all_choice_prob_np shape: ", all_choice_prob_np.shape)
                mc_labels = np.array([mc_labels.asnumpy()[0]])
                callback.update(all_choice_prob_np, mc_labels)
                all_choice_prob = []  # start the next group
            num_data += 1

        print("\n\n")
        print("**************************************************************")
        print("acc_num {} , total_num {}, accuracy {:.6f}".format(
            callback.acc_num, callback.total_num,
            callback.acc_num / callback.total_num))
        print("********************** Testing Finished **********************")
    else:
        raise ValueError("metric method not supported, support: [Accuracy]")
def train(cloud_args=None):
    """Train DenseNet-121 for image classification.

    Sets up the (optionally distributed) execution context, builds the
    dataset, network, loss, LR schedule and optimizer, then runs
    ``Model.train`` with progress and checkpoint callbacks.

    Args:
        cloud_args: optional cloud-specific overrides forwarded to parse_args.
    """
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)
    if args.device_target == 'Ascend':
        # BUGFIX: guard the env lookup (as the sibling train() does) instead
        # of crashing on int(None) when DEVICE_ID is unset.
        if os.getenv('DEVICE_ID', "not_set").isdigit():
            context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        # BUGFIX: make the standalone defaults explicit — args.rank and
        # args.group_size are read below (logger, checkpoint flag,
        # set_auto_parallel_context) in both code paths.
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # for dynamic loss scale can not set loss scale in momentum opt

    # select for master rank save ckpt or all rank save, compatible for model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger: one timestamped output directory per run
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet121(args.num_classes)
    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                # optimizer state is not part of the network weights
                continue
            elif key.startswith('network.'):
                # strip the training-wrapper prefix
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr_scheduler = MultiStepLR(args.lr, args.lr_epochs, args.lr_gamma,
                                   args.steps_per_epoch, args.max_epoch,
                                   warmup_epochs=args.warmup_epochs)
    elif args.lr_scheduler == 'cosine_annealing':
        lr_scheduler = CosineAnnealingLR(args.lr, args.T_max,
                                         args.steps_per_epoch, args.max_epoch,
                                         warmup_epochs=args.warmup_epochs,
                                         eta_min=args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # mixed precision training: keep the loss computation in fp32
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=args.group_size,
                                      gradients_mean=True)
    if args.device_target == 'Ascend':
        model = Model(train_net,
                      optimizer=opt,
                      metrics=None,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net,
                      optimizer=opt,
                      metrics=None,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)
def main():
    """Train SSD (optionally distributed, optionally with online eval).

    Builds the MindRecord dataset, the SSD network and optimizer, wires up
    checkpoint/eval callbacks, and launches ``Model.train``.
    """
    args_opt = get_args()
    rank = 0
    device_num = 1
    if args_opt.run_platform == "CPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=args_opt.run_platform,
                            device_id=args_opt.device_id)
        if args_opt.distribute:
            device_num = args_opt.device_num
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True,
                device_num=device_num)
            init()
            # Per-backbone all-reduce fusion split points.
            if config.model == "ssd_resnet50_fpn":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[90, 183, 279])
            # BUGFIX: this was a separate `if`, so the trailing `else`
            # overwrote the ssd_resnet50_fpn config above with the default
            # [29, 58, 89]; an elif-chain picks exactly one config.
            elif config.model == "ssd_vgg16":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[20, 41, 62])
            else:
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[29, 58, 89])
            rank = get_rank()

    mindrecord_file = create_mindrecord(args_opt.dataset, "ssd.mindrecord",
                                        True)

    if args_opt.only_create_dataset:
        return

    loss_scale = float(args_opt.loss_scale)
    if args_opt.run_platform == "CPU":
        loss_scale = 1.0

    # When create MindDataset, using the fitst mindrecord file, such as ssd.mindrecord0.
    use_multiprocessing = (args_opt.run_platform != "CPU")
    dataset = create_ssd_dataset(mindrecord_file,
                                 repeat_num=1,
                                 batch_size=args_opt.batch_size,
                                 device_num=device_num,
                                 rank=rank,
                                 use_multiprocessing=use_multiprocessing)

    dataset_size = dataset.get_dataset_size()
    print(f"Create dataset done! dataset size is {dataset_size}")
    ssd = ssd_model_build(args_opt)
    if ("use_float16" in config and config.use_float16) or args_opt.run_platform == "GPU":
        ssd.to_float(dtype.float16)
    net = SSDWithLossCell(ssd, config)

    # checkpoint
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
    save_ckpt_path = './ckpt_' + str(rank) + '/'
    ckpoint_cb = ModelCheckpoint(prefix="ssd",
                                 directory=save_ckpt_path,
                                 config=ckpt_config)

    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        if args_opt.filter_weight:
            # drop weights listed in the filter (e.g. class-specific heads)
            filter_checkpoint_parameter_by_list(param_dict,
                                                config.checkpoint_filter_list)
        load_param_into_net(net, param_dict, True)

    lr = Tensor(
        get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
               lr_init=config.lr_init,
               lr_end=config.lr_end_rate * args_opt.lr,
               lr_max=args_opt.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=args_opt.epoch_size,
               steps_per_epoch=dataset_size))

    if "use_global_norm" in config and config.use_global_norm:
        # loss_scale folded into gradient clipping; optimizer scale stays 1.0
        opt = nn.Momentum(
            filter(lambda x: x.requires_grad, net.get_parameters()), lr,
            config.momentum, config.weight_decay, 1.0)
        net = TrainingWrapper(net, opt, loss_scale, True)
    else:
        opt = nn.Momentum(
            filter(lambda x: x.requires_grad, net.get_parameters()), lr,
            config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

    callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
    if args_opt.run_eval:
        eval_net = SsdInferWithDecoder(ssd, Tensor(default_boxes), config)
        eval_net.set_train(False)
        mindrecord_file = create_mindrecord(args_opt.dataset,
                                            "ssd_eval.mindrecord", False)
        eval_dataset = create_ssd_dataset(mindrecord_file,
                                          batch_size=args_opt.batch_size,
                                          repeat_num=1,
                                          is_training=False,
                                          use_multiprocessing=False)
        if args_opt.dataset == "coco":
            anno_json = os.path.join(
                config.coco_root,
                config.instances_set.format(config.val_data_type))
        elif args_opt.dataset == "voc":
            anno_json = os.path.join(config.voc_root, config.voc_json)
        else:
            raise ValueError(
                'SSD eval only support dataset mode is coco and voc!')
        eval_param_dict = {
            "net": eval_net,
            "dataset": eval_dataset,
            "anno_json": anno_json
        }
        eval_cb = EvalCallBack(apply_eval,
                               eval_param_dict,
                               interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch,
                               save_best_ckpt=True,
                               ckpt_directory=save_ckpt_path,
                               besk_ckpt_name="best_map.ckpt",
                               metrics_name="mAP")
        callback.append(eval_cb)
    model = Model(net)
    dataset_sink_mode = False
    if args_opt.mode == "sink" and args_opt.run_platform != "CPU":
        print("In sink mode, one epoch return a loss.")
        dataset_sink_mode = True
    print(
        "Start train SSD, the first epoch will be slower because of the graph compilation."
    )
    model.train(args_opt.epoch_size,
                dataset,
                callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
def test(cloud_args=None):
    """Evaluate trained VGG16 checkpoint(s).

    For cifar10, uses the standard ``Model.eval`` accuracy pipeline.
    Otherwise, every checkpoint found under ``args.pre_trained`` (or the
    single file given) is evaluated manually and top-1 / top-5 accuracy
    is logged.

    Args:
        cloud_args: optional cloud-specific overrides forwarded to parse_args.
    """
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)
    # Bind to the card given by DEVICE_ID when running on Ascend.
    if os.getenv('DEVICE_ID', "not_set").isdigit() and args.device_target == "Ascend":
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # one timestamped output directory per run
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    # NOTE(review): assumes parse_args populates args.rank and (below)
    # args.group_size — confirm against the argument parser.
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.dataset == "cifar10":
        net = vgg16(num_classes=args.num_classes, args=args)
        # An optimizer is constructed only because Model requires one here;
        # it is never stepped during eval.
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01,
                       args.momentum,
                       weight_decay=args.weight_decay)
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
        param_dict = load_checkpoint(args.pre_trained)
        load_param_into_net(net, param_dict)
        net.set_train(False)  # inference mode
        dataset = vgg_create_dataset(args.data_path, args.image_size,
                                     args.per_batch_size, training=False)
        res = model.eval(dataset)
        print("result: ", res)
    else:
        # network
        args.logger.important_info('start create network')
        if os.path.isdir(args.pre_trained):
            # Evaluate every checkpoint in the directory, sorted by the
            # (negated) step number encoded in the file name.
            models = list(glob.glob(os.path.join(args.pre_trained, '*.ckpt')))
            print(models)
            if args.graph_ckpt:
                # assumes graph-mode names like 'prefix-epoch_step.ckpt'
                # — TODO confirm the checkpoint naming convention
                f = lambda x: -1 * int(
                    os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].
                    split('_')[0])
            else:
                f = lambda x: -1 * int(
                    os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
            args.models = sorted(models, key=f)
        else:
            args.models = [
                args.pre_trained,
            ]

        for model in args.models:
            dataset = classification_dataset(args.data_path, args.image_size,
                                             args.per_batch_size, mode='eval')
            eval_dataloader = dataset.create_tuple_iterator(output_numpy=True,
                                                            num_epochs=1)
            network = vgg16(args.num_classes, args, phase="test")

            # pre_trained
            load_param_into_net(network, load_checkpoint(model))
            network.add_flags_recursive(fp16=True)

            img_tot = 0       # images processed so far
            top1_correct = 0  # running top-1 hit count
            top5_correct = 0  # running top-5 hit count

            network.set_train(False)
            t_end = time.time()
            it = 0
            for data, gt_classes in eval_dataloader:
                output = network(Tensor(data, mstype.float32))
                output = output.asnumpy()

                top1_output = np.argmax(output, (-1))
                # indices of the 5 highest-scoring classes per sample
                top5_output = np.argsort(output)[:, -5:]

                t1_correct = np.equal(top1_output, gt_classes).sum()
                top1_correct += t1_correct
                top5_correct += get_top5_acc(top5_output, gt_classes)
                img_tot += args.per_batch_size

                # The timer is restarted after the first batch — presumably
                # to exclude first-batch (graph compilation) overhead from
                # the FPS figure; confirm intent.
                if args.rank == 0 and it == 0:
                    t_end = time.time()
                    it = 1
            if args.rank == 0:
                time_used = time.time() - t_end
                # the first batch is excluded from both time and image count
                fps = (img_tot - args.per_batch_size) * args.group_size / time_used
                args.logger.info(
                    'Inference Performance: {:.2f} img/sec'.format(fps))
            results = [[top1_correct], [top5_correct], [img_tot]]
            args.logger.info('before results={}'.format(results))
            results = np.array(results)
            args.logger.info('after results={}'.format(results))
            top1_correct = results[0, 0]
            top5_correct = results[1, 0]
            img_tot = results[2, 0]
            acc1 = 100.0 * top1_correct / img_tot
            acc5 = 100.0 * top5_correct / img_tot
            args.logger.info('after allreduce eval: top1_correct={}, tot={},'
                             'acc={:.2f}%(TOP1)'.format(
                                 top1_correct, img_tot, acc1))
            args.logger.info('after allreduce eval: top5_correct={}, tot={},'
                             'acc={:.2f}%(TOP5)'.format(
                                 top5_correct, img_tot, acc5))
def run_transformer_train():
    """Train the Transformer model end to end.

    Parses command-line options, configures the (optionally distributed)
    Ascend execution context, builds the dataset, network, LR schedule and
    optimizer, then launches ``Model.train`` with checkpointing and
    loss-scale handling as requested.
    """
    opts, _ = argparse_init().parse_known_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=opts.device_id)
    context.set_context(reserve_class_name_in_scope=False,
                        enable_auto_mixed_precision=False)

    # Distributed setup: derive this process's rank and the group size.
    if opts.distribute == "true":
        group_size = opts.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            parameter_broadcast=True,
            device_num=group_size)
        D.init()
        rank = opts.device_id % group_size
    else:
        group_size = 1
        rank = 0

    dataset = create_transformer_dataset(epoch_count=1,
                                         rank_size=group_size,
                                         rank_id=rank,
                                         do_shuffle=opts.do_shuffle,
                                         enable_data_sink=opts.enable_data_sink,
                                         dataset_path=opts.data_path)
    steps_per_epoch = dataset.get_dataset_size()

    net_with_loss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if opts.checkpoint_path:
        # Warm-start from a previous checkpoint when one is supplied.
        load_param_into_net(net_with_loss, load_checkpoint(opts.checkpoint_path))

    # Dynamic learning-rate schedule (linear warmup + rsqrt decay).
    lr_tensor = Tensor(
        create_dynamic_lr(
            schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
            training_steps=steps_per_epoch * opts.epoch_size,
            learning_rate=cfg.lr_schedule.learning_rate,
            warmup_steps=cfg.lr_schedule.warmup_steps,
            hidden_size=transformer_net_cfg.hidden_size,
            start_decay_step=cfg.lr_schedule.start_decay_step,
            min_lr=cfg.lr_schedule.min_lr),
        mstype.float32)
    optimizer = Adam(net_with_loss.trainable_params(), lr_tensor)

    callbacks = [TimeMonitor(steps_per_epoch), LossCallBack()]
    if opts.enable_save_ckpt == "true":
        # Only rank 0 (or a single-device run) writes checkpoints.
        if group_size == 1 or (group_size > 1 and rank == 0):
            save_cfg = CheckpointConfig(
                save_checkpoint_steps=opts.save_checkpoint_steps,
                keep_checkpoint_max=opts.save_checkpoint_num)
            callbacks.append(
                ModelCheckpoint(prefix='transformer',
                                directory=opts.save_checkpoint_path,
                                config=save_cfg))

    if opts.enable_lossscale == "true":
        manager = DynamicLossScaleManager(
            init_loss_scale=cfg.init_loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        train_cell = TransformerTrainOneStepWithLossScaleCell(
            net_with_loss,
            optimizer=optimizer,
            scale_update_cell=manager.get_update_cell())
    else:
        train_cell = TransformerTrainOneStepCell(net_with_loss,
                                                 optimizer=optimizer)

    train_cell.set_train(True)
    model = Model(train_cell)

    sink = (opts.enable_data_sink == "true")
    if sink:
        # In sink mode one "epoch" is sink_size steps, so rescale the count.
        sink_size = opts.save_checkpoint_steps
        model.train(opts.epoch_size * steps_per_epoch // sink_size,
                    dataset,
                    callbacks=callbacks,
                    dataset_sink_mode=sink,
                    sink_size=sink_size)
    else:
        model.train(opts.epoch_size,
                    dataset,
                    callbacks=callbacks,
                    dataset_sink_mode=sink)
                        help='checkpoint of inceptionV4')
    args_opt = parser.parse_args()
    return args_opt


if __name__ == '__main__':
    # Evaluation entry: restore an InceptionV4 checkpoint and report
    # loss / top-1 / top-5 accuracy on the eval dataset.
    args = parse_args()
    if args.platform == 'Ascend':
        # NOTE(review): DEVICE_ID must be set when running on Ascend;
        # int(None) raises TypeError otherwise.
        device_id = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=device_id)
    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)

    net = Inceptionv4(classes=config.num_classes)
    ckpt = load_checkpoint(args.checkpoint_path)
    load_param_into_net(net, ckpt)
    net.set_train(False)  # inference mode

    dataset = create_dataset(dataset_path=args.dataset_path,
                             do_train=False,
                             repeat_num=1,
                             batch_size=config.batch_size)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    eval_metrics = {
        'Loss': nn.Loss(),
        'Top1-Acc': nn.Top1CategoricalAccuracy(),
        'Top5-Acc': nn.Top5CategoricalAccuracy()
    }
    # optimizer=None: no training step is performed during eval
    model = Model(net, loss, optimizer=None, metrics=eval_metrics)

    print('=' * 20, 'Evalute start', '=' * 20)
    metrics = model.eval(dataset)
    print("metric: ", metrics)
def test_train():
    '''
    Finetune BERT on the task selected by cfg.task (NER / SQUAD /
    classification): builds the task-specific loss network, dataset and
    optimizer, loads the pre-training checkpoint, and trains with dynamic
    loss scaling, saving one checkpoint per epoch.

    Raises:
        ValueError: if cfg.optimizer names an unsupported optimizer.
    '''
    # BUGFIX: DEVICE_ID may be absent in single-card debugging; default to
    # card 0 instead of crashing on int(None).
    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=devid)
    # BertCLSTrain for classification
    # BertNERTrain for sequence labeling
    if cfg.task == 'NER':
        if cfg.use_crf:
            # CRF head: label count comes from the tag vocabulary
            netwithloss = BertNER(bert_net_cfg, True,
                                  num_labels=len(tag_to_index),
                                  use_crf=True,
                                  tag_to_index=tag_to_index,
                                  dropout_prob=0.1)
        else:
            netwithloss = BertNER(bert_net_cfg, True,
                                  num_labels=cfg.num_labels,
                                  dropout_prob=0.1)
    elif cfg.task == 'SQUAD':
        netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)
    else:
        netwithloss = BertCLS(bert_net_cfg, True,
                              num_labels=cfg.num_labels,
                              dropout_prob=0.1)

    if cfg.task == 'SQUAD':
        dataset = get_squad_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    else:
        dataset = get_dataset(bert_net_cfg.batch_size, cfg.epoch_num)

    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    if cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=steps_per_epoch * cfg.epoch_num,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            # warm up over the first 10% of all training steps
            warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps)
    elif cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=steps_per_epoch * cfg.epoch_num,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         weight_decay=cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
                         decay_filter=cfg.Lamb.decay_filter)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    else:
        # BUGFIX: raise the conventional ValueError instead of a bare
        # Exception (ValueError is a subclass, so callers catching
        # Exception still work).
        raise ValueError("Optimizer not supported.")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix,
                                 directory=cfg.ckpt_dir,
                                 config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    load_param_into_net(netwithloss, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    if cfg.task == 'SQUAD':
        netwithgrads = BertSquadCell(netwithloss,
                                     optimizer=optimizer,
                                     scale_update_cell=update_cell)
    else:
        netwithgrads = BertFinetuneCell(netwithloss,
                                        optimizer=optimizer,
                                        scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset,
                callbacks=[LossCallBack(), ckpoint_cb])
def test_salt_and_pepper_attack_on_mnist():
    """
    Salt-and-Pepper-Attack test: measure a LeNet5 MNIST model's accuracy
    before and after the attack, and log attack-quality statistics.
    """
    # upload trained network
    ckpt_name = './trained_ckpt_file/checkpoint_lenet-10_1875.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_name)
    load_param_into_net(net, load_dict)

    # get test data
    data_list = "./MNIST_unzip/test"
    batch_size = 32
    ds = generate_mnist_dataset(data_list, batch_size=batch_size)

    # prediction accuracy before attack
    model = ModelToBeAttacked(net)
    batch_num = 3  # the number of batches of attacking samples
    test_images = []
    test_labels = []
    predict_labels = []
    i = 0
    for data in ds.create_tuple_iterator():
        i += 1
        images = data[0].astype(np.float32)
        labels = data[1]
        test_images.append(images)
        test_labels.append(labels)
        pred_labels = np.argmax(model.predict(images), axis=1)
        predict_labels.append(pred_labels)
        if i >= batch_num:
            break
    LOGGER.debug(
        TAG,
        'model input image shape is: {}'.format(np.array(test_images).shape))
    predict_labels = np.concatenate(predict_labels)
    true_labels = np.concatenate(test_labels)
    accuracy = np.mean(np.equal(predict_labels, true_labels))
    LOGGER.info(TAG, "prediction accuracy before attacking is : %g", accuracy)

    # attacking
    is_target = False
    attack = SaltAndPepperNoiseAttack(model=model,
                                      is_targeted=is_target,
                                      sparse=True)
    if is_target:
        # draw random target classes, avoiding each sample's true class
        targeted_labels = np.random.randint(0, 10, size=len(true_labels))
        for i in range(len(true_labels)):
            if targeted_labels[i] == true_labels[i]:
                targeted_labels[i] = (targeted_labels[i] + 1) % 10
    else:
        targeted_labels = true_labels
    LOGGER.debug(
        TAG,
        'input shape is: {}'.format(np.concatenate(test_images).shape))
    success_list, adv_data, query_list = attack.generate(
        np.concatenate(test_images), targeted_labels)
    # boolean mask -> indices of successfully attacked samples
    success_list = np.arange(success_list.shape[0])[success_list]
    LOGGER.info(TAG, 'success_list: %s', success_list)
    LOGGER.info(TAG, 'average of query times is : %s', np.mean(query_list))
    adv_preds = []
    for ite_data in adv_data:
        pred_logits_adv = model.predict(ite_data)
        # rescale predict confidences into (0, 1).
        pred_logits_adv = softmax(pred_logits_adv, axis=1)
        adv_preds.extend(pred_logits_adv)
    # BUGFIX: compare predicted class indices (argmax) against the
    # ground-truth labels; the original compared the maximum confidence
    # VALUE (np.max) against integer labels, which is meaningless.
    accuracy_adv = np.mean(np.equal(np.argmax(adv_preds, axis=1), true_labels))
    LOGGER.info(TAG, "prediction accuracy after attacking is : %g",
                accuracy_adv)
    test_labels_onehot = np.eye(10)[true_labels]
    attack_evaluate = AttackEvaluate(np.concatenate(test_images),
                                     test_labels_onehot,
                                     adv_data,
                                     adv_preds,
                                     targeted=is_target,
                                     target_label=targeted_labels)
    LOGGER.info(TAG, 'mis-classification rate of adversaries is : %s',
                attack_evaluate.mis_classification_rate())
    LOGGER.info(TAG, 'The average confidence of adversarial class is : %s',
                attack_evaluate.avg_conf_adv_class())
    LOGGER.info(TAG, 'The average confidence of true class is : %s',
                attack_evaluate.avg_conf_true_class())
    LOGGER.info(
        TAG, 'The average distance (l0, l2, linf) between original '
        'samples and adversarial samples are: %s',
        attack_evaluate.avg_lp_distance())
audio_conf=config.DataConfig.SpectConfig, bidirectional=True, device_target=args.device_target) loss_net = NetWithLossClass(deepspeech_net) weights = ParameterTuple(deepspeech_net.trainable_params()) optimizer = Adam(weights, learning_rate=config.OptimConfig.learning_rate, eps=config.OptimConfig.epsilon, loss_scale=config.OptimConfig.loss_scale) train_net = TrainOneStepCell(loss_net, optimizer) train_net.set_train(True) if args.pre_trained_model_path != '': param_dict = load_checkpoint(args.pre_trained_model_path) load_param_into_net(train_net, param_dict) print('Successfully loading the pre-trained model') model = Model(train_net) callback_list = [TimeMonitor(steps_size), LossMonitor()] if args.is_distributed: config.CheckpointConfig.ckpt_file_name_prefix = config.CheckpointConfig.ckpt_file_name_prefix + str( get_rank()) config.CheckpointConfig.ckpt_path = os.path.join( config.CheckpointConfig.ckpt_path, 'ckpt_' + str(get_rank()) + '/') config_ck = CheckpointConfig( save_checkpoint_steps=1, keep_checkpoint_max=config.CheckpointConfig.keep_checkpoint_max) ckpt_cb = ModelCheckpoint( prefix=config.CheckpointConfig.ckpt_file_name_prefix,
def _build_training_pipeline(pre_dataset):
    """
    Build and run the FastText training pipeline.

    Args:
        pre_dataset: preprocessed dataset to train on.

    Raises:
        ValueError: if no dataset is provided or RANK_SIZE is set to a
            non-integer value.
    """
    net_with_loss = FastTextNetWithLoss(config.vocab_size,
                                        config.embedding_dims,
                                        config.num_class)
    net_with_loss.init_parameters_data()
    if config.pretrain_ckpt_dir:
        parameter_dict = load_checkpoint(config.pretrain_ckpt_dir)
        load_param_into_net(net_with_loss, parameter_dict)

    if pre_dataset is None:
        raise ValueError("pre-process dataset must be provided")

    # get learning rate
    update_steps = config.epoch * pre_dataset.get_dataset_size()
    decay_steps = pre_dataset.get_dataset_size()
    rank_size = os.getenv("RANK_SIZE")
    # BUGFIX: os.getenv always returns str or None, so the original
    # `isinstance(rank_size, int)` check could never trigger; validate the
    # string content instead so int(rank_size) below cannot blow up.
    if rank_size is not None and not rank_size.isdigit():
        raise ValueError("RANK_SIZE must be integer")
    if rank_size is not None and int(rank_size) > 1:
        base_lr = config.lr
    else:
        # single-device runs use a 10x smaller base learning rate
        base_lr = config.lr / 10
    print("+++++++++++Total update steps ", update_steps)
    lr = Tensor(polynomial_decay_scheduler(
        lr=base_lr,
        min_lr=config.min_lr,
        decay_steps=decay_steps,
        total_update_num=update_steps,
        warmup_steps=config.warmup_steps,
        power=config.poly_lr_scheduler_power),
                dtype=mstype.float32)
    optimizer = Adam(net_with_loss.trainable_params(), lr,
                     beta1=0.9, beta2=0.999)

    net_with_grads = FastTextTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    # NOTE(review): `rank_id` is not defined in this function — presumably a
    # module-level variable set by the caller; confirm, otherwise this is a
    # NameError at runtime.
    loss_monitor = LossCallBack(rank_ids=rank_id)
    dataset_size = pre_dataset.get_dataset_size()
    time_monitor = TimeMonitor(data_size=dataset_size)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=decay_steps * config.epoch,
        keep_checkpoint_max=config.keep_ckpt_max)
    callbacks = [time_monitor, loss_monitor]
    if rank_size is None or int(rank_size) == 1:
        # standalone run: always checkpoint
        ckpt_callback = ModelCheckpoint(
            prefix='fasttext',
            directory=os.path.join(
                './', 'ckpt_{}'.format(os.getenv("DEVICE_ID"))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
    if rank_size is not None and int(
            rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        # distributed run: only the first rank of each 8-card server saves
        ckpt_callback = ModelCheckpoint(
            prefix='fasttext',
            directory=os.path.join(
                './', 'ckpt_{}'.format(os.getenv("DEVICE_ID"))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
    print("Prepare to Training....")
    epoch_size = pre_dataset.get_repeat_count()
    print("Epoch size ", epoch_size)
    if os.getenv("RANK_SIZE") is not None and int(os.getenv("RANK_SIZE")) > 1:
        print(f" | Rank {MultiAscend.get_rank()} Call model train.")
    model.train(epoch=config.epoch,
                train_dataset=pre_dataset,
                callbacks=callbacks,
                dataset_sink_mode=False)
def run_pretrain():
    """pre-train bert_clue

    Parses CLI arguments, configures the (optionally distributed) execution
    context, builds the dataset/optimizer/loss-scaling stack, and runs
    Model.train for BERT pre-training.
    """
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false",
                        help="Run distribute, default is false.")
    # NOTE: argparse parses *string* defaults through `type`, so the string
    # default "1" still yields an int at runtime.
    parser.add_argument("--epoch_size", type=int, default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1,
                        help="Training Steps, default is -1, "
                        "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        # Distributed setup: HCCL on Ascend, NCCL on GPU.
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
        # Each rank writes checkpoints into its own subdirectory.
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
            rank) + '/'
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        # Gradient all-reduce fusion split points, tuned per BERT depth and
        # position-embedding variant (hand-tuned constants).
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1
    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        # GPU kernels here only support fp32; silently fall back.
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32
    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)
    if args_opt.train_steps > 0:
        # Cap the repeat count so at most train_steps steps are executed.
        new_repeat_count = min(
            new_repeat_count,
            args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)
    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]"
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)
    if args_opt.load_checkpoint_path:
        # Resume / warm-start from an existing checkpoint.
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)
    if args_opt.enable_lossscale == "true":
        # Dynamic loss scaling for mixed-precision stability.
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
    model = Model(netwithgrads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
network = quant.convert_quant_network(network, bn_fold=True, per_channel=[True, False], symmetric=[True, False]) # define network loss loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, config=config_device_target, device_target=args_opt.device_target, batch_size=config_device_target.batch_size) step_size = dataset.get_dataset_size() # load checkpoint if args_opt.checkpoint_path: param_dict = load_checkpoint(args_opt.checkpoint_path) not_load_param = load_param_into_net(network, param_dict) if not_load_param: raise ValueError("Load param into net fail!") network.set_train(False) # define model model = Model(network, loss_fn=loss, metrics={'acc'}) print("============== Starting Validation ==============") res = model.eval(dataset) print("result:", res, "ckpt=", args_opt.checkpoint_path) print("============== End Validation ==============")
def val(args):
    """Evaluate a trained YOLOv3 face-detection checkpoint.

    Runs inference over a MindRecord dataset, writes VOC-style detection
    results, computes per-class AP at IoU 0.5, and saves a PR-curve image.

    Args:
        args: namespace providing at least `mindrecord_path`, `pretrained`
            and `world_size`; batch/shape/threshold fields are filled in
            from the static config here.
    """
    print('=============yolov3 start evaluating==================')

    # Pull evaluation hyper-parameters from the static config.
    args.batch_size = config.batch_size
    args.input_shape = config.input_shape
    args.result_path = config.result_path
    args.conf_thresh = config.conf_thresh
    args.nms_thresh = config.nms_thresh

    context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE,
                                      device_num=args.world_size,
                                      gradients_mean=True)
    mindrecord_path = args.mindrecord_path
    print('Loading data from {}'.format(mindrecord_path))

    num_classes = config.num_classes
    if num_classes > 1:
        raise NotImplementedError(
            'num_classes > 1: Yolov3 postprocess not implemented!')

    anchors = config.anchors
    anchors_mask = config.anchors_mask
    num_anchors_list = [len(x) for x in anchors_mask]
    # Strides of the three detection heads.
    reduction_0 = 64.0
    reduction_1 = 32.0
    reduction_2 = 16.0
    labels = ['face']
    classes = {0: 'face'}

    # dataloader
    ds = de.MindDataset(
        mindrecord_path + "0",
        columns_list=["image", "annotation", "image_name", "image_size"])
    single_scale_trans = SingleScaleTrans(resize=args.input_shape)
    ds = ds.batch(
        args.batch_size,
        per_batch_map=single_scale_trans,
        input_columns=["image", "annotation", "image_name", "image_size"],
        num_parallel_workers=8)
    args.steps_per_epoch = ds.get_dataset_size()

    # backbone
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                # optimizer state, not needed for inference
                continue
            elif key.startswith('network.'):
                # strip the training-wrapper prefix
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        print('load model {} success'.format(args.pretrained))
    else:
        print(
            'load model {} failed, please check the path of model, evaluating end'
            .format(args.pretrained))
        exit(0)

    ds = ds.repeat(1)
    det = {}
    img_size = {}
    img_anno = {}
    model_name = args.pretrained.split('/')[-1].replace('.ckpt', '')
    result_path = os.path.join(args.result_path, model_name)
    # FIX: removed the dead `if os.path.exists(result_path): pass` branch;
    # exist_ok=True already makes directory creation idempotent.
    os.makedirs(result_path, exist_ok=True)

    # result file
    ret_files_set = {
        'face': os.path.join(result_path, 'comp4_det_test_face_rm5050.txt'),
    }

    test_net = BuildTestNetwork(network, reduction_0, reduction_1, reduction_2,
                                anchors, anchors_mask, num_classes, args)
    print('conf_thresh:', args.conf_thresh)

    eval_times = 0
    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        batch_image_name = data[2]
        batch_image_size = data[3]
        eval_times += 1
        img_tensor = Tensor(batch_images, mstype.float32)
        dets = []
        tdets = []
        coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2, cls_scores_2 = \
            test_net(img_tensor)
        boxes_0, boxes_1, boxes_2 = get_bounding_boxes(
            coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2,
            cls_scores_2, args.conf_thresh, args.input_shape, num_classes)
        converted_boxes_0, converted_boxes_1, converted_boxes_2 = tensor_to_brambox(
            boxes_0, boxes_1, boxes_2, args.input_shape, labels)
        tdets.append(converted_boxes_0)
        tdets.append(converted_boxes_1)
        tdets.append(converted_boxes_2)
        # Merge the three heads' boxes per image.
        batch = len(tdets[0])
        for b in range(batch):
            single_dets = []
            for op in range(3):
                single_dets.extend(tdets[op][b])
            dets.append(single_dets)
        det.update({
            batch_image_name[k].decode('UTF-8'): v
            for k, v in enumerate(dets)
        })
        img_size.update({
            batch_image_name[k].decode('UTF-8'): v
            for k, v in enumerate(batch_image_size)
        })
        img_anno.update({
            batch_image_name[k].decode('UTF-8'): v
            for k, v in enumerate(batch_labels)
        })
    print('eval times:', eval_times)
    print('batch size: ', args.batch_size)

    netw, neth = args.input_shape
    reorg_dets = voc_wrapper.reorg_detection(det, netw, neth, img_size)
    voc_wrapper.gen_results(reorg_dets, result_path, img_size, args.nms_thresh)

    # compute mAP
    ground_truth = parse_gt_from_anno(img_anno, classes)
    ret_list = parse_rets(ret_files_set)
    iou_thr = 0.5
    evaluate = calc_recall_precision_ap(ground_truth, ret_list, iou_thr)

    aps_str = ''
    for cls in evaluate:
        # NOTE: 'presicion' (sic) must match the key produced by
        # calc_recall_precision_ap — do not "fix" the spelling here alone.
        per_line, = plt.plot(evaluate[cls]['recall'],
                             evaluate[cls]['presicion'], 'b-')
        per_line.set_label('%s:AP=%.3f' % (cls, evaluate[cls]['ap']))
        aps_str += '_%s_AP_%.3f' % (cls, evaluate[cls]['ap'])
    # Diagonal reference line for the PR plot.
    plt.plot([i / 1000.0 for i in range(1, 1001)],
             [i / 1000.0 for i in range(1, 1001)], 'y--')
    plt.axis([0, 1.2, 0, 1.2])
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.grid()
    plt.legend()
    plt.title('PR')
    # save mAP
    ap_save_path = os.path.join(
        result_path, result_path.replace('/', '_') + aps_str + '.png')
    print('Saving {}'.format(ap_save_path))
    plt.savefig(ap_save_path)
    print('=============yolov3 evaluating finished==================')
def train():
    """Train FCN-8s on VOC2012.

    Supports single- and multi-device (data-parallel) training; optionally
    warm-starts the backbone from a VGG16 checkpoint or resumes from a
    pretrained FCN-8s checkpoint.
    """
    args = parse_args()
    cfg = FCN8s_VOC2012_cfg
    device_num = int(os.environ.get("DEVICE_NUM", 1))
    # init multicards training
    if device_num > 1:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          gradients_mean=True,
                                          device_num=device_num)
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        # FIX: the single-device path previously left args.rank/group_size
        # unset (used below for shard_id/shard_num and rank-0 checkpointing);
        # set them explicitly, matching the other training scripts here.
        args.rank = 0
        args.group_size = 1

    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        save_graphs=False,
                        device_target="Ascend",
                        device_id=args.device_id)

    # dataset
    dataset = data_generator.SegDataset(image_mean=cfg.image_mean,
                                        image_std=cfg.image_std,
                                        data_file=cfg.data_file,
                                        batch_size=cfg.batch_size,
                                        crop_size=cfg.crop_size,
                                        max_scale=cfg.max_scale,
                                        min_scale=cfg.min_scale,
                                        ignore_label=cfg.ignore_label,
                                        num_classes=cfg.num_classes,
                                        num_readers=2,
                                        num_parallel_calls=4,
                                        shard_id=args.rank,
                                        shard_num=args.group_size)
    dataset = dataset.get_dataset(repeat=1)

    net = FCN8s(n_class=cfg.num_classes)
    loss_ = loss.SoftmaxCrossEntropyLoss(cfg.num_classes, cfg.ignore_label)

    # load pretrained vgg16 parameters to init FCN8s
    if cfg.ckpt_vgg16:
        param_vgg = load_checkpoint(cfg.ckpt_vgg16)
        param_dict = {}
        # Map VGG16 backbone parameter names onto FCN8s layer names.
        for layer_id in range(1, 6):
            # conv blocks 1-2 have two sub-layers, blocks 3-5 have three
            sub_layer_num = 2 if layer_id < 3 else 3
            for sub_layer_id in range(sub_layer_num):
                # conv param
                y_weight = 'conv{}.{}.weight'.format(layer_id, 3 * sub_layer_id)
                x_weight = 'vgg16_feature_extractor.conv{}_{}.0.weight'.format(layer_id, sub_layer_id + 1)
                param_dict[y_weight] = param_vgg[x_weight]
                # BatchNorm param
                y_gamma = 'conv{}.{}.gamma'.format(layer_id, 3 * sub_layer_id + 1)
                y_beta = 'conv{}.{}.beta'.format(layer_id, 3 * sub_layer_id + 1)
                x_gamma = 'vgg16_feature_extractor.conv{}_{}.1.gamma'.format(layer_id, sub_layer_id + 1)
                x_beta = 'vgg16_feature_extractor.conv{}_{}.1.beta'.format(layer_id, sub_layer_id + 1)
                param_dict[y_gamma] = param_vgg[x_gamma]
                param_dict[y_beta] = param_vgg[x_beta]
        load_param_into_net(net, param_dict)
    # load pretrained FCN8s
    elif cfg.ckpt_pre_trained:
        param_dict = load_checkpoint(cfg.ckpt_pre_trained)
        load_param_into_net(net, param_dict)

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()
    lr_scheduler = CosineAnnealingLR(cfg.base_lr,
                                     cfg.train_epochs,
                                     iters_per_epoch,
                                     cfg.train_epochs,
                                     warmup_epochs=0,
                                     eta_min=0)
    lr = Tensor(lr_scheduler.get_lr())

    # loss scale
    manager_loss_scale = FixedLossScaleManager(cfg.loss_scale,
                                               drop_overflow_update=False)
    optimizer = nn.Momentum(params=net.trainable_params(),
                            learning_rate=lr,
                            momentum=0.9,
                            weight_decay=0.0001,
                            loss_scale=cfg.loss_scale)
    model = Model(net,
                  loss_fn=loss_,
                  loss_scale_manager=manager_loss_scale,
                  optimizer=optimizer,
                  amp_level="O3")

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]
    # Only the master rank writes checkpoints.
    if args.rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_steps,
                                     keep_checkpoint_max=cfg.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=cfg.model,
                                     directory=cfg.train_dir,
                                     config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(cfg.train_epochs, dataset, callbacks=cbs)
# YOLOv4 evaluation setup: build the inference network and load the
# pretrained checkpoint (stripping training-only parameter prefixes).
args.logger.info('Creating Network....')
network = YOLOV4CspDarkNet53(is_training=False)

args.logger.info(args.pretrained)
if os.path.isfile(args.pretrained):
    param_dict = load_checkpoint(args.pretrained)
    param_dict_new = {}
    for key, values in param_dict.items():
        if key.startswith('moments.'):
            # optimizer state, irrelevant for inference
            continue
        elif key.startswith('yolo_network.'):
            # strip the training-wrapper prefix
            param_dict_new[key[13:]] = values
        else:
            param_dict_new[key] = values
    load_param_into_net(network, param_dict_new)
    args.logger.info('load_model {} success'.format(args.pretrained))
else:
    args.logger.info('{} not exists or not a pre-trained file'.format(
        args.pretrained))
    # FIX: removed the original `assert FileNotFoundError(...)` — an
    # exception *instance* is always truthy, so that assert never fired.
    # The effective abort is (and remains) exit(1).
    exit(1)

data_root = args.data_root
ann_file = args.ann_file

config = ConfigYOLOV4CspDarkNet53()
if args.testing_shape:
    config.test_img_shape = convert_testing_shape(args.testing_shape)
def _batch_accuracy(net, inputs, labels, batch_size):
    """Return the mean per-batch accuracy of `net` on (inputs, one-hot labels).

    Iterates whole batches only; a trailing partial batch is ignored,
    matching the original inline evaluation loops.
    """
    acc_list = []
    batchs = inputs.shape[0] // batch_size
    for i in range(batchs):
        batch_inputs = inputs[i * batch_size:(i + 1) * batch_size]
        batch_labels = np.argmax(labels[i * batch_size:(i + 1) * batch_size],
                                 axis=1)
        logits = net(Tensor(batch_inputs)).asnumpy()
        label_pred = np.argmax(logits, axis=1)
        acc_list.append(np.mean(batch_labels == label_pred))
    return np.mean(acc_list)


def test_nad_method():
    """
    NAD-Defense test.

    Measures accuracy on clean and FGSM-adversarial MNIST data before and
    after Natural Adversarial Defense training. The four identical accuracy
    loops of the original are factored into _batch_accuracy.
    """
    # 1. load trained network
    ckpt_name = './trained_ckpt_file/checkpoint_lenet-10_1875.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_name)
    load_param_into_net(net, load_dict)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=False)
    opt = nn.Momentum(net.trainable_params(), 0.01, 0.09)
    nad = NaturalAdversarialDefense(net, loss_fn=loss, optimizer=opt,
                                    bounds=(0.0, 1.0), eps=0.3)

    # 2. get test data
    data_list = "./MNIST_unzip/test"
    batch_size = 32
    ds_test = generate_mnist_dataset(data_list, batch_size=batch_size,
                                     sparse=False)
    inputs = []
    labels = []
    for data in ds_test.create_tuple_iterator():
        inputs.append(data[0].astype(np.float32))
        labels.append(data[1])
    inputs = np.concatenate(inputs)
    labels = np.concatenate(labels)

    # 3. get accuracy of test data on original model
    net.set_train(False)
    LOGGER.debug(TAG, 'accuracy of TEST data on original model is : %s',
                 _batch_accuracy(net, inputs, labels, batch_size))

    # 4. get adv of test data
    attack = FastGradientSignMethod(net, eps=0.3)
    adv_data = attack.batch_generate(inputs, labels)
    LOGGER.debug(TAG, 'adv_data.shape is : %s', adv_data.shape)

    # 5. get accuracy of adv data on original model
    net.set_train(False)
    LOGGER.debug(TAG, 'accuracy of adv data on original model is : %s',
                 _batch_accuracy(net, adv_data, labels, batch_size))

    # 6. defense
    net.set_train()
    nad.batch_defense(inputs, labels, batch_size=32, epochs=10)

    # 7. get accuracy of test data on defensed model
    net.set_train(False)
    LOGGER.debug(TAG, 'accuracy of TEST data on defensed model is : %s',
                 _batch_accuracy(net, inputs, labels, batch_size))

    # 8. get accuracy of adv data on defensed model
    LOGGER.debug(TAG, 'accuracy of adv data on defensed model is : %s',
                 _batch_accuracy(net, adv_data, labels, batch_size))
def do_train(dataset=None, network=None, load_checkpoint_path="",
             save_checkpoint_path="", epoch_num=1):
    """
    Do train
    Args:
        dataset: the train dataset.
        network: the network with loss.
        load_checkpoint_path: the file path which saved pretrain model checkpoint.
        save_checkpoint_path: the file path which will save finetune model checkpoint.
        epoch_num: the number of epoch.
    Raises:
        ValueError: if no pretrain checkpoint path is given, or the configured
            optimizer is not one of [AdamWeightDecay, Lamb, Momentum].
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()

    # optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()
        # Apply weight decay only to parameters selected by the decay filter.
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }]
        optimizer = AdamWeightDecay(group_params, lr_schedule,
                                    eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.Lamb.learning_rate,
            end_learning_rate=cfg.Lamb.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             cfg.Momentum.learning_rate,
                             cfg.Momentum.momentum)
    else:
        # FIX: was a bare `raise Exception`; ValueError is more precise and
        # backward-compatible (it is an Exception subclass).
        raise ValueError(
            "Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    prefix_name = "gpt2_summarization_" + str(cfg.gpt2_network) + "_" + str(cfg.optimizer) + "_" \
                  + str(epoch_num) + "_bs" + str(gpt2_net_cfg.batch_size)
    ckpoint_cb = ModelCheckpoint(
        prefix=prefix_name,
        directory=None if save_checkpoint_path == "" else save_checkpoint_path,
        config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    final_param_dict = {}
    # Re-prefix backbone weights to match the finetune wrapper network.
    # (iterate keys directly; the values were fetched by key anyway)
    for name in param_dict:
        final_param_dict['gpt2.gpt2.' + name] = param_dict[name]
    # Tie the LM head to the embedding table.
    final_param_dict['gpt2.lm_head.weight'] = param_dict[
        'gpt2_embedding_lookup.embedding_table']
    load_param_into_net(network, final_param_dict)
    print("Load pretrained parameter successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer,
                                    scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    loss_cb = LossMonitor(per_print_times=1)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]
    print("============== Starting Finetuning ==============")
    model.train(epoch_num, dataset, callbacks=callbacks,
                dataset_sink_mode=False)
    print("============== Finetuning Success ==============")
    layers=hparams.layers,
    stacks=hparams.stacks,
    residual_channels=hparams.residual_channels,
    gate_channels=hparams.gate_channels,
    skip_out_channels=hparams.skip_out_channels,
    cin_channels=hparams.cin_channels,
    gin_channels=hparams.gin_channels,
    n_speakers=hparams.n_speakers,
    dropout=hparams.dropout,
    kernel_size=hparams.kernel_size,
    cin_pad=hparams.cin_pad,
    upsample_conditional_features=hparams.upsample_conditional_features,
    upsample_params=upsample_params,
    scalar_input=is_scalar_input(hparams.input_type),
    output_distribution=hparams.output_distribution,
)
# Wrap the WaveNet model in the inference-only network and freeze it.
Net = PredictNet(model)
Net.set_train(False)
receptive_field = model.receptive_field
print("Receptive field (samples / ms): {} / {}".format(
    receptive_field, receptive_field / fs * 1000))
# Load pretrained weights into the underlying model (Net wraps `model`).
param_dict = load_checkpoint(args.pretrain_ckpt)
load_param_into_net(model, param_dict)
print('Successfully loading the pre-trained model')

# Dummy inputs fixing the exported graph's input shapes.
# NOTE(review): shapes assumed to be x=(batch, channels, samples),
# c=(batch, mel-bins, frames), g=speaker ids — confirm against hparams.
x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
g = np.array([0, 0], dtype=np.int64)
export(Net, Tensor(x), Tensor(c), Tensor(g),
       file_name="WaveNet", file_format='MINDIR')