# select for master rank save ckpt or all rank save, compatible for model parallel args.rank_save_ckpt_flag = 0 if args.is_save_on_master: if args.rank == 0: args.rank_save_ckpt_flag = 1 else: args.rank_save_ckpt_flag = 1 # logger args.outputs_dir = os.path.join(args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) args.logger = get_logger(args.outputs_dir, args.rank) if args.dataset == "cifar10": dataset = vgg_create_dataset(args.data_path, args.image_size, args.per_batch_size, args.rank, args.group_size) else: dataset = classification_dataset(args.data_path, args.image_size, args.per_batch_size, args.rank, args.group_size) batch_num = dataset.get_dataset_size() args.steps_per_epoch = dataset.get_dataset_size() args.logger.save_args(args) # network args.logger.important_info('start create network') # get network and init network = vgg16(args.num_classes, args) # pre_trained
def test(cloud_args=None):
    """Evaluate VGG16 checkpoint(s).

    Two evaluation paths, chosen by ``args.dataset``:

    * ``"cifar10"``: load a single checkpoint and report MindSpore's built-in
      ``'acc'`` metric via ``Model.eval``.
    * anything else: for each checkpoint discovered (a directory of ``*.ckpt``
      files or a single file in ``args.pretrained``), run a manual inference
      loop and log top-1 / top-5 accuracy and throughput.

    Args:
        cloud_args: optional overrides forwarded to ``parse_args`` (semantics
            defined by ``parse_args``, not visible here).
    """
    args = parse_args(cloud_args)

    # graph mode + auto mixed precision on the configured device
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)
    # honor an explicit DEVICE_ID from the environment when it is numeric
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # per-run timestamped log directory
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.dataset == "cifar10":
        net = vgg16(num_classes=args.num_classes)
        # opt/loss are required to build a Model even though only eval runs here
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01, cfg.momentum, weight_decay=args.weight_decay)
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

        param_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(net, param_dict)
        net.set_train(False)
        # NOTE(review): positional args here are (data_path, 1, False), which does
        # not match the (data_path, image_size, per_batch_size, ...) shape used in
        # the other branch — presumably an older (repeat_num, training) signature;
        # verify against vgg_create_dataset's definition.
        dataset = vgg_create_dataset(args.data_path, 1, False)
        res = model.eval(dataset)
        print("result: ", res)
    else:
        # network
        args.logger.important_info('start create network')

        if os.path.isdir(args.pretrained):
            # evaluate every checkpoint in the directory
            models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
            print(models)
            # sort key negates the step number parsed from the filename, so
            # sorted() yields highest-step (newest) checkpoints first
            if args.graph_ckpt:
                # assumes names like "prefix-epoch_step.ckpt" — TODO confirm
                f = lambda x: -1 * int(
                    os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].
                    split('_')[0])
            else:
                # assumes names like "prefix_step.ckpt" — TODO confirm
                f = lambda x: -1 * int(
                    os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
            args.models = sorted(models, key=f)
        else:
            # single checkpoint file
            args.models = [args.pretrained,]

        for model in args.models:
            if args.dataset == "cifar10":
                dataset = vgg_create_dataset(args.data_path, args.image_size,
                                             args.per_batch_size, training=False)
            else:
                dataset = classification_dataset(args.data_path, args.image_size,
                                                 args.per_batch_size)
            eval_dataloader = dataset.create_tuple_iterator()
            network = vgg16(args.num_classes, args, phase="test")

            # pre_trained
            load_param_into_net(network, load_checkpoint(model))
            network.add_flags_recursive(fp16=True)

            img_tot = 0
            top1_correct = 0
            top5_correct = 0

            network.set_train(False)
            t_end = time.time()
            it = 0
            for data, gt_classes in eval_dataloader:
                output = network(Tensor(data, mstype.float32))
                output = output.asnumpy()

                top1_output = np.argmax(output, (-1))
                # indices of the 5 highest-scoring classes per sample
                top5_output = np.argsort(output)[:, -5:]

                t1_correct = np.equal(top1_output, gt_classes).sum()
                top1_correct += t1_correct
                # get_top5_acc is a helper defined elsewhere; presumably returns
                # the count of samples whose label is in the top-5 — verify
                top5_correct += get_top5_acc(top5_output, gt_classes)
                img_tot += args.per_batch_size

                # reset the timer after the first batch so warm-up/compile time
                # is excluded from the throughput figure
                if args.rank == 0 and it == 0:
                    t_end = time.time()
                    it = 1
            if args.rank == 0:
                time_used = time.time() - t_end
                # first batch excluded from both the image count and the elapsed time
                fps = (img_tot - args.per_batch_size) * args.group_size / time_used
                args.logger.info(
                    'Inference Performance: {:.2f} img/sec'.format(fps))

            # column-vector layout [[top1], [top5], [tot]]; the log text says
            # "allreduce" but no allreduce op is visible in this function —
            # NOTE(review): confirm whether a cross-rank reduction was intended
            results = [[top1_correct], [top5_correct], [img_tot]]
            args.logger.info('before results={}'.format(results))
            results = np.array(results)
            args.logger.info('after results={}'.format(results))

            top1_correct = results[0, 0]
            top5_correct = results[1, 0]
            img_tot = results[2, 0]
            acc1 = 100.0 * top1_correct / img_tot
            acc5 = 100.0 * top5_correct / img_tot
            args.logger.info('after allreduce eval: top1_correct={}, tot={},'
                             'acc={:.2f}%(TOP1)'.format(
                                 top1_correct, img_tot, acc1))
            args.logger.info('after allreduce eval: top5_correct={}, tot={},'
                             'acc={:.2f}%(TOP5)'.format(
                                 top5_correct, img_tot, acc5))
# NOTE(review): chunk arrived with newlines/indentation stripped and is TRUNCATED
# mid-statement at the end (the Momentum(...) call is cut off) — the remainder
# lies outside this view. Nesting below is reconstructed; in particular init()
# is assumed to belong inside the multi-device branch — confirm.

args_opt = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(device_id=args_opt.device_id)

# data-parallel setup when launched with more than one device
device_num = int(os.environ.get("DEVICE_NUM", 1))
if device_num > 1:
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=device_num,
                                      parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True)
    init()

dataset = vgg_create_dataset(args_opt.data_path, 1)
batch_num = dataset.get_dataset_size()

net = vgg16(num_classes=cfg.num_classes)
# pre_trained: warm-start from a checkpoint when one is supplied
if args_opt.pre_trained:
    load_param_into_net(net, load_checkpoint(args_opt.pre_trained))

# warmup + decay schedule over the whole run
lr = lr_steps(0, lr_init=cfg.lr_init, lr_max=cfg.lr_max, warmup_epochs=cfg.warmup_epochs,
              total_epochs=cfg.epoch_size, steps_per_epoch=batch_num)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
               Tensor(lr),
# NOTE(review): chunk arrived with newlines/indentation stripped and begins
# mid-statement — the first line below is the tail of a parser.add_argument
# call whose opening lies outside this view. Nesting is reconstructed.
                    help='device where the code will be implemented. (Default: Ascend)')
parser.add_argument('--data_path', type=str, default='./cifar',
                    help='path where the dataset is saved')
parser.add_argument('--device_id', type=int, default=None,
                    help='device id of GPU or Ascend. (Default: None)')
args_opt = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(device_id=args_opt.device_id)

# data-parallel setup when launched with more than one device
device_num = int(os.environ.get("DEVICE_NUM", 1))
if device_num > 1:
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=device_num,
                                      parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True)
    init()

# NOTE(review): second positional arg differs from the sibling script's
# vgg_create_dataset(data_path, 1) — presumably a repeat count; verify.
dataset = vgg_create_dataset(args_opt.data_path, cfg.epoch_size)
batch_num = dataset.get_dataset_size()
net = vgg16(num_classes=cfg.num_classes)
# NOTE(review): lr_max is fed cfg.lr_init here, while the sibling script passes
# lr_init=cfg.lr_init, lr_max=cfg.lr_max — looks like a copy/paste slip; confirm.
lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr),
               cfg.momentum, weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
# O2 mixed precision; no loss scaling
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
              amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)

# checkpoint every 5 epochs' worth of steps
config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
time_cb = TimeMonitor(data_size=batch_num)
ckpoint_cb = ModelCheckpoint(prefix="train_vgg_cifar10", directory="./", config=config_ck)
loss_cb = LossMonitor()
model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])