def resnet50_train(args_opt): epoch_size = args_opt.epoch_size batch_size = 32 class_num = 10 loss_scale_num = 1024 local_data_path = '/cache/data' # set graph mode and parallel mode context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) context.set_context(device_id=device_id) if device_num > 1: context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() local_data_path = os.path.join(local_data_path, str(device_id)) # data download print('Download data.') mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path) # create dataset print('Create train and evaluate dataset.') train_dataset = create_dataset(dataset_path=local_data_path, do_train=True, repeat_num=1, batch_size=batch_size) eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False, repeat_num=1, batch_size=batch_size) train_step_size = train_dataset.get_dataset_size() print('Create dataset success.') # create model net = resnet50(class_num=class_num) # reduction='mean' means that apply reduction of mean to loss loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') lr = Tensor( get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size)) opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num) loss_scale = FixedLossScaleManager(loss_scale_num, False) # amp_level="O2" means that the hybrid precision of O2 mode is used for training # the whole network except that batchnoram will be cast into float16 format and dynamic loss scale will be used # 'keep_batchnorm_fp32 = False' means that use the float16 format model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}) # define performance callback to show ips and loss callback to show loss for every epoch performance_cb = PerformanceCallback(batch_size) loss_cb = LossMonitor() cb = [performance_cb, loss_cb] print(f'Start run training, total epoch: {epoch_size}.') model.train(epoch_size, train_dataset, callbacks=cb) if device_num == 1 or device_id == 0: print(f'Start run evaluation.') output = model.eval(eval_dataset) print(f'Evaluation result: {output}.')
'params': no_decayed_params }, { 'order_params': net.trainalbe_params() }] opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) # define loss, model if target == "Ascend": if args_opt.dataset == "imagenet2012": if not config.use_label_smooth: config.label_smooth_factor = 0.0 loss = SoftmaxCrossEntropyWithLogits( sparse=True, reduction="mean", smooth_factor=config.label_smooth_factor, num_classes=config.class_num) else: loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) else: # GPU target
parser.add_argument('--checkpoint_path', type=str, default=None, help='Pretrained checkpoint path') args = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) dataset_sink_mode = not args.device_target == "CPU" # define net net = resnet( class_num=10 ) #if you wish to consider other module you can pass also this argument # ckpoint = args.checkpoint_path # define loss, model loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') #define cifar-10 path cifar_path = "./CIFAR-10" # define model model = Model(net, loss_fn=loss, metrics={"Accuracy": Accuracy()}) # eval model test_net(net, model, cifar_path, args.checkpoint_path) # # config for resent50, cifar10 # config1 = ed({ # "class_num": 10, # "batch_size": 32, # "loss_scale": 1024, # "momentum": 0.9, # "weight_decay": 1e-4, # "epoch_size": 90,
def resnet50_train(args_opt): epoch_size = args_opt.epoch_size batch_size = cfg.batch_size class_num = cfg.class_num loss_scale_num = cfg.loss_scale local_data_path = '/cache/data' local_ckpt_path = '/cache/ckpt_file' # set graph mode and parallel mode context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) # data download print('Download data.') mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path) # create dataset print('Create train and evaluate dataset.') train_dataset = create_dataset(dataset_path=local_data_path, do_train=True, repeat_num=epoch_size, batch_size=batch_size) train_step_size = train_dataset.get_dataset_size() print('Create dataset success.') # create model net = resnet50(class_num=class_num) # reduction='mean' means that apply reduction of mean to loss loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') lr = Tensor( get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size)) opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num) loss_scale = FixedLossScaleManager(loss_scale_num, False) # amp_level="O2" means that the hybrid precision of O2 mode is used for training # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used # 'keep_batchnorm_fp32 = False' means that use the float16 format model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}) # define performance callback to show ips and loss callback to show loss for every epoch time_cb = TimeMonitor(data_size=train_step_size) performance_cb = PerformanceCallback(batch_size) loss_cb = LossMonitor() cb = [time_cb, performance_cb, loss_cb] config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck) cb += [ckpt_cb] print(f'Start run training, total epoch: {epoch_size}.') model.train(epoch_size, train_dataset, callbacks=cb) # upload checkpoint files print('Upload checkpoint.') mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
if run_distribute: context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL, parameter_broadcast=True, mirror_mean=True) auto_parallel_context().set_all_reduce_fusion_split_indices([140]) init() epoch_size = config.epoch_size net = mobilenet_v2(num_classes=config.num_classes) net.to_float(mstype.float16) for _, cell in net.cells_and_names(): if isinstance(cell, nn.Dense): cell.add_flags_recursive(fp32=True) if config.label_smooth > 0: loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth, num_classes=config.num_classes) else: loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') print("train args: ", args_opt, "\ncfg: ", config, "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size)) dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=epoch_size, batch_size=config.batch_size) step_size = dataset.get_dataset_size() if args_opt.pre_trained: param_dict = load_checkpoint(args_opt.pre_trained) load_param_into_net(net, param_dict) loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config.lr, warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size)) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
if __name__ == '__main__': if args_opt.do_eval: context.set_context(enable_hccl=False) else: if args_opt.run_distribute: context.set_context(enable_hccl=True) context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) auto_parallel_context().set_all_reduce_fusion_split_indices([140]) init() else: context.set_context(enable_hccl=False) epoch_size = config.epoch_size net = resnet50(class_num=config.class_num) loss = SoftmaxCrossEntropyWithLogits(sparse=True) if args_opt.do_train: dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=epoch_size, batch_size=config.batch_size) step_size = dataset.get_dataset_size() loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size, lr_decay_mode='poly')) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay, config.loss_scale) model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})
def __init__(self, reduction='mean'): super(CrossEntropyLoss2, self).__init__() self.cross_entropy = SoftmaxCrossEntropyWithLogits(reduction=reduction)
def __init__(self, reduction='mean'): super(CrossEntropyLoss, self).__init__() self.reduce_mean = P.ReduceMean() self.cross_entropy = SoftmaxCrossEntropyWithLogits() self.reduction = reduction