def __init__(self, network, total_steps=1, sens=16384.0):
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_train()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    lr = dynamic_lr(0.01, total_steps, 5000)
    self.optimizer = nn.Adam(self.weights, learning_rate=lr, beta1=0.9,
                             beta2=0.999, eps=1e-8, loss_scale=sens)
    self.hyper_map = C.HyperMap()
    # sens_param=True lets construct() feed the loss-scale value in as the
    # sensitivity of the backward pass.
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        # Average gradients across devices before the optimizer step.
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.optimizer.parameters,
                                                   mean, degree)
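# The matching construct() is not shown above; this is a minimal sketch of the
# usual MindSpore manual-gradient pattern, assuming P and F are the standard
# mindspore.ops.operations / mindspore.ops.functional aliases and that the
# wrapped network takes (batch_ids, batch_wts, label) as Wide&Deep-style
# models do. The argument names are assumptions, not the original signature.
def construct(self, batch_ids, batch_wts, label):
    loss = self.network(batch_ids, batch_wts, label)
    # Fill a tensor with the loss-scale value to use as the backward sens.
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    grads = self.grad(self.network, self.weights)(batch_ids, batch_wts, label, sens)
    if self.reducer_flag:
        # All-reduce (and optionally average) gradients across devices.
        grads = self.grad_reducer(grads)
    return F.depend(loss, self.optimizer(grads))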
def train():
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(device_num=args.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        rank_id = get_rank()

    # dataset/network/criterion/optim
    ds = train_dataset_creator(args.device_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)
    lrs = dynamic_lr(config.BASE_LR, config.TRAIN_TOTAL_ITER,
                     config.WARMUP_STEP, config.WARMUP_RATIO)
    opt = nn.SGD(params=net.trainable_params(), learning_rate=lrs,
                 momentum=0.99, weight_decay=5e-4)

    # wrap model with loss and the one-step training cell
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net, opt, reduce_flag=True, mean=True,
                               degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # set and apply checkpoint parameters; checkpoints go to ./ckpt_<rank_id>
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet", config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))
    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM, ds, dataset_sink_mode=True,
                callbacks=[time_cb, loss_cb, ckpoint_cb])
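# dynamic_lr above comes from the repo's lr_schedule module. For reference, a
# minimal sketch with the same signature (the real decay policy may differ;
# this assumes linear warmup from base_lr * warmup_ratio up to base_lr,
# followed by polynomial decay with power 0.9):
def dynamic_lr_sketch(base_lr, total_iters, warmup_iters, warmup_ratio=1.0 / 3):
    lr = []
    for i in range(total_iters):
        if i < warmup_iters:
            # ramp linearly from base_lr * warmup_ratio up to base_lr
            alpha = float(i) / warmup_iters
            lr.append(base_lr * (warmup_ratio * (1 - alpha) + alpha))
        else:
            # decay polynomially over the remaining iterations
            frac = 1.0 - float(i - warmup_iters) / (total_iters - warmup_iters)
            lr.append(base_lr * frac ** 0.9)
    return lr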
def train_net(data_dir, seg_dir, run_distribute, config=None):
    # NOTE: redefined below; kept here as a forward-pass smoke test that
    # builds the network and runs it once on dummy data.
    network = UNet3d(config=config)
    lr = Tensor(dynamic_lr(config, 877), mstype.float32)
    print(lr)
    # loss = SoftmaxCrossEntropyWithLogits()
    loss = nn.DiceLoss()
    network.set_train()
    inputs = mindspore.Tensor(np.ones((1, 1, 144, 144, 144), np.float32))
    output = network(inputs)
def train_net__(data_dir, seg_dir, run_distribute, config=None):
    # Debug variant of train_net: a hand-written training loop on random data.
    train_data_size = 5
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)
    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    # Manual O3-style mixed precision: cast the network to float16, keep
    # batch norm in float32, then attach the loss in float16.
    network.set_train()
    network.to_float(mstype.float16)
    _do_keep_batchnorm_fp32(network)
    network = _add_loss_network(network, loss, mstype.float16)

    loss_scale = scale_manager.get_loss_scale()
    update_cell = scale_manager.get_update_cell()
    if update_cell is not None:
        model = nn.TrainOneStepWithLossScaleCell(network, optimizer,
                                                 scale_sense=update_cell).set_train()
    else:
        model = nn.TrainOneStepCell(network, optimizer, loss_scale).set_train()

    inputs = mindspore.Tensor(np.random.rand(1, 1, 224, 224, 96), mstype.float32)
    labels = mindspore.Tensor(np.random.rand(1, 4, 224, 224, 96), mstype.float32)
    step_per_epoch = train_data_size

    print("============== Starting Training ==============")
    for epoch_id in range(config.epoch_size):
        time_epoch = 0.0
        for step_id in range(step_per_epoch):
            time_start = time.time()
            out = model(inputs, labels)
            # TrainOneStepWithLossScaleCell returns (loss, overflow, scale);
            # plain TrainOneStepCell returns the loss alone.
            loss_value = out[0] if isinstance(out, tuple) else out
            loss_value = loss_value.asnumpy()
            time_step = time.time() - time_start
            time_epoch += time_step
            print('Epoch: [%3d/%3d], step: [%5d/%5d], loss: [%6.4f], time: [%.4f]'
                  % (epoch_id, config.epoch_size, step_id, step_per_epoch,
                     loss_value, time_step))
        print('Epoch time: %10.4f, per step time: %7.4f'
              % (time_epoch, time_epoch / step_per_epoch))
    print("============== End Training ==============")
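# Note on the branch above: with drop_overflow_update=False,
# FixedLossScaleManager.get_update_cell() returns None, so this debug loop
# takes the plain TrainOneStepCell path and passes the fixed scale as the
# sens argument; only drop_overflow_update=True yields an update cell and
# overflow-skipping behavior.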
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1

    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir,
    #                                config=config, rank_size=rank_size,
    #                                rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()
    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)
    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    network.set_train()

    # amp_level='O3' casts the whole network to float16; the fixed loss scale
    # keeps the float16 gradients from underflowing.
    model = Model(network, loss_fn=loss, optimizer=optimizer,
                  loss_scale_manager=scale_manager, amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_id),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]

    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset, callbacks=callbacks_list,
                dataset_sink_mode=False)
    print("============== End Training ==============")
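# create_dataset_diy() is a repo-local helper whose definition is not shown.
# A hypothetical stand-in sketch for experimentation, assuming it just yields
# random image/segmentation pairs shaped like the UNet3d debug tensors above:
def create_dataset_diy_sketch(num_samples=5):
    import mindspore.dataset as de

    def gen():
        for _ in range(num_samples):
            yield (np.random.rand(1, 224, 224, 96).astype(np.float32),
                   np.random.rand(4, 224, 224, 96).astype(np.float32))

    return de.GeneratorDataset(gen, column_names=["image", "seg"]).batch(1)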
load_path = args_opt.pre_trained
if args_opt.task_type == "Pretraining":
    print("load backbone vgg16 ckpt {}".format(args_opt.pre_trained))
    param_dict = load_checkpoint(load_path)
    # Keep only the VGG16 backbone weights; iterate over a snapshot of the
    # keys so popping entries during the loop is safe.
    for item in list(param_dict.keys()):
        if not item.startswith('vgg16_feature_extractor'):
            param_dict.pop(item)
    load_param_into_net(net, param_dict)
else:
    if load_path != "":
        print("load pretrain ckpt {}".format(args_opt.pre_trained))
        param_dict = load_checkpoint(load_path)
        load_param_into_net(net, param_dict)

loss = LossNet()
lr = Tensor(dynamic_lr(training_cfg, dataset_size), mstype.float32)
opt = Momentum(params=net.trainable_params(), learning_rate=lr,
               momentum=config.momentum, weight_decay=config.weight_decay,
               loss_scale=config.loss_scale)
net_with_loss = WithLossCell(net, loss)
if args_opt.run_distribute:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale,
                           reduce_flag=True, mean=True, degree=device_num)
else:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)

time_cb = TimeMonitor(data_size=dataset_size)
loss_cb = LossCallBack(rank_id=rank)
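# Effect of the "Pretraining" filter above, with hypothetical key names:
#   {'vgg16_feature_extractor.conv1.weight', 'rpn.cls_score.weight', ...}
#   -> {'vgg16_feature_extractor.conv1.weight'}
# i.e. only the VGG16 backbone is initialized from the checkpoint; the
# detection head starts from random weights.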
# (the opening of this dataset-creation call was lost in the source; only its
# trailing keyword arguments survive)
#     batch_size=config.batch_size, device_num=device_num, rank_id=rank)
dataset_size = dataset.get_dataset_size()
print("Create dataset done! dataset_size = ", dataset_size)

net = Deeptext_VGG16(config=config)
net = net.set_train()

load_path = args_opt.pre_trained
if load_path != "":
    param_dict = load_checkpoint(load_path)
    load_param_into_net(net, param_dict)

loss = LossNet()
lr = Tensor(dynamic_lr(config, rank_size=device_num), mstype.float32)
opt = Momentum(params=net.trainable_params(), learning_rate=lr,
               momentum=config.momentum, weight_decay=config.weight_decay,
               loss_scale=config.loss_scale)
net_with_loss = WithLossCell(net, loss)
if args_opt.run_distribute:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale,
                           reduce_flag=True, mean=True, degree=device_num)
net = net.set_train()

load_path = args_opt.pre_trained
if load_path != "":
    param_dict = load_checkpoint(load_path)
    # When not resuming (pretrain_epoch_size == 0), keep only the backbone
    # and rcnn_mask weights from the checkpoint.
    if config.pretrain_epoch_size == 0:
        for item in list(param_dict.keys()):
            if not (item.startswith('backbone') or item.startswith('rcnn_mask')):
                param_dict.pop(item)
    load_param_into_net(net, param_dict)

loss = LossNet()
lr = Tensor(dynamic_lr(config, rank_size=device_num,
                       start_steps=config.pretrain_epoch_size * dataset_size),
            mstype.float32)
opt = Momentum(params=net.trainable_params(), learning_rate=lr,
               momentum=config.momentum, weight_decay=config.weight_decay,
               loss_scale=config.loss_scale)
net_with_loss = WithLossCell(net, loss)
if args_opt.run_distribute:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale,
                           reduce_flag=True, mean=True, degree=device_num)
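# Note: the TrainOneStepCell used here and in the neighboring scripts is the
# repo-local wrapper that accepts reduce_flag/mean/degree and builds a
# DistributedGradReducer internally; the stock nn.TrainOneStepCell takes only
# (network, optimizer, sens) and has no such keywords.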
device_num = 1
mindrecord_file = args.dataset_path
if not os.path.exists(mindrecord_file):
    print("dataset file {} does not exist, please check!".format(mindrecord_file))
    raise ValueError(mindrecord_file)

dataset = create_gru_dataset(epoch_count=config.num_epochs,
                             batch_size=config.batch_size,
                             dataset_path=mindrecord_file,
                             rank_size=device_num, rank_id=rank)
dataset_size = dataset.get_dataset_size()
print("dataset size is {}".format(dataset_size))

network = Seq2Seq(config)
network = GRUWithLossCell(network)
lr = dynamic_lr(config, dataset_size)
opt = Adam(network.trainable_params(), learning_rate=lr)
scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale_value,
                                        scale_factor=config.scale_factor,
                                        scale_window=config.scale_window)
update_cell = scale_manager.get_update_cell()
netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell)

time_cb = TimeMonitor(data_size=dataset_size)
loss_cb = LossCallBack(rank_id=rank)
cb = [time_cb, loss_cb]

# Save a checkpoint every config.ckpt_epoch epochs.
if config.save_checkpoint:
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.ckpt_epoch * dataset_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
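# For reference, the update rule DynamicLossScaleManager implements, written
# out as a plain-Python sketch (names here are illustrative, not the actual
# API): grow the scale by scale_factor after scale_window consecutive
# overflow-free steps, shrink it by scale_factor whenever overflow is seen.
def update_loss_scale_sketch(scale, overflow, good_steps,
                             scale_factor=2, scale_window=1000):
    if overflow:
        return max(scale / scale_factor, 1), 0  # shrink and reset the counter
    good_steps += 1
    if good_steps >= scale_window:
        return scale * scale_factor, 0          # grow and reset the counter
    return scale, good_steps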
# (the opening of this rename loop was lost in the source; the two loop
# headers below are reconstructed, and `key_mapping` stands for the
# old-name -> new-name dict assumed to be defined earlier)
for oldkey in list(param_dict.keys()):
    for k, v in key_mapping.items():
        if k in oldkey:
            newkey = oldkey.replace(k, v)
            param_dict[newkey] = param_dict.pop(oldkey)
            break

# Keep only backbone weights and recast them to float32 Parameters.
for item in list(param_dict.keys()):
    if not item.startswith('backbone'):
        param_dict.pop(item)
for key, value in param_dict.items():
    tensor = value.asnumpy().astype(np.float32)
    param_dict[key] = Parameter(tensor, key)
load_param_into_net(net, param_dict)

loss = LossNet()
lr = Tensor(dynamic_lr(config, dataset_size), mstype.float32)
opt = SGD(params=net.trainable_params(), learning_rate=lr,
          momentum=config.momentum, weight_decay=config.weight_decay,
          loss_scale=config.loss_scale)
net_with_loss = WithLossCell(net, loss)
if args_opt.run_distribute:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale,
                           reduce_flag=True, mean=True, degree=device_num)
else:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)
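# Example of the rename pass above with a hypothetical mapping entry
# key_mapping = {'down_sample_layer.1': 'bn_down_sample'}:
#   'backbone.layer1.0.down_sample_layer.1.gamma'
#   -> 'backbone.layer1.0.bn_down_sample.gamma'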