def train(): """Train function.""" args = get_args("train") if args.need_profiler: from mindspore.profiler.profiling import Profiler profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True) ds = create_dataset(args) G_A = get_generator(args) G_B = get_generator(args) D_A = get_discriminator(args) D_B = get_discriminator(args) load_ckpt(args, G_A, G_B, D_A, D_B) imgae_pool_A = ImagePool(args.pool_size) imgae_pool_B = ImagePool(args.pool_size) generator = Generator(G_A, G_B, args.lambda_idt > 0) loss_D = DiscriminatorLoss(args, D_A, D_B) loss_G = GeneratorLoss(args, generator, D_A, D_B) optimizer_G = nn.Adam(generator.trainable_params(), get_lr(args), beta1=args.beta1) optimizer_D = nn.Adam(loss_D.trainable_params(), get_lr(args), beta1=args.beta1) net_G = TrainOneStepG(loss_G, generator, optimizer_G) net_D = TrainOneStepD(loss_D, optimizer_D) data_loader = ds.create_dict_iterator() reporter = Reporter(args) reporter.info('==========start training===============') for _ in range(args.max_epoch): reporter.epoch_start() for data in data_loader: img_A = data["image_A"] img_B = data["image_B"] res_G = net_G(img_A, img_B) fake_A = res_G[0] fake_B = res_G[1] res_D = net_D(img_A, img_B, imgae_pool_A.query(fake_A), imgae_pool_B.query(fake_B)) reporter.step_end(res_G, res_D) reporter.visualizer(img_A, img_B, fake_A, fake_B) reporter.epoch_end(net_G) if args.need_profiler: profiler.analyse() break reporter.info('==========end training===============')
def eval_net():
    """Evaluate the network."""
    if config.dataset == 'MR':
        instance = MovieReview(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SUBJ':
        instance = Subjectivity(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    elif config.dataset == 'SST2':
        instance = SST2(root_dir=config.data_path, maxlen=config.word_len, split=0.9)
    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    if device_target == "Ascend":
        context.set_context(device_id=get_device_id())
    dataset = instance.create_test_dataset(batch_size=config.batch_size)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    net = TextCNN(vocab_len=instance.get_dict_len(), word_len=config.word_len,
                  num_classes=config.num_classes, vec_length=config.vec_length)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=0.001, weight_decay=float(config.weight_decay))
    param_dict = load_checkpoint(config.checkpoint_file_path)
    print("load checkpoint from [{}].".format(config.checkpoint_file_path))
    load_param_into_net(net, param_dict)
    net.set_train(False)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})
    acc = model.eval(dataset)
    print("accuracy: ", acc)
def get_train_optimizer(net, steps_per_epoch, args):
    """Generate the optimizer for updating the weights."""
    if args.optimizer == "Adam":
        lr = get_lr(lr_init=1e-4, lr_end=1e-6, lr_max=9e-4,
                    warmup_epochs=args.warmup_epochs, total_epochs=args.epoch_size,
                    steps_per_epoch=steps_per_epoch, lr_decay_mode="linear")
        lr = Tensor(lr)
        decayed_params = []
        no_decayed_params = []
        for param in net.trainable_params():
            if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
                decayed_params.append(param)
            else:
                no_decayed_params.append(param)
        group_params = [{'params': decayed_params, 'weight_decay': args.weight_decay},
                        {'params': no_decayed_params},
                        {'order_params': net.trainable_params()}]
        optimizer = nn.Adam(params=group_params, learning_rate=lr)
    else:
        raise ValueError("Unsupported optimizer.")
    return optimizer
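# get_lr above is assumed to build a per-step learning-rate schedule as a flat
# numpy array (warmup followed by linear decay). A minimal sketch of such a
# helper under that assumption -- not the repository's actual implementation:
import numpy as np

def linear_warmup_decay_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """Return one learning rate per global step: linear warmup, then linear decay."""
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            # ramp from lr_init up to lr_max over the warmup steps
            lr = lr_init + (lr_max - lr_init) * step / max(warmup_steps, 1)
        else:
            # decay linearly from lr_max down to lr_end over the remaining steps
            lr = lr_max - (lr_max - lr_end) * (step - warmup_steps) / max(total_steps - warmup_steps, 1)
        lr_each_step.append(lr)
    return np.array(lr_each_step).astype(np.float32)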
def test_svi_vae():
    # define the encoder and decoder
    encoder = Encoder()
    decoder = Decoder()
    # define the vae model
    vae = VAE(encoder, decoder, hidden_size=400, latent_size=20)
    # define the loss function
    net_loss = ELBO(latent_prior='Normal', output_prior='Normal')
    # define the optimizer
    optimizer = nn.Adam(params=vae.trainable_params(), learning_rate=0.001)
    # define the training dataset
    ds_train = create_dataset(image_path, 128, 1)
    net_with_loss = nn.WithLossCell(vae, net_loss)
    # define the variational inference
    vi = SVI(net_with_loss=net_with_loss, optimizer=optimizer)
    # run the vi to return the trained network
    vae = vi.run(train_dataset=ds_train, epochs=5)
    # get the trained loss
    trained_loss = vi.get_train_loss()
    # test function: generate_sample
    generated_sample = vae.generate_sample(64, IMAGE_SHAPE)
    # test function: reconstruct_sample
    for sample in ds_train.create_dict_iterator():
        sample_x = Tensor(sample['image'], dtype=mstype.float32)
        reconstructed_sample = vae.reconstruct_sample(sample_x)
    print('The loss of the trained network is ', trained_loss)
    print('The shape of the generated sample is ', generated_sample.shape)
    print('The shape of the reconstructed sample is ', reconstructed_sample.shape)
def __init__(self, network, total_steps=1, sens=16384.0):
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_train()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    lr = dynamic_lr(0.01, total_steps, 5000)
    self.optimizer = nn.Adam(self.weights, learning_rate=lr,
                             beta1=0.9, beta2=0.999, eps=1e-8, loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, mean, degree)
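# A wrapper like this is normally paired with a construct method that computes
# the loss, injects the loss scale as the gradient sensitivity, reduces the
# gradients across devices when running in parallel, and applies the optimizer.
# The original construct is not shown here; the following is a sketch of the
# conventional MindSpore one-step pattern, assuming F (mindspore.ops.functional)
# and P (mindspore.ops.operations) are imported:
def construct(self, *inputs):
    loss = self.network(*inputs)
    # use the loss scale as the sensitivity fed into backpropagation
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    grads = self.grad(self.network, self.weights)(*inputs, sens)
    if self.reducer_flag:
        # all-reduce the gradients in data/hybrid parallel mode
        grads = self.grad_reducer(grads)
    return F.depend(loss, self.optimizer(grads))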
def test_adam_group1():
    """test_adam_group_lr_and_weight_decay"""
    inputs = Tensor(np.ones([1, 64]).astype(np.float32))
    label = Tensor(np.zeros([1, 10]).astype(np.float32))
    net = Net()
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    net_with_loss = WithLossCell(net, loss)
    all_params = net.trainable_params()

    poly_decay_lr = polynomial_decay_lr(0.01, 0.0001, total_step=10, step_per_epoch=1,
                                        decay_epoch=3, power=1.0)
    group_params = [{'params': [all_params[0]], 'lr': poly_decay_lr, 'weight_decay': 0.9},
                    {'params': [all_params[1]]}]
    optimizer = nn.Adam(group_params, learning_rate=0.1)
    train_network = TrainOneStepCell(net_with_loss, optimizer)
    _executor.compile(train_network, inputs, label)
def train_mindspore_impl(self):
    input_ = Tensor(np.random.randn(self.batch_num, self.input_channels).astype(np.float32))
    weight_np = Tensor(np.random.randn(self.output_channels, self.input_channels).astype(np.float32))
    bias = Tensor(np.random.randn(self.output_channels).astype(np.float32))

    label_np = np.random.randint(self.output_channels, size=self.batch_num)
    label_np_onehot = np.zeros(shape=(self.batch_num, self.output_channels)).astype(np.float32)
    label_np_onehot[np.arange(self.batch_num), label_np] = 1.0
    label = Tensor(label_np_onehot)

    ms_dense = Dense(in_channels=self.input_channels,
                     out_channels=self.output_channels,
                     weight_init=weight_np, bias_init=bias, has_bias=True)
    criterion = SoftmaxCrossEntropyWithLogits()
    optimizer = nn.Adam(ms_dense.trainable_params(),
                        learning_rate=1e-3, beta1=0.9, beta2=0.999,
                        eps=self.epsilon, use_locking=False, use_nesterov=False,
                        weight_decay=0.0, loss_scale=1.0)
    net_with_criterion = WithLossCell(ms_dense, criterion)
    train_network = TrainOneStepCell(net_with_criterion, optimizer)
    train_network.set_train()
    print('MS Initialized!')
    for _ in range(self.epoch):
        train_network(input_, label)
    output = ms_dense(input_)
    print("===============output=================", output)
    return output.asnumpy()
def train():
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    epoch = 600
    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    dataset = datasetV2.create_icdar_train_dataset(train_img_path, train_gt_path,
                                                   batch_size=14, repeat_num=1,
                                                   is_training=True, num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()
    print("Create dataset done! dataset_size:", dataset_size)

    net = EAST_VGG.EAST()

    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)

    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)

    callback = [TimeMonitor(data_size=dataset_size), LossMonitor()]
    model = Model(net)
    dataset_sink_mode = False
    print("start training")
    model.train(epoch, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
def __init__(self, network, num_class, label, mask, learning_rate, l2_coeff):
    super(TrainGAT, self).__init__(auto_prefix=False)
    self.network = network
    loss_net = LossNetWrapper(network, num_class, label, mask, l2_coeff)
    optimizer = nn.Adam(loss_net.trainable_params(), learning_rate=learning_rate)
    self.loss_train_net = TrainOneStepCell(loss_net, optimizer)
    self.accuracy_func = MaskedAccuracy(num_class, label, mask)
def __init__(self, network, label, mask, config):
    super(TrainNetWrapper, self).__init__(auto_prefix=False)
    self.network = network
    loss_net = LossWrapper(network, label, mask, config.weight_decay)
    optimizer = nn.Adam(loss_net.trainable_params(), learning_rate=config.learning_rate)
    self.loss_train_net = TrainOneStepCell(loss_net, optimizer)
    self.accuracy = Accuracy(label, mask)
def test_vae_gan():
    vae_gan = VaeGan()
    net_loss = VaeGanLoss()
    optimizer = nn.Adam(params=vae_gan.trainable_params(), learning_rate=0.001)
    ds_train = create_dataset(image_path, 128, 1)
    net_with_loss = nn.WithLossCell(vae_gan, net_loss)
    vi = SVI(net_with_loss=net_with_loss, optimizer=optimizer)
    vae_gan = vi.run(train_dataset=ds_train, epochs=5)
def get_eval_optimizer(net, steps_per_epoch, args):
    lr = get_lr(lr_init=1e-3, lr_end=6e-6, lr_max=1e-2,
                warmup_epochs=5, total_epochs=args.epoch_size,
                steps_per_epoch=steps_per_epoch, lr_decay_mode="linear")
    lr = Tensor(lr)
    optimizer = nn.Adam(params=net.trainable_params(), learning_rate=lr)
    return optimizer
def train_net__(data_dir, seg_dir, run_distribute, config=None):
    train_data_size = 5
    print("train dataset length is:", train_data_size)
    network = UNet3d(config=config)
    loss = SoftmaxCrossEntropyWithLogits()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    network.set_train()
    network.to_float(mstype.float16)
    _do_keep_batchnorm_fp32(network)
    network = _add_loss_network(network, loss, mstype.float16)

    loss_scale = scale_manager.get_loss_scale()
    update_cell = scale_manager.get_update_cell()
    if update_cell is not None:
        model = nn.TrainOneStepWithLossScaleCell(network, optimizer,
                                                 scale_sense=update_cell).set_train()
    else:
        model = nn.TrainOneStepCell(network, optimizer, loss_scale).set_train()

    inputs = mindspore.Tensor(np.random.rand(1, 1, 224, 224, 96), mstype.float32)
    labels = mindspore.Tensor(np.random.rand(1, 4, 224, 224, 96), mstype.float32)
    step_per_epoch = train_data_size
    print("============== Starting Training ==============")
    for epoch_id in range(cfg.epoch_size):
        time_epoch = 0.0
        for step_id in range(step_per_epoch):
            time_start = time.time()
            loss = model(inputs, labels)
            loss = loss.asnumpy()
            time_end = time.time()
            time_step = time_end - time_start
            time_epoch = time_epoch + time_step
            print('Epoch: [%3d/%3d], step: [%5d/%5d], loss: [%6.4f], time: [%.4f]'
                  % (epoch_id, cfg.epoch_size, step_id, step_per_epoch, loss, time_step))
        print('Epoch time: %10.4f, per step time: %7.4f'
              % (time_epoch, time_epoch / step_per_epoch))
    print("============== End Training ==============")
def train_net(data_dir, cross_valid_ind=1, epochs=400, batch_size=16, lr=0.0001,
              run_distribute=False, cfg=None):
    if run_distribute:
        init()
        group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)
    net = UNet(n_channels=cfg['num_channels'], n_classes=cfg['num_classes'])
    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        load_param_into_net(net, param_dict)

    criterion = CrossEntropyWithLogits()
    train_dataset, _ = create_dataset(data_dir, epochs, batch_size, True,
                                      cross_valid_ind, run_distribute)
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_unet_medical_adam',
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)

    optimizer = nn.Adam(params=net.trainable_params(), learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])
    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)

    model = Model(net, loss_fn=criterion, loss_scale_manager=loss_scale_manager,
                  optimizer=optimizer, amp_level="O3")

    print("============== Starting Training ==============")
    model.train(1, train_dataset,
                callbacks=[StepLossTimeMonitor(batch_size=batch_size), ckpoint_cb],
                dataset_sink_mode=False)
    print("============== End Training ==============")
def test_sit_embedding_lookup_net():
    indices = Tensor(np.array([0, 1, 2]).astype(np.int32))
    label = Tensor(np.random.randn(3, 8).astype(np.float32))
    net1 = NetWithEmbeddingLookUp(vocab_size=8, embedding_size=8, target="CPU")
    loss = nn.SoftmaxCrossEntropyWithLogits(reduction="mean")
    optimizer1 = nn.Adam(params=net1.trainable_params(), learning_rate=0.1)
    optimizer1.unique = True
    train_network1 = TrainOneStepCell(WithLossCell(net1, loss), optimizer1)
    train_network1.set_train()
    out1 = train_network1(indices, label)

    net2 = NetWithEmbeddingLookUp(vocab_size=8, embedding_size=8, target="CPU")
    optimizer2 = nn.Adam(params=net2.trainable_params(), learning_rate=0.1)
    optimizer2.unique = False
    optimizer2.target = "CPU"
    train_network2 = TrainOneStepCell(WithLossCell(net2, loss), optimizer2)
    train_network2.set_train()
    out2 = train_network2(indices, label)

    assert np.allclose(out1.asnumpy(), out2.asnumpy(), 0.001, 0.001)
def test_embedding_lookup_with_mix_precision():
    data = Tensor(np.array([0, 1, 2]).astype(np.int32))
    label = Tensor(np.random.randn(*(2, 3, 2, 2)).astype(np.float32))
    net = EmbeddingLookUpBnNet(8, 8, target='CPU')

    criterion = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
    optimizer = nn.Adam(params=net.trainable_params(), learning_rate=0.1)
    optimizer.sparse_opt.add_prim_attr("primitive_target", "CPU")
    train_network = ms.amp.build_train_network(net, optimizer, criterion, level="O2")
    train_network.set_train()
    for _ in range(2):
        train_network(data, label)
def main():
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')

    epoch_size = 1
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    repeat_size = 1

    # Define model parameters
    z_dim = 40
    x_dim = 32 * 32

    # create the network
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)

    # define the loss and the learning rate
    lr = 0.001
    net_loss = ReduceMeanLoss()

    # define the optimizer
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), lr)

    model = Model(network, net_loss, net_opt)

    ds_train = create_dataset(os.path.join(mnist_path, "train"), batch_size, repeat_size)
    model.train(epoch_size, ds_train, callbacks=[LossMonitor()], dataset_sink_mode=False)

    print(network.trainable_params()[0])

    iterator = ds_train.create_tuple_iterator()
    for item in iterator:
        batch_x = item[0].reshape(32, 32 * 32)
        break

    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    sample = sample.asnumpy()
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample, 'result/reconstruct_x.png')

    for i in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        sample = sample.asnumpy()
        samples = sample if i == 0 else np.concatenate([samples, sample], axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    train_dataset = create_dataset_diy()
    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)
    loss = SoftmaxCrossEntropyWithLogits()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    network.set_train()

    model = Model(network, loss_fn=loss, optimizer=optimizer,
                  loss_scale_manager=scale_manager, amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset, callbacks=callbacks_list,
                dataset_sink_mode=False)
    print("============== End Training ==============")
def train_mindspore_impl(self, indices, epoch, batch_size, use_parallel=True):
    ds = FakeData(size=8, batch_size=batch_size, num_class=8,
                  image_size=(), use_parallel=use_parallel)
    ds.set_image_data_type(np.int32)
    net = self
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = nn.Adam(net.trainable_params())
    optimizer.target = "CPU"
    model = Model(net, loss, optimizer)
    for _ in range(epoch):
        model.train(1, ds, dataset_sink_mode=False)
    output = net(indices)
    return output
def __init__(self, network, neg_item_num, l2_embed, learning_rate, epsilon, dist_reg=0.002):
    super(TrainBGCF, self).__init__(auto_prefix=False)
    self.network = network
    loss_net = LossWrapper(network, neg_item_num, l2_embed, dist_reg)
    optimizer = nn.Adam(loss_net.trainable_params(),
                        learning_rate=learning_rate, eps=epsilon)
    self.loss_train_net = TrainOneStepCell(loss_net, optimizer)
def TrainWrap(net, loss_fn=None, optimizer=None, weights=None):
    """TrainWrap"""
    if loss_fn is None:
        loss_fn = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)
    loss_net = nn.WithLossCell(net, loss_fn)
    loss_net.set_train()
    if weights is None:
        weights = ParameterTuple(net.trainable_params())
    if optimizer is None:
        optimizer = nn.Adam(weights, learning_rate=0.003, beta1=0.9, beta2=0.999,
                            eps=1e-5, use_locking=False, use_nesterov=False,
                            weight_decay=4e-5, loss_scale=1.0)
    train_net = nn.TrainOneStepCell(loss_net, optimizer)
    return train_net
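# Example use of TrainWrap with a minimal dense classifier -- an illustrative
# sketch only; the stand-in model and random data below are not from the
# original code:
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

example_net = nn.Dense(64, 10)              # stand-in model
example_train_net = TrainWrap(example_net)  # defaults: sparse softmax CE + Adam
data = Tensor(np.random.randn(4, 64).astype(np.float32))
label = Tensor(np.random.randint(0, 10, size=(4,)).astype(np.int32))
loss = example_train_net(data, label)       # one optimizer step; returns the loss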
def csd_train(train_loader, net, opt):
    set_seed(1)
    device_id = int(os.getenv('DEVICE_ID', '0'))
    print("[CSD] Start Training...")

    step_size = train_loader.get_dataset_size()
    lr = []
    for i in range(0, opt.epochs):
        cur_lr = opt.lr / (2 ** ((i + 1) // 200))
        lr.extend([cur_lr] * step_size)
    optim = nn.Adam(net.trainable_params(), learning_rate=lr, loss_scale=opt.loss_scale)

    net_with_loss = NetWithCSDLossCell(net, args.contra_lambda, args.neg_num)
    train_cell = TrainOneStepCell(net_with_loss, optim)
    net.set_train()

    for epoch in range(0, opt.epochs):
        epoch_loss = 0
        for iteration, batch in enumerate(train_loader.create_dict_iterator(), 1):
            lr_img = batch["LR"]
            hr_img = batch["HR"]
            loss = train_cell(lr_img, hr_img, Tensor(opt.stu_width_mult), Tensor(1.0))
            epoch_loss += loss
        print(f"Epoch[{epoch}] loss: {epoch_loss.asnumpy()}")
        if epoch % 10 == 0:
            print('===> Saving model...')
            save_checkpoint(net, f'./ckpt/{opt.filename}.ckpt')
def __init__(self, args, loader, my_model):
    self.args = args
    self.scale = args.scale
    self.trainloader = loader
    self.model = my_model
    self.model.set_train()
    self.criterion = nn.L1Loss()
    self.con_loss = SupConLoss()
    self.optimizer = nn.Adam(self.model.trainable_params(),
                             learning_rate=args.lr, loss_scale=1024.0)
    self.train_net = MyTrain(self.model, self.criterion, self.con_loss, use_con=args.con_loss)
    self.bp = MyTrainOneStepCell(self.train_net, self.optimizer, 1024.0)
def train(Net):
    ds_train, ds_test = create_dataset()
    # build the network
    network = Net(cfg.num_classes)
    # define the loss function and the optimizer
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Adam(network.trainable_params(), cfg.lr)
    # train the model
    model = Model(network, loss_fn=net_loss, optimizer=net_opt, metrics={'acc': Accuracy()})
    loss_cb = LossMonitor()
    print("============== Starting Training ==============")
    model.train(30, ds_train, callbacks=[loss_cb], dataset_sink_mode=True)
    # evaluate
    metric = model.eval(ds_test)
    print(metric)
    return model
def test_adam_group2():
    """test_adam_group_lr_and_weight_decay"""
    inputs = Tensor(np.ones([1, 64]).astype(np.float32))
    label = Tensor(np.zeros([1, 10]).astype(np.float32))
    net = Net()
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    net_with_loss = WithLossCell(net, loss)
    all_params = net.trainable_params()

    schedule_lr = lr_schedules.PolynomialDecayLR(0.01, 0.0001, 3, power=1.0)
    group_params = [{'params': [all_params[0]], 'lr': 0.02, 'weight_decay': 0.9},
                    {'params': [all_params[1]]}]
    optimizer = nn.Adam(group_params, learning_rate=schedule_lr)
    train_network = TrainOneStepCell(net_with_loss, optimizer)
    _executor.compile(train_network, inputs, label)
def test_svi_cvae():
    # define the encoder and decoder
    encoder = Encoder(num_classes=10)
    decoder = Decoder()
    # define the cvae model
    cvae = ConditionalVAE(encoder, decoder, hidden_size=400, latent_size=20, num_classes=10)
    # define the loss function
    net_loss = ELBO(latent_prior='Normal', output_prior='Normal')
    # define the optimizer
    optimizer = nn.Adam(params=cvae.trainable_params(), learning_rate=0.001)
    # define the training dataset
    ds_train = create_dataset(image_path, 128, 1)
    # define the modified WithLossCell
    net_with_loss = CVAEWithLossCell(cvae, net_loss)
    # define the variational inference
    vi = SVI(net_with_loss=net_with_loss, optimizer=optimizer)
    # run the vi to return the trained network
    cvae = vi.run(train_dataset=ds_train, epochs=5)
    # get the trained loss
    trained_loss = vi.get_train_loss()
    # test function: generate_sample
    sample_label = Tensor([i for i in range(0, 8)] * 8, dtype=mstype.int32)
    generated_sample = cvae.generate_sample(sample_label, 64, IMAGE_SHAPE)
    # test function: reconstruct_sample
    for sample in ds_train.create_dict_iterator(output_numpy=True, num_epochs=1):
        sample_x = Tensor(sample['image'], dtype=mstype.float32)
        sample_y = Tensor(sample['label'], dtype=mstype.int32)
        reconstructed_sample = cvae.reconstruct_sample(sample_x, sample_y)
    print('The loss of the trained network is ', trained_loss)
    print('The shape of the generated sample is ', generated_sample.shape)
    print('The shape of the reconstructed sample is ', reconstructed_sample.shape)
dataset = create_yolo_dataset(mindrecord_file, repeat_num=args_opt.epoch_size,
                              batch_size=args_opt.batch_size, device_num=device_num, rank=rank)
dataset_size = dataset.get_dataset_size()
print("Create dataset done!")

net = yolov3_resnet18(ConfigYOLOV3ResNet18())
net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
init_net_param(net, "XavierUniform")

# checkpoint
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)

lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0,
                   global_step=args_opt.epoch_size * dataset_size,
                   decay_step=1000, decay_rate=0.95, steps=True))
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
net = TrainingWrapper(net, opt, loss_scale)

if args_opt.checkpoint_path != "":
    param_dict = load_checkpoint(args_opt.checkpoint_path)
    load_param_into_net(net, param_dict)

callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
model = Model(net)
dataset_sink_mode = False
if args_opt.mode == "sink":
    print("In sink mode, one epoch returns a loss.")
    dataset_sink_mode = True
print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
if __name__ == "__main__": # define the encoder and decoder encoder = Encoder(num_classes=10) decoder = Decoder() # define the cvae model cvae = ConditionalVAE(encoder, decoder, hidden_size=400, latent_size=20, num_classes=10) # define the loss function net_loss = ELBO(latent_prior='Normal', output_prior='Normal') # define the optimizer optimizer = nn.Adam(params=cvae.trainable_params(), learning_rate=0.001) # define the training dataset ds_train = create_dataset(image_path, 128, 1) # define the WithLossCell modified net_with_loss = WithLossCell(cvae, net_loss) # define the variational inference vi = SVI(net_with_loss=net_with_loss, optimizer=optimizer) # run the vi to return the trained network. cvae = vi.run(train_dataset=ds_train, epochs=10) # get the trained loss trained_loss = vi.get_train_loss() # test function: generate_sample sample_label = Tensor([i for i in range(0, 8)] * 8, dtype=mstype.int32) generated_sample = cvae.generate_sample(sample_label, 64, IMAGE_SHAPE) # test function: reconstruct_sample for sample in ds_train.create_dict_iterator():
def main():
    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset", type=bool, default=False,
                        help="If set it true, only create Mindrecord, default is false.")
    parser.add_argument("--distribute", type=bool, default=False,
                        help="Run distribute, default is false.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate, default is 0.001.")
    parser.add_argument("--mode", type=str, default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10.")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None,
                        help="Pretrained checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
    parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord",
                        help="Mindrecord directory. If the mindrecord_dir is empty, it will "
                             "generate mindrecord files by image_dir and anno_path. Note if "
                             "mindrecord_dir isn't empty, it will use mindrecord_dir rather than "
                             "image_dir and anno_path. Default is ./Mindrecord_train")
    parser.add_argument('--data_url', type=str, default=None, help='Dataset path')
    parser.add_argument('--train_url', type=str, default=None, help='Train output path')
    parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
    args_opt = parser.parse_args()

    device_id = int(os.getenv('DEVICE_ID'))
    device_num = int(os.getenv('RANK_SIZE'))
    rankid = int(os.getenv('RANK_ID'))

    local_data_url = '/cache/data'
    local_train_url = '/cache/ckpt'
    local_anno_url = '/cache/anno'
    local_mindrecord_url = '/cache/mindrecord'
    mox.file.copy_parallel(args_opt.mindrecord_dir, local_mindrecord_url)

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id)
    if args_opt.distribute:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True, device_num=device_num)
        init()
        rank = rankid
        local_train_url = os.path.join(local_train_url, str(device_id))
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")
    # It will generate mindrecord files in args_opt.mindrecord_dir,
    # named yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(local_mindrecord_url):
        os.makedirs(local_mindrecord_url)

    prefix = "train.mindrecord"
    mindrecord_file = os.path.join(local_mindrecord_url, prefix + "0")
    if not os.path.exists(mindrecord_file):
        mox.file.copy_parallel(args_opt.data_url, local_data_url)
        if args_opt.anno_path:
            anno_file = os.path.join(local_anno_url, os.path.split(args_opt.anno_path)[1])
            mox.file.copy_parallel(args_opt.anno_path, anno_file)
        if os.path.isdir(local_data_url) or os.path.exists(anno_file):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(local_data_url, anno_file, local_mindrecord_url,
                                          prefix=prefix, file_num=8)
            print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
            mox.file.copy_parallel(local_mindrecord_url, args_opt.mindrecord_dir)
        else:
            print("image_dir or anno_path does not exist.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)
        # When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
        dataset = create_yolo_dataset(mindrecord_file, repeat_num=args_opt.epoch_size,
                                      batch_size=args_opt.batch_size,
                                      device_num=device_num, rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=local_train_url,
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        lr = Tensor(get_lr(learning_rate=args_opt.lr,
                           start_step=args_opt.pre_trained_epoch_size * dataset_size,
                           global_step=total_epoch_size * dataset_size,
                           decay_step=1000, decay_rate=0.95, steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                      loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch returns a loss.")
            dataset_sink_mode = True
        print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size, dataset, callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)

    if device_id == 1:
        mox.file.copy_parallel(local_train_url, args_opt.train_url)
def main():
    set_seed(1)
    date = time.strftime("%Y%m%d%H%M%S", time.localtime())
    print(f'* Preparing to train model {date}')

    # ************** configuration ****************
    # - training settings
    resume = config['resume']
    if config['mode'] == 'PYNATIVE':
        mode = context.PYNATIVE_MODE
    else:
        mode = context.GRAPH_MODE
    device = config['device']
    device_id = config['device_id']
    dataset_sink_mode = config['dataset_sink_mode']

    # used in dataset
    div = 8

    # setting bias and padding
    if resume:
        print('* Resuming model...')
        resume_config_log = config['resume_config_log']
        resume_config = get_eval_config(resume_config_log)
        if 'best_ckpt' in resume_config.keys():
            resume_model_path = resume_config['best_ckpt']
        else:
            resume_model_path = resume_config['latest_model']
            print('* [WARNING] Not using the best model, but the latest saved model instead.')
        has_bias = resume_config['has_bias']
        use_dropout = resume_config['use_dropout']
        pad_mode = resume_config['pad_mode']
        if pad_mode == 'pad':
            padding = resume_config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")
        best_acc = resume_config['best_acc']
        best_ckpt = resume_config['best_ckpt']
        print('* The best accuracy on the dev dataset for the currently resumed model is {:.2f}%'
              .format(best_acc * 100))
    else:
        has_bias = config['has_bias']
        use_dropout = config['use_dropout']
        pad_mode = config['pad_mode']
        if pad_mode == 'pad':
            padding = config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")

    # hyper-parameters
    if resume:
        batch_size = resume_config['batch_size']
        opt_type = resume_config['opt']
        use_dynamic_lr = resume_config['use_dynamic_lr']
        warmup_step = resume_config['warmup_step']
        warmup_ratio = resume_config['warmup_ratio']
    else:
        batch_size = config['batch_size']
        opt_type = config['opt']
        use_dynamic_lr = config['use_dynamic_lr']
        warmup_step = config['warmup_step']
        warmup_ratio = config['warmup_ratio']
    test_dev_batch_size = config['test_dev_batch_size']
    learning_rate = float(config['learning_rate'])
    epochs = config['epochs']
    loss_scale = config['loss_scale']

    # configuration for saving model checkpoints
    save_checkpoint_steps = config['save_checkpoint_steps']
    keep_checkpoint_max = config['keep_checkpoint_max']
    prefix = config['prefix'] + '_' + date
    model_dir = config['model_dir']

    # loss monitor
    loss_monitor_step = config['loss_monitor_step']

    # whether to use MindInsight summary
    use_summary = config['use_summary']

    # step eval
    use_step_eval = config['use_step_eval']
    eval_step = config['eval_step']
    eval_epoch = config['eval_epoch']
    patience = config['patience']

    # eval in steps or epochs
    step_eval = True
    if eval_step == -1:
        step_eval = False
    # ************** end of configuration **************

    if device == 'GPU':
        context.set_context(mode=mode, device_target=device, device_id=device_id)
    elif device == 'Ascend':
        import moxing as mox
        from utils.const import DATA_PATH, MODEL_PATH, BEST_MODEL_PATH, LOG_PATH
        obs_datapath = config['obs_datapath']
        obs_saved_model = config['obs_saved_model']
        obs_best_model = config['obs_best_model']
        obs_log = config['obs_log']
        mox.file.copy_parallel(obs_datapath, DATA_PATH)
        mox.file.copy_parallel(MODEL_PATH, obs_saved_model)
        mox.file.copy_parallel(BEST_MODEL_PATH, obs_best_model)
        mox.file.copy_parallel(LOG_PATH, obs_log)
        context.set_context(mode=mode, device_target=device)
        use_summary = False

    # callback functions
    callbacks = []

    # data
    train_loader, idx2label, label2idx = get_dataset(batch_size=batch_size, phase='train',
                                                     test_dev_batch_size=test_dev_batch_size,
                                                     div=div, num_parallel_workers=4)
    if eval_step == 0:
        eval_step = train_loader.get_dataset_size()

    # network
    net = DFCNN(num_classes=len(label2idx), padding=padding, pad_mode=pad_mode,
                has_bias=has_bias, use_dropout=use_dropout)

    # criterion
    criterion = CTCLoss()

    # resume
    if resume:
        print("* Loading parameters...")
        param_dict = load_checkpoint(resume_model_path)
        # load the parameters into the net
        load_param_into_net(net, param_dict)
        print(f'* Parameters loading from {resume_model_path} succeeded!')

    net.set_train(True)
    net.set_grad(True)

    # lr schedule
    if use_dynamic_lr:
        dataset_size = train_loader.get_dataset_size()
        learning_rate = Tensor(dynamic_lr(base_lr=learning_rate, warmup_step=warmup_step,
                                          warmup_ratio=warmup_ratio, epochs=epochs,
                                          steps_per_epoch=dataset_size), mstype.float32)
        print('* Using a dynamic learning rate, which will be set up as:', learning_rate.asnumpy())

    # optimizer
    if opt_type == 'adam':
        opt = nn.Adam(net.trainable_params(), learning_rate=learning_rate,
                      beta1=0.9, beta2=0.999, weight_decay=0.0, eps=10e-8)
    elif opt_type == 'rms':
        opt = nn.RMSProp(params=net.trainable_params(), centered=True,
                         learning_rate=learning_rate, momentum=0.9, loss_scale=loss_scale)
    elif opt_type == 'sgd':
        opt = nn.SGD(params=net.trainable_params(), learning_rate=learning_rate)
    else:
        raise ValueError(f"optimizer: {opt_type} is not supported for now!")

    if resume:
        # load the parameters into the optimizer
        load_param_into_net(opt, param_dict)

    # save model
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix=prefix, directory=model_dir, config=config_ck)

    # logger
    the_logger = logger(config, date)
    log = Logging(logger=the_logger, model_ckpt=ckpt_cb)

    callbacks.append(ckpt_cb)
    callbacks.append(log)

    net = WithLossCell(net, criterion)
    scaling_sens = Tensor(np.full((1), loss_scale), dtype=mstype.float32)
    net = DFCNNCTCTrainOneStepWithLossScaleCell(net, opt, scaling_sens)
    net.set_train(True)
    model = Model(net)

    if use_step_eval:
        # step evaluation
        step_eval = StepAccInfo(model=model, name=prefix, div=div,
                                test_dev_batch_size=test_dev_batch_size,
                                step_eval=step_eval, eval_step=eval_step,
                                eval_epoch=eval_epoch, logger=the_logger,
                                patience=patience,
                                dataset_size=train_loader.get_dataset_size())
        callbacks.append(step_eval)

    # loss monitor
    loss_monitor = LossMonitor(loss_monitor_step)
    callbacks.append(loss_monitor)

    if use_summary:
        summary_dir = os.path.join(SUMMARY_DIR, date)
        if not os.path.exists(summary_dir):
            os.mkdir(summary_dir)
        # MindInsight
        summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=1,
                                             max_file_size=4 * 1024 ** 3)
        callbacks.append(summary_collector)

    if resume:
        the_logger.update_acc_ckpt(best_acc, best_ckpt)

    print('* Start training...')
    model.train(epochs, train_loader, callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)