def train():
    """Train ETSNet from a pretrained checkpoint.

    Reads all settings from the module-level ``args`` and ``config`` objects.
    In distributed mode, initializes data-parallel training and derives the
    checkpoint directory from the process rank.

    Fix: removed the bare expression statement ``config.TRAIN_MODEL_SAVE_PATH``,
    which evaluated an attribute and discarded the result (a no-op).
    NOTE(review): that dead line suggests the checkpoint directory was perhaps
    meant to come from config instead of the hard-coded "./ckpt_{rank}" — the
    hard-coded behavior is preserved here; confirm against the original intent.
    """
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(device_num=args.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          parameter_broadcast=True)
        init()
        rank_id = get_rank()

    # dataset / network / criterion / optimizer
    ds = train_dataset_creator(args.device_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    # Load pretrained weights before wrapping the network.
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load Pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)

    lrs = lr_generator(start_lr=1e-3, lr_scale=0.1,
                       total_iters=config.TRAIN_TOTAL_ITER)
    opt = nn.SGD(params=net.trainable_params(), learning_rate=lrs,
                 momentum=0.99, weight_decay=5e-4)

    # Wrap model: loss cell, then (optionally distributed) one-step train cell.
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net, opt, reduce_flag=True,
                               mean=True, degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # Checkpointing: per-rank directory so distributed workers don't collide.
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet", config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))

    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM, ds, dataset_sink_mode=True,
                callbacks=[time_cb, loss_cb, ckpoint_cb])
def test_suppress_model_with_pynative_mode():
    """Smoke-test suppress-privacy training of LeNet5 under PyNATIVE mode."""
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")

    # Training hyper-parameters for this short run.
    epochs = 5
    batch_num = 10
    mask_times = 10
    lr = 0.01

    lenet = LeNet5()

    # Suppress only the first conv layer's weights.
    mask_layers = [MaskLayerDes("conv1.weight", 0, False, False, -1)]
    suppress_ctrl = SuppressPrivacyFactory().create(lenet,
                                                    mask_layers,
                                                    policy="local_train",
                                                    end_epoch=epochs,
                                                    batch_num=batch_num,
                                                    start_epoch=1,
                                                    mask_times=mask_times,
                                                    lr=lr,
                                                    sparse_end=0.50,
                                                    sparse_start=0.0)

    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = nn.SGD(lenet.trainable_params(), lr)

    model = SuppressModel(network=lenet,
                          loss_fn=loss_fn,
                          optimizer=optimizer,
                          metrics={"Accuracy": Accuracy()})
    model.link_suppress_ctrl(suppress_ctrl)

    # The masker enforces the suppress operation during training.
    masker = SuppressMasker(model=model, suppress_ctrl=suppress_ctrl)

    ckpt_config = CheckpointConfig(save_checkpoint_steps=batch_num,
                                   keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                              directory="./trained_ckpt_file/",
                              config=ckpt_config)

    train_set = ds.GeneratorDataset(dataset_generator, ['data', 'label'])
    model.train(epochs, train_set,
                callbacks=[ckpt_cb, LossMonitor(), masker],
                dataset_sink_mode=False)
def mnist_suppress_train(epoch_size=10, start_epoch=3, lr=0.05, samples=10000,
                         mask_times=1000, sparse_thd=0.90, sparse_start=0.0,
                         masklayers=None):
    """Local training of LeNet5 with suppress-based privacy, then evaluation.

    Args:
        epoch_size (int): total number of training epochs.
        start_epoch (int): epoch at which suppression starts.
        lr (float): learning rate for SGD and the suppress controller.
        samples (int): number of training samples drawn from MNIST.
        mask_times (int): how many times masking is applied.
        sparse_thd (float): target (end) sparsity of the suppression schedule.
        sparse_start (float): initial sparsity of the suppression schedule.
        masklayers (list | None): MaskLayerDes entries selecting layers to mask.

    Fix: the final accuracy print used ``print(fmt, acc)``, which passes the
    format string and the value as two separate arguments — ``%s`` was never
    substituted. Now formats with the ``%`` operator.
    """
    networks_l5 = LeNet5()
    suppress_ctrl_instance = SuppressPrivacyFactory().create(
        networks_l5,
        masklayers,
        policy="local_train",
        end_epoch=epoch_size,
        batch_num=int(samples / cfg.batch_size),
        start_epoch=start_epoch,
        mask_times=mask_times,
        lr=lr,
        sparse_end=sparse_thd,
        sparse_start=sparse_start)

    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.SGD(networks_l5.trainable_params(), lr)

    # Save a checkpoint once per epoch (steps per epoch = samples / batch_size).
    config_ck = CheckpointConfig(save_checkpoint_steps=int(samples / cfg.batch_size),
                                 keep_checkpoint_max=10)

    # Create the SuppressModel model for training.
    model_instance = SuppressModel(network=networks_l5,
                                   loss_fn=net_loss,
                                   optimizer=net_opt,
                                   metrics={"Accuracy": Accuracy()})
    model_instance.link_suppress_ctrl(suppress_ctrl_instance)

    # Create a Masker for Suppress training. The function of the Masker is to
    # enforce suppress operation while training.
    suppress_masker = SuppressMasker(model=model_instance,
                                     suppress_ctrl=suppress_ctrl_instance)

    mnist_path = "./MNIST_unzip/"
    ds_train = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                      batch_size=cfg.batch_size,
                                      repeat_size=1,
                                      samples=samples)

    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory="./trained_ckpt_file/",
                                 config=config_ck)

    print("============== Starting SUPP Training ==============")
    model_instance.train(epoch_size, ds_train,
                         callbacks=[ckpoint_cb, LossMonitor(), suppress_masker],
                         dataset_sink_mode=False)

    print("============== Starting SUPP Testing ==============")
    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, 'test'),
                                     batch_size=cfg.batch_size)
    acc = model_instance.eval(ds_eval, dataset_sink_mode=False)
    # Bug fix: format the accuracy into the banner instead of printing a tuple.
    print("============== SUPP Accuracy: %s ==============" % acc)
    suppress_ctrl_instance.print_paras()
# define lr lr_init = cf.learning_rate if not args_opt.run_distribute else cf.learning_rate * device_num * lr_scale lr = get_lr(cf.epoch_size, step_size, lr_init) loss = CTCLoss(max_sequence_length=cf.captcha_width, max_label_length=max_captcha_digits, batch_size=cf.batch_size) if args_opt.platform == 'Ascend': net = StackedRNN(input_size=input_size, batch_size=cf.batch_size, hidden_size=cf.hidden_size) else: net = StackedRNNForGPU(input_size=input_size, batch_size=cf.batch_size, hidden_size=cf.hidden_size) opt = nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=cf.momentum) net = WithLossCell(net, loss) net = TrainOneStepCellWithGradClip(net, opt).set_train() # define model model = Model(net) # define callbacks callbacks = [LossMonitor(), TimeMonitor(data_size=step_size)] if cf.save_checkpoint: config_ck = CheckpointConfig( save_checkpoint_steps=cf.save_checkpoint_steps, keep_checkpoint_max=cf.keep_checkpoint_max) save_ckpt_path = os.path.join(cf.save_checkpoint_path, 'ckpt_' + str(rank) + '/') ckpt_cb = ModelCheckpoint(prefix="warpctc",
import sys
import numpy as np
from train_utils import SaveInOut, TrainWrap
from effnet import effnet
import mindspore.common.dtype as mstype
from mindspore import context, Tensor, nn
from mindspore.train.serialization import export

# Export an EfficientNet training graph to MINDIR for on-device training.
context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU",
                    save_graphs=False)

backbone = effnet(num_classes=10)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
sgd = nn.SGD(backbone.trainable_params(),
             learning_rate=0.01,
             momentum=0.9,
             dampening=0.0,
             weight_decay=0.0,
             nesterov=True,
             loss_scale=1.0)
train_net = TrainWrap(backbone, loss, sgd)

# Dummy batch (2 images, 10 one-hot labels) used only to trace the graph.
inputs = Tensor(np.random.randn(2, 3, 224, 224), mstype.float32)
targets = Tensor(np.zeros([2, 10]).astype(np.float32))
export(train_net, inputs, targets,
       file_name="mindir/effnet_train", file_format='MINDIR')

# Optionally dump sample inputs/outputs next to the exported model.
if len(sys.argv) > 1:
    SaveInOut(sys.argv[1] + "effnet", inputs, targets, backbone, train_net)
def main():
    """Train a DFCNN + CTC acoustic model, driven entirely by the module-level
    ``config`` dict; supports resuming from a previous run's config log."""
    set_seed(1)
    # Timestamp tags this run's checkpoint prefix and summary directory.
    date = time.strftime("%Y%m%d%H%M%S", time.localtime())
    print(f'* Preparing to train model {date}')
    # ************** configuration ****************
    # - training setting
    resume = config['resume']
    if config['mode'] == 'PYNATIVE':
        mode = context.PYNATIVE_MODE
    else:
        mode = context.GRAPH_MODE
    device = config['device']
    device_id = config['device_id']
    dataset_sink_mode = config['dataset_sink_mode']
    # use in dataset
    div = 8
    # setting bias and padding
    if resume:
        # Resumed runs take their architecture settings from the saved
        # config log so the checkpoint matches the rebuilt network.
        print('* Resuming model...')
        resume_config_log = config['resume_config_log']
        resume_config = get_eval_config(resume_config_log)
        if 'best_ckpt' in resume_config.keys():
            resume_model_path = resume_config['best_ckpt']
        else:
            resume_model_path = resume_config['latest_model']
            print('* [WARNING] Not using the best model, but latest saved model instead.')
        has_bias = resume_config['has_bias']
        use_dropout = resume_config['use_dropout']
        pad_mode = resume_config['pad_mode']
        if pad_mode == 'pad':
            padding = resume_config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")
        best_acc = resume_config['best_acc']
        best_ckpt = resume_config['best_ckpt']
        print('* The best accuracy in dev dataset for the current resumed model is {:.2f}%'.format(best_acc * 100))
    else:
        has_bias = config['has_bias']
        use_dropout = config['use_dropout']
        pad_mode = config['pad_mode']
        if pad_mode == 'pad':
            padding = config['padding']
        elif pad_mode == 'same':
            padding = 0
        else:
            raise ValueError(f"invalid pad mode: {pad_mode}!")
    # hyper-parameters
    if resume:
        batch_size = resume_config['batch_size']
        opt_type = resume_config['opt']
        use_dynamic_lr = resume_config['use_dynamic_lr']
        warmup_step = resume_config['warmup_step']
        warmup_ratio = resume_config['warmup_ratio']
    else:
        batch_size = config['batch_size']
        opt_type = config['opt']
        use_dynamic_lr = config['use_dynamic_lr']
        warmup_step = config['warmup_step']
        warmup_ratio = config['warmup_ratio']
    test_dev_batch_size = config['test_dev_batch_size']
    learning_rate = float(config['learning_rate'])
    epochs = config['epochs']
    loss_scale = config['loss_scale']
    # configuration of saving model checkpoint
    save_checkpoint_steps = config['save_checkpoint_steps']
    keep_checkpoint_max = config['keep_checkpoint_max']
    prefix = config['prefix'] + '_' + date
    model_dir = config['model_dir']
    # loss monitor
    loss_monitor_step = config['loss_monitor_step']
    # whether to use mindInsight summary
    use_summary = config['use_summary']
    # step_eval
    use_step_eval = config['use_step_eval']
    eval_step = config['eval_step']
    eval_epoch = config['eval_epoch']
    patience = config['patience']
    # eval in steps or epochs: eval_step == -1 means evaluate per-epoch.
    step_eval = True
    if eval_step == -1:
        step_eval = False
    # ************** end of configuration **************
    if device == 'GPU':
        context.set_context(mode=mode, device_target=device, device_id=device_id)
    elif device == 'Ascend':
        # On Ascend (ModelArts), sync data/model/log dirs with OBS storage.
        import moxing as mox
        from utils.const import DATA_PATH, MODEL_PATH, BEST_MODEL_PATH, LOG_PATH
        obs_datapath = config['obs_datapath']
        obs_saved_model = config['obs_saved_model']
        obs_best_model = config['obs_best_model']
        obs_log = config['obs_log']
        mox.file.copy_parallel(obs_datapath, DATA_PATH)
        mox.file.copy_parallel(MODEL_PATH, obs_saved_model)
        mox.file.copy_parallel(BEST_MODEL_PATH, obs_best_model)
        mox.file.copy_parallel(LOG_PATH, obs_log)
        context.set_context(mode=mode, device_target=device)
        # Summary collection is disabled on Ascend in this setup.
        use_summary = False
    # callbacks function
    callbacks = []
    # data
    train_loader, idx2label, label2idx = get_dataset(batch_size=batch_size, phase='train',
                                                     test_dev_batch_size=test_dev_batch_size,
                                                     div=div, num_parallel_workers=4)
    # eval_step == 0 means "evaluate once per dataset pass".
    if eval_step == 0:
        eval_step = train_loader.get_dataset_size()
    # network
    net = DFCNN(num_classes=len(label2idx), padding=padding, pad_mode=pad_mode,
                has_bias=has_bias, use_dropout=use_dropout)
    # Criterion
    criterion = CTCLoss()
    # resume
    if resume:
        print("* Loading parameters...")
        param_dict = load_checkpoint(resume_model_path)
        # load the parameter into net
        load_param_into_net(net, param_dict)
        print(f'* Parameters loading from {resume_model_path} succeeded!')
    net.set_train(True)
    net.set_grad(True)
    # lr schedule
    if use_dynamic_lr:
        dataset_size = train_loader.get_dataset_size()
        learning_rate = Tensor(dynamic_lr(base_lr=learning_rate, warmup_step=warmup_step,
                                          warmup_ratio=warmup_ratio, epochs=epochs,
                                          steps_per_epoch=dataset_size), mstype.float32)
        print('* Using dynamic learning rate, which will be set up as :', learning_rate.asnumpy())
    # optim
    if opt_type == 'adam':
        opt = nn.Adam(net.trainable_params(), learning_rate=learning_rate, beta1=0.9,
                      beta2=0.999, weight_decay=0.0, eps=10e-8)
    elif opt_type == 'rms':
        opt = nn.RMSProp(params=net.trainable_params(), centered=True,
                         learning_rate=learning_rate, momentum=0.9, loss_scale=loss_scale)
    elif opt_type == 'sgd':
        opt = nn.SGD(params=net.trainable_params(), learning_rate=learning_rate)
    else:
        raise ValueError(f"optimizer: {opt_type} is not supported for now!")
    if resume:
        # load the parameter into optimizer
        load_param_into_net(opt, param_dict)
    # save_model
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix=prefix, directory=model_dir, config=config_ck)
    # logger
    the_logger = logger(config, date)
    log = Logging(logger=the_logger, model_ckpt=ckpt_cb)
    callbacks.append(ckpt_cb)
    callbacks.append(log)
    # Wrap: loss cell, then one-step cell with a fixed loss-scale sens tensor.
    net = WithLossCell(net, criterion)
    scaling_sens = Tensor(np.full((1), loss_scale), dtype=mstype.float32)
    net = DFCNNCTCTrainOneStepWithLossScaleCell(net, opt, scaling_sens)
    net.set_train(True)
    model = Model(net)
    if use_step_eval:
        # step evaluation (note: rebinds `step_eval` from bool to the callback)
        step_eval = StepAccInfo(model=model, name=prefix, div=div,
                                test_dev_batch_size=test_dev_batch_size,
                                step_eval=step_eval, eval_step=eval_step,
                                eval_epoch=eval_epoch, logger=the_logger,
                                patience=patience,
                                dataset_size=train_loader.get_dataset_size())
        callbacks.append(step_eval)
    # loss monitor
    loss_monitor = LossMonitor(loss_monitor_step)
    callbacks.append(loss_monitor)
    if use_summary:
        summary_dir = os.path.join(SUMMARY_DIR, date)
        if not os.path.exists(summary_dir):
            os.mkdir(summary_dir)
        # mindInsight
        summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=1,
                                             max_file_size=4 * 1024 ** 3)
        callbacks.append(summary_collector)
    if resume:
        # Carry the previous run's best accuracy/checkpoint into the logger.
        the_logger.update_acc_ckpt(best_acc, best_ckpt)
    print(f'* Start training...')
    model.train(epochs, train_loader, callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
def train(cfg):
    """Train RetinaFace (ResNet-50 backbone) on GPU with data parallelism.

    Args:
        cfg (dict): training configuration. Mutated in place: 'ckpt_path'
            gets a per-rank suffix appended.

    Raises:
        ValueError: if cfg['ngpu'] <= 1 (this script only supports the
            multi-GPU distributed path) or if cfg['optim'] is unknown.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False)
    if cfg['ngpu'] > 1:
        init("nccl")
        context.set_auto_parallel_context(device_num=get_group_size(),
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        # Per-rank checkpoint directory so workers don't overwrite each other.
        cfg['ckpt_path'] = cfg['ckpt_path'] + "ckpt_" + str(get_rank()) + "/"
    else:
        raise ValueError('cfg_num_gpu <= 1')

    batch_size = cfg['batch_size']
    max_epoch = cfg['epoch']
    momentum = cfg['momentum']
    weight_decay = cfg['weight_decay']
    initial_lr = cfg['initial_lr']
    gamma = cfg['gamma']
    training_dataset = cfg['training_dataset']
    # Face detection: background vs face; 7:1 negative:positive sampling.
    num_classes = 2
    negative_ratio = 7
    # Epochs at which the LR is decayed.
    stepvalues = (cfg['decay1'], cfg['decay2'])

    ds_train = create_dataset(training_dataset, cfg, batch_size,
                              multiprocessing=True, num_worker=cfg['num_workers'])
    print('dataset size is : \n', ds_train.get_dataset_size())
    steps_per_epoch = math.ceil(ds_train.get_dataset_size())

    multibox_loss = MultiBoxLoss(num_classes, cfg['num_anchor'], negative_ratio, cfg['batch_size'])
    # 1001-class head matches the ImageNet pretrain checkpoint's shape.
    backbone = resnet50(1001)
    backbone.set_train(True)

    # Load ImageNet-pretrained backbone only for fresh runs (not when resuming).
    if cfg['pretrain'] and cfg['resume_net'] is None:
        pretrained_res50 = cfg['pretrain_path']
        param_dict_res50 = load_checkpoint(pretrained_res50)
        load_param_into_net(backbone, param_dict_res50)
        print('Load resnet50 from [{}] done.'.format(pretrained_res50))

    net = RetinaFace(phase='train', backbone=backbone)
    net.set_train(True)

    # Resume the full detector from a previous checkpoint if provided.
    if cfg['resume_net'] is not None:
        pretrain_model_path = cfg['resume_net']
        param_dict_retinaface = load_checkpoint(pretrain_model_path)
        load_param_into_net(net, param_dict_retinaface)
        print('Resume Model from [{}] Done.'.format(cfg['resume_net']))

    net = RetinaFaceWithLossCell(net, multibox_loss, cfg)

    # Step-decay LR schedule with warmup.
    lr = adjust_learning_rate(initial_lr, gamma, stepvalues, steps_per_epoch, max_epoch,
                              warmup_epoch=cfg['warmup_epoch'])

    if cfg['optim'] == 'momentum':
        opt = nn.Momentum(net.trainable_params(), lr, momentum)
    elif cfg['optim'] == 'sgd':
        opt = nn.SGD(params=net.trainable_params(), learning_rate=lr,
                     momentum=momentum, weight_decay=weight_decay, loss_scale=1)
    else:
        raise ValueError('optim is not define.')

    net = TrainingWrapper(net, opt)
    model = Model(net)

    config_ck = CheckpointConfig(save_checkpoint_steps=cfg['save_checkpoint_steps'],
                                 keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix="RetinaFace", directory=cfg['ckpt_path'],
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    callback_list = [LossMonitor(), time_cb, ckpoint_cb]

    print("============== Starting Training ==============")
    model.train(max_epoch, ds_train, callbacks=callback_list, dataset_sink_mode=False)
batch_size=config.batch_size, num_shards=device_num, shard_id=rank, config=config) step_size = dataset.get_dataset_size() # define lr lr_init = config.learning_rate lr = nn.dynamic_lr.cosine_decay_lr(0.0, lr_init, config.epoch_size * step_size, step_size, config.epoch_size) loss = CTCLoss(max_sequence_length=config.num_step, max_label_length=max_text_length, batch_size=config.batch_size) net = CRNN(config) opt = nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum, nesterov=config.nesterov) net = WithLossCell(net, loss) net = TrainOneStepCellWithGradClip(net, opt).set_train() # define model model = Model(net) # define callbacks callbacks = [LossMonitor(), TimeMonitor(data_size=step_size)] if config.save_checkpoint: config_ck = CheckpointConfig( save_checkpoint_steps=config.save_checkpoint_steps, keep_checkpoint_max=config.keep_checkpoint_max) save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/') ckpt_cb = ModelCheckpoint(prefix="crnn",
BACKBONE = effnet(num_classes=1000) load_checkpoint(CHECKPOINT_WEIGHT_FILE, BACKBONE) HEAD = nn.Dense(1000, 10) HEAD.weight.set_data( Tensor(np.random.normal(0, 0.1, HEAD.weight.data.shape).astype("float32"))) HEAD.bias.set_data(Tensor(np.zeros(HEAD.bias.data.shape, dtype="float32"))) n = TransferNet(BACKBONE, HEAD) trainable_weights_list = [] trainable_weights_list.extend(n.head.trainable_params()) trainable_weights = ParameterTuple(trainable_weights_list) sgd = nn.SGD(trainable_weights, learning_rate=0.01, momentum=0.9, dampening=0.01, weight_decay=0.0, nesterov=False, loss_scale=1.0) net = train_wrap(n, optimizer=sgd, weights=trainable_weights) BATCH_SIZE = 8 X = Tensor(np.random.randn(BATCH_SIZE, 3, 224, 224), mstype.float32) label = Tensor(np.zeros([BATCH_SIZE, 10]).astype(np.float32)) export(net, X, label, file_name="mindir/effnet_tune_train", file_format='MINDIR') if len(sys.argv) > 1:
# 定义网络并训练 network = nn.Dense(cfg.feature_number, cfg.num_class) model = train(network, None, ds_train, "checkpoint_no_opt", cfg.out_dir_no_opt, 4) # 评估预测 eval_predict(model, ds_test) # ---------------------------------------------------SGD------------------------------------- epoch_size = 200 lr = 0.01 print('-------------------SGD优化器-----------------------') # 数据 ds_train, ds_test = gen_data(X_train, Y_train, epoch_size) # 定义网络并训练、测试、预测 network = nn.Dense(cfg.feature_number, cfg.num_class) net_opt = nn.SGD(network.trainable_params(), lr) model = train(network, net_opt, ds_train, "checkpoint_sgd", cfg.out_dir_sgd, 40) # 评估预测 eval_predict(model, ds_test) # ----------------------------------------------------Momentum------------------------------- epoch_size = 20 lr = 0.01 print('-------------------Momentum优化器-----------------------') # 数据 ds_train, ds_test = gen_data(X_train, Y_train, epoch_size) # 定义网络并训练 network = nn.Dense(cfg.feature_number, cfg.num_class) net_opt = nn.Momentum(network.trainable_params(), lr, 0.9) model = train(network, net_opt, ds_train, "checkpoint_momentum",
if cfg.micro_batches and cfg.batch_size % cfg.micro_batches != 0: raise ValueError( "Number of micro_batches should divide evenly batch_size") # Create a factory class of DP noise mechanisms, this method is adding noise # in gradients while training. Mechanisms can be 'Gaussian' # or 'AdaGaussian', in which noise would be decayed with 'AdaGaussian' # mechanism while be constant with 'Gaussian' mechanism. noise_mech = NoiseMechanismsFactory().create( cfg.noise_mechanisms, norm_bound=cfg.norm_bound, initial_noise_multiplier=cfg.initial_noise_multiplier, decay_policy=cfg.decay_policy) net_opt = nn.SGD(params=network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) # Create a monitor for DP training. The function of the monitor is to # compute and print the privacy budget(eps and delta) while training. rdp_monitor = PrivacyMonitorFactory.create( 'rdp', num_samples=60000, batch_size=cfg.batch_size, initial_noise_multiplier=cfg.initial_noise_multiplier, per_print_times=234) # Create the DP model for training. model = DPModel(micro_batches=cfg.micro_batches, norm_bound=cfg.norm_bound, noise_mech=noise_mech, network=network, loss_fn=net_loss,