def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")
    steps_per_epoch = dataset.get_dataset_size()
    # optimizer
    optimizer = Adam(network.trainable_params(), learning_rate=optimizer_cfg.learning_rate)
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="classifier",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()),
                 LossCallBack(dataset.get_dataset_size()),
                 ckpoint_cb]
    model.train(epoch_num, dataset, callbacks=callbacks)
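# The dynamic loss-scale behaviour relied on above can be illustrated with a plain-Python
# sketch. This is an approximation of DynamicLossScaleUpdateCell's documented semantics,
# not MindSpore code; update_loss_scale is an illustrative name of my own: the scale is
# reduced by scale_factor on overflow and multiplied by it after scale_window clean steps.
def update_loss_scale(scale, overflow, good_steps, scale_factor=2, scale_window=1000):
    """Return (new_scale, new_good_steps) after one training step."""
    if overflow:
        return max(scale / scale_factor, 1), 0
    good_steps += 1
    if good_steps >= scale_window:
        return scale * scale_factor, 0
    return scale, good_steps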
def train_and_eval(config):
    """ train_and_eval """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, batch_size=batch_size,
                              rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, batch_size=batch_size,
                             rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num, rank_id=rank_id,
                                         do_shuffle=args.do_shuffle,
                                         enable_data_sink=args.enable_data_sink,
                                         dataset_path=args.data_path)
    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)
    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size() * args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                           keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer',
                                         directory=args.save_checkpoint_path, config=ckpt_config)
            callbacks.append(ckpoint_cb)
    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(args.epoch_size, dataset, callbacks=callbacks,
                dataset_sink_mode=(args.enable_data_sink == "true"))
def test_train(configure):
    """ test_train """
    data_path = configure.data_path
    batch_size = configure.batch_size
    epochs = configure.epochs
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, batch_size=batch_size)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, _ = net_builder.get_net(configure)
    train_net.set_train()
    model = Model(train_net)
    callback = LossCallBack(config=configure)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=configure.ckpt_path, config=ckptconfig)
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb])
def test_train_eval():
    """ test_train_eval """
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, batch_size=batch_size,
                              data_type=DataType.MINDRECORD,
                              rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, batch_size=batch_size,
                             data_type=DataType.MINDRECORD,
                             rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback])
    eval_values = list(eval_callback.eval_values)
    assert eval_values[0] > 0.78
def train(model, dataset_direct, filename, columns_list, num_consumer=4, batch=16,
          epoch=50, save_checkpoint_steps=2172, keep_checkpoint_max=50,
          prefix="model", directory='./'):
    """ train network """
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=prefix, directory=directory, config=config_ck)
    data_train = create_dataset(dataset_direct, filename, batch, columns_list, num_consumer)
    model.train(epoch, data_train,
                callbacks=[ckpoint_cb, LossMonitor(per_print_times=181), TimeMonitor()],
                dataset_sink_mode=True)
def train():
    context.set_context(
        mode=context.GRAPH_MODE,
        device_target="Ascend",
        # save_graphs=True,
        # save_graphs_path="/home/work/user-job-dir/EAST/",
        # enable_reduce_precision=False,
        # device_id=5
    )
    epoch = 600
    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    # my_dataset.data_to_mindrecord_byte_image(train_img_path, train_gt_path, mindrecord_dir='/cache',
    #                                          prefix='icdar_train.mindrecord', file_num=1)
    # dataset = my_dataset.create_icdar_train_dataset(
    #     mindrecord_file=['icdar_train.mindrecord0', 'icdar_train.mindrecord1',
    #                      'icdar_train.mindrecord2', 'icdar_train.mindrecord3'],
    #     batch_size=32, repeat_num=epoch, is_training=True, num_parallel_workers=8,
    #     length=512, scale=0.25)
    # dataset = my_dataset.create_icdar_train_dataset(
    #     mindrecord_file='/cache/icdar_train.mindrecord', batch_size=32, repeat_num=epoch,
    #     is_training=True, num_parallel_workers=24, length=512, scale=0.25)
    # dataset = my_dataset.create_demo_dataset(batch_size=21, repeat_num=2)
    # train_img_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/img')
    # train_gt_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/gt')
    dataset = datasetV2.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14,
                                                   repeat_num=1, is_training=True,
                                                   num_parallel_workers=24)
    # dataset = datasetV3.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14,
    #                                                repeat_num=1, is_training=True,
    #                                                num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()
    print("Create dataset done! dataset_size: ", dataset_size)
    # east = EAST.EAST()
    net = EAST_VGG.EAST()
    # ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * 20)
    # ckpoint_cb = ModelCheckpoint(prefix='EAST', directory='/cache', config=ckpt_config)
    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)
    callback = [TimeMonitor(data_size=dataset_size), LossMonitor()]  # , ckpoint_cb
    model = Model(net)
    dataset_sink_mode = False
    print("start training")
    model.train(epoch, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
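# For reference, piecewise_constant_lr expands the milestones above into a per-step LR
# list. A minimal pure-Python equivalent, assuming MindSpore's documented semantics
# (steps [0, 100) use 1e-3, steps [100, 300) use 1e-4); piecewise_constant is an
# illustrative name, not the library function:
def piecewise_constant(milestone, learning_rates):
    lr, last = [], 0
    for m, r in zip(milestone, learning_rates):
        lr.extend([r] * (m - last))
        last = m
    return lr

assert piecewise_constant([100, 300], [1e-3, 1e-4])[99] == 1e-3
assert piecewise_constant([100, 300], [1e-3, 1e-4])[100] == 1e-4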
def test_train_eval():
    """ test_train_eval """
    np.random.seed(1000)
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size,
                              rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size,
                             rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
    expect_out0 = [0.792634, 0.799862, 0.803324]
    expect_out6 = [0.796580, 0.803908, 0.807262]
    if get_rank() == 0:
        assert np.allclose(eval_callback.eval_values, expect_out0)
    if get_rank() == 6:
        assert np.allclose(eval_callback.eval_values, expect_out6)
def train():
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(device_num=args.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          parameter_broadcast=True)
        init()
        rank_id = get_rank()

    # dataset/network/criterion/optim
    ds = train_dataset_creator(args.device_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load Pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)

    lrs = lr_generator(start_lr=1e-3, lr_scale=0.1, total_iters=config.TRAIN_TOTAL_ITER)
    opt = nn.SGD(params=net.trainable_params(), learning_rate=lrs,
                 momentum=0.99, weight_decay=5e-4)

    # wrap model
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net, opt, reduce_flag=True, mean=True, degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # set and apply checkpoint parameters (see config.TRAIN_MODEL_SAVE_PATH)
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet", config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))

    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM, ds, dataset_sink_mode=True,
                callbacks=[time_cb, loss_cb, ckpoint_cb])
def test_train_eval(config):
    """ test_train_eval """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    sparse = config.sparse
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb],
                dataset_sink_mode=(not sparse))
def train_and_eval(config):
    """ train_and_eval """
    set_seed(1000)
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=config.batch_size, is_tf_dataset=config.is_tf_dataset,
                              rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=config.batch_size, is_tf_dataset=config.is_tf_dataset,
                             rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    # Only save the last checkpoint at the last epoch. To save a checkpoint at every
    # epoch, set save_checkpoint_steps to ds_train.get_dataset_size() instead.
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size() * config.epochs,
                                  keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if int(get_rank()) == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
def run_predistill():
    """ run predistill """
    cfg = phase1_cfg
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    load_teacher_checkpoint_path = args_opt.load_teacher_ckpt_path
    load_student_checkpoint_path = args_opt.load_gd_ckpt_path
    netwithloss = BertNetworkWithLoss_td(teacher_config=td_teacher_net_cfg,
                                         teacher_ckpt=load_teacher_checkpoint_path,
                                         student_config=td_student_net_cfg,
                                         student_ckpt=load_student_checkpoint_path,
                                         is_training=True,
                                         task_type='classification',
                                         num_labels=args_opt.num_labels,
                                         is_predistill=True)
    rank = 0
    device_num = 1
    dataset = create_tinybert_dataset('td', td_teacher_net_cfg.batch_size,
                                      device_num, rank, args_opt.do_shuffle,
                                      args_opt.train_data_dir, args_opt.schema_dir)
    dataset_size = dataset.get_dataset_size()
    print('td1 dataset size: ', dataset_size)
    if args_opt.enable_data_sink == 'true':
        repeat_count = args_opt.td_phase1_epoch_size * dataset.get_dataset_size() // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.td_phase1_epoch_size
        time_monitor_steps = dataset_size
    optimizer_cfg = cfg.optimizer_cfg
    lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
                                   end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
                                   warmup_steps=int(dataset_size / 10),
                                   decay_steps=int(dataset_size * args_opt.td_phase1_epoch_size),
                                   power=optimizer_cfg.AdamWeightDecay.power)
    params = netwithloss.trainable_params()
    decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
    other_params = list(filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
                    {'params': other_params, 'weight_decay': 0.0},
                    {'order_params': params}]
    optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule,
                                eps=optimizer_cfg.AdamWeightDecay.eps)
    callback = [TimeMonitor(time_monitor_steps), LossCallBack(),
                ModelSaveCkpt(netwithloss.bert, args_opt.save_ckpt_step,
                              args_opt.max_ckpt_num, td_phase1_save_ckpt_dir)]
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                             scale_factor=cfg.scale_factor,
                                             scale_window=cfg.scale_window)
    netwithgrads = BertEvaluationCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(repeat_count, dataset, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == 'true'),
                sink_size=args_opt.data_sink_steps)
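# Worked example of the data-sink arithmetic above (illustrative numbers, not from the
# source): with td_phase1_epoch_size=3, a dataset of 1000 steps per epoch, and
# data_sink_steps=100, repeat_count = 3 * 1000 // 100 = 30, so model.train(30, ...,
# sink_size=100) executes the same 3000 optimizer steps, grouped into 30 sink cycles.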
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config,
    #                                rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()
    # for item in train_dataset:
    #     print(item)
    # exit(0)
    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)
    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    network.set_train()

    model = Model(network, loss_fn=loss, optimizer=optimizer,
                  loss_scale_manager=scale_manager, amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset, callbacks=callbacks_list,
                dataset_sink_mode=False)
    print("============== End Training ==============")
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' applies mean reduction to the loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" trains the whole network in O2 mixed precision;
    # keep_batchnorm_fp32=False additionally casts batchnorm into float16
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
def train_and_eval(config):
    """ train_and_eval """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    host_device_mix = bool(config.host_device_mix)
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size * get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size * get_group_size(), data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config, host_device_mix=host_device_mix)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if not host_device_mix:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                dataset_sink_mode=(not host_device_mix))
def test_all_trains():
    ds_train = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32, 1)
    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    print("============== Starting Training ==============")
    model.train(1, ds_train, callbacks=[time_cb, LossMonitor()])
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")
    steps_per_epoch = dataset.get_dataset_size()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=optimizer_cfg.AdamWeightDecay.power)
        params = network.trainable_params()
        decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.Lamb.learning_rate,
                                       end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=optimizer_cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), learning_rate=lr_schedule)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             learning_rate=optimizer_cfg.Momentum.learning_rate,
                             momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="ner",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()),
                 LossCallBack(dataset.get_dataset_size()),
                 ckpoint_cb]
    train_begin = time.time()
    model.train(epoch_num, dataset, callbacks=callbacks)
    train_end = time.time()
    print("latency: {:.6f} s".format(train_end - train_begin))
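# The decay_filter predicates used above come from optimizer_cfg and are not shown in
# this section; a typical BERT-style filter (an illustrative assumption, with
# example_decay_filter as my own name) excludes LayerNorm and bias parameters from
# weight decay:
def example_decay_filter(param):
    name = param.name.lower()
    return 'layernorm' not in name and 'bias' not in name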
def train_and_eval(config):
    """ train_and_eval """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size,
                              rank_id=get_rank(), rank_size=get_group_size(), data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size,
                             rank_id=get_rank(), rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
def train_and_eval(config):
    """ train_and_eval. """
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=config.batch_size, is_tf_dataset=config.is_tf_dataset)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=config.batch_size, is_tf_dataset=config.is_tf_dataset)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
def train_lenet():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = nonquant_cfg
    ds_train = create_dataset(os.path.join(data_path, "train"), cfg.batch_size)
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="ckpt_lenet_noquant", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    print("============== Starting Training Lenet ==============")
    model.train(cfg['epoch_size'], ds_train,
                callbacks=[time_cb, ckpoint_cb, LossMonitor()],
                dataset_sink_mode=True)
def train_lenet5():
    """ Train lenet5 """
    config.ckpt_path = config.output_path
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    ds_train = create_lenet_dataset(os.path.join(config.data_path, "train"),
                                    config.batch_size, num_parallel_workers=1)
    if ds_train.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=None if config.ckpt_path == "" else config.ckpt_path,
                                 config=config_ck)
    if config.device_target != "Ascend":
        model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    else:
        model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2")
    print("============== Starting Training ==============")
    model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()])
def train_and_eval(config):
    """ train_and_eval """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))
    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
    model.train(epochs, ds_train, callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path=""):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")
    steps_per_epoch = dataset.get_dataset_size()
    epoch_num = dataset.get_repeat_count()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            network.trainable_params(),
            decay_steps=steps_per_epoch * epoch_num,
            learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=optimizer_cfg.AdamWeightDecayDynamicLR.power,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            weight_decay=optimizer_cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=optimizer_cfg.AdamWeightDecayDynamicLR.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        optimizer = Lamb(network.trainable_params(),
                         decay_steps=steps_per_epoch * epoch_num,
                         start_learning_rate=optimizer_cfg.Lamb.start_learning_rate,
                         end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
                         power=optimizer_cfg.Lamb.power,
                         weight_decay=optimizer_cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                         decay_filter=optimizer_cfg.Lamb.decay_filter)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             learning_rate=optimizer_cfg.Momentum.learning_rate,
                             momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecayDynamicLR, Lamb, Momentum]")
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="classifier", directory=save_checkpoint_path, config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
    model.train(epoch_num, dataset, callbacks=callbacks)
lr = Tensor(get_lr(global_step=0,
                   lr_init=config.lr_init,
                   lr_end=config.lr_end,
                   lr_max=config.lr_max,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=epoch_size,
                   steps_per_epoch=step_size,
                   lr_decay_mode='poly'))
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
               config.momentum, config.weight_decay, config.loss_scale)
model = Model(net, loss_fn=loss, optimizer=opt,
              loss_scale_manager=loss_scale, metrics={'acc'})
time_cb = TimeMonitor(data_size=step_size)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
if config.save_checkpoint:
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet",
                              directory=config.save_checkpoint_path,
                              config=config_ck)
    cb += [ckpt_cb]
model.train(epoch_size, dataset, callbacks=cb)
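# lr_decay_mode='poly' above requests a polynomial decay from get_lr; that helper is
# project-specific and not shown here, but a minimal sketch of one common formulation
# (an assumption, with poly_decay_lr as my own name) is:
def poly_decay_lr(step, total_steps, lr_max, lr_end=0.0, power=2.0):
    """Polynomial decay from lr_max at step 0 to lr_end at total_steps."""
    frac = 1.0 - min(step, total_steps) / total_steps
    return (lr_max - lr_end) * frac ** power + lr_end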
              keep_batchnorm_fp32=False,
              loss_scale_manager=None)
else:
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)
    loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    model = Model(network, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale_manager, amp_level="O2")

# define callbacks
time_cb = TimeMonitor(data_size=batch_num)
loss_cb = LossMonitor(per_print_times=batch_num)
callbacks = [time_cb, loss_cb]
if args.rank_save_ckpt_flag:
    ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                   keep_checkpoint_max=args.ckpt_save_max)
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=args.outputs_dir,
                              prefix='{}'.format(args.rank))
    callbacks.append(ckpt_cb)
model.train(args.max_epoch, dataset, callbacks=callbacks)
args = parser.parse_args()
if args.device_target == "CPU":
    args.dataset_sink_mode = False
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
ds_train = create_dataset(os.path.join(args.data_path, "train"), cfg.batch_size)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=args.ckpt_path, config=config_ck)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
print("============== Starting Training ==============")
model.train(cfg['epoch_size'], ds_train,
            callbacks=[time_cb, ckpoint_cb, LossMonitor()],
            dataset_sink_mode=args.dataset_sink_mode)
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size,
                                                              config.batch_size,
                                                              config.num_classes))
    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)
    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")
        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon,
                  weight_decay=config.weight_decay, momentum=config.momentum,
                  loss_scale=config.loss_scale)
    if args.device_id == 0:
        print(lr)
        print(train_step_size)

    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
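# generate_cosine_lr above is project-specific and not shown in this section; a minimal
# cosine-annealing sketch under the textbook definition (an assumption, with cosine_lr
# as my own name, not the repo's implementation):
import math

def cosine_lr(step, total_steps, lr_max, lr_min=0.0):
    """Cosine anneal from lr_max at step 0 down to lr_min at total_steps."""
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * step / total_steps))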
def train(): """Train function.""" args.outputs_dir = params['save_model_path'] if args.group_size > 1: init() context.set_auto_parallel_context( device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_{}/".format(str(get_rank()))) args.rank = get_rank() else: args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/") args.rank = 0 if args.group_size > 1: args.max_epoch = params["max_epoch_train_NP"] args.loss_scale = params['loss_scale'] / 2 args.lr_steps = list(map(int, params["lr_steps_NP"].split(','))) params['train_type'] = params['train_type_NP'] params['optimizer'] = params['optimizer_NP'] params['group_params'] = params['group_params_NP'] else: args.max_epoch = params["max_epoch_train"] args.loss_scale = params['loss_scale'] args.lr_steps = list(map(int, params["lr_steps"].split(','))) # create network print('start create network') criterion = openpose_loss() criterion.add_flags_recursive(fp32=True) network = OpenPoseNet(vggpath=params['vgg_path'], vgg_with_bn=params['vgg_with_bn']) if params["load_pretrain"]: print("load pretrain model:", params["pretrained_model_path"]) load_model(network, params["pretrained_model_path"]) train_net = BuildTrainNetwork(network, criterion) # create dataset if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \ and os.path.exists(args.maskpath_train): print('start create dataset') else: print('Error: wrong data path') return 0 num_worker = 20 if args.group_size > 1 else 48 de_dataset_train = create_dataset(args.jsonpath_train, args.imgpath_train, args.maskpath_train, batch_size=params['batch_size'], rank=args.rank, group_size=args.group_size, num_worker=num_worker, multiprocessing=True, shuffle=True, repeat_num=1) steps_per_epoch = de_dataset_train.get_dataset_size() print("steps_per_epoch: ", steps_per_epoch) # lr scheduler lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size, params['lr_gamma'], steps_per_epoch, args.max_epoch, args.lr_steps, args.group_size, lr_type=params['lr_type'], warmup_epoch=params['warmup_epoch']) # optimizer if params['group_params']: vgg19_base_params = list( filter(lambda x: 'base.vgg_base' in x.name, train_net.trainable_params())) base_params = list( filter(lambda x: 'base.conv' in x.name, train_net.trainable_params())) stages_params = list( filter(lambda x: 'base' not in x.name, train_net.trainable_params())) group_params = [{ 'params': vgg19_base_params, 'lr': lr_vgg }, { 'params': base_params, 'lr': lr_base }, { 'params': stages_params, 'lr': lr_stage }] if params['optimizer'] == "Momentum": opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9) elif params['optimizer'] == "Adam": opt = Adam(group_params) else: raise ValueError("optimizer not support.") else: if params['optimizer'] == "Momentum": opt = Momentum(train_net.trainable_params(), learning_rate=lr_stage, momentum=0.9) elif params['optimizer'] == "Adam": opt = Adam(train_net.trainable_params(), learning_rate=lr_stage) else: raise ValueError("optimizer not support.") # callback config_ck = CheckpointConfig( save_checkpoint_steps=params['ckpt_interval'], keep_checkpoint_max=params["keep_checkpoint_max"]) ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank), directory=args.outputs_dir, config=config_ck) time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size()) if args.rank == 0: callback_list = [MyLossMonitor(), time_cb, ckpoint_cb] else: callback_list = [MyLossMonitor(), time_cb] # train if 
params['train_type'] == 'clip_grad': train_net = TrainOneStepWithClipGradientCell(train_net, opt, sens=args.loss_scale) train_net.set_train() model = Model(train_net) elif params['train_type'] == 'fix_loss_scale': loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) train_net.set_train() model = Model(train_net, optimizer=opt, loss_scale_manager=loss_scale_manager) else: raise ValueError("Type {} is not support.".format( params['train_type'])) print("============== Starting Training ==============") model.train(args.max_epoch, de_dataset_train, callbacks=callback_list, dataset_sink_mode=False) return 0
loss = LossNet()
lr = Tensor(dynamic_lr(training_cfg, dataset_size), mstype.float32)
opt = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,
               weight_decay=config.weight_decay, loss_scale=config.loss_scale)
net_with_loss = WithLossCell(net, loss)
if args_opt.run_distribute:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale,
                           reduce_flag=True, mean=True, degree=device_num)
else:
    net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)
time_cb = TimeMonitor(data_size=dataset_size)
loss_cb = LossCallBack(rank_id=rank)
cb = [time_cb, loss_cb]
if config.save_checkpoint:
    ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
                                  keep_checkpoint_max=config.keep_checkpoint_max)
    save_checkpoint_path = os.path.join(config.save_checkpoint_path, "ckpt_" + str(rank) + "/")
    ckpoint_cb = ModelCheckpoint(prefix='ctpn', directory=save_checkpoint_path, config=ckptconfig)
    cb += [ckpoint_cb]
model = Model(net)
model.train(training_cfg.total_epoch,
def run_general_distill():
    """ run general distill """
    parser = argparse.ArgumentParser(description='tinybert general distill')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=3,
                        help="Epoch size, default is 3.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--save_ckpt_step", type=int, default=100,
                        help="Steps between checkpoint saves, default is 100.")
    parser.add_argument("--max_ckpt_num", type=int, default=1,
                        help="Maximum number of checkpoints to keep, default is 1.")
    parser.add_argument("--do_shuffle", type=str, default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_ckpt_path", type=str, default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_teacher_ckpt_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")

    save_ckpt_dir = os.path.join(args_opt.save_ckpt_path,
                                 datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    if not os.path.exists(save_ckpt_dir):
        os.makedirs(save_ckpt_dir)

    if args_opt.distribute == "true":
        D.init('hccl')
        device_num = args_opt.device_num
        rank = args_opt.device_id % device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True, device_num=device_num)
    else:
        rank = 0
        device_num = 1

    netwithloss = BertNetworkWithLoss_gd(teacher_config=bert_teacher_net_cfg,
                                         teacher_ckpt=args_opt.load_teacher_ckpt_path,
                                         student_config=bert_student_net_cfg,
                                         is_training=True,
                                         use_one_hot_embeddings=False)

    dataset = create_tinybert_dataset('gd', bert_teacher_net_cfg.batch_size,
                                      device_num, rank, args_opt.do_shuffle,
                                      args_opt.data_dir, args_opt.schema_dir)

    dataset_size = dataset.get_dataset_size()
    print('dataset size: ', dataset_size)
    if args_opt.enable_data_sink == "true":
        repeat_count = args_opt.epoch_size * dataset.get_dataset_size() // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.epoch_size
        time_monitor_steps = dataset_size

    lr_schedule = BertLearningRate(learning_rate=common_cfg.AdamWeightDecay.learning_rate,
                                   end_learning_rate=common_cfg.AdamWeightDecay.end_learning_rate,
                                   warmup_steps=int(dataset_size * args_opt.epoch_size / 10),
                                   decay_steps=int(dataset_size * args_opt.epoch_size),
                                   power=common_cfg.AdamWeightDecay.power)
    params = netwithloss.trainable_params()
    decay_params = list(filter(common_cfg.AdamWeightDecay.decay_filter, params))
    other_params = list(filter(lambda x: not common_cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{'params': decay_params, 'weight_decay': common_cfg.AdamWeightDecay.weight_decay},
                    {'params': other_params, 'weight_decay': 0.0},
                    {'order_params': params}]

    optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule,
                                eps=common_cfg.AdamWeightDecay.eps)

    callback = [TimeMonitor(time_monitor_steps), LossCallBack(),
                ModelSaveCkpt(netwithloss.bert, args_opt.save_ckpt_step,
                              args_opt.max_ckpt_num, save_ckpt_dir)]

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=common_cfg.loss_scale_value,
                                             scale_factor=common_cfg.scale_factor,
                                             scale_window=common_cfg.scale_window)
    netwithgrads = BertTrainWithLossScaleCell(netwithloss, optimizer=optimizer,
                                              scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(repeat_count, dataset, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)