def mobilenetv2_cb(device_target, lr, is_saving_checkpoint, save_checkpoint_epochs, step_size):
    """
    Build the training callbacks for mobilenetv2.

    Args:
        device_target (str): 'CPU', 'GPU' or 'Ascend'.
        lr (Tensor): learning rate; its ``asnumpy()`` value is handed to the
            loss/time monitor.
        is_saving_checkpoint (bool): Whether to add a checkpoint-saving callback.
        save_checkpoint_epochs (int): Number of epochs between two checkpoints.
        step_size (int): Steps per epoch, i.e. dataset.get_dataset_size().

    Returns:
        list or None, the callback list when device_target is 'CPU' or 'GPU';
        None for every other target.
    """
    # Callbacks are only assembled for CPU and GPU targets.
    if device_target not in ("CPU", "GPU"):
        return None

    callbacks = [LossTimeMonitor(lr_init=lr.asnumpy())]
    if not is_saving_checkpoint:
        return callbacks

    # The checkpoint interval is given in steps: epochs * steps-per-epoch.
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=save_checkpoint_epochs * step_size,
        keep_checkpoint_max=10)
    callbacks.append(
        ModelCheckpoint(prefix="mobilenetv2_cifar10", directory="./", config=ckpt_config))
    return callbacks
def mobilenetv2_cb(device_target, lr, is_saving_checkpoint, save_checkpoint_epochs, step_size):
    """
    Assemble the mobilenetv2 training callbacks.

    Args:
        device_target (str): Target device name; only "CPU" and "GPU" produce callbacks.
        lr: Learning rate object exposing ``asnumpy()``.
        is_saving_checkpoint (bool): Whether a checkpoint callback is appended.
        save_checkpoint_epochs (int): Epochs between two saved checkpoints.
        step_size (int): Number of steps in one epoch.

    Returns:
        list or None, ``[monitor]`` or ``[monitor, checkpoint]`` for "CPU"/"GPU",
        otherwise None.
    """
    if device_target not in ("CPU", "GPU"):
        # Any other target gets no callbacks at all.
        return None

    monitor = LossTimeMonitor(lr_init=lr.asnumpy())
    if is_saving_checkpoint:
        # Convert the epoch interval into a step interval for the checkpoint config.
        steps_between_saves = save_checkpoint_epochs * step_size
        saver = ModelCheckpoint(
            prefix="mobilenetv2_cifar10",
            directory="./",
            config=CheckpointConfig(save_checkpoint_steps=steps_between_saves,
                                    keep_checkpoint_max=10))
        return [monitor, saver]
    return [monitor]
# Attach the loss function, the optimizer and the accuracy metric to the model.
model.compile(loss_fn=net_loss, optimizer=net_opt, metrics={"Accuracy": Accuracy()})

# Run settings taken from the parsed command-line options.
cifar10_path = args_opt.dataset_path
batch_size = args_opt.batch_size
epoch_size = args_opt.epoch_size
save_checkpoint_epochs = args_opt.save_checkpoint_epochs
# Dataset sink mode is switched on for every target except CPU.
dataset_sink_mode = args_opt.device_target != "CPU"

if not args_opt.do_eval:
    # as for train, users could use model.train
    ds_train = create_dataset(cifar10_path, batch_size=batch_size)
    # Save a checkpoint every `save_checkpoint_epochs` epochs (interval given in steps).
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=save_checkpoint_epochs * ds_train.get_dataset_size(),
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="vgg16_cifar10", config=ckpt_config)
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=dataset_sink_mode)
else:
    # as for evaluation, users could use model.eval
    ds_eval = create_dataset(cifar10_path, batch_size=batch_size, is_training=False)
    # A checkpoint is only loaded for the 'local' mode, and only when a path is given.
    if args_opt.load_pretrained == 'local' and args_opt.checkpoint_path:
        model.load_checkpoint(args_opt.checkpoint_path)
    acc = model.eval(ds_eval, dataset_sink_mode=dataset_sink_mode)
    print("============== Accuracy:{} ==============".format(acc))
def run_pretrain():
    """
    Pre-train bert_clue.

    Parses the command-line options, configures the execution context, builds
    the dataset, the network with loss, the optimizer and the callbacks, wraps
    the network in the matching training cell and launches ``Model.train``.

    When ``accumulation_steps`` is greater than 1, the data-sink steps and the
    checkpoint interval are multiplied by it (each only when the respective
    feature is enabled).
    """
    parser = argparse_init()
    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    is_auto_enable_graph_kernel = _auto_enable_graph_kernel(args_opt.device_target,
                                                            args_opt.enable_graph_kernel)
    _set_graph_kernel_context(args_opt.device_target, args_opt.enable_graph_kernel,
                              is_auto_enable_graph_kernel)
    ckpt_save_dir = args_opt.save_checkpoint_path

    _check_compute_type(args_opt, is_auto_enable_graph_kernel)

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(
        batch_size=cfg.batch_size,
        shuffle=args_opt.do_shuffle,
        data_dir=args_opt.data_dir,
        schema_dir=args_opt.schema_dir,
        num_parallel_workers=args_opt.num_parallel_workers
    )
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    # Query the dataset size once; it is reused for the repeat count, the
    # derived train steps and the loss callback.
    dataset_size = ds.get_dataset_size()

    # Model.train is driven in units of `data_sink_steps`, so the epoch count
    # is converted into a number of sink-sized rounds.
    new_repeat_count = args_opt.epoch_size * dataset_size // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        # An explicit step budget caps the number of rounds.
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * dataset_size // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    # get the optimizer followed args_opt.optimizer
    optimizer = get_optimizer(args_opt, net_with_loss, cfg, bert_net_cfg)

    # define the callbacks
    callback = [TimeMonitor(args_opt.data_sink_steps), BertLossCallBack(dataset_size)]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        # An empty save path is passed on as `directory=None`.
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    # Pick the training cell: loss-scaled (with or without gradient
    # accumulation) or the plain one-step cell.
    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        accumulation_steps = args_opt.accumulation_steps
        enable_global_norm = cfg.enable_global_norm
        if accumulation_steps <= 1:
            if cfg.optimizer == 'AdamWeightDecay' and args_opt.device_target == 'GPU':
                net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(net_with_loss,
                                                                          optimizer=optimizer,
                                                                          scale_update_cell=update_cell)
            else:
                net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss,
                                                                   optimizer=optimizer,
                                                                   scale_update_cell=update_cell)
        else:
            allreduce_post = args_opt.distribute == "false" or args_opt.allreduce_post_accumulation == "true"
            net_with_accumulation = (BertTrainAccumulationAllReducePostWithLossScaleLayer if allreduce_post else
                                     BertTrainAccumulationAllReduceEachWithLossScaleLayer)
            net_with_grads = net_with_accumulation(net_with_loss,
                                                   optimizer=optimizer,
                                                   scale_update_cell=update_cell,
                                                   accumulation_steps=accumulation_steps,
                                                   enable_global_norm=enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    if args_opt.load_checkpoint_path:
        model.load_checkpoint(args_opt.load_checkpoint_path)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
# Without a user-supplied dataset path, obtain one through download_dataset('cifar10').
if not args_opt.dataset_path:
    args_opt.dataset_path = download_dataset('cifar10')

# build the network
net = densenetBC_100(args_opt.num_classes)
net.update_parameters_name(prefix='huawei')
model = Model(net)

# define the loss function
net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")

# define the optimizer; only parameters with `requires_grad` set are passed to it
trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
net_opt = Momentum(trainable_params, 0.01, 0.9)
model.compile(loss_fn=net_loss, optimizer=net_opt, metrics={"Accuracy": Accuracy()})

# Run settings taken from the parsed command-line options.
cifar10_path = args_opt.dataset_path
batch_size = args_opt.batch_size
epoch_size = args_opt.epoch_size
save_checkpoint_epochs = args_opt.save_checkpoint_epochs
# Dataset sink mode is switched on for every target except CPU.
dataset_sink_mode = args_opt.device_target != "CPU"

if not args_opt.do_eval:
    # as for train, users could use model.train
    ds_train = create_dataset(cifar10_path, batch_size=batch_size)
    # Save a checkpoint every `save_checkpoint_epochs` epochs (interval given in steps).
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=save_checkpoint_epochs * ds_train.get_dataset_size(),
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="densenetBC_100_cifar10", config=ckpt_config)
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=dataset_sink_mode)
else:
    # as for evaluation, users could use model.eval
    ds_eval = create_dataset(cifar10_path, batch_size=batch_size, is_training=False)
    if args_opt.checkpoint_path:
        model.load_checkpoint(args_opt.checkpoint_path)
    acc = model.eval(ds_eval, dataset_sink_mode=dataset_sink_mode)
    print("============== Accuracy:{} ==============".format(acc))
lr_end=0.001 * args_opt.lr, lr_max=args_opt.lr, warmup_epochs=2,
total_epochs=args_opt.epoch_size, steps_per_epoch=dataset_size)
    # NOTE(review): the lines above are the tail of a call that begins outside
    # this fragment; its result is presumably the `lr` used below -- confirm.
    # Loss scale is fixed to 1.0 on CPU, otherwise taken from the command line.
    loss_scale = 1.0 if args_opt.device_target == "CPU" else float(
        args_opt.loss_scale)
    # Only parameters with `requires_grad` set are optimized.
    # NOTE(review): positional args are presumably learning rate, momentum,
    # weight decay and loss scale -- confirm against the Momentum signature.
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                   0.9, 1.5e-4, loss_scale)
    model = Model(TrainingWrapper(net, opt, loss_scale))
    model.compile()
    # Checkpoint every `save_checkpoint_epochs` epochs (interval given in
    # steps); at most 10 checkpoint files are kept.
    ckpoint_cb = ModelCheckpoint(
        prefix="ssd300",
        config=CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_epochs * dataset_size,
            keep_checkpoint_max=10))
    model.train(epoch_size, ds_train, callbacks=[
        ckpoint_cb, LossMonitor(), TimeMonitor(data_size=dataset_size)
    ], dataset_sink_mode=dataset_sink_mode)
else:  # as for evaluation, users could use model.eval
    # Evaluation dataset is built with a batch size of 1.
    ds_eval = create_dataset(voc_path, batch_size=1, is_training=False)
    total = ds_eval.get_dataset_size()
    # define the infer wrapper
    eval_net = ssd300_infer(class_num=args_opt.num_classes)
    model = Model(eval_net)
# Attach the loss, the optimizer and the accuracy metric to the model.
model.compile(net_loss, net_opt, metrics={"Accuracy": Accuracy()})

# Run settings taken from the parsed command-line options.
epoch_size = args_opt.epoch_size
batch_size = args_opt.batch_size
mnist_path = args_opt.dataset_path
# Dataset sink mode is switched on for every target except CPU.
dataset_sink_mode = args_opt.device_target != "CPU"

if args_opt.do_eval:  # as for evaluation, users could use model.eval
    print("============== Starting Evaluating ==============")
    # load testing dataset
    ds_eval = create_dataset(os.path.join(mnist_path, "test"))
    # load the saved model for evaluation
    if args_opt.checkpoint_path:
        model.load_checkpoint(args_opt.checkpoint_path)
    acc = model.eval(ds_eval, dataset_sink_mode=dataset_sink_mode)
    print("============== Accuracy:{} ==============".format(acc))
else:  # as for train, users could use model.train
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = create_dataset(os.path.join(mnist_path, "train"), batch_size=batch_size)
    # save the network model and parameters for subsequence fine-tuning.
    # The interval is one epoch, derived from the dataset itself: a fixed
    # step count (previously 1875) only equals one epoch for one particular
    # dataset size / batch size combination, while batch_size is configurable.
    ckpoint_cb = ModelCheckpoint(
        prefix="checkpoint_lenet",
        config=CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                keep_checkpoint_max=10))
    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=dataset_sink_mode)
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """
    Fine-tune ``network`` on ``dataset``, starting from a pretrained checkpoint.

    Args:
        dataset: Training dataset; must provide ``get_dataset_size()``.
        network: Network to fine-tune; must provide ``trainable_params()``.
        load_checkpoint_path (str): Path of the pretrained checkpoint. Required.
        save_checkpoint_path (str): Directory for the saved checkpoints. An
            empty string is passed on to ModelCheckpoint as ``directory=None``.
        epoch_num (int): Number of training epochs.

    Raises:
        ValueError: If ``load_checkpoint_path`` is empty, if ``dataset`` or
            ``network`` is None, or if ``optimizer_cfg.optimizer`` is not one
            of AdamWeightDecay, Lamb or Momentum.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrain model missed, finetune task must load pretrain model!")
    # Both default to None in the signature; fail with a clear message instead
    # of an AttributeError on the first attribute access.
    if dataset is None:
        raise ValueError("dataset must be provided for the finetune task!")
    if network is None:
        raise ValueError("network must be provided for the finetune task!")

    steps_per_epoch = dataset.get_dataset_size()
    total_steps = steps_per_epoch * epoch_num

    def _lr_schedule(opt_cfg):
        # Shared schedule for AdamWeightDecay and Lamb: warm up over the first
        # 10% of all training steps, decay over the whole run.
        return BertLearningRate(
            learning_rate=opt_cfg.learning_rate,
            end_learning_rate=opt_cfg.end_learning_rate,
            warmup_steps=int(total_steps * 0.1),
            decay_steps=total_steps,
            power=opt_cfg.power)

    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecay':
        adam_cfg = optimizer_cfg.AdamWeightDecay
        lr_schedule = _lr_schedule(adam_cfg)
        params = network.trainable_params()
        # Parameters selected by the decay filter get weight decay; all
        # remaining parameters are trained without it.
        decay_params = list(filter(adam_cfg.decay_filter, params))
        other_params = list(
            filter(lambda x: not adam_cfg.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': adam_cfg.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=adam_cfg.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        lr_schedule = _lr_schedule(optimizer_cfg.Lamb)
        optimizer = Lamb(network.trainable_params(), learning_rate=lr_schedule)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(
            network.trainable_params(),
            learning_rate=optimizer_cfg.Momentum.learning_rate,
            momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise ValueError(
            "Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]"
        )

    # checkpoint saving: once per epoch, keeping only the most recent file
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(
        prefix="classifier",
        directory=None if save_checkpoint_path == "" else save_checkpoint_path,
        config=ckpt_config)

    update_layer = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                              scale_factor=2,
                                              scale_window=1000)
    netwithgrads = BertFinetuneLayer(network,
                                     optimizer=optimizer,
                                     scale_update_layer=update_layer)
    model = Model(netwithgrads)
    # load the pretrained checkpoint before training starts
    model.load_checkpoint(load_checkpoint_path)

    callbacks = [
        TimeMonitor(steps_per_epoch),
        BertLossCallBack(steps_per_epoch), ckpoint_cb
    ]
    model.train(epoch_num, dataset, callbacks=callbacks)
batch_size=batch_size)
# NOTE(review): the line above is the tail of a call that begins outside this
# fragment.

# build base network
data_size = train_ds.get_dataset_size()
net = DeepFM(field_size=39, vocab_size=184965, embed_size=80, convert_dtype=True)
# build train network
train_net = DeepFMTrainModel(DeepFMWithLoss(net))
# build eval network (wraps the same `net` instance as the train network)
eval_net = DeepFMEvalModel(net)
# build model
model = Model(train_net)
# loss/ckpt/metric callbacks
loss_tm = LossTimeMonitorV2()
# Checkpoint interval is 1/100 of the dataset size, in steps.
# NOTE(review): `data_size // 100` is 0 when data_size < 100 -- confirm that
# CheckpointConfig handles a zero step interval as intended.
config_ckpt = CheckpointConfig(save_checkpoint_steps=data_size // 100, keep_checkpoint_max=10)
model_ckpt = ModelCheckpoint(prefix='deepfm', directory=checkpoint_dir, config=config_ckpt)
auc_metric = AUCMetric()
model.compile(eval_network=eval_net, metrics={"auc_metric": auc_metric}, amp_level='O0')
print("====== start train model ======", flush=True)
model.train(epoch=epoch_size, train_dataset=train_ds, callbacks=[loss_tm, model_ckpt], dataset_sink_mode=dataset_sink_mode)
print("====== start eval model ======", flush=True)
acc = model.eval(eval_ds)