def train_on_gpu():
    config = config_gpu_quant if args_opt.quantization_aware else config_gpu
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))

    # define network
    network = mobilenetV2(num_classes=config.num_classes)

    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')

    # define dataset
    epoch_size = config.epoch_size
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # resume
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(network, param_dict)

    # convert fusion network to quantization aware network
    if config.quantization_aware:
        network = quant.convert_quant_network(network,
                                              bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, True])

    # get learning rate
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                       lr_init=0,
                       lr_end=0,
                       lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size + config.start_epoch,
                       steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()),
                      lr,
                      config.momentum,
                      config.weight_decay,
                      config.loss_scale)

    # define model
    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

    print("============== Starting Training ==============")
    callback = [Monitor(lr_init=lr.asnumpy())]
    ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="mobilenetV2", directory=ckpt_save_dir, config=config_ck)
        callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
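All of the training scripts in this section build the learning rate as a per-step array through a model-specific `get_lr` helper and then wrap it in a `Tensor`. The helpers differ between models, so the following is only a minimal sketch, assuming linear warmup from `lr_init` to `lr_max` followed by cosine decay to `lr_end`; the name `get_lr_sketch` and the exact decay shape are illustrative, not the ModelZoo implementation.

# Minimal sketch of a warmup + decay schedule in the spirit of the get_lr(...)
# calls above. Assumptions: linear warmup, cosine decay; real helpers may differ.
import math
import numpy as np

def get_lr_sketch(global_step, lr_init, lr_end, lr_max,
                  warmup_epochs, total_epochs, steps_per_epoch):
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            # linear warmup towards lr_max
            lr = lr_init + (lr_max - lr_init) * (step + 1) / warmup_steps
        else:
            # cosine decay from lr_max down to lr_end
            progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
            lr = lr_end + (lr_max - lr_end) * 0.5 * (1 + math.cos(math.pi * progress))
        lr_each_step.append(lr)
    # drop the steps already consumed when resuming from a non-zero start epoch
    return np.array(lr_each_step[global_step:], dtype=np.float32)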
switch_precision(net, mstype.float16, config)

# define loss
if config.label_smooth > 0:
    loss = CrossEntropyWithLabelSmooth(
        smooth_factor=config.label_smooth, num_classes=config.num_classes)
else:
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

epoch_size = config.epoch_size

# get learning rate
lr = Tensor(get_lr(global_step=0,
                   lr_init=config.lr_init,
                   lr_end=config.lr_end,
                   lr_max=config.lr_max,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=epoch_size,
                   steps_per_epoch=step_size))

if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer != "backbone":
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                   config.weight_decay, config.loss_scale)

    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

    cb = config_ckpoint(config, lr, step_size)
    print("============== Starting Training ==============")
    model.train(epoch_size, dataset, callbacks=cb)
    print("============== End Training ==============")
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          gradients_mean=True)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, num_classes=args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('backbone {} is not implemented'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                  metrics={'acc'}, amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
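The optimizer above is handed `get_param_groups(network)` rather than a flat parameter list; that helper is not shown here. A plausible sketch, assuming the same "no weight decay on bias/gamma/beta" rule that the InceptionV3 and ResNet snippets below apply by hand, could look like the following (`get_param_groups_sketch` is a hypothetical name, not the ModelZoo function):

# Hypothetical sketch of a parameter-group builder: BatchNorm (gamma, beta) and
# bias parameters are excluded from weight decay, everything else keeps the
# optimizer's default weight_decay. The real helper may filter differently.
def get_param_groups_sketch(network):
    decay_params = []
    no_decay_params = []
    for param in network.trainable_params():
        if 'beta' in param.name or 'gamma' in param.name or 'bias' in param.name:
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [{'params': no_decay_params, 'weight_decay': 0.0},
            {'params': decay_params}]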
    residual_channels=hparams.residual_channels,
    gate_channels=hparams.gate_channels,
    skip_out_channels=hparams.skip_out_channels,
    cin_channels=hparams.cin_channels,
    gin_channels=hparams.gin_channels,
    n_speakers=hparams.n_speakers,
    dropout=hparams.dropout,
    kernel_size=hparams.kernel_size,
    cin_pad=hparams.cin_pad,
    upsample_conditional_features=hparams.upsample_conditional_features,
    upsample_params=upsample_params,
    scalar_input=is_scalar_input(hparams.input_type),
    output_distribution=hparams.output_distribution,
)
loss_net = NetWithLossClass(model, hparams)
lr = get_lr(hparams.optimizer_params["lr"], hparams.nepochs, step_size_per_epoch)
lr = Tensor(lr)

if args.checkpoint != '':
    param_dict = load_checkpoint(args.pre_trained_model_path)
    load_param_into_net(model, param_dict)
    print('Successfully loaded the pre-trained model')

weights = model.trainable_params()
optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)

train_net = TrainOneStepCell(loss_net, optimizer)
model = Model(train_net)
lr_cb = Monitor(lr)
callback_list = [lr_cb]
if args.is_distributed:
                         platform=args_opt.platform,
                         repeat_num=1,
                         batch_size=config_gpu.batch_size)
step_size = dataset.get_dataset_size()

# resume
if args_opt.pre_trained:
    param_dict = load_checkpoint(args_opt.pre_trained)
    load_param_into_net(net, param_dict)

# define optimizer
loss_scale = FixedLossScaleManager(config_gpu.loss_scale, drop_overflow_update=False)
lr = Tensor(get_lr(global_step=0,
                   lr_init=0,
                   lr_end=0,
                   lr_max=config_gpu.lr,
                   warmup_epochs=config_gpu.warmup_epochs,
                   total_epochs=epoch_size,
                   steps_per_epoch=step_size))
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
               lr,
               config_gpu.momentum,
               config_gpu.weight_decay,
               config_gpu.loss_scale)

# define model
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

cb = [Monitor(lr_init=lr.asnumpy())]
ckpt_save_dir = config_gpu.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
def train_on_ascend():
    config = config_ascend_quant
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(
        rank_id, device_id, rank_size))
    epoch_size = config.epoch_size

    # distribute init
    if run_distribute:
        context.set_auto_parallel_context(device_num=rank_size,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()

    # define network
    network = mobilenetV2(num_classes=config.num_classes)

    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # load pre trained ckpt
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])

    # get learning rate
    lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                       lr_init=0,
                       lr_end=0,
                       lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size + config.start_epoch,
                       steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()),
                      lr,
                      config.momentum,
                      config.weight_decay)

    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    callback = None
    if rank_id == 0:
        callback = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                         keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV2",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
                         cfg.group_size)
batches_per_epoch = dataset.get_dataset_size()

# network
net = InceptionV3(num_classes=cfg.num_classes)

# loss
loss = CrossEntropy(smooth_factor=cfg.smooth_factor,
                    num_classes=cfg.num_classes,
                    factor=cfg.aux_factor)

# learning rate schedule
lr = get_lr(lr_init=cfg.lr_init,
            lr_end=cfg.lr_end,
            lr_max=cfg.lr_max,
            warmup_epochs=cfg.warmup_epochs,
            total_epochs=cfg.epoch_size,
            steps_per_epoch=batches_per_epoch,
            lr_decay_mode=cfg.decay_method)
lr = Tensor(lr)

# optimizer
decayed_params = []
no_decayed_params = []
for param in net.trainable_params():
    if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
        decayed_params.append(param)
    else:
        no_decayed_params.append(param)
group_params = [{
                         rank=rank)
step_size = dataset.get_dataset_size()

# resume
if args_opt.resume:
    ckpt = load_checkpoint(args_opt.resume)
    load_param_into_net(net, ckpt)

# get learning rate
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
lr = Tensor(get_lr(lr_init=config.lr_init,
                   lr_end=config.lr_end,
                   lr_max=config.lr_max,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=config.epoch_size,
                   steps_per_epoch=step_size,
                   lr_decay_mode=config.lr_decay_mode,
                   global_step=config.finish_epoch * step_size))

# define optimization and model
if args_opt.device_target == "Ascend":
    opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay, config.loss_scale)
    model = Model(net, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'},
                  amp_level='O3',
                  keep_batchnorm_fp32=True)
cfg.group_size = 1

# dataloader
dataset = create_dataset(args_opt.dataset_path, cfg, True)
batches_per_epoch = dataset.get_dataset_size()

# network
net_with_loss = NASNetAMobileWithLoss(cfg)
if args_opt.resume:
    ckpt = load_checkpoint(args_opt.resume)
    load_param_into_net(net_with_loss, ckpt)

# learning rate schedule
lr = get_lr(lr_init=cfg.lr_init,
            lr_decay_rate=cfg.lr_decay_rate,
            num_epoch_per_decay=cfg.num_epoch_per_decay,
            total_epochs=cfg.epoch_size,
            steps_per_epoch=batches_per_epoch,
            is_stair=True)
if args_opt.resume:
    name_dir = os.path.basename(args_opt.resume)
    name, ext = name_dir.split(".")
    split_result = name.split("_")
    resume = split_result[-2].split("-")
    resume_epoch = int(resume[-1])
    step_num_in_epoch = int(split_result[-1])
    assert step_num_in_epoch == dataset.get_dataset_size(), \
        "This script only supports resuming at the end of an epoch"
    lr = lr[(dataset.get_dataset_size() * (resume_epoch - 1) + step_num_in_epoch):]
lr = Tensor(lr, mstype.float32)
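The resume branch above recovers the epoch and step directly from the checkpoint file name, which MindSpore's ModelCheckpoint typically writes as `<prefix>-<epoch>_<step>.ckpt`, and then slices the already-consumed steps off the learning-rate array. A worked example with a hypothetical file name:

# Worked example of the file-name parsing above, using a made-up checkpoint name
# in the usual "<prefix>-<epoch>_<step>.ckpt" form.
name_dir = "nasnetamobile-5_1251.ckpt"
name, ext = name_dir.split(".")            # "nasnetamobile-5_1251", "ckpt"
split_result = name.split("_")             # ["nasnetamobile-5", "1251"]
resume = split_result[-2].split("-")       # ["nasnetamobile", "5"]
resume_epoch = int(resume[-1])             # 5
step_num_in_epoch = int(split_result[-1])  # 1251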
        cell.weight.set_data(
            weight_init.initializer(weight_init.XavierUniform(),
                                    cell.weight.shape,
                                    cell.weight.dtype))
    if isinstance(cell, nn.Dense):
        cell.weight.set_data(
            weight_init.initializer(weight_init.TruncatedNormal(),
                                    cell.weight.shape,
                                    cell.weight.dtype))

# init lr
if args.net == "resnet50" or args.net == "se-resnet50":
    lr = get_lr(lr_init=config.lr_init,
                lr_end=config.lr_end,
                lr_max=config.lr_max,
                warmup_epochs=config.warmup_epochs,
                total_epochs=config.epoch_size,
                steps_per_epoch=step_size,
                lr_decay_mode=config.lr_decay_mode)
else:
    lr = warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size,
                                    config.pretrain_epoch_size * step_size)
lr = Tensor(lr)

# define opt
decayed_params = []
no_decayed_params = []
for param in net.trainable_params():
    if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
with open(config.DataConfig.labels_path) as label_file:
    labels = json.load(label_file)

ds_train = create_dataset(audio_conf=config.DataConfig.SpectConfig,
                          manifest_filepath=config.DataConfig.train_manifest,
                          labels=labels,
                          normalize=True,
                          train_mode=True,
                          batch_size=config.DataConfig.batch_size,
                          rank=rank_id,
                          group_size=group_size)
steps_size = ds_train.get_dataset_size()

lr = get_lr(lr_init=config.OptimConfig.learning_rate,
            total_epochs=config.TrainingConfig.epochs,
            steps_per_epoch=steps_size)
lr = Tensor(lr)

deepspeech_net = DeepSpeechModel(batch_size=config.DataConfig.batch_size,
                                 rnn_hidden_size=config.ModelConfig.hidden_size,
                                 nb_layers=config.ModelConfig.hidden_layers,
                                 labels=labels,
                                 rnn_type=config.ModelConfig.rnn_type,
                                 audio_conf=config.DataConfig.SpectConfig,
                                 bidirectional=True,
                                 device_target=args.device_target)
loss_net = NetWithLossClass(deepspeech_net)
weights = ParameterTuple(deepspeech_net.trainable_params())
if args_opt.pre_trained:
    param_dict = load_checkpoint(args_opt.pre_trained)
    load_param_into_net(network, param_dict)

# convert fusion network to quantization aware network
if config.quantization_aware:
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])

# get learning rate
lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                   lr_init=0,
                   lr_end=0,
                   lr_max=config.lr,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=epoch_size + config.start_epoch,
                   steps_per_epoch=step_size))

# define optimization
opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()),
                  lr,
                  config.momentum,
                  config.weight_decay)

# define model
model = Model(network, loss_fn=loss, optimizer=opt)

print("============== Starting Training ==============")
callback = None
if rank_id == 0:
    callback = [Monitor(lr_init=lr.asnumpy())]
    if config.save_checkpoint: