def train():
    """Run adversarial training of two generators against two discriminators.

    Reads the run configuration through ``get_args("train")``, builds the
    dataset, both generator/discriminator pairs and their Adam optimizers,
    then alternates one generator step and one discriminator step per batch
    for ``args.max_epoch`` epochs. Progress is reported through ``Reporter``.

    When ``args.need_profiler`` is set, a MindSpore ``Profiler`` is created up
    front and training stops after the first epoch once the profile has been
    analysed.
    """
    args = get_args("train")
    if args.need_profiler:
        # Imported lazily so the profiler module is only needed when requested.
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    dataset = create_dataset(args)

    # Networks: one generator and one discriminator per direction.
    gen_a = get_generator(args)
    gen_b = get_generator(args)
    dis_a = get_discriminator(args)
    dis_b = get_discriminator(args)
    load_ckpt(args, gen_a, gen_b, dis_a, dis_b)

    # Pools the generated images pass through before reaching the
    # discriminator step (see the ``query`` calls below).
    pool_a = ImagePool(args.pool_size)
    pool_b = ImagePool(args.pool_size)

    generator = Generator(gen_a, gen_b, args.lambda_idt > 0)
    loss_D = DiscriminatorLoss(args, dis_a, dis_b)
    loss_G = GeneratorLoss(args, generator, dis_a, dis_b)

    optimizer_G = nn.Adam(generator.trainable_params(),
                          get_lr(args),
                          beta1=args.beta1)
    optimizer_D = nn.Adam(loss_D.trainable_params(),
                          get_lr(args),
                          beta1=args.beta1)

    net_G = TrainOneStepG(loss_G, generator, optimizer_G)
    net_D = TrainOneStepD(loss_D, optimizer_D)

    batches = dataset.create_dict_iterator()
    reporter = Reporter(args)
    reporter.info('==========start training===============')

    for _epoch in range(args.max_epoch):
        reporter.epoch_start()
        for batch in batches:
            real_a, real_b = batch["image_A"], batch["image_B"]

            # Generator step; its first two outputs are the generated images.
            gen_out = net_G(real_a, real_b)
            fake_a = gen_out[0]
            fake_b = gen_out[1]

            # Discriminator step on real images and pooled generated images.
            dis_out = net_D(real_a, real_b,
                            pool_a.query(fake_a),
                            pool_b.query(fake_b))

            reporter.step_end(gen_out, dis_out)
            reporter.visualizer(real_a, real_b, fake_a, fake_b)
        reporter.epoch_end(net_G)

        if args.need_profiler:
            # Profiling runs cover a single epoch only.
            profiler.analyse()
            break

    reporter.info('==========end training===============')
def train():
    """Build the OpenPose training network and run training.

    Uses the module-level ``args`` and ``params`` objects: the output
    directory, rank, epoch count, loss scale and learning-rate steps are
    written onto ``args``; in the multi-device case ``params`` is also updated
    with its ``*_NP`` variants.

    Returns:
        int: always 0, both after a completed run and when one of the
        training data paths does not exist (a message is printed in the
        latter case).

    Raises:
        ValueError: if ``params['optimizer']`` or ``params['train_type']`` is
        not one of the supported values.
    """
    args.outputs_dir = params['save_model_path']

    distributed = args.group_size > 1
    if distributed:
        init()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        args.outputs_dir = os.path.join(
            args.outputs_dir, "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()

        # Multi-device runs use their own schedule and half the loss scale.
        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = [int(step) for step in params["lr_steps_NP"].split(',')]
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = [int(step) for step in params["lr_steps"].split(',')]

    # ---- network ----
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'],
                          vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # ---- dataset ----
    if not (os.path.exists(args.jsonpath_train)
            and os.path.exists(args.imgpath_train)
            and os.path.exists(args.maskpath_train)):
        print('Error: wrong data path')
        return 0
    print('start create dataset')

    num_worker = 20 if distributed else 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # ---- learning-rate schedules (one per parameter group) ----
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # ---- optimizer ----
    use_groups = bool(params['group_params'])
    if use_groups:
        # Split trainable parameters by name so each group gets its own lr.
        vgg19_base_params = [p for p in train_net.trainable_params()
                             if 'base.vgg_base' in p.name]
        base_params = [p for p in train_net.trainable_params()
                       if 'base.conv' in p.name]
        stages_params = [p for p in train_net.trainable_params()
                         if 'base' not in p.name]
        opt_params = [
            {'params': vgg19_base_params, 'lr': lr_vgg},
            {'params': base_params, 'lr': lr_base},
            {'params': stages_params, 'lr': lr_stage},
        ]
    else:
        opt_params = train_net.trainable_params()

    optimizer_name = params['optimizer']
    if optimizer_name == "Momentum":
        opt = Momentum(opt_params, learning_rate=lr_stage, momentum=0.9)
    elif optimizer_name == "Adam":
        if use_groups:
            # Every group carries its own 'lr', so no global rate is passed.
            opt = Adam(opt_params)
        else:
            opt = Adam(opt_params, learning_rate=lr_stage)
    else:
        raise ValueError("optimizer not support.")

    # ---- callbacks ----
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    callback_list = [MyLossMonitor(), time_cb]
    if args.rank == 0:
        # Only rank 0 writes checkpoints.
        callback_list.append(ckpoint_cb)

    # ---- training wrapper ----
    train_type = params['train_type']
    if train_type not in ('clip_grad', 'fix_loss_scale'):
        raise ValueError("Type {} is not support.".format(
            params['train_type']))

    if train_type == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net,
                                                     opt,
                                                     sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    else:
        # NOTE(review): the optimizers above are built without a loss_scale
        # argument while drop_overflow_update=False is used here -- confirm
        # this combination is intended.
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager)

    print("============== Starting Training ==============")
    model.train(args.max_epoch,
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)
    return 0
def main():
    """Main entrance for training.

    Parses the command line, configures the MindSpore context (single device
    or data-parallel on GPU/Ascend), builds the tinynet model, the validation
    and training datasets, the learning-rate schedule, the optimizer and the
    callbacks, and finally calls ``model.train``.

    Raises:
        AssertionError: if ``args.model`` does not start with ``"tinynet"`` or
            ``args.ema_decay`` is not positive.
        ValueError: if ``args.train`` is not set (no training dataset can be
            built) or ``args.opt`` is neither ``'sgd'`` nor ``'rmsprop'``.
    """
    args = parser.parse_args()
    print(sys.argv)
    devid, args.rank_id, args.rank_size = 0, 0, 1

    context.set_context(mode=context.GRAPH_MODE)

    if args.distributed:
        if args.GPU:
            init("nccl")
            context.set_context(device_target='GPU')
        else:
            init()
            devid = int(os.getenv('DEVICE_ID'))
            context.set_context(device_target='Ascend',
                                device_id=devid,
                                reserve_class_name_in_scope=False)
        context.reset_auto_parallel_context()
        args.rank_id = get_rank()
        args.rank_size = get_group_size()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=args.rank_size)
    else:
        if args.GPU:
            context.set_context(device_target='GPU')

    # Only the master process prints and registers callbacks.
    is_master = not args.distributed or (args.rank_id == 0)

    # parse model argument
    assert args.model.startswith(
        "tinynet"), "Only Tinynet models are supported."
    _, sub_name = args.model.split("_")
    net = tinynet(sub_model=sub_name,
                  num_classes=args.num_classes,
                  drop_rate=args.drop,
                  drop_connect_rate=args.drop_connect,
                  global_pool="avg",
                  bn_tf=args.bn_tf,
                  bn_momentum=args.bn_momentum,
                  bn_eps=args.bn_eps)

    if is_master:
        print("Total number of parameters:", count_params(net))

    # input image size of the network
    input_size = net.default_cfg['input_size'][1]

    train_data_url = os.path.join(args.data_path, 'train')
    val_data_url = os.path.join(args.data_path, 'val')
    val_dataset = create_dataset_val(args.batch_size,
                                     val_data_url,
                                     workers=args.workers,
                                     distributed=False,
                                     input_size=input_size)

    if not args.train:
        # Everything below needs the training dataset and its size; without
        # it the original code failed later on an unbound local variable.
        raise ValueError(
            "args.train must be set: the training dataset is required to "
            "build the learning-rate schedule and to run model.train().")
    train_dataset = create_dataset(args.batch_size,
                                   train_data_url,
                                   workers=args.workers,
                                   distributed=args.distributed,
                                   input_size=input_size)
    batches_per_epoch = train_dataset.get_dataset_size()

    loss = LabelSmoothingCrossEntropy(smooth_factor=args.smoothing,
                                      num_classes=args.num_classes)
    time_cb = TimeMonitor(data_size=batches_per_epoch)
    loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                               drop_overflow_update=False)

    lr_array = get_lr(base_lr=args.lr,
                      total_epochs=args.epochs,
                      steps_per_epoch=batches_per_epoch,
                      decay_epochs=args.decay_epochs,
                      decay_rate=args.decay_rate,
                      warmup_epochs=args.warmup_epochs,
                      warmup_lr_init=args.warmup_lr,
                      global_epoch=0)
    lr = Tensor(lr_array)

    loss_cb = LossMonitor(lr_array,
                          args.epochs,
                          per_print_times=args.per_print_times,
                          start_epoch=0)

    param_group = add_weight_decay(net, weight_decay=args.weight_decay)

    if args.opt == 'sgd':
        if is_master:
            print('Using SGD optimizer')
        optimizer = SGD(param_group,
                        learning_rate=lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        loss_scale=args.loss_scale)
    elif args.opt == 'rmsprop':
        if is_master:
            print('Using rmsprop optimizer')
        optimizer = RMSProp(param_group,
                            learning_rate=lr,
                            decay=0.9,
                            weight_decay=args.weight_decay,
                            momentum=args.momentum,
                            epsilon=args.opt_eps,
                            loss_scale=args.loss_scale)
    else:
        # Fail here with a clear message instead of an unbound `optimizer`
        # when the Model is constructed below.
        raise ValueError(
            "Unsupported optimizer {!r}; expected 'sgd' or 'rmsprop'.".format(
                args.opt))

    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {
        'Validation-Loss': Loss(),
        'Top1-Acc': Top1CategoricalAccuracy(),
        'Top5-Acc': Top5CategoricalAccuracy()
    }

    if args.ckpt:
        ckpt = load_checkpoint(args.ckpt)
        load_param_into_net(net, ckpt)
        net.set_train(False)

    model = Model(net,
                  loss,
                  optimizer,
                  metrics=eval_metrics,
                  loss_scale_manager=loss_scale_manager,
                  amp_level=args.amp_level)

    # A deep copy of the network is handed to the EMA evaluation callback.
    net_ema = copy.deepcopy(net)
    net_ema.set_train(False)
    assert args.ema_decay > 0, "EMA should be used in tinynet training."

    ema_cb = EmaEvalCallBack(network=net,
                             ema_network=net_ema,
                             loss_fn=loss,
                             eval_dataset=val_dataset,
                             decay=args.ema_decay,
                             save_epoch=args.ckpt_save_epoch,
                             dataset_sink_mode=args.dataset_sink,
                             start_epoch=0)

    callbacks = [loss_cb, ema_cb, time_cb] if is_master else []

    if is_master:
        print("Training on " + args.model + " with " +
              str(args.num_classes) + " classes")

    model.train(args.epochs,
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=args.dataset_sink)
"============ Precision is lower than expected when using vanilla RNN architecture ===========" ) embedding_table = np.loadtxt( os.path.join(cfg.preprocess_path, "weight.txt")).astype(np.float32) network = textrcnn(weight=Tensor(embedding_table), vocab_size=embedding_table.shape[0], cell=cfg.cell, batch_size=cfg.batch_size) ds_train = create_dataset(cfg.preprocess_path, cfg.batch_size, True) step_size = ds_train.get_dataset_size() loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True) lr = get_lr(cfg, step_size) num_epochs = cfg.num_epochs if cfg.cell == "lstm": num_epochs = cfg.lstm_num_epochs opt = nn.Adam(params=network.trainable_params(), learning_rate=lr) loss_cb = LossMonitor() time_cb = TimeMonitor() model = Model(network, loss, opt, {'acc': Accuracy()}, amp_level="O3") print("============== Starting Training ==============") config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix=cfg.cell,
def _split_episode(episode, k):
    """Move one episode to the GPU and split it into shot and query parts.

    Args:
        episode: the ``(data, label)`` pair yielded by the data loader.
        k: number of shot samples (``nway * kshot``) at the front of the
            episode; everything after them is the query set.

    Returns:
        tuple: ``(data_shot, data_query, labels)`` where ``labels`` is a CUDA
        tensor holding, for every query sample, the index of its class among
        the sorted distinct shot labels.

    Raises:
        ValueError: if a query label does not occur among the shot labels.
    """
    data, label = [_.cuda() for _ in episode]

    # note! data_shot shape is ( nway * kshot, 3, h, w ) not
    # ( kshot * nway, 3, h, w ). Take care when reshaping the shot data.
    data_shot, data_query = data[:k], data[k:]
    label_shot, label_query = label[:k], label[k:]

    # Re-index the original class ids to consecutive values starting at 0.
    classes = sorted(set(label_shot.tolist()))
    labels = [classes.index(query_label) for query_label in label_query.tolist()]
    return data_shot, data_query, torch.tensor(labels).cuda()


def train(args):
    """Train a few-shot classification model episode by episode.

    Builds the training and validation loaders, creates the model (optionally
    restoring ``args.restore_ckpt``), and runs ``TOTAL`` outer iterations over
    the training loader with SGD and a step learning-rate schedule. Every
    ``PRINT_FREQ`` iterations the running loss/accuracy are printed and reset,
    every ``VAL_FREQ`` iterations the model is evaluated on the validation
    loader, and every ``SAVE_FREQ`` iterations a checkpoint is written to
    ``checkpoints/<iteration>_<args.name>.pth``.

    Args:
        args: parsed options; this function reads ``nway``, ``kshot``,
            ``query``, ``dpath``, ``restore_ckpt``, ``test_mode``, ``mymode``
            and ``name``, and passes ``args`` on to ``loss_mode``.
    """
    import os  # local import: only needed to create the checkpoint directory

    # the number of N way, K shot images
    k = args.nway * args.kshot

    # Train data loading
    dataset = Dataset(args.dpath, state='train')
    train_sampler = Train_Sampler(dataset._labels,
                                  n_way=args.nway,
                                  k_shot=args.kshot,
                                  query=args.query)
    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=train_sampler,
                             num_workers=4,
                             pin_memory=True)

    # Validation data loading
    val_dataset = Dataset(args.dpath, state='val')
    val_sampler = Sampler(val_dataset._labels,
                          n_way=args.nway,
                          k_shot=args.kshot,
                          query=args.query)
    val_data_loader = DataLoader(dataset=val_dataset,
                                 batch_sampler=val_sampler,
                                 num_workers=4,
                                 pin_memory=True)

    # TODO 1.a: the few-shot model itself lives in 'model.py'.
    model = FewShotModel_ensemble()

    # pretrained model load
    if args.restore_ckpt is not None:
        state_dict = torch.load(args.restore_ckpt)
        model.load_state_dict(state_dict)

    model.cuda()
    model.train()

    if args.test_mode == 1:
        Test_phase(model, args, k)

    # TODO 1.b (optional): optimizer and scheduler for few-shot training.
    optimizer = torch.optim.SGD(model.parameters(), lr=4e-3, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=100,
                                                gamma=0.95)
    print('Loss mode: ', args.mymode)

    tl = Averager()  # save average loss
    ta = Averager()  # save average accuracy

    # training start
    print('train start')
    for i in range(TOTAL):
        for episode in data_loader:
            optimizer.zero_grad()
            data_shot, data_query, labels = _split_episode(episode, k)

            # TODO 2: loss_mode (in "src/utils.py") runs the model on the
            # episode and returns the logits and the scalar training loss.
            logits, loss = loss_mode(args, model, data_shot, data_query,
                                     labels)

            acc = count_acc(logits, labels)
            tl.add(loss.item())
            ta.add(acc)

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Drop the references before the next episode (presumably to
            # release the tensors early).
            logits = None
            loss = None

        if (i + 1) % PRINT_FREQ == 0:
            print('train {}, lr={:.4e} loss={:.4f} acc={:.4f}'.format(
                i + 1, get_lr(optimizer), tl.item(), ta.item()))

            # restart the running loss and accuracy means
            tl = Averager()
            ta = Averager()

        # validation start
        if (i + 1) % VAL_FREQ == 0:
            print('validation start')
            model.eval()
            with torch.no_grad():
                vl = Averager()  # save average loss
                va = Averager()  # save average accuracy
                for _ in range(VAL_TOTAL):
                    for episode in val_data_loader:
                        data_shot, data_query, labels = _split_episode(
                            episode, k)

                        # Same forward/loss computation as in training.
                        logits, loss = loss_mode(args, model, data_shot,
                                                 data_query, labels)

                        acc = count_acc(logits, labels)
                        vl.add(loss.item())
                        va.add(acc)

                        logits = None
                        loss = None

                print('val accuracy mean : %.4f' % va.item())
                print('val loss mean : %.4f' % vl.item())
            model.train()

        if (i + 1) % SAVE_FREQ == 0:
            # torch.save does not create missing directories.
            os.makedirs('checkpoints', exist_ok=True)
            PATH = 'checkpoints/%d_%s.pth' % (i + 1, args.name)
            torch.save(model.state_dict(), PATH)
            # Report the same 1-based iteration number used in the file name.
            print('model saved, iteration : %d' % (i + 1))
def train():
    """Build the OpenPose training network and run training with Adam.

    Parses the command line with ``parse_args()``, derives the per-rank
    output directory, loss scale and learning-rate steps from the
    module-level ``params``, builds the network, dataset, grouped Adam
    optimizer and callbacks, and calls ``model.train`` for
    ``params["max_epoch_train"]`` epochs.

    Side effects:
        ``params['ckpt_interval']`` is raised to at least one epoch's worth
        of steps.

    Raises:
        FileNotFoundError: if any of the training json/image/mask paths does
            not exist.
    """
    args = parse_args()

    args.outputs_dir = params['save_model_path']
    distributed = args.group_size > 1
    if distributed:
        init()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        args.outputs_dir = os.path.join(
            args.outputs_dir, "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()

        # Multi-device runs use half the loss scale and their own lr steps.
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset
    data_paths = (args.jsonpath_train, args.imgpath_train,
                  args.maskpath_train)
    missing = [str(path) for path in data_paths if not os.path.exists(path)]
    if missing:
        # Previously only the message was printed and dataset creation was
        # still attempted with the bad paths; stop here instead.
        print('Error: wrong data path')
        raise FileNotFoundError(
            "training data path(s) not found: {}".format(", ".join(missing)))
    print('start create dataset')

    num_worker = 20 if distributed else 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler: one schedule per parameter group
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       params["max_epoch_train"],
                                       args.lr_steps,
                                       args.group_size)

    # Split trainable parameters by name so each group gets its own lr.
    vgg19_base_params = list(
        filter(lambda x: 'base.vgg_base' in x.name,
               train_net.trainable_params()))
    base_params = list(
        filter(lambda x: 'base.conv' in x.name,
               train_net.trainable_params()))
    stages_params = list(
        filter(lambda x: 'base' not in x.name,
               train_net.trainable_params()))
    group_params = [
        {'params': vgg19_base_params, 'lr': lr_vgg},
        {'params': base_params, 'lr': lr_base},
        {'params': stages_params, 'lr': lr_stage},
    ]

    # The optimizer and the loss-scale manager share the same loss scale.
    opt = Adam(group_params, loss_scale=args.loss_scale)
    train_net.set_train(True)
    loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                               drop_overflow_update=False)
    model = Model(train_net,
                  optimizer=opt,
                  loss_scale_manager=loss_scale_manager)

    # Never checkpoint more often than once per epoch.
    params['ckpt_interval'] = max(steps_per_epoch, params['ckpt_interval'])
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]

    print("============== Starting Training ==============")
    model.train(params["max_epoch_train"],
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)