def setUp(self):
    # enable dygraph mode
    place = paddle.CPUPlace()
    paddle.disable_static(place)

    # config seed
    paddle.seed(SEED)
    paddle.framework.random._manual_program_seed(SEED)

    # create network
    self.layer = LinearNet()
    self.loss_fn = nn.CrossEntropyLoss()
    self.sgd = opt.SGD(learning_rate=0.001,
                       parameters=self.layer.parameters())

    # create data loader
    dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
    self.loader = paddle.io.DataLoader(dataset,
                                       places=place,
                                       batch_size=BATCH_SIZE,
                                       shuffle=True,
                                       drop_last=True,
                                       num_workers=0)

    # train
    train(self.layer, self.loader, self.loss_fn, self.sgd)

    # save
    self.model_path = "linear.example.model"
    paddle.jit.save(self.layer, self.model_path)
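The helpers referenced in setUp (SEED, BATCH_NUM, BATCH_SIZE, LinearNet, RandomDataset, train) are defined elsewhere in the test file. A minimal sketch of what they might look like, modeled on the standard paddle.jit.save example; the sizes and constants here are assumptions, not the project's actual definitions:

import numpy as np
import paddle
import paddle.nn as nn
import paddle.optimizer as opt

SEED = 10           # assumed value
BATCH_SIZE = 16     # assumed value
BATCH_NUM = 4       # assumed value
IMAGE_SIZE = 784    # assumed input width
CLASS_NUM = 10      # assumed number of classes

class LinearNet(nn.Layer):
    def __init__(self):
        super(LinearNet, self).__init__()
        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)

    @paddle.jit.to_static
    def forward(self, x):
        return self._linear(x)

class RandomDataset(paddle.io.Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, CLASS_NUM, (1,)).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples

def train(layer, loader, loss_fn, optimizer):
    # run one pass over the loader and return the last batch loss
    for image, label in loader():
        out = layer(image)
        loss = loss_fn(out, label)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
    return loss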
def train_worker(rank, train_config, network, config):
    # # set the parallel
    # torch.distributed.init_process_group(backend='nccl',
    #     init_method='env://', world_size=train_config.world_size, rank=rank)

    # initialize model
    net = network()

    # load pretrain model
    # backbone_dict = torch.load(train_config.init_weights)
    # del backbone_dict['state_dict']['fc.weight']
    # del backbone_dict['state_dict']['fc.bias']
    # net.resnet50.load_state_dict(backbone_dict['state_dict'])
    # net.cuda(rank)
    begin_epoch = 1

    # build optimizer
    # optimizer = SGD_bias.SGD(net.parameters(),
    optimizer = optim.SGD(
        parameters=net.parameters(),
        learning_rate=train_config.lr,
        # momentum=train_config.momentum,
        weight_decay=train_config.weight_decay)

    # optionally resume from a saved checkpoint
    if train_config.resume_weights:
        model_file = os.path.join(
            train_config.model_dir,
            'dump-{}.pth'.format(train_config.resume_weights))
        check_point = torch.load(model_file)
        net.load_state_dict(check_point['state_dict'])
        begin_epoch = train_config.resume_weights + 1

    # using distributed data parallel
    # net = torch.nn.parallel.DistributedDataParallel(
    #     net, device_ids=[rank], broadcast_buffers=False)

    # build data provider
    crowdhuman = CrowdHuman(config, if_train=True)
    data_iter = DataLoader(dataset=crowdhuman,
                           batch_size=train_config.mini_batch_size,
                           num_workers=1,
                           collate_fn=crowdhuman.merge_batch,
                           shuffle=True)

    for epoch_id in range(begin_epoch, train_config.total_epoch + 1):
        do_train_epoch(net, data_iter, optimizer, rank, epoch_id, train_config)
        if rank == 0:
            # save the model (no DDP wrapper is active above, so save
            # net.state_dict() directly rather than net.module.state_dict())
            fpath = os.path.join(train_config.model_dir,
                                 'dump-{}.pth'.format(epoch_id))
            model = dict(epoch=epoch_id,
                         state_dict=net.state_dict(),
                         optimizer=optimizer.state_dict())
            torch.save(model, fpath)
def load_and_fine_tuning(self):
    # load
    translated_layer = paddle.jit.load(self.model_path)

    # continue training the original layer
    self.layer.train()
    orig_loss = train(self.layer, self.loader, self.loss_fn, self.sgd)

    # fine-tune the loaded layer
    translated_layer.train()
    sgd = opt.SGD(learning_rate=0.001,
                  parameters=translated_layer.parameters())
    loss = train(translated_layer, self.loader, self.loss_fn, sgd)

    self.assertTrue(
        np.array_equal(orig_loss.numpy(), loss.numpy()),
        msg="original loss:\n{}\nnew loss:\n{}\n".format(
            orig_loss.numpy(), loss.numpy()))
def get_optimizer(config, parameters):
    clip = nn.ClipGradByNorm(clip_norm=config.optim.grad_clip)
    if config.optim.optimizer == 'Adam':
        return optim.Adam(parameters=parameters,
                          learning_rate=config.optim.lr,
                          weight_decay=config.optim.weight_decay,
                          beta1=config.optim.beta1,
                          beta2=0.999,
                          epsilon=config.optim.eps,
                          grad_clip=clip)
    elif config.optim.optimizer == 'RMSProp':
        return optim.RMSProp(parameters=parameters,
                             learning_rate=config.optim.lr,
                             weight_decay=config.optim.weight_decay,
                             grad_clip=clip)
    elif config.optim.optimizer == 'SGD':
        # paddle.optimizer.SGD has no momentum argument; Momentum is
        # SGD with momentum, which matches the intent here.
        return optim.Momentum(parameters=parameters,
                              learning_rate=config.optim.lr,
                              momentum=0.9,
                              grad_clip=clip)
    else:
        raise NotImplementedError('Optimizer {} not understood.'.format(
            config.optim.optimizer))
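A hypothetical call site for get_optimizer, just to show the expected config shape; the attribute names mirror the fields read above, while SimpleNamespace and the Linear model are stand-ins for the project's real config object and network:

from types import SimpleNamespace

import paddle.nn as nn

# stand-in config; the real project presumably loads this from YAML/argparse
config = SimpleNamespace(optim=SimpleNamespace(
    optimizer='Adam', lr=2e-4, weight_decay=0.0,
    beta1=0.9, eps=1e-8, grad_clip=1.0))

model = nn.Linear(16, 4)
optimizer = get_optimizer(config, model.parameters())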
# ]))
print(net)
for param in net.parameters():
    print(param)

# 3.3.4 Initialize model parameters
# set global parameter initialization
fluid.set_global_initializer(initializer.Uniform(), initializer.Constant())

# 3.3.5 Define the loss function
loss = nn.MSELoss()

# 3.3.6 Define the optimization algorithm
optimizer = optim.SGD(learning_rate=0.03, parameters=net.parameters())
print(optimizer)

# set different learning rates for different sub-networks (to be revised)
# optimizer = optim.SGD([
#     {'params': net._sub_layers1.parameters()},
#     {'params': net._sub_layers2.parameters(), 'lr': 0.01}
# ], learning_rate=0.03)

# for param_group in optimizer.param_groups:
#     param_group['lr'] *= 0.1

# 3.3.7 Train the model
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter: