img = fluid.dygraph.to_variable(dy_x_data)
label = fluid.dygraph.to_variable(y_data)

out, acc = vgg(img, label)
loss = fluid.layers.cross_entropy(out, label)
avg_loss = fluid.layers.mean(loss)

# backward() runs the backward pass through the network
avg_loss.backward()
optimizer.minimize(avg_loss)
# clear the parameter gradients so the next iteration trains correctly
vgg.clear_gradients()

all_train_iter = all_train_iter + train_parameters['train_batch_size']
all_train_iters.append(all_train_iter)
all_train_costs.append(loss.numpy()[0])
all_train_accs.append(acc.numpy()[0])

# log every batch (the modulus keeps the hook for less frequent logging)
if batch_id % 1 == 0:
    print("Loss at epoch {} step {}: {}, acc: {}".format(
        epoch_num, batch_id, avg_loss.numpy(), acc.numpy()))

draw_train_process("training", all_train_iters, all_train_costs,
                   all_train_accs, "training cost", "training acc")
draw_process("training loss", "red", all_train_iters, all_train_costs,
             "training loss")
draw_process("training acc", "green", all_train_iters, all_train_accs,
             "training acc")

# save the model parameters
fluid.save_dygraph(vgg.state_dict(), "vgg")
print("Final loss: {}".format(avg_loss.numpy()))
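# A minimal sketch (assuming the fluid 1.x dygraph API used above) of
# restoring the parameters saved with fluid.save_dygraph; "vgg" is the file
# prefix from the save call, and the VGG() constructor is a placeholder for
# however the network above was built.
with fluid.dygraph.guard():
    model = VGG()                                # hypothetical constructor
    param_dict, _ = fluid.load_dygraph("vgg")    # (parameters, optimizer state)
    model.load_dict(param_dict)
    model.eval()                                 # switch to inference mode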
# Backprop and optimize
loss = loss_fn(class_logits, x_class.cuda())

# DARC1 regularizer (optional); enable by uncommenting the expression
darc1_loss = 0  # 1e-3 * torch.max(torch.sum(torch.abs(class_logits), dim=0))
loss = darc1_loss + loss

optimizer.zero_grad()
loss.backward()
optimizer.step()

if (i + 1) % 100 == 0:
    print("Epoch [{}/{}], Step [{}/{}], CE Loss: {:.4f}".format(
        epoch + 1, num_epochs, i + 1, len(train_data_loader), loss.item()))

# evaluate on the test set without tracking gradients
model.eval()
with torch.no_grad():
    total = 0
    correct = 0
    for tx, tx_class in test_data_loader:
        tx = tx.cuda()  # .view(-1, img_size)
        tclass_logits = model(tx)
        # the class with the highest logit is the prediction
        _, mostprob_result = torch.max(tclass_logits, dim=1)
        total += tx.size(0)
        correct += torch.sum(mostprob_result == tx_class.cuda()).item()
print("%d/%d correct (%.2f %%)" % (correct, total, 100 * float(correct) / total))
model.train()

torch.save(model.state_dict(), f'models/SVHN_{sys.argv[1]}net.pth')
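# A minimal sketch of the optional DARC1 regularizer left commented out
# above: it penalizes the largest per-class sum of absolute logits over the
# batch. The 1e-3 weight comes from the commented expression, not from tuning.
def darc1_penalty(logits, weight=1e-3):
    # sum |logits| over the batch dimension, then take the max across classes
    return weight * torch.max(torch.sum(torch.abs(logits), dim=0))

# usage inside the training loop above:
#   loss = loss_fn(class_logits, x_class.cuda()) + darc1_penalty(class_logits)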
import datetime
import os
import time

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# VGGNet, write_print, and to_var are project-local helpers assumed to be
# importable from the surrounding code base.


class Solver(object):

    DEFAULTS = {}

    def __init__(self, version, data_loader, config, output_txt):
        """
        Initializes a Solver object
        """
        # data loader
        self.__dict__.update(Solver.DEFAULTS, **config)
        self.version = version
        self.data_loader = data_loader
        self.output_txt = output_txt

        self.build_model()

        # start with a pre-trained model
        if self.pretrained_model:
            self.load_pretrained_model()

    def build_model(self):
        """
        Instantiates the model, loss criterion, and optimizer
        """
        # instantiate model
        self.model = VGGNet(self.config,
                            self.use_batch_norm,
                            self.input_channels,
                            self.class_count,
                            self.init_weights)

        # instantiate loss criterion
        self.criterion = nn.CrossEntropyLoss()

        # instantiate optimizer
        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=self.lr,
                                   momentum=self.momentum,
                                   weight_decay=self.weight_decay)

        # print network
        self.print_network(self.model, 'VGGNet')

        # use gpu if enabled
        if torch.cuda.is_available() and self.use_gpu:
            self.model.cuda()
            self.criterion.cuda()

    def print_network(self, model, name):
        """
        Prints the structure of the network and the total number of parameters
        """
        num_params = 0
        for p in model.parameters():
            num_params += p.numel()
        write_print(self.output_txt, name)
        write_print(self.output_txt, str(model))
        write_print(self.output_txt,
                    'The number of parameters: {}'.format(num_params))

    def load_pretrained_model(self):
        """
        Loads a pre-trained model from a .pth file
        """
        self.model.load_state_dict(torch.load(os.path.join(
            self.model_save_path,
            '{}.pth'.format(self.pretrained_model))))
        write_print(self.output_txt,
                    'loaded trained model {}'.format(self.pretrained_model))

    def print_loss_log(self, start_time, iters_per_epoch, e, i, loss):
        """
        Prints the loss and elapsed time for each epoch
        """
        total_iter = self.num_epochs * iters_per_epoch
        cur_iter = e * iters_per_epoch + i

        elapsed = time.time() - start_time
        total_time = (total_iter - cur_iter) * elapsed / (cur_iter + 1)
        epoch_time = (iters_per_epoch - i) * elapsed / (cur_iter + 1)

        epoch_time = str(datetime.timedelta(seconds=epoch_time))
        total_time = str(datetime.timedelta(seconds=total_time))
        elapsed = str(datetime.timedelta(seconds=elapsed))

        log = "Elapsed {}/{} -- {}, Epoch [{}/{}], Iter [{}/{}], " \
              "loss: {:.4f}".format(elapsed, epoch_time, total_time,
                                    e + 1, self.num_epochs, i + 1,
                                    iters_per_epoch, loss)

        write_print(self.output_txt, log)

    def save_model(self, e):
        """
        Saves the model after epoch e
        """
        path = os.path.join(self.model_save_path,
                            '{}/{}.pth'.format(self.version, e + 1))
        torch.save(self.model.state_dict(), path)

    def model_step(self, images, labels):
        """
        A step for each iteration
        """
        # set model in training mode
        self.model.train()

        # empty the gradients of the model through the optimizer
        self.optimizer.zero_grad()

        # forward pass
        output = self.model(images)

        # compute loss
        loss = self.criterion(output, labels.squeeze())

        # compute gradients using back propagation
        loss.backward()

        # update parameters
        self.optimizer.step()

        # return loss
        return loss

    def train(self):
        """
        Training process
        """
        self.losses = []
        self.top_1_acc = []
        self.top_5_acc = []
        iters_per_epoch = len(self.data_loader)

        # start with a trained model if it exists
        if self.pretrained_model:
            start = int(self.pretrained_model.split('/')[-1])
        else:
            start = 0

        # start training
        start_time = time.time()
        for e in range(start, self.num_epochs):
            for i, (images, labels) in enumerate(tqdm(self.data_loader)):
                images = to_var(images, self.use_gpu)
                labels = to_var(torch.LongTensor(labels), self.use_gpu)
                loss = self.model_step(images, labels)

            # print out loss log
            if (e + 1) % self.loss_log_step == 0:
                self.print_loss_log(start_time, iters_per_epoch, e, i, loss)
                self.losses.append((e, loss))

            # save model
            if (e + 1) % self.model_save_step == 0:
                self.save_model(e)

            # evaluate on train dataset
            # if (e + 1) % self.train_eval_step == 0:
            #     top_1_acc, top_5_acc = self.train_evaluate(e)
            #     self.top_1_acc.append((e, top_1_acc))
            #     self.top_5_acc.append((e, top_5_acc))

        # print losses
        write_print(self.output_txt, '\n--Losses--')
        for e, loss in self.losses:
            write_print(self.output_txt, str(e) + ' {:.4f}'.format(loss))

        # print top_1_acc
        write_print(self.output_txt, '\n--Top 1 accuracy--')
        for e, acc in self.top_1_acc:
            write_print(self.output_txt, str(e) + ' {:.4f}'.format(acc))

        # print top_5_acc
        write_print(self.output_txt, '\n--Top 5 accuracy--')
        for e, acc in self.top_5_acc:
            write_print(self.output_txt, str(e) + ' {:.4f}'.format(acc))

    def eval(self, data_loader):
        """
        Returns the count of top 1 and top 5 predictions
        """
        # set the model to eval mode
        self.model.eval()

        top_1_correct = 0
        top_5_correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in data_loader:
                images = to_var(images, self.use_gpu)
                labels = to_var(torch.LongTensor(labels), self.use_gpu)
                output = self.model(images)
                total += labels.size()[0]

                # top 1: get the max for each instance in the batch
                _, top_1_output = torch.max(output.data, dim=1)
                top_1_correct += torch.sum(
                    torch.eq(labels.squeeze(), top_1_output))

                # top 5
                _, top_5_output = torch.topk(output.data, k=5, dim=1)
                for i, label in enumerate(labels):
                    if label in top_5_output[i]:
                        top_5_correct += 1

        return top_1_correct.item(), top_5_correct, total

    def train_evaluate(self, e):
        """
        Evaluates the performance of the model using the train dataset
        """
        top_1_correct, top_5_correct, total = self.eval(self.data_loader)
        log = "Epoch [{}/{}]--top_1_acc: {:.4f}--top_5_acc: {:.4f}".format(
            e + 1, self.num_epochs,
            top_1_correct / total,
            top_5_correct / total)
        write_print(self.output_txt, log)
        return top_1_correct / total, top_5_correct / total

    def test(self):
        """
        Evaluates the performance of the model using the test dataset
        """
        top_1_correct, top_5_correct, total = self.eval(self.data_loader)
        log = "top_1_acc: {:.4f}--top_5_acc: {:.4f}".format(
            top_1_correct / total,
            top_5_correct / total)
        write_print(self.output_txt, log)
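# A hedged usage sketch of the Solver above. Every config key mirrors an
# attribute the class reads (self.lr, self.momentum, ...); the concrete
# values and the train_loader construction are illustrative assumptions,
# not taken from this file.
config = {
    'config': 'VGG16',        # architecture spec passed through to VGGNet (placeholder)
    'use_batch_norm': True,
    'input_channels': 3,
    'class_count': 10,
    'init_weights': True,
    'lr': 1e-2,
    'momentum': 0.9,
    'weight_decay': 5e-4,
    'use_gpu': True,
    'pretrained_model': None,
    'num_epochs': 100,
    'loss_log_step': 1,
    'model_save_step': 10,
    'model_save_path': 'models',
}
solver = Solver(version='v1',
                data_loader=train_loader,   # assumed torch DataLoader
                config=config,
                output_txt='train_log.txt')
solver.train()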
        # log per-parameter gradient and weight statistics to TensorBoard
        for name, parameter in vggnet.named_parameters():
            if parameter.grad is not None:
                avg_grad = torch.mean(parameter.grad)
                print('\tavg_grad for {} = {:.6f}'.format(name, avg_grad))
                tbwriter.add_scalar('avg_grad/{}'.format(name),
                                    avg_grad.item(), total_steps)
                tbwriter.add_histogram('grad/{}'.format(name),
                                       parameter.grad.cpu().numpy(),
                                       total_steps)

            if parameter.data is not None:
                avg_weight = torch.mean(parameter.data)
                print('\tavg_weight for {} = {:.6f}'.format(name, avg_weight))
                tbwriter.add_scalar('avg_weight/{}'.format(name),
                                    avg_weight.item(), total_steps)
                tbwriter.add_histogram('weight/{}'.format(name),
                                       parameter.data.cpu().numpy(),
                                       total_steps)
        print()

        total_steps += 1

    # save a checkpoint after each epoch
    cpt_path = os.path.join(CPT_DIR, 'checkpoint_e{}.pkl'.format(epoch + 1))
    torch.save({
        'epoch': epoch,
        'model': vggnet.state_dict(),
        'optimizer': optimizer.state_dict(),
        'seed': seed,
        'total_steps': total_steps,
    }, cpt_path)

tbwriter.close()
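# A minimal sketch of resuming from one of the checkpoints written above.
# The dict keys match the torch.save call; the file name is one instance of
# the 'checkpoint_e{}.pkl' pattern, and the start_epoch/seed handling is an
# assumption about how the surrounding training loop would consume them.
checkpoint = torch.load(os.path.join(CPT_DIR, 'checkpoint_e10.pkl'))
vggnet.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
torch.manual_seed(checkpoint['seed'])
total_steps = checkpoint['total_steps']
start_epoch = checkpoint['epoch'] + 1   # continue with the next epoch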