# one training step: forward pass, loss, backward pass, parameter update
data, labels = data.cuda(), labels.cuda()
output = model(data)
loss = F.nll_loss(F.log_softmax(output, dim=1), labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()

# periodically report the training loss and the number of errors in this batch
if (iteration + 1) % args.interval == 0:
    _, predictions = th.max(output, 1)
    predictions = th.squeeze(predictions)
    n_errors = th.sum(predictions != labels).item()
    print('batch %d training loss %f %d errors encountered' % (
        iteration + 1, loss.item(), n_errors))

# after each epoch, measure the error rate on the validation set
n_samples, n_errors = 0, 0
for batch in validation_loader:
    data, labels = batch
    if args.use_gpu:
        data, labels = data.cuda(), labels.cuda()
    _, p = th.max(model(data), 1)
    n_samples += data.size(0)
    n_errors += th.sum(p != labels).item()
error_rate = n_errors / n_samples
print('epoch %d validation error rate %f' % (epoch, error_rate))

# save the trained weights to disk
th.save(model.state_dict(), 'pretrained-cnn')
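# ---------------------------------------------------------------------------
# The fragment above presupposes a model, an optimizer, an `args` namespace and
# the surrounding epoch/iteration loops that are not shown here. The following
# is a minimal, self-contained sketch (random data, a toy linear model; every
# name below is illustrative, not part of the original script) of how such a
# step fits into a full training/validation loop.
# ---------------------------------------------------------------------------
import types

import torch as th
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

args = types.SimpleNamespace(use_gpu=th.cuda.is_available(), interval=5, n_epochs=2)
device = 'cuda' if args.use_gpu else 'cpu'

# toy 10-class problem on random 28x28 "images"
inputs, targets = th.randn(512, 28 * 28), th.randint(0, 10, (512,))
train_loader = DataLoader(TensorDataset(inputs[:448], targets[:448]), batch_size=64)
validation_loader = DataLoader(TensorDataset(inputs[448:], targets[448:]), batch_size=64)

model = nn.Linear(28 * 28, 10).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

for epoch in range(args.n_epochs):
    model.train()
    for iteration, (data, labels) in enumerate(train_loader):
        data, labels = data.to(device), labels.to(device)
        loss = F.nll_loss(F.log_softmax(model(data), dim=1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (iteration + 1) % args.interval == 0:
            print('batch %d training loss %f' % (iteration + 1, loss.item()))

    # per-epoch validation, mirroring the loop in the fragment above
    model.eval()
    n_samples, n_errors = 0, 0
    with th.no_grad():
        for data, labels in validation_loader:
            data, labels = data.to(device), labels.to(device)
            _, p = th.max(model(data), 1)
            n_samples += data.size(0)
            n_errors += (p != labels).sum().item()
    print('epoch %d validation error rate %f' % (epoch, n_errors / n_samples))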
def main():
    batch_size = 100

    # data pipeline: background threads preprocess batches and move them to the GPU
    train_data, val_data, test_data = create_train_val_test_split(batch_size)
    data_feeder = DataFeeder(train_data, preprocess_workers=1, cuda_workers=1,
                             cpu_size=10, cuda_size=10, batch_size=batch_size,
                             use_cuda=True, volatile=False)
    data_feeder.start_queue_threads()
    val_data = make_batch(len(val_data), 0, val_data, use_cuda=True, volatile=True)
    test_data = make_batch(len(test_data), 0, test_data, use_cuda=True, volatile=True)

    # the two models under comparison, each with its own optimizer
    cnn = CNN().cuda()
    fcc = FCC().cuda()
    optimizer_cnn = optim.SGD(cnn.parameters(), lr=0.001, momentum=0.9, weight_decay=0.00001)
    optimizer_fcc = optim.SGD(fcc.parameters(), lr=0.001, momentum=0.9, weight_decay=0.00001)

    # loggers that append losses/accuracies to text files
    cnn_train_loss = Logger("cnn_train_losses.txt")
    cnn_val_loss = Logger("cnn_val_losses.txt")
    cnn_val_acc = Logger("cnn_val_acc.txt")
    fcc_train_loss = Logger("fcc_train_losses.txt")
    fcc_val_loss = Logger("fcc_val_losses.txt")
    fcc_val_acc = Logger("fcc_val_acc.txt")

    # permute = Variable(torch.from_numpy(np.random.permutation(28*28)).long().cuda(), requires_grad=False)
    permute = None

    for i in range(100001):
        # one optimization step for each model on the same batch
        images, labels = data_feeder.get_batch()
        train(cnn, optimizer_cnn, images, labels, i, cnn_train_loss, permute)
        train(fcc, optimizer_fcc, images, labels, i, fcc_train_loss, permute)

        # evaluate on the validation set every 100 iterations
        if i % 100 == 0:
            print(i)
            evaluate_acc(batch_size, cnn, val_data, i, cnn_val_loss, cnn_val_acc, permute)
            evaluate_acc(batch_size, fcc, val_data, i, fcc_val_loss, fcc_val_acc, permute)

        # anneal the learning rate late in training
        if i in [70000, 90000]:
            decrease_lr(optimizer_cnn)
            decrease_lr(optimizer_fcc)

        # checkpoint both models every 1000 iterations
        if i % 1000 == 0:
            torch.save(cnn.state_dict(), "savedir/cnn_it" + str(i // 1000) + "k.pth")
            torch.save(fcc.state_dict(), "savedir/fcc_it" + str(i // 1000) + "k.pth")

    data_feeder.kill_queue_threads()

    import evaluate
    evaluate.main(permute)
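# ---------------------------------------------------------------------------
# `DataFeeder`, `Logger`, `train`, `evaluate_acc` and `decrease_lr` are project
# helpers defined elsewhere. As an illustration only (an assumed implementation,
# not necessarily the project's), `decrease_lr` could scale the learning rate of
# every parameter group in place, e.g. by a factor of 10:
def decrease_lr(optimizer, factor=0.1):
    # multiply each parameter group's learning rate by `factor` (assumed behaviour)
    for param_group in optimizer.param_groups:
        param_group['lr'] *= factor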
def train(train_file_path, val_file_path, in_channels, num_class, batch_norm,
          dropout, n_epochs, batch_size, lr, momentum, weight_decay, optim_type,
          ckpt_path, max_ckpt_save_num, ckpt_save_interval, val_interval, resume,
          device='cpu'):
    '''
    The main training procedure
    ----------------------------
    :param train_file_path: file list of training image paths and labels
    :param val_file_path: file list of validation image paths and labels
    :param in_channels: channel number of the input images
    :param num_class: number of classes; in this task it is the 26 English letters
    :param batch_norm: whether to use batch normalization in the convolutional and linear layers
    :param dropout: dropout ratio of the dropout layer, ranging from 0 to 1
    :param n_epochs: number of training epochs
    :param batch_size: training batch size
    :param lr: learning rate
    :param momentum: only used if optim_type == 'sgd'
    :param weight_decay: factor of the L2 penalty on the network weights
    :param optim_type: optimizer, one of 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
    :param ckpt_path: path to save checkpoint models
    :param max_ckpt_save_num: maximum number of checkpoint models to keep
    :param ckpt_save_interval: interval of saving checkpoint models, e.g., if ckpt_save_interval = 2,
                               a checkpoint is saved every 2 epochs
    :param val_interval: interval of validation, e.g., if val_interval = 5,
                         validation is run after every 5 training epochs
    :param resume: path of a checkpoint to resume from
    :param device: 'cpu' or 'cuda'; use 'cpu' for this homework if a GPU with CUDA support is not available
    '''

    # construct training and validation data loaders
    train_loader = dataLoader(train_file_path, norm_size=(32, 32), batch_size=batch_size)
    val_loader = dataLoader(val_file_path, norm_size=(32, 32), batch_size=1)

    model = CNN(in_channels, num_class, batch_norm, dropout)
    # put the model on CPU or GPU
    model = model.to(device)

    # define the loss function and the optimizer
    loss_func = nn.CrossEntropyLoss()
    if optim_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
    elif optim_type == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
    else:
        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
        raise NotImplementedError

    # optionally resume model and optimizer states from a saved checkpoint
    if resume is not None:
        print('[Info] resuming model from %s ...' % resume)
        checkpoint = torch.load(resume)
        model.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])

    # training
    # loss of each training epoch, stored in a python list
    losses = []
    # accuracy on the validation set after selected epochs, stored in a python list
    accuracy_list = []
    val_epochs = []

    print('training...')
    for epoch in range(n_epochs):
        # set the model in training mode
        model.train()
        # accumulate the total loss over one epoch
        total_loss = 0.
        for step, (input, label) in enumerate(train_loader):  # get a batch of data
            # set data type and device
            input, label = input.type(torch.float).to(device), label.type(torch.long).to(device)
            # clear the gradients held by the optimizer
            optimizer.zero_grad()
            # run the model (the forward pass)
            out = model(input)
            # compute the cross-entropy loss and run backpropagation
            loss = loss_func(out, label)
            loss.backward()
            # update the parameters of the model
            optimizer.step()
            # accumulate the total loss; loss.item() returns the value of the tensor
            # as a standard python number, and this operation is not differentiable
            total_loss += loss.item()

        # average loss over the iterations of this epoch
        avg_loss = total_loss / len(train_loader)
        losses.append(avg_loss)

        # evaluate the model on the validation set
        if (epoch + 1) % val_interval == 0:
            val_accuracy = eval_one_epoch(model, val_loader, device)
            accuracy_list.append(val_accuracy)
            val_epochs.append(epoch)
            print('Epoch {:02d}: loss = {:.3f}, accuracy on validation set = {:.3f}'
                  .format(epoch + 1, avg_loss, val_accuracy))

        if (epoch + 1) % ckpt_save_interval == 0:
            # get info of all saved checkpoints
            ckpt_list = glob.glob(os.path.join(ckpt_path, 'ckpt_epoch_*.pth'))
            # sort checkpoints by saving time
            ckpt_list.sort(key=os.path.getmtime)
            # remove surplus checkpoint files if their number exceeds max_ckpt_save_num
            if len(ckpt_list) >= max_ckpt_save_num:
                for cur_file_idx in range(0, len(ckpt_list) - max_ckpt_save_num + 1):
                    os.remove(ckpt_list[cur_file_idx])

            # save the model parameters in a file
            ckpt_name = os.path.join(ckpt_path, 'ckpt_epoch_%d.pth' % (epoch + 1))
            save_dict = {
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict(),
                'configs': {
                    'in_channels': in_channels,
                    'num_class': num_class,
                    'batch_norm': batch_norm,
                    'dropout': dropout
                }
            }
            torch.save(save_dict, ckpt_name)
            print('Model saved in {}\n'.format(ckpt_name))

    plot(losses, accuracy_list, val_epochs, ckpt_path)
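# ---------------------------------------------------------------------------
# `eval_one_epoch` is defined elsewhere in the project. A plausible sketch
# (an assumption, not the original implementation) counts correct predictions
# over the validation loader with gradients disabled and returns the accuracy:
def eval_one_epoch(model, val_loader, device):
    model.eval()  # switch off dropout / use running batch-norm statistics
    n_correct, n_total = 0, 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for input, label in val_loader:
            input = input.type(torch.float).to(device)
            label = label.type(torch.long).to(device)
            pred = model(input).argmax(dim=1)
            n_correct += (pred == label).sum().item()
            n_total += label.size(0)
    return n_correct / n_total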