def train_fold(fold, lr):
    train_indices, eval_indices = indices_for_fold(fold, len(train_data))

    train_dataset = TrainEvalDataset(train_data.iloc[train_indices], transform=train_transform)
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        drop_last=True,
        shuffle=True,
        num_workers=args.workers)
    if config.mixup is not None:
        train_data_loader = MixupDataLoader(train_data_loader, config.mixup)

    eval_dataset = TrainEvalDataset(train_data.iloc[eval_indices], transform=eval_transform)
    eval_data_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=config.batch_size,
        num_workers=args.workers)

    model = Model(config.model, NUM_CLASSES)
    model = model.to(DEVICE)

    optimizer = build_optimizer(
        config.opt.type, model.parameters(), lr, config.opt.beta,
        weight_decay=config.opt.weight_decay)

    if config.sched.type == 'onecycle':
        scheduler = lr_scheduler_wrapper.StepWrapper(
            OneCycleScheduler(
                optimizer,
                lr=(lr / 20, lr),
                beta_range=config.sched.onecycle.beta,
                max_steps=len(train_data_loader) * config.epochs,
                annealing=config.sched.onecycle.anneal))
    elif config.sched.type == 'cyclic':
        # TODO: add cyclic min/max momentum to config
        scheduler = lr_scheduler_wrapper.StepWrapper(
            CyclicLR(
                optimizer,
                0.,
                lr,
                step_size_up=len(train_data_loader),
                step_size_down=len(train_data_loader),
                mode='triangular2',
                cycle_momentum=True,
                base_momentum=config.sched.cyclic.beta[1],
                max_momentum=config.sched.cyclic.beta[0]))
    elif config.sched.type == 'cawr':
        scheduler = lr_scheduler_wrapper.StepWrapper(
            torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                optimizer, T_0=len(train_data_loader), T_mult=2))
    elif config.sched.type == 'plateau':
        scheduler = lr_scheduler_wrapper.ScoreWrapper(
            torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='max', factor=0.5, patience=0, verbose=True))
    else:
        raise AssertionError('invalid sched {}'.format(config.sched.type))

    best_score = 0
    for epoch in range(config.epochs):
        train_epoch(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            data_loader=train_data_loader,
            fold=fold,
            epoch=epoch)
        gc.collect()
        score = eval_epoch(
            model=model,
            data_loader=eval_data_loader,
            fold=fold,
            epoch=epoch)
        gc.collect()

        scheduler.step_epoch()
        scheduler.step_score(score)

        if score > best_score:
            best_score = score
            torch.save(
                model.state_dict(),
                os.path.join(args.experiment_path, 'model_{}.pth'.format(fold)))
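# The lr_scheduler_wrapper module used above is project-specific and not shown here.
# Based on how train_fold uses it (presumably a per-batch step() inside train_epoch,
# plus step_epoch() and step_score(score) after each evaluation), a minimal sketch of
# the assumed interface could look like the classes below; the names and behavior are
# assumptions inferred from the call sites, not the project's actual implementation.
class StepWrapper:
    """Wraps a scheduler that advances once per optimizer step (per batch)."""

    def __init__(self, scheduler):
        self.scheduler = scheduler

    def step(self):
        self.scheduler.step()       # advance every batch

    def step_epoch(self):
        pass                        # nothing to do at epoch boundaries

    def step_score(self, score):
        pass                        # ignores validation scores


class ScoreWrapper:
    """Wraps a scheduler driven by a validation score (e.g. ReduceLROnPlateau)."""

    def __init__(self, scheduler):
        self.scheduler = scheduler

    def step(self):
        pass                        # no per-batch update

    def step_epoch(self):
        pass

    def step_score(self, score):
        self.scheduler.step(score)  # react to the monitored metric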
# If weights are tied between encoder and decoder, we can only optimize
# parameters in one of those two layers
if not lstm_tie_weights:
    param_list.extend([
        {'params': lm.encoder.parameters(), 'lr': 1e-3},
        {'params': lm.decoder.parameters(), 'lr': 1e-3},
    ])
else:
    param_list.extend([
        {'params': lm.decoder.parameters(), 'lr': 1e-3},
    ])

optimizer = torch.optim.Adam(param_list, lr=0.01)
scheduler = CyclicLR(optimizer,
                     max_lrs=[0.1, 0.1, 0.1, 0.1, 0.1],
                     mode='ulmfit',
                     ratio=1.5,
                     cut_frac=0.4,
                     n_epochs=num_epochs,
                     batchsize=50000 / 1171,
                     verbose=False,
                     epoch_length=50000)

# Note: the CyclicLR instance constructed above is not handed to training_loop here;
# scheduler=None is passed, so this call runs without per-batch LR updates.
history = training_loop(batch_size=batch_size,
                        num_epochs=num_epochs,
                        display_freq=1,
                        model=lm,
                        criterion=loss,
                        optim=optimizer,
                        scheduler=None,
                        device=device,
                        training_set=train_dl,
                        validation_set=valid_dl,
                        best_model_path=model_file_lm,
                        history=None)
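# The CyclicLR used above is a custom scheduler (max_lrs, mode='ulmfit', and cut_frac
# are not accepted by torch.optim.lr_scheduler.CyclicLR). Its 'ulmfit' mode presumably
# implements the slanted triangular learning rate (STLR) schedule from ULMFiT
# (Howard & Ruder, 2018); a standalone sketch of that schedule, independent of this
# codebase, is shown below.
def slanted_triangular_lr(step, total_steps, max_lr, cut_frac=0.4, ratio=1.5):
    """Return the STLR learning rate for a given training step.

    The LR rises linearly for the first `cut_frac` fraction of training and then
    decays linearly; `ratio` controls how much smaller the lowest LR is than the
    peak (lr = max_lr / ratio at the start and end of training).
    """
    cut = int(total_steps * cut_frac)
    if step < cut:
        p = step / cut
    else:
        p = 1 - (step - cut) / (cut * (1 / cut_frac - 1))
    return max_lr * (1 + p * (ratio - 1)) / ratio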
import os
import shutil
import time

import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

# Project-specific helpers (C3D, I3D, P3D199, transfer_model, check_gpu,
# get_optim_policies, MyDataset, Logger, Visualizer, AverageMeter, accuracy,
# get_learning_rate, CyclicLR, and the transform classes) are imported from the
# repository's own modules, which are not shown in this excerpt.


class Training(object):
    def __init__(self, name_list, num_classes=400, modality='RGB', **kwargs):
        self.__dict__.update(kwargs)
        self.num_classes = num_classes
        self.modality = modality
        self.name_list = name_list
        # set accuracy avg = 0
        self.count_early_stop = 0
        # Set best precision = 0
        self.best_prec1 = 0
        # init start epoch = 0
        self.start_epoch = 0

        if self.log_visualize != '':
            self.visualizer = Visualizer(logdir=self.log_visualize)

        self.checkDataFolder()
        self.loading_model()
        self.train_loader, self.val_loader = self.loading_data()
        # run
        self.processing()
        if self.random:
            print('random pick images')

    def check_early_stop(self, accuracy, logger, start_time):
        if self.best_prec1 <= accuracy:
            self.count_early_stop = 0
        else:
            self.count_early_stop += 1

        if self.count_early_stop > self.early_stop:
            print('Early stop')
            end_time = time.time()
            print("--- Total training time %s seconds ---" % (end_time - start_time))
            logger.info("--- Total training time %s seconds ---" % (end_time - start_time))
            exit()

    def checkDataFolder(self):
        try:
            os.stat('./' + self.model_type + '_' + self.data_set)
        except:
            os.mkdir('./' + self.model_type + '_' + self.data_set)
        self.data_folder = './' + self.model_type + '_' + self.data_set

    # Loading P3D model
    def loading_model(self):
        print('Loading %s model' % (self.model_type))

        if self.model_type == 'C3D':
            self.model = C3D()
            if self.pretrained:
                self.model.load_state_dict(torch.load('c3d.pickle'))
        elif self.model_type == 'I3D':
            if self.pretrained:
                self.model = I3D(num_classes=400, modality='rgb')
                self.model.load_state_dict(torch.load('kinetics_i3d_model_rgb.pth'))
            else:
                self.model = I3D(num_classes=self.num_classes, modality='rgb')
        else:
            if self.pretrained:
                print("=> using pre-trained model")
                self.model = P3D199(pretrained=True, num_classes=400, dropout=self.dropout)
            else:
                print("=> creating model P3D")
                self.model = P3D199(pretrained=False, num_classes=400, dropout=self.dropout)

        # Transfer classes
        self.model = transfer_model(model=self.model,
                                    model_type=self.model_type,
                                    num_classes=self.num_classes)

        # Check gpu and run parallel
        if check_gpu() > 0:
            self.model = torch.nn.DataParallel(self.model).cuda()

        # define loss function (criterion) and optimizer
        self.criterion = nn.CrossEntropyLoss()
        if check_gpu() > 0:
            self.criterion = nn.CrossEntropyLoss().cuda()

        params = self.model.parameters()
        if self.model_type == 'P3D':
            params = get_optim_policies(model=self.model,
                                        modality=self.modality,
                                        enable_pbn=True)

        self.optimizer = optim.SGD(params=params,
                                   lr=self.lr,
                                   momentum=self.momentum,
                                   weight_decay=self.weight_decay)

        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        #     optimizer=self.optimizer, mode='min', patience=10, verbose=True)

        # optionally resume from a checkpoint
        if self.resume:
            if os.path.isfile(self.resume):
                print("=> loading checkpoint '{}'".format(self.resume))
                checkpoint = torch.load(self.resume)
                self.start_epoch = checkpoint['epoch']
                self.best_prec1 = checkpoint['best_prec1']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    self.resume, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(self.resume))

        if self.evaluate:
            file_model_best = os.path.join(self.data_folder, 'model_best.pth.tar')
            if os.path.isfile(file_model_best):
                print("=> loading checkpoint '{}'".format('model_best.pth.tar'))
                checkpoint = torch.load(file_model_best)
                self.start_epoch = checkpoint['epoch']
                self.best_prec1 = checkpoint['best_prec1']
                self.model.load_state_dict(checkpoint['state_dict'])
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    file_model_best, checkpoint['epoch']))
            else:
                print("=> no checkpoint found at '{}'".format(file_model_best))

        cudnn.benchmark = True

    # Loading data
    def loading_data(self):
        random = True if self.random else False

        size = 160
        if self.model_type == 'C3D':
            size = 112
        if self.model_type == 'I3D':
            size = 224

        normalize = Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])

        train_transformations = Compose([
            RandomSizedCrop(size),
            RandomHorizontalFlip(),
            # Resize((size, size)),
            # ColorJitter(
            #     brightness=0.4,
            #     contrast=0.4,
            #     saturation=0.4,
            # ),
            ToTensor(),
            normalize
        ])

        val_transformations = Compose([
            # Resize((182, 242)),
            Resize(256),
            CenterCrop(size),
            ToTensor(),
            normalize
        ])

        train_dataset = MyDataset(
            self.data,
            data_folder="train",
            name_list=self.name_list,
            version="1",
            transform=train_transformations,
            num_frames=self.num_frames,
            random=random)

        val_dataset = MyDataset(
            self.data,
            data_folder="validation",
            name_list=self.name_list,
            version="1",
            transform=val_transformations,
            num_frames=self.num_frames,
            random=random)

        train_loader = data.DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.workers,
            pin_memory=True)

        val_loader = data.DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.workers,
            pin_memory=False)

        return (train_loader, val_loader)

    def processing(self):
        log_file = os.path.join(self.data_folder, 'train.log')
        logger = Logger('train', log_file)

        iters = len(self.train_loader)
        step_size = iters * 2
        self.scheduler = CyclicLR(optimizer=self.optimizer,
                                  step_size=step_size,
                                  base_lr=self.lr)

        if self.evaluate:
            self.validate(logger)
            return

        iter_per_epoch = len(self.train_loader)
        logger.info('Iterations per epoch: {0}'.format(iter_per_epoch))
        print('Iterations per epoch: {0}'.format(iter_per_epoch))

        start_time = time.time()
        for epoch in range(self.start_epoch, self.epochs):
            # self.adjust_learning_rate(epoch)

            # train for one epoch
            train_losses, train_acc = self.train(logger, epoch)

            # evaluate on validation set
            with torch.no_grad():
                val_losses, val_acc = self.validate(logger)

            # self.scheduler.step(val_losses.avg)

            # log visualize
            info_acc = {'train_acc': train_acc.avg, 'val_acc': val_acc.avg}
            info_loss = {
                'train_loss': train_losses.avg,
                'val_loss': val_losses.avg
            }
            self.visualizer.write_summary(info_acc, info_loss, epoch + 1)
            self.visualizer.write_histogram(model=self.model, step=epoch + 1)

            # remember best Accuracy and save checkpoint
            is_best = val_acc.avg > self.best_prec1
            self.best_prec1 = max(val_acc.avg, self.best_prec1)
            self.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'best_prec1': self.best_prec1,
                    'optimizer': self.optimizer.state_dict(),
                }, is_best)

            self.check_early_stop(val_acc.avg, logger, start_time)

        end_time = time.time()
        print("--- Total training time %s seconds ---" % (end_time - start_time))
        logger.info("--- Total training time %s seconds ---" % (end_time - start_time))
        self.visualizer.writer_close()

    # Training
    def train(self, logger, epoch):
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        rate = get_learning_rate(self.optimizer)[0]

        # switch to train mode
        self.model.train()

        end = time.time()
        for i, (images, target) in enumerate(self.train_loader):
            # adjust learning rate scheduler step
            self.scheduler.batch_step()

            # measure data loading time
            data_time.update(time.time() - end)

            if check_gpu() > 0:
                images = images.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)

            image_var = torch.autograd.Variable(images)
            label_var = torch.autograd.Variable(target)

            self.optimizer.zero_grad()

            # compute y_pred
            y_pred = self.model(image_var)
            if self.model_type == 'I3D':
                y_pred = y_pred[0]
            loss = self.criterion(y_pred, label_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(y_pred.data, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            acc.update(prec1.item(), images.size(0))
            top1.update(prec1.item(), images.size(0))
            top5.update(prec5.item(), images.size(0))

            # compute gradient and do SGD step
            loss.backward()
            self.optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.print_freq == 0:
                print('Epoch: [{0}/{1}][{2}/{3}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Lr {rate:.5f}\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch, self.epochs, i, len(self.train_loader),
                          batch_time=batch_time, data_time=data_time,
                          rate=rate, loss=losses, top1=top1, top5=top5))

        logger.info('Epoch: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Lr {rate:.5f}\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        epoch, self.epochs,
                        batch_time=batch_time, data_time=data_time,
                        rate=rate, loss=losses, top1=top1, top5=top5))

        return losses, acc

    # Validation
    def validate(self, logger):
        batch_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        # switch to evaluate mode
        self.model.eval()

        end = time.time()
        for i, (images, labels) in enumerate(self.val_loader):
            if check_gpu() > 0:
                images = images.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            image_var = torch.autograd.Variable(images)
            label_var = torch.autograd.Variable(labels)

            # compute y_pred
            y_pred = self.model(image_var)
            if self.model_type == 'I3D':
                y_pred = y_pred[0]
            loss = self.criterion(y_pred, label_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(y_pred.data, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            acc.update(prec1.item(), images.size(0))
            top1.update(prec1.item(), images.size(0))
            top5.update(prec5.item(), images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % self.print_freq == 0:
                print('TrainVal: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          i, len(self.val_loader), batch_time=batch_time,
                          loss=losses, top1=top1, top5=top5))

        print(' * Accuracy {acc.avg:.3f} Loss {loss.avg:.3f}'.format(acc=acc, loss=losses))
        logger.info(' * Accuracy {acc.avg:.3f} Loss {loss.avg:.3f}'.format(acc=acc, loss=losses))

        return losses, acc

    # save checkpoint to file
    def save_checkpoint(self, state, is_best):
        checkpoint = os.path.join(self.data_folder, 'checkpoint.pth.tar')
        torch.save(state, checkpoint)

        model_best = os.path.join(self.data_folder, 'model_best.pth.tar')
        if is_best:
            shutil.copyfile(checkpoint, model_best)

    # adjust learning rate for each epoch
    def adjust_learning_rate(self, epoch):
        """Sets the learning rate to the initial LR decayed by 10 every 3K iterations"""
        iters = len(self.train_loader)
        num_epochs = 3000 // iters
        decay = 0.1 ** (epoch // num_epochs)
        lr = self.lr * decay
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr * param_group['lr_mult']
            param_group['weight_decay'] = decay * param_group['decay_mult']
    def fit_language_model(self,
                           pretrained_itos=None,
                           pretrained_weight_file=None,
                           lm_hidden_dim=1150,
                           lm_embedding_dim=400,
                           lm_lstm_layers=3,
                           num_epochs=100,
                           display_epoch_freq=1,
                           scheduler='ulmfit',
                           max_lrs=[1e-3, 1e-3, 1e-3, 1e-3, 1e-3]):
        ## Model Architecture
        self.hidden_dim = lm_hidden_dim
        self.embedding_dim = lm_embedding_dim
        self.dropout = 0.3
        self.lstm_layers = lm_lstm_layers
        self.lstm_bidirection = False
        self.lstm_tie_weights = True
        self.num_epochs = num_epochs
        self.display_epoch_freq = display_epoch_freq

        if self.use_gpu:
            torch.cuda.manual_seed(303)
        else:
            torch.manual_seed(303)

        # set up Files to save stuff in
        runtime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        if pretrained_weight_file is not None and pretrained_itos is not None:
            print("Starting Loading pretrained Wikitext model")
            enc = torch.load(pretrained_weight_file,
                             map_location=lambda storage, loc: storage)
            self.embedding_dim = enc['0.encoder.weight'].shape[1]
            self.hidden_dim = int(enc['0.rnns.0.module.weight_hh_l0_raw'].shape[0] / 4)
            self.lstm_layers = 3

            new_enc = {}
            for k, v in enc.items():
                layer_detail = k.split('.')
                layer_name = layer_detail[-1].replace('_raw', '')
                if len(layer_detail) == self.lstm_layers:
                    new_enc[f'{layer_detail[1]}.{layer_name}'] = v
                else:
                    new_enc[f'{layer_detail[1]}.{layer_detail[2]}.{layer_name}'] = v

            del new_enc['encoder_with_dropout.embed.weight']

            pretrained_idx2word = pickle.load(open(pretrained_itos, 'rb'))
            pretrained_word2idx = {k: i for i, k in enumerate(pretrained_idx2word)}

            new_model_vectorizer = self.vectorizer
            pretrained_encoder_weights = enc['0.encoder.weight']
            row_m = pretrained_encoder_weights.mean(dim=0)
            row_m = [x.item() for x in row_m]

            new_vocab_size = len(new_model_vectorizer.word2idx)
            new_encoder_weights = torch.tensor([row_m for i in range(new_vocab_size)])

            new_idx2weights = {}
            for word, i in new_model_vectorizer.word2idx.items():
                if word in pretrained_word2idx:
                    word_idx = pretrained_word2idx[word]
                    new_encoder_weights[i] = pretrained_encoder_weights[word_idx]

            new_enc['encoder.weight'] = new_encoder_weights
            new_enc['decoder.weight'] = copy.copy(new_encoder_weights)
            new_enc['decoder.bias'] = torch.zeros(new_enc['decoder.weight'].shape[0])

            self.lm = RNNLM(device=self.device,
                            vocab_size=new_vocab_size,
                            embedding_size=self.embedding_dim,
                            hidden_size=self.hidden_dim,
                            batch_size=50,
                            num_layers=3,
                            tie_weights=True,
                            word2idx=new_model_vectorizer.word2idx)
            print("Initialised loading with pretrained Wikitext model")
        else:
            # Build and initialize the model
            print("No Pretrained Model specified")
            self.lm = RNNLM(self.device,
                            self.vectorizer.vocabulary_size,
                            self.embedding_dim,
                            self.hidden_dim,
                            self.batch_size,
                            dropout=self.dropout,
                            tie_weights=self.lstm_tie_weights,
                            num_layers=self.lstm_layers,
                            bidirectional=self.lstm_bidirection,
                            word2idx=self.vectorizer.word2idx,
                            log_softmax=False)

        if self.use_gpu:
            self.lm = self.lm.to(self.device)

        # Loss and Optimizer
        self.loss = nn.CrossEntropyLoss()

        # Extract pointers to the parameters of the lstms
        param_list = [{'params': rnn.parameters(), 'lr': 1e-3} for rnn in self.lm.rnns]

        # If weights are tied between encoder and decoder, we can only optimize
        # parameters in one of those two layers
        if not self.lstm_tie_weights:
            param_list.extend([
                {'params': self.lm.encoder.parameters(), 'lr': 1e-3},
                {'params': self.lm.decoder.parameters(), 'lr': 1e-3},
            ])
        else:
            param_list.extend([
                {'params': self.lm.decoder.parameters(), 'lr': 1e-3},
            ])

        self.optimizer = torch.optim.Adam(param_list)

        if scheduler == 'ulmfit':
            self.scheduler = CyclicLR(self.optimizer,
                                      max_lrs=max_lrs,
                                      mode='ulmfit',
                                      ratio=1.5,
                                      cut_frac=0.4,
                                      train_data_loader=self.train_dl,
                                      verbose=False)

        print("Beginning LM Fine Tuning")
        self.freezeTo(3)
        history = training_loop(batch_size=self.batch_size,
                                num_epochs=1,
                                display_freq=self.display_epoch_freq,
                                model=self.lm,
                                criterion=self.loss,
                                optim=self.optimizer,
                                scheduler=self.scheduler,
                                device=self.device,
                                training_set=self.train_dl,
                                validation_set=self.valid_dl,
                                best_model_path=self.model_file_lm,
                                history=None)

        self.freezeTo(2)
        history = training_loop(batch_size=self.batch_size,
                                num_epochs=1,
                                display_freq=self.display_epoch_freq,
                                model=self.lm,
                                criterion=self.loss,
                                optim=self.optimizer,
                                scheduler=self.scheduler,
                                device=self.device,
                                training_set=self.train_dl,
                                validation_set=self.valid_dl,
                                best_model_path=self.model_file_lm,
                                history=history)

        self.freezeTo(0)
        history = training_loop(batch_size=self.batch_size,
                                num_epochs=self.num_epochs - 2,
                                display_freq=self.display_epoch_freq,
                                model=self.lm,
                                criterion=self.loss,
                                optim=self.optimizer,
                                scheduler=self.scheduler,
                                device=self.device,
                                training_set=self.train_dl,
                                validation_set=self.valid_dl,
                                best_model_path=self.model_file_lm,
                                history=history)
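    # freezeTo is not defined in this excerpt. The three calls above follow ULMFiT-style
    # gradual unfreezing (fine-tune only the top layer group first, then progressively
    # unfreeze the lower ones). A plausible sketch of such a method is below; the
    # layer-group ordering is an assumption about this codebase, not its actual
    # implementation.
    def freezeTo(self, n):
        """Freeze every layer group with index < n and unfreeze the rest.

        Layer groups are assumed to be ordered bottom-up: encoder (embeddings),
        then the LSTM layers, then the decoder, matching param_list above.
        """
        layer_groups = [self.lm.encoder] + list(self.lm.rnns) + [self.lm.decoder]
        for idx, group in enumerate(layer_groups):
            trainable = idx >= n
            for p in group.parameters():
                p.requires_grad = trainable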