def val_epoch(self, epoch):
    model_with_loss = self.model_with_loss
    model_with_loss.eval()
    data_time, batch_time = AverageMeter(), AverageMeter()
    avg_loss_stats = {l: AverageMeter() for l in self.loss_stats}
    end = time.time()
    for iter_id, batch in enumerate(self.val_loader):
        show_str = '[%d/%d/%d] ' % (epoch + 1, iter_id + 1, self.num_val_iter)
        data_time.update(time.time() - end)
        with torch.no_grad():
            for k in batch:
                batch[k] = batch[k].to(device=self.config.TRAIN['DEVICE'],
                                       non_blocking=True)
            loss, loss_stats = model_with_loss(batch)
        batch_time.update(time.time() - end)
        end = time.time()
        for l in avg_loss_stats:
            avg_loss_stats[l].update(loss_stats[l].mean().item(),
                                     batch['input'].size(0))
            self.writer.add_scalar('val/' + l, avg_loss_stats[l].avg,
                                   epoch * self.num_val_iter + iter_id)
            show_str += ' {}:{:0.4} '.format(l, avg_loss_stats[l].avg)
        print(show_str)
    save_checkpoint(model_with_loss.model,
                    self.config.TRAIN['CHECKPOINT'] + '/model_%d.pth' % epoch)
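# The snippet above assumes an AverageMeter utility with update() and .avg, as is
# common in PyTorch training codebases. A minimal sketch of such a class (an
# assumption, not necessarily the exact implementation used here):
class AverageMeter:
    """Tracks the running average of a scalar value."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # val: latest value; n: number of samples it covers
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count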
def run(self):
    # checkpoint
    self.scheduler = get_scheduler(self.config, self.optimizer, self.last_epoch)
    self.model.train()
    postfix_dic = {
        'lr': 0.0,
        'acc': 0.0,
        'loss': 0.0,
    }
    if self.config.data.sampler == "weight":
        self.train_weigh()
    else:
        for epoch in range(self.last_epoch, self.num_epochs):
            self.train_single_epoch(epoch)
            if epoch % 200 == 199:
                save_checkpoint(self.config, self.model, self.optimizer,
                                self.optimizer_center, epoch, self.step)
            self.scheduler.step()
            if epoch > self.config.train.num_epochs:
                break
def fit_model(
    model,
    n_epoch,
    dev_dataloader,
    optimizer,
    criterion,
    loss_fn,
    metric_fn,
    val_dataloader=None,
    checkpoint=False,
    model_fn="pytorch",
):
    n_dev_obs, dev_batch_size, dev_batch_per_epoch = get_batch_info(dev_dataloader)
    for idx_epoch in tqdm(range(n_epoch), total=n_epoch):
        t = tqdm(enumerate(dev_dataloader), total=dev_batch_per_epoch)
        for idx_batch, data in t:
            model = model.train()
            loss = loss_fn(model, criterion, data)
            train_step(optimizer, loss)
            with torch.no_grad():
                model = model.eval()
                metric = metric_fn(model, data)
            t.set_postfix({"loss": loss.item(), "metric": metric.item()})
        if val_dataloader is not None:
            val_loss, val_metric = validate_model(model, criterion, loss_fn,
                                                  metric_fn, val_dataloader)
            print(" val_loss : {}, val_metric : {}".format(val_loss, val_metric))
        if checkpoint:
            model_filename = "{}_{}".format(model_fn, idx_epoch)
            save_checkpoint(model, optimizer, model_filename)
    return model
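# fit_model above delegates the backward pass to train_step(optimizer, loss).
# A minimal sketch of what such a helper typically does (hypothetical, inferred
# from the call site):
def train_step(optimizer, loss):
    optimizer.zero_grad()   # clear gradients from the previous batch
    loss.backward()         # backpropagate through the loss
    optimizer.step()        # apply the parameter update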
def postEpoch(self, epoch, optimizer, trainData: EpochData, validData: EpochData):
    logger = self.getLogger()
    model = self.getModel()
    trainDataRow = trainData.summaryDataRow()
    validDataRow = validData.summaryDataRow()
    # add epoch number
    trainDataRow[self.epochNumKey] = epoch
    # add learning rate
    trainDataRow[self.lrKey] = self.formats[self.lrKey](optimizer.param_groups[0]['lr'])
    # add flops ratio
    trainDataRow[self.flopsRatioKey] = self.formats[self.flopsRatioKey](model.flopsRatio())
    # merge trainDataRow with validDataRow
    for k, v in validDataRow.items():
        trainDataRow[k] = v
    # save model checkpoint
    save_checkpoint(self.getTrainFolderPath(), model, optimizer, validData.accDict())
    # add data to main logger table
    logger.addDataRow(trainDataRow)
    # select new path for next epoch
    self._selectNewPath()
def train_weigh(self):
    acc_sample = 0
    count_all = 0
    all_loss = 0
    all_center_loss = 0
    total_num = len(self.dataset)
    batch_size = self.config.train.batch_size.batch1 * self.config.train.batch_size.batch2
    step_num = math.ceil(total_num / batch_size)
    epoch = self.last_epoch
    iteration = epoch * step_num
    # print("step number is ", step_num)
    for seq, vID, label, _ in self.data_loader:
        iteration += 1
        count_all += len(label)
        acc_i, loss, loss_center = self.train_sigle_iteration(seq, label)
        all_loss += loss
        all_center_loss += loss_center
        acc_sample += acc_i
        if iteration % step_num == step_num - 1:
            self.scheduler.step()
            if self.scheduler_center is not None:
                self.scheduler_center.step()
            epoch += 1
            if (epoch % self.config.train.save_step) == (self.config.train.save_step - 1):
                print("save loss log image")
                self.plot_loss()
                save_checkpoint(self.config, self.model, self.optimizer,
                                self.center_model, self.optimizer_center,
                                epoch, self.step)
            if self.writer is not None:
                self.writer.add_scalar("train_loss", all_loss, epoch)
            acc_epoch = acc_sample * 1.0 / count_all
            if self.center_model is not None:
                print("training in epoch :{}, the acc is {}% ,\n the cross loss is {}, the center loss is {}"
                      .format(epoch, acc_epoch * 100, all_loss, all_center_loss))
                self.loss_center_data.append(all_center_loss)
            else:
                print("training in epoch :{}, the acc is {}% ,\n the loss is {}"
                      .format(epoch, acc_epoch * 100, all_loss))
                self.loss_data.append(all_loss)
            print("learning rate: ", self.optimizer.param_groups[0]['lr'])
            acc_sample = 0
            count_all = 0
            all_loss = 0
            all_center_loss = 0
            if epoch > self.config.train.num_epochs:
                break
def train(self):
    args = self.args
    model = self.model
    logger = self.logger
    epochRange = self._getEpochRange(self.nEpochs)
    # init optimizer
    optimizer = SGD(model.alphas(), args.search_learning_rate,
                    momentum=args.search_momentum,
                    weight_decay=args.search_weight_decay)
    # init scheduler
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.95,
                                  patience=args.search_patience,
                                  min_lr=args.search_learning_rate_min)
    for epoch in epochRange:
        print('========== Epoch:[{}/{}] =============='.format(epoch, self.nEpochs))
        # init epoch train logger
        trainLogger = HtmlLogger(self.trainFolderPath, epoch)
        # set loggers dictionary
        loggersDict = {self.trainLoggerKey: trainLogger}
        # create epoch jobs
        epochDataRows = self._createEpochJobs(epoch)
        # add epoch data rows
        for jobDataRow in epochDataRows:
            logger.addDataRow(jobDataRow, trType='<tr bgcolor="#2CBDD6">')
        # train alphas
        # epochLossDict, alphasDataRow = self.trainAlphas(self._getNextSearchQueueDataLoader(), optimizer, epoch, loggersDict)
        epochLossDict, alphasDataRow = self.trainAlphas(self.valid_queue, optimizer,
                                                        epoch, loggersDict)
        # update scheduler
        scheduler.step(epochLossDict.get(self.flopsLoss.totalKey()))
        # calc model choosePathAlphasAsPartition flops ratio
        model.choosePathAlphasAsPartition()
        # add values to alphas data row
        additionalData = {
            self.epochNumKey: epoch,
            self.lrKey: optimizer.param_groups[0]['lr'],
            self.validFlopsRatioKey: model.flopsRatio()
        }
        self._applyFormats(additionalData)
        # add alphas data row
        alphasDataRow.update(additionalData)
        logger.addDataRow(alphasDataRow)
        # save checkpoint
        save_checkpoint(self.trainFolderPath, model, optimizer, epochLossDict)
def train(cfg):
    train_loader = construct_loader(cfg, train=True)
    val_loader = construct_loader(cfg, train=False)
    model = build_model(cfg)
    optimizer = construct_optimizer(model, cfg)
    for epoch in range(cfg.TRAIN.MAX_EPOCH):
        shuffle_dataset(train_loader, epoch)
        train_epoch(train_loader, model, optimizer, epoch, cfg)
        eval_epoch(val_loader, model, epoch, cfg)
        save_checkpoint(model, optimizer, epoch, cfg)
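# The loop above calls save_checkpoint(model, optimizer, epoch, cfg). A minimal
# sketch of a checkpoint writer with that signature (an assumption; the real
# helper may store more state, and cfg.TRAIN.CHECKPOINT_DIR is a hypothetical
# config key):
import os
import torch

def save_checkpoint(model, optimizer, epoch, cfg):
    state = {
        "epoch": epoch,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }
    # cfg.TRAIN.CHECKPOINT_DIR is assumed, not confirmed by the snippet above
    path = os.path.join(cfg.TRAIN.CHECKPOINT_DIR, "checkpoint_epoch_{}.pth".format(epoch))
    torch.save(state, path)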
def train(self, epochs, validate_every, start_epoch):
    """
    Runs the model on training dataset.
    Args:
        epochs (int): Total epochs.
        validate_every (int): Run validation after every validate_every
            no of epochs.
        start_epoch (int): Starting epoch if using the stored checkpoint.
    """
    # self.validation(epoch = 0)
    # batch_size = Config.get("training_batch_size")
    for epoch in range(start_epoch, epochs + 1):
        training_batch_losses = []
        for _, data in tqdm(enumerate(self.training_loader, 0)):
            images, captions, lengths, _ = data
            self.optimizer.zero_grad()
            images = images.to(Config.get("device"))
            captions = captions.to(Config.get("device"))
            # setting up training mode
            self.encoder = self.encoder.train()
            self.decoder = self.decoder.train()
            # image features
            image_features = self.encoder(images)
            # predicted captions
            predicted_captions = self.decoder.teacher_forcing(
                image_features, captions, lengths, self.pretrained_embeddings)
            # max_length, _ = lengths.max(0)
            # ref_captions_mask = torch.ones(batch_size, max_length).to(Config.get("device"))
            # loss function
            loss = self.criterion(predicted_captions, captions)
            # calculating the gradients
            loss.backward()
            # updating the parameters
            self.optimizer.step()
            training_batch_losses.append(loss.item())
        self.stat.record(training_losses=np.mean(training_batch_losses))
        self.stat.push_tensorboard_losses(epoch)
        self.stat.log_losses(epoch)
        if (epoch - 1) % validate_every == 0:
            self.validation(epoch=epoch)
            save_checkpoint(epoch=epoch,
                            outdir=self.output_dir,
                            encoder=self.encoder,
                            decoder=self.decoder,
                            optimizer=self.optimizer,
                            criterion=self.criterion)
def postEpoch(self, epoch, optimizer, trainData: EpochData, validData: EpochData):
    logger = self.getLogger()
    model = self.getModel()
    # init data row
    dataRow = trainData.summaryDataRow()
    # add epoch number
    dataRow[self.epochNumKey] = epoch
    # add learning rate
    dataRow[self.lrKey] = self.formats[self.lrKey](optimizer.param_groups[0]['lr'])
    # merge trainData with validData
    for k, v in validData.summaryDataRow().items():
        dataRow[k] = v
    # get valid acc dict & loss dict
    validAccDict = validData.accDict()
    validLossDict = validData.lossDict()
    # update optimum values according to current epoch values and get optimum
    # table for logger
    optimumTable = self.trainOptimum.update(validAccDict, epoch)
    # add update time to optimum table
    optimumTable.append(['Update time', logger.getTimeStr()])
    # update nEpochsOptimum table
    logger.addInfoTable('Optimum', optimumTable)
    # update best precision only after switching stage is complete
    is_best = self.trainOptimum.is_best(epoch)
    if is_best:
        # update optimal epoch data
        self.optimalEpochData = (validAccDict, validLossDict)
        # found new optimum, reset nEpochsOptimum
        self.nEpochsOptimum = 0
    else:
        # optimum hasn't changed
        self.nEpochsOptimum += 1
    # save model checkpoint
    save_checkpoint(self.getTrainFolderPath(), model, optimizer, validAccDict, is_best)
    # add data to main logger table
    logger.addDataRow(dataRow)
def run(args):
    df = pd.read_csv(args.df_path)
    df_train = df[df['fold'] != args.fold]
    model = get_model(args).cuda()
    dataloader = get_dataloader(args.data_dir, df_train, 'train',
                                args.pretrain, args.batch_size)
    checkpoints = get_checkpoints(args)
    # args, model, ckpt_name, checkpoint=None, optimizer=None
    checkpoint.load_checkpoint(args, model, None, checkpoint=checkpoints[0])
    for i, ckpt in enumerate(checkpoints[1:]):
        print(i, ckpt)
        model2 = get_model(args).cuda()
        last_epoch, _ = checkpoint.load_checkpoint(args, model2, None,
                                                   checkpoint=ckpt)
        if args.ema is None:
            swa.moving_average(model, model2, 1. / (i + 2))
        else:
            swa.moving_average(model, model2, args.ema)
    with torch.no_grad():
        swa.bn_update(dataloader, model)
    if args.ema is not None:
        output_name = f'model_ema_{len(checkpoints)}'
    else:
        output_name = f'model_swa_{len(checkpoints)}'
    print('save {}'.format(output_name))
    checkpoint.save_checkpoint(args, model, None, 0, 0, name=output_name,
                               weights_dict={'state_dict': model.state_dict()})
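# run() above averages weights via swa.moving_average(model, model2, alpha).
# A minimal sketch of running-average weight blending (an assumption about the
# swa helper's behavior, not its confirmed implementation):
def moving_average(net1, net2, alpha=1.0):
    # blend net2's parameters into net1: p1 <- (1 - alpha) * p1 + alpha * p2
    for p1, p2 in zip(net1.parameters(), net2.parameters()):
        p1.data.mul_(1.0 - alpha)
        p1.data.add_(p2.data, alpha=alpha)
# After averaging, BatchNorm running statistics no longer match the averaged
# weights, which is why the snippet follows up with swa.bn_update(dataloader, model).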
def fit_model(model, n_epoch, dev_dataloader, optimizer, criterion, loss_fn,
              metric_fn, val_dataloader=None, checkpoint=False,
              model_filename="checkpoint", **kwargs):
    cur_time = datetime.datetime.now().strftime('%Y%m%d-%H%M')
    if not os.path.exists(os.path.join(model_cp_path, cur_time)):
        os.mkdir(os.path.join(model_cp_path, cur_time))
    save_metadata(cur_time, model, n_epoch, dev_dataloader, optimizer,
                  criterion, val_dataloader)
    n_dev_obs, dev_batch_size, dev_batch_per_epoch = get_batch_info(dev_dataloader)
    for idx_epoch in tqdm(range(n_epoch), total=n_epoch):
        t = tqdm(enumerate(dev_dataloader), total=dev_batch_per_epoch)
        for idx_batch, data in t:
            model = model.train()
            loss = loss_fn(model, criterion, data)
            train_step(optimizer, loss)
            with torch.no_grad():
                model = model.eval()
                metric = metric_fn(model, data)
            t.set_postfix({"loss": loss.item(), "metric": metric.item()})
        if val_dataloader is not None:
            val_loss, val_metric = validate_model(model, criterion, loss_fn,
                                                  metric_fn, val_dataloader)
            print(" val_loss : {}, val_metric : {}".format(val_loss, val_metric))
        if checkpoint:
            filename = "{}_{}".format(model_filename, idx_epoch)
            save_checkpoint(model, optimizer, cur_time, filename)
    return model
if epoch % 10 == 0:
    # Don't want to save all test-stats
    test.validate(q_network, epoch, test_loader, args, ReinforcementLearning,
                  statistics, TEXT, still_training=True)

# Save best checkpoint
if training_status_handler.update_best(statistics.statistics['training_test_reward']):
    statistics.update_state(q_network.state_dict())
    save_checkpoint(statistics.statistics, args.name, filename="best.pth.tar")

# Save checkpoint
if epoch % training_status_handler.SAVE == 0:
    statistics.update_state(q_network.state_dict())
    save_checkpoint(statistics.statistics, args.name)

# Save backup checkpoint
if epoch % training_status_handler.BACKUP == 0:
    statistics.update_state(q_network.state_dict())
    save_checkpoint(statistics.statistics, args.name, filename="backup.pth.tar")

# Final checkpoint
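# The fragment above calls save_checkpoint(stats_dict, run_name, filename=...).
# A minimal sketch in the classic PyTorch-examples style (the "saved_runs"
# directory is a hypothetical location, not taken from the snippet):
import os
import torch

def save_checkpoint(state, name, filename="checkpoint.pth.tar"):
    save_dir = os.path.join("saved_runs", name)  # hypothetical save location
    os.makedirs(save_dir, exist_ok=True)
    torch.save(state, os.path.join(save_dir, filename))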
def main():
    args = get_arguments()

    # configuration
    CONFIG = Dict(yaml.safe_load(open(args.config)))

    # writer
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    # DataLoaders
    train_data = PASCALVOC(
        CONFIG,
        mode="train",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            RandomFlip(),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )
    val_data = PASCALVOC(
        CONFIG,
        mode="val",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )
    train_loader = DataLoader(
        train_data,
        batch_size=CONFIG.batch_size,
        shuffle=True,
        num_workers=CONFIG.num_workers,
        drop_last=True
    )
    val_loader = DataLoader(
        val_data,
        batch_size=CONFIG.batch_size,
        shuffle=False,
        num_workers=CONFIG.num_workers
    )

    # load model
    print('\n------------------------Loading Model------------------------\n')
    if CONFIG.attention == 'dual':
        model = DANet(CONFIG)
        print('Dual Attention modules will be added to this base model')
    elif CONFIG.attention == 'channel':
        model = CANet(CONFIG)
        print('Channel Attention modules will be added to this base model')
    else:
        if CONFIG.model == 'drn_d_22':
            print('Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)
        elif CONFIG.model == 'drn_d_38':
            print('Dilated ResNet D 38 w/o Dual Attention modules will be used as a model.')
            model = drn_d_38(pretrained=True, num_classes=CONFIG.n_classes)
        else:
            print('There is no option you chose as a model.')
            print('Therefore, Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)

    # set optimizer, lr_scheduler
    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)
    elif CONFIG.optimizer == 'AdaBound':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = adabound.AdaBound(
            model.parameters(),
            lr=CONFIG.learning_rate,
            final_lr=CONFIG.final_lr,
            weight_decay=CONFIG.weight_decay)
    else:
        print('There is no optimizer which suits to your option. '
              'Instead, SGD will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)

    # learning rate scheduler
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # send the model to cuda/cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True

    # resume if you want
    begin_epoch = 0
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, scheduler = \
                resume(CONFIG, model, optimizer, scheduler)
            print('training will start from {} epoch'.format(begin_epoch))

    # criterion for loss
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(
            weight=get_class_weight().to(device),
            ignore_index=255
        )
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=255)

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    losses_train = []
    losses_val = []
    val_ious = []
    mean_ious = []
    mean_ious_without_bg = []
    best_mean_iou = 0.0

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        # training
        loss_train = train(model, train_loader, criterion, optimizer, CONFIG, device)
        losses_train.append(loss_train)

        # validation
        val_iou, loss_val = validation(model, val_loader, criterion, CONFIG, device)
        val_ious.append(val_iou)
        losses_val.append(loss_val)
        if CONFIG.optimizer == 'SGD':
            scheduler.step(loss_val)

        mean_ious.append(val_ious[-1].mean().item())
        mean_ious_without_bg.append(val_ious[-1][1:].mean().item())

        # save checkpoint every 5 epochs
        if epoch % 5 == 0 and epoch != 0:
            save_checkpoint(CONFIG, epoch, model, optimizer, scheduler)

        # save a model every 50 epochs
        if epoch % 50 == 0 and epoch != 0:
            torch.save(
                model.state_dict(),
                os.path.join(CONFIG.result_path, 'epoch_{}_model.prm'.format(epoch)))

        if best_mean_iou < mean_ious[-1]:
            best_mean_iou = mean_ious[-1]
            torch.save(
                model.state_dict(),
                os.path.join(CONFIG.result_path, 'best_mean_iou_model.prm'))

        # tensorboardx
        if writer:
            writer.add_scalars(
                "loss",
                {'loss_train': losses_train[-1], 'loss_val': losses_val[-1]},
                epoch)
            writer.add_scalar("mean_iou", mean_ious[-1], epoch)
            writer.add_scalar("mean_iou_w/o_bg", mean_ious_without_bg[-1], epoch)

        print(
            'epoch: {}\tloss_train: {:.5f}\tloss_val: {:.5f}\tmean IOU: {:.3f}\tmean IOU w/o bg: {:.3f}'.format(
                epoch, losses_train[-1], losses_val[-1],
                mean_ious[-1], mean_ious_without_bg[-1]))

    torch.save(
        model.state_dict(),
        os.path.join(CONFIG.result_path, 'final_model.prm'))
def train(cfg):
    """
    Train a video model for many epochs on train set and evaluate it on val set.
    Args:
        cfg (CfgNode): configs. Details can be found in
            slowfast/config/defaults.py
    """
    # Set up environment.
    du.init_distributed_training(cfg)
    # Set random seed from configs.
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)

    # Setup logging format.
    logging.setup_logging(cfg.OUTPUT_DIR)

    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = build_model(cfg)
    # model = x3d.MyModel()
    if du.is_master_proc() and cfg.LOG_MODEL_INFO:
        misc.log_model_info(model, cfg, is_train=True)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        logger.info("Load from last checkpoint, {}.".format(last_checkpoint))
        checkpoint_epoch = cu.load_checkpoint(
            last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
        )
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        checkpoint_epoch = cu.load_checkpoint(
            cfg.TRAIN.CHECKPOINT_FILE_PATH,
            model,
            cfg.NUM_GPUS > 1,
            optimizer,
            inflation=cfg.TRAIN.CHECKPOINT_INFLATE,
            convert_from_caffe2=cfg.TRAIN.CHECKPOINT_TYPE == "caffe2",
        )
        start_epoch = checkpoint_epoch + 1
    else:
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")
    precise_bn_loader = loader.construct_loader(cfg, "train", is_precise_bn=True)

    # Create meters.
    if cfg.DETECTION.ENABLE:
        train_meter = AVAMeter(len(train_loader), cfg, mode="train")
        val_meter = AVAMeter(len(val_loader), cfg, mode="val")
    else:
        train_meter = TrainMeter(len(train_loader), cfg)
        val_meter = ValMeter(len(val_loader), cfg)

    # Set up writer for logging to Tensorboard format.
    if cfg.TENSORBOARD.ENABLE and du.is_master_proc(
        cfg.NUM_GPUS * cfg.NUM_SHARDS
    ):
        writer = tb.TensorboardWriter(cfg)
    else:
        writer = None

    # Perform the training loop.
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, cur_epoch)
            if changed:
                (
                    model,
                    optimizer,
                    train_loader,
                    val_loader,
                    precise_bn_loader,
                    train_meter,
                    val_meter,
                ) = build_trainer(cfg)
                # Load checkpoint.
                if cu.has_checkpoint(cfg.OUTPUT_DIR):
                    last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
                    assert "{:05d}.pyth".format(cur_epoch) in last_checkpoint
                else:
                    last_checkpoint = cfg.TRAIN.CHECKPOINT_FILE_PATH
                logger.info("Load from {}".format(last_checkpoint))
                cu.load_checkpoint(
                    last_checkpoint, model, cfg.NUM_GPUS > 1, optimizer
                )

        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)
        # Train for one epoch.
        train_epoch(
            train_loader, model, optimizer, train_meter, cur_epoch, cfg, writer
        )

        # Compute precise BN stats.
        if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
            calculate_and_update_precise_bn(
                precise_bn_loader,
                model,
                min(cfg.BN.NUM_BATCHES_PRECISE, len(precise_bn_loader)),
            )
        _ = misc.aggregate_sub_bn_stats(model)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, cfg)
        # Evaluate the model on validation set.
        if misc.is_eval_epoch(
            cfg, cur_epoch, None if multigrid is None else multigrid.schedule
        ):
            eval_epoch(val_loader, model, val_meter, cur_epoch, cfg, writer)

    if writer is not None:
        writer.close()
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    bb_df = pd.read_csv(bb_repo)
    train_idx = np.arange(len(bb_df))
    dev_idx, val_idx = train_test_split(train_idx, test_size=0.20)
    dev_df = bb_df.iloc[dev_idx, :].reset_index(drop=True)
    val_df = bb_df.iloc[val_idx, :].reset_index(drop=True)
    bb_train_dataset = BBDataset(True, device, dev_df)
    bb_dev_dataset = BBDataset(True, device, dev_df)
    bb_val_dataset = BBDataset(True, device, val_df)
    bb_test_dataset = BBDataset(False, device)
    train_dataloader = DataLoader(bb_train_dataset, batch_size=32)
    dev_dataloader = DataLoader(bb_dev_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(bb_val_dataset, batch_size=32)
    test_dataloader = DataLoader(bb_test_dataset, batch_size=32)
    preload_model = torchvision.models.resnet50(pretrained=True).to(device)
    header_model = Res50BBHead([1000], 0.5).to(device)
    model = ResPneuNet(preload_model, header_model)
    n_epoch = 5
    optimizer = optim.Adam(
        [
            {"params": model.preload_backbone.parameters(), "lr": 0.0001},
            {"params": model.header.parameters(), "lr": 0.001},
        ],
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,
        amsgrad=False,
    )
    criterion = nn.L1Loss().to(device)
    n_obs, batch_size, n_batch_per_epoch = get_batch_info(dev_dataloader)
    clr = CLR(n_epoch, n_batch_per_epoch, 0.1, 1., 0.95, 0.85, 2)
    callbacks = [clr]
    model = fit_model(
        model,
        n_epoch,
        dev_dataloader,
        optimizer,
        criterion,
        loss_fn,
        metric_fn,
        val_dataloader,
        checkpoint=True,
        model_fn="bb",
    )
    prediction = predict_model(model, test_dataloader, pred_fn)
    string_prediction = [
        "{} {} {} {}".format(x[0], x[1], x[2], x[3]) for x in prediction
    ]
    patientid = test_dataloader.dataset.patientId
    pneu_bb = string_prediction
    bb_pred_df = pd.DataFrame({"name": patientid, "label": pneu_bb})
    bb_pred_df.to_csv(bb_predict_repo, index=False)
    save_checkpoint(model, optimizer, fname="bb")
def main():
    parser = argparse.ArgumentParser(description='Dataloader test')
    parser.add_argument('--gpu', default='0', help='gpu id')
    parser.add_argument('--workers', default=16, type=int,
                        help='num workers for data loading')
    parser.add_argument('--nb_epoch', default=100, type=int,
                        help='training epoch')
    parser.add_argument('--lr', default=1e-4, type=float, help='learning rate')
    parser.add_argument('--power', default=0, type=float,
                        help='lr poly power; 0 indicates step decay by half')
    parser.add_argument('--batch_size', default=8, type=int, help='batch size')
    parser.add_argument('--size', default=256, type=int, help='image size')
    parser.add_argument('--anchor_imsize', default=416, type=int,
                        help='scale used to calculate anchors defined in model cfg file')
    parser.add_argument('--data_root', type=str, default='./ln_data/DMS/',
                        help='path to ReferIt splits data folder')
    parser.add_argument('--split_root', type=str, default='data',
                        help='location of pre-parsed dataset info')
    parser.add_argument('--dataset', default='referit', type=str,
                        help='referit/flickr/unc/unc+/gref')
    parser.add_argument('--time', default=20, type=int,
                        help='maximum time steps (lang length) per batch')
    parser.add_argument('--emb_size', default=512, type=int,
                        help='fusion module embedding dimensions')
    parser.add_argument('--resume', default='', type=str, metavar='PATH',
                        help='path to latest checkpoint (default: none)')
    parser.add_argument('--pretrain', default='', type=str, metavar='PATH',
                        help='pretrain support load state_dict that are not identical, '
                             'while have no loss saved as resume')
    parser.add_argument('--print_freq', '-p', default=2000, type=int,
                        metavar='N', help='print frequency (default: 1e3)')
    parser.add_argument('--savename', default='default', type=str,
                        help='Name head for saved model')
    parser.add_argument('--seed', default=13, type=int, help='random seed')
    parser.add_argument('--bert_model', default='bert-base-uncased', type=str,
                        help='bert model')
    parser.add_argument('--test', dest='test', default=False,
                        action='store_true', help='test')
    parser.add_argument('--nflim', default=3, type=int, help='nflim')
    parser.add_argument('--mstage', dest='mstage', default=False,
                        action='store_true', help='if mstage')
    parser.add_argument('--mstack', dest='mstack', default=False,
                        action='store_true', help='if mstack')
    parser.add_argument('--w_div', default=0.125, type=float,
                        help='weight of the diverge loss')
    parser.add_argument('--fusion', default='prod', type=str, help='prod/cat')
    parser.add_argument('--tunebert', dest='tunebert', default=False,
                        action='store_true', help='if tunebert')
    parser.add_argument('--large', dest='large', default=False,
                        action='store_true',
                        help='if large mode: fpn16, convlstm out, size 512')

    global args, anchors_full
    args = parser.parse_args()
    if args.large:
        args.gsize = 16
        args.size = 512
    else:
        args.gsize = 8
    print('----------------------------------------------------------------------')
    print(sys.argv[0])
    print(args)
    print('----------------------------------------------------------------------')

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    ## fix seed
    cudnn.benchmark = False
    cudnn.deterministic = True
    random.seed(args.seed)
    np.random.seed(args.seed + 1)
    torch.manual_seed(args.seed + 2)
    torch.cuda.manual_seed_all(args.seed + 3)
    eps = 1e-10

    ## following anchor sizes calculated by kmeans under args.anchor_imsize=416
    if args.dataset == 'referit':
        anchors = '30,36, 78,46, 48,86, 149,79, 82,148, 331,93, 156,207, 381,163, 329,285'
    elif args.dataset == 'flickr':
        anchors = '29,26, 55,58, 137,71, 82,121, 124,205, 204,132, 209,263, 369,169, 352,294'
    else:
        anchors = '10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326'
    anchors = [float(x) for x in anchors.split(',')]
    anchors_full = [(anchors[i], anchors[i + 1])
                    for i in range(0, len(anchors), 2)][::-1]

    ## save logs
    if args.savename == 'default':
        args.savename = 'filmconv_nofpn32_%s_batch%d' % (args.dataset, args.batch_size)
    if not os.path.exists('./logs'):
        os.mkdir('logs')
    logging.basicConfig(level=logging.INFO,
                        filename="./logs/%s" % args.savename,
                        filemode="a+",
                        format="%(asctime)-15s %(levelname)-8s %(message)s")
    logging.info(str(sys.argv))
    logging.info(str(args))

    input_transform = Compose([
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    train_dataset = ReferDataset(data_root=args.data_root,
                                 split_root=args.split_root,
                                 dataset=args.dataset,
                                 split='train',
                                 imsize=args.size,
                                 transform=input_transform,
                                 max_query_len=args.time,
                                 augment=True)
    val_dataset = ReferDataset(data_root=args.data_root,
                               split_root=args.split_root,
                               dataset=args.dataset,
                               split='val',
                               imsize=args.size,
                               transform=input_transform,
                               max_query_len=args.time)
    ## note certain dataset does not have 'test' set:
    ## 'unc': {'train', 'val', 'trainval', 'testA', 'testB'}
    test_dataset = ReferDataset(data_root=args.data_root,
                                split_root=args.split_root,
                                dataset=args.dataset,
                                testmode=True,
                                split='val',
                                imsize=args.size,
                                transform=input_transform,
                                max_query_len=args.time)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=True, pin_memory=True, drop_last=True,
                              num_workers=args.workers)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size,
                            shuffle=False, pin_memory=True, drop_last=True,
                            num_workers=args.workers)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False,
                             pin_memory=True, drop_last=True, num_workers=0)

    ## Model
    model = grounding_model_multihop(NFilm=args.nflim, fusion=args.fusion,
                                     intmd=args.mstack, mstage=args.mstage,
                                     emb_size=args.emb_size, coordmap=True,
                                     convlstm=args.large,
                                     bert_model=args.bert_model,
                                     dataset=args.dataset,
                                     tunebert=args.tunebert)
    model = torch.nn.DataParallel(model).cuda()

    if args.pretrain:
        model = load_pretrain(model, args, logging)
    if args.resume:
        model = load_resume(model, args, logging)

    print('Num of parameters:',
          sum([param.nelement() for param in model.parameters()]))
    logging.info('Num of parameters:%d' %
                 int(sum([param.nelement() for param in model.parameters()])))

    # split parameters into visual / text / fusion groups
    # (compare by identity; testing membership against a generator would
    # exhaust it after the first check)
    if args.tunebert:
        visu_param = list(model.module.visumodel.parameters())
        text_param = list(model.module.textmodel.parameters())
        visu_ids = {id(p) for p in visu_param}
        text_ids = {id(p) for p in text_param}
        rest_param = [param for param in model.parameters()
                      if id(param) not in visu_ids and id(param) not in text_ids]
        sum_visu = sum([param.nelement() for param in visu_param])
        sum_text = sum([param.nelement() for param in text_param])
        sum_fusion = sum([param.nelement() for param in rest_param])
        print('visu, text, fusion module parameters:',
              sum_visu, sum_text, sum_fusion)
    else:
        visu_param = list(model.module.visumodel.parameters())
        visu_ids = {id(p) for p in visu_param}
        rest_param = [param for param in model.parameters()
                      if id(param) not in visu_ids]
        sum_visu = sum([param.nelement() for param in visu_param])
        sum_text = sum([param.nelement()
                        for param in model.module.textmodel.parameters()])
        sum_fusion = sum([param.nelement() for param in rest_param]) - sum_text
        print('visu, text, fusion module parameters:',
              sum_visu, sum_text, sum_fusion)

    ## optimizer; rmsprop default
    if args.tunebert:
        optimizer = torch.optim.RMSprop(
            [{'params': rest_param},
             {'params': visu_param, 'lr': args.lr / 10.},
             {'params': text_param, 'lr': args.lr / 10.}],
            lr=args.lr, weight_decay=0.0005)
    else:
        optimizer = torch.optim.RMSprop(
            [{'params': rest_param},
             {'params': visu_param, 'lr': args.lr / 10.}],
            lr=args.lr, weight_decay=0.0005)

    ## training and testing
    best_accu = -float('Inf')
    if args.test:
        _ = test_epoch(test_loader, model)
    else:
        for epoch in range(args.nb_epoch):
            adjust_learning_rate(args, optimizer, epoch)
            train_epoch(train_loader, model, optimizer, epoch)
            accu_new = validate_epoch(val_loader, model)
            ## remember best accu and save checkpoint
            is_best = accu_new > best_accu
            best_accu = max(accu_new, best_accu)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_loss': accu_new,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                args,
                filename=args.savename)
        print('\nBest Accu: %f\n' % best_accu)
        logging.info('\nBest Accu: %f\n' % best_accu)
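# main() above relies on adjust_learning_rate(args, optimizer, epoch). Based on
# the --power help text ("lr poly power; 0 indicates step decay by half"), a
# plausible sketch; the 10-epoch halving interval is an assumption, and the
# real helper likely preserves the 1/10 scaling of the visual/text groups:
def adjust_learning_rate(args, optimizer, epoch):
    if args.power == 0:
        lr = args.lr * (0.5 ** (epoch // 10))  # assumed step-decay interval
    else:
        lr = args.lr * ((1 - float(epoch) / args.nb_epoch) ** args.power)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr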
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # configuration
    CONFIG = Dict(yaml.safe_load(open(args.config)))

    # writer
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    print('\n------------------------Loading Model------------------------\n')
    if CONFIG.model == 'resnet18':
        print('ResNet18 will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)
    elif CONFIG.model == 'resnet50':
        print('ResNet50 will be used as a model.')
        model = resnet.generate_model(50, n_classes=CONFIG.n_classes)
    else:
        print('resnet18 will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(
            weight=get_class_weight(CONFIG.n_classes).cuda(args.gpu)).cuda(args.gpu)
    else:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=CONFIG.learning_rate,
                                    momentum=CONFIG.momentum,
                                    dampening=CONFIG.dampening,
                                    weight_decay=CONFIG.weight_decay,
                                    nesterov=CONFIG.nesterov)

    # learning rate scheduler
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # resume if you want
    begin_epoch = 0
    log = None
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, best_acc1, scheduler = resume(
                CONFIG, model, optimizer, scheduler)
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            print('training will start from {} epoch'.format(begin_epoch))
        else:
            print("there is no checkpoint at the result folder")
        if os.path.exists(os.path.join(CONFIG.result_path, 'log.csv')):
            print('loading the log file...')
            log = pd.read_csv(os.path.join(CONFIG.result_path, 'log.csv'))
        else:
            print("there is no log file at the result folder.")
            print('Making a log file...')

    # generate log when you start training from scratch
    if log is None:
        log = pd.DataFrame(columns=[
            'epoch', 'lr', 'train_loss', 'val_loss',
            'train_acc@1', 'train_acc@5', 'val_acc@1', 'val_acc@5'
        ])

    # DataLoaders
    normalize = Normalize(mean=get_mean(), std=get_std())
    train_data = Kinetics(CONFIG,
                          transform=Compose([
                              RandomCrop((CONFIG.height, CONFIG.width)),
                              ToTensor(),
                              normalize,
                          ]))
    val_data = Kinetics(CONFIG,
                        transform=Compose([
                            RandomCrop((CONFIG.height, CONFIG.width)),
                            ToTensor(),
                            normalize,
                        ]),
                        mode='validation')
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    else:
        train_sampler = None
    train_loader = DataLoader(train_data,
                              batch_size=CONFIG.batch_size,
                              shuffle=(train_sampler is None),
                              num_workers=CONFIG.num_workers,
                              pin_memory=True,
                              sampler=train_sampler,
                              drop_last=True)
    val_loader = DataLoader(val_data,
                            batch_size=CONFIG.batch_size,
                            shuffle=False,
                            num_workers=CONFIG.num_workers,
                            pin_memory=True)

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    train_losses = []
    val_losses = []
    train_top1_accuracy = []
    train_top5_accuracy = []
    val_top1_accuracy = []
    val_top5_accuracy = []

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_loss, train_acc1, train_acc5 = train(train_loader, model, criterion,
                                                   optimizer, epoch, args, CONFIG)
        train_losses.append(train_loss)
        train_top1_accuracy.append(train_acc1)
        train_top5_accuracy.append(train_acc5)

        # validation on validation set
        val_loss, val_acc1, val_acc5 = validate(val_loader, model, criterion,
                                                args, CONFIG)
        val_losses.append(val_loss)
        val_top1_accuracy.append(val_acc1)
        val_top5_accuracy.append(val_acc5)

        # scheduler
        if CONFIG.optimizer == 'SGD':
            scheduler.step(val_loss)

        # save a model if top1 acc is higher than ever
        if best_acc1 < val_acc1:
            best_acc1 = val_acc1
            torch.save(model.state_dict(),
                       os.path.join(CONFIG.result_path, 'best_acc1_model.prm'))

        # save checkpoint every epoch
        save_checkpoint(CONFIG, epoch, model, optimizer, best_acc1, scheduler)

        # save a model every 10 epochs
        # save base models, NOT DataParalled models
        if epoch % 10 == 0 and epoch != 0:
            torch.save(model.state_dict(),
                       os.path.join(CONFIG.result_path,
                                    'epoch_{}_model.prm'.format(epoch)))

        # tensorboardx
        if writer is not None:
            writer.add_scalars("loss", {
                'train': train_losses[-1],
                'val': val_losses[-1]
            }, epoch)
            writer.add_scalars("train_acc", {
                'top1': train_top1_accuracy[-1],
                'top5': train_top5_accuracy[-1]
            }, epoch)
            writer.add_scalars("val_acc", {
                'top1': val_top1_accuracy[-1],
                'top5': val_top5_accuracy[-1]
            }, epoch)

        # write logs to dataframe and csv file
        tmp = pd.Series([
            epoch,
            scheduler.get_lr()[0],
            train_losses[-1],
            val_losses[-1],
            train_top1_accuracy[-1],
            train_top5_accuracy[-1],
            val_top1_accuracy[-1],
            val_top5_accuracy[-1],
        ], index=log.columns)
        log = log.append(tmp, ignore_index=True)
        log.to_csv(os.path.join(CONFIG.result_path, 'log.csv'), index=False)

        print(
            'epoch: {}\tlr: {}\tloss train: {:.4f}\tloss val: {:.4f}\tval_acc1: {:.5f}\tval_acc5: {:.4f}'
            .format(epoch, scheduler.get_lr()[0], train_losses[-1],
                    val_losses[-1], val_top1_accuracy[-1], val_top5_accuracy[-1]))

    # save base models, NOT DataParalled models
    torch.save(model.module.state_dict(),
               os.path.join(CONFIG.result_path, 'final_model.prm'))
def main():
    """
    Training and validation.
    """
    global best_bleu4, epochs_since_improvement, checkpoint, tagger_checkpoint, \
        start_epoch, fine_tune_encoder, data_name, word_map

    print('Running on device {}\n'.format(device))

    # Read word map
    word_map_file = os.path.join(data_folder, 'WORDMAP_' + data_name + '.json')
    with open(word_map_file, 'r') as j:
        word_map = json.load(j)

    # Initialize / load checkpoint
    tagger_checkpoint = torch.load(tagger_checkpoint)
    encoder_tagger = tagger_checkpoint['encoder']
    encoder_tagger.fine_tune(False)
    if checkpoint is None:
        decoder = PureSCN(embed_dim=emb_dim,
                          decoder_dim=decoder_dim,
                          factored_dim=factored_dim,
                          semantic_dim=semantic_dim,
                          vocab_size=len(word_map),
                          dropout=dropout)
        decoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, decoder.parameters()),
            lr=decoder_lr)
        encoder = EncoderCaption()
        encoder.fine_tune(fine_tune_encoder)
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=encoder_lr) if fine_tune_encoder else None
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        if fine_tune_encoder is True and encoder_optimizer is None:
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(
                params=filter(lambda p: p.requires_grad, encoder.parameters()),
                lr=encoder_lr)

    # Move to GPU, if available
    decoder = decoder.to(device)
    encoder = encoder.to(device)
    encoder_tagger = encoder_tagger.to(device)

    # Loss function
    criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'TRAIN',
                       transform=transforms.Compose([normalize])),
        batch_size=batch_size, shuffle=True,
        num_workers=workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        CaptionDataset(data_folder, data_name, 'VAL',
                       transform=transforms.Compose([normalize])),
        batch_size=batch_size, shuffle=True,
        num_workers=workers, pin_memory=True)

    # Epochs
    for epoch in range(start_epoch, epochs):
        print('Current epoch {}\n'.format(epoch + 1))

        # Decay learning rate if there is no improvement for 8 consecutive
        # epochs, and terminate training after 20
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            if fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)

        # One epoch's training
        train(train_loader=train_loader,
              encoder=encoder,
              encoder_tagger=encoder_tagger,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch)

        # One epoch's validation
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                encoder_tagger=encoder_tagger,
                                decoder=decoder,
                                criterion=criterion)

        # Check if there was an improvement
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        print('Saving checkpoint for epoch {}\n'.format(epoch + 1))

        # Save checkpoint
        save_checkpoint('scn', data_name, epoch, epochs_since_improvement,
                        encoder, decoder, encoder_optimizer,
                        decoder_optimizer, recent_bleu4, is_best)
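# The caption-training loop above shrinks the learning rate via
# adjust_learning_rate(optimizer, 0.8). A minimal sketch of that helper,
# matching the call signature (an assumption about its body):
def adjust_learning_rate(optimizer, shrink_factor):
    print("\nDECAYING learning rate.")
    # scale every parameter group's learning rate by shrink_factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("The new learning rate is %f\n" % (optimizer.param_groups[0]['lr'],))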
def train(train_data_loader, eval_data_loader, model, reconstruction_loss,
          vocoder, mel_stat, optimizer, scheduler, global_step,
          writer=None, DEVICE=None):
    model.train()
    while global_step < args.max_training_step:
        for step, (mels, _) in tqdm(enumerate(train_data_loader),
                                    total=len(train_data_loader),
                                    unit='B', ncols=70, leave=False):
            mels = mels.float().to(DEVICE)
            optimizer.zero_grad()

            mels_hat, commitment_loss, perplexity = model(mels.detach())
            commitment_loss = args.commitment_cost * commitment_loss
            recon_loss = reconstruction_loss(mels_hat, mels)
            loss = commitment_loss + recon_loss

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_thresh)
            optimizer.step()

            if global_step % args.save_checkpoint_step == 0:
                save_checkpoint(checkpoint_path=args.model_checkpoint_path,
                                model=model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                global_step=global_step)

            if global_step % args.eval_step == 0:
                evaluate(model=model,
                         vocoder=vocoder,
                         eval_data_loader=eval_data_loader,
                         criterion=reconstruction_loss,
                         mel_stat=mel_stat,
                         global_step=global_step,
                         writer=writer,
                         DEVICE=DEVICE)
                model.train()

            if args.log_tensorboard:
                writer.add_scalars(mode="train_recon_loss",
                                   global_step=global_step, loss=recon_loss)
                writer.add_scalars(mode="train_commitment_loss",
                                   global_step=global_step, loss=commitment_loss)
                writer.add_scalars(mode="train_perplexity",
                                   global_step=global_step, loss=perplexity)
                writer.add_scalars(mode="train_total_loss",
                                   global_step=global_step, loss=loss)

            global_step += 1
        scheduler.step()
def save_training(self, out_dir):
    meta = dict(c_epoch=self.c_epoch, c_iter=self.c_iter)
    filename = out_dir + 'epoch_{}.pth'.format(self.c_epoch + 1)
    optimizer = self.optimizer
    save_checkpoint(filename, self.model, optimizer, meta)
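# save_training above uses the (filename, model, optimizer, meta) argument
# order, as in mmcv-style helpers. A minimal sketch with that signature (an
# assumption about what gets serialized):
import torch

def save_checkpoint(filename, model, optimizer=None, meta=None):
    checkpoint = {
        "meta": meta or {},
        "state_dict": model.state_dict(),
    }
    if optimizer is not None:
        checkpoint["optimizer"] = optimizer.state_dict()
    torch.save(checkpoint, filename)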
def main():
    best_result = math.inf if TASK == 'count' else 0.0
    best_type_meters = dict()
    train_loader, test_loader = get_dataloader(config, logger)

    num_classes = 1
    if TASK == 'frameqa':
        answer_dict = utils.load_answer_dict()
        num_classes = len(answer_dict)
    if TASK == 'youtube2text':
        if config.get_bool('abc.is_multiple_choice'):
            num_classes = 1
        else:
            num_classes = 1000
    logger.info(f'Num classes: {num_classes}')

    vocab_size = utils.get_vocab_size(config, TASK, level='word')
    char_vocab_size = utils.get_vocab_size(config, TASK, level='char')
    model = get_model(vocab_size, char_vocab_size, num_classes)
    model = model.cuda()

    if TASK in MULTIPLE_CHOICE_TASKS:
        criterion = nn.CrossEntropyLoss(reduction='sum')
    elif TASK == 'count':
        inner_criterion = nn.MSELoss()

        def criterion(input, target):
            target = (target - 1.) / 10.
            return inner_criterion(input, target)
        # criterion = nn.SmoothL1Loss()
    elif TASK in ['frameqa']:
        criterion = nn.CrossEntropyLoss()
    elif TASK == 'youtube2text':
        if config.get_bool('abc.is_multiple_choice'):
            criterion = nn.CrossEntropyLoss(reduction='sum')
        else:
            criterion = nn.CrossEntropyLoss()

    optimizer_type = config.get_string('optimizer')
    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=config.get_float('adam.lr'))
    else:
        raise Exception(f'Unknown optimizer: {optimizer_type}')

    start_epoch = 1
    end_epoch = config.get_int('num_epochs')
    for epoch in range(start_epoch, end_epoch + 1):
        logger.info(f'Epoch [{epoch}/{end_epoch}] start')
        train(model, train_loader, criterion, optimizer, epoch)
        current_result, current_type_meters = test(model, test_loader,
                                                   criterion, epoch)
        logger.info(f'Epoch [{epoch}/{end_epoch}] end')
        if args.debug:
            break

        is_best = False
        if TASK == 'count':
            if current_result < best_result:
                is_best = True
                best_result = current_result
        else:
            if current_result > best_result:
                is_best = True
                best_result = current_result
                best_type_meters = current_type_meters
        logger.info(
            colored(
                "Current best result: {:.2f}, Exp path: {}".format(
                    best_result, args.experiment_path), "red"))
        logger.info(best_type_meters)

        save_checkpoint(
            {
                'arch': config.get_string('arch'),
                'task': TASK,
                'state_dict': model.state_dict(),
                'epoch': epoch + 1,
                'best_result': best_result,
                'optimizer': optimizer.state_dict(),
                'best_type_meters': best_type_meters,
            },
            is_best=is_best,
            folder=args.experiment_path)

    if TASK == 'count':
        logger.info(f'Best MSE: {best_result}')
    else:
        logger.info(f'Best Acc: {best_result}')
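# main() above passes is_best and a target folder to save_checkpoint. A common
# pattern for such a helper is to write the checkpoint and copy it when it is
# the best so far (a sketch; the filenames are assumptions):
import os
import shutil
import torch

def save_checkpoint(state, is_best=False, folder=".", filename="checkpoint.pth.tar"):
    path = os.path.join(folder, filename)
    torch.save(state, path)
    if is_best:
        # keep a separate copy of the best-performing checkpoint
        shutil.copyfile(path, os.path.join(folder, "model_best.pth.tar"))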
def train_model():
    """Trains the model."""
    # Build the model (before the loaders to speed up debugging)
    model = model_builder.build_model()
    log_model_info(model)

    # Define the loss function
    loss_fun = losses.get_loss_fun()
    # Construct the optimizer
    optimizer = optim.construct_optimizer(model)

    start_epoch = 0
    min_val_loss = np.inf
    cur_patience = 0

    # Create data loaders
    # train_data, val_data, test_data = loader.load_and_prepare_data()
    train_loader = loader.construct_train_loader(root=cfg.PATHS.DATAPATH)
    val_loader = loader.construct_val_loader(root=cfg.PATHS.DATAPATH)
    test_loader = loader.construct_test_loader(root=cfg.PATHS.DATAPATH)

    # Create meters
    train_meter = Meter(len(train_loader), cfg.TRAIN.BATCH_SIZE, mode="train")
    val_meter = Meter(len(val_loader), cfg.TEST.BATCH_SIZE, mode="valid")
    test_meter = Meter(len(test_loader), cfg.TEST.BATCH_SIZE, mode="test")

    # Set up tb logging
    tb = None
    if cfg.IS_TB_LOG:
        tb = TensorboardLogger(log_dir=cfg.PATHS.TB_OUT_DIR, flush_secs=30)

    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch, mode="train", tb=tb)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            nu.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if cu.is_checkpoint_epoch(cur_epoch):
            checkpoint_file = cu.save_checkpoint(model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        if is_eval_epoch(cur_epoch):
            val_loss = test_epoch(val_loader, model, loss_fun, val_meter,
                                  cur_epoch, mode="valid", tb=tb)
            # Save the best model based on val score
            if val_loss < min_val_loss:
                min_val_loss = val_loss
                cur_patience = 0
                checkpoint_file = cu.save_best_loss_checkpoint(
                    model, optimizer, cur_epoch, val_loss)
                print(f"Wrote best score checkpoint to: {checkpoint_file}")
            # Handle early stopping based on val score
            elif val_loss - cfg.TRAIN.ES_THRESHOLD > min_val_loss:
                cur_patience += 1
                print(f"Val loss larger than min value, patience at: "
                      f"{cur_patience} (max {cfg.TRAIN.ES_PATIENCE})")
                if cur_patience > cfg.TRAIN.ES_PATIENCE:
                    logger.info(f"ES patience hit at {cur_epoch} epochs, quitting")
                    break

    best_checkpoint = cu.get_best_score_checkpoint()
    best_epoch = cu.load_checkpoint(best_checkpoint, model, optimizer)
    print(f"Loaded checkpoint from epoch: {best_epoch+1}")
    print("=" * 100)
    test_epoch(train_loader, model, loss_fun, train_meter, cur_epoch,
               mode="train", tb=None)
    test_epoch(test_loader, model, loss_fun, test_meter, cur_epoch,
               mode="test", tb=None)
    if tb is not None:
        tb.close()
def main(cfg):
    # basic settings
    loss_F = torch.nn.CrossEntropyLoss()
    gpu_nums = int(cfg['NUM_GPUS'])
    if gpu_nums == 0:
        use_cuda = False
    else:
        use_cuda = True

    # load model
    model = AnyNet(cfg)
    if use_cuda:
        model = torch.nn.DataParallel(model, device_ids=[0])
        model = model.cuda()

    # load dataset
    Trainpath = cfg['TRAIN']['PATH']
    RESIZE_SIZE = cfg['TRAIN']['IM_SIZE']
    train_data = SingleDataset(Trainpath, split='train', resize_size=RESIZE_SIZE)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=cfg['TRAIN']['BATCH_SIZE'],
                              shuffle=True,
                              num_workers=cfg['DATA_LOADER']['NUM_WORKERS'],
                              pin_memory=True)
    Testpath = cfg['TEST']['PATH']
    RESIZE_SIZE_val = cfg['TEST']['IM_SIZE']
    test_data = SingleDataset(Testpath, split='val', resize_size=RESIZE_SIZE_val)
    test_loader = DataLoader(dataset=test_data,
                             batch_size=cfg['TEST']['BATCH_SIZE'],
                             shuffle=False,
                             num_workers=cfg['DATA_LOADER']['NUM_WORKERS'],
                             pin_memory=True)

    # optimizer and loss function and evaluator
    if cfg['OPTIM']['OPTIMIZER'] == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=cfg['OPTIM']['BASE_LR'],
                                     weight_decay=1e-4)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg['OPTIM']['BASE_LR'],
                                    momentum=0.9,
                                    weight_decay=5e-4)

    # load checkpoint or initial weights
    start_epoch = 0
    if cfg['TRAIN']['RESUME'] is not None:
        resume = cfg['TRAIN']['RESUME']
        if not os.path.isfile(resume):
            raise RuntimeError("=> no checkpoint found at '{}'".format(resume))
        checkpoint_epoch = cp.load_checkpoint(resume, gpu_num=gpu_nums,
                                              model=model, optimizer=optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg['TRAIN']['WEIGHTS']:
        cp.load_checkpoint(cfg['TRAIN']['WEIGHTS'], gpu_nums, model)
    else:
        init_weights(model, zero_init_gamma=cfg['BN']['ZERO_INIT_FINAL_GAMMA'])

    # save training process
    log_file = log_g.get_log_filename(os.path.join(cfg['OUT_DIR'], 'log/'))
    log = open(log_file, 'w+')

    # start training
    max_epoch = cfg['OPTIM']['MAX_EPOCH']
    batch_size = cfg['TRAIN']['BATCH_SIZE']
    eval_period = cfg['TRAIN']['EVAL_PERIOD']
    batch_count = 0
    total_step = len(train_loader)
    num_class = cfg['MODEL']['NUM_CLASSES']
    # correct_all = list(0. for i in range(cfg['MODEL']['NUM_CLASSES']))
    # total_all = list(0. for i in range(cfg['MODEL']['NUM_CLASSES']))

    for epoch in range(start_epoch, max_epoch):
        print('**************train --%d-- **************' % (epoch))
        log.write('**************train --%d-- **************\n' % (epoch))

        # update learning rate
        lr = optim.get_epoch_lr(epoch_i=epoch, cfg=cfg)
        optim.set_lr(optimizer, lr)

        #########################################################################
        # start training an epoch
        #########################################################################
        model.train()
        c_train = 0
        t_train = 0
        for i, (img, lbl) in enumerate(train_loader):
            batch_count += 1
            # use cuda
            if use_cuda:
                img, lbl = img.cuda(), lbl.cuda()
            # forward
            preds = model(img)
            loss = loss_F(preds, lbl)
            # backward; gradients are accumulated and the optimizer only
            # steps every `batch_size` mini-batches
            # optimizer.zero_grad()
            loss.backward()
            # optimizer.step()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            if (batch_count % batch_size) == 0:
                optimizer.step()
                optimizer.zero_grad()
                batch_count = 0
            _, predicted = preds.max(1)
            c_train += predicted.eq(lbl).sum().item()
            t_train += lbl.size(0)
            # print epoch, step, loss, lr
            print('[%s]--train: %d/%d\tstep:%d/%d----lr:%.5f---loss:%.4f---Acc:%.3f' % (
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                (epoch + 1), max_epoch, (i + 1), total_step,
                lr, loss.item(), 100 * (c_train / t_train)))
            log.write('[%s]--train: [%d/%d]\tstep: [%d/%d]\t----lr:%.5f---loss:%.4f---Acc:%.3f\n' % (
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                (epoch + 1), max_epoch, (i + 1), total_step,
                lr, loss.item(), 100 * (c_train / t_train)))

        #########################################################################
        # start validation
        #########################################################################
        if ((epoch + 1) % eval_period == 0):
            print('**************validation --%d-- **************' % ((epoch + 1) // eval_period))
            model.eval()
            mean_loss_val = 0
            correct = np.zeros((num_class))
            total = np.zeros((num_class))
            top1_acc_sum = []
            with torch.no_grad():
                for val_epoch, (img_val, lbl_val) in enumerate(test_loader):
                    if use_cuda:
                        img_val, lbl_val = img_val.cuda(), lbl_val.cuda()
                    # predict
                    preds_val = model(img_val)
                    # calculate loss
                    loss_val = loss_F(preds_val, lbl_val)
                    mean_loss_val += loss_val.item()
                    # evaluation
                    top1_acc, top2_acc = Evaluator.accuracy(preds_val, lbl_val, [1, 2])
                    correct_i, total_i = Evaluator.accuracy_perclass(preds_val, lbl_val, num_class)
                    correct += correct_i
                    total += total_i
                    top1_acc_sum.append(top1_acc)
                    print('[%s]--valid: [%d/%d]\tloss: %.4f---top1_acc: %.3f' % (
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        val_epoch, len(test_loader),
                        loss_val.item(), top1_acc.item()))
            print('[{}]--valid: [{}]\tmean_loss: {}\ttop1_acc: {}\tper_class_acc: {}'.format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), (epoch + 1),
                (mean_loss_val / len(test_loader)), np.mean(top1_acc_sum),
                100 * (correct / total)))
            # save log
            log.write('[{}]--valid: [{}]\tmean_loss: {}\ttop1_acc: {}\tper_class_acc: {}\n'.format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), (epoch + 1),
                (mean_loss_val / len(test_loader)), np.mean(top1_acc_sum),
                100 * (correct / total)))

        #########################################################################
        # save model
        #########################################################################
        if ((epoch + 1) % 5 == 0):
            checkpoint_file = os.path.join(cfg['OUT_DIR'], 'checkpoint/')
            checkpoint_filename = cp.save_checkpoint(model, optimizer, epoch,
                                                     gpu_nums, checkpoint_file)
            log.write('[{}]--save checkpoint: {}\n'.format(
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                checkpoint_filename))
    log.close()
def train_net(args, logger, seed):
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    logger.info('seed={}'.format(seed))

    # init seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # cudnn
    # cudnn.benchmark = True
    cudnn.benchmark = False
    cudnn.deterministic = True

    writer = SummaryWriter(args.outpath)
    start_epoch = 0
    val_best_acc = 0
    val_best_acc_index = 0

    # data_loader
    train_loader, val_loader, target_class_num, dataset_sizes = \
        get_target_dataloader(args.target_dataset, args.batch_size,
                              args.num_workers, args.target_data_dir,
                              image_size=args.image_size,
                              data_aug=args.data_aug, logger=logger)

    # model setting
    model_source, model_target = get_model(args.base_model_name, args.base_task,
                                           logger, args)

    # target_model split: (feature, classifier)
    model_feature, model_source_classifier, model_target_classifier = \
        model_split(args.base_model_name, model_target, target_class_num,
                    logger, args)

    if len(args.gpu_id) > 1:
        model_source = nn.DataParallel(model_source)
        model_feature = nn.DataParallel(model_feature)
        model_source_classifier = nn.DataParallel(model_source_classifier)
        model_target_classifier = nn.DataParallel(model_target_classifier)
        model_source = model_source.cuda()
        model_feature = model_feature.cuda()
        model_target_classifier = model_target_classifier.cuda()
        model_source_classifier = model_source_classifier.cuda()
        logger.info("push all model to dataparallel and then gpu")
    else:
        model_source = model_source.cuda()
        model_feature = model_feature.cuda()
        model_target_classifier = model_target_classifier.cuda()
        model_source_classifier = model_source_classifier.cuda()
        logger.info("push all model to gpu")

    # iterations -> epochs
    num_epochs = int(np.round(args.max_iter * args.batch_size / dataset_sizes))
    step = [int(0.67 * num_epochs)]
    logger.info('num_epochs={}, step={}'.format(num_epochs, step))

    # loss
    loss_fn = get_loss_type(loss_type=args.loss_type, logger=logger)

    # get feature_criterions
    feature_criterions = None
    if args.reg_type in ['channel_att_fea_map_learn', 'fea_loss']:
        feature_criterions = get_reg_criterions(args, logger)

    # optimizer and lr_scheduler
    optimizer, lr_scheduler = get_optimier_and_scheduler(
        args, model_feature, model_target_classifier, feature_criterions,
        step, logger)

    # init framework
    framework = TransferFramework(args, train_loader, val_loader,
                                  target_class_num, args.data_aug,
                                  args.base_model_name, model_source,
                                  model_feature, model_source_classifier,
                                  model_target_classifier, feature_criterions,
                                  loss_fn, num_epochs, optimizer, lr_scheduler,
                                  writer, logger, print_freq=args.print_freq)

    # epochs
    for epoch in range(start_epoch, num_epochs):
        # train epoch
        clc_loss, kl_loss, fea_loss, train_total_loss, train_top1_acc = \
            framework.train(epoch)
        # val epoch
        val_loss, val_top1_acc = framework.val(epoch)

        # record into txt
        ours_record_epoch_data(args.outpath, epoch, clc_loss, kl_loss, fea_loss,
                               train_total_loss, train_top1_acc, val_loss,
                               val_top1_acc)

        if val_top1_acc >= val_best_acc:
            val_best_acc = val_top1_acc
            val_best_acc_index = epoch
            # save_checkpoint
            save_checkpoint(args.outpath, epoch, model_feature,
                            model_source_classifier, model_target_classifier,
                            optimizer, lr_scheduler, val_best_acc)

        logger.info('||==>Val Epoch: Val_best_acc_index={}\tVal_best_acc={:.4f}\n'
                    .format(val_best_acc_index, val_best_acc))
        # break
    return val_best_acc
def main():
    args = get_arguments()

    # configuration
    CONFIG = Dict(yaml.safe_load(open(args.config)))

    # writer
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    # DataLoaders
    normalize = Normalize(mean=get_mean(), std=get_std())
    train_data = Kinetics(CONFIG,
                          transform=Compose([
                              RandomCrop((CONFIG.height, CONFIG.width)),
                              ToTensor(),
                              normalize,
                          ]))
    val_data = Kinetics(CONFIG,
                        transform=Compose([
                            RandomCrop((CONFIG.height, CONFIG.width)),
                            ToTensor(),
                            normalize,
                        ]),
                        mode='validation')
    train_loader = DataLoader(train_data, batch_size=CONFIG.batch_size, shuffle=True,
                              num_workers=CONFIG.num_workers, drop_last=True)
    val_loader = DataLoader(val_data, batch_size=CONFIG.batch_size, shuffle=False,
                            num_workers=CONFIG.num_workers)

    # load model
    print('\n------------------------Loading Model------------------------\n')
    if CONFIG.model == 'resnet18':
        print(CONFIG.model + ' will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)
    elif CONFIG.model == 'resnext':
        print('ResNext101 will be used as a model.')
        model = resnext.generate_model(101, n_classes=CONFIG.n_classes)
    elif CONFIG.model == 'slowfast':
        print('slowfast will be used as a model.')
        model = slowfast.resnet152(class_num=CONFIG.n_classes)
    elif CONFIG.model == 'slowfast101_nl':
        print('slowfast101 with non local network will be used as a model.')
        model = slowfast.resnet101_NL(class_num=CONFIG.n_classes)
    elif CONFIG.model == 'slowfast_nl':
        if CONFIG.dual_attention:
            print('slowfast_nl w/ dual attention will be used as a model.')
            model = slowfast.resnet152_NL(class_num=CONFIG.n_classes, dual_attention=True)
        else:
            print('slowfast_nl w/o dual attention will be used as a model.')
            model = slowfast.resnet152_NL(class_num=CONFIG.n_classes)
    else:
        print('resnet18 will be used as a model.')
        model = resnet.generate_model(18, n_classes=CONFIG.n_classes)

    # metric
    if CONFIG.metric == 'L2constrain':
        print('L2constrain metric will be used.')
        model.fc = L2ConstrainedLinear(model.fc.in_features, model.fc.out_features)

    # multi-scale input
    if CONFIG.msc == 'Temporal':
        print('Temporal multi-scale input will be used')
        model = TemporalMSC(model)
    elif CONFIG.msc == 'Spatial':
        print('Spatial multi-scale input will be used')
        model = SpatialMSC(model)
    elif CONFIG.msc == 'SpatioTemporal':
        print('SpatioTemporal multi-scale input will be used')
        model = SpatioTemporalMSC(model)

    # set optimizer, lr_scheduler
    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.SGD(model.parameters(), lr=CONFIG.learning_rate,
                              momentum=CONFIG.momentum, dampening=CONFIG.dampening,
                              weight_decay=CONFIG.weight_decay, nesterov=CONFIG.nesterov)
    elif CONFIG.optimizer == 'AdaBound':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = adabound.AdaBound(model.parameters(), lr=CONFIG.learning_rate,
                                      final_lr=CONFIG.final_lr,
                                      weight_decay=CONFIG.weight_decay)
    else:
        print('There is no optimizer which suits your option. '
              'Instead, SGD will be used as an optimizer.')
        optimizer = optim.SGD(model.parameters(), lr=CONFIG.learning_rate,
                              momentum=CONFIG.momentum, dampening=CONFIG.dampening,
                              weight_decay=CONFIG.weight_decay, nesterov=CONFIG.nesterov)

    # learning rate scheduler
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                         patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # send the model to cuda/cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True
    else:
        print('You have to use GPUs because training 3DCNN is computationally expensive.')
        sys.exit(1)

    # resume if you want
    begin_epoch = 0
    log = None
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, scheduler = resume(CONFIG, model, optimizer, scheduler)
            print('training will start from {} epoch'.format(begin_epoch))
        if os.path.exists(os.path.join(CONFIG.result_path, 'log.csv')):
            log = pd.read_csv(os.path.join(CONFIG.result_path, 'log.csv'))

    # generate log when you start training from scratch
    if log is None:
        log = pd.DataFrame(columns=['epoch', 'lr', 'train_loss', 'val_loss', 'acc@1', 'acc@5'])

    # criterion for loss
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(weight=get_class_weight().to(device))
    else:
        criterion = nn.CrossEntropyLoss()

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    losses_train = []
    losses_val = []
    top1_accuracy = []
    top5_accuracy = []
    best_top1_accuracy = 0.0
    best_top5_accuracy = 0.0

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        # training
        loss_train = train(model, train_loader, criterion, optimizer, CONFIG, device)
        losses_train.append(loss_train)

        # validation
        loss_val, top1, top5 = validation(model, val_loader, criterion, CONFIG, device)
        if CONFIG.optimizer == 'SGD':
            scheduler.step(loss_val)
        losses_val.append(loss_val)
        top1_accuracy.append(top1)
        top5_accuracy.append(top5)

        # save a model if topk accuracy is higher than ever
        # save base models, NOT DataParallel-wrapped models
        if best_top1_accuracy < top1_accuracy[-1]:
            best_top1_accuracy = top1_accuracy[-1]
            torch.save(model.module.state_dict(),
                       os.path.join(CONFIG.result_path, 'best_top1_accuracy_model.prm'))
        if best_top5_accuracy < top5_accuracy[-1]:
            best_top5_accuracy = top5_accuracy[-1]
            torch.save(model.module.state_dict(),
                       os.path.join(CONFIG.result_path, 'best_top5_accuracy_model.prm'))

        # save checkpoint every epoch
        save_checkpoint(CONFIG, epoch, model, optimizer, scheduler)

        # save a model every 10 epochs
        # save base models, NOT DataParallel-wrapped models
        if epoch % 10 == 0 and epoch != 0:
            torch.save(model.module.state_dict(),
                       os.path.join(CONFIG.result_path, 'epoch_{}_model.prm'.format(epoch)))

        # tensorboardx
        if writer is not None:
            writer.add_scalar("loss_train", losses_train[-1], epoch)
            writer.add_scalar('loss_val', losses_val[-1], epoch)
            writer.add_scalars("iou", {
                'top1_accuracy': top1_accuracy[-1],
                'top5_accuracy': top5_accuracy[-1]
            }, epoch)

        # write logs to dataframe and csv file; read the lr from the optimizer
        # so this also works when no scheduler is set (Adam/AdaBound)
        tmp = pd.Series([
            epoch,
            optimizer.param_groups[0]['lr'],
            losses_train[-1],
            losses_val[-1],
            top1_accuracy[-1],
            top5_accuracy[-1],
        ], index=log.columns)
        log = log.append(tmp, ignore_index=True)
        log.to_csv(os.path.join(CONFIG.result_path, 'log.csv'), index=False)

        print('epoch: {}\tloss train: {:.5f}\tloss val: {:.5f}\ttop1_accuracy: {:.5f}\ttop5_accuracy: {:.5f}'
              .format(epoch, losses_train[-1], losses_val[-1],
                      top1_accuracy[-1], top5_accuracy[-1]))

    # save base models, NOT DataParallel-wrapped models
    torch.save(model.module.state_dict(),
               os.path.join(CONFIG.result_path, 'final_model.prm'))
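# main() always saves model.module.state_dict() because the model is wrapped
# in DataParallel; saving the bare module keeps checkpoint keys free of the
# 'module.' prefix. A minimal sketch of a save helper that handles both the
# wrapped and unwrapped case (the helper name is illustrative):
import torch
import torch.nn as nn

def save_unwrapped(model, path):
    # unwrap (Distributed)DataParallel before saving so the state dict
    # loads directly into a bare model
    base = model.module if isinstance(
        model, (nn.DataParallel, nn.parallel.DistributedDataParallel)) else model
    torch.save(base.state_dict(), path)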
def train(cfg):
    # logger
    logger = logging.getLogger(name="merlin.baseline.train")
    logger.info("training...")

    # transform
    transform_train_list = [
        # transforms.RandomResizedCrop(size=128, scale=(0.75, 1.0), ratio=(0.75, 1.3333), interpolation=3),  # Image.BICUBIC
        transforms.Resize(size=cfg.INPUT.SIZE_TRAIN, interpolation=1),
        transforms.Pad(32),
        transforms.RandomCrop(cfg.INPUT.SIZE_TRAIN),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
    transform_val_list = [
        transforms.Resize(size=cfg.INPUT.SIZE_TEST, interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]

    # prepare dataset
    train_dataset = MyDataset(root=cfg.DATA.ROOT,
                              transform=transforms.Compose(transform_train_list),
                              type='train')
    val_dataset = MyDataset(root=cfg.DATA.ROOT,
                            transform=transforms.Compose(transform_val_list),
                            type='val')
    train_loader = DataLoader(train_dataset, batch_size=cfg.SOLVER.BATCH_SIZE,
                              shuffle=True, num_workers=8, pin_memory=False)
    val_loader = DataLoader(val_dataset, batch_size=cfg.SOLVER.BATCH_SIZE,
                            shuffle=True, num_workers=8, pin_memory=False)
    num_classes = cfg.MODEL.HEADS.NUM_CLASSES

    # prepare model
    model = build_model(cfg, num_classes)
    model = model.cuda()
    model = nn.DataParallel(model)

    # prepare solver
    optimizer = make_optimizer(cfg, model)
    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS, cfg.SOLVER.GAMMA,
                                  cfg.SOLVER.WARMUP_FACTOR, cfg.SOLVER.WARMUP_ITERS,
                                  cfg.SOLVER.WARMUP_METHOD)
    start_epoch = 0

    # train and val
    since = time.time()
    for epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCHS):
        model.train(True)
        logger.info("Epoch {}/{}".format(epoch, cfg.SOLVER.MAX_EPOCHS - 1))
        logger.info('-' * 10)

        running_loss = 0.0
        running_acc = 0
        it = 0

        # iterate over data
        for data in train_loader:
            it += 1
            # get the inputs
            inputs, labels = data
            now_batch_size, c, h, w = inputs.shape
            if now_batch_size < cfg.SOLVER.BATCH_SIZE:
                # skip the last (incomplete) batch
                continue
            inputs = inputs.cuda()
            labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            out = model(inputs)
            loss_dict = get_loss(cfg, outs=out, label=labels)
            loss = sum(loss_dict.values())
            loss.backward()
            optimizer.step()
            scheduler.step()  # warmup scheduler is stepped per iteration

            # statistics
            with torch.no_grad():
                _, preds = torch.max(out['pred_class_logits'], 1)
                running_loss += loss.item()  # .item() so the graph is not kept alive
                running_acc += torch.sum(preds == labels.data).float().item() / cfg.SOLVER.BATCH_SIZE

            if it % 50 == 0:
                logger.info('epoch {}, iter {}, loss: {:.3f}, acc: {:.3f}, lr: {:.5f}'.format(
                    epoch, it, running_loss / it, running_acc / it,
                    optimizer.param_groups[0]['lr']))

        epoch_loss = running_loss / it
        epoch_acc = running_acc / it
        logger.info('epoch {} loss: {:.4f} Acc: {:.4f}'.format(epoch, epoch_loss, epoch_acc))

        # save checkpoint
        if epoch % cfg.SOLVER.CHECKPOINT_PERIOD == 0:
            checkpoint = {
                'epoch': epoch + 1,
                # unwrap DataParallel when more than one device id is configured
                'model': model.module.state_dict()
                         if (len(cfg.MODEL.DEVICE_ID) - 2) > 1 else model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            save_checkpoint(checkpoint, epoch, cfg)

        # evaluate
        if epoch % cfg.SOLVER.EVAL_PERIOD == 0:
            logger.info('evaluate...')
            model.train(False)
            total = 0.0
            correct = 0.0
            for data in val_loader:
                inputs, labels = data
                inputs = inputs.cuda()
                labels = labels.cuda()
                with torch.no_grad():
                    out = model(inputs)
                    _, preds = torch.max(out['pred_class_logits'], 1)
                    c = (preds == labels).squeeze()
                    total += c.size(0)
                    correct += c.float().sum().item()
            acc = correct / total
            logger.info('eval acc:{:.4f}'.format(acc))

    time_elapsed = time.time() - since
    logger.info('Training complete in {:.0f}m {:.0f}s\n'.format(
        time_elapsed // 60, time_elapsed % 60))
    return model
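# train() accumulates running_loss with loss.item() rather than the loss
# tensor itself; summing the tensors would retain every iteration's autograd
# graph and steadily grow GPU memory. A tiny self-contained illustration:
import torch

w = torch.randn(4, requires_grad=True)
running_loss = 0.0
for _ in range(3):
    loss = (w * w).sum()         # scalar tensor attached to a fresh graph
    loss.backward()
    w.grad.zero_()
    running_loss += loss.item()  # .item() detaches to a plain Python float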
def train(cfg):
    """
    Train a video model for many epochs on the train set and evaluate it on the val set.
    Args:
        cfg (CfgNode): configs. Details can be found in slowfast/config/defaults.py
    """
    # Set up logging format.
    logging.setup_logging(logger, cfg)

    # Print config.
    logger.info("Train with config:")
    logger.info(pprint.pformat(cfg))

    # Build the video model and print model statistics.
    model = model_builder.build_model(cfg)
    if du.is_master_proc():
        misc.log_model_info(model)

    # Construct the optimizer.
    optimizer = optim.construct_optimizer(model, cfg)

    # Record global step.
    gs = 0

    # Load a checkpoint to resume training if applicable.
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint(cfg.OUTPUT_DIR):
        logger.info("Load from last checkpoint.")
        last_checkpoint = cu.get_last_checkpoint(cfg.OUTPUT_DIR)
        gs, checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                                  cfg.NUM_GPUS > 1, optimizer)
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.CHECKPOINT_FILE_PATH != "":
        logger.info("Load from given checkpoint file.")
        if cfg.TRAIN.LOAD_PART_OF_CHECKPOINT:
            gs, checkpoint_epoch = cu.load_part_of_checkpoint(
                cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1, optimizer=None)
        else:
            gs, checkpoint_epoch = cu.load_checkpoint(
                cfg.TRAIN.CHECKPOINT_FILE_PATH, model, cfg.NUM_GPUS > 1,
                optimizer=None, inflation=False, convert_from_caffe2=False)
        start_epoch = checkpoint_epoch + 1
    else:
        gs = 0
        start_epoch = 0

    # Create the video train and val loaders.
    train_loader = loader.construct_loader(cfg, "train")
    val_loader = loader.construct_loader(cfg, "val")

    # Create meters.
    train_meter = TrainMeter(len(train_loader), cfg)
    val_meter = ValMeter(cfg)

    # Perform the training loop.
    logger.info("Start epoch: {} gs {}".format(start_epoch + 1, gs + 1))
    for cur_epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCH):
        # Shuffle the dataset.
        loader.shuffle_dataset(train_loader, cur_epoch)

        # Evaluate the model on the validation set.
        if misc.is_eval_epoch(cfg, cur_epoch):
            if cfg.TRAIN.USE_CENTER_VALIDATION:
                validation_epoch_center(val_loader, model, val_meter, cur_epoch, cfg)
            else:
                validation_epoch(val_loader, model, val_meter, cur_epoch, cfg)

        # Train for one epoch.
        gs = train_epoch(train_loader, model, optimizer, train_meter, cur_epoch, gs, cfg)

        # Compute precise BN stats (disabled).
        # if cfg.BN.USE_PRECISE_STATS and len(get_bn_modules(model)) > 0:
        #     calculate_and_update_precise_bn(
        #         train_loader, model, cfg.BN.NUM_BATCHES_PRECISE)

        # Save a checkpoint.
        if cu.is_checkpoint_epoch(cur_epoch, cfg.TRAIN.CHECKPOINT_PERIOD):
            cu.save_checkpoint(cfg.OUTPUT_DIR, model, optimizer, cur_epoch, gs, cfg)
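# train() resumes from the epoch after the checkpointed one
# (start_epoch = checkpoint_epoch + 1) and restores a global step counter gs.
# A minimal sketch of that save/resume contract using plain torch.save/load
# (key names and helpers are illustrative, not slowfast's cu module):
import torch

def save_ckpt(path, model, optimizer, epoch, gs):
    torch.save({'epoch': epoch, 'gs': gs,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict()}, path)

def load_ckpt(path, model, optimizer=None):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    if optimizer is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
    return ckpt['gs'], ckpt['epoch']

# resuming: gs, ckpt_epoch = load_ckpt(path, model, optimizer)
#           start_epoch = ckpt_epoch + 1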