def train(net, data_file, epochs, lr):
    """Train ``net`` on the samples listed in ``data_file``.

    Args:
        net: torch module to train; updated in place and checkpointed via
            ``save_network`` every 2 epochs.
        data_file: passed straight to ``MyDataset`` — presumably an
            annotation/list file; TODO confirm against MyDataset.
        epochs: number of full passes over the dataset.
        lr: initial SGD learning rate (decayed x0.1 every 8 epochs).

    Side effects: writes TensorBoard scalars to the default log dir and
    prints progress every 10 batches.
    """
    # Dataset-specific channel means/stds for input normalisation.
    transforms = T.Compose([
        T.Resize(size=(256, 256)),
        T.RandomHorizontalFlip(p=0.5),
        T.ToTensor(),
        T.Normalize([0.56687369, 0.44000871, 0.39886727],
                    [0.2415682, 0.2131414, 0.19494878])
    ])
    dataset = MyDataset(data_file, transforms)
    model = net
    data_loader = DataLoader(dataset, batch_size=24, shuffle=True)
    if torch.cuda.is_available():
        model.cuda()
    # print(model)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                weight_decay=5e-4, nesterov=True)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=8,
                                                   gamma=0.1)
    loss_func = nn.CrossEntropyLoss()
    model.train(True)
    # Global step counter across all epochs (was misleadingly named
    # ``num_epochs``): x-axis for the TensorBoard loss curve.
    global_step = 0
    writer = tb.SummaryWriter()
    for epoch in range(epochs):
        for index, data in enumerate(data_loader):
            im, label = data
            # print(label)
            label = label.long()
            if torch.cuda.is_available():
                im = im.cuda()
                label = label.cuda()
            optimizer.zero_grad()
            out = model(im)
            loss = loss_func(out, label)
            loss.backward()
            optimizer.step()
            global_step += 1
            # Log the Python scalar, not the tensor: passing the tensor
            # retains the autograd graph and logs a device-bound value.
            writer.add_scalar('loss', loss.item(), global_step)
            if index % 10 == 0 or index == len(data_loader) - 1:
                # get_last_lr() replaces the deprecated get_lr(), which
                # returns distorted values on newer torch versions.
                print(
                    '{} / {} learning rate: {} : {} / {} -----------> loss: {}'
                    .format(epoch + 1, epochs,
                            lr_scheduler.get_last_lr()[0], index + 1,
                            len(data_loader), loss.item()))
        if (epoch + 1) % 2 == 0:
            save_network(net, epoch + 1)
        lr_scheduler.step()
    writer.close()
def train_model(model, model_test, criterion, optimizer, scheduler, num_epochs=25):
    """Train the multi-view (satellite/street/drone/google) model.

    Relies on module-level state: ``opt``, ``dataloaders``, ``dataset_sizes``,
    ``start_epoch``, ``use_gpu``, ``fp16``, ``version``, ``y_loss``/``y_err``.
    ``model_test`` only receives exponentially-averaged weights when
    ``opt.moving_avg < 1.0``.  Checkpoints every 20th epoch and returns
    ``model``.
    """
    since = time.time()
    #best_model_wts = model.state_dict()
    #best_acc = 0.0
    warm_up = 0.1  # We start from the 0.1*lrRate
    # Linear warm-up budget: opt.warm_epoch epochs worth of iterations.
    warm_iteration = round(dataset_sizes['satellite'] / opt.batchsize) * opt.warm_epoch  # first 5 epoch
    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        # (only 'train' is active here; the 'val' branches are dormant).
        for phase in ['train']:
            if phase == 'train':
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0.0
            running_corrects2 = 0.0
            running_corrects3 = 0.0
            # Iterate over data: the four view loaders advance in lockstep;
            # zip stops at the shortest one.
            for data, data2, data3, data4 in zip(dataloaders['satellite'],
                                                 dataloaders['street'],
                                                 dataloaders['drone'],
                                                 dataloaders['google']):
                # get the inputs
                inputs, labels = data
                inputs2, labels2 = data2
                inputs3, labels3 = data3
                inputs4, labels4 = data4
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:  # skip the last batch
                    continue
                if use_gpu:
                    inputs = Variable(inputs.cuda().detach())
                    inputs2 = Variable(inputs2.cuda().detach())
                    inputs3 = Variable(inputs3.cuda().detach())
                    labels = Variable(labels.cuda().detach())
                    labels2 = Variable(labels2.cuda().detach())
                    labels3 = Variable(labels3.cuda().detach())
                    if opt.extra_Google:
                        inputs4 = Variable(inputs4.cuda().detach())
                        labels4 = Variable(labels4.cuda().detach())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                if phase == 'val':
                    with torch.no_grad():
                        outputs, outputs2 = model(inputs, inputs2)
                else:
                    if opt.views == 2:
                        outputs, outputs2 = model(inputs, inputs2)
                    elif opt.views == 3:
                        if opt.extra_Google:
                            outputs, outputs2, outputs3, outputs4 = model(
                                inputs, inputs2, inputs3, inputs4)
                        else:
                            outputs, outputs2, outputs3 = model(
                                inputs, inputs2, inputs3)
                _, preds = torch.max(outputs.data, 1)
                _, preds2 = torch.max(outputs2.data, 1)
                # Sum one cross-view classification loss per active view.
                if opt.views == 2:
                    loss = criterion(outputs, labels) + criterion(
                        outputs2, labels2)
                elif opt.views == 3:
                    _, preds3 = torch.max(outputs3.data, 1)
                    loss = criterion(outputs, labels) + criterion(
                        outputs2, labels2) + criterion(outputs3, labels3)
                    if opt.extra_Google:
                        loss += criterion(outputs4, labels4)
                # backward + optimize only if in training phase
                if epoch < opt.warm_epoch and phase == 'train':
                    warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                    loss *= warm_up
                if phase == 'train':
                    if fp16:  # we use optimier to backward loss
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    optimizer.step()
                    ##########
                    if opt.moving_avg < 1.0:
                        update_average(model_test, model, opt.moving_avg)
                # statistics
                if int(version[0]) > 0 or int(
                        version[2]
                ) > 3:  # for the new version like 0.4.0, 0.5.0 and 1.0.0
                    running_loss += loss.item() * now_batch_size
                else:  # for the old version like 0.3.0 and 0.3.1
                    running_loss += loss.data[0] * now_batch_size
                running_corrects += float(torch.sum(preds == labels.data))
                running_corrects2 += float(torch.sum(preds2 == labels2.data))
                if opt.views == 3:
                    running_corrects3 += float(
                        torch.sum(preds3 == labels3.data))
            # NOTE(review): all three accuracies are normalised by the
            # satellite split size — assumes the loaders are equally long;
            # confirm against the dataloader construction.
            epoch_loss = running_loss / dataset_sizes['satellite']
            epoch_acc = running_corrects / dataset_sizes['satellite']
            epoch_acc2 = running_corrects2 / dataset_sizes['satellite']
            if opt.views == 2:
                print(
                    '{} Loss: {:.4f} Satellite_Acc: {:.4f} Street_Acc: {:.4f}'
                    .format(phase, epoch_loss, epoch_acc, epoch_acc2))
            elif opt.views == 3:
                epoch_acc3 = running_corrects3 / dataset_sizes['satellite']
                print(
                    '{} Loss: {:.4f} Satellite_Acc: {:.4f} Street_Acc: {:.4f} Drone_Acc: {:.4f}'
                    .format(phase, epoch_loss, epoch_acc, epoch_acc2,
                            epoch_acc3))
            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)
            # deep copy the model
            if phase == 'train':
                scheduler.step()
            last_model_wts = model.state_dict()
            if epoch % 20 == 19:
                save_network(model, opt.name, epoch)
            #draw_curve(epoch)
        # Per-epoch elapsed-time report.
        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()
    # Final elapsed-time report after all epochs.
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    #print('Best val Acc: {:4f}'.format(best_acc))
    #save_network(model_test, opt.name+'adapt', epoch)
    return model
def train(model, criterion, optimizer, scheduler, dataloader, num_epochs, device):
    """Train ``model`` for ``num_epochs`` epochs, evaluating every 20th.

    Args:
        model: network; may return either a single logits tensor or a list
            of per-stripe logits (PCB-style), each scored with ``criterion``.
        criterion: classification loss applied to each logits tensor.
        optimizer: stepped once per batch.
        scheduler: unused — the LR is driven by ``adjust_lr(epoch)``; kept
            for interface compatibility with existing callers.
        dataloader: training loader; ``len(dataloader.dataset.imgs)`` is
            used to normalise the epoch loss.
        num_epochs: total number of epochs.
        device: torch device the inputs/labels are moved to.

    Raises:
        TypeError: if the model output is neither a list nor a Tensor.

    Side effects: logs via ``utils.Logger(save_dir_path)``, runs ``test``
    every 20 epochs and at the end, and saves checkpoints with
    ``utils.save_network`` (final weights under the tag ``'final'``).
    """
    start_time = time.time()
    # Logger instance
    logger = utils.Logger(save_dir_path)
    logger.info('-' * 10)
    logger.info(vars(args))
    for epoch in range(num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, num_epochs))
        model.train()
        adjust_lr(epoch)
        running_loss = 0.0
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            # Sum up the per-stripe softmax losses when the model returns
            # one logits tensor per stripe.
            loss = 0
            if isinstance(outputs, list):
                for logits in outputs:
                    loss += criterion(logits, labels)
            elif isinstance(outputs, torch.Tensor):
                loss = criterion(outputs, labels)
            else:
                # Was a bare ``Exception('outputs type is error !')`` — a
                # TypeError is precise and still caught by callers that
                # catch Exception.
                raise TypeError(
                    'model outputs must be a list of logits or a Tensor, '
                    'got {}'.format(type(outputs).__name__))
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        epoch_loss = running_loss / len(dataloader.dataset.imgs)
        logger.info('Training Loss: {:.4f}'.format(epoch_loss))
        # Save result to logger
        logger.x_epoch_loss.append(epoch + 1)
        logger.y_train_loss.append(epoch_loss)
        if (epoch + 1) % 20 == 0 or epoch + 1 == num_epochs:
            # Testing / Validating
            torch.cuda.empty_cache()
            CMC, mAP = test(model, args.dataset, args.dataset_path, 512)
            logger.info('Testing: top1:%.2f top5:%.2f top10:%.2f mAP:%.2f' %
                        (CMC[0], CMC[4], CMC[9], mAP))
            logger.x_epoch_test.append(epoch + 1)
            logger.y_test['top1'].append(CMC[0])
            logger.y_test['mAP'].append(mAP)
            # Intermediate checkpoint; the final one is saved below.
            if epoch + 1 != num_epochs:
                utils.save_network(model, save_dir_path, str(epoch + 1))
        logger.info('-' * 10)
    # Save the loss curve
    logger.save_curve()
    time_elapsed = time.time() - start_time
    logger.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # Save final model weights
    utils.save_network(model, save_dir_path, 'final')
def run(self):
    """Main training loop with hard mining (TensorFlow, Python 2 — uses
    ``xrange``).

    Restores each sub-network ("kp", "ori", "desc", "joint") from disk,
    then alternates mining forward passes, a backward pass on the hardest
    batch, and periodic validation; checkpoints via ``save_network``
    whenever the mean validation loss improves.
    """
    # For each module, check we have pre-trained modules and load them
    print("-------------------------------------------------")
    print(" Looking for previous results ")
    print("-------------------------------------------------")
    for _key in ["kp", "ori", "desc", "joint"]:
        restore_network(self, _key)
    print("-------------------------------------------------")
    print(" Training ")
    print("-------------------------------------------------")
    subtask = self.config.subtask
    batch_size = self.config.batch_size
    # Resume stepping from the best step recorded for this subtask.
    for step in trange(int(self.best_step[subtask]),
                       int(self.config.max_step),
                       desc="Subtask = {}".format(subtask),
                       ncols=self.config.tqdm_width):
        # ----------------------------------------
        # Forward pass: Note that we only compute the loss in the forward
        # pass. We don't do summary writing or saving
        fw_data = []
        fw_loss = []
        # hardmine_scheduler decides how many candidate batches (and of
        # what size) to mine at this step.
        batches = self.hardmine_scheduler(self.config, step)
        for num_cur in batches:
            cur_data = self.dataset.next_batch(task="train",
                                               subtask=subtask,
                                               batch_size=num_cur,
                                               aug_rot=self.use_aug_rot)
            cur_loss = self.network.forward(subtask, cur_data)
            # Sanity check
            if min(cur_loss) < 0:
                raise RuntimeError('Negative loss while mining?')
            # Data may contain empty (zero-value) samples: set loss to zero
            # so the padded tail can never be selected as "hard".
            if num_cur < batch_size:
                cur_loss[num_cur - batch_size:] = 0
            fw_data.append(cur_data)
            fw_loss.append(cur_loss)
        # Fill a single batch with hardest
        if len(batches) > 1:
            cur_data = get_hard_batch(fw_loss, fw_data)
        # ----------------------------------------
        # Backward pass: Note that the backward pass returns summary only
        # when it is asked. Also, we manually keep note of step here, and
        # not use the tensorflow version. This is to simplify the migration
        # to another framework, if needed.
        do_validation = step % self.config.validation_interval == 0
        cur_summary = self.network.backward(subtask, cur_data,
                                            provide_summary=do_validation)
        if do_validation and cur_summary is not None:
            # Make sure we have the summary data
            assert cur_summary is not None
            # Write training summary
            self.summary_writer[subtask].add_summary(cur_summary, step)
            # Do multiple rounds of validation
            cur_val_loss = np.zeros(self.config.validation_rounds)
            for _val_round in xrange(self.config.validation_rounds):
                # Fetch validation data
                cur_data = self.dataset.next_batch(
                    task="valid", subtask=subtask, batch_size=batch_size,
                    aug_rot=self.use_aug_rot)
                # Perform validation of the model using validation data
                cur_val_loss[_val_round] = self.network.validate(
                    subtask, cur_data)
            cur_val_loss = np.mean(cur_val_loss)
            # Inject validation result to summary
            summaries = [
                tf.Summary.Value(
                    tag="validation/err-{}".format(subtask),
                    simple_value=cur_val_loss,
                )
            ]
            self.summary_writer[subtask].add_summary(
                tf.Summary(value=summaries), step)
            # Flush the writer
            self.summary_writer[subtask].flush()
            # TODO: Repeat without augmentation if necessary
            # ...
            # Checkpoint only when validation improves.
            if cur_val_loss < self.best_val_loss[subtask]:
                self.best_val_loss[subtask] = cur_val_loss
                self.best_step[subtask] = step
                save_network(self, subtask)
def main():
    """Adversarial-training experiment on MNIST (Theano/Lasagne, Python 2).

    Builds the symbolic train/val graphs, adds a FGSM-style adversarial
    training term, trains with Adam + LR decay, periodically measures
    DeepFool robustness on a balanced validation subset, and finally
    evaluates on the test set.  All configuration comes from ``FLAGS``.
    """
    setup_train_experiment(logger, FLAGS, "%(model)s_at")
    logger.info("Loading data...")
    data = mnist_load(FLAGS.train_size, FLAGS.seed)
    X_train, y_train = data.X_train, data.y_train
    X_val, y_val = data.X_val, data.y_val
    X_test, y_test = data.X_test, data.y_test
    # NCHW MNIST shape with a free batch dimension.
    img_shape = [None, 1, 28, 28]
    train_images = T.tensor4('train_images')
    train_labels = T.lvector('train_labels')
    # NOTE(review): the symbolic name 'valid_labels' is reused for the
    # *images* variable — debug-name slip only, does not affect results.
    val_images = T.tensor4('valid_labels')
    val_labels = T.lvector('valid_labels')
    layer_dims = [int(dim) for dim in FLAGS.layer_dims.split("-")]
    num_classes = layer_dims[-1]
    net = create_network(FLAGS.model, img_shape, layer_dims=layer_dims)
    model = with_end_points(net)
    train_outputs = model(train_images)
    val_outputs = model(val_images, deterministic=True)
    # losses: cross-entropy plus weighted adversarial-training term.
    train_ce = categorical_crossentropy(train_outputs['prob'],
                                        train_labels).mean()
    train_at = adversarial_training(lambda x: model(x)['prob'],
                                    train_images,
                                    train_labels,
                                    epsilon=FLAGS.epsilon).mean()
    train_loss = train_ce + FLAGS.lmbd * train_at
    val_ce = categorical_crossentropy(val_outputs['prob'],
                                      val_labels).mean()
    val_deepfool_images = deepfool(
        lambda x: model(x, deterministic=True)['logits'],
        val_images,
        val_labels,
        num_classes,
        max_iter=FLAGS.deepfool_iter,
        clip_dist=FLAGS.deepfool_clip,
        over_shoot=FLAGS.deepfool_overshoot)
    # metrics
    train_acc = categorical_accuracy(train_outputs['logits'],
                                     train_labels).mean()
    train_err = 1.0 - train_acc
    val_acc = categorical_accuracy(val_outputs['logits'], val_labels).mean()
    val_err = 1.0 - val_acc
    # deepfool robustness: L2 perturbation norms over all non-batch axes.
    reduc_ind = range(1, train_images.ndim)
    l2_deepfool = (val_deepfool_images - val_images).norm(2, axis=reduc_ind)
    l2_deepfool_norm = l2_deepfool / val_images.norm(2, axis=reduc_ind)
    train_metrics = OrderedDict([('loss', train_loss), ('nll', train_ce),
                                 ('at', train_at), ('err', train_err)])
    val_metrics = OrderedDict([('nll', val_ce), ('err', val_err)])
    summary_metrics = OrderedDict([('l2', l2_deepfool.mean()),
                                   ('l2_norm', l2_deepfool_norm.mean())])
    lr = theano.shared(floatX(FLAGS.initial_learning_rate), 'learning_rate')
    train_params = get_all_params(net, trainable=True)
    train_updates = adam(train_loss, train_params, lr)
    logger.info("Compiling theano functions...")
    # NOTE(review): ``metrics.values()`` and ``values() + [...]`` rely on
    # Python 2 returning lists — would need list() wrapping under Python 3.
    train_fn = theano.function([train_images, train_labels],
                               outputs=train_metrics.values(),
                               updates=train_updates)
    val_fn = theano.function([val_images, val_labels],
                             outputs=val_metrics.values())
    summary_fn = theano.function([val_images, val_labels],
                                 outputs=summary_metrics.values() +
                                 [val_deepfool_images])
    logger.info("Starting training...")
    try:
        # Fixed balanced subset used for every adversarial summary.
        samples_per_class = FLAGS.summary_samples_per_class
        summary_images, summary_labels = select_balanced_subset(
            X_val, y_val, num_classes, samples_per_class)
        save_path = os.path.join(FLAGS.samples_dir, 'orig.png')
        save_images(summary_images, save_path)
        epoch = 0
        batch_index = 0
        while epoch < FLAGS.num_epochs:
            epoch += 1
            start_time = time.time()
            train_iterator = batch_iterator(X_train,
                                            y_train,
                                            FLAGS.batch_size,
                                            shuffle=True)
            epoch_outputs = np.zeros(len(train_fn.outputs))
            # batch_index carries across epochs (enumerate resumes from it).
            for batch_index, (images, labels) in enumerate(
                    train_iterator, batch_index + 1):
                batch_outputs = train_fn(images, labels)
                epoch_outputs += batch_outputs
            epoch_outputs /= X_train.shape[0] // FLAGS.batch_size
            logger.info(
                build_result_str(
                    "Train epoch [{}, {:.2f}s]:".format(
                        epoch, time.time() - start_time),
                    train_metrics.keys(), epoch_outputs))
            # update learning rate
            if epoch > FLAGS.start_learning_rate_decay:
                new_lr_value = lr.get_value(
                ) * FLAGS.learning_rate_decay_factor
                lr.set_value(floatX(new_lr_value))
                logger.debug("learning rate was changed to {:.10f}".format(
                    new_lr_value))
            # validation
            start_time = time.time()
            val_iterator = batch_iterator(X_val,
                                          y_val,
                                          FLAGS.test_batch_size,
                                          shuffle=False)
            val_epoch_outputs = np.zeros(len(val_fn.outputs))
            for images, labels in val_iterator:
                val_epoch_outputs += val_fn(images, labels)
            val_epoch_outputs /= X_val.shape[0] // FLAGS.test_batch_size
            logger.info(
                build_result_str(
                    "Test epoch [{}, {:.2f}s]:".format(
                        epoch, time.time() - start_time),
                    val_metrics.keys(), val_epoch_outputs))
            # Periodic adversarial statistics + perturbed-sample dump.
            if epoch % FLAGS.summary_frequency == 0:
                summary = summary_fn(summary_images, summary_labels)
                logger.info(
                    build_result_str(
                        "Epoch [{}] adversarial statistics:".format(epoch),
                        summary_metrics.keys(), summary[:-1]))
                save_path = os.path.join(FLAGS.samples_dir,
                                         'epoch-%d.png' % epoch)
                df_images = summary[-1]
                save_images(df_images, save_path)
            if epoch % FLAGS.checkpoint_frequency == 0:
                save_network(net, epoch=epoch)
    except KeyboardInterrupt:
        logger.debug("Keyboard interrupt. Stopping training...")
    finally:
        # Always persist the latest weights, even on interrupt.
        save_network(net)
    # evaluate final model on test set
    test_iterator = batch_iterator(X_test,
                                   y_test,
                                   FLAGS.test_batch_size,
                                   shuffle=False)
    test_results = np.zeros(len(val_fn.outputs))
    for images, labels in test_iterator:
        test_results += val_fn(images, labels)
    test_results /= X_test.shape[0] // FLAGS.test_batch_size
    logger.info(
        build_result_str("Final test results:", val_metrics.keys(),
                         test_results))
def main():
    """CycleGAN training entry point.

    Trains two generators (A->B and B->A) and two discriminators with
    adversarial, cycle-consistency and identity losses, logging all eight
    per-epoch losses to TensorBoard and checkpointing every
    ``args.save_freq`` epochs.
    """
    args = args_initialize()
    save_freq = args.save_freq
    epochs = args.num_epoch
    cuda = args.cuda
    train_dataset = UnalignedDataset(is_train=True)
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=0
    )
    net_G_A = ResNetGenerator(input_nc=3, output_nc=3)
    net_G_B = ResNetGenerator(input_nc=3, output_nc=3)
    net_D_A = Discriminator()
    net_D_B = Discriminator()
    if args.cuda:
        net_G_A = net_G_A.cuda()
        net_G_B = net_G_B.cuda()
        net_D_A = net_D_A.cuda()
        net_D_B = net_D_B.cuda()
    # History buffers of generated images used when updating the
    # discriminators.
    fake_A_pool = ImagePool(50)
    fake_B_pool = ImagePool(50)
    criterionGAN = GANLoss(cuda=cuda)
    criterionCycle = torch.nn.L1Loss()
    criterionIdt = torch.nn.L1Loss()
    # Both generators share a single optimizer.
    optimizer_G = torch.optim.Adam(
        itertools.chain(net_G_A.parameters(), net_G_B.parameters()),
        lr=args.lr, betas=(args.beta1, 0.999)
    )
    optimizer_D_A = torch.optim.Adam(net_D_A.parameters(), lr=args.lr,
                                     betas=(args.beta1, 0.999))
    optimizer_D_B = torch.optim.Adam(net_D_B.parameters(), lr=args.lr,
                                     betas=(args.beta1, 0.999))
    log_dir = './logs'
    checkpoints_dir = './checkpoints'
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(checkpoints_dir, exist_ok=True)
    writer = SummaryWriter(log_dir)
    for epoch in range(epochs):
        # Order: G_A, D_A, G_B, D_B, cycle_A, cycle_B, idt_A, idt_B.
        running_loss = np.zeros((8))
        for batch_idx, data in enumerate(train_loader):
            input_A = data['A']
            input_B = data['B']
            if cuda:
                input_A = input_A.cuda()
                input_B = input_B.cuda()
            real_A = Variable(input_A)
            real_B = Variable(input_B)
            """ Backward net_G """
            optimizer_G.zero_grad()
            lambda_idt = 0.5
            lambda_A = 10.0
            lambda_B = 10.0
            # Identity loss: feed each generator an image already in its
            # target domain — ideally it should return it unchanged.
            idt_B = net_G_A(real_B)
            loss_idt_A = criterionIdt(idt_B, real_B) * lambda_B * lambda_idt
            idt_A = net_G_B(real_A)
            loss_idt_B = criterionIdt(idt_A, real_A) * lambda_A * lambda_idt
            # GAN loss = D_A(G_A(A)): G_A wants its fakes judged real (True).
            fake_B = net_G_A(real_A)
            pred_fake = net_D_A(fake_B)
            loss_G_A = criterionGAN(pred_fake, True)
            fake_A = net_G_B(real_B)
            pred_fake = net_D_B(fake_A)
            loss_G_B = criterionGAN(pred_fake, True)
            # Cycle-consistency: A -> B -> A and B -> A -> B reconstructions.
            rec_A = net_G_B(fake_B)
            loss_cycle_A = criterionCycle(rec_A, real_A) * lambda_A
            rec_B = net_G_A(fake_A)
            loss_cycle_B = criterionCycle(rec_B, real_B) * lambda_B
            loss_G = loss_G_A + loss_G_B + loss_cycle_A + loss_cycle_B + loss_idt_A + loss_idt_B
            loss_G.backward()
            optimizer_G.step()
            """ update D_A """
            optimizer_D_A.zero_grad()
            # Sample from the image-history pool to stabilise D training.
            fake_B = fake_B_pool.query(fake_B.data)
            pred_real = net_D_A(real_B)
            loss_D_real = criterionGAN(pred_real, True)
            pred_fake = net_D_A(fake_B.detach())
            loss_D_fake = criterionGAN(pred_fake, False)
            loss_D_A = (loss_D_real + loss_D_fake) * 0.5
            loss_D_A.backward()
            optimizer_D_A.step()
            """ update D_B """
            optimizer_D_B.zero_grad()
            fake_A = fake_A_pool.query(fake_A.data)
            pred_real = net_D_B(real_A)
            loss_D_real = criterionGAN(pred_real, True)
            pred_fake = net_D_B(fake_A.detach())
            loss_D_fake = criterionGAN(pred_fake, False)
            loss_D_B = (loss_D_real + loss_D_fake) * 0.5
            loss_D_B.backward()
            optimizer_D_B.step()
            ret_loss = np.array([
                loss_G_A.data.detach().cpu().numpy(),
                loss_D_A.data.detach().cpu().numpy(),
                loss_G_B.data.detach().cpu().numpy(),
                loss_D_B.data.detach().cpu().numpy(),
                loss_cycle_A.data.detach().cpu().numpy(),
                loss_cycle_B.data.detach().cpu().numpy(),
                loss_idt_A.data.detach().cpu().numpy(),
                loss_idt_B.data.detach().cpu().numpy()
            ])
            running_loss += ret_loss
        """ Save checkpoints """
        if (epoch + 1) % save_freq == 0:
            save_network(net_G_A, 'G_A', str(epoch + 1))
            save_network(net_D_A, 'D_A', str(epoch + 1))
            save_network(net_G_B, 'G_B', str(epoch + 1))
            save_network(net_D_B, 'D_B', str(epoch + 1))
        running_loss /= len(train_loader)
        losses = running_loss
        print('epoch %d, losses: %s' % (epoch + 1, running_loss))
        writer.add_scalar('loss_G_A', losses[0], epoch)
        writer.add_scalar('loss_D_A', losses[1], epoch)
        writer.add_scalar('loss_G_B', losses[2], epoch)
        writer.add_scalar('loss_D_B', losses[3], epoch)
        writer.add_scalar('loss_cycle_A', losses[4], epoch)
        writer.add_scalar('loss_cycle_B', losses[5], epoch)
        writer.add_scalar('loss_idt_A', losses[6], epoch)
        writer.add_scalar('loss_idt_B', losses[7], epoch)
    # Fix: the writer was never closed, so the last event batch could be
    # lost; close() flushes and releases the event file.
    writer.close()
def train_model(model, model_test, criterion, optimizer, scheduler, num_epochs=25):
    """Multi-view geo-localisation training loop with optional metric losses.

    Same skeleton as the plain ``train_model`` above, extended with optional
    ArcFace / CosFace / Circle / Triplet / Lifted / Contrastive / Sphere
    losses on L2-normalised embeddings.  Relies on module globals: ``opt``,
    ``dataloaders``, ``dataset_sizes``, ``start_epoch``, ``use_gpu``,
    ``fp16``, ``version``, ``y_loss``, ``y_err``.  Returns ``model``.
    """
    since = time.time()
    # best_model_wts = model.state_dict()
    # best_acc = 0.0
    warm_up = 0.1  # We start from the 0.1*lrRate
    warm_iteration = round(dataset_sizes['satellite'] / opt.batchsize) * opt.warm_epoch  # first 5 epoch
    # Instantiate only the metric-learning criteria that were requested.
    if opt.arcface:
        criterion_arcface = losses.ArcFaceLoss(num_classes=opt.nclasses, embedding_size=512)
    if opt.cosface:
        criterion_cosface = losses.CosFaceLoss(num_classes=opt.nclasses, embedding_size=512)
    if opt.circle:
        criterion_circle = CircleLoss(m=0.25, gamma=32)  # gamma = 64 may lead to a better result.
    if opt.triplet:
        miner = miners.MultiSimilarityMiner()
        criterion_triplet = losses.TripletMarginLoss(margin=0.3)
    if opt.lifted:
        criterion_lifted = losses.GeneralizedLiftedStructureLoss(neg_margin=1, pos_margin=0)
    if opt.contrast:
        criterion_contrast = losses.ContrastiveLoss(pos_margin=0, neg_margin=1)
    if opt.sphere:
        criterion_sphere = losses.SphereFaceLoss(num_classes=opt.nclasses, embedding_size=512, margin=4)
    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase (only 'train' runs).
        for phase in ['train']:
            if phase == 'train':
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0.0
            running_corrects2 = 0.0
            running_corrects3 = 0.0
            # Iterate over data: the four view loaders advance in lockstep.
            for data, data2, data3, data4 in zip(dataloaders['satellite'], dataloaders['street'], dataloaders['drone'], dataloaders['google']):
                # get the inputs
                inputs, labels = data
                inputs2, labels2 = data2
                inputs3, labels3 = data3
                inputs4, labels4 = data4
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:  # skip the last batch
                    continue
                if use_gpu:
                    inputs = Variable(inputs.cuda().detach())
                    inputs2 = Variable(inputs2.cuda().detach())
                    inputs3 = Variable(inputs3.cuda().detach())
                    labels = Variable(labels.cuda().detach())
                    labels2 = Variable(labels2.cuda().detach())
                    labels3 = Variable(labels3.cuda().detach())
                    if opt.extra_Google:
                        inputs4 = Variable(inputs4.cuda().detach())
                        labels4 = Variable(labels4.cuda().detach())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                if phase == 'val':
                    with torch.no_grad():
                        outputs, outputs2 = model(inputs, inputs2)
                else:
                    if opt.views == 2:
                        outputs, outputs2 = model(inputs, inputs2)
                    elif opt.views == 3:
                        if opt.extra_Google:
                            outputs, outputs2, outputs3, outputs4 = model(inputs, inputs2, inputs3, inputs4)
                        else:
                            outputs, outputs2, outputs3 = model(inputs, inputs2, inputs3)
                # When any metric loss is enabled the model returns
                # (logits, embedding) pairs instead of bare logits.
                return_feature = opt.arcface or opt.cosface or opt.circle or opt.triplet or opt.contrast or opt.lifted or opt.sphere
                if opt.views == 2:
                    _, preds = torch.max(outputs.data, 1)
                    _, preds2 = torch.max(outputs2.data, 1)
                    loss = criterion(outputs, labels) + criterion(outputs2, labels2)
                elif opt.views == 3:
                    if return_feature:
                        logits, ff = outputs
                        logits2, ff2 = outputs2
                        logits3, ff3 = outputs3
                        # L2-normalise each embedding before the metric losses.
                        fnorm = torch.norm(ff, p=2, dim=1, keepdim=True)
                        fnorm2 = torch.norm(ff2, p=2, dim=1, keepdim=True)
                        fnorm3 = torch.norm(ff3, p=2, dim=1, keepdim=True)
                        ff = ff.div(fnorm.expand_as(ff))  # 8*512,tensor
                        ff2 = ff2.div(fnorm2.expand_as(ff2))
                        ff3 = ff3.div(fnorm3.expand_as(ff3))
                        loss = criterion(logits, labels) + criterion(logits2, labels2) + criterion(logits3, labels3)
                        _, preds = torch.max(logits.data, 1)
                        _, preds2 = torch.max(logits2.data, 1)
                        _, preds3 = torch.max(logits3.data, 1)
                        # Multiple perspectives are combined to calculate losses, please join ''--loss_merge'' in run.sh
                        if opt.loss_merge:
                            ff_all = torch.cat((ff, ff2, ff3), dim=0)
                            labels_all = torch.cat((labels, labels2, labels3), dim=0)
                        if opt.extra_Google:
                            logits4, ff4 = outputs4
                            fnorm4 = torch.norm(ff4, p=2, dim=1, keepdim=True)
                            ff4 = ff4.div(fnorm4.expand_as(ff4))
                            loss = criterion(logits, labels) + criterion(logits2, labels2) + criterion(logits3, labels3) + criterion(logits4, labels4)
                            if opt.loss_merge:
                                ff_all = torch.cat((ff_all, ff4), dim=0)
                                labels_all = torch.cat((labels_all, labels4), dim=0)
                        if opt.arcface:
                            if opt.loss_merge:
                                loss += criterion_arcface(ff_all, labels_all)
                            else:
                                loss += criterion_arcface(ff, labels) + criterion_arcface(ff2, labels2) + criterion_arcface(ff3, labels3)  # /now_batch_size
                                if opt.extra_Google:
                                    loss += criterion_arcface(ff4, labels4)  # /now_batch_size
                        if opt.cosface:
                            if opt.loss_merge:
                                loss += criterion_cosface(ff_all, labels_all)
                            else:
                                loss += criterion_cosface(ff, labels) + criterion_cosface(ff2, labels2) + criterion_cosface(ff3, labels3)  # /now_batch_size
                                if opt.extra_Google:
                                    loss += criterion_cosface(ff4, labels4)  # /now_batch_size
                        if opt.circle:
                            if opt.loss_merge:
                                loss += criterion_circle(*convert_label_to_similarity(ff_all, labels_all)) / now_batch_size
                            else:
                                loss += criterion_circle(*convert_label_to_similarity(ff, labels)) / now_batch_size + criterion_circle(*convert_label_to_similarity(ff2, labels2)) / now_batch_size + criterion_circle(*convert_label_to_similarity(ff3, labels3)) / now_batch_size
                                if opt.extra_Google:
                                    loss += criterion_circle(*convert_label_to_similarity(ff4, labels4)) / now_batch_size
                        if opt.triplet:
                            if opt.loss_merge:
                                hard_pairs_all = miner(ff_all, labels_all)
                                loss += criterion_triplet(ff_all, labels_all, hard_pairs_all)
                            else:
                                hard_pairs = miner(ff, labels)
                                hard_pairs2 = miner(ff2, labels2)
                                hard_pairs3 = miner(ff3, labels3)
                                loss += criterion_triplet(ff, labels, hard_pairs) + criterion_triplet(ff2, labels2, hard_pairs2) + criterion_triplet(ff3, labels3, hard_pairs3)  # /now_batch_size
                                if opt.extra_Google:
                                    hard_pairs4 = miner(ff4, labels4)
                                    loss += criterion_triplet(ff4, labels4, hard_pairs4)
                        if opt.lifted:
                            if opt.loss_merge:
                                loss += criterion_lifted(ff_all, labels_all)
                            else:
                                loss += criterion_lifted(ff, labels) + criterion_lifted(ff2, labels2) + criterion_lifted(ff3, labels3)  # /now_batch_size
                                if opt.extra_Google:
                                    loss += criterion_lifted(ff4, labels4)
                        if opt.contrast:
                            if opt.loss_merge:
                                loss += criterion_contrast(ff_all, labels_all)
                            else:
                                loss += criterion_contrast(ff, labels) + criterion_contrast(ff2, labels2) + criterion_contrast(ff3, labels3)  # /now_batch_size
                                if opt.extra_Google:
                                    loss += criterion_contrast(ff4, labels4)
                        if opt.sphere:
                            if opt.loss_merge:
                                loss += criterion_sphere(ff_all, labels_all) / now_batch_size
                            else:
                                loss += criterion_sphere(ff, labels) / now_batch_size + criterion_sphere(ff2, labels2) / now_batch_size + criterion_sphere(ff3, labels3) / now_batch_size
                                if opt.extra_Google:
                                    loss += criterion_sphere(ff4, labels4)
                    else:
                        # Plain classification path (no embeddings returned).
                        _, preds = torch.max(outputs.data, 1)
                        _, preds2 = torch.max(outputs2.data, 1)
                        _, preds3 = torch.max(outputs3.data, 1)
                        if opt.loss_merge:
                            outputs_all = torch.cat((outputs, outputs2, outputs3), dim=0)
                            labels_all = torch.cat((labels, labels2, labels3), dim=0)
                            if opt.extra_Google:
                                outputs_all = torch.cat((outputs_all, outputs4), dim=0)
                                labels_all = torch.cat((labels_all, labels4), dim=0)
                            loss = 4 * criterion(outputs_all, labels_all)
                        else:
                            loss = criterion(outputs, labels) + criterion(outputs2, labels2) + criterion(outputs3, labels3)
                            if opt.extra_Google:
                                loss += criterion(outputs4, labels4)
                # backward + optimize only if in training phase
                if epoch < opt.warm_epoch and phase == 'train':
                    warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                    loss *= warm_up
                if phase == 'train':
                    if fp16:  # we use optimier to backward loss
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    optimizer.step()
                    ##########
                    if opt.moving_avg < 1.0:
                        update_average(model_test, model, opt.moving_avg)
                # statistics
                if int(version[0]) > 0 or int(version[2]) > 3:  # for the new version like 0.4.0, 0.5.0 and 1.0.0
                    running_loss += loss.item() * now_batch_size
                else:  # for the old version like 0.3.0 and 0.3.1
                    running_loss += loss.data[0] * now_batch_size
                running_corrects += float(torch.sum(preds == labels.data))
                running_corrects2 += float(torch.sum(preds2 == labels2.data))
                if opt.views == 3:
                    running_corrects3 += float(torch.sum(preds3 == labels3.data))
            # NOTE(review): all accuracies normalised by the satellite split
            # size — assumes equally long loaders; confirm.
            epoch_loss = running_loss / dataset_sizes['satellite']
            epoch_acc = running_corrects / dataset_sizes['satellite']
            epoch_acc2 = running_corrects2 / dataset_sizes['satellite']
            if opt.views == 2:
                print('{} Loss: {:.4f} Satellite_Acc: {:.4f} Street_Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc, epoch_acc2))
            elif opt.views == 3:
                epoch_acc3 = running_corrects3 / dataset_sizes['satellite']
                print('{} Loss: {:.4f} Satellite_Acc: {:.4f} Street_Acc: {:.4f} Drone_Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc, epoch_acc2, epoch_acc3))
            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)
            # deep copy the model
            if phase == 'train':
                scheduler.step()
            last_model_wts = model.state_dict()
            if epoch % 20 == 19:
                save_network(model, opt.name, epoch)
            # draw_curve(epoch)
        # Per-epoch elapsed-time report.
        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()
    # Final elapsed-time report after all epochs.
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # print('Best val Acc: {:4f}'.format(best_acc))
    # save_network(model_test, opt.name+'adapt', epoch)
    return model
# Driver script for the pruning experiments: builds/loads a network, then
# trains, hard-prunes, or soft-prunes it depending on the CLI flags, and
# finally evaluates and saves the result.
from parameter import get_parameter
from utils import load_network, save_network
from train import train_network
from evaluate import test_network
from hardprune import hard_prune_network
from softprune import soft_prune_network
import os

# NOTE(review): hard-codes GPU 2 for every run — consider a CLI flag.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

if __name__ == '__main__':
    args = get_parameter()
    network = load_network(args)
    print(network)
    # Modes are mutually exclusive; training wins if several flags are set.
    if args.train_flag:
        print('args.train_flag:', args.train_flag)
        network = train_network(network, args)
    elif args.hard_prune_flag:
        print('hard_prune_flag:', args.hard_prune_flag)
        network = hard_prune_network(network, args)
    elif args.soft_prune_flag:
        network = soft_prune_network(network, args)
    print(network)
    # Evaluate and persist whatever the selected mode produced.
    test_network(network, args)
    # network = train_network(network, args)
    save_network(network, args)
def train(model, criterion, optimizer, scheduler, dataloder, text_loader,
          num_epochs, device, stage):
    """Stage-I trainer for the joint image/text model.

    Iterates the image loader and text loader in lockstep, runs one joint
    forward ``model(inputs, text_inputs)`` that must return
    ``(image_logits, text_logits)``, applies ``criterion`` to each head, and
    takes a single optimizer step per batch.  Every second epoch (and on the
    last one) the model is flipped to ``model.mode = 'test'`` and evaluated
    with ``test()``.

    Args:
        model: joint model exposing a ``.mode`` attribute ('train'/'test').
        criterion: classification loss applied to both heads.
        optimizer: torch optimizer over the model parameters.
        scheduler: per-epoch LR scheduler.
        dataloder: image loader; ``dataloder.dataset.imgs`` is used for logging.
        text_loader: text loader; its labels are cast to int64.
        num_epochs: total number of training epochs.
        device: torch device that inputs/labels are moved to.
        stage: free-form stage tag — only logged.

    Side effects: logs under ``save_dir_path`` (module-level) and saves the
    final weights via ``utils.save_network(..., 'final')``.
    """
    start_time = time.time()

    # Logger instance
    logger = utils.Logger(save_dir_path)
    logger.info('-' * 10)
    logger.info(vars(arg))
    logger.info('Stage: ' + stage)
    print(
        "################################### Train stage I ######################################"
    )

    for epoch in range(num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, num_epochs))
        model.train()

        # Running statistics for this epoch (accuracy counters stay on GPU).
        running_loss = 0.0
        running_text_loss = 0.0
        batch_num = 0
        img_cor = torch.zeros(1).squeeze().cuda()
        total = torch.zeros(1).squeeze().cuda()
        txt_cor = torch.zeros(1).squeeze().cuda()
        txt_total = torch.zeros(1).squeeze().cuda()

        for (inputs, labels), (text_inputs, text_labels) in zip(dataloder,
                                                                text_loader):
            batch_num += 1
            inputs = inputs.to(device)
            labels = labels.to(device)
            text_inputs = text_inputs.to(device)
            text_labels = text_labels.to(device, dtype=torch.int64)

            outputs, text_outs = model(inputs, text_inputs)

            # Instance losses, one per head.
            loss = criterion(outputs, labels)
            text_loss = criterion(text_outs, text_labels)

            optimizer.zero_grad()
            # BUGFIX: the original called loss.backward() and then
            # text_loss.backward().  Both losses come from the same forward
            # call; if the two heads share any part of the graph the second
            # backward raises "graph has been freed".  A single backward on
            # the sum accumulates identical gradients and is always safe.
            (loss + text_loss).backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
            running_text_loss += text_loss.item() * text_inputs.size(0)

            # Accuracy counters.
            img_pre = torch.argmax(outputs, 1)
            img_cor += (img_pre == labels).sum().float()
            total += len(labels)
            txt_pre = torch.argmax(text_outs, 1)
            txt_cor += (txt_pre == text_labels).sum().float()
            txt_total += len(text_labels)

            if batch_num % 10 == 0:
                logger.info(
                    'Train image epoch : {} [{}/{}]\t Image Loss:{:.6f}\t || Text Loss:{:.6f}'
                    .format(epoch + 1, batch_num * len(inputs),
                            len(dataloder.dataset.imgs),
                            running_loss / (batch_num * arg.batch_size),
                            running_text_loss / (batch_num * arg.batch_size)))

        # BUGFIX: since PyTorch 1.1 the scheduler must be stepped AFTER the
        # epoch's optimizer steps; the original stepped it at epoch start.
        scheduler.step()
        logger.info('Epoch {}:Done!!!'.format(epoch + 1))

        if (epoch + 1) % 2 == 0 or epoch + 1 == num_epochs:
            # Periodic testing / validation; release cached blocks first.
            torch.cuda.empty_cache()
            model.mode = 'test'
            CMC, mAP = test(model, arg.datasets, 128)
            logger.info('Testing: Top1:%.2f Top5:%.2f Top10:%.2f mAP:%.2f' %
                        (CMC[0], CMC[4], CMC[9], mAP))
            model.mode = 'train'

        logger.info('-' * 10)

    time_elapsed = time.time() - start_time
    logger.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # Save final model weights.
    utils.save_network(model, save_dir_path, 'final')
# NOTE(review): this is a mid-function fragment of a discriminator
# training/testing loop — its enclosing `def` is not visible in this chunk,
# so the code is left byte-identical.  What the visible code shows: it logs
# per-iteration scalars to TensorBoard, prints/appends epoch-level BCE and
# accuracy averages, saves the discriminator checkpoint, decays the learning
# rate, then switches to eval mode and iterates `tsize` test batches drawn
# from `testing_batch_generator`.
# NOTE(review): `epoch_acc*100` in the print vs `epoch_acc` (no *100) in the
# file write is presumably intentional (percent vs fraction) — confirm.
# NOTE(review): typos preserved on purpose ('Negtive', 'categery') — they are
# TensorBoard tag strings; renaming them would break existing dashboards.
epoch_acc += acc writer_train.add_scalar('Train/Positive Activation', pa, total_iter) writer_train.add_scalar('Train/Negtive Activation', na, total_iter) writer_train.add_scalar('Train/bce_loss', bce, total_iter) writer_train.add_scalar('Train/categery_acc', acc, total_iter) total_iter += 1 progress.finish() utils.clear_progressbar() print('[%02d] bce: %.5f | acc: %.3f (%d)' % (epoch, epoch_bce/opt.epoch_size, float(epoch_acc*100)/opt.epoch_size, epoch*opt.epoch_size*opt.batch_size)) with open(os.path.join(opt.log_dir,'discriminator_losses%s.txt' %(opt.dataset)),mode='a') as f: f.write('%0.8f %.4f \n' %(epoch_bce/opt.epoch_size, float(epoch_acc)/opt.epoch_size)) # save the model save_network(discriminator, 'discriminator', 'last', opt.log_dir, opt.gpu_ids) update_learning_rate(optimizers, epoch, opt.lr_decay_iters, gamma = 0.1) # Testing: discriminator.eval() tsize = 50 time_wise_pa = np.zeros((tsize, opt.n_future)) time_wise_na = np.zeros((tsize, opt.n_future)) time_wise_acc = np.zeros((tsize, opt.n_future)) print('Testing epoch %d'%(epoch)) progress_test = ProgressBar(widgets=widgets, maxval=tsize).start() for k in range(tsize): progress_test.update(k+1) x = next(testing_batch_generator) x = generate_sequence(x)
def train_model(model, criterion, optimizer, scheduler, stage=None,
                num_epochs=25):
    """Train/validate a (optionally PCB part-based) classifier.

    Runs a 'train' and a 'val' phase per epoch over module-level
    ``dataloaders``/``dataset_sizes``, with linear LR warm-up for the first
    ``opt.warm_epoch`` epochs and optional apex fp16 scaling.  When
    ``opt.PCB`` is set, the model returns ``opt.parts`` part logits whose
    softmax scores are summed for the prediction and whose losses are summed
    for the objective.

    Args:
        stage: checkpoint tag; 'full'/'rpp' also save at epoch % 10 == 4.
        num_epochs: upper epoch index (training resumes from module-level
            ``start_epoch``).

    Returns:
        The model with the last validation-phase weights loaded.
    """
    since = time.time()

    warm_up = 0.1  # We start from the 0.1*lrRate
    warm_iteration = round(dataset_sizes['train'] /
                           opt.batchsize) * opt.warm_epoch  # first 5 epoch
    # BUGFIX: previously only assigned inside the epoch loop, so
    # model.load_state_dict(last_model_wts) below raised NameError whenever
    # num_epochs <= start_epoch.  Seed it with the current weights.
    last_model_wts = model.state_dict()

    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:  # skip the last batch
                    continue
                if use_gpu:
                    inputs = Variable(inputs.cuda().detach())
                    labels = Variable(labels.cuda().detach())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward (no autograd bookkeeping during validation)
                if phase == 'val':
                    with torch.no_grad():
                        outputs = model(inputs)
                else:
                    outputs = model(inputs)

                if not opt.PCB:
                    _, preds = torch.max(outputs.data, 1)
                    loss = criterion(outputs, labels)
                else:
                    # Part-based: sum softmax scores for the prediction and
                    # sum per-part losses for the objective.
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = opt.parts
                    for i in range(num_part):
                        part[i] = outputs[i]

                    if num_part == 6:
                        score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(
                            part[3]) + sm(part[4]) + sm(part[5])
                    elif num_part == 8:
                        score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(
                            part[3]) + sm(part[4]) + sm(part[5]) + sm(
                                part[6]) + sm(part[7])
                    else:
                        score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(
                            part[3])
                    _, preds = torch.max(score.data, 1)

                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)

                # Linear warm-up: scale the loss for the first warm epochs.
                if epoch < opt.warm_epoch and phase == 'train':
                    warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                    loss *= warm_up

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # we use optimier to backward loss
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    optimizer.step()

                # statistics — loss.item() for torch >= 0.4, .data[0] before.
                if int(version[0]) > 0 or int(version[2]) > 3:
                    running_loss += loss.item() * now_batch_size
                else:
                    running_loss += loss.data[0] * now_batch_size
                running_corrects += float(torch.sum(preds == labels.data))

            # Step the LR scheduler once per training epoch, after the
            # optimizer updates (PyTorch >= 1.1 convention).
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)

            # Snapshot after each validation phase.
            if phase == 'val':
                last_model_wts = model.state_dict()
                if epoch % 10 == 9 or ((stage == 'full' or stage == 'rpp')
                                       and epoch % 10 == 4):
                    save_network(model, epoch, stage)
                draw_curve(epoch, stage)

            time_elapsed = time.time() - since
            print('Training complete in {:.0f}m {:.0f}s'.format(
                time_elapsed // 60, time_elapsed % 60))
            print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # Load the last validation-phase weights and save as 'last'.
    model.load_state_dict(last_model_wts)
    save_network(model, 'last', stage)
    return model
def train_model(model, criterion, optimizer, scheduler, start_epoch=0,
                num_epochs=25):
    """Train a classifier (optional PCB/CPB part heads, optional auto-aug).

    Trains only (phase list is ['train']), tracks the best epoch by training
    loss, saves a checkpoint every epoch, and finally reloads the best
    weights and saves them as 'last'.  With ``opt.autoaug`` the augmented
    input is used with probability ``gamma``, which anneals linearly from 0
    to 1 over the whole run.

    Returns:
        The model with the best-training-loss weights loaded.
    """
    since = time.time()

    warm_up = 0.1  # We start from the 0.1*lrRate
    gamma = 0.0  # auto_aug mixing probability
    warm_iteration = round(dataset_sizes['train'] /
                           opt.batchsize) * opt.warm_epoch  # first 5 epoch
    total_iteration = round(
        dataset_sizes['train'] / opt.batchsize) * num_epochs
    # BUGFIX: the original stored an unused `best_model_wts` while
    # `last_model_wts` (the one actually loaded at the end) was only
    # assigned when an epoch improved on best_loss — a NameError if that
    # never happened or the loop never ran.  Seed the loaded variable.
    last_model_wts = model.state_dict()
    best_loss = 9999
    best_epoch = 0

    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('gamma: %.4f' % gamma)
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs; with auto-aug, pick the augmented copy
                # with probability gamma and anneal gamma toward 1.
                if opt.autoaug:
                    inputs, inputs2, labels = data
                    if random.uniform(0, 1) > gamma:
                        inputs = inputs2
                    gamma = min(1.0, gamma + 1.0 / total_iteration)
                else:
                    inputs, labels = data
                now_batch_size, c, h, w = inputs.shape
                if now_batch_size < opt.batchsize:  # skip the last batch
                    continue

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda().detach())
                    labels = Variable(labels.cuda().detach())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if phase == 'val':
                    with torch.no_grad():
                        outputs = model(inputs)
                else:
                    outputs = model(inputs)

                if opt.PCB:
                    # 6-part PCB head: summed softmax scores + summed losses.
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 6
                    for i in range(num_part):
                        part[i] = outputs[i]
                    score = sm(part[0]) + sm(part[1]) + sm(part[2]) + sm(
                        part[3]) + sm(part[4]) + sm(part[5])
                    _, preds = torch.max(score.data, 1)
                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)
                elif opt.CPB:
                    # 3-part CPB head, same scheme.
                    part = {}
                    sm = nn.Softmax(dim=1)
                    num_part = 3
                    for i in range(num_part):
                        part[i] = outputs[i]
                    score = sm(part[0]) + sm(part[1]) + sm(part[2])
                    _, preds = torch.max(score.data, 1)
                    loss = criterion(part[0], labels)
                    for i in range(num_part - 1):
                        loss += criterion(part[i + 1], labels)
                else:
                    loss = criterion(outputs, labels)
                    # angle/arc heads return (logits, feature); logits first.
                    if opt.angle or opt.arc:
                        outputs = outputs[0]
                    _, preds = torch.max(outputs.data, 1)

                # Linear warm-up for the first warm epochs.
                if epoch < opt.warm_epoch and phase == 'train':
                    warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                    loss *= warm_up

                # backward + optimize only if in training phase
                if phase == 'train':
                    if fp16:  # we use optimier to backward loss
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    optimizer.step()

                # statistics — loss.item() for torch >= 0.4, .data[0] before.
                if int(version[0]) > 0 or int(version[2]) > 3:
                    running_loss += loss.item() * now_batch_size
                else:
                    running_loss += loss.data[0] * now_batch_size
                running_corrects += float(torch.sum(preds == labels.data))
                # Drop per-batch references early to ease GPU memory pressure.
                del (loss, outputs, inputs, preds)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))
            y_loss[phase].append(epoch_loss)
            y_err[phase].append(1.0 - epoch_acc)

            # Save a checkpoint every epoch (unwrap DataParallel if needed).
            if len(opt.gpu_ids) > 1:
                save_network(model.module, opt.name, epoch + 1)
            else:
                save_network(model, opt.name, epoch + 1)
            draw_curve(epoch)

            time_elapsed = time.time() - since
            print('Training complete in {:.0f}m {:.0f}s'.format(
                time_elapsed // 60, time_elapsed % 60))
            print()

            # Remember the weights of the best (lowest-loss) epoch so far.
            if epoch_loss < best_loss:
                best_loss = epoch_loss
                best_epoch = epoch
                last_model_wts = model.state_dict()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best epoch: {:d} Best Train Loss: {:4f}'.format(
        best_epoch, best_loss))

    # load best model weights
    model.load_state_dict(last_model_wts)
    save_network(model, opt.name, 'last')
    return model
def train_rank(model, criterion, optimizer, scheduler, dataloder, text_loader,
               num_epochs, device, stage):
    """Stage-II trainer: cross-modal ranking (triplet) loss.

    Pairs image and text batches, shuffles the pairing once per epoch, mines
    triplets in both directions (image->text via ``ImageSelector`` and
    text->image via ``TextSelector``), and optimizes the summed ranking
    ``criterion`` (expected to be a triplet-style loss taking
    anchor/positive/negative).  Every second epoch (and on the last one) the
    model is evaluated with ``test()``.

    Side effects: logs under ``save_dir_path`` and saves the final weights
    via ``utils.save_network(..., 'final_r')``.
    """
    start_time = time.time()

    # Logger instance
    logger = utils.Logger(save_dir_path)
    logger.info('-' * 10)
    logger.info(vars(arg))
    logger.info('Stage: ' + stage)
    print(
        "############################ Train stage II #############################"
    )

    for epoch in range(num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, num_epochs))
        model.train()
        scheduler.step()

        batch_num = 0
        loss_avg = []
        # Re-shuffle the image/text batch pairing each epoch.
        get = list(zip(dataloder, text_loader))
        random.shuffle(get)
        img, txt = zip(*get)

        for (inputs, labels), (text_inputs, text_labels) in zip(img, txt):
            batch_num += 1
            inputs = inputs.to(device)
            labels = labels.to(device)
            text_inputs = text_inputs.to(device)
            text_labels = text_labels.to(device, dtype=torch.int64)

            outputs, text_outs = model(inputs, text_inputs)

            # Mine triplets in both directions and sum the two ranking losses.
            anc_IT, pos_IT, neg_IT = ImageSelector(outputs, text_outs, labels)
            anc_TI, pos_TI, neg_TI = TextSelector(text_outs, outputs, labels)
            loss_rank = criterion(anc_IT, pos_IT, neg_IT) + criterion(
                anc_TI, pos_TI, neg_TI)

            optimizer.zero_grad()
            loss_rank.backward()
            optimizer.step()

            # BUGFIX: the original appended the loss *tensor*, which keeps
            # the whole autograd graph alive for up to 10 batches (GPU memory
            # leak) and feeds a Tensor into a '{:.6f}' format.  Store the
            # detached Python float instead.
            loss_avg.append(loss_rank.item())

            if batch_num % 10 == 0:
                loss_avg = sum(loss_avg) / len(loss_avg)
                logger.info(
                    'Stage II training : {} [{}]]\t Rank_loss:{:.6f}'.format(
                        epoch + 1, batch_num * len(inputs), loss_avg))
                loss_avg = []

        if (epoch + 1) % 2 == 0 or epoch + 1 == num_epochs:
            # Periodic testing / validation.
            torch.cuda.empty_cache()
            CMC, mAP = test(model, arg.datasets, 128)
            logger.info('Testing: Top1:%.2f Top5:%.2f Top10:%.2f mAP:%.2f' %
                        (CMC[0], CMC[4], CMC[9], mAP))

        logger.info('-' * 10)

    time_cost = time.time() - start_time
    logger.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_cost // 60, time_cost % 60))
    utils.save_network(model, save_dir_path, 'final_r')
def train_model(model, criterion, optimizer, scheduler, start_epoch=0,
                num_epochs=25):
    """Train the text-to-vehicle retrieval model (RoBERTa-tokenized NL).

    Tokenizes the natural-language batch with a RoBERTa tokenizer, delegates
    the actual objective to ``compute_loss``, and tracks the best epoch by
    training loss.  Supports optional motion input (``opt.motion``), apex
    fp16, and SAM two-step optimization (``opt.sam``).

    Returns:
        The model with the best-training-loss weights loaded.
    """
    bert_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    since = time.time()

    warm_up = 0.1  # We start from the 0.1*lrRate
    gamma = 0.0  # auto_aug
    warm_iteration = round(
        dataset_size / opt.batchsize) * opt.warm_epoch * 2  # first 5 epoch
    print(warm_iteration)
    total_iteration = round(dataset_size / opt.batchsize) * num_epochs
    # BUGFIX: seed the variable that is loaded at the end; previously it was
    # only assigned when an epoch improved on best_loss, risking NameError.
    last_model_wts = model.state_dict()
    best_loss = 9999
    best_epoch = 0
    if opt.circle:
        # NOTE(review): criterion_circle is created here but not referenced
        # in this function — presumably consumed elsewhere; confirm.
        criterion_circle = CircleLoss(m=0.25, gamma=32)

    for epoch in range(num_epochs - start_epoch):
        epoch = epoch + start_epoch
        print('gamma: %.4f' % gamma)
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0.0

            # Iterate over data.
            with tqdm(dataloader, ascii=True) as tq:
                for data in tq:
                    # zero the parameter gradients
                    if opt.motion:
                        nl, crop, motion, nl_id, crop_id, label = data
                    else:
                        nl, crop, nl_id, crop_id, label = data
                        motion = None
                    tokens = bert_tokenizer.batch_encode_plus(
                        nl, padding='longest', return_tensors='pt')
                    optimizer.zero_grad()

                    loss = compute_loss(model, tokens['input_ids'].cuda(),
                                        tokens['attention_mask'].cuda(),
                                        crop.cuda(), motion, nl_id, crop_id,
                                        label, warm_up)

                    # Linear warm-up for the first warm epochs.
                    if epoch < opt.warm_epoch and phase == 'train':
                        warm_up = min(1.0, warm_up + 0.9 / warm_iteration)
                        loss *= warm_up

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        if fp16:  # we use optimier to backward loss
                            with amp.scale_loss(loss,
                                                optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()
                        if opt.sam:
                            # NOTE(review): SAM's second step re-calls
                            # loss.backward() on the same graph — this needs
                            # retain_graph (or a second forward) to work;
                            # confirm against the SAM optimizer in use.
                            optimizer.first_step(zero_grad=True)
                            loss.backward()
                            optimizer.second_step(zero_grad=True)
                        else:
                            optimizer.step()

                    # statistics — loss.item() for torch >= 0.4.
                    if int(version[0]) > 0 or int(version[2]) > 3:
                        running_loss += loss.item() * opt.batchsize
                    else:
                        # BUGFIX: this branch referenced `now_batch_size`,
                        # which is never defined in this function (NameError
                        # on old PyTorch).  Use opt.batchsize like the branch
                        # above.
                        running_loss += loss.data[0] * opt.batchsize
                    # Drop per-batch references early to ease memory pressure.
                    del (loss, tokens, data, nl, crop, nl_id, crop_id, label)

            epoch_loss = running_loss / dataset_size
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
            y_loss[phase].append(epoch_loss)

            # Periodic checkpoint.
            if epoch % 10 == 0:
                save_network(model, opt.name, epoch + 1)
            draw_curve(epoch)

            time_elapsed = time.time() - since
            print('Training complete in {:.0f}m {:.0f}s'.format(
                time_elapsed // 60, time_elapsed % 60))
            print()

            # Remember the weights of the best (lowest-loss) epoch so far.
            if epoch_loss < best_loss:
                best_loss = epoch_loss
                best_epoch = epoch
                last_model_wts = model.state_dict()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best epoch: {:d} Best Train Loss: {:4f}'.format(
        best_epoch, best_loss))

    # load best model weights
    model.load_state_dict(last_model_wts)
    save_network(model, opt.name, 'last')
    return model