def train_one_epoch(self, specializing=False, cosine_decay=False):
    """
    One epoch training function
    :return:
    """
    if specializing:
        tqdm_batch = tqdm.tqdm(
            self.sub_data_loader.binary_train_loader,
            total=self.sub_data_loader.binary_train_iterations,
            desc="Epoch-{}-".format(self.current_epoch))
    else:
        tqdm_batch = tqdm.tqdm(self.data_loader.train_loader,
                               total=self.data_loader.train_iterations,
                               desc="Epoch-{}-".format(self.current_epoch))

    # set the model in training mode
    self.model.train()

    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    current_batch = 0
    for i, (x, y) in enumerate(tqdm_batch):
        if self.cuda:
            x = x.cuda(non_blocking=self.config.async_loading)
            y = y.cuda(non_blocking=self.config.async_loading)

        self.optimizer.zero_grad()
        if cosine_decay:
            self.adjust_learning_rate(self.optimizer, self.current_epoch, i,
                                      self.data_loader.train_iterations)

        pred = self.model(x)
        cur_loss = self.loss_fn(pred, y)
        if np.isnan(float(cur_loss.item())):
            raise ValueError('Loss is nan during training...')

        cur_loss.backward()
        self.optimizer.step()

        if specializing:
            # binary sub-task: only top-1 accuracy is meaningful
            top1 = cls_accuracy(pred.data, y.data)
            top1_acc.update(top1[0].item(), x.size(0))
        else:
            top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
            top1_acc.update(top1.item(), x.size(0))
            top5_acc.update(top5.item(), x.size(0))

        epoch_loss.update(cur_loss.item())

        self.current_iteration += 1
        current_batch += 1
        self.lr_list.append(self.optimizer.param_groups[0]['lr'])

    tqdm_batch.close()

    print("Training at epoch-" + str(self.current_epoch) + " | "
          + "loss: " + str(epoch_loss.avg)
          + "\tTop1 Acc: " + str(top1_acc.avg))
def validate(self, specializing=False):
    """
    One epoch validation
    :return:
    """
    if specializing:
        tqdm_batch = tqdm.tqdm(
            self.sub_data_loader.binary_valid_loader,
            total=self.sub_data_loader.binary_valid_iterations,
            desc="Epoch-{}-".format(self.current_epoch))
    else:
        tqdm_batch = tqdm.tqdm(self.data_loader.valid_loader,
                               total=self.data_loader.valid_iterations,
                               desc="Validation at -{}-".format(
                                   self.current_epoch))

    # set the model in evaluation mode
    self.model.eval()

    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    with torch.no_grad():  # no gradients needed during evaluation
        for x, y in tqdm_batch:
            if self.cuda:
                x = x.cuda(non_blocking=self.config.async_loading)
                y = y.cuda(non_blocking=self.config.async_loading)

            # model
            pred = self.model(x)
            # loss
            cur_loss = self.loss_fn(pred, y)
            if np.isnan(float(cur_loss.item())):
                raise ValueError('Loss is nan during validation...')

            if specializing:
                top1 = cls_accuracy(pred.data, y.data)
                top1_acc.update(top1[0].item(), x.size(0))
            else:
                top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
                top1_acc.update(top1.item(), x.size(0))
                top5_acc.update(top5.item(), x.size(0))

            epoch_loss.update(cur_loss.item())

    self.logger.info("Validation results at epoch-" + str(self.current_epoch)
                     + " | " + "loss: " + str(epoch_loss.avg)
                     + "\tTop1 Acc: " + str(top1_acc.avg))

    tqdm_batch.close()
    return top1_acc.avg
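# Every snippet in this section relies on an AverageMeter and a cls_accuracy
# helper defined elsewhere in the project. A minimal sketch of what they are
# assumed to look like, modeled on the standard PyTorch ImageNet example
# (the repo's actual definitions may differ):
class AverageMeter:
    """Tracks the latest value and the running average of a metric."""

    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def cls_accuracy(output, target, topk=(1,)):
    """Return top-k accuracy (in percent) for each k in topk, as tensors."""
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the maxk highest-scoring classes per sample
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res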
def train_one_epoch(self):
    """
    One epoch training function
    :return:
    """
    tqdm_batch = tqdm.tqdm(self.data_loader.train_loader,
                           total=self.data_loader.train_iterations,
                           desc="Epoch-{}-".format(self.current_epoch))

    # the class itself is the module, so switch it to training mode
    self.train()

    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    current_batch = 0
    for i, (x, y) in enumerate(tqdm_batch):
        if self.cuda:
            x = x.cuda(non_blocking=self.config.async_loading)
            y = y.cuda(non_blocking=self.config.async_loading)

        self.optimizer.zero_grad()
        # self.adjust_learning_rate(self.optimizer, self.current_epoch, i,
        #                           self.data_loader.train_iterations)

        pred = self(x)
        cur_loss = self.loss_fn(pred, y)
        if np.isnan(float(cur_loss.item())):
            raise ValueError('Loss is nan during training...')

        cur_loss.backward()
        self.optimizer.step()

        top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
        top1_acc.update(top1.item(), x.size(0))
        top5_acc.update(top5.item(), x.size(0))

        epoch_loss.update(cur_loss.item())

        self.current_iteration += 1
        current_batch += 1

    tqdm_batch.close()

    print("Training at epoch-" + str(self.current_epoch) + " | "
          + "loss: " + str(epoch_loss.avg)
          + "\tTop1 Acc: " + str(top1_acc.avg))
def _validate(self, config):
    """
    One epoch validation
    :return:
    """
    self.data_loader = Cifar100DataLoader(self.config)
    self.loss_fn = nn.CrossEntropyLoss().to(self.device)

    tqdm_batch = tqdm.tqdm(self.data_loader.valid_loader,
                           total=self.data_loader.valid_iterations,
                           desc="Validation at -{}-".format(
                               self.current_epoch))

    self.eval()

    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    with torch.no_grad():  # no gradients needed during evaluation
        for x, y in tqdm_batch:
            if self.cuda:
                x = x.cuda(non_blocking=self.config.async_loading)
                y = y.cuda(non_blocking=self.config.async_loading)

            # model
            pred = self(x)
            # loss
            cur_loss = self.loss_fn(pred, y)
            if np.isnan(float(cur_loss.item())):
                raise ValueError('Loss is nan during validation...')

            top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
            top1_acc.update(top1.item(), x.size(0))
            top5_acc.update(top5.item(), x.size(0))

            epoch_loss.update(cur_loss.item())

    print("Validation results at epoch-" + str(self.current_epoch) + " | "
          + "loss: " + str(epoch_loss.avg)
          + "\tTop1 Acc: " + str(top1_acc.avg))

    tqdm_batch.close()
    return top1_acc.avg
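# The train_one_epoch/_validate pair above calls self.train(), self.eval(),
# and self(x) directly, so the enclosing class is assumed to subclass
# nn.Module and to own its optimizer and loss. A skeletal sketch of that
# assumed structure (class and attribute names are inferred, not the repo's):
class TrainableModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.cuda else "cpu")
        self.current_epoch = 0
        self.current_iteration = 0
        # self.data_loader, self.optimizer, and self.loss_fn are assumed to
        # be initialized by the surrounding project code before training.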
def validate(self):
    """
    One epoch validation
    :return:
    """
    tqdm_batch = tqdm(self.data_loader.valid_loader,
                      total=self.data_loader.valid_iterations,
                      desc="Validation at -{}-".format(self.current_epoch))

    # set the model in evaluation mode
    self.model.eval()

    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    with torch.no_grad():  # no gradients needed during evaluation
        for x, y in tqdm_batch:
            if self.cuda:
                x = x.cuda(non_blocking=self.config.async_loading)
                y = y.cuda(non_blocking=self.config.async_loading)

            # model
            pred = self.model(x)
            # loss
            cur_loss = self.loss(pred, y)
            if np.isnan(float(cur_loss.item())):
                raise ValueError('Loss is nan during validation...')

            top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
            epoch_loss.update(cur_loss.item())
            top1_acc.update(top1.item(), x.size(0))
            top5_acc.update(top5.item(), x.size(0))

    self.logger.info("Validation results at epoch-" + str(self.current_epoch)
                     + " | " + "loss: " + str(epoch_loss.avg)
                     + "- Top1 Acc: " + str(top1_acc.avg)
                     + "- Top5 Acc: " + str(top5_acc.avg))

    tqdm_batch.close()
    return top1_acc.avg
def validate(self):
    """
    One epoch validation
    :return:
    """
    self.data_loader = fashion_mnist_dataloader(BATCH_SIZE=128)

    tqdm_batch = tqdm.tqdm(self.data_loader.valid_loader,
                           total=self.data_loader.valid_iterations,
                           desc="Validation at -{}-".format(
                               self.current_epoch))

    self.eval()

    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    with torch.no_grad():  # no gradients needed during evaluation
        for x, y in tqdm_batch:
            if self.cuda:
                x = x.cuda(non_blocking=True)
                y = y.cuda(non_blocking=True)

            # model
            pred = self(x)
            # loss
            cur_loss = self.loss_fn(pred, y)
            if np.isnan(float(cur_loss.item())):
                raise ValueError('Loss is nan during validation...')

            top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
            top1_acc.update(top1.item(), x.size(0))
            top5_acc.update(top5.item(), x.size(0))

            epoch_loss.update(cur_loss.item())

    print("Validation results at epoch-" + str(self.current_epoch) + " | "
          + "loss: " + str(epoch_loss.avg)
          + "\tTop1 Acc: " + str(top1_acc.avg))

    tqdm_batch.close()
    return top1_acc.avg
def train_one_epoch(self):
    """
    One epoch training function
    """
    # Initialize tqdm
    tqdm_batch = tqdm(self.data_loader.train_loader,
                      total=self.data_loader.train_iterations,
                      desc="Epoch-{}-".format(self.current_epoch))

    # Set the model to be in training mode
    self.model.train()

    # Initialize your average meters
    epoch_loss = AverageMeter()
    top1_acc = AverageMeter()
    top5_acc = AverageMeter()

    current_batch = 0
    for x, y in tqdm_batch:
        if self.cuda:
            x = x.cuda(non_blocking=self.config.async_loading)
            y = y.cuda(non_blocking=self.config.async_loading)

        # current iteration over total iterations
        progress = float(
            self.current_epoch * self.data_loader.train_iterations
            + current_batch) / (self.config.max_epoch
                                * self.data_loader.train_iterations)
        # progress = float(self.current_iteration) / (
        #     self.config.max_epoch * self.data_loader.train_iterations)

        lr = adjust_learning_rate(self.optimizer, self.current_epoch,
                                  self.config, batch=current_batch,
                                  nBatch=self.data_loader.train_iterations)

        # model
        pred = self.model(x, progress)
        # loss
        cur_loss = self.loss(pred, y)
        if np.isnan(float(cur_loss.item())):
            raise ValueError('Loss is nan during training...')

        # optimizer
        self.optimizer.zero_grad()
        cur_loss.backward()
        self.optimizer.step()

        top1, top5 = cls_accuracy(pred.data, y.data, topk=(1, 5))
        epoch_loss.update(cur_loss.item())
        top1_acc.update(top1.item(), x.size(0))
        top5_acc.update(top5.item(), x.size(0))

        self.current_iteration += 1
        current_batch += 1

        self.summary_writer.add_scalar("epoch/loss", epoch_loss.val,
                                       self.current_iteration)
        self.summary_writer.add_scalar("epoch/accuracy", top1_acc.val,
                                       self.current_iteration)

    tqdm_batch.close()

    self.logger.info("Training at epoch-" + str(self.current_epoch) + " | "
                     + "loss: " + str(epoch_loss.avg)
                     + "- Top1 Acc: " + str(top1_acc.avg)
                     + "- Top5 Acc: " + str(top5_acc.avg))
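# adjust_learning_rate above is an external helper that is not shown. A
# plausible sketch, assuming it implements per-batch cosine annealing;
# config.max_epoch appears in the snippet above, while config.learning_rate
# is an assumed field name (the repo's actual schedule may differ):
import math

def adjust_learning_rate(optimizer, epoch, config, batch=0, nBatch=None):
    """Cosine-anneal the learning rate at per-batch granularity."""
    t_total = config.max_epoch * nBatch  # total number of batches overall
    t_cur = epoch * nBatch + batch       # batches completed so far
    lr = 0.5 * config.learning_rate * (1 + math.cos(math.pi * t_cur / t_total))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr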
def train_neural_network(self):
    print_training = "Training CONV: valid_idx:{}, test_idx:{}, " \
                     "batch_norm:{}, keep_prob:{}".format(
                         self.valid_idx, self.test_idx, self.batch_norm,
                         self.keep_prob)
    print(print_training)
    logging.debug(print_training)

    self.session.run(tf.global_variables_initializer())

    best_validation_accuracy = 0
    last_improvement = 0
    start_time = time.time()
    idx = 0
    epochs = 0

    for i in range(self.num_iterations):
        # Batch training
        j = self.get_last_batch_index(self.num_examples, idx,
                                      self.batch_size)
        x_batch, y_batch = self.train_x[idx:j, :], self.train_y[idx:j, :]
        # TODO simplify batch processing
        if j == self.num_examples:
            epochs += 1
            idx = 0
            is_epoch = True
        else:
            is_epoch = False
            idx = j

        summary, train_loss, train_y_pred_cls, _ = self.session.run(
            [self.merged, self.cost, self.y_pred_cls, self.optimizer],
            feed_dict={
                self.x: x_batch,
                self.y: y_batch,
                self.is_training: True
            })

        train_cls_true = metrics.convert_labels_to_cls(y_batch)
        train_correct = (train_y_pred_cls == train_cls_true)
        train_acc, _ = metrics.cls_accuracy(train_correct)
        self.train_cost.append(train_loss)
        self.train_acc.append(train_acc)
        self.train_writer.add_summary(summary, i)

        # Calculate the validation accuracy and loss
        valid_correct, _, valid_cost = self.predict_cls(
            images=self.valid_x,
            labels=self.valid_y,
            cls_true=metrics.convert_labels_to_cls(self.valid_y))
        validation_acc, _ = metrics.cls_accuracy(valid_correct)
        self.validation_acc.append(validation_acc)
        self.validation_cost.append(valid_cost)

        if is_epoch or (i == (self.num_iterations - 1)):
            if validation_acc > best_validation_accuracy:
                # Save all variables of the best-performing TensorFlow
                # graph to file.
                self.saver.save(sess=self.session, save_path=self.save_path)
                # Update the best validation accuracy.
                best_validation_accuracy = validation_acc
                last_improvement = i
                improved_str = '*'
            else:
                improved_str = ''

            print_opt = "Epoch: {}, Training Loss: {}, Acc: {}, " \
                        "Validation Loss: {}, Acc: {} {}".format(
                            epochs, train_loss, train_acc, valid_cost,
                            validation_acc, improved_str)
            print(print_opt)
            logging.debug(print_opt)

        if i - last_improvement > self.require_improvement:
            print_impro = ("No improvement found in a while, "
                           "stopping optimization.")
            print(print_impro)
            logging.debug(print_impro)
            # Break out from the for-loop.
            break

    # Ending time.
    end_time = time.time()
    time_dif = end_time - start_time
    print_time = "Time usage: " + str(timedelta(seconds=int(round(time_dif))))
    print(print_time)
    logging.debug(print_time)

    return last_improvement, epochs
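# get_last_batch_index is not shown in this excerpt. From its call site it
# presumably clamps the end index of the current mini-batch to the dataset
# size; a minimal sketch of that assumed behavior:
def get_last_batch_index(self, num_examples, idx, batch_size):
    # End of the current slice, capped at the number of training examples.
    return min(idx + batch_size, num_examples)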
def train_one_epoch(self):
    """
    One epoch of training
    :return:
    """
    self.model.train()
    # self.scheduler.step()
    for batch_idx, (data, target) in enumerate(self.data_loader.train_loader):
        data, target = data.to(self.device), target.to(self.device)
        self.optimizer.zero_grad()
        # output = self.model(data)

        target_one_hot = to_one_hot(
            target.cpu(),
            num_class=self.config.train_classes).to(self.device)
        # label smoothing:
        # target_one_hot = (1.0 - 0.1) * target_one_hot + 0.1 / 100
        # target_float_hot = to_float_hot(
        #     target.cpu(), num_class=self.config.train_classes).to(self.device)
        # loss = self.loss(output, target)

        inputs_var, labels_var = data, target
        class_label = torch.Tensor(np.array(range(self.config.train_classes)))
        center_labels_var = torch.autograd.Variable(
            class_label.to(torch.long)).cuda()

        fvec, feature, class_weight = self.model(inputs_var)

        # Keep a queue of the last five class-weight snapshots and mix their
        # mean into the current class weights once the queue is full.
        if self.q.full():
            self.q.get()
            self.q.put(class_weight.cpu())
            a = list(self.q.queue)
            temp = (a[0] + a[1] + a[2] + a[3] + a[4]) / 5
            class_weight = 0.2 * class_weight + temp.to(self.device)
        else:
            self.q.put(class_weight.cpu())

        # one-hot vector
        labels_var_one_hot = target_one_hot
        # inter-class distance
        fvec = fvec - 4 * labels_var_one_hot.cuda()
        # intra-class distance
        loss_1 = self.loss(fvec, labels_var)

        origin_class_weight = class_weight
        batch_center = self.Center(feature, target,
                                   self.config.train_classes, class_weight)
        # batch_center = F.relu(batch_center)
        batch_center = F.normalize(batch_center, p=2, dim=1)
        # if self.current_epoch < 13:
        #     linear_beta = (13 - self.current_epoch) / 13
        #     norm_beta = scipy.stats.norm(0, 1).pdf(self.current_epoch / 12 / 2)
        #     beta = 1
        #     class_weight = torch.div(class_weight + beta * batch_center, 2)
        class_weight = torch.div(class_weight + batch_center, 2)
        # class_weight = torch.div(class_weight - batch_center, 2)
        class_weight = F.normalize(class_weight)

        center_loss = self.loss(
            torch.mm(class_weight, torch.t(class_weight)), center_labels_var)
        triplet_loss = self.triplet(feature, target, class_weight)
        triplet_origin_loss = self.triplet(feature, target,
                                           origin_class_weight)

        loss = 0.5 * center_loss + loss_1 + 0.1 * triplet_loss
        # loss = 0.5 * center_loss + 0.1 * triplet_loss
        if self.config.loss_mode == '100':
            loss = triplet_origin_loss
        if self.config.loss_mode == '101':
            loss = triplet_origin_loss + loss_1
        if self.config.loss_mode == '110':
            loss = triplet_loss
        if self.config.loss_mode == '011':
            loss = loss_1
        if self.config.loss_mode == '111':
            loss = 0.5 * center_loss + loss_1 + 0.1 * triplet_loss

        prec1, prec5 = cls_accuracy(fvec, target, topk=(1, 5))
        self.epoch_loss.update(loss.item())
        self.top1.update(prec1.item())
        self.top5.update(prec5.item())

        # loss.backward()
        loss.backward(retain_graph=True)
        self.optimizer.step()

        if batch_idx % self.config.log_interval == 0:
            # self.logger.info(f'center_loss:{center_loss}\t loss_1:{loss_1}\t'
            #                  f' triplet_loss:{triplet_loss}')
            self.logger.info(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\t'
                'Loss: {loss.val:.4f} ({loss.avg:.4f})\t'
                'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                'Prec@5 {top5.val:.3f} ({top5.avg:.3f}) \tlr {lr}'.format(
                    self.current_epoch, batch_idx * len(data),
                    len(self.data_loader.train_loader.dataset),
                    100. * batch_idx / len(self.data_loader.train_loader),
                    loss=self.epoch_loss, top1=self.top1, top5=self.top5,
                    lr=self.optimizer.param_groups[0]['lr']))

        self.current_iteration += 1
        if np.isnan(float(loss.item())):
            raise ValueError('Loss is nan during training...')

        self.summary_writer.add_scalar("batch/loss", self.epoch_loss.avg,
                                       self.current_iteration)
        self.summary_writer.add_scalar("batch/top1", self.top1.avg,
                                       self.current_iteration)
        self.summary_writer.add_scalar("batch/top5", self.top5.avg,
                                       self.current_iteration)
        self.summary_writer.add_scalar("batch/lr",
                                       self.optimizer.param_groups[0]['lr'],
                                       self.current_iteration)
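# to_one_hot is another helper that is not shown above. A minimal sketch of
# the assumed behavior (a float one-hot encoding of integer class labels;
# the repo's version may differ):
def to_one_hot(labels, num_class):
    # labels: LongTensor of shape (batch,) -> FloatTensor (batch, num_class)
    one_hot = torch.zeros(labels.size(0), num_class)
    one_hot.scatter_(1, labels.view(-1, 1).long(), 1.0)
    return one_hot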