def valid_epoch(summary, summary_writer, epoch, model, loss_fn,
                dataloader_valid, cfg):
    """Run one validation epoch (single-label / softmax variant).

    Evaluates `model` on `dataloader_valid` under `torch.no_grad()`,
    accumulating loss, accuracy and a per-class confusion matrix summed
    across all distributed ranks via `reduce_tensor`, then stores the
    results into `summary` (rank 0 only).

    NOTE(review): a second `valid_epoch` defined later in this file shadows
    this definition at import time — confirm which one is intended.

    Args:
        summary: dict with a running 'step' counter; mutated in place with
            'loss', 'acc' and 'confusion_matrix' (on rank 0).
        summary_writer: TensorBoard writer (unused in this function).
        epoch: current epoch index (unused in this function).
        model: network to evaluate; switched to eval mode here.
        loss_fn: criterion applied to masked logits vs. masked targets.
        dataloader_valid: validation DataLoader whose batches yield
            (img, target, mask) through `data_prefetcher`.
        cfg: config dict; reads 'num_classes', 'class_point', 'labels'.

    Returns:
        The (mutated) `summary` dict.
    """
    model.eval()
    num_classes = cfg['num_classes']
    class_point = cfg['class_point']  # unused in this function
    eval_loss = AverageMeter()
    eval_acc = AverageMeter()
    confusion_matrix = ConfusionMatrix(num_classes=num_classes)
    # Wrapped in a list, presumably so extra validation sets can be appended.
    dataloader = [dataloader_valid]
    name = cfg['labels']  # unused in this function
    time_now = time.time()
    loss_sum = 0     # unused accumulators left over from an older revision
    acc_sum = 0
    count = 0
    steps_count = 0
    for i in range(len(dataloader)):
        steps = len(dataloader[i])
        batch_size = dataloader[i].batch_size
        dataiter = iter(dataloader[i])
        # Use torch.no_grad() so no autograd state is tracked during eval.
        with torch.no_grad():
            acc_tmp = 0
            loss_tmp = 0
            # Prefetcher overlaps host->device copies with compute; the next
            # batch is fetched at the bottom of the loop body.
            prefetcher = data_prefetcher(dataiter)
            img, target, mask = prefetcher.next()
            for step in range(steps):
                # data, target = next(dataiter)
                data = img.to(device)  # likely already on device; no-op then
                target = target.to(device)
                output = model(data)
                output = output.view(int(batch_size), num_classes)
                target = target.view(int(batch_size))
                mask = mask.view(int(batch_size))
                # Only samples selected by `mask` contribute to loss/accuracy.
                conf_targets = target[mask]
                conf_preds = output[mask]
                # NOTE(review): if `mask` selects zero samples,
                # conf_targets.size(0) == 0 and the accuracy below divides by
                # zero — confirm the loader guarantees a non-empty mask.
                loss = loss_fn(conf_preds, conf_targets)
                torch.cuda.synchronize()
                probs = F.softmax(output, dim=1)
                _, predicts = torch.max(probs, 1)
                acc = (predicts[mask] == conf_targets).type(
                    torch.cuda.FloatTensor).sum() * 1.0 / conf_targets.size(0)
                # Accumulate (t = truth, p = prediction) confusion counts,
                # summed over ranks (reduction=False means no averaging).
                for t in range(num_classes):
                    for p in range(num_classes):
                        count = (predicts[mask][conf_targets == t] == p).type(
                            torch.cuda.FloatTensor).sum()
                        reduced_count = reduce_tensor(
                            count.data, reduction=False)
                        confusion_matrix.update(
                            t, p, to_python_float(reduced_count))
                # Cross-rank reduction of scalar metrics; every rank must
                # reach these collectives in the same order.
                reduced_loss = reduce_tensor(loss.data)
                reduced_acc = reduce_tensor(acc.data)
                eval_loss.update(to_python_float(reduced_loss))
                eval_acc.update(to_python_float(reduced_acc))
                if args.local_rank == 0:
                    time_spent = time.time() - time_now
                    time_now = time.time()
                    logging.info(
                        'data_num : {}, Step : {}, Testing Loss : {:.5f}, '
                        'Testing Acc : {:.3f}, Run Time : {:.2f}'
                        .format(
                            str(i), summary['step'] + 1, reduced_loss,
                            reduced_acc, time_spent))
                    summary['step'] += 1
                # Fetch the next (img, target, mask) triple for the next step.
                img, target, mask = prefetcher.next()
    if args.local_rank == 0:
        # NOTE(review): tensor_name says 'train/...' although this is the
        # validation pass — looks copied from train_epoch; confirm intent.
        summary['confusion_matrix'] = plot_confusion_matrix(
            confusion_matrix.matrix, cfg['labels'],
            tensor_name='train/Confusion matrix')
        summary['loss'] = eval_loss.avg
        # summary['acc'] = acc_sum / (steps * (batch_size))
        summary['acc'] = eval_acc.avg
    return summary
def valid_epoch(summary, summary_writer, epoch, model, loss_fn,
                dataloader_valid, cfg):
    """Run one validation epoch (multi-label / sigmoid variant).

    Evaluates `model` on `dataloader_valid` under `torch.no_grad()`.
    Per-class sigmoid probabilities are thresholded at 0.5; a zero column is
    prepended so index 0 acts as the "no class fired" slot before `MaxIndex`
    picks a single predicted class per sample. Accumulates loss, accuracy,
    a (num_classes+1)-way confusion matrix, and a recall over samples whose
    `label_degree` >= 20, all summed across ranks via `reduce_tensor`.

    NOTE(review): this definition shadows an earlier `valid_epoch` in the
    same file.

    Args:
        summary: dict with a running 'step' counter; mutated in place with
            'loss', 'recall', 'acc' and 'confusion_matrix' (on rank 0).
        summary_writer: TensorBoard writer (unused in this function).
        epoch: current epoch index (unused in this function).
        model: network to evaluate; switched to eval mode here.
        loss_fn: criterion applied to sigmoid outputs vs. multi-hot targets.
        dataloader_valid: validation DataLoader whose batches yield
            (img, target, label, label_degree) through `data_prefetcher`.
            # assumes label is the single-class index and label_degree a
            # numeric severity where >= 20 counts as positive — TODO confirm
        cfg: config dict; reads 'num_classes', 'class_point', 'labels'.

    Returns:
        The (mutated) `summary` dict.
    """
    logger = log.logger()
    model.eval()
    num_classes = cfg['num_classes']
    class_point = cfg['class_point']  # unused in this function
    eval_loss = AverageMeter()
    eval_acc = AverageMeter()
    eval_pred_posit = AverageMeter()    # sum of positive predictions (deg>=20)
    eval_label_posit = AverageMeter()   # sum of positive labels (deg>=20)
    # +1 row/column for the prepended "no class" slot at index 0.
    confusion_matrix = ConfusionMatrix(num_classes=(num_classes)+1)
    dataloader = [dataloader_valid]
    name = cfg['labels']  # unused in this function
    time_now = time.time()
    loss_sum = 0     # unused accumulators left over from an older revision
    acc_sum = 0
    count = 0
    steps_count = 0
    for i in range(len(dataloader)):
        steps = len(dataloader[i])
        batch_size = dataloader[i].batch_size
        dataiter = iter(dataloader[i])
        # Use torch.no_grad() so no autograd state is tracked during eval.
        with torch.no_grad():
            acc_tmp = 0
            loss_tmp = 0
            prefetcher = data_prefetcher(dataiter)
            img, target, label, label_degree = prefetcher.next()
            for step in range(steps):
                # data, target = next(dataiter)
                data = img.to(device)
                target = target.to(device)
                output = model(data)
                output = output.view(img.size(0), num_classes)
                target = target.view(img.size(0), num_classes)
                label = label.view(img.size(0))
                conf_preds = torch.sigmoid(output)
                # print("conf_preds", conf_preds.shape)
                loss = loss_fn(conf_preds, target)
                torch.cuda.synchronize()
                # Threshold each class independently at 0.5.
                predicts = (conf_preds >= 0.5)
                # Prepend a zero column so class 0 = "nothing fired".
                d = torch.Tensor([0] * img.size(0)
                                 ).reshape(-1, 1).to(device)
                predicts = torch.cat((d, predicts.float()), 1)
                logger.get_info(predicts)
                # _, predicts = torch.max(predicts, 1)
                # presumably returns the argmax class index per row, like the
                # commented torch.max above — verify MaxIndex in project utils
                predicts = MaxIndex(predicts, batch_size)
                # logger.get_info(predicts)
                acc = (predicts == label).type(
                    torch.cuda.FloatTensor).sum() * 1.0 / img.size(0)
                # Recall numerator: among samples with degree >= 20, how many
                # were predicted as a real class (> 1, i.e. past slot 0/1).
                recall_pred = (predicts[label_degree >= 20] > 1).type(
                    torch.cuda.FloatTensor).sum() * 1.0
                recall_label = (label_degree >= 20).sum()
                for t in range(num_classes+1):
                    for p in range(num_classes+1):
                        count = (predicts[label == t] == p).type(
                            torch.cuda.FloatTensor).sum()
                        reduced_count = reduce_tensor(
                            count.data, reduction=False)
                        confusion_matrix.update(
                            t, p, to_python_float(reduced_count))
                # Cross-rank reductions — same order on every rank.
                reduced_loss = reduce_tensor(loss.data)
                reduced_acc = reduce_tensor(acc.data)
                reduced_pred_20 = reduce_tensor(recall_pred.data)
                # NOTE(review): reduced without `.data`, unlike the others —
                # confirm reduce_tensor handles this consistently.
                reduced_label_20 = reduce_tensor(recall_label)
                eval_loss.update(to_python_float(reduced_loss))
                eval_acc.update(to_python_float(reduced_acc))
                eval_pred_posit.update(to_python_float(reduced_pred_20))
                eval_label_posit.update(to_python_float(reduced_label_20))
                if args.local_rank == 0:
                    time_spent = time.time() - time_now
                    time_now = time.time()
                    logging.info(
                        'data_num : {}, Step : {}, Testing Loss : {:.5f}, '
                        'Testing Acc : {:.3f}, Run Time : {:.2f}'
                        .format(
                            str(i), summary['step'] + 1, reduced_loss,
                            reduced_acc, time_spent))
                    summary['step'] += 1
                img, target, label, label_degree = prefetcher.next()
    if args.local_rank == 0:
        recall = eval_pred_posit.sum/float(eval_label_posit.sum)
        summary['confusion_matrix'] = plot_confusion_matrix(
            confusion_matrix.matrix, cfg['labels'],
            tensor_name='Confusion matrix')
        summary['loss'] = eval_loss.avg
        summary['recall'] = recall
        # summary['acc'] = acc_sum / (steps * (batch_size))
        summary['acc'] = eval_acc.avg
        print("Recall >=20:", recall)
    return summary
def train_epoch(epoch, summary, summary_writer, model, loss_fn, optimizer,
                dataloader_train, cfg):
    """Run one training epoch (single-label / softmax variant).

    Trains `model` on `dataloader_train` with apex AMP loss scaling,
    accumulating loss, accuracy and a per-class confusion matrix summed
    across ranks via `reduce_tensor`. TensorBoard scalars and the confusion
    matrix are written by rank 0 at the end of the epoch.

    NOTE(review): a second `train_epoch` defined later in this file shadows
    this definition at import time — confirm which one is intended.

    Args:
        epoch: current epoch index; stored in summary and used as the
            TensorBoard global step.
        summary: dict with running 'step'/'epoch'; mutated in place, and
            'confusion_matrix' is added on rank 0.
        summary_writer: TensorBoard writer for 'train/loss' and 'train/acc'.
        model: network to train; switched to train mode here.
        loss_fn: criterion applied to masked logits vs. masked targets.
        optimizer: optimizer already wrapped for apex amp.
        dataloader_train: training DataLoader whose batches yield
            (img, target, mask) through `data_prefetcher`.
        cfg: config dict; reads 'num_classes', 'class_point', 'labels'.

    Returns:
        The (mutated) `summary` dict.
    """
    model.train()
    num_classes = cfg['num_classes']
    class_point = cfg['class_point']  # unused in this function
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    confusion_matrix = ConfusionMatrix(num_classes=num_classes)
    steps = len(dataloader_train)
    batch_size = dataloader_train.batch_size
    dataiter = iter(dataloader_train)
    time_now = time.time()
    loss_sum = 0     # unused accumulators left over from an older revision
    acc_sum = 0
    summary['epoch'] = epoch
    if args.local_rank == 0:
        print("steps:", steps)
    # Prefetcher overlaps host->device copies with compute; the next batch
    # is fetched at the bottom of the loop body.
    prefetcher = data_prefetcher(dataiter)
    img, target, mask = prefetcher.next()
    for step in range(steps):
        data = img.to(device)  # likely already on device; no-op then
        target = target.to(device)
        # # mixup
        # # generate mixed inputs, two one-hot label vectors and mixing coefficient
        # data, target_a, target_b, lam = mixup_data(
        #     data, target, args.alpha, use_cuda)
        # print(data.shape)
        output = model(data)
        output = output.view(int(batch_size), num_classes)
        target = target.view(int(batch_size))
        mask = mask.view(int(batch_size))
        # target = target.long()
        # Only samples selected by `mask` contribute to loss/accuracy.
        conf_targets = target[mask]
        conf_preds = output[mask]
        # print("conf_preds", conf_preds.shape)
        loss = loss_fn(conf_preds, conf_targets)
        # loss = loss_func(loss_fn, output)
        optimizer.zero_grad()
        # apex AMP: scale the loss before backward to avoid fp16 underflow.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        # scheduler.step()
        # lr = scheduler.get_last_lr()[0]
        probs = F.softmax(output, dim=1)
        # torch.max(a, 1) returns each row's max element (FloatTensor) and
        # its index (LongTensor) — the column index of the row maximum.
        _, predicts = torch.max(probs, 1)
        # target = (target >= class_point).long()
        # NOTE(review): divides by conf_targets.size(0) — zero if the mask
        # selects no samples; confirm the loader guarantees otherwise.
        acc = (predicts[mask] == conf_targets).type(
            torch.cuda.FloatTensor).sum() * 1.0 / conf_targets.size(0)
        # Accumulate (t = truth, p = prediction) confusion counts, summed
        # over ranks (reduction=False means no averaging).
        for t in range(num_classes):
            for p in range(num_classes):
                count = (predicts[mask][conf_targets == t] == p).type(
                    torch.cuda.FloatTensor).sum()
                reduced_count = reduce_tensor(count.data, reduction=False)
                confusion_matrix.update(t, p, to_python_float(reduced_count))
        # Cross-rank reductions — same order on every rank.
        reduced_loss = reduce_tensor(loss.data)
        reduced_acc = reduce_tensor(acc.data)
        train_loss.update(to_python_float(reduced_loss))
        train_acc.update(to_python_float(reduced_acc))
        if args.local_rank == 0:
            time_spent = time.time() - time_now
            time_now = time.time()
            logging.info(
                'Epoch : {}, Step : {}, Training Loss : {:.5f}, '
                'Training Acc : {:.3f}, Run Time : {:.2f}'
                .format(
                    summary['epoch'] + 1, summary['step'] + 1,
                    train_loss.avg, train_acc.avg, time_spent))
            summary['step'] += 1
        img, target, mask = prefetcher.next()
    if args.local_rank == 0:
        time_spent = time.time() - time_now
        time_now = time.time()
        # NOTE(review): logs .val (last batch) while the console log above
        # uses .avg — confirm which is intended for TensorBoard.
        summary_writer.add_scalar(
            'train/loss', train_loss.val, epoch)
        summary_writer.add_scalar(
            'train/acc', train_acc.val, epoch)
        # summary_writer.add_scalar(
        #     'learning_rate', lr, summary['step'] + steps*epoch)
        summary_writer.flush()
        summary['confusion_matrix'] = plot_confusion_matrix(
            confusion_matrix.matrix, cfg['labels'],
            tensor_name='train/Confusion matrix')
        # summary['loss'] = train_loss.avg
        # summary['acc'] = acc_sum / (steps * (batch_size))
        # summary['acc'] = train_acc.avg
        summary['epoch'] = epoch
    return summary
def train_epoch(epoch, summary, summary_writer, model, loss_fn, optimizer,
                dataloader_train, cfg):
    """Run one training epoch (multi-label / sigmoid variant).

    Trains `model` on `dataloader_train` with apex AMP loss scaling.
    Per-class sigmoid probabilities are thresholded at 0.5; a zero column is
    prepended so index 0 acts as the "no class fired" slot before `MaxIndex`
    picks a single predicted class per sample. Accumulates loss, accuracy,
    a (num_classes+1)-way confusion matrix, and a recall over samples whose
    `label_degree` >= 20, all summed across ranks via `reduce_tensor`.

    NOTE(review): this definition shadows an earlier `train_epoch` in the
    same file.

    Args:
        epoch: current epoch index; stored in summary and used as the
            TensorBoard global step.
        summary: dict with running 'step'/'epoch'; mutated in place, and
            'confusion_matrix' is added on rank 0.
        summary_writer: TensorBoard writer for 'train/loss', 'train/acc'
            and 'train/recall'.
        model: network to train; switched to train mode here.
        loss_fn: criterion applied to sigmoid outputs vs. multi-hot targets.
        optimizer: optimizer already wrapped for apex amp.
        dataloader_train: training DataLoader whose batches yield
            (img, target, label, label_degree) through `data_prefetcher`.
            # assumes label is the single-class index and label_degree a
            # numeric severity where >= 20 counts as positive — TODO confirm
        cfg: config dict; reads 'num_classes', 'class_point', 'labels'.

    Returns:
        The (mutated) `summary` dict.
    """
    # logger = log.logger()
    model.train()
    num_classes = cfg['num_classes']
    class_point = cfg['class_point']  # unused in this function
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    train_pred_posit = AverageMeter()   # sum of positive predictions (deg>=20)
    train_label_posit = AverageMeter()  # sum of positive labels (deg>=20)
    # +1 row/column for the prepended "no class" slot at index 0.
    confusion_matrix = ConfusionMatrix(num_classes=(num_classes)+1)
    steps = len(dataloader_train)
    batch_size = dataloader_train.batch_size
    dataiter = iter(dataloader_train)
    time_now = time.time()
    loss_sum = 0     # unused accumulators left over from an older revision
    acc_sum = 0
    summary['epoch'] = epoch
    if args.local_rank == 0:
        print("steps:", steps)
    prefetcher = data_prefetcher(dataiter)
    img, target, label, label_degree = prefetcher.next()
    for step in range(steps):
        # logger.get_info('...........'+'step' + str(step) + '............')
        data = img.to(device)  # likely already on device; no-op then
        target = target.to(device)
        # # mixup
        # # generate mixed inputs, two one-hot label vectors and mixing coefficient
        # data, target_a, target_b, lam = mixup_data(
        #     data, target, args.alpha, use_cuda)
        # print(data.shape)
        output = model(data)
        output = output.view(int(batch_size), num_classes)
        target = target.view(int(batch_size), num_classes)
        label = label.view(int(batch_size))
        # target = target.long()
        conf_preds = torch.sigmoid(output)
        # print("conf_preds", conf_preds.shape)
        loss = loss_fn(conf_preds, target)
        optimizer.zero_grad()
        # apex AMP: scale the loss before backward to avoid fp16 underflow.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        # scheduler.step()
        # lr = scheduler.get_last_lr()[0]
        # print(conf_preds.shape)
        # torch.max(a, 1) returns each row's max element (FloatTensor) and
        # its index (LongTensor) — the column index of the row maximum.
        # Threshold each class independently at 0.5.
        predicts = (conf_preds >= 0.5)
        # Prepend a zero column so class 0 = "nothing fired".
        d = torch.Tensor([0] * int(batch_size)).reshape(-1, 1).to(device)
        predicts = torch.cat((d, predicts.float()), 1)
        # logger.get_info(predicts)
        # presumably returns the argmax class index per row — verify MaxIndex
        # in project utils
        predicts = MaxIndex(predicts, batch_size)
        # logger.get_info(predicts)
        # target = (target >= class_point).long()
        acc = (predicts == label).type(
            torch.cuda.FloatTensor).sum() * 1.0 / label.size(0)
        # print(type(predicts), predicts[label_degree >= 20])
        # Recall numerator: among samples with degree >= 20, how many were
        # predicted as a real class (> 1, i.e. past slot 0/1).
        recall_pred = (predicts[label_degree >= 20] > 1).type(
            torch.cuda.FloatTensor).sum() * 1.0
        recall_label = (label_degree >= 20).sum()
        # print('recall_pred : {}, recall_label : {}'.format(recall_pred, recall_label))
        for t in range(num_classes+1):
            for p in range(num_classes+1):
                count = (predicts[label == t] == p).type(
                    torch.cuda.FloatTensor).sum()
                reduced_count = reduce_tensor(count.data, reduction=False)
                confusion_matrix.update(t, p, to_python_float(reduced_count))
        # Cross-rank reductions — same order on every rank.
        reduced_loss = reduce_tensor(loss.data)
        reduced_acc = reduce_tensor(acc.data)
        reduced_pred_20 = reduce_tensor(recall_pred.data)
        # NOTE(review): reduced without `.data`, unlike the others — confirm
        # reduce_tensor handles this consistently.
        reduced_label_20 = reduce_tensor(recall_label)
        train_loss.update(to_python_float(reduced_loss))
        train_acc.update(to_python_float(reduced_acc))
        train_pred_posit.update(to_python_float(reduced_pred_20))
        train_label_posit.update(to_python_float(reduced_label_20))
        if args.local_rank == 0:
            time_spent = time.time() - time_now
            time_now = time.time()
            logging.info(
                'Epoch : {}, Step : {}, Training Loss : {:.5f}, '
                'Training Acc : {:.3f}, Run Time : {:.2f}'
                .format(
                    summary['epoch'] + 1, summary['step'] + 1,
                    train_loss.avg, train_acc.avg, time_spent))
            summary['step'] += 1
        img, target, label, label_degree = prefetcher.next()
    if args.local_rank == 0:
        time_spent = time.time() - time_now
        time_now = time.time()
        recall = train_pred_posit.sum/float(train_label_posit.sum)
        # NOTE(review): logs .val (last batch) while the console log above
        # uses .avg — confirm which is intended for TensorBoard.
        summary_writer.add_scalar(
            'train/loss', train_loss.val, epoch)
        summary_writer.add_scalar(
            'train/acc', train_acc.val, epoch)
        summary_writer.add_scalar('train/recall', recall, epoch)
        # summary_writer.add_scalar(
        #     'learning_rate', lr, summary['step'] + steps*epoch)
        summary_writer.flush()
        summary['confusion_matrix'] = plot_confusion_matrix(
            confusion_matrix.matrix, cfg['labels'],
            tensor_name='train/Confusion matrix')
        # summary['loss'] = train_loss.avg
        # summary['acc'] = acc_sum / (steps * (batch_size))
        # summary['acc'] = train_acc.avg
        summary['epoch'] = epoch
        print("Recall >=20:", recall)
    return summary