resp_offset = resp_offset[iou_mask] resp_strd = resp_strd[iou_mask] conf = resp_true_pred[iou_mask][:, 4].mean().item() class_mask = targets[:, 5:].type(torch.BoolTensor).squeeze(0) if (iou_mask.sum() == class_mask.shape[0]): pos_class = resp_true_pred[iou_mask][:, 5:][class_mask].mean().item() neg_class = resp_true_pred[iou_mask][:, 5:][~class_mask].mean().item( ) else: pos_class = 0 neg_class = 0 loss = util.yolo_loss(resp_raw_pred, targets, no_obj, mask, resp_anchors, resp_offset, resp_strd, inp_dim, hyperparameters) loss.backward() optimizer.step() avg_conf = avg_conf + conf avg_no_conf = avg_no_conf + no_obj_conf avg_pos = avg_pos + pos_class avg_neg = avg_neg + neg_class total_loss = total_loss + loss.item() avg_iou = avg_iou + iou prg_counter = prg_counter + 1 # sys.stdout.write('\rPgr:'+str(prg_counter/dataset_len*100*batch_size)+'%' ' L:'+ str(loss.item())) # sys.stdout.write(' IoU:' +str(iou)+' pob:'+str(conf)+ ' nob:'+str(no_obj_conf)) # sys.stdout.write(' PCls:' +str(pos_class)+' ncls:'+str(neg_class))
def train_one_epoch(model, optimizer, dataloader, hyperparameters, mode):
    """Train ``model`` for one pass over ``dataloader`` and return averages.

    For every batch the function runs the forward pass, decodes the raw
    predictions with ``util.transform``, matches them to the targets with
    ``util.get_iou_list`` / ``util.build_tensors``, computes
    ``util.yolo_loss``, back-propagates and steps the optimizer.

    Inputs and the anchor/offset/stride tensors are moved with ``.cuda()``
    unconditionally, so a CUDA device is required.

    Args:
        model: Network, optionally wrapped in ``nn.DataParallel``, exposing
            ``inp_dim``, ``pw_ph``, ``cx_cy`` and ``stride``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        dataloader: Iterable of ``(images, targets)`` pairs that also exposes
            ``.dataset`` and ``.batch_size`` (used for the progress figure).
        hyperparameters: Mapping forwarded to the ``util`` helpers; the
            ``'wasserstein'`` key is read here.
        mode: Mapping of flags: ``'show_temp_summary'``, ``'debugging'``,
            ``'bayes_opt'`` (only consulted when not debugging) and
            ``'show_output'``.

    Returns:
        dict with keys ``avg_loss``, ``avg_iou``, ``avg_pos``, ``avg_neg``,
        ``avg_conf``, ``avg_no_conf`` and ``broken``. ``broken`` is 1 when
        the epoch was aborted because the forward pass produced ``inf`` or,
        in ``bayes_opt`` mode, ``util.yolo_loss`` raised ``RuntimeError``.

    Notes:
        Averages are taken over the batches that completed an optimizer
        step. A batch that aborts the epoch contributes nothing and is not
        counted, and an epoch without any completed batch yields zeros
        instead of raising ``ZeroDivisionError``.
    """
    model.train()

    # Flags keep the explicit ``== True`` comparison so that only True/1
    # enable a feature, exactly as before.
    show_summary = mode['show_temp_summary'] == True
    writer = SummaryWriter('../tensorboard/test_vis/') if show_summary else None

    # The geometry tensors live on the wrapped module under DataParallel.
    net = model.module if isinstance(model, nn.DataParallel) else model
    inp_dim = net.inp_dim
    pw_ph = net.pw_ph.cuda()
    cx_cy = net.cx_cy.cuda()
    stride = net.stride.cuda()

    break_flag = 0
    dataset_len = len(dataloader.dataset)
    batch_size = dataloader.batch_size

    total_loss = 0
    avg_iou = 0
    avg_conf = 0
    avg_no_conf = 0
    avg_pos = 0
    avg_neg = 0
    seen_batches = 0       # batches pulled from the loader (progress / x-axis)
    completed_batches = 0  # batches that reached optimizer.step()

    try:
        for images, targets in dataloader:
            seen_batches += 1
            optimizer.zero_grad()
            images = images.cuda()

            if mode['debugging'] == True:
                with autograd.detect_anomaly():
                    raw_pred = model(images, torch.cuda.is_available())
            else:
                raw_pred = model(images, torch.cuda.is_available())

            # A diverged forward pass makes the rest of the epoch pointless.
            if torch.isinf(raw_pred).any():
                break_flag = 1
                break

            # Decoded boxes are only used for matching and statistics, so
            # they are computed on a detached copy.
            true_pred = util.transform(raw_pred.clone().detach(), pw_ph,
                                       cx_cy, stride)
            iou_list = util.get_iou_list(true_pred, targets, hyperparameters,
                                         inp_dim)
            (resp_raw_pred, resp_cx_cy, resp_pw_ph, resp_stride,
             no_obj) = util.build_tensors(raw_pred, iou_list, pw_ph, cx_cy,
                                          stride, hyperparameters)

            # Statistics use the no_obj tensor from build_tensors, before
            # the optional Wasserstein replacement below.
            stats = helper.get_progress_stats(true_pred, no_obj, iou_list,
                                              targets)

            if hyperparameters['wasserstein'] == True:
                no_obj = util.get_wasserstein_matrices(raw_pred, iou_list,
                                                       inp_dim)

            loss_args = (resp_raw_pred, targets, no_obj, resp_pw_ph,
                         resp_cx_cy, resp_stride, inp_dim, hyperparameters)
            if mode['debugging'] == True:
                with autograd.detect_anomaly():
                    loss = util.yolo_loss(*loss_args)
            elif mode['bayes_opt'] == True:
                try:
                    loss = util.yolo_loss(*loss_args)
                except RuntimeError:
                    # A failed trial is reported through 'broken', not raised.
                    break_flag = 1
                    break
            else:
                loss = util.yolo_loss(*loss_args)

            loss.backward()
            optimizer.step()

            # Read the scalar once; every .item() call forces a device sync.
            loss_value = loss.item()
            completed_batches += 1
            avg_conf = avg_conf + stats['pos_conf']
            avg_no_conf = avg_no_conf + stats['neg_conf']
            avg_pos = avg_pos + stats['pos_class']
            avg_neg = avg_neg + stats['neg_class']
            total_loss = total_loss + loss_value
            avg_iou = avg_iou + stats['iou']

            if mode['show_output'] == True:
                sys.stdout.write(
                    '\rPgr:'
                    + str(seen_batches / dataset_len * 100 * batch_size)
                    + '%' ' L:' + str(loss_value))
                sys.stdout.write(' IoU:' + str(stats['iou']) + ' pob:'
                                 + str(stats['pos_conf']) + ' nob:'
                                 + str(stats['neg_conf']))
                sys.stdout.write(' PCls:' + str(stats['pos_class'])
                                 + ' ncls:' + str(stats['neg_class']))
                sys.stdout.flush()

            if writer is not None:
                # Running means so far; here every seen batch has completed.
                running = (
                    ('AvLoss/train', total_loss),
                    ('AvIoU/train', avg_iou),
                    ('AvPConf/train', avg_conf),
                    ('AvNConf/train', avg_no_conf),
                    ('AvClass/train', avg_pos),
                    ('AvNClass/train', avg_neg),
                )
                for tag, running_sum in running:
                    writer.add_scalar(tag, running_sum / completed_batches,
                                      seen_batches)
    finally:
        # Flush and release the event file even if a batch raised.
        if writer is not None:
            writer.close()

    # max(..., 1) keeps the empty / immediately-aborted epoch at zero.
    denominator = max(completed_batches, 1)
    outcome = {
        'avg_loss': total_loss / denominator,
        'avg_iou': avg_iou / denominator,
        'avg_pos': avg_pos / denominator,
        'avg_neg': avg_neg / denominator,
        'avg_conf': avg_conf / denominator,
        'avg_no_conf': avg_no_conf / denominator,
        'broken': break_flag
    }
    return outcome
def train(trainloader, model, optimizer, epoch, cuda=True):
    """Train ``model`` for one epoch and return the running averages.

    For every batch the function runs the forward pass, decodes the raw
    predictions with ``util.transform``, matches them to the targets with
    ``util.get_iou_list`` / ``util.build_tensors``, computes
    ``util.yolo_loss``, back-propagates, steps the optimizer and reports the
    running means through ``track.metric`` (and TensorBoard when enabled).

    Args:
        trainloader: Sized iterable of ``(inputs, targets)`` batches.
        model: Network, optionally wrapped in ``nn.DataParallel``, exposing
            ``inp_dim``, ``pw_ph``, ``cx_cy``, ``stride``, ``hp``
            (hyperparameter mapping) and ``mode`` (flag mapping with
            ``'show_temp_summary'`` and ``'show_output'``).
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        epoch: Epoch index, used to compute the global iteration number.
        cuda: When true, inputs and the anchor/offset/stride tensors are
            moved with ``.cuda()``.

    Returns:
        dict with keys ``avg_loss``, ``avg_iou``, ``avg_pos``, ``avg_neg``,
        ``avg_conf``, ``avg_no_conf`` and ``broken``. ``broken`` is 1 when
        ``util.yolo_loss`` raised ``RuntimeError`` and the epoch was aborted.
    """
    # switch to train mode
    model.train()

    # Under DataParallel the custom attributes live on the wrapped module.
    # ``hp`` / ``mode`` are looked up on the wrapper first so a caller that
    # attached them there keeps working.
    net = model.module if isinstance(model, nn.DataParallel) else model
    hyperparameters = model.hp if hasattr(model, 'hp') else net.hp
    mode = model.mode if hasattr(model, 'mode') else net.mode

    inp_dim = net.inp_dim
    pw_ph = net.pw_ph
    cx_cy = net.cx_cy
    stride = net.stride

    if cuda:
        pw_ph = pw_ph.cuda()
        cx_cy = cx_cy.cuda()
        stride = stride.cuda()

    # batch_time / data_time are updated but never read in this function.
    batch_time = AverageMeter()
    data_time = AverageMeter()
    avg_loss = AverageMeter()
    avg_iou = AverageMeter()
    avg_conf = AverageMeter()
    avg_no_conf = AverageMeter()
    avg_pos = AverageMeter()
    avg_neg = AverageMeter()

    end = time.time()
    break_flag = 0
    num_batches = len(trainloader)

    # Explicit ``== True`` kept so that only True/1 enable the flag.
    show_summary = mode['show_temp_summary'] == True
    writer = None
    if show_summary:
        writer = SummaryWriter(os.path.join(track.trial_dir(), 'temp_vis/'))

    try:
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            # measure data loading time
            data_time.update(time.time() - end)

            if cuda:
                inputs = inputs.cuda()

            # compute output
            # NOTE(review): the model is told about CUDA availability rather
            # than the ``cuda`` argument; confirm that cuda=False on a CUDA
            # machine is a supported combination.
            raw_pred = model(inputs, torch.cuda.is_available())

            # Decoded boxes are only used for matching and statistics, so
            # they are computed on a detached copy.
            true_pred = util.transform(raw_pred.clone().detach(), pw_ph,
                                       cx_cy, stride)
            iou_list = util.get_iou_list(true_pred, targets, hyperparameters,
                                         inp_dim)
            (resp_raw_pred, resp_cx_cy, resp_pw_ph, resp_stride,
             no_obj) = util.build_tensors(raw_pred, iou_list, pw_ph, cx_cy,
                                          stride, hyperparameters)

            # Statistics use the no_obj tensor from build_tensors, before
            # the optional Wasserstein replacement below.
            stats = helper.get_progress_stats(true_pred, no_obj, iou_list,
                                              targets)

            if hyperparameters['wasserstein'] == True:
                no_obj = util.get_wasserstein_matrices(raw_pred, iou_list,
                                                       inp_dim)

            try:
                loss = util.yolo_loss(resp_raw_pred, targets, no_obj,
                                      resp_pw_ph, resp_cx_cy, resp_stride,
                                      inp_dim, hyperparameters)
            except RuntimeError:
                # A failed trial is reported through 'broken', not raised.
                print('bayes opt failed')
                break_flag = 1
                break

            # measure accuracy and record loss
            # Read the scalar once; every .item() call forces a device sync.
            loss_value = loss.item()
            avg_loss.update(loss_value)
            avg_iou.update(stats['iou'])
            avg_conf.update(stats['pos_conf'])
            avg_no_conf.update(stats['neg_conf'])
            avg_pos.update(stats['pos_class'])
            avg_neg.update(stats['neg_class'])

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if mode['show_output'] == True:
                # plot progress
                progress_str = 'Loss: %.4f | AvIoU: %.3f | AvPConf: %.3f | AvNConf: %.5f | AvClass: %.3f | AvNClass: %.5f'\
                    % (loss_value, stats['iou'], stats['pos_conf'],
                       stats['neg_conf'], stats['pos_class'],
                       stats['neg_class'])
                progress_bar(batch_idx, num_batches, progress_str)

            # Global step across epochs, shared by TensorBoard and track.
            iteration = epoch * num_batches + batch_idx

            if writer is not None:
                writer.add_scalar('AvLoss/train', avg_loss.avg, iteration)
                writer.add_scalar('AvIoU/train', avg_iou.avg, iteration)
                writer.add_scalar('AvPConf/train', avg_conf.avg, iteration)
                writer.add_scalar('AvNConf/train', avg_no_conf.avg, iteration)
                writer.add_scalar('AvClass/train', avg_pos.avg, iteration)
                writer.add_scalar('AvNClass/train', avg_neg.avg, iteration)

            track.metric(iteration=iteration,
                         epoch=epoch,
                         avg_train_loss=avg_loss.avg,
                         avg_train_iou=avg_iou.avg,
                         avg_train_conf=avg_conf.avg,
                         avg_train_neg_conf=avg_no_conf.avg,
                         avg_train_pos=avg_pos.avg,
                         avg_train_neg=avg_neg.avg)
    finally:
        # Flush and release the event file even if a batch raised.
        if writer is not None:
            writer.close()

    outcome = {
        'avg_loss': avg_loss.avg,
        'avg_iou': avg_iou.avg,
        'avg_pos': avg_pos.avg,
        'avg_neg': avg_neg.avg,
        'avg_conf': avg_conf.avg,
        'avg_no_conf': avg_no_conf.avg,
        'broken': break_flag
    }
    return outcome
# Per-sample training script: every row of ``df`` describes one image and
# one ground-truth box; the model is updated after each row.
epochs = 20
lock = 0
total_loss = 0

# Progress and the reported average are relative to the number of rows that
# are actually iterated, not a fixed dataset size. max(..., 1) keeps the
# final division defined for an empty frame.
num_samples = max(len(df), 1)

# Factors that rescale annotation coordinates to the 416x416 network input.
# NOTE(review): 1980 looks like it may be meant to be 1920 (the usual width
# paired with a 1080 height) -- confirm against the source footage.
x_scale = 416 / 1980
y_scale = 416 / 1080

for e in range(epochs):
    prg_counter = 0
    total_loss = 0
    print("\n epoch " + str(e))
    for index, row in df.iterrows():
        optimizer.zero_grad()

        # The image file is named after the clip and the first frame of
        # its 'start:end' frame span.
        imgpath = ('../images/images/' + row['filename'] + '_img'
                   + row['framespan'].split(':')[0] + '.jpg')
        inp = get_test_input(imgpath)

        # Single target of the form [x, y, width, height, 1, 1].
        targets = torch.tensor([[[
            row['x'] * x_scale, row['y'] * y_scale,
            row['width'] * x_scale, row['height'] * y_scale, 1, 1
        ]]])

        pred = model(inp, torch.cuda.is_available())
        pred = pred.to(device='cuda')
        targets = targets.to(device='cuda')

        loss = util.yolo_loss(pred, targets)
        loss.backward()
        optimizer.step()

        # Read the scalar once; every .item() call forces a device sync.
        loss_value = loss.item()
        sys.stdout.write('\r Progress is '
                         + str(prg_counter / num_samples * 100)
                         + '%' ' loss is: ' + str(loss_value))
        sys.stdout.flush()
        prg_counter = prg_counter + 1
        total_loss = total_loss + loss_value

    # total_loss is reset at the top of every epoch, so the checkpoint and
    # the average are produced once per epoch.
    torch.save(model.state_dict(), PATH)
    print('\n total average loss is ' + str(total_loss / num_samples))
# Fragment of a training-loop body. The enclosing loop/function header is not
# visible here and the original line breaks and indentation have been lost,
# so the statements are kept exactly as found.
#
# What the visible statements do, in order:
#   * clone pw_ph / cx_cy / stride into anchors / offset / strd;
#   * take a clone of raw_pred[:, :, 4:5] as noobj_box and filter it with
#     noobj_mask.T (index 4 is presumably the objectness score -- TODO confirm);
#   * filter raw_pred, anchors, offset and strd with iou_mask.T;
#   * if exactly one row survives (strd.shape[0] == 1): rescale
#     target[:, 0:4] by inp_dim / strd, squeeze axis -2, convert it with
#     util.transform_groundtruth, compute util.yolo_loss(raw_pred, target,
#     noobj_box, 1), back-propagate, step the optimizer, write progress to
#     stdout and accumulate total_loss;
#   * otherwise print 'missed'; prg_counter is incremented on both paths;
#   * finally save model.state_dict() to PATH and print total_loss / 9570.
#
# NOTE(review): 9570 is a hard-coded sample count used for both the progress
# percentage and the average -- verify it matches the dataset being iterated.
anchors = pw_ph.clone() offset = cx_cy.clone() strd = stride.clone() noobj_box = raw_pred[:, :, 4:5].clone() noobj_box = noobj_box[noobj_mask.T, :] raw_pred = raw_pred[iou_mask.T, :] anchors = anchors[iou_mask.T, :] offset = offset[iou_mask.T, :] strd = strd[iou_mask.T, :] if (strd.shape[0] == 1): target[:, 0:4] = target[:, 0:4] * (inp_dim / strd) target = target.squeeze(-2) target = util.transform_groundtruth(target, anchors, offset) loss = util.yolo_loss(raw_pred, target, noobj_box, 1) loss.backward() optimizer.step() sys.stdout.write('\r Progress is ' + str(prg_counter / 9570 * 100) + '%' ' loss is: ' + str(loss.item())) sys.stdout.flush() prg_counter = prg_counter + 1 total_loss = total_loss + loss.item() else: print('missed') prg_counter = prg_counter + 1 torch.save(model.state_dict(), PATH) print('\n total average loss is ' + str(total_loss / 9570))
# Fragment of a batched training-loop body. The enclosing loop/function
# header is not visible here and the original line breaks and indentation
# have been lost, so the statements are kept exactly as found; which of them
# belong to the `if` below cannot be recovered from this text.
#
# What the visible statements do, in order:
#   * filter noobj_box with noobj_mask.T and record its mean as no_obj_conf;
#   * filter raw_pred, anchors, offset and strd with iou_mask.T;
#   * test that the number of surviving rows equals the first dimension of
#     sample_batched['image'] (the author's inline comment on the condition
#     follows it);
#   * convert target with util.xyxy_to_xywh, squeeze axis 1, transform it
#     with util.transform_groundtruth(target, anchors, offset, strd), compute
#     util.yolo_loss(raw_pred, target, noobj_box, batch_size), back-propagate
#     and step the optimizer;
#   * accumulate total_loss and avg_iou and write progress, IoU and
#     confidence figures to stdout;
#   * delete the per-batch names and call torch.cuda.empty_cache(), then
#     increment prg_counter.
noobj_box = noobj_box[noobj_mask.T, :] no_obj_conf = noobj_box.mean().item() raw_pred = raw_pred[iou_mask.T, :] anchors = anchors[iou_mask.T, :] offset = offset[iou_mask.T, :] strd = strd[iou_mask.T, :] if ( strd.shape[0] == sample_batched['image'].shape[0] ): #this means that iou_mask failed and was all true, because max of zeros is true for all lenght of mask strd target = util.xyxy_to_xywh(target) target = target.squeeze(1) target = util.transform_groundtruth(target, anchors, offset, strd) loss = util.yolo_loss(raw_pred, target, noobj_box, batch_size) loss.backward() optimizer.step() total_loss = total_loss + loss.item() avg_iou = avg_iou + iou sys.stdout.write('\r Progress is ' + str(prg_counter / dataset_len * 100 * batch_size) + '%' ' loss is: ' + str(loss.item())) sys.stdout.write(' Iou is ' + str(iou) + ' conf is ' + str(conf) + ' no_obj conf is ' + str(no_obj_conf)) sys.stdout.flush() del loss, raw_pred, target, true_pred, sample_batched[ 'image'], iou, noobj_box, conf torch.cuda.empty_cache() prg_counter = prg_counter + 1