def train_model(model, optimizer, dataloader, epoch):
    model.train()
    running_mask_loss, running_size_loss, running_offset_loss = 0.0, 0.0, 0.0
    for batch_idx, (img_batch, mask_batch) in enumerate(dataloader):
        img_batch = img_batch.to(device)
        mask_batch = mask_batch.to(device)
        # center_index = center_index.to(device)
        optimizer.zero_grad()
        output = model(img_batch)
        mask_loss, size_loss, offset_loss = criterion(output, mask_batch)
        loss = mask_loss + size_loss + offset_loss
        loss.backward()
        optimizer.step()
        # Accumulate Python floats via .item(), not tensors, so the
        # computation graph is not kept alive across iterations.
        running_mask_loss += mask_loss.item()
        running_size_loss += size_loss.item()
        running_offset_loss += offset_loss.item()
        if batch_idx % 5 == 0:
            print(
                f'\r{running_mask_loss/(batch_idx+1):.3f} '
                f'{running_size_loss/(batch_idx+1):.3f} '
                f'{running_offset_loss/(batch_idx+1):.3f}',
                end='', flush=True)
    print('\r', end='', flush=True)
    # batch_idx is zero-based, so the number of batches is batch_idx + 1.
    print(
        f"Epoch: {epoch} mask_loss: {running_mask_loss/(batch_idx+1):.3f} "
        f"size_loss: {running_size_loss/(batch_idx+1):.3f} "
        f"offset_loss: {running_offset_loss/(batch_idx+1):.3f}"
    )
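
# train_model relies on module-level `device` and `criterion` globals.
# A minimal sketch of the three-term criterion it assumes — illustrative
# only: the channel layout (0 = center heatmap, 1-2 = size, 3-4 = offset)
# and the BCE/L1 terms are assumptions, not the project's real losses.
import torch
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def criterion(output, target):
    heatmap, size, offset = output[:, :1], output[:, 1:3], output[:, 3:5]
    t_heat, t_size, t_off = target[:, :1], target[:, 1:3], target[:, 3:5]
    mask_loss = F.binary_cross_entropy_with_logits(heatmap, t_heat)
    size_loss = F.l1_loss(size, t_size)
    offset_loss = F.l1_loss(offset, t_off)
    return mask_loss, size_loss, offset_loss
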
def train(model, optimizer, epochs, trainloader, valloader):
    for epoch_index in range(epochs):
        # val() switches the model to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        pbar = Progbar(target=len(trainloader))
        index_train = epoch_index * len(trainloader)
        running_loss = 0.0
        for batch_index, data in enumerate(trainloader):
            # Global step index across epochs, used for the cyclical LR.
            batch_index_ = batch_index + index_train
            lr = ajust_learning_tri(optimizer, batch_index_,
                                    step_size=len(trainloader) * 2)
            # Input data
            inputs, labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            outputs = model(inputs)
            cost = criterion(outputs, labels)
            # Zero the gradients
            optimizer.zero_grad()
            cost.backward()
            # Update the parameters
            optimizer.step()
            running_loss += cost.item()
            pbar.update(batch_index + 1,
                        values=[('loss', running_loss / (batch_index + 1)),
                                ('epoch', epoch_index)])
            # lr_list is a module-level list tracking the LR schedule.
            lr_list.append(lr)
        val(model, valloader)
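
# ajust_learning_tri is defined elsewhere; its name and the
# step_size=2*len(trainloader) argument suggest a triangular cyclical
# LR schedule. A sketch of one plausible implementation (assumed, not
# the project's actual code; base_lr and max_lr are hypothetical):
import numpy as np

def ajust_learning_tri(optimizer, step, step_size,
                       base_lr=1e-4, max_lr=1e-2):
    # Triangular wave: lr climbs from base_lr to max_lr over step_size
    # steps, then falls back, repeating every 2 * step_size steps.
    cycle = np.floor(1 + step / (2 * step_size))
    x = np.abs(step / step_size - 2 * cycle + 1)
    lr = base_lr + (max_lr - base_lr) * max(0.0, 1.0 - x)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
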
def evaluate(model, dataloader):
    model.eval()
    eval_loss = 0.0
    with torch.no_grad():
        for idx, data in enumerate(dataloader, 0):
            X, Y = data
            X = X.to(_DEVICE_)
            Y = Y.to(_DEVICE_)
            res = model(X)
            batch_loss = criterion(res, Y)
            # .item() keeps the running total as a Python float rather
            # than a tensor held on the device.
            eval_loss += batch_loss.item()
    eval_loss = eval_loss / len(dataloader)
    return eval_loss
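
# A sketch of how evaluate() could drive checkpointing across epochs
# (assumed usage; num_epochs, train_one_epoch_fn, and val_loader are
# hypothetical stand-ins for the project's own names):
best_loss = float('inf')
for epoch in range(num_epochs):
    train_one_epoch_fn()
    val_loss = evaluate(model, val_loader)
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
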
def val(model, valloader):
    correct = 0  # number of correctly predicted images
    total = 0    # total number of images
    # Gradients are not needed during evaluation, so autograd is
    # disabled temporarily to speed things up and save memory.
    model.eval()
    with torch.no_grad():
        running_loss = 0.0
        pbar = Progbar(target=len(valloader))
        for i, data in enumerate(valloader):
            images, labels = data
            images = images.cuda()
            labels = labels.cuda()
            outputs = model(images)
            cost = criterion(outputs, labels)
            running_loss += cost.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.cpu().numpy().shape[0]
            correct += (predicted.cpu().numpy() == labels.cpu().numpy()).sum()
            acc = correct / total
            pbar.update(i + 1, values=[('loss', running_loss / (i + 1)),
                                       ('acc', acc)])
    save_model(model, acc, distributed=False)
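
# save_model is defined elsewhere; a sketch of one plausible
# implementation that checkpoints the weights with the accuracy in the
# filename (assumed, including the path pattern):
import torch

def save_model(model, acc, distributed=False):
    # For DataParallel / DistributedDataParallel models the weights
    # live under model.module.
    state = model.module.state_dict() if distributed else model.state_dict()
    torch.save(state, f'checkpoint_acc{acc:.4f}.pth')
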
def train_one_epoch(self):
    train_annot_dir = self.train_config['train_annot_dir']
    val_annot_dir = self.train_config['val_annot_dir']
    # Skip the epoch until both annotation directories contain photos.
    # any() is required here: a non-empty list of booleans is always
    # truthy, so the original list-comprehension check never fired.
    if not any(is_photo(a) for a in ls(train_annot_dir)):
        return
    if not any(is_photo(a) for a in ls(val_annot_dir)):
        return
    if self.first_loop:
        self.first_loop = False
        self.write_message('Training started')
        self.log('Starting Training')
    train_loader = DataLoader(
        self.train_set,
        self.bs,
        shuffle=True,
        # 12 workers is good for performance
        # on 2 RTX2080 Tis
        # 0 workers is good for debugging
        num_workers=12,
        drop_last=False,
        pin_memory=True)
    epoch_start = time.time()
    self.model.train()
    tps = 0
    fps = 0
    tns = 0
    fns = 0
    defined_total = 0
    loss_sum = 0
    for step, (photo_tiles,
               foreground_tiles,
               defined_tiles) in enumerate(train_loader):
        self.check_for_instructions()
        photo_tiles = photo_tiles.cuda()
        foreground_tiles = foreground_tiles.cuda()
        defined_tiles = defined_tiles.cuda()
        self.optimizer.zero_grad()
        outputs = self.model(photo_tiles)
        softmaxed = softmax(outputs, 1)
        # just the foreground probability.
        foreground_probs = softmaxed[:, 1, :]
        # Remove any of the predictions for which we don't have ground
        # truth: set outputs to 0 where the annotation is undefined so
        # that the network can predict whatever it wants without penalty.
        outputs[:, 0] *= defined_tiles
        outputs[:, 1] *= defined_tiles
        loss = criterion(outputs, foreground_tiles)
        loss.backward()
        self.optimizer.step()
        foreground_probs *= defined_tiles
        predicted = foreground_probs > 0.5
        # We only want to calculate metrics on the part of the
        # predictions for which annotations are defined, so remove all
        # predictions and foreground labels where we didn't have any
        # annotation.
        defined_list = defined_tiles.view(-1)
        preds_list = predicted.view(-1)[defined_list > 0]
        foregrounds_list = foreground_tiles.view(-1)[defined_list > 0]
        # Calculate all the false positives, false negatives etc.
        tps += torch.sum(
            (foregrounds_list == 1) * (preds_list == 1)).cpu().numpy()
        tns += torch.sum(
            (foregrounds_list == 0) * (preds_list == 0)).cpu().numpy()
        fps += torch.sum(
            (foregrounds_list == 0) * (preds_list == 1)).cpu().numpy()
        fns += torch.sum(
            (foregrounds_list == 1) * (preds_list == 0)).cpu().numpy()
        defined_total += torch.sum(defined_list > 0).cpu().numpy()
        loss_sum += loss.item()  # float
        sys.stdout.write(f"Training {(step+1) * self.bs}/"
                         f"{len(train_loader.dataset)} "
                         f" loss={round(loss.item(), 3)} \r")
        self.check_for_instructions()  # could update training parameter
        if not self.training:
            return
    duration = round(time.time() - epoch_start, 3)
    print('epoch train duration', duration)
    self.log_metrics(
        'train', get_metrics(tps, fps, tns, fns, defined_total, duration))
    before_val_time = time.time()
    self.validation()
    print('epoch validation duration', time.time() - before_val_time)
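
# get_metrics is defined elsewhere; a sketch of the metrics it could
# plausibly derive from the confusion counts above (assumed, not the
# actual implementation):
def get_metrics(tps, fps, tns, fns, defined_total, duration):
    precision = tps / (tps + fps) if (tps + fps) else float('nan')
    recall = tps / (tps + fns) if (tps + fns) else float('nan')
    f1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) else float('nan'))
    return {
        'accuracy': (tps + tns) / defined_total,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'duration': duration,
    }
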
def main():
    torch.backends.cudnn.benchmark = True
    num_train_imgs = len(loaders_dict['train'].dataset)
    num_val_imgs = len(loaders_dict['val'].dataset)
    batch_size = loaders_dict['train'].batch_size
    logs = []
    for epoch in range(num_epochs):
        t_epoch_start = time.time()
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0
        epoch_train_score = 0.0
        epoch_val_score = 0.0
        print('-----------------------')
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-----------------------')
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
                optimizer.zero_grad()
            else:
                model.eval()
            count = 0
            for img_batch, label_batch in loaders_dict[phase]:
                if use_mixup and (phase == 'train'):
                    mixup_flag = np.random.randint(use_mixup) == 1
                    if mixup_flag:
                        img_batch, label_batch = mixup(
                            img_batch, label_batch, alpha=1, n_classes=18)
                img_batch = img_batch.to(device, dtype=torch.float)
                label_batch = label_batch.to(device, dtype=torch.float)
                # Gradient accumulation: step the optimizer only once
                # every batch_multiplier batches.
                if (phase == 'train') and (count == 0):
                    optimizer.step()
                    optimizer.zero_grad()
                    count = batch_multiplier
                with torch.set_grad_enabled(phase == 'train'):
                    output = torch.sigmoid(model(img_batch))
                    loss = criterion(output, label_batch)
                    loss /= batch_multiplier
                    if phase == 'train':
                        loss.backward()
                        count -= 1
                        epoch_train_loss += loss.item() * batch_multiplier
                        for pred, label in zip(output, label_batch):
                            pred = pred.detach().cpu().numpy()
                            label = label.detach().cpu().numpy()
                            epoch_train_score += metric(label, pred)
                    else:
                        epoch_val_loss += loss.item() * batch_multiplier
                        for pred, label in zip(output, label_batch):
                            pred = pred.detach().cpu().numpy()
                            label = label.detach().cpu().numpy()
                            epoch_val_score += metric(label, pred)
        train_loss = epoch_train_loss / num_train_imgs
        val_loss = epoch_val_loss / num_val_imgs
        train_score = epoch_train_score / num_train_imgs
        val_score = epoch_val_score / num_val_imgs
        t_epoch_finish = time.time()
        print(f'epoch: {epoch+1}')
        print(f'Epoch_Train_Loss: {train_loss:.3f}')
        print(f'Epoch_Val_Loss: {val_loss:.3f}\n')
        print(f'Epoch_Train_Score: {train_score:.3f}')
        print(f'Epoch_Val_Score: {val_score:.3f}\n')
        print(f'timer: {t_epoch_finish - t_epoch_start:.3f} sec.', '\n')
        t_epoch_start = time.time()
        for g in optimizer.param_groups:
            print('lr: ', g['lr'], '\n\n')
        log_epoch = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'train_score': train_score,
            'val_score': val_score,
        }
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv(f'{export_model}/log.csv', index=False)
        torch.save(model.state_dict(),
                   f'{export_model}/model_epoch{epoch+1}.pth')
        scheduler.step(val_loss)
    df = pd.read_csv(f'{export_model}/log.csv')
    plt.plot(df['train_loss'], label='train loss')
    plt.plot(df['val_loss'], label='val loss')
    plt.legend()
    plt.savefig(f'{export_figure}/loss.png')
    plt.close()
    plt.plot(df['train_score'], label='train score')
    plt.plot(df['val_score'], label='val score')
    plt.legend()
    plt.savefig(f'{export_figure}/score.png')
    plt.close()
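
# mixup is defined elsewhere; a sketch of a standard mixup consistent
# with the call above (assumed, including the one-hot handling for
# integer labels — skip that branch if labels are already multi-hot):
import numpy as np
import torch

def mixup(img_batch, label_batch, alpha=1, n_classes=18):
    # Convert integer class labels to one-hot so they can be blended.
    if label_batch.dim() == 1:
        label_batch = torch.nn.functional.one_hot(
            label_batch.long(), n_classes).float()
    # Blend each sample with a randomly chosen partner from the batch.
    lam = np.random.beta(alpha, alpha)
    perm = torch.randperm(img_batch.size(0))
    mixed_imgs = lam * img_batch + (1 - lam) * img_batch[perm]
    mixed_labels = lam * label_batch + (1 - lam) * label_batch[perm]
    return mixed_imgs, mixed_labels
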
epoch_loss = 0.0
epoch_since = time.time()
model.train()
for idx, data in enumerate(dataloader['train'], 0):
    with torch.set_grad_enabled(True):
        images, detections = data
        images = images.to(_DEVICE_)
        detections = detections.to(_DEVICE_)
        optimiser.zero_grad()
        predictions = model(images)
        batch_loss = criterion(predictions, detections)
        epoch_loss += batch_loss.item()
        if idx % 100 == 0:
            logger.info(
                f"\tIteration {idx+1}/{len(dataloader['train'])}: "
                f"Loss = {batch_loss.item()}"
            )
        batch_loss.backward()
        optimiser.step()
epoch_loss = epoch_loss / len(dataloader['train'])
epoch_elapsed = time.time() - epoch_since
logger.info(
    f"\tAverage Train Epoch loss is {epoch_loss:.2f} "
    f"[{epoch_elapsed//60:.0f}m {epoch_elapsed%60:.0f}s]"
)
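
# The snippet above assumes a configured `logger`; a minimal setup
# sketch using the standard logging module (assumed, not the project's
# actual configuration):
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)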