def test_nested_model_sessions():
    """Print ``model.training`` while entering and leaving nested
    TorchEvalSession/TorchTrainSession context managers, showing how the
    flag is toggled at each depth and restored on exit."""
    model = torch.nn.Sequential(torch.nn.Linear(1, 1), torch.nn.Dropout(0.1))

    def show_flag():
        # report the model's current train/eval state
        print(model.training)

    show_flag()
    with TorchEvalSession(model):
        show_flag()
        with TorchTrainSession(model):
            show_flag()
            with TorchEvalSession(model):
                show_flag()
                with TorchTrainSession(model):
                    show_flag()
                    with TorchEvalSession(model):
                        show_flag()
    show_flag()
def train_model(
        model,
        optimiser,
        epoch_i: int,
        metric_writer: Writer,
        loader: DataLoader,
        log_interval=10,
):
    """Run one training epoch of a VAE-style model.

    :param model: module returning ``(reconstruction, mean, log_var)``
    :param optimiser: optimiser stepping ``model``'s parameters
    :param epoch_i: current epoch index (used for progress text only)
    :param metric_writer: receives the per-batch ``train_loss`` scalar
    :param loader: training loader; batches are ``(original, *rest)``
    :param log_interval: refresh the progress-bar description every N batches
    """
    with TorchTrainSession(model):
        train_accum_loss = 0
        generator = tqdm(enumerate(loader))
        for batch_idx, (original, *_) in generator:
            original = original.to(global_torch_device())
            optimiser.zero_grad()
            reconstruction, mean, log_var = model(original)
            loss = loss_function(reconstruction, original, mean, log_var)
            loss.backward()
            optimiser.step()
            train_accum_loss += loss.item()
            metric_writer.scalar("train_loss", loss.item())
            if batch_idx % log_interval == 0:
                generator.set_description(
                    f"Train Epoch: {epoch_i}"
                    f" [{batch_idx * len(original)}/"
                    f"{len(loader.dataset)}"
                    f" ({100. * batch_idx / len(loader):.0f}%)]\t"
                    f"Loss: {loss.item() / len(original):.6f}")
            # BUG FIX: removed a stray `break` here (debug leftover) that
            # terminated the epoch after the very first batch, since
            # batch_idx 0 always satisfies `0 % log_interval == 0`.
        print(f"====> Epoch: {epoch_i}"
              f" Average loss: {train_accum_loss / len(loader.dataset):.4f}")
def single_epoch_fitting(
        model: torch.nn.Module,
        optimiser,
        train_loader_,
        *,
        epoch: int = None,
        writer: Writer = None,
        device_: torch.device = global_torch_device()) -> None:
    """Fit ``model`` for one epoch over ``train_loader_`` and, when a writer
    is supplied, log the mean per-batch loss under ``'loss'`` at ``epoch``."""
    total_loss = 0.0
    batch_count = len(train_loader_)
    with TorchTrainSession(model):
        for _, (data, target) in tqdm(enumerate(train_loader_),
                                      desc='train batch #',
                                      total=batch_count):
            prediction = model(data.to(device_)).squeeze()
            # negative log-likelihood for a tensor of size (batch x 1 x n_output)
            loss = nll_loss(prediction, target.to(device_))
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            total_loss += loss.item()
    if writer:
        writer.scalar('loss', total_loss / batch_count, epoch)
def maskrcnn_train_single_epoch(
        *,
        model: Module,
        optimiser: torch.optim.Optimizer,
        data_loader: DataLoader,
        device: torch.device = global_torch_device(),
        writer: Writer = None,
) -> None:
    """Train a Mask R-CNN style detection model for one epoch.

    :param model: detection model accepting ``(images, targets=...)`` and
        returning a dict of losses while in train mode
    :param optimiser: optimiser stepping ``model``'s parameters
    :param data_loader: yields ``(images, targets)`` batches
    :param device: device images and target tensors are moved to
    :param writer: optional scalar writer for losses and learning rate
    :return: None
    """
    model.to(device)
    with TorchTrainSession(model):
        for images, targets in tqdm.tqdm(data_loader, desc="Batch #"):
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            # torch.cuda.synchronize(device)
            loss_dict = model(images, targets=targets)
            losses = sum(loss for loss in loss_dict.values())
            loss_dict_reduced = reduce_dict(
                loss_dict)  # reduce losses over all GPUs for logging purposes
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_value = losses_reduced.item()
            if not math.isfinite(loss_value):  # guard against divergence
                print(f"Loss is {loss_value}, stopping training")
                print(loss_dict_reduced)
                sys.exit(1)
            optimiser.zero_grad()
            losses.backward()
            optimiser.step()
            if writer:
                for k, v in {
                        "loss": losses_reduced,
                        # BUG FIX: read the learning rate from the optimiser
                        # instance; the original accessed the class attribute
                        # `torch.optim.Optimizer.param_groups`, which raises
                        # AttributeError at runtime whenever `writer` is set.
                        "lr": optimiser.param_groups[0]["lr"],
                        **loss_dict_reduced,
                }.items():
                    writer.scalar(k, v)
def train_siamese(
        model,
        optimiser,
        criterion,
        *,
        writer: Writer = MockWriter(),
        train_number_epochs,
        data_dir,
        train_batch_size,
        model_name,
        save_path,
        save_best=False,
        img_size,
        validation_interval: int = 1,
):
    """Train a triplet-based siamese model, periodically validating and
    optionally saving the parameters with the lowest validation loss.

    :param model: siamese network called with three inputs (anchor, positive, negative)
    :param optimiser: optimiser stepping ``model``'s parameters
    :param criterion: triplet loss applied to the model outputs
    :param writer: scalar writer for train/valid metrics (default: no-op MockWriter)
    :param train_number_epochs: number of training epochs
    :param data_dir: root directory passed to TripletDataset
    :param train_batch_size: batch size for both train and validation loaders
    :param model_name: name used when saving parameters
    :param save_path: directory parameters are saved to
    :param save_best: save parameters whenever a new best validation loss appears
    :param img_size: images are resized to this size (grayscale, to tensor)
    :param validation_interval: run validation every N training batches
    :return: the trained model
    """
    train_dataloader = DataLoader(
        TripletDataset(
            data_path=data_dir,
            transform=transforms.Compose([
                transforms.Grayscale(),
                transforms.Resize(img_size),
                transforms.ToTensor(),
            ]),
            split=SplitEnum.training,
        ),
        shuffle=True,
        num_workers=0,
        batch_size=train_batch_size,
    )
    valid_dataloader = DataLoader(
        TripletDataset(
            data_path=data_dir,
            transform=transforms.Compose([
                transforms.Grayscale(),
                transforms.Resize(img_size),
                transforms.ToTensor(),
            ]),
            split=SplitEnum.validation,
        ),
        shuffle=True,
        num_workers=0,
        batch_size=train_batch_size,
    )
    best = math.inf  # lowest validation loss seen so far
    E = tqdm(range(0, train_number_epochs))
    batch_counter = count()
    for epoch in E:
        for tss in train_dataloader:
            batch_i = next(batch_counter)
            with TorchTrainSession(model):
                optimiser.zero_grad()
                loss_contrastive = criterion(*model(
                    *[t.to(global_torch_device()) for t in tss]))
                loss_contrastive.backward()
                optimiser.step()
                a = loss_contrastive.cpu().item()  # train loss of this batch
                writer.scalar("train_loss", a, batch_i)
            # NOTE(review): the counter is advanced a second time here, so this
            # modulo test uses a different tick than `batch_i` — confirm the
            # double advance is intentional.
            if batch_counter.__next__() % validation_interval == 0:
                with TorchEvalSession(model):
                    # NOTE(review): the metrics below are overwritten on every
                    # validation batch, so only the last batch is logged and
                    # compared against `best` — verify this is intended.
                    for tsv in valid_dataloader:
                        o = model(*[t.to(global_torch_device()) for t in tsv])
                        a_v = criterion(*o).cpu().item()
                        # accuracy on anchor/positive distances (same identity)
                        valid_positive_acc = (accuracy(
                            distances=pairwise_distance(o[0], o[1]),
                            is_diff=0).cpu().item())
                        # accuracy on anchor/negative distances (different identity)
                        valid_negative_acc = (accuracy(
                            distances=pairwise_distance(o[0], o[2]),
                            is_diff=1).cpu().item())
                        valid_acc = numpy.mean(
                            (valid_negative_acc, valid_positive_acc))
                        writer.scalar("valid_loss", a_v, batch_i)
                        writer.scalar("valid_positive_acc", valid_positive_acc,
                                      batch_i)
                        writer.scalar("valid_negative_acc", valid_negative_acc,
                                      batch_i)
                        writer.scalar("valid_acc", valid_acc, batch_i)
                        if a_v < best:
                            best = a_v
                            print(f"new best {best}")
                            if save_best:
                                save_model_parameters(
                                    model,
                                    optimiser=optimiser,
                                    model_name=model_name,
                                    save_directory=save_path,
                                )
            # NOTE(review): `a_v`/`valid_acc` are unbound here until the first
            # validation pass has run (validation_interval > 1) — verify.
            E.set_description(
                f"Epoch number {epoch}, Current train loss {a}, valid loss {a_v}, valid acc {valid_acc}"
            )
    return model
def pred_target_train_model(
        model,
        train_iterator,
        criterion,
        optimizer,
        scheduler,
        writer,
        interrupted_path,
        test_data_iterator=None,
        num_updates: int = 250000,
        early_stop=None,
) -> torch.nn.Module:
    """Alternate training and validation phases for ``num_updates`` updates,
    tracking and finally restoring the weights with the best validation loss.

    :param model: classifier producing per-class scores
    :param train_iterator: iterator yielding batches of (image, label) pairs
    :param criterion: classification loss
    :param optimizer: optimiser stepping ``model``'s parameters
    :param scheduler: optional LR scheduler stepped once per training phase
    :param writer: scalar writer for train/validation metrics
    :param interrupted_path: path the current best weights are saved to
    :param test_data_iterator: when truthy, a validation phase is run each update
    :param num_updates: number of update iterations
    :param early_stop: early-stopping threshold (see NOTE at the comparison)
    :return: the model with the best-validation-loss weights loaded
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_loss = 1e10
    since = time.time()
    try:
        sess = tqdm.tqdm(range(num_updates), leave=False, disable=False)
        val_loss = 0
        update_loss = 0
        val_acc = 0
        last_val = None  # previous validation predictions, to detect stagnation
        last_out = None  # previous training output, to detect identical outputs
        with torch.autograd.detect_anomaly():
            for update_i in sess:
                for phase in [Split.Training, Split.Validation]:
                    if phase == Split.Training:
                        with TorchTrainSession(model):
                            input, true_label = zip(*next(train_iterator))
                            # RGBA uint8 NHWC -> normalised float RGB NCHW
                            rgb_imgs = torch_vision_normalize_batch_nchw(
                                uint_nhwc_to_nchw_float_batch(
                                    rgb_drop_alpha_batch_nhwc(
                                        to_tensor(input))))
                            true_label = to_tensor(true_label, dtype=torch.long)
                            optimizer.zero_grad()
                            pred = model(rgb_imgs)
                            loss = criterion(pred, true_label)
                            loss.backward()
                            optimizer.step()
                            if last_out is None:
                                last_out = pred
                            else:
                                # warn when the network output did not change
                                if not torch.dist(last_out, pred) > 0:
                                    print(f"Same output{last_out},{pred}")
                                last_out = pred
                            update_loss = loss.data.cpu().numpy()
                            writer.scalar(f"loss/train", update_loss, update_i)
                            if scheduler:
                                scheduler.step()
                    elif test_data_iterator:
                        with TorchEvalSession(model):
                            # NOTE(review): this draws from `train_iterator`,
                            # not `test_data_iterator` — looks like a bug;
                            # confirm which iterator validation should use.
                            test_rgb_imgs, test_true_label = zip(
                                *next(train_iterator))
                            test_rgb_imgs = torch_vision_normalize_batch_nchw(
                                uint_nhwc_to_nchw_float_batch(
                                    rgb_drop_alpha_batch_nhwc(
                                        to_tensor(test_rgb_imgs))))
                            test_true_label = to_tensor(test_true_label,
                                                        dtype=torch.long)
                            with torch.no_grad():
                                val_pred = model(test_rgb_imgs)
                                val_loss = criterion(val_pred, test_true_label)
                            _, cat = torch.max(val_pred, -1)
                            val_acc = torch.sum(
                                cat == test_true_label) / float(cat.size(0))
                            writer.scalar(f"loss/acc", val_acc, update_i)
                            writer.scalar(f"loss/val", val_loss, update_i)
                            if last_val is None:
                                last_val = cat
                            else:
                                # warn when predictions are identical to last time
                                if all(last_val == cat):
                                    print(f"Same val{last_val},{cat}")
                                last_val = cat
                            if val_loss < best_val_loss:
                                best_val_loss = val_loss
                                best_model_wts = copy.deepcopy(
                                    model.state_dict())
                                sess.write(
                                    f"New best validation model at update {update_i} with test_loss {best_val_loss}"
                                )
                                torch.save(model.state_dict(),
                                           interrupted_path)
                            # NOTE(review): compares the prediction tensor
                            # `val_pred`, not `val_loss`, against `early_stop`
                            # — likely a bug; verify intent.
                            if early_stop is not None and val_pred < early_stop:
                                break
                    sess.set_description_str(
                        f"Update {update_i} - {phase} "
                        f"update_loss:{update_loss:2f} "
                        f"test_loss:{val_loss}"
                        f"val_acc:{val_acc}")
    except KeyboardInterrupt:
        print("Interrupt")
    finally:
        pass
    model.load_state_dict(best_model_wts)  # load best model weights
    time_elapsed = time.time() - since
    print(f"{time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
    print(f"Best val loss: {best_val_loss:3f}")
    return model
def predictor_response_train_model(
        model,
        *,
        train_iterator,
        criterion,
        optimizer,
        scheduler,
        writer,
        interrupted_path,
        val_data_iterator=None,
        num_updates: int = 250000,
        device=global_torch_device(),
        early_stop=None,
):
    """Alternate training and validation phases for ``num_updates`` updates,
    tracking and finally restoring the weights with the best validation loss.

    :param model: classifier; single-channel inputs are repeated to 3 channels
    :param train_iterator: yields ``(input, true_label)`` training batches
    :param criterion: classification loss
    :param optimizer: optimiser stepping ``model``'s parameters
    :param scheduler: optional LR scheduler stepped once per training phase
    :param writer: scalar writer for train/validation metrics
    :param interrupted_path: path the current best weights are saved to
    :param val_data_iterator: when truthy, a validation phase is run each update
    :param num_updates: number of update iterations
    :param device: device tensors are created on
    :param early_stop: early-stopping threshold (see NOTE at the comparison)
    :return: the model with the best-validation-loss weights loaded
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_loss = 1e10
    since = time.time()
    try:
        sess = tqdm(range(num_updates), leave=False, disable=False)
        val_loss = 0
        update_loss = 0
        val_acc = 0
        last_val = None
        last_out = None
        with torch.autograd.detect_anomaly():
            for update_i in sess:
                for phase in [Split.Training, Split.Validation]:
                    if phase == Split.Training:
                        with TorchTrainSession(model):
                            input, true_label = next(train_iterator)
                            # grayscale -> 3-channel float input
                            rgb_imgs = to_tensor(
                                input, dtype=torch.float, device=device
                            ).repeat(1, 3, 1, 1)
                            true_label = to_tensor(
                                true_label, dtype=torch.long, device=device
                            )
                            optimizer.zero_grad()
                            pred = model(rgb_imgs)
                            loss = criterion(pred, true_label)
                            loss.backward()
                            optimizer.step()
                            update_loss = loss.data.cpu().numpy()
                            writer.scalar(f"loss/train", update_loss, update_i)
                            if scheduler:
                                scheduler.step()
                    elif val_data_iterator:
                        with TorchEvalSession(model):
                            test_rgb_imgs, test_true_label = next(val_data_iterator)
                            test_rgb_imgs = to_tensor(
                                test_rgb_imgs, dtype=torch.float, device=device
                            ).repeat(1, 3, 1, 1)
                            test_true_label = to_tensor(
                                test_true_label, dtype=torch.long, device=device
                            )
                            with torch.no_grad():
                                val_pred = model(test_rgb_imgs)
                                val_loss = criterion(val_pred, test_true_label)
                            _, cat = torch.max(val_pred, -1)
                            val_acc = torch.sum(cat == test_true_label) / float(
                                cat.size(0)
                            )
                            writer.scalar(f"loss/acc", val_acc, update_i)
                            writer.scalar(f"loss/val", val_loss, update_i)
                            if val_loss < best_val_loss:
                                best_val_loss = val_loss
                                best_model_wts = copy.deepcopy(model.state_dict())
                                sess.write(
                                    f"New best validation model at update {update_i} with best_val_loss {best_val_loss}"
                                )
                                torch.save(model.state_dict(), interrupted_path)
                            # NOTE(review): compares the prediction tensor
                            # `val_pred`, not `val_loss`, against `early_stop`
                            # — likely a bug; verify intent.
                            if early_stop is not None and val_pred < early_stop:
                                break
                    sess.set_description_str(
                        f"Update {update_i} - {phase} "
                        f"update_loss:{update_loss:2f} "
                        f"val_loss:{val_loss}"
                        f"val_acc:{val_acc}"
                    )
    except KeyboardInterrupt:
        print("Interrupt")
    finally:
        pass
    model.load_state_dict(best_model_wts)  # load best model weights
    time_elapsed = time.time() - since
    print(f"{time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
    print(f"Best val loss: {best_val_loss:3f}")
    return model
def inner_train_ssd(*,
                    data_root: Path,
                    cfg: NOD,
                    model: Module,
                    data_loader: DataLoader,
                    optimiser: Optimizer,
                    scheduler: WarmupMultiStepLR,
                    check_pointer: callable,
                    device: callable,
                    arguments: callable,
                    kws: NOD,
                    ) -> Module:
    """Core SSD training loop: iterate ``data_loader`` from the checkpointed
    iteration, logging, checkpointing and periodically evaluating.

    :param data_root: dataset root forwarded to evaluation
    :param cfg: configuration object (model / dataset settings)
    :param model: SSD model returning ``(cls_logits, bbox_pred)``
    :param data_loader: training batches of ``(images, targets, _)``
    :param optimiser: optimiser stepping ``model``'s parameters
    :param scheduler: LR scheduler stepped once per iteration
    :param check_pointer: checkpoint saver, called with a name and ``arguments``
    :param device: device images/targets are moved to
    :param arguments: mutable mapping holding the persisted ``"iteration"``
    :param kws: run options (tensorboard, log/save/eval steps, distributed)
    :return: the trained model
    """
    logger = logging.getLogger("SSD.trainer")
    logger.info("Start training ...")
    meters = MetricLogger()
    with TorchTrainSession(model):
        save_to_disk = global_distribution_rank() == 0  # only rank 0 writes logs
        if kws.use_tensorboard and save_to_disk:
            import tensorboardX

            writer = tensorboardX.SummaryWriter(
                log_dir=str(PROJECT_APP_PATH.user_data / "results" / "tf_logs")
            )
        else:
            writer = None
        max_iter = len(data_loader)
        start_iter = arguments["iteration"]  # resume from checkpointed iteration
        start_training_time = time.time()
        end = time.time()
        for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
            arguments["iteration"] = iteration  # persist progress for checkpoints
            images = images.to(device)
            targets = targets.to(device)
            # NOTE(review): a new MultiBoxLoss is constructed every iteration;
            # hoisting it out of the loop looks possible — verify it is stateless.
            loss_instance = MultiBoxLoss(neg_pos_ratio=cfg.model.neg_pos_ratio)
            cls_logits, bbox_pred = model(images)
            reg_loss, cls_loss = loss_instance(
                cls_logits, bbox_pred, targets.labels, targets.boxes
            )
            loss_dict = dict(reg_loss=reg_loss, cls_loss=cls_loss)
            loss = sum(loss for loss in loss_dict.values())
            loss_dict_reduced = reduce_loss_dict(
                loss_dict
            )  # reduce losses over all GPUs for logging purposes
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(total_loss=losses_reduced, **loss_dict_reduced)
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            scheduler.step()
            batch_time = time.time() - end
            end = time.time()
            meters.update(time=batch_time)
            if iteration % kws.log_step == 0:
                # estimate remaining time from the running average batch time
                eta_seconds = meters.time.global_avg * (max_iter - iteration)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                logger.info(
                    meters.delimiter.join(
                        [
                            f"iter: {iteration:06d}",
                            f"lr: {optimiser.param_groups[0]['lr']:.5f}",
                            f"{str(meters)}",
                            f"eta: {eta_string}",
                            f"mem: {round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)}M",
                        ]
                    )
                )
                if writer:
                    global_step = iteration
                    writer.add_scalar(
                        "losses/total_loss", losses_reduced, global_step=global_step
                    )
                    for loss_name, loss_item in loss_dict_reduced.items():
                        writer.add_scalar(
                            f"losses/{loss_name}", loss_item, global_step=global_step
                        )
                    writer.add_scalar(
                        "lr", optimiser.param_groups[0]["lr"], global_step=global_step
                    )
            if iteration % kws.save_step == 0:
                check_pointer.save(f"model_{iteration:06d}", **arguments)
            if (
                kws.eval_step > 0
                and iteration % kws.eval_step == 0
                and not iteration == max_iter
            ):
                with TorchEvalSession(model):
                    eval_results = do_ssd_evaluation(
                        data_root,
                        cfg,
                        model,
                        distributed=kws.distributed,
                        iteration=iteration,
                    )
                    if global_distribution_rank() == 0 and writer:
                        for eval_result, dataset in zip(
                            eval_results, cfg.datasets.test
                        ):
                            write_metrics_recursive(
                                eval_result["metrics"],
                                "metrics/" + dataset,
                                writer,
                                iteration,
                            )
        check_pointer.save("model_final", **arguments)
        total_training_time = int(
            time.time() - start_training_time
        )  # compute training time
        logger.info(
            f"Total training time: {datetime.timedelta(seconds=total_training_time)} ("
            f"{total_training_time / max_iter:.4f} s / it)"
        )
    return model
def train_person_segmenter(
        model,
        train_loader,
        valid_loader,
        criterion,
        optimizer,
        scheduler,
        save_model_path: Path,
        n_epochs: int = 100,
):
    """Train a binary person-segmentation model, validating each epoch and
    saving the weights whenever the validation loss improves.

    :param model: segmentation model; only its first output head is used
    :param train_loader: training batches of ``(data, target)``
    :param valid_loader: validation batches of ``(data, target)``
    :param criterion: loss on the sigmoid output vs target
    :param optimizer: optimiser stepping ``model``'s parameters
    :param scheduler: LR scheduler stepped once per epoch
    :param save_model_path: where improved weights are saved
    :param n_epochs: number of epochs (must be > 0)
    :return: the trained model
    """
    valid_loss_min = numpy.Inf  # track change in validation loss
    assert n_epochs > 0, n_epochs
    E = tqdm(range(1, n_epochs + 1))
    for epoch in E:
        train_loss = 0.0
        valid_loss = 0.0
        dice_score = 0.0
        with TorchTrainSession(model):
            for data, target in tqdm(train_loader):
                data, target = (
                    data.to(global_torch_device()),
                    target.to(global_torch_device()),
                )
                optimizer.zero_grad()
                output, *_ = model(data)
                output = torch.sigmoid(output)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
                # accumulate sample-weighted loss; averaged per-epoch below
                train_loss += loss.item() * data.size(0)
        with TorchEvalSession(model):
            with torch.no_grad():
                for data, target in tqdm(valid_loader):
                    data, target = (
                        data.to(global_torch_device()),
                        target.to(global_torch_device()),
                    )
                    output, *_ = model(
                        data
                    )  # forward pass: compute predicted outputs by passing inputs to the model
                    output = torch.sigmoid(output)
                    loss = criterion(output, target)  # calculate the batch loss
                    valid_loss += loss.item() * data.size(
                        0)  # update average validation loss
                    dice_cof = intersection_over_union(
                        output.cpu().detach().numpy(),
                        target.cpu().detach().numpy())
                    dice_score += dice_cof * data.size(0)
        # calculate average losses
        train_loss = train_loss / len(train_loader.dataset)
        valid_loss = valid_loss / len(valid_loader.dataset)
        dice_score = dice_score / len(valid_loader.dataset)
        # print training/validation statistics
        E.set_description(f"Epoch: {epoch}"
                          f" Training Loss: {train_loss:.6f} "
                          f"Validation Loss: {valid_loss:.6f} "
                          f"Dice Score: {dice_score:.6f}")
        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ..."
            )
            torch.save(model.state_dict(), save_model_path)
            valid_loss_min = valid_loss
        scheduler.step()
        model, scheduler = reschedule_learning_rate(model, epoch, scheduler)
    return model
def main():
    """Train and/or evaluate a SkipHourglassFission person-segmentation model
    on PennFudanPed, plotting a few predictions in eval mode."""
    pyplot.style.use("bmh")
    # BUG FIX: the original joined `Path.home() / "/Data"`; an absolute
    # segment resets pathlib joining, silently discarding the home component
    # and yielding "/Data/PennFudanPed". Use the relative "Data" instead,
    # consistent with the other entry points in this file.
    base_path = Path.home() / "Data" / "PennFudanPed"
    save_model_path = PROJECT_APP_PATH.user_data / 'models' / "penn_fudan_ped_seg.model"
    train_model = False
    eval_model = not train_model
    SEED = 87539842
    batch_size = 8
    num_workers = 1  # os.cpu_count()
    learning_rate = 0.01
    torch_seed(SEED)
    train_set = PennFudanDataset(base_path, Split.Training)
    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)
    valid_loader = DataLoader(
        PennFudanDataset(base_path, Split.Validation),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    model = SkipHourglassFission(
        input_channels=train_set.predictor_shape[-1],
        output_heads=(train_set.response_shape[-1], ),
        encoding_depth=1,
    )
    model.to(global_torch_device())
    if train_model:
        if save_model_path.exists():
            # resume from a previous checkpoint if one exists
            model.load_state_dict(torch.load(str(save_model_path)))
            print("loading saved model")
        with TorchTrainSession(model):
            criterion = BCEDiceLoss(eps=1.0)
            optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimiser, T_max=7, eta_min=learning_rate / 100, last_epoch=-1)
            model = train_person_segmenter(
                model,
                train_loader,
                valid_loader,
                criterion,
                optimiser,
                scheduler,
                save_model_path,
            )
    if eval_model:
        if save_model_path.exists():
            model.load_state_dict(torch.load(str(save_model_path)))
            print("loading saved model")
        with TorchDeviceSession(global_torch_device(cuda_if_available=False),
                                model):
            with torch.no_grad():
                with TorchCacheSession():
                    with TorchEvalSession(model):
                        valid_masks = []
                        a = (350, 525)  # resize/plot resolution
                        tr = min(len(valid_loader.dataset) * 4, 2000)
                        probabilities = numpy.zeros((tr, *a),
                                                    dtype=numpy.float32)
                        for sample_i, (data, target) in enumerate(
                                tqdm(valid_loader)):
                            data = data.to(global_torch_device())
                            target = target.cpu().detach().numpy()
                            outpu, *_ = model(data)
                            outpu = torch.sigmoid(outpu).cpu().detach().numpy()
                            for p in range(data.shape[0]):
                                output, mask = outpu[p], target[p]
                                """
for m in mask:
    valid_masks.append(cv2_resize(m, a))
for probability in output:
    probabilities[sample_i, :, :] = cv2_resize(probability, a)
    sample_i += 1
"""
                                if sample_i >= tr - 1:
                                    break
                            if sample_i >= tr - 1:
                                break
                        # NOTE(review): `valid_masks`/`probabilities` are only
                        # filled by the disabled (string-quoted) block above,
                        # so the plotting below indexes empty data — verify.
                        f, ax = pyplot.subplots(3, 3, figsize=(24, 12))
                        for i in range(3):
                            ax[0, i].imshow(valid_masks[i], vmin=0, vmax=1)
                            ax[0, i].set_title("Original", fontsize=14)
                            ax[1, i].imshow(valid_masks[i], vmin=0, vmax=1)
                            ax[1, i].set_title("Target", fontsize=14)
                            ax[2, i].imshow(probabilities[i], vmin=0, vmax=1)
                            ax[2, i].set_title("Prediction", fontsize=14)
                        pyplot.show()
def train_d(
        model,
        train_loader,
        valid_loader,
        criterion,
        optimiser,
        scheduler,
        save_model_path,
        n_epochs=0,
):
    """Train a binary segmentation model with per-batch progress postfixes,
    validating each epoch and saving weights on improved validation loss.

    Args:
      model: segmentation model; only its first output head is used
      train_loader: training batches of ``(data, target)``
      valid_loader: validation batches of ``(data, target)``
      criterion: loss on the sigmoid output vs target
      optimiser: optimiser stepping ``model``'s parameters
      scheduler: LR scheduler stepped once per epoch
      save_model_path: where improved weights are saved
      n_epochs: number of epochs (the default 0 performs no training)

    Returns:
      the trained model
    """
    valid_loss_min = numpy.Inf  # track change in validation loss
    E = tqdm(range(1, n_epochs + 1))
    for epoch in E:
        train_loss = 0.0
        valid_loss = 0.0
        dice_score = 0.0
        with TorchTrainSession(model):
            train_set = tqdm(train_loader, postfix={"train_loss": 0.0})
            for data, target in train_set:
                data, target = (
                    data.to(global_torch_device()),
                    target.to(global_torch_device()),
                )
                optimiser.zero_grad()
                output, *_ = model(data)
                output = torch.sigmoid(output)
                loss = criterion(output, target)
                loss.backward()
                optimiser.step()
                # accumulate sample-weighted loss; averaged per-epoch below
                train_loss += loss.item() * data.size(0)
                train_set.set_postfix(ordered_dict={"train_loss": loss.item()})
        with TorchEvalSession(model):
            with torch.no_grad():
                validation_set = tqdm(valid_loader,
                                      postfix={
                                          "valid_loss": 0.0,
                                          "dice_score": 0.0
                                      })
                for data, target in validation_set:
                    data, target = (
                        data.to(global_torch_device()),
                        target.to(global_torch_device()),
                    )
                    # forward pass: compute predicted outputs by passing inputs to the model
                    output, *_ = model(data)
                    output = torch.sigmoid(output)
                    # calculate the batch loss
                    loss = criterion(output, target)
                    # update average validation loss
                    valid_loss += loss.item() * data.size(0)
                    dice_cof = intersection_over_union(
                        output.cpu().detach().numpy(),
                        target.cpu().detach().numpy())
                    dice_score += dice_cof * data.size(0)
                    validation_set.set_postfix(ordered_dict={
                        "valid_loss": loss.item(),
                        "dice_score": dice_cof
                    })
        # calculate average losses
        train_loss = train_loss / len(train_loader.dataset)
        valid_loss = valid_loss / len(valid_loader.dataset)
        dice_score = dice_score / len(valid_loader.dataset)
        # print training/validation statistics
        E.set_description(f"Epoch: {epoch}"
                          f" Training Loss: {train_loss:.6f} "
                          f"Validation Loss: {valid_loss:.6f} "
                          f"Dice Score: {dice_score:.6f}")
        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ..."
            )
            torch.save(model.state_dict(), save_model_path)
            valid_loss_min = valid_loss
        scheduler.step()
        model, scheduler = reschedule(model, epoch, scheduler)
    return model
def main():
    """Fine-tune a pretrained Mask R-CNN on PennFudanPed, then visualise a
    single validation prediction (image, mask, bounding boxes)."""
    dataset_root = Path.home() / "Data"
    base_path = ensure_existence(PROJECT_APP_PATH.user_data / 'maskrcnn')
    log_path = ensure_existence(PROJECT_APP_PATH.user_log / 'maskrcnn')
    export_root = ensure_existence(base_path / 'models')
    model_name = f'maskrcnn_pennfudanped'
    batch_size = 4
    num_epochs = 10
    optimiser_spec = GDKC(torch.optim.Adam, lr=3e-4)
    scheduler_spec = GDKC(
        torch.optim.lr_scheduler.
        StepLR,  # a learning rate scheduler which decreases the learning rate by
        step_size=3,  # 10x every 3 epochs
        gamma=0.1,
    )
    num_workers = os.cpu_count()
    torch_seed(3825)
    dataset = PennFudanDataset(dataset_root / "PennFudanPed",
                               Split.Training,
                               return_variant=ReturnVariant.all)
    dataset_validation = PennFudanDataset(
        dataset_root / "PennFudanPed",
        Split.Validation,
        return_variant=ReturnVariant.all,
    )
    # hold out a validation fraction of shuffled indices
    split = SplitIndexer(len(dataset), validation=0.3, testing=0)
    split_indices = torch.randperm(split.total_num).tolist()
    data_loader = DataLoader(
        Subset(dataset, split_indices[:-split.validation_num]),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=collate_batch_fn,
    )
    data_loader_val = DataLoader(
        Subset(dataset_validation, split_indices[-split.validation_num:]),
        batch_size=1,
        shuffle=False,
        num_workers=num_workers,
        collate_fn=collate_batch_fn,
    )
    model = get_pretrained_instance_segmentation_maskrcnn(
        dataset.response_channels)
    optimiser = optimiser_spec(trainable_parameters(model))
    lr_scheduler = scheduler_spec(optimiser)
    if True:  # load a previously exported model
        model = load_model(model_name=model_name, model_directory=export_root)
    if True:  # train
        with TorchTrainSession(model):
            with TensorBoardPytorchWriter(log_path / model_name) as writer:
                for epoch_i in tqdm(range(num_epochs), desc="Epoch #"):
                    maskrcnn_train_single_epoch(model=model,
                                                optimiser=optimiser,
                                                data_loader=data_loader,
                                                writer=writer)
                    lr_scheduler.step()  # update the learning rate
                    maskrcnn_evaluate(
                        model, data_loader_val, writer=writer
                    )  # evaluate on the validation dataset
                    save_model(model,
                               model_name=model_name,
                               save_directory=export_root)
    if True:  # visualise a single validation sample
        with TorchEvalSession(model):  # put the model in evaluation mode
            img, _ = dataset_validation[
                0]  # pick one image from the test set
            with torch.no_grad():
                prediction = model([img.to(global_torch_device())])
            from matplotlib import pyplot

            pyplot.imshow(
                Image.fromarray(
                    img.mul(255).permute(1, 2, 0).byte().numpy()))
            pyplot.show()
            import cv2

            pyplot.imshow(
                Image.fromarray(prediction[0]["masks"][0, 0].mul(
                    255).byte().cpu().numpy()))
            pyplot.show()
            # NOTE(review): Mask R-CNN `scores` are typically already in
            # [0, 1]; applying sigmoid again compresses them — verify intent.
            (boxes, labels, scores) = (
                prediction[0]["boxes"].to('cpu').numpy(),
                prediction[0]["labels"].to('cpu').numpy(),
                torch.sigmoid(prediction[0]["scores"]).to('cpu').numpy(),
            )
            from draugr.opencv_utilities import draw_bounding_boxes
            from draugr.torch_utilities.images.conversion import quick_to_pil_image

            indices = scores > 0.1  # keep sufficiently confident detections
            cv2.namedWindow(model_name, cv2.WINDOW_NORMAL)
            cv2.imshow(
                model_name,
                draw_bounding_boxes(
                    quick_to_pil_image(img),
                    boxes[indices],
                    labels=labels[indices],
                    scores=scores[indices],
                    # categories=categories,
                ))
            cv2.waitKey()
def train_siamese(
        model: Module,
        optimiser: Optimizer,
        criterion: callable,
        *,
        writer: Writer = MockWriter(),
        train_number_epochs: int,
        data_dir: Path,
        train_batch_size: int,
        model_name: str,
        save_path: Path,
        save_best: bool = False,
        img_size: Tuple[int, int],
        validation_interval: int = 1,
):
    """Train a pair-based siamese model on a PairDataset, periodically
    validating and optionally saving the lowest-validation-loss weights.

    :param model: siamese network called with two inputs; returns a distance/score
    :param optimiser: optimiser stepping ``model``'s parameters
    :param criterion: loss comparing the model output to the pair label
    :param writer: scalar writer for train/valid metrics (default: no-op MockWriter)
    :param train_number_epochs: number of training epochs
    :param data_dir: root directory passed to PairDataset
    :param train_batch_size: batch size for both train and validation loaders
    :param model_name: name used when saving parameters
    :param save_path: directory parameters are saved to
    :param save_best: save parameters whenever a new best validation loss appears
    :param img_size: images are resized to this size (grayscale, to tensor)
    :param validation_interval: run validation every N training batches
    :return: the trained model
    """
    train_dataloader = DataLoader(
        PairDataset(
            data_path=data_dir,
            transform=transforms.Compose([
                transforms.Grayscale(),
                transforms.Resize(img_size),
                transforms.ToTensor(),
            ]),
            split=Split.Training,
        ),
        shuffle=True,
        num_workers=4,
        batch_size=train_batch_size,
    )
    valid_dataloader = DataLoader(
        PairDataset(
            data_path=data_dir,
            transform=transforms.Compose([
                transforms.Grayscale(),
                transforms.Resize(img_size),
                transforms.ToTensor(),
            ]),
            split=Split.Validation,
        ),
        shuffle=True,
        num_workers=4,
        batch_size=train_batch_size,
    )
    best = math.inf  # lowest validation loss seen so far
    E = tqdm(range(0, train_number_epochs))
    batch_counter = count()
    for epoch in E:
        for tss in train_dataloader:
            batch_i = next(batch_counter)
            with TorchTrainSession(model):
                o = [t.to(global_torch_device()) for t in tss]
                optimiser.zero_grad()
                loss_contrastive = criterion(model(*o[:2]),
                                             o[2].to(dtype=torch.float))
                loss_contrastive.backward()
                optimiser.step()
                train_loss = loss_contrastive.cpu().item()
                writer.scalar("train_loss", train_loss, batch_i)
            # NOTE(review): this advances the counter a second time, so the
            # modulo test uses a different tick than `batch_i` — verify the
            # double advance is intentional.
            if batch_counter.__next__() % validation_interval == 0:
                with TorchEvalSession(model):
                    # NOTE(review): metrics are overwritten per validation
                    # batch; only the last one is logged/compared to `best`.
                    for tsv in valid_dataloader:
                        ov = [t.to(global_torch_device()) for t in tsv]
                        v_o, fact = model(*ov[:2]), ov[2].to(dtype=torch.float)
                        valid_loss = criterion(v_o, fact).cpu().item()
                        valid_accuracy = (accuracy(distances=v_o,
                                                   is_diff=fact).cpu().item())
                        writer.scalar("valid_loss", valid_loss, batch_i)
                        if valid_loss < best:
                            best = valid_loss
                            print(f"new best {best}")
                            writer.blip("new_best", batch_i)
                            if save_best:
                                save_model_parameters(
                                    model,
                                    optimiser=optimiser,
                                    model_name=model_name,
                                    save_directory=save_path,
                                )
            # NOTE(review): `valid_loss`/`valid_accuracy` are unbound here
            # until the first validation pass has run — verify.
            E.set_description(
                f"Epoch number {epoch}, Current train loss {train_loss}, valid loss {valid_loss}, valid_accuracy {valid_accuracy}"
            )
    return model
def train_person_segmentor(
        model: torch.nn.Module,
        train_loader: torch.utils.data.DataLoader,
        valid_loader: torch.utils.data.DataLoader,
        criterion: callable,
        optimiser: torch.optim.Optimizer,
        *,
        save_model_path: Path,
        learning_rate: Number = 6e-2,
        scheduler: torch.optim.lr_scheduler = None,
        n_epochs: int = 100,
        writer: ImageWriterMixin = MockWriter(),
):
    """Train a person-segmentation model, logging scalar and image summaries,
    and saving weights whenever the validation loss improves.

    :param model: segmentation model; only its first output head is used
    :param train_loader: training batches of ``(data, target)``
    :param valid_loader: validation batches of ``(data, target)``
    :param criterion: loss on the raw output vs the float target
    :param optimiser: optimiser stepping ``model``'s parameters
    :param save_model_path: where improved weights are saved
    :param learning_rate: starting LR forwarded to ``reschedule_learning_rate``
    :param scheduler: optional LR scheduler, stepped once per epoch
    :param n_epochs: number of epochs (must be > 0)
    :param writer: scalar/image writer (default: no-op MockWriter)
    :return: the trained model
    """
    valid_loss_min = numpy.Inf  # track change in validation loss
    assert n_epochs > 0, n_epochs
    E = tqdm(range(1, n_epochs + 1))
    for epoch_i in E:
        train_loss = 0.0
        valid_loss = 0.0
        with TorchTrainSession(model):
            for data, target in tqdm(train_loader):
                output, *_ = model(data.to(global_torch_device()))
                loss = criterion(output, target.to(global_torch_device()).float())
                optimiser.zero_grad()
                loss.backward()
                optimiser.step()
                # accumulate sample-weighted loss; averaged per-epoch below
                train_loss += loss.cpu().item() * data.size(0)
        with TorchEvalSession(model):
            with torch.no_grad():
                for data, target in tqdm(valid_loader):
                    target = target.float()
                    (
                        output,
                        *_,
                    ) = model(  # forward pass: compute predicted outputs by passing inputs to the model
                        data.to(global_torch_device()))
                    validation_loss = criterion(
                        output, target.to(
                            global_torch_device()))  # calculate the batch loss
                    writer.scalar(
                        "dice_validation",
                        dice_loss(output, target.to(global_torch_device())),
                    )
                    valid_loss += validation_loss.detach().cpu().item(
                    ) * data.size(0)  # update average validation loss
            writer.image("input", data, epoch_i)  # write the last batch
            writer.image("truth", target, epoch_i)  # write the last batch
            writer.image("prediction", torch.sigmoid(output),
                         epoch_i)  # write the last batch
        # calculate average losses
        train_loss = train_loss / len(train_loader.dataset)
        valid_loss = valid_loss / len(valid_loader.dataset)
        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ..."
            )
            torch.save(model.state_dict(), save_model_path)
            valid_loss_min = valid_loss
        # NOTE(review): rescheduling only happens when a scheduler was given;
        # confirm `reschedule_learning_rate` is not meant to run regardless.
        if scheduler:
            scheduler.step()
            optimiser, scheduler = reschedule_learning_rate(
                model,
                optimiser,
                epoch_i,
                scheduler,
                starting_learning_rate=learning_rate,
            )
        # print training/validation statistics
        current_lr = next(iter(optimiser.param_groups))["lr"]
        E.set_description(f"Epoch: {epoch_i} "
                          f"Training Loss: {train_loss:.6f} "
                          f"Validation Loss: {valid_loss:.6f} "
                          f"Learning rate: {current_lr:.6f}")
        writer.scalar("training_loss", train_loss)
        writer.scalar("validation_loss", valid_loss)
        writer.scalar("learning_rate", current_lr)
    return model
def main(
        base_path: Path = Path.home() / "Data" / "Datasets" / "PennFudanPed",
        train_model: bool = True,
        load_prev_model: bool = True,
        writer: Writer = TensorBoardPytorchWriter(
            PROJECT_APP_PATH.user_log / "instanced_person_segmentation" /
            f"{time.time()}"),
):
    """Train (and otherwise evaluate) an instanced person-segmentation model
    on PennFudanPed.

    NOTE(review): the ``writer`` default constructs a TensorBoardPytorchWriter
    at import time (side-effecting default argument), and the ``base_path``
    parameter is immediately overwritten below, making it dead — verify both.
    """
    # base_path = Path("/") / "encrypted_disk" / "heider" / "Data" / "PennFudanPed"
    base_path: Path = Path.home() / "Data3" / "PennFudanPed"
    # base_path = Path('/media/heider/OS/Users/Christian/Data/Datasets/') / "PennFudanPed"
    pyplot.style.use("bmh")
    save_model_path = (
        ensure_existence(PROJECT_APP_PATH.user_data / "models") /
        "instanced_penn_fudan_ped_seg.model")
    eval_model = not train_model
    SEED = 9221
    batch_size = 32
    num_workers = 0
    encoding_depth = 2
    learning_rate = 6e-6  # sequence 6e-2 6e-3 6e-4 6e-5
    seed_stack(SEED)
    train_set = PennFudanDataset(
        base_path,
        SplitEnum.training,
        return_variant=PennFudanDataset.PennFudanReturnVariantEnum.instanced,
    )
    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)
    valid_loader = DataLoader(
        PennFudanDataset(
            base_path,
            SplitEnum.validation,
            return_variant=PennFudanDataset.PennFudanReturnVariantEnum.
            instanced,
        ),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    model = SkipHourglassFission(
        input_channels=train_set.predictor_shape[-1],
        output_heads=(train_set.response_shape[-1], ),
        encoding_depth=encoding_depth,
    )
    model.to(global_torch_device())
    if load_prev_model and save_model_path.exists():
        # resume from a previous checkpoint
        model.load_state_dict(torch.load(str(save_model_path)))
        print("loading saved model")
    if train_model:
        with TorchTrainSession(model):
            criterion = BCEDiceLoss()
            # optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)
            optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
            # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( optimiser, T_max=7, eta_min=learning_rate / 100, last_epoch=-1 )
            model = train_person_segmentor(
                model,
                train_loader,
                valid_loader,
                criterion,
                optimiser,
                save_model_path=save_model_path,
                learning_rate=learning_rate,
                writer=writer,
            )
    if eval_model:
        validate_model(model, valid_loader)