def __init__(self, model, loss, optimizer, resume, config):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.optimizer = optimizer

    self.steps = config['trainer']['steps']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.start_step = 0

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
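# `_prepare_device` is called throughout these trainers but is not defined in this
# section (some variants below also pass an explicit GPU list). A minimal sketch,
# assuming it only validates the requested GPU count and returns the device plus
# the GPU ids to wrap with DataParallel; the real helper may differ:
def _prepare_device(self, n_gpu_use):
    """Pick the training device and list the GPU ids usable for DataParallel."""
    n_gpu = torch.cuda.device_count()
    if n_gpu_use > 0 and n_gpu == 0:
        self.logger.warning('No GPU available on this machine, training will be performed on CPU.')
        n_gpu_use = 0
    if n_gpu_use > n_gpu:
        self.logger.warning('Requested {} GPUs but only {} are available; using {}.'.format(
            n_gpu_use, n_gpu, n_gpu))
        n_gpu_use = n_gpu
    device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu')
    return device, list(range(n_gpu_use))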
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'], config.get('gpu_list'))
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m-%d_%H-%M-%S')
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, cfg_trainer['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def main(args):
    with open(args.config) as f:
        conf = yaml.safe_load(f)
    conf.update(conf[conf['model']])
    if args.multi_gpu:
        conf['batch_size'] *= torch.cuda.device_count()

    datasets = {
        'MNIST': torchvision.datasets.MNIST,
        'CIFAR': torchvision.datasets.CIFAR10
    }

    if conf['dataset'].upper() == 'MNIST':
        conf['data_path'] = os.path.join(conf['data_path'], 'MNIST')
        size = 28
        classes = list(range(10))
        mean, std = ((0.1307,), (0.3081,))
    elif conf['dataset'].upper() == 'CIFAR':
        conf['data_path'] = os.path.join(conf['data_path'], 'CIFAR')
        size = 32
        classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
        mean, std = ((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    else:
        raise ValueError('Dataset must be either MNIST or CIFAR!')

    transform = transforms.Compose([
        transforms.RandomCrop(size, padding=2),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    loaders = {}
    trainset = datasets[conf['dataset'].upper()](root=conf['data_path'], train=True,
                                                 download=True, transform=transform)
    testset = datasets[conf['dataset'].upper()](root=conf['data_path'], train=False,
                                                download=True, transform=transform)
    loaders['train'] = torch.utils.data.DataLoader(trainset, batch_size=conf['batch_size'],
                                                   shuffle=True, num_workers=4)
    loaders['test'] = torch.utils.data.DataLoader(testset, batch_size=conf['batch_size'],
                                                  shuffle=False, num_workers=4)
    print(9 * '#', 'Using {} dataset'.format(conf['dataset']), 9 * '#')

    # Training
    use_gpu = not args.disable_gpu and torch.cuda.is_available()
    caps_net = CapsNetTrainer(loaders, conf['model'], conf['lr'], conf['lr_decay'],
                              conf['num_classes'], conf['num_routing'], conf['loss'],
                              use_gpu=use_gpu, multi_gpu=args.multi_gpu)

    ensure_dir('logs')
    logger = {}
    logger['train'] = Logger('logs/{}-train'.format(conf['dataset']))
    logger['test'] = Logger('logs/{}-test'.format(conf['dataset']))

    ensure_dir(conf['save_dir'])
    caps_net.train(conf['epochs'], classes, conf['save_dir'], logger)
def evaluate_full_image_list(self, file_list, save_dir, save_name):
    sub_dirs = self.get_full_image_example_dir_names()
    dirs = []
    for d in sub_dirs:
        if 'target' not in d:
            path = os.path.join(save_dir, 'chickpea-full-image', d)
            ensure_dir(path)
            dirs.append(path)

    metric_names = self.get_full_image_metric_names()
    metrics = {n: [] for n in metric_names}

    with torch.no_grad():
        for i, file in enumerate(tqdm(file_list)):
            image = cv2.imread(file, 0)
            if image.shape[0] > 3200 or image.shape[1] > 3200:
                continue
            image = remove_artifacts(image, 10)

            resized_img, binary_inpainted, rgb_inpainted, unthresh_inpainted = inpaint_full_image(
                image, self.model, 50)
            removed_binary_inpainted = remove_artifacts(binary_inpainted, 10)

            labeled_input, num_labels_input = label(
                (resized_img / 255.).astype(np.uint8), neighbors=8, background=0, return_num=True)
            labeled_input = convert_labels_to_rgb(labeled_input)

            labeled_pred, num_labels_pred = label(
                (binary_inpainted / 255.).astype(np.uint8), neighbors=8, background=0, return_num=True)
            labeled_pred = convert_labels_to_rgb(labeled_pred)

            # use a separate counter so the unfiltered prediction count is not overwritten
            labeled_pred_rm, num_labels_pred_rm = label(
                (removed_binary_inpainted / 255.).astype(np.uint8), neighbors=8, background=0, return_num=True)
            labeled_pred_rm = convert_labels_to_rgb(labeled_pred_rm)

            images = [
                resized_img, binary_inpainted, removed_binary_inpainted, labeled_input,
                labeled_pred, unthresh_inpainted, rgb_inpainted, labeled_pred_rm
            ]

            metrics["num_labels_input"].append(num_labels_input)
            metrics["num_labels_pred"].append(num_labels_pred)
            metrics["num_labels_pred_rm"].append(num_labels_pred_rm)

            for save_path, image in zip(dirs, images):
                cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)), image.astype(np.uint8))

    df = pd.DataFrame(metrics, columns=metrics.keys())
    df.to_csv(save_dir + '/' + save_name + '.csv')
    df.describe().to_csv(save_dir + '/' + save_name + '-stats.csv')
def __init__(self, models, metrics, optimizers, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # collections.abc.Iterable works on all supported Python 3 versions
    if not isinstance(models, collections.abc.Iterable):
        models = [models]
    else:
        assert len(models) > 0

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.models = []
    for i, model in enumerate(models):
        self.models.append(model.to(self.device))
        if len(device_ids) > 1:
            self.models[i] = torch.nn.DataParallel(self.models[i], device_ids=device_ids)

    self.metrics = metrics
    self.optimizers = optimizers

    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def trainer_paths(config):
    """Returns the paths to save checkpoints and tensorboard runs.

    e.g.
        saved/Mnist_LeNet/<start time>/checkpoints
        saved/Mnist_LeNet/<start time>/runs
    """
    arch_datetime = arch_datetime_path(config)
    return (
        ensure_dir(arch_datetime / 'checkpoints'),
        ensure_dir(arch_datetime / 'runs'),
    )
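# `ensure_dir` is used by every snippet in this section but is not defined here.
# `trainer_paths` above relies on it returning the path it creates. A minimal
# sketch consistent with that usage (the actual helper in each repo may differ):
import os

def ensure_dir(dirname):
    """Create `dirname` (and any missing parents) if needed and return it unchanged."""
    os.makedirs(str(dirname), exist_ok=True)
    return dirname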
def _get_SummaryWriter(self):
    if not self.args.debug and not self.args.do_test:
        ensure_dir(os.path.join('./summary/', self.experiment_name))

        self.summarywriter = summary.create_file_writer(
            logdir='./summary/{}/{}/train'.format(
                self.experiment_name,
                time.strftime("%m%d-%H-%M-%S", time.localtime(time.time()))))
def _download_taxonomy(self):
    ensure_dir(self.taxonomy_dir)
    self.logger.info("Downloading taxonomic tree...")
    res = download_file(self.TAX_URL, self.taxonomy_dir / 'taxdump.tar.gz')
    if res is None:
        self.logger.error("Could not download taxdump information... Please try again")
        exit(1)
def __init__(self, model, loss, metrics, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.name = config['name']
    self.epochs = config['trainer']['epochs']
    print('self.epochs ', self.epochs)
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']

    self.with_cuda = config['cuda'] and torch.cuda.is_available()
    if config['cuda'] and not torch.cuda.is_available():
        self.logger.warning(
            'Warning: There\'s no CUDA support on this machine, '
            'training is performed on CPU.')
    else:
        self.gpu = torch.device('cuda:' + str(config['gpu']))
        self.model = self.model.to(self.gpu)

    self.train_logger = train_logger

    # here we add to the optimizer only those parameters that are not frozen!
    non_frozen_parameters = [p for p in model.parameters() if p.requires_grad]
    print('%d non_frozen_parameters ' % len(non_frozen_parameters))
    self.optimizer = getattr(optim, config['optimizer_type'])(
        non_frozen_parameters, **config['optimizer'])
    self.lr_scheduler = getattr(optim.lr_scheduler, config['lr_scheduler_type'], None)
    if self.lr_scheduler:
        self.lr_scheduler = self.lr_scheduler(self.optimizer, **config['lr_scheduler'])
        self.lr_scheduler_freq = config['lr_scheduler_freq']

    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], self.name)
    ensure_dir(self.checkpoint_dir)
    json.dump(config,
              open(os.path.join(self.checkpoint_dir, 'config.json'), 'w'),
              indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
    torch.cuda.empty_cache()
    print('self.monitor_best = ', self.monitor_best)
def __init__(self, model, loss, metrics, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.name = config['name']
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']

    self.with_cuda = config['cuda'] and torch.cuda.is_available()
    if config['cuda'] and not torch.cuda.is_available():
        self.logger.warning(
            'Warning: There\'s no CUDA support on this machine, '
            'training is performed on CPU.')
        device = 'cpu'
    else:
        self.gpus = {i: item for i, item in enumerate(self.config['gpus'])}
        device = 'cuda'
        self.model = torch.nn.DataParallel(self.model)
        torch.cuda.empty_cache()
    self.device = torch.device(device)
    self.model = self.model.to(self.device)
    self.logger.debug('Model is initialized.')
    self._log_memory_useage()

    self.train_logger = train_logger
    self.optimizer = getattr(optim, config['optimizer_type'])(
        model.parameters(), **config['optimizer'])
    self.lr_scheduler = getattr(optim.lr_scheduler, config['lr_scheduler_type'], None)
    if self.lr_scheduler:
        self.lr_scheduler = self.lr_scheduler(self.optimizer, **config['lr_scheduler'])
        self.lr_scheduler_freq = config['lr_scheduler_freq']

    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], self.name)
    ensure_dir(self.checkpoint_dir)
    json.dump(config,
              open(os.path.join(self.checkpoint_dir, 'config.json'), 'w'),
              indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(
        self,
        model,
        losses,
        metrics,
        optimizer_g,
        optimizer_d_s,
        optimizer_d_t,
        resume,
        config,
        train_logger=None,
        pretrained_path=None,
):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)

    self.losses = losses
    self.metrics = metrics
    self.optimizer_g = optimizer_g
    self.optimizer_d_s = optimizer_d_s
    self.optimizer_d_t = optimizer_d_t

    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    # Set pretrained_load_strict to False to load a model without strict state-dict
    # name matching. This is useful when the pretrained model was trained without a
    # GAN but a GAN is used this time.
    self.pretrained_load_strict = config['trainer']['pretrained_load_strict']

    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
    elif pretrained_path is not None:
        self._load_pretrained(pretrained_path)

    # put model into DataParallel module only after the checkpoint is loaded
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)
def __init__(self, model, loss, metrics, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.with_cuda = config['cuda'] and torch.cuda.is_available()
    if config['cuda'] and not torch.cuda.is_available():
        self.logger.warning(
            'Warning: There\'s no GPU available on this machine, '
            'training will be performed on CPU.')
    self.device = torch.device('cuda:' + str(config['gpu']) if self.with_cuda else 'cpu')
    self.model = model.to(self.device)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = WriterTensorboardX(writer_dir, self.logger, config['visualization']['tensorboardX'])

    # Save configuration into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(
    self, model, loss, metrics, optimizer, resume, config, train_logger=None
):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config["n_gpu"])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config["trainer"]
    self.epochs = cfg_trainer["epochs"]
    self.save_period = cfg_trainer["save_period"]
    self.verbosity = cfg_trainer["verbosity"]
    self.monitor = cfg_trainer.get("monitor", "off")

    # configuration to monitor model performance and save best
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = math.inf if self.mnt_mode == "min" else -math.inf
        self.early_stop = cfg_trainer.get("early_stop", math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime("%m%d_%H%M%S")
    self.checkpoint_dir = os.path.join(
        cfg_trainer["save_dir"], config["name"], start_time
    )
    # setup visualization writer instance
    writer_dir = os.path.join(cfg_trainer["log_dir"], config["name"], start_time)
    self.writer = WriterTensorboardX(
        writer_dir, self.logger, cfg_trainer["tensorboardX"]
    )

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, "config.json")
    with open(config_save_path, "w") as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, optimizer, resume, config):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.data_type = torch.float32

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.eval_freq = config['trainer']['eval_freq']
    self.metric_freq = config['trainer']['metric_freq']
    self.early_stopping = config['trainer']['early_stopping']

    # configuration to monitor model performance and save best
    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode in ['min', 'max', 'off']
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.monitor_best_se = 0
    self.start_epoch = 1
    self.best_epoch = 0

    # setup directory for checkpoint saving
    start_time = datetime.datetime.now().strftime('%m%d_%H%M%S')
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], config['name'], start_time)
    # setup visualization writer instance
    writer_dir = os.path.join(config['visualization']['log_dir'], config['name'], start_time)
    self.writer = SummaryWriter(writer_dir)
    if hasattr(self.loss, "set_writer"):
        self.loss.set_writer(self.writer)

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(
        self,
        model,
        config,
        args,
        test_data_loader,
        begin_time,
        resume_file,
        loss_weight,
):
    # for general
    self.config = config
    self.args = args
    self.device = (torch.device('cpu') if self.args.gpu == -1
                   else torch.device('cuda:{}'.format(self.args.gpu)))
    # self.do_predict = do_predict

    # for train
    # self.visdom = visdom
    self.model = model.to(self.device)
    self.loss_weight = loss_weight.to(self.device)
    self.loss = self._loss(loss_function=self.config.loss).to(self.device)
    self.optimizer = self._optimizer(lr_algorithm=self.config.lr_algorithm)
    self.lr_scheduler = self._lr_scheduler()

    # for time
    self.begin_time = begin_time

    # for data
    self.test_data_loader = test_data_loader

    # for resume/save path
    self.history = {
        'eval': {
            'loss': [],
            'acc': [],
            'miou': [],
            'time': [],
        },
    }

    self.test_log_path = os.path.join(self.args.output, 'test', 'log',
                                      self.model.name, self.begin_time)
    self.predict_path = os.path.join(self.args.output, 'test', 'predict',
                                     self.model.name, self.begin_time)
    # here begin_time is the same as the time used in BaseTrainer.py
    # load args.weight or the checkpoint-best.pth
    self.resume_ckpt_path = resume_file if resume_file is not None else \
        os.path.join(self.config.save_dir, self.model.name, self.begin_time, 'checkpoint-best.pth')

    ensure_dir(self.test_log_path)
    ensure_dir(self.predict_path)
def __init__(self, model, config):
    super(UnetEvaluator, self).__init__(model, config)
    self.syn_test_dataloader = module_data.TestRootDataLoader(name='synthetic')
    self.real_test_dataloader = module_data.TestRootDataLoader(name='chickpea')
    self.chickpea_test_file_list = get_files(chickpea_valid_path)
    self.testing_dir = os.path.join(self.config["checkpoint_dir"], 'testing')
    ensure_dir(self.testing_dir)
def _create_saving_dir(self, args):
    saving_dir = os.path.join(global_config['trainer']['save_dir'],
                              args.ckpts_subdir, global_config['name'],
                              self.start_time)
    ensure_dir(saving_dir)
    # create a link to the resumed checkpoint as a reference
    if args.resume is not None:
        link = os.path.join(saving_dir, 'resumed_ckpt.pth')
        os.symlink(os.path.abspath(args.resume), link)
    return saving_dir
def __init__(self, model, loss, metrics, resume, config, train_logger=None):
    self.config = config
    self.logger = Logger(self.__class__.__name__).logger  # logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.name = config['name']
    self.epochs = config['trainer']['epochs']
    self.device = config['device']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']
    self.train_logger = train_logger
    self.train_logger.propagate = False

    # here we add to the optimizer only those parameters that are not frozen!
    non_frozen_parameters = [p for p in model.parameters() if p.requires_grad]
    self.logger.info('%d non_frozen_parameters ' % len(non_frozen_parameters))
    self.optimizer = getattr(optim, config['optimizer_type'])(
        non_frozen_parameters, **config['optimizer'])
    self.lr_scheduler = getattr(optim.lr_scheduler, config['lr_scheduler_type'], None)
    if self.lr_scheduler:
        self.lr_scheduler = self.lr_scheduler(self.optimizer, **config['lr_scheduler'])
        self.lr_scheduler_freq = config['lr_scheduler_freq']

    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], self.name)
    ensure_dir(self.checkpoint_dir)
    json.dump(config,
              open(os.path.join(self.checkpoint_dir, 'config.json'), 'w'),
              indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, config):
    self.model = model
    self.model.eval()
    self.config = config

    # create validation and testing directories in the model checkpoint directory
    self.validation_dir = os.path.join(self.config["checkpoint_dir"], 'validation')
    ensure_dir(self.validation_dir)
    self.testing_dir = os.path.join(self.config["checkpoint_dir"], 'testing')
    ensure_dir(self.testing_dir)
def save_patch_examples(self, example_dict, save_dir, save_name):
    """
    Save the patch examples stored in the example dictionary.

    :param example_dict: a dictionary containing patch examples
    :param save_dir: saving directory
    :param save_name: saving name
    :return: None
    """
    # create sub folders
    sub_dirs = self.get_patch_example_dir_names()
    dirs = []
    for d in sub_dirs:
        path = os.path.join(save_dir, save_name, d)
        ensure_dir(path)
        dirs.append(path)

    # obtain image lists
    input_images = example_dict["input_images"]
    target_images = example_dict["target_images"]
    pred_images = example_dict["pred_images"]
    unthresh_pred_images = example_dict["unthresh_pred_images"]

    i = 0
    for input, target, unthresh_pred, pred in zip(input_images, target_images,
                                                  unthresh_pred_images, pred_images):
        # convert binary images into colourful labels
        labeled_input = label(input.astype(np.uint8), neighbors=8, background=0)
        labeled_input = convert_labels_to_rgb(labeled_input)
        labeled_target = label(target.astype(np.uint8), neighbors=8, background=0)
        labeled_target = convert_labels_to_rgb(labeled_target)
        labeled_pred = label(pred.astype(np.uint8), neighbors=8, background=0)
        labeled_pred = convert_labels_to_rgb(labeled_pred)

        images = [
            input, target, pred, labeled_input, labeled_target, labeled_pred, unthresh_pred
        ]

        # save images
        for save_path, image in zip(dirs, images):
            cv2.imwrite(os.path.join(save_path, '{}.png'.format(i)),
                        (image * 255).astype(np.uint8))
        i += 1
def __init__(self, model, loss, metrics, data_loader, valid_data_loader, optimizer,
             epochs, batch_size, save_dir, save_freq, resume, verbosity, training_name,
             device, train_logger=None, writer=None, monitor='loss', monitor_mode='min'):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.data_loader = data_loader
    self.batch_size = batch_size
    self.valid_data_loader = valid_data_loader
    self.valid = True if self.valid_data_loader is not None else False
    self.optimizer = optimizer
    self.epochs = epochs
    self.save_freq = save_freq
    self.verbosity = verbosity
    self.training_name = training_name
    self.train_logger = train_logger
    self.writer = writer
    self.train_iter = 0
    self.valid_iter = 0
    self.device = device
    self.monitor = monitor
    self.monitor_mode = monitor_mode
    assert monitor_mode == 'min' or monitor_mode == 'max'
    self.monitor_best = math.inf if monitor_mode == 'min' else -math.inf
    self.start_epoch = 1
    self.checkpoint_dir = os.path.join(save_dir, training_name)
    ensure_dir(self.checkpoint_dir)
    if resume:
        self._resume_checkpoint(resume)
def __init__(self, model, loss, metrics, class_wise, optimizer, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'], config['gpu_id'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.class_wise = class_wise
    self.optimizer = optimizer
    self.train_logger = train_logger

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.verbosity = cfg_trainer['verbosity']
    # default is off
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        # "monitor": "min val_loss"
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = math.inf if self.mnt_mode == 'min' else -math.inf
        self.early_stop = cfg_trainer.get('early_stop', math.inf)

    self.start_epoch = 1

    # setup directory for checkpoint saving
    start_time = str(datetime.now(timezone('US/Pacific')).strftime('%m-%d_%H:%M:%S')) + '_' + str(config['arch']['type'])
    self.checkpoint_dir = os.path.join(cfg_trainer['save_dir'], config['name'], start_time)

    # Save configuration file into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def save_images(self, generator_outputs, targets, generator_labels, target_labels,
                epoch, batch_idx, r=1, c=2):
    generator_outputs = generator_outputs.cpu().numpy()
    generator_outputs = generator_outputs.transpose((0, 2, 3, 1))
    generator_outputs = generator_outputs[..., 1]
    generator_outputs[generator_outputs >= 0.5] = 1
    generator_outputs[generator_outputs < 0.5] = 0
    targets = targets.cpu().numpy()
    generator_labels = generator_labels.cpu().numpy()
    target_labels = target_labels.cpu().numpy()

    fig, axs = plt.subplots(r, c)
    if r == 1:
        axs[0].set_title('Fake Disc:{:.2f}'.format(generator_labels[0, 0]))
        axs[0].imshow(generator_outputs[0], cmap='gray')
        axs[0].axis('off')
        axs[1].set_title('Target Disc:{:.2f}'.format(target_labels[0, 0]))
        axs[1].imshow(targets[0], cmap='gray')
        axs[1].axis('off')
    else:
        count = 0
        for row in range(r):
            axs[row, 0].set_title('Fake Disc:{:.1f}'.format(generator_labels[count, 0]))
            axs[row, 0].imshow(generator_outputs[count])
            axs[row, 0].axis('off')
            axs[row, 1].set_title('Target Disc:{:.1f}'.format(target_labels[count, 0]))
            axs[row, 1].imshow(targets[count])
            axs[row, 1].axis('off')
            count += 1

    ensure_dir(os.path.join(self.checkpoint_dir, 'results', 'epoch_{}'.format(epoch)))
    fig.savefig('{0}/results/epoch_{1}/{2}.jpg'.format(self.checkpoint_dir, epoch, batch_idx))
    plt.close(fig)
def main(blurred_dir, sharp_dir, aligned_dir):
    # we assume that blurred and sharp images have the same names
    image_names = os.listdir(blurred_dir)
    ensure_dir(aligned_dir)
    for image_name in tqdm(image_names, ascii=True):
        # convert PIL image to numpy array (H, W, C)
        blurred = np.array(Image.open(os.path.join(blurred_dir, image_name)).convert('RGB'), dtype=np.uint8)
        sharp = np.array(Image.open(os.path.join(sharp_dir, image_name)).convert('RGB'), dtype=np.uint8)
        aligned = np.concatenate((blurred, sharp), axis=1)  # horizontal alignment
        Image.fromarray(aligned).save(os.path.join(aligned_dir, image_name))
def _save_checkpoint(self, epoch, save_best=False):
    ensure_dir(self.save_dir)
    state = {
        'epoch': epoch,
        'state_dict': self.model.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'config': self.cfg,
        'best_score': self.best_score
    }
    if save_best:
        filename = str(self.save_dir + '/model_best.pt')
        torch.save(state, filename)
        self.logger.debug('Saving current best: {}...'.format(filename))
    else:
        filename = str(self.save_dir + '/checkpoint_epoch_{}.pt'.format(epoch))
        torch.save(state, filename)
        self.logger.debug('Saving checkpoint: {} ...'.format(filename))
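# The matching `_resume_checkpoint` is referenced by several trainers here but not
# shown. A minimal, hypothetical sketch of a loader consistent with the `state`
# dict written by `_save_checkpoint` above (the actual implementations may restore
# additional fields such as the scheduler state):
def _resume_checkpoint(self, resume_path):
    """Load a checkpoint written by `_save_checkpoint` and restore training state."""
    checkpoint = torch.load(resume_path, map_location='cpu')
    self.start_epoch = checkpoint['epoch'] + 1
    self.best_score = checkpoint['best_score']
    self.model.load_state_dict(checkpoint['state_dict'])
    self.optimizer.load_state_dict(checkpoint['optimizer'])
    self.logger.debug('Resumed checkpoint: {} (epoch {})'.format(resume_path, checkpoint['epoch']))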
def __init__(self, dirPath, split, config):
    self.img_height = config['img_height']

    # with open(os.path.join(dirPath, 'sets.json')) as f:
    with open(os.path.join('data', 'sets.json')) as f:
        set_list = json.load(f)[split]

    self.authors = defaultdict(list)
    self.lineIndex = []
    for page_idx, name in enumerate(set_list):
        lines, author = parseXML(os.path.join(dirPath, 'xmls', name + '.xml'))
        authorLines = len(self.authors[author])
        self.authors[author] += [
            (os.path.join(dirPath, 'forms', name + '.png'),) + l for l in lines
        ]
        self.lineIndex += [(author, i + authorLines) for i in range(len(lines))]

    char_set_path = config['char_file']
    with open(char_set_path) as f:
        char_set = json.load(f)
    self.char_to_idx = char_set['char_to_idx']

    self.augmentation = config['augmentation'] if 'augmentation' in config else None
    self.normalized_dir = config['cache_normalized'] if 'cache_normalized' in config else None
    if self.normalized_dir is not None:
        ensure_dir(self.normalized_dir)
    self.warning = False

    # DEBUG
    if 'overfit' in config and config['overfit']:
        self.lineIndex = self.lineIndex[:10]

    self.center = config['center_pad']  # if 'center_pad' in config else True
    self.add_spaces = config['add_spaces'] if 'add_spaces' in config else False
def __init__(self, model, loss, metrics, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.name = config['name']
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']

    self.with_cuda = config['cuda'] and torch.cuda.is_available()
    if config['cuda'] and not torch.cuda.is_available():
        self.logger.warning(
            'Warning: There\'s no CUDA support on this machine, '
            'training is performed on CPU.')

    self.train_logger = train_logger

    self.optimizers = {}
    for optim_name, optim_config in config['optimizers'].items():
        self.optimizers[optim_name] = getattr(optim, optim_config['type'])(
            eval('model.' + optim_name).parameters(), **optim_config['config'])

    self.save_best = config['trainer']['save_best']
    if self.save_best:
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf

    self.start_epoch = 1
    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], self.name)
    ensure_dir(self.checkpoint_dir)
    json.dump(config,
              open(os.path.join(self.checkpoint_dir, 'config.json'), 'w'),
              indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def _create_saving_dir(self, args):
    saving_dir = os.path.join(global_config['trainer']['save_dir'],
                              args.outputs_subdir, global_config['name'])
    if os.path.exists(saving_dir):
        logger.warning(
            f'The saving directory "{saving_dir}" already exists. '
            f'If continued, some files might be overwritten.')
        response = input('Proceed? [y/N] ')
        if response != 'y':
            logger.info('Exit.')
            exit()
    ensure_dir(saving_dir)

    if args.resume is not None:
        link = os.path.join(saving_dir, 'resumed_ckpt.pth')
        if os.path.exists(link):
            os.remove(link)
        # Mark the used resume path by a symbolic link
        os.symlink(os.path.abspath(args.resume), link)

    return saving_dir
def __init__(self, model, loss, metrics, resume, config, train_logger=None):
    self.config = config
    self.logger = logging.getLogger(self.__class__.__name__)
    self.model = model
    self.loss = loss
    self.metrics = metrics
    self.name = config['name']
    self.epochs = config['trainer']['epochs']
    self.save_freq = config['trainer']['save_freq']
    self.verbosity = config['trainer']['verbosity']

    self.with_cuda = config['cuda'] and torch.cuda.is_available()
    if config['cuda'] and not torch.cuda.is_available():
        self.logger.warning('Warning: There\'s no CUDA support on this machine, '
                            'training is performed on CPU.')
    self.device = torch.device('cuda:' + str(config['gpu']) if self.with_cuda else 'cpu')
    self.model = self.model.to(self.device)

    self.train_logger = train_logger
    self.writer = WriterTensorboardX(config)

    self.optimizer = getattr(optim, config['optimizer_type'])(
        filter(lambda p: p.requires_grad, model.parameters()), **config['optimizer'])
    self.lr_scheduler = getattr(optim.lr_scheduler, config['lr_scheduler_type'], None)
    if self.lr_scheduler:
        self.lr_scheduler = self.lr_scheduler(self.optimizer, **config['lr_scheduler'])
        self.lr_scheduler_freq = config['lr_scheduler_freq']

    self.monitor = config['trainer']['monitor']
    self.monitor_mode = config['trainer']['monitor_mode']
    assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
    self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
    self.start_epoch = 1

    self.checkpoint_dir = os.path.join(config['trainer']['save_dir'], self.name)
    # Save configuration into checkpoint directory:
    ensure_dir(self.checkpoint_dir)
    config_save_path = os.path.join(self.checkpoint_dir, 'config.json')
    with open(config_save_path, 'w') as handle:
        json.dump(config, handle, indent=4, sort_keys=False)

    if resume:
        self._resume_checkpoint(resume)
def __init__(self,
             model,
             config,
             test_data_loader,
             begin_time,
             loss_weight,
             # do_predict,
             ):
    # for general
    self.config = config
    self.device = (torch.device('cuda:{}'.format(self.config.device_id))
                   if self.config.use_gpu else torch.device('cpu'))
    # self.do_predict = do_predict

    # for train
    # self.visdom = visdom
    self.model = model.to(self.device)
    self.loss_weight = loss_weight.to(self.device)
    self.loss = self._loss(loss_function=self.config.loss).to(self.device)
    self.optimizer = self._optimizer(lr_algorithm=self.config.lr_algorithm)
    self.lr_scheduler = self._lr_scheduler()

    # for time
    self.begin_time = begin_time

    # for data
    self.test_data_loader = test_data_loader

    # for resume/save path
    self.history = {
        'eval': {
            'loss': [],
            'acc': [],
            'miou': [],
            'time': [],
        },
    }

    self.test_log_path = os.path.join(self.config.test_log_dir, model.name, self.begin_time)
    self.predict_path = os.path.join(self.config.pred_dir, model.name, self.begin_time)
    self.resume_ckpt_path = os.path.join(self.config.save_dir, model.name,
                                         self.begin_time, 'checkpoint-best.pth')

    ensure_dir(self.test_log_path)
    ensure_dir(self.predict_path)