def __init__(self, config, logger, generator, discriminator, gif_generator):
    """GAN trainer setup: devices, generator/discriminator state dicts,
    checkpoint dirs, tensorboard writers, and optional resume.

    :param config: project ConfigParser-like object (dict access + save_dir/log_dir/resume)
    :param logger: preconfigured logger instance
    :param generator: generator bundle passed to ``initialize_training``
    :param discriminator: discriminator bundle passed to ``initialize_training``
    :param gif_generator: helper used to render sample GIFs during training
    """
    self.config = config
    self.logger = logger
    self.gif_generator = gif_generator

    # setup GPU device if available, move model into configured device
    self.device, self.device_ids = self._prepare_device(config['n_gpu'])
    self.generator = self.initialize_training(generator)
    self.discriminator = self.initialize_training(discriminator)
    self.generator['config'] = config['generator']
    self.discriminator['config'] = config['discriminator']

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.early_stop = cfg_trainer.get('early_stop', inf)
    self.start_epoch = 1
    self.n_critic = cfg_trainer["n_critic"]
    self.num_samples = cfg_trainer.get('num_samples', 0)
    # latent vector width comes from the generator architecture config
    self.z_size = config['generator']['arch']['args']['input_size']

    if cfg_trainer.get('fixed_image', False):
        # BUG FIX: original referenced bare `z_size` (NameError); the latent
        # size is stored on self.z_size just above.
        self.default_z = Variable(
            torch.randn(self.num_samples, self.z_size)).to(self.device)
    else:
        self.default_z = None

    self.generator['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('generator', 'off'))
    self.discriminator['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('discriminator', 'off'))

    self.generator['checkpoint_dir'] = config.save_dir / "generator"
    self.generator['checkpoint_dir'].mkdir(parents=True, exist_ok=True)
    self.discriminator[
        'checkpoint_dir'] = config.save_dir / "discriminator"
    self.discriminator['checkpoint_dir'].mkdir(parents=True, exist_ok=True)

    self.generator['writer'] = TensorboardWriter(
        config.log_dir / 'generator', self.logger, cfg_trainer['tensorboard'])
    self.discriminator['writer'] = TensorboardWriter(
        config.log_dir / 'discriminator', self.logger,
        cfg_trainer['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint('generator', config.resume['generator'])
        self._resume_checkpoint('discriminator', config.resume['discriminator'])
def __init__(self, config, disc_model, disc_loss, disc_optimizer, gen_model,
             gen_loss, gen_optimizer):
    """GAN trainer setup: wires discriminator and generator bases, moves both
    models to the configured device and parallelizes them across GPUs.

    :param config: project ConfigParser-like object
    :param disc_model/disc_loss/disc_optimizer: discriminator triple
    :param gen_model/gen_loss/gen_optimizer: generator triple
    """
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['level'])
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.epochs = config['trainer']['epochs']
    self.disc_gen_ratio = config['trainer']['disc_gen_ratio']
    self.save_period = config['trainer']['save_period']
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    config['trainer']['tensorboard'])

    BaseDiscriminator.__init__(self, disc_model, disc_loss, disc_optimizer)
    BaseGenerator.__init__(self, gen_model, gen_loss, gen_optimizer)

    self.disc_model = self.disc_model.to(self.device)
    self.gen_model = self.gen_model.to(self.device)
    if len(device_ids) > 1:
        self.disc_model = torch.nn.DataParallel(self.disc_model,
                                                device_ids=device_ids)
        # BUG FIX: original wrapped disc_model twice and never parallelized
        # the generator.
        self.gen_model = torch.nn.DataParallel(self.gen_model,
                                               device_ids=device_ids)
def __init__(self, model, criterion: TrackingLoss, optimizer, config):
    """Store the training triple and read bookkeeping options from config."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    self.model = model
    self.criterion = criterion
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # Decide whether a metric is tracked for best-model selection.
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)
        if self.early_stop <= 0:
            # non-positive patience means "never stop early"
            self.early_stop = inf

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # Tensorboard visualization writer.
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, models, criterion, metric_ftns, optimizers, config):
    """Ensemble trainer setup: one model/optimizer pair per ensemble member,
    with per-member loss/metric history and per-member best scores."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])
    self.n_ensembles = len(models)

    # Per-member history keys: loss_<i> plus <metric>_<i> for every metric.
    self.keys = [
        *['loss_' + str(i) for i in range(self.n_ensembles)],
        *[m.__name__ + '_' + str(i)
          for m in metric_ftns for i in range(self.n_ensembles)],
    ]
    self.log = {key: [] for key in self.keys}

    # setup GPU device if available, move each model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.models = []
    for member in models:
        member = member.to(self.device)
        if len(device_ids) > 1:
            member = torch.nn.DataParallel(member, device_ids=device_ids)
        self.models.append(member)

    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizers = optimizers

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        # e.g. "min val_loss"
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        # One best-score slot per ensemble member.
        self.mnt_best = np.empty(self.n_ensembles)
        self.mnt_best.fill(np.inf if self.mnt_mode == 'min' else -np.inf)
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.start_epochs = np.ones(self.n_ensembles)
    self.checkpoint_dir = config.save_dir
    self.log_dir = config.log_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, loss, metrics, optimizer, config):
    """Base trainer setup.

    :param model: neural network to train
    :param loss: objective function to optimize
    :param metrics: list of metric callables evaluated during validation
    :param optimizer: optimizer instance (e.g. Adam)
    :param config: ConfigParser object
    """
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']      # checkpoint cadence
    self.monitor = trainer_cfg.get('monitor', 'off')   # best-model tracking

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        # patience, in epochs, before early stopping
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    # how many of the most recent checkpoints to retain on disk
    self.keep_last = config['trainer']['keep_last']
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config,
             train_sampler=None):
    """Distributed-aware trainer setup.

    :param model: network to train
    :param criterion: loss function
    :param metric_ftns: metric callables for validation
    :param optimizer: optimizer instance
    :param config: project ConfigParser-like object
    :param train_sampler: optional DistributedSampler for the train loader
    """
    self.config = config
    # Rank-qualified logger name so each DDP worker logs separately.
    if dist.is_initialized():
        logger_name = "{}{}".format(__name__, dist.get_rank())
    else:
        logger_name = __name__
    self.logger = get_logger(name=logger_name, log_dir=config.log_dir,
                             verbosity=config['trainer']['verbosity'])

    self.model = model
    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.monitor = cfg_trainer.get('monitor', 'off')
    self.train_sampler = train_sampler

    # configuration to monitor model performance and save best
    if not is_master() or self.monitor == 'off':
        self.mnt_mode = 'off'
        # BUG FIX: mnt_best was left unset on this path; the sibling
        # trainers all default it to 0 when monitoring is disabled, and
        # checkpoint bookkeeping reads it later.
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = cfg_trainer.get('early_stop', inf)
        if self.early_stop <= 0:
            self.early_stop = inf

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance — only the master rank writes
    if is_master():
        self.writer = TensorboardWriter(config, cfg_trainer['tensorboard'])
    else:
        self.writer = TensorboardWriter(config, False)

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, metrics, optimizer, config, train_dataset,
             valid_datasets, lr_scheduler=None):
    """Trainer with one validation data loader per corpus.

    Builds validation loaders sized to match the training batch size, and
    derives a log step from the training set size.
    """
    super().__init__(model, metrics, optimizer, config, train_dataset)
    self.config = config

    # Validation loaders reuse the training batch size.
    self.config['data_loaders']['valid']['args']['batch_size'] = \
        self.data_loader.batch_size
    self.valid_data_loaders = {}
    for corpus, dataset in valid_datasets.items():
        self.valid_data_loaders[corpus] = config.init_obj(
            'data_loaders.valid', module_loader, dataset)

    self.lr_scheduler = lr_scheduler
    # Log roughly 200 times per epoch, scaled by sqrt(batch size).
    self.log_step = math.ceil(
        len(self.data_loader.dataset)
        / np.sqrt(self.data_loader.batch_size)
        / 200)
    self.writer = TensorboardWriter(config.log_dir)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Trainer setup that also supports cRT (classifier re-training) loading
    and state-dict-only resume.

    :param model: network to train (kept unwrapped on ``self.real_model``)
    :param criterion: loss module, moved to the compute device
    :param metric_ftns: metric callables
    :param optimizer: optimizer instance
    :param config: project ConfigParser-like object
    """
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.device_ids = device_ids
    self.model = model
    self.model = self.model.to(self.device)
    # keep a handle to the unwrapped model for checkpointing etc.
    self.real_model = self.model
    if len(self.device_ids) > 1:
        self.model = torch.nn.DataParallel(self.model, device_ids=device_ids)

    self.criterion = criterion.to(self.device)
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = cfg_trainer.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    cfg_trainer['tensorboard'])

    if config.load_crt is not None:
        # CONSISTENCY FIX: was a bare print(); every other message in this
        # file goes through the configured logger.
        self.logger.info("Loading from cRT pretrain: {}".format(config.load_crt))
        self._load_crt(config.load_crt)

    if config.resume is not None:
        state_dict_only = config._config.get("resume_state_dict_only", False)
        self._resume_checkpoint(config.resume, state_dict_only=state_dict_only)
def __init__(
    self,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    metric_ftns: List[Callable[..., float]],
    optimizer: torch.optim.Optimizer,
    config: ConfigParser,
    lr_scheduler: Union[torch.optim.lr_scheduler._LRScheduler,
                        torch.optim.lr_scheduler.ReduceLROnPlateau,
                        None,
                        ] = None,
):
    """Typed trainer setup with optional LR scheduler and last-epoch saving."""
    self.config = config
    self.logger = config.get_logger("trainer", config["trainer"]["verbosity"])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config["n_gpu"])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer
    self.lr_scheduler = lr_scheduler

    trainer_cfg = config["trainer"]
    self.epochs = trainer_cfg["epochs"]
    self.save_period = trainer_cfg["save_period"]
    self.monitor = trainer_cfg.get("monitor", "off")
    # whether to also keep a checkpoint of the final epoch
    self.save_last = trainer_cfg.get("save_last", False)

    # configuration to monitor model performance and save best
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop = trainer_cfg.get("early_stop", inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.model_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg["tensorboard"])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, loss, metrics, optimizer, config, mini_train,
             num_keep_ckpts, skip_tboard):
    """Trainer setup with checkpoint-retention and tensorboard-skip options.

    :param mini_train: debug flag; implies tensorboard is skipped
    :param num_keep_ckpts: how many checkpoints to retain
    :param skip_tboard: disable tensorboard logging entirely
    """
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer
    self.num_keep_ckpts = num_keep_ckpts
    # mini training runs never write tensorboard summaries
    self.skip_tboard = skip_tboard or mini_train

    # This property can be overriden in the subclass
    self.skip_first_n_saves = 0

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')
    self.save_only_best = trainer_cfg.get("save_only_best", True)

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance (one summary dir per seed);
    # NOTE(review): self.writer stays unset when skip_tboard is true.
    if not self.skip_tboard:
        summary_dir = config.log_dir / f"seed-{config['seed']}"
        self.writer = TensorboardWriter(summary_dir, self.logger,
                                        trainer_cfg['tensorboard'])

    self.include_optim_in_ckpts = config["trainer"].get(
        "include_optim_in_ckpts", 1)

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, generator, discriminator, config):
    """GAN trainer setup: per-network state dicts, monitors, checkpoint
    directories and tensorboard writers for generator and discriminator.

    :param generator: generator bundle handed to ``initialize_training``
    :param discriminator: discriminator bundle handed to ``initialize_training``
    :param config: project ConfigParser-like object
    """
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])
    # setup GPU device if available, move model into configured device
    self.device, self.device_ids = self._prepare_device(config['n_gpu'])
    # initialize_training returns a dict per network; NOTE(review): it is
    # assumed (not visible here) to populate the 'logger' key read below —
    # confirm against the helper's implementation.
    self.generator = self.initialize_training(generator)
    self.discriminator = self.initialize_training(discriminator)
    self.generator['config'] = config['generator']
    self.discriminator['config'] = config['discriminator']
    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.early_stop = cfg_trainer.get('early_stop', inf)
    self.start_epoch = 1
    # independent monitors so each network tracks its own best metric
    self.generator['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('generator', 'off'))
    self.discriminator['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('discriminator', 'off'))
    # separate checkpoint subdirectories per network
    self.generator['checkpoint_dir'] = os.path.join(
        config.save_dir, "generator")
    self.discriminator['checkpoint_dir'] = os.path.join(
        config.save_dir, "discriminator")
    # separate tensorboard writers per network
    self.generator['writer'] = TensorboardWriter(
        config.log_dir / 'generator', self.generator['logger'],
        cfg_trainer['tensorboard'])
    self.discriminator['writer'] = TensorboardWriter(
        config.log_dir / 'discriminator', self.discriminator['logger'],
        cfg_trainer['tensorboard'])
    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Trainer setup supporting both resume (full state) and plain weight
    loading via ``config.load``."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    # NOTE(review): if the criterion is ever a stateful nn.Module (rather
    # than a plain function) it may need to be moved to self.device as well.
    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
    elif config.load is not None:
        self._load_checkpoint(config.load)
def __init__(self, model, criterion, metrics, optimizer, config):
    """Trainer setup with gradient accumulation and interval validation."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move models into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    # For training
    self.criterion = criterion.to(self.device)
    self.metrics = metrics
    self.optimizer = optimizer
    self.metric_names = [fn.__name__ for fn in self.metrics]

    # Hyper parameters for the trainer
    trainer_cfg = config['trainer']
    self.accumulation_steps = trainer_cfg['accumulation_steps']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')
    self.do_validation = trainer_cfg['do_validation']
    self.do_validation_interval = trainer_cfg['do_validation_interval']
    self.log_step = trainer_cfg['log_step']
    # falsy config value collapses to None
    self.save_for_track = trainer_cfg['save_for_track'] or None

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # configuration to monitor models performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Trainer setup that also keeps an in-memory history (``self.log``)
    of the loss and every metric."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # History keys: loss plus one entry per metric function.
    self.keys = ['loss', *[m.__name__ for m in metric_ftns]]
    self.log = {key: [] for key in self.keys}

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir
    self.log_dir = config.log_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, optimizer, metric_fcns, config):
    """Base trainer: device placement, monitoring config, writer, resume."""
    self.config = config
    self.logger = config.get_logger(
        name="trainer", verbosity=config["trainer"]["verbosity"])

    # setup GPU device if available, move model onto it
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    # parallelize across GPUs when several are configured
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(module=model,
                                           device_ids=device_ids)

    self.criterion = criterion
    self.optimizer = optimizer
    self.metric_fcns = metric_fcns

    # trainer configuration
    trainer_cfg = config["trainer"]
    self.epochs = trainer_cfg["epochs"]
    # checkpoint save cadence (in epochs)
    self.save_period = trainer_cfg["save_period"]
    self.monitor = trainer_cfg.get("monitor", "off")

    # configuration to monitor model performance and save the best model
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop = trainer_cfg.get("early_stop", inf)

    self.start_epoch = 1  # epochs count from 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(
        log_dir=config.log_dir,
        logger=self.logger,
        enabled=trainer_cfg["tensorboard"],
    )

    # resume from a checkpoint when one is configured
    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Standard trainer setup: device, monitoring, writer, optional resume."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available; device_ids feed DataParallel below
    self.device, device_ids = self._prepare_device(
        config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns  # validation metric callables
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    # save a checkpoint every save_period epochs
    self.save_period = trainer_cfg['save_period']
    # dict.get: fall back to 'off' when no monitor is configured
    self.monitor = trainer_cfg.get(
        'monitor', 'off'
    )

    # configuration to monitor model performance and save the best model
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        # e.g. "min val_loss" -> mode + metric name
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in [
            'min', 'max'
        ]
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        # early-stopping patience (epochs)
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, config):
    """Trainer that builds its own optimizer from the config over the
    model's trainable parameters."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    # model-specific initialization hook before device placement
    model.initialize(config, self.device)
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns

    # Optimizer is constructed here from config, over trainable params only.
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    self.optimizer = config.init_obj('optimizer', torch.optim,
                                     trainable_params)

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, loss, metrics, optimizer, config):
    """Trainer setup with optional tensorboard (writer may stay None)."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    # TODO: Check if this helps
    torch.backends.cudnn.benchmark = True

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # Writer is only created when tensorboard is enabled; callers must
    # handle self.writer being None.
    self.writer = None
    if trainer_cfg[
            'tensorboard']:  # TODO should just move tensorboard writer directly in here
        self.writer = TensorboardWriter(config.log_dir, self.logger,
                                        trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Trainer setup that logs the full model/optimizer/criterion state
    at construction time."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')

    # best-model monitoring configuration
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)

    # Log the construction-time training state for reproducibility.
    self.logger.info("Model params: {}".format(self.model))
    self.logger.info("Current Opitmizer state: {}".format(self.optimizer))
    self.logger.info("Criterion: {}".format(self.criterion.__name__))
    for metric in self.metric_ftns:
        self.logger.info("Tracking metric {}".format(metric.__name__))
def __init__(self, torch_objs: dict, save_dir, **kwargs):
    """Trainer setup driven by a bundle of pre-built torch objects.

    :param torch_objs: dict with 'data_loaders', 'models', 'losses',
        'metrics' (iter/epoch/threshold), 'optimizers', 'lr_schedulers', 'amp'
    :param save_dir: dict with 'model' and 'log' output directories
    :param kwargs: remaining JSON config keys, set verbatim as attributes
        (expects at least: verbosity, early_stop, monitor, tensorboard)
    """
    # data_loaders
    self.train_data_loaders = torch_objs["data_loaders"]["train"]
    self.valid_data_loaders = torch_objs["data_loaders"]["valid"]
    # models
    self.models = torch_objs["models"]
    # losses
    self.losses = torch_objs["losses"]
    # metrics
    self.metrics_iter = torch_objs["metrics"]["iter"]
    self.metrics_epoch = torch_objs["metrics"]["epoch"]
    self.metrics_threshold = torch_objs["metrics"]["threshold"]
    # optimizers
    self.optimizers = torch_objs["optimizers"]
    # lr_schedulers
    self.lr_schedulers = torch_objs["lr_schedulers"]
    # amp
    self.amp = torch_objs["amp"]

    self.model_dir = save_dir["model"]

    # set json kwargs to self.{kwargs}
    for key, value in kwargs.items():
        setattr(self, key, value)

    self.logger = get_logger("trainer", verbosity=self.verbosity)

    # BUG FIX: the None test must come first — the original evaluated
    # `self.early_stop <= 0` before checking for None, which raises
    # TypeError when early_stop is None.
    if self.early_stop is None or self.early_stop <= 0:
        self.early_stop = np.inf

    self.start_epoch = 1

    # configuration to monitor model performance and save best
    self.num_best = 0
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = np.inf if self.mnt_mode == "min" else -np.inf

    # setup visualization writer instance
    self.writer = TensorboardWriter(save_dir["log"], self.logger,
                                    self.tensorboard)

    # default decision threshold for thresholded metrics
    if self.metrics_threshold is not None:
        self.threshold = 0.5
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Standard trainer setup (logger, device, monitoring, writer, resume)."""
    self.config = config
    # create the trainer logger
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # pick GPU(s) or CPU and move the model there
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    # loss, metrics and optimizer
    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    # number of training epochs
    self.epochs = trainer_cfg['epochs']
    # checkpoint every save_period epochs
    self.save_period = trainer_cfg['save_period']
    # falls back to 'off' when no monitor key is configured
    self.monitor = trainer_cfg.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        # best score observed so far (inf/-inf sentinel)
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, config, data_loader, losses, transformation_module,
             registration_module, metrics):
    """Registration trainer setup: losses, transformation/registration
    modules, VI and MCMC schedules, tensorboard metric tracking.

    :param config: project config object (dict access + logger/log_dir/...)
    :param data_loader: loader exposing ``save_dirs``
    :param losses: dict with 'data' / 'reg' loss dicts and an 'entropy' loss
    :param transformation_module: module producing the transformation
    :param registration_module: module applying the registration
    :param metrics: iterable of metric names fed to MetricTracker
    """
    self.config = config
    self.logger = config.logger
    # NOTE(review): device is hard-coded to the first CUDA device — confirm
    # this trainer is never meant to run on CPU.
    self.device = 'cuda:0'
    self.data_loader = data_loader
    self.structures_dict = self.config.structures_dict
    self.save_dirs = self.data_loader.save_dirs

    # losses — each sub-loss is moved to the compute device
    self.losses = dict()
    self.losses['data'] = {k: loss.to(self.device)
                           for k, loss in losses['data'].items()}
    self.losses['reg'] = {k: loss.to(self.device)
                          for k, loss in losses['reg'].items()}
    self.losses['entropy'] = losses['entropy'].to(self.device)

    # transformation and registration modules
    self.transformation_module = transformation_module.to(self.device)
    self.registration_module = registration_module.to(self.device)

    # differential operator for use with the transformation Jacobian
    self.diff_op = self.losses['reg']['loss'].diff_op

    # model logic — variational inference (VI) schedule
    cfg_trainer = config['trainer']
    self.VI = cfg_trainer['VI']
    self.start_iter_VI, self.no_iters_VI = 1, int(cfg_trainer['no_iters_VI'])
    self.no_samples_VI_test = int(cfg_trainer['no_samples_VI_test'])
    self.log_period_VI = cfg_trainer['log_period_VI']

    # MCMC schedule
    self.MCMC = cfg_trainer['MCMC']
    self.MCMC_init = cfg_trainer['MCMC_init']  # NOTE (DG): one of ['VI', 'identity', 'noise']
    self.no_chains = int(cfg_trainer['no_chains'])
    self.no_samples_MCMC = int(cfg_trainer['no_samples_MCMC'])
    self.no_iters_burn_in = int(cfg_trainer['no_iters_burn_in'])
    self.log_period_MCMC = cfg_trainer['log_period_MCMC']

    # metrics and prints — writer also records the hyperparameter string
    self.writer = TensorboardWriter(config.log_dir, cfg_trainer['tensorboard'])
    self.writer.write_hparams(config.config_str)
    self.metrics = MetricTracker(*[m for m in metrics], writer=self.writer)
def __init__(self, model, optimizer, config):
    """Trainer setup where _prepare_device returns a device list
    (first element -1 means CPU-only)."""
    self.config = config
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    # setup GPU device if available, move model into configured device
    self.device = self._prepare_device(config['n_gpu'])
    self.n_gpu_use = len(self.device)
    # device[0] == -1 signals no GPU: keep the model on CPU
    if self.device[0] != -1:
        self.model = model.to(self.device[0])
    else:
        self.model = model
    self.optimizer = optimizer

    trainer_cfg = config['trainer']
    self.epochs = trainer_cfg['epochs']
    self.save_period = trainer_cfg['save_period']
    self.monitor = trainer_cfg.get('monitor', 'off')
    # optional gradient clipping value
    self.clip = trainer_cfg.get('clip')

    # configuration to monitor model performance and save best
    self.best_log = None
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger,
                                    trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)

    self.best_state = None
def __init__(self, model, loss, metrics, optimizer, config, use_apex=True):
    """Trainer setup with optional NVIDIA apex mixed-precision initialization.

    :param model: model to train, moved to the prepared device (DataParallel if multi-GPU)
    :param loss: loss function
    :param metrics: metric functions
    :param optimizer: optimizer instance
    :param config: project configuration object
    :param use_apex: wrap model/optimizer with ``amp.initialize`` (opt level O1) when True
    """
    self.config = config
    # BUG FIX: the TensorboardWriter call below used a bare `logger` name that is
    # never defined in this method (NameError at runtime). Create the trainer
    # logger here, as every sibling trainer in this file does.
    self.logger = config.get_logger('trainer', config['trainer'].get('verbosity', 2))

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config['n_gpu'])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.loss = loss
    self.metrics = metrics
    self.optimizer = optimizer

    if use_apex:
        # apex amp rewrites model/optimizer in place for mixed precision
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1")

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.monitor = cfg_trainer.get('monitor', 'off')

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.mnt_mode = 'off'
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = cfg_trainer.get('early_stop', inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance (now receives the logger created above)
    self.writer = TensorboardWriter(config.log_dir, self.logger, cfg_trainer['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Initialize the trainer: device, model placement, monitoring and checkpoint state."""
    self.config = config
    self.logger = config.get_logger("trainer", config["trainer"]["verbosity"])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config["n_gpu"])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    trainer_cfg = config["trainer"]
    self.epochs = trainer_cfg["epochs"]
    self.save_period = trainer_cfg["save_period"]
    self.monitor = trainer_cfg.get("monitor", "off")

    # configuration to monitor model performance and save best
    if self.monitor != "off":
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop = trainer_cfg.get("early_stop", inf)
    else:
        self.mnt_mode = "off"
        self.mnt_best = 0

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger, trainer_cfg["tensorboard"])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Initialize trainer state: training schedule, monitor, checkpointing, TensorBoard."""
    self.config = config  # ConfigParser instance
    self.logger = config.get_logger('trainer', config['trainer']['verbosity'])

    self.model = model
    self.criterion = criterion  # loss function
    self.metric_ftns = metric_ftns  # evaluation metrics
    self.optimizer = optimizer  # optimizer

    trainer_cfg = config['trainer']  # trainer section of the config
    self.epochs = trainer_cfg['epochs']  # total number of epochs
    self.save_period = trainer_cfg['save_period']  # checkpoint save frequency
    self.monitor = trainer_cfg.get('monitor', 'off')  # performance monitor spec

    # configuration to monitor model performance and save best
    if self.monitor != 'off':
        # e.g. "min val_loss" -> mode plus the metric to watch
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ['min', 'max']
        self.mnt_best = inf if self.mnt_mode == 'min' else -inf
        self.early_stop = trainer_cfg.get('early_stop', inf)  # early stopping patience
        if self.early_stop <= 0:
            # non-positive patience disables early stopping
            self.early_stop = inf
    else:
        self.mnt_mode = 'off'
        self.mnt_best = 0

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance (TensorBoard)
    self.writer = TensorboardWriter(config.log_dir, self.logger, trainer_cfg['tensorboard'])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, criterion, metric_ftns, optimizer, config):
    """Set up trainer bookkeeping: epochs, best-metric monitor, checkpoint dir, writer."""
    self.config = config
    self.logger = config.get_logger("trainer", config["trainer"]["verbosity"])

    self.model = model
    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer

    trainer_cfg = config["trainer"]
    self.epochs = trainer_cfg["epochs"]
    self.save_period = trainer_cfg["save_period"]
    self.monitor = trainer_cfg.get("monitor", "off")

    # configuration to monitor model performance and save best
    if self.monitor != "off":
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop = trainer_cfg.get("early_stop", inf)
        if self.early_stop <= 0:
            # a non-positive patience means "never stop early"
            self.early_stop = inf
    else:
        self.mnt_mode = "off"
        self.mnt_best = 0

    self.start_epoch = 1
    self.checkpoint_dir = config.save_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger, trainer_cfg["tensorboard"])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, model, optimizer, config, data_loader, valid_data_loader=None, lr_scheduler=None, max_len_step=None):
    '''
    Distributed-aware trainer setup: rank bookkeeping, device placement,
    monitor configuration, checkpoint resume, SyncBatchNorm/DDP wrapping,
    and step/validation interval computation.

    :param model: model to train; moved to the prepared device and possibly wrapped in DDP
    :param optimizer: optimizer instance
    :param config: project configuration (dict-style access plus save_dir/log_dir/resume)
    :param data_loader: training data loader
    :param valid_data_loader: optional validation loader; enables validation when given
    :param lr_scheduler: optional learning-rate scheduler
    :param max_len_step: controls number of batches(steps) in each epoch.
    '''
    self.config = config
    self.distributed = config['distributed']
    if self.distributed:
        # local master: rank 0 on this node; global master: rank 0 overall.
        # Only masters log / write checkpoints / run TensorBoard.
        self.local_master = (config['local_rank'] == 0)
        self.global_master = (dist.get_rank() == 0)
    else:
        self.local_master = True
        self.global_master = True
    # Non-master ranks get no logger at all (callers must guard on local_master).
    self.logger = config.get_logger(
        'trainer', config['trainer']['log_verbosity']) if self.local_master else None

    # setup GPU device if available, move model into configured device
    self.device, self.device_ids = self._prepare_device(
        config['local_rank'], config['local_world_size'])
    self.model = model.to(self.device)

    self.optimizer = optimizer

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    # monitor_open is a master switch; when off, monitoring is disabled regardless
    # of the 'monitor' setting.
    monitor_open = cfg_trainer['monitor_open']
    if monitor_open:
        self.monitor = cfg_trainer.get('monitor', 'off')
    else:
        self.monitor = 'off'

    # configuration to monitor model performance and save best
    if self.monitor == 'off':
        self.monitor_mode = 'off'
        self.monitor_best = 0
    else:
        # e.g. "min val_loss" -> mode plus the metric name to watch
        self.monitor_mode, self.monitor_metric = self.monitor.split()
        assert self.monitor_mode in ['min', 'max']
        self.monitor_best = inf if self.monitor_mode == 'min' else -inf
        self.early_stop = cfg_trainer.get('early_stop', inf)
        # -1 is the sentinel for "no early stopping"
        self.early_stop = inf if self.early_stop == -1 else self.early_stop

    self.start_epoch = 1

    if self.local_master:
        self.checkpoint_dir = config.save_dir
        # setup visualization writer instance
        self.writer = TensorboardWriter(config.log_dir, self.logger,
                                        cfg_trainer['tensorboard'])

    # load checkpoint for resume training
    if config.resume is not None:
        self._resume_checkpoint(config.resume)

    # load checkpoint following load to multi-gpu, avoid 'module.' prefix
    # (resume happens BEFORE DDP wrapping so state-dict keys match the bare model)
    if self.config['trainer']['sync_batch_norm'] and self.distributed:
        # SyncBatchNorm must also be converted before DDP wraps the model
        self.model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
            self.model)
    if self.distributed:
        self.model = DDP(self.model,
                         device_ids=self.device_ids,
                         output_device=self.device_ids[0],
                         find_unused_parameters=True)

    self.data_loader = data_loader
    if max_len_step is None:  # max length of iteration step of every epoch
        # epoch-based training
        self.len_step = len(self.data_loader)
    else:
        # iteration-based training: wrap the loader into an endless iterator
        self.data_loader = inf_loop(data_loader)
        self.len_step = max_len_step
    self.valid_data_loader = valid_data_loader
    self.do_validation = self.valid_data_loader is not None
    self.lr_scheduler = lr_scheduler

    # log every `log_step` batches; -1 or out-of-range falls back to sqrt(batch_size)
    log_step = self.config['trainer']['log_step_interval']
    self.log_step = log_step if log_step != -1 and 0 < log_step < self.len_step else int(
        np.sqrt(data_loader.batch_size))

    val_step_interval = self.config['trainer']['val_step_interval']
    # self.val_step_interval = val_step_interval if val_step_interval!= -1 and 0 < val_step_interval < self.len_step\
    #     else int(np.sqrt(data_loader.batch_size))
    self.val_step_interval = val_step_interval

    # weight of the graph-learning loss term in the total loss
    self.gl_loss_lambda = self.config['trainer']['gl_loss_lambda']

    # only the local master attaches a writer to the loss tracker
    self.train_loss_metrics = MetricTracker(
        'loss', 'gl_loss', 'crf_loss',
        writer=self.writer if self.local_master else None)
    self.valid_f1_metrics = SpanBasedF1MetricTracker(iob_labels_vocab_cls)
def __init__(
    self,
    model: Module,
    loss_fn: Callable,
    loss_args: Dict[str, Any],
    metric_fns: List[Callable],
    metric_args: List[Dict[str, Any]],
    optimizer: Optimizer,
    config: ConfigParser,
):
    """Initialize the trainer: device setup, model placement, loss/metric wiring,
    monitor configuration, checkpointing, and TensorBoard."""
    self.config: ConfigParser = config
    self.logger: Logger = config.get_logger("trainer", config["trainer"]["verbosity"])

    # Setup GPU device if available and move the model onto it.
    self.device: torch.device
    device_ids: List[int]
    self.device, device_ids = self._prepare_device(config["n_gpu"])
    self.model: Module = model.to(self.device)
    if len(device_ids) > 1:
        # Replicate across all visible GPUs.
        self.model = DataParallel(model, device_ids=device_ids)

    # Loss function with its keyword arguments.
    self.loss_fn: Callable = loss_fn
    self.loss_args: Dict[str, Any] = loss_args

    # Metric functions, each paired positionally with its argument dict.
    self.metric_fns: List[Callable] = metric_fns
    self.metric_args: List[Dict[str, Any]] = metric_args

    self.optimizer: Optimizer = optimizer

    # Training schedule.
    trainer_cfg: Dict[str, Any] = config["trainer"]
    self.epochs: int = trainer_cfg["epochs"]
    self.save_period: int = trainer_cfg["save_period"]
    self.monitor: str = trainer_cfg.get("monitor", "off")

    # Configuration to monitor model performance and save best.
    if self.monitor != "off":
        self.mnt_metric: str
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]
        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop: float = trainer_cfg.get("early_stop", inf)
    else:
        self.mnt_mode: str = "off"
        self.mnt_best: float = 0

    self.start_epoch: int = 1
    self.checkpoint_dir: Path = config.save_dir

    # Setup visualization writer instance.
    self.writer = TensorboardWriter(config.log_dir, self.logger, trainer_cfg["tensorboard"])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def __init__(self, config, logger, generator, discriminator, encoder, valid_data_loader=None):
    """Set up a generator/discriminator/encoder (InfoGAN-style) trainer.

    Each of the three models is tracked as a dict carrying its 'model',
    'config', 'monitor', 'checkpoint_dir' and 'writer' entries.

    :param config: project configuration object
    :param logger: logger instance shared by the trainer and its writers
    :param generator: generator bundle (dict-like; exposes ['model'])
    :param discriminator: discriminator bundle
    :param encoder: encoder bundle; assumed already initialized for training,
        since it is not passed through initialize_training like the other two
        — TODO confirm
    :param valid_data_loader: optional validation loader for the Evaluator
    """
    self.config = config
    self.logger = logger
    self.evaluator = Evaluator(config, logger, generator, discriminator,
                               encoder, valid_data_loader)

    # setup GPU device if available, move model into configured device
    self.device, self.device_ids = self._prepare_device(config['n_gpu'])
    self.generator = self.initialize_training(generator)
    self.discriminator = self.initialize_training(discriminator)
    self.encoder = encoder

    # per-model sub-configs
    self.generator['config'] = config['generator']
    self.discriminator['config'] = config['discriminator']
    self.encoder['config'] = config['encoder']
    # InfoGAN loss weights for the categorical and continuous latent codes
    self.lambda_cat = self.encoder['config']['lambda_cat']
    self.lambda_con = self.encoder['config']['lambda_con']

    cfg_trainer = config['trainer']
    self.epochs = cfg_trainer['epochs']
    self.save_period = cfg_trainer['save_period']
    self.early_stop = cfg_trainer.get('early_stop', inf)
    self.start_epoch = 1

    # latent-space layout taken from the generator architecture args
    self.latent_dim = config['generator']['arch']['args']['latent_dim']
    self.cat_dim = config['generator']['arch']['args']['cat_dim']
    self.cont_dim = config['generator']['arch']['args']['cont_dim']

    # sample grid generator: one row per categorical class
    self.sample_generator = SampleGenerator(config.output_dir,
                                            generator['model'],
                                            self.latent_dim,
                                            self.cat_dim,
                                            self.cont_dim,
                                            n_row=self.cat_dim)

    # weight initialization (generator and discriminator only)
    self.generator['model'].apply(init_weights_to_normal)
    self.discriminator['model'].apply(init_weights_to_normal)

    # per-model performance monitors ('off' when not configured)
    self.generator['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('generator', 'off'))
    self.discriminator['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('discriminator', 'off'))
    self.encoder['monitor'] = self.initialize_monitor(
        cfg_trainer['monitor'].get('encoder', 'off'))

    # separate checkpoint directories per model
    self.generator['checkpoint_dir'] = config.save_dir / "generator"
    self.generator['checkpoint_dir'].mkdir(parents=True, exist_ok=True)
    self.discriminator[
        'checkpoint_dir'] = config.save_dir / "discriminator"
    self.discriminator['checkpoint_dir'].mkdir(parents=True, exist_ok=True)
    self.encoder['checkpoint_dir'] = config.save_dir / "encoder"
    self.encoder['checkpoint_dir'].mkdir(parents=True, exist_ok=True)

    # separate TensorBoard writers per model
    self.generator['writer'] = TensorboardWriter(
        config.log_dir / 'generator', self.logger,
        cfg_trainer['tensorboard'])
    self.discriminator['writer'] = TensorboardWriter(
        config.log_dir / 'discriminator', self.logger,
        cfg_trainer['tensorboard'])
    self.encoder['writer'] = TensorboardWriter(config.log_dir / 'encoder',
                                               self.logger,
                                               cfg_trainer['tensorboard'])

    # NOTE(review): only generator and discriminator are resumed here; the
    # encoder checkpoint is never restored even though its checkpoint_dir is
    # created above — confirm whether config.resume carries an 'encoder' entry.
    if config.resume is not None:
        self._resume_checkpoint('generator', config.resume['generator'])
        self._resume_checkpoint('discriminator',
                                config.resume['discriminator'])