def main(cfg_dict: DictConfig):
    # TODO: erase previous logs in the folder at every run
    config = ConfigParser(cfg_dict)
    logger = config.get_logger('train')

    # setup data_loader instances
    data_loader = config.init_obj('data_loader', module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.init_obj('arch', module_arch)
    # logger.info(model)

    # get function handles of loss and metrics
    criterion = getattr(module_loss, config['loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer, learning rate scheduler. delete every line containing lr_scheduler to disable the scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.init_obj('optimizer', torch.optim, trainable_params)
    lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    trainer = Trainer(model, criterion, metrics, optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
def main(config: ConfigParser):
    # Get a logging.getLogger; the default log level is DEBUG.
    logger = config.get_logger('train')

    # Data module.
    # Look up the loader name read from config.json and instantiate it with the arguments from the JSON.
    data_loader = config.init_obj('data_loader', module_data)
    valid_data_loader = data_loader.split_validation()

    # Model module.
    model = config.init_obj('arch', module_arch)
    logger.info(model)

    # Loss and metrics module.
    criterion = getattr(module_loss, config['loss'])
    # These hold functions (or possibly classes); their names can be retrieved via __name__.
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # Optimizer module.
    # filter drops falsy values (here: parameters that do not require gradients).
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.init_obj('optimizer', torch.optim, trainable_params)

    # Learning rate decay schedule.
    lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    # Train the model.
    trainer = Trainer(model, criterion, metrics, optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
def main(config: ConfigParser): logger = config.get_logger("train") # setup data_loader instances data_loader = config.init_obj("data_loader", module_data) valid_data_loader = data_loader.split_validation() # build model architecture, then print to console model = config.init_obj("arch", module_arch) logger.info(model) # get function handles of loss and metrics criterion = config.init_obj("criterion", module_criterion) metrics = [getattr(module_metric, met) for met in config["metrics"]] # build optimizer, learning rate scheduler. delete every lines containing lr_scheduler for disabling scheduler trainable_params = filter(lambda p: p.requires_grad, model.parameters()) optimizer = config.init_obj("optimizer", module_optim, trainable_params) lr_scheduler = config.init_obj("lr_scheduler", torch.optim.lr_scheduler, optimizer) trainer = Trainer( model, criterion, metrics, optimizer, config=config, data_loader=data_loader, valid_data_loader=valid_data_loader, lr_scheduler=lr_scheduler, ) trainer.train()
def main(config: ConfigParser) -> None:
    """
    Main training function.

    Parameters
    ----------
    config : parse_config.ConfigParser
        Parsed configuration JSON file.
    """
    logger: Logger = config.get_logger("train")

    # Setup data_loader instances.
    data_loader: DataLoader = config.initialize("data_loader", module_data)
    valid_data_loader: Optional[DataLoader] = data_loader.split_validation()

    # Build model architecture, then print to console.
    model: Module = config.initialize("arch", module_arch)
    logger.info(model)

    # Get function handles of loss and metrics as well as args.
    loss_fn: Callable = getattr(module_loss, config["loss"]["type"])
    loss_args: Dict[str, Any] = config["loss"]["args"]
    metric_fns: List[Callable] = [
        getattr(module_metric, met) for met in config["metrics"]
    ]
    metric_args: List[Dict[str, Any]] = [
        config["metrics"][met] for met in config["metrics"]
    ]

    # Build optimizer, learning rate scheduler.
    # Delete every line containing lr_scheduler to disable the scheduler.
    trainable_params: Iterable[Tensor] = filter(lambda p: p.requires_grad, model.parameters())
    optimizer: Optimizer = config.initialize("optimizer", torch.optim, trainable_params)
    lr_scheduler: Optional = config.initialize("lr_scheduler", torch.optim.lr_scheduler, optimizer)

    trainer: Trainer = Trainer(
        model,
        loss_fn,
        loss_args,
        metric_fns,
        metric_args,
        optimizer,
        config=config,
        data_loader=data_loader,
        valid_data_loader=valid_data_loader,
        lr_scheduler=lr_scheduler,
    )
    trainer.train()
def __init__(
    self,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    metric_ftns: List[Callable[..., float]],
    optimizer: torch.optim.Optimizer,
    config: ConfigParser,
    lr_scheduler: Union[
        torch.optim.lr_scheduler._LRScheduler,
        torch.optim.lr_scheduler.ReduceLROnPlateau,
        None,
    ] = None,
):
    self.config = config
    self.logger = config.get_logger("trainer", config["trainer"]["verbosity"])

    # setup GPU device if available, move model into configured device
    self.device, device_ids = self._prepare_device(config["n_gpu"])
    self.model = model.to(self.device)
    if len(device_ids) > 1:
        self.model = torch.nn.DataParallel(model, device_ids=device_ids)

    self.criterion = criterion
    self.metric_ftns = metric_ftns
    self.optimizer = optimizer
    self.lr_scheduler = lr_scheduler

    cfg_trainer = config["trainer"]
    self.epochs = cfg_trainer["epochs"]
    self.save_period = cfg_trainer["save_period"]
    self.monitor = cfg_trainer.get("monitor", "off")
    self.save_last = cfg_trainer.get("save_last", False)

    # configuration to monitor model performance and save best
    if self.monitor == "off":
        self.mnt_mode = "off"
        self.mnt_best = 0
    else:
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]

        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop = cfg_trainer.get("early_stop", inf)

    self.start_epoch = 1
    self.checkpoint_dir = config.model_dir

    # setup visualization writer instance
    self.writer = TensorboardWriter(config.log_dir, self.logger, cfg_trainer["tensorboard"])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
def entry_point(config: ConfigParser):
    '''
    entry-point function for a single worker, distributed training
    '''
    local_world_size = config['local_world_size']

    # check gpu available
    if torch.cuda.is_available():
        if torch.cuda.device_count() < local_world_size:
            raise RuntimeError(f'the number of GPU ({torch.cuda.device_count()}) is less than '
                               f'the number of processes ({local_world_size}) running on each node')
        local_master = config['local_rank'] == 0
    else:
        raise RuntimeError('CUDA is not available, Distributed training is not supported.')

    if local_master:
        logger = config.get_logger('train')
        logger.info('Distributed training start...')

    # these are the parameters used to initialize the process group
    env_dict = {
        key: os.environ[key]
        for key in ('MASTER_ADDR', 'MASTER_PORT', 'RANK', 'WORLD_SIZE')
    }
    logger.info(f'[Process {os.getpid()}] Initializing process group with: {env_dict}') if local_master else None

    # init process group
    dist.init_process_group(backend='nccl', init_method='env://')
    logger.info(
        f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
        + f'rank = {dist.get_rank()}, backend={dist.get_backend()}'
    ) if local_master else None

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group
    dist.destroy_process_group()
def main(cfg_dict: DictConfig):
    config = ConfigParser(cfg_dict)
    logger = config.get_logger('test')

    # setup data_loader instances
    data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=512,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2)

    # build model architecture
    model = config.init_obj('arch', module_arch)
    logger.info(model)

    # get function handles of loss and metrics
    loss_fn = getattr(module_loss, config['loss'])
    metric_fns = [getattr(module_metric, met) for met in config['metrics']]

    logger.info('Loading checkpoint: {} ...'.format(config['resume']))
    checkpoint = torch.load(config['resume'])
    state_dict = checkpoint['state_dict']
    if config['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    # prepare model for testing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    total_loss = 0.0
    total_metrics = torch.zeros(len(metric_fns))

    with torch.no_grad():
        for i, (data, target) in enumerate(tqdm(data_loader)):
            # TODO: overlap objects with overlap_objects_from_batch in util.oy
            # TODO: check model's output is correct for the loss_fn
            data, target = data.to(device), target.to(device)
            output = model(data)

            #
            # save sample images, or do something with output here
            #

            # computing loss, metrics on test set
            loss = loss_fn(output, target)
            batch_size = data.shape[0]
            total_loss += loss.item() * batch_size
            for j, metric in enumerate(metric_fns):
                total_metrics[j] += metric(output, target) * batch_size

    n_samples = len(data_loader.sampler)
    log = {'loss': total_loss / n_samples}
    log.update({
        met.__name__: total_metrics[i].item() / n_samples
        for i, met in enumerate(metric_fns)
    })
    logger.info(log)
import os
import sys
from pathlib import Path
import re

root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
src_dir = os.path.join(root_dir, "src")
sys.path.insert(0, src_dir)
# change cwd to root dir
os.chdir(root_dir)

from parse_config import ConfigParser
from utils import util_geo

re_img_index = re.compile(r"img\d+")

if __name__ == '__main__':
    config = ConfigParser(ConfigParser.from_file("test/configs/geotest.json"))
    logger = config.get_logger('train')

    data_dir = Path(config['data_loader']['args']['data_dir'])
    data_name = config['data_loader']['args']['data_name']
    img_dir = data_dir / data_name / "RGB-PanSharpen"
    save_dir = data_dir / data_name / 'processed'
    img_save_dir = save_dir / "RGB"
    geojson_dir = data_dir / data_name / "geojson"
    mask_save_dir = save_dir / "labels"
    colors = config['data_loader']['args']["colors"]

    img_save_dir.mkdir(parents=True, exist_ok=True)
    mask_save_dir.mkdir(parents=True, exist_ok=True)

    util_geo.GeoLabelUtil.preprocess(img_dir, geojson_dir, img_save_dir, mask_save_dir, colors)
def main(config: ConfigParser):
    logger = config.get_logger('train')

    data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    # valid_data_loader = data_loader.split_validation()
    valid_data_loader = None

    # test_data_loader = None
    test_data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=128,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2)  # .split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', module_arch)

    # get function handles of loss and metrics
    logger.info(config.config)
    if hasattr(data_loader.dataset, 'num_raw_example'):
        num_examp = data_loader.dataset.num_raw_example
    else:
        num_examp = len(data_loader.dataset)

    train_loss = getattr(module_loss, config['train_loss']['type'])(
        num_examp=num_examp,
        num_classes=config['num_classes'],
        beta=config['train_loss']['args']['beta'])
    val_loss = getattr(module_loss, config['val_loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer, learning rate scheduler. delete every line containing lr_scheduler to disable the scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.initialize('optimizer', torch.optim, [{'params': trainable_params}])
    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    trainer = Trainer(model, train_loss, metrics, optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      test_data_loader=test_data_loader,
                      lr_scheduler=lr_scheduler,
                      val_criterion=val_loss)

    trainer.train()
    logger = config.get_logger('trainer', config['trainer']['verbosity'])
    cfg_trainer = config['trainer']
def __init__(
    self,
    model: Module,
    loss_fn: Callable,
    loss_args: Dict[str, Any],
    metric_fns: List[Callable],
    metric_args: List[Dict[str, Any]],
    optimizer: Optimizer,
    config: ConfigParser,
):
    self.config: ConfigParser = config
    self.logger: Logger = config.get_logger("trainer", config["trainer"]["verbosity"])

    # Setup GPU device if available.
    self.device: torch.device
    device_ids: List[int]
    self.device, device_ids = self._prepare_device(config["n_gpu"])

    # Move model into configured device(s).
    self.model: Module = model.to(self.device)
    if len(device_ids) > 1:
        self.model = DataParallel(model, device_ids=device_ids)

    # Set loss function and arguments.
    self.loss_fn: Callable = loss_fn
    self.loss_args: Dict[str, Any] = loss_args

    # Set all metric functions and associated arguments.
    self.metric_fns: List[Callable] = metric_fns
    self.metric_args: List[Dict[str, Any]] = metric_args

    # Set optimizer.
    self.optimizer: Optimizer = optimizer

    # Set training configuration.
    cfg_trainer: Dict[str, Any] = config["trainer"]
    self.epochs: int = cfg_trainer["epochs"]
    self.save_period: int = cfg_trainer["save_period"]
    self.monitor: str = cfg_trainer.get("monitor", "off")

    # Configuration to monitor model performance and save best.
    if self.monitor == "off":
        self.mnt_mode: str = "off"
        self.mnt_best: float = 0
    else:
        self.mnt_metric: str
        self.mnt_mode, self.mnt_metric = self.monitor.split()
        assert self.mnt_mode in ["min", "max"]

        self.mnt_best = inf if self.mnt_mode == "min" else -inf
        self.early_stop: float = cfg_trainer.get("early_stop", inf)

    self.start_epoch: int = 1
    self.checkpoint_dir: Path = config.save_dir

    # Setup visualization writer instance.
    self.writer = TensorboardWriter(config.log_dir, self.logger, cfg_trainer["tensorboard"])

    if config.resume is not None:
        self._resume_checkpoint(config.resume)
    vocab_optimal, T_opt = get_optimal_ordering(config, args_outer)
else:
    with open(args_outer.optimal_taxo_path, "rb") as f:
        T_opt = pickle.load(f)
    vocab_optimal = list(nx.topological_sort(T_opt))

if args_outer.model:
    vocab_model, T_model = get_insertion_ordering(config, args_outer)
else:
    with open(args_outer.model_taxo_path, "rb") as f:
        T_model = pickle.load(f)
    vocab_model = list(nx.topological_sort(T_model))

if args_outer.direct_eval:
    logger = config.get_logger('test')
    logger.info(edge_metrics(T_opt, T_model))
    logger.info(ancestor_metrics(T_opt, T_model))

if args_outer.insert:
    main_sequential(config, args_outer, vocab_model)

# reverse optimal
# rev_optimal = list(reversed(vocab_optimal))
# main_sequential(config, args_outer, rev_optimal)

# random order insertion
# vocab_random = [vocab_optimal[i] for i in np.random.permutation(len(vocab_optimal))]
# main_sequential(config, args_outer, vocab_random)
def main(config: ConfigParser):
    logger = config.get_logger('train')
    logger.info(config.config)

    # setup data_loader instances
    data_loader1 = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    data_loader2 = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size2'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])

    valid_data_loader = data_loader1.split_validation()

    test_data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=128,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2).split_validation()

    # build model architecture
    model1 = config.initialize('arch1', module_arch)
    model_ema1 = config.initialize('arch1', module_arch)
    model_ema1_copy = config.initialize('arch1', module_arch)
    model2 = config.initialize('arch2', module_arch)
    model_ema2 = config.initialize('arch2', module_arch)
    model_ema2_copy = config.initialize('arch2', module_arch)

    # get function handles of loss and metrics
    device_id = list(range(min(torch.cuda.device_count(), config['n_gpu'])))

    if hasattr(data_loader1.dataset, 'num_raw_example') and hasattr(
            data_loader2.dataset, 'num_raw_example'):
        num_examp1 = data_loader1.dataset.num_raw_example
        num_examp2 = data_loader2.dataset.num_raw_example
    else:
        num_examp1 = len(data_loader1.dataset)
        num_examp2 = len(data_loader2.dataset)

    train_loss1 = getattr(module_loss, config['train_loss']['type'])(
        num_examp=num_examp1,
        num_classes=config['num_classes'],
        device='cuda:' + str(device_id[0]),
        config=config.config,
        beta=config['train_loss']['args']['beta'])
    train_loss2 = getattr(module_loss, config['train_loss']['type'])(
        num_examp=num_examp2,
        num_classes=config['num_classes'],
        device='cuda:' + str(device_id[-1]),
        config=config.config,
        beta=config['train_loss']['args']['beta'])
    val_loss = getattr(module_loss, config['val_loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer, learning rate scheduler. delete every line containing lr_scheduler to disable the scheduler
    trainable_params1 = filter(lambda p: p.requires_grad, model1.parameters())
    trainable_params2 = filter(lambda p: p.requires_grad, model2.parameters())

    optimizer1 = config.initialize('optimizer1', torch.optim, [{'params': trainable_params1}])
    optimizer2 = config.initialize('optimizer2', torch.optim, [{'params': trainable_params2}])

    lr_scheduler1 = config.initialize('lr_scheduler', torch.optim.lr_scheduler, optimizer1)
    lr_scheduler2 = config.initialize('lr_scheduler', torch.optim.lr_scheduler, optimizer2)

    trainer = Trainer(model1, model2, model_ema1, model_ema2,
                      train_loss1, train_loss2,
                      metrics,
                      optimizer1, optimizer2,
                      config=config,
                      data_loader1=data_loader1,
                      data_loader2=data_loader2,
                      valid_data_loader=valid_data_loader,
                      test_data_loader=test_data_loader,
                      lr_scheduler1=lr_scheduler1,
                      lr_scheduler2=lr_scheduler2,
                      val_criterion=val_loss,
                      model_ema1_copy=model_ema1_copy,
                      model_ema2_copy=model_ema2_copy)

    trainer.train()
    logger = config.get_logger('trainer', config['trainer']['verbosity'])
    cfg_trainer = config['trainer']
def main(config: ConfigParser):
    logger = config.get_logger('train')

    data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=config['data_loader']['args']['batch_size'],
        shuffle=config['data_loader']['args']['shuffle'],
        validation_split=config['data_loader']['args']['validation_split'],
        num_batches=config['data_loader']['args']['num_batches'],
        training=True,
        num_workers=config['data_loader']['args']['num_workers'],
        pin_memory=config['data_loader']['args']['pin_memory'])
    valid_data_loader = data_loader.split_validation()

    test_data_loader = getattr(module_data, config['data_loader']['type'])(
        config['data_loader']['args']['data_dir'],
        batch_size=128,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2).split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', module_arch)

    train_loss = getattr(module_loss, config['train_loss'])
    val_loss = getattr(module_loss, config['val_loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]
    logger.info(str(model).split('\n')[-1])

    # build optimizer, learning rate scheduler. delete every line containing lr_scheduler to disable the scheduler
    trainable_params = [{
        'params': [
            p for p in model.parameters()
            if (not getattr(p, 'bin_gate', False))
            and (not getattr(p, 'bin_theta', False))
            and (not getattr(p, 'srelu_bias', False))
            and getattr(p, 'requires_grad', False)
        ]
    }, {
        'params': [
            p for p in model.parameters()
            if getattr(p, 'bin_gate', False) and getattr(p, 'requires_grad', False)
        ],
        'lr': config['optimizer']['args']['lr'] * 10,
        'weight_decay': 0
    }, {
        'params': [
            p for p in model.parameters()
            if getattr(p, 'srelu_bias', False) and getattr(p, 'requires_grad', False)
        ],
        'weight_decay': 0
    }, {
        'params': [
            p for p in model.parameters()
            if getattr(p, 'bin_theta', False) and getattr(p, 'requires_grad', False)
        ],
        'lr': config['optimizer']['args']['lr'],
        'weight_decay': 0
    }]

    optimizer = config.initialize('optimizer', torch.optim, trainable_params)
    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    trainer = Trainer(model, train_loss, metrics, optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      test_data_loader=test_data_loader,
                      lr_scheduler=lr_scheduler,
                      val_criterion=val_loss)

    trainer.train()
    logger = config.get_logger('trainer', config['trainer']['verbosity'])
    cfg_trainer = config['trainer']
def main(cfg_dict: DictConfig):
    generate = False
    load_gen = True
    save = True
    # remove_eigs = True
    remove_eigs = False

    config = ConfigParser(cfg_dict)
    T_rec, T_pred = config['n_timesteps'], config['seq_length'] - config['n_timesteps']
    logger = config.get_logger('test')

    gt = True
    # gt = True
    model_name = 'ddpae-iccv'
    # model_name = 'DRNET'
    # model_name = 'scalor'
    # model_name = 'sqair'

    s_directory = os.path.join(config['data_loader']['args']['data_dir'], 'test_data')
    res_directory = os.path.join(config['data_loader']['args']['data_dir'], 'res_data')
    load_gen_directory = os.path.join(config['data_loader']['args']['data_dir'], 'results')
    # # TODO: Testing features
    # load_gen_directory = os.path.join(config['data_loader']['args']['data_dir'], 'test_data')

    if not os.path.exists(s_directory):
        os.makedirs(s_directory)
    if not os.path.exists(res_directory):
        os.makedirs(res_directory)

    dataset_dir = os.path.join(
        s_directory,
        config['data_loader']['args']['dataset_case']
        + '_Len-' + str(config['seq_length'])
        + '_Nts-' + str(config['n_timesteps']) + '.npy')
    results_dir = os.path.join(
        res_directory,
        config['data_loader']['args']['dataset_case']
        + '_Len-' + str(config['seq_length'])
        + '_Nts-' + str(config['n_timesteps']) + '.npz')

    all_data = []
    if not os.path.exists(dataset_dir) and generate:
        config['data_loader']['args']['shuffle'] = False
        config['data_loader']['args']['training'] = False
        config['data_loader']['args']['validation_split'] = 0.0
        data_loader = config.init_obj('data_loader', module_data)
        for i, data in enumerate(tqdm(data_loader)):
            all_data.append(data)
        all_data = torch.cat(all_data, dim=0).numpy()
        print(all_data.shape)
        np.save(dataset_dir, all_data)
        print(config['data_loader']['args']['dataset_case'] + ' data generated in: ' + s_directory)
        exit()

    if os.path.exists(dataset_dir):
        print('LOADING EXISTING DATA FROM: ' + dataset_dir)
        inps = torch.from_numpy(np.load(dataset_dir))
        if os.path.exists(load_gen_directory) and load_gen:
            if model_name == 'ddpae-iccv':
                outs = torch.from_numpy(
                    np.load(os.path.join(
                        load_gen_directory,
                        model_name + '--' + config['data_loader']['args']['dataset_case']
                        + '_Len-' + str(config['seq_length'])
                        + '_Nts-' + str(config['n_timesteps']) + '.npy')))
            else:
                with np.load(os.path.join(
                        load_gen_directory,
                        model_name + '_' + config['data_loader']['args']['dataset_case'] + '.npz')) as outputs:
                    if model_name == 'scalor':
                        outs = torch.from_numpy(outputs["pred"]).permute(0, 1, 3, 2).unsqueeze(2)
                    elif model_name == 'DRNET':
                        outs = torch.from_numpy(outputs["pred"]).unsqueeze(2).float()
                    else:
                        outs = torch.from_numpy(outputs["pred"]).unsqueeze(2)
            print('Inps and Outs shapes', inps.shape, outs.shape)
            loaded_dataset = TensorDataset(inps, outs)
        else:
            loaded_dataset = TensorDataset(inps)
        data_loader = DataLoader(loaded_dataset, batch_size=40, shuffle=False,
                                 sampler=None, batch_sampler=None, num_workers=2,
                                 collate_fn=None, pin_memory=False)
    else:
        print('te has liao si te metes aqui')  # "You've messed up if you end up here."
        exit()
        config['data_loader']['args']['shuffle'] = False
        config['data_loader']['args']['training'] = False
        config['data_loader']['args']['validation_split'] = 0.0
        data_loader = config.init_obj('data_loader', module_data)

    # build model architecture
    if not load_gen:
        model = config.init_obj('arch', module_arch)
        # logger.info(model)

    # get function handles of loss and metrics
    loss_fn = getattr(module_loss, config['loss'])
    metric_fns = [getattr(module_metric, met) for met in ["mse", "mae", "bce", "mssim", "mlpips"]]

    if not load_gen:
        logger.info('Loading checkpoint: {} ...'.format(config['resume']))
        checkpoint = torch.load(config['resume'])
        state_dict = checkpoint['state_dict']
        if config['n_gpu'] > 1:
            model = torch.nn.DataParallel(model)
        model.load_state_dict(state_dict)

        # prepare model for testing
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)
        model.eval()

        if remove_eigs:
            A_modified, indices, e = remove_eig_under_t(
                model.koopman.dynamics.dynamics.weight.data, t=0.7)
            A_modified = torch.from_numpy(A_modified.real).to(device)
            model.koopman.dynamics.dynamics.weight.data = A_modified

    total_loss = 0.0
    total_metrics = [torch.zeros(len(metric_fns)), torch.zeros(len(metric_fns))]

    # TODO: Here we can change the model's K and crop the eigenvalues under a certain modulus threshold.
    # If the new prediction is longer, evaluate only the new part:
    # T_pred = 8
    all_pred, all_rec = [], []

    with torch.no_grad():
        for i, data in enumerate(tqdm(data_loader)):
            if isinstance(data, list) and len(data) == 2:
                target = data[0]
                output = data[1]
                batch_size = target.shape[0]
                # total_loss += loss.item() * batch_size
                pred = output[:, -T_pred:], target[:, -T_pred:]
                rec = output[:, :T_rec], target[:, :T_rec]
                assert T_rec + T_pred == target.shape[1]
                assert target.shape == output.shape
            else:
                if isinstance(data, list) and len(data) == 1:
                    data = data[0]
                # if config["data_loader"]["type"] == "MovingMNISTLoader":
                #     data = overlap_objects_from_batch(data, config['n_objects'])
                target = data  # Is data a variable?
                data, target = data.to(device), target.to(device)
                output = model(data, epoch_iter=[-1], test=True)

                # computing loss, metrics on test set
                # loss, loss_particles = loss_fn(output, target,
                #                                epoch_iter=[-1],
                #                                case=config["data_loader"]["args"]["dataset_case"])
                batch_size = data.shape[0]
                # total_loss += loss.item() * batch_size
                pred = output["pred_roll"][:, -T_pred:], target[:, -T_pred:]  # * 0.85
                rec = output["rec_ori"][:, :T_rec] * 0.85, target[:, :T_rec]
                assert T_rec + T_pred == target.shape[1]

            if config['data_loader']['args']['dataset_case'] == 'circles_crop':
                rec_cr, pred_cr = [crop_top_left_keepdim(vid[0], 35) for vid in [rec, pred]]
                rec, pred = (rec_cr, target[:, :T_rec]), (pred_cr, target[:, -T_pred:])

            # Save image sample
            if i == 0:
                if gt:
                    idx_gt = 1
                else:
                    idx_gt = 0
                # 11 fails to reconstruct.
                idx = 21
                # print(rec.shape, pred.shape)
                # print_u = output["u"].reshape(40, 2, -1, 4)[idx, :, -torch.cat(pred, dim=-2).shape[1]:]\
                #     .cpu()
                # print_u = print_u.abs() * 255
                # print_im = torch.cat(pred, dim=-2).permute(0, 2, 3, 1, 4)[idx, 0, :, :]
                print_im = pred[idx_gt].permute(0, 2, 3, 1, 4)[idx, 0]
                np.save("/home/acomasma/ool-dynamics/dk/image_sample.npy", print_im.cpu().numpy())
                image = im.fromarray(print_im.reshape(print_im.shape[-3], -1).cpu().numpy() * 255)
                image = image.convert('RGB')
                image.save("/home/acomasma/ool-dynamics/dk/image_sample.png")

                # u_plot_o1 = im.fromarray(plot_matrix(print_u[0]).permute(1, 0).numpy()).convert('RGB')
                # u_plot_o1.save("/home/acomasma/ool-dynamics/dk/input_sample_o1.png")
                #
                # u_plot_o2 = im.fromarray(plot_matrix(print_u[1]).permute(1, 0).numpy()).convert('RGB')
                # u_plot_o2.save("/home/acomasma/ool-dynamics/dk/input_sample_o2.png")
                # exit()

                image = im.fromarray(rec[idx_gt].permute(0, 2, 3, 1, 4)[idx, 0].reshape(64, -1).cpu().numpy() * 255)
                image = image.convert('RGB')
                image.save("/home/acomasma/ool-dynamics/dk/image_sample_rec.png")
                exit()

            all_pred.append(pred[0])
            all_rec.append(rec[0])

            for j, (out, tar) in enumerate([rec, pred]):
                for i, metric in enumerate(metric_fns):
                    # TODO: dataset case in metrics
                    total_metrics[j][i] += metric(out, tar) * batch_size

    n_samples = len(data_loader.sampler)
    print('n_samples', n_samples)
    # log = {'loss': total_loss / n_samples}
    log = {}
    print('Timesteps Rec and pred: ', T_rec, T_pred)
    for j, name in enumerate(['rec', 'pred']):
        log.update({
            met.__name__: total_metrics[j][i].item() / n_samples
            for i, met in enumerate(metric_fns)
        })
        print(name)
    logger.info(log)
def pred(self, paths, metas, m_cfg, id):
    print('pred')
    self.cfg = m_cfg
    res = Response()
    if len(paths) != len(metas):
        res.code = -2
        res.msg = "The length of images and meta is not same."
        return res
    # if self.pred_th is not None:
    #     if self.pred_th.is_alive():
    #         res.code = -3
    #         res.msg = "There is a task running, please wait it finish."
    #         return res
    try:
        m_typename = m_cfg["name"].split("-")[1]
        if m_typename == "Deeplab" or m_typename == "UNet":
            from .predthread import SegPredThread
            self.device = torch.device('cuda:0' if self.n_gpu_use > 0 else 'cpu')
            torch.set_grad_enabled(False)
            m_cfg["save_dir"] = str(self.tmp_path)
            config = ConfigParser(m_cfg, Path(m_cfg["path"]))
            self.logger = config.get_logger('PredServer')
            self.model = config.init_obj('arch', module_arch)
            self.logger.info('Loading checkpoint: {} ...'.format(config.resume))
            if self.n_gpu_use > 1:
                self.model = torch.nn.DataParallel(self.model)
            if self.n_gpu_use > 0:
                checkpoint = torch.load(config.resume)
            else:
                checkpoint = torch.load(config.resume, map_location=torch.device('cpu'))
            state_dict = checkpoint['state_dict']
            self.model.load_state_dict(state_dict)
            self.model = self.model.to(self.device)
            self.model.eval()
            if "crop_size" in config["tester"]:
                self.crop_size = config["tester"]["crop_size"]
            if 'postprocessor' in config["tester"]:
                module_name = config["tester"]['postprocessor']['type']
                module_args = dict(config["tester"]['postprocessor']['args'])
                self.postprocessor = getattr(postps_crf, module_name)(**module_args)
            self.tmp_path.mkdir(parents=True, exist_ok=True)
            self.pred_ths.append(SegPredThread(self, paths, metas, self.tmp_path, id))
        elif m_typename == "CycleGAN":
            from .predthread import CycleGANPredThread
            from model import CycleGANOptions, CycleGANModel
            # config = ConfigParser(m_cfg, Path(m_cfg["path"]))
            opt = CycleGANOptions(**m_cfg["arch"]["args"])
            opt.batch_size = self.batch_size
            opt.serial_batches = True
            opt.no_flip = True    # no flip;
            opt.display_id = -1   # no visdom display; the test code saves the results to a HTML file.
            opt.isTrain = False
            opt.gpu_ids = []
            for i in range(0, self.n_gpu_use):
                opt.gpu_ids.append(i)
            opt.checkpoints_dir = str(self.tmp_path)
            opt.preprocess = "none"
            opt.direction = 'AtoB'
            self.model = CycleGANModel(opt)
            orig_save_dir = self.model.save_dir
            self.model.save_dir = ""
            self.model.load_networks(m_cfg["path"])
            self.model.save_dir = orig_save_dir
            torch.set_grad_enabled(False)
            self.model.set_requires_grad([self.model.netG_A, self.model.netG_B], False)
            self.pred_ths.append(CycleGANPredThread(self, paths, metas, self.tmp_path, id))
        else:
            raise NotImplementedError("Model type:", m_typename, "is not supported.")
        print('NotifyStartThread')
        self.pred_ths[-1].start()
        # self.pred_th.is_alive()
    except Exception as e:
        res.code = -1
        res.msg = str(e)
        return res
    res.code = 0
    res.msg = "Success"
    return res
def entry_point(config: ConfigParser):
    '''
    entry-point function for a single worker, distributed training
    a single worker contains (torch.cuda.device_count() / local_world_size) gpus
    '''
    local_world_size = config['local_world_size']

    # check distributed environment cfgs
    if config['distributed']:  # distributed gpu mode
        # check gpu available
        if torch.cuda.is_available():
            if torch.cuda.device_count() < local_world_size:
                raise RuntimeError(
                    f'the number of GPU ({torch.cuda.device_count()}) is less than '
                    f'the number of processes ({local_world_size}) running on each node'
                )
            local_master = (config['local_rank'] == 0)
        else:
            raise RuntimeError(
                'CUDA is not available, Distributed training is not supported.'
            )
    else:  # one gpu or cpu mode
        if config['local_world_size'] != 1:
            raise RuntimeError(
                'local_world_size must be set to 1, if distributed is set to false.'
            )
        config.update_config('local_rank', 0)
        local_master = True
        config.update_config('global_rank', 0)

    logger = config.get_logger('train') if local_master else None
    if config['distributed']:
        logger.info('Distributed GPU training mode start...') if local_master else None
    else:
        logger.info('One GPU or CPU training mode start...') if local_master else None

    # else:
    #     sys.stdin.close()

    # cfg CUDNN whether deterministic
    if config['deterministic']:
        fix_random_seed_for_reproduce(config['seed'])
        logger.warning(
            'You have chosen deterministic training. '
            'This will fix the random seed, turn on the CUDNN deterministic setting and turn off the CUDNN benchmark, '
            'which can slow down your training considerably!'
        ) if local_master else None
    else:
        torch.backends.cudnn.benchmark = True
        logger.warning(
            'You have chosen to benchmark training. '
            'This will turn on the CUDNN benchmark setting, '
            'which can speed up your training considerably! '
            'You may see unexpected behavior when restarting '
            'from checkpoints because RandomizedMultiLinearMap needs deterministic turned on.'
        ) if local_master else None

    if config['distributed']:
        # init process group
        dist.init_process_group(backend='nccl', init_method='env://')
        config.update_config('global_rank', dist.get_rank())
        # log distributed training cfg
        logger.info(
            f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
            + f'rank = {dist.get_rank()}, backend={dist.get_backend()}'
        ) if local_master else None

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group
    dist.destroy_process_group()
def main(config: ConfigParser):
    access_token = ''
    with open('./pytorch_line_token') as f:
        access_token = str(f.readline())
    bot = LINENotifyBot(access_token=access_token)

    logger = config.get_logger('train')

    # setup data_loader instances
    data_loader = config.initialize('data_loader', module_data)
    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    model = config.initialize('arch', module_arch)
    logger.info(model)

    # get function handles of loss and metrics
    loss = getattr(module_loss, config['loss'])
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer, learning rate scheduler. delete every line containing lr_scheduler to disable the scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = config.initialize('optimizer', torch.optim, trainable_params)
    lr_scheduler = config.initialize('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    trainer = Trainer(model, loss, metrics, optimizer,
                      config=config,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()
    logger = config.get_logger('trainer', config['trainer']['verbosity'])
    cfg_trainer = config['trainer']

    # mlflow.start_run() would need to support __enter__() here; shelved for now.
    # mlflow = MLFlow(config.log_dir, logger, cfg_trainer['mlflow'])
    with mlflow.start_run() as run:
        # Log args into mlflow
        log_params(config.config)

        # Log results into mlflow
        for loss in trainer.train_loss_list:
            mlflow.log_metric('train_loss', loss)
        for loss in trainer.val_loss_list:
            mlflow.log_metric('val_loss', loss)

        # Log other info
        # mlflow.log_param('loss_type', 'CrossEntropy')

        # Log model
        mlflow.pytorch.log_model(model, 'model')

    # Notify via LINE: "Training of {name} has finished. @{hostname}"
    bot.send(message=f'{config["name"]}の訓練が終了しました。@{socket.gethostname()}')
    CustomArgs(['--attn_drop'], type=float, target=('arch', 'args', 'attn_drop')),
    CustomArgs(['--hidden_drop'], type=float, target=('arch', 'args', 'hidden_drop')),
    CustomArgs(['--out_drop'], type=float, target=('arch', 'args', 'out_drop')),
]
config = ConfigParser(args, options)
args = args.parse_args()

n_trials = args.n_trials
if n_trials > 0:
    config.get_logger('train').info(f'number of trials: {n_trials}')
    metrics = config['metrics']
    save_file = config.log_dir / 'evaluations.txt'
    fin = open(save_file, 'w')
    fin.write('\t'.join(metrics))

    evaluations = []
    for i in range(n_trials):
        config.set_save_dir(i + 1)
        res = main(config)
        evaluations.append(res)
        fin.write('\t'.join([f'{i:.3f}' for i in res]))

    evaluations = np.array(evaluations)
    means = evaluations.mean(axis=0)
    stds = evaluations.std(axis=0)
def main(config: ConfigParser) -> None:
    """
    Main testing function.

    Parameters
    ----------
    config : parse_config.ConfigParser
        Parsed configuration JSON file.
    """
    logger: Logger = config.get_logger("test")

    # Setup data_loader instance.
    data_loader: DataLoader = getattr(module_data, config["data_loader"]["type"])(
        config["data_loader"]["args"]["data_dir"],
        batch_size=512,
        shuffle=False,
        validation_split=0.0,
        training=False,
        num_workers=2,
    )

    # Build model architecture.
    model: Module = config.initialize("arch", module_arch)
    logger.info(model)

    # Get function handles of loss and metrics as well as args.
    loss_fn: Callable = getattr(module_loss, config["loss"]["type"])
    loss_args: Dict[str, Any] = config["loss"]["args"]
    metric_fns: List[Callable] = [getattr(module_metric, met) for met in config["metrics"]]
    metric_args: List[Dict[str, Any]] = [config["metrics"][met] for met in config["metrics"]]

    logger.info("Loading checkpoint: {} ...".format(config.resume))
    checkpoint: dict = torch.load(config.resume)
    state_dict: dict = checkpoint["state_dict"]
    if config["n_gpu"] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    # Prepare model for testing.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    total_loss: float = 0.0
    total_metrics: Tensor = torch.zeros(len(metric_fns))

    with torch.no_grad():
        i: int
        data: Tensor
        target: Tensor
        for i, (data, target) in enumerate(tqdm(data_loader)):
            data, target = data.to(device), target.to(device)
            output: Tensor = model(data)

            #
            # save sample images, or do something with output here
            #

            # computing loss, metrics on test set
            loss: Tensor = loss_fn(output, target, **loss_args)
            batch_size: int = data.shape[0]
            total_loss += loss.item() * batch_size
            j: int
            metric: Callable
            for j, metric in enumerate(metric_fns):
                total_metrics[j] += metric(output, target, **metric_args[j]) * batch_size

    n_samples: int = len(data_loader.sampler)
    log: Dict[str, Any] = {"loss": total_loss / n_samples}
    met: Callable
    log.update(
        {met.__name__: total_metrics[i].item() / n_samples for i, met in enumerate(metric_fns)}
    )
    logger.info(log)