    def train(self):
        """
        Full training logic
        """
        self.not_improved_count = 0
        self.improved_since_last_save = False
        for epoch in range(self.start_epoch, self.epochs + 1):
            print()
            self.data_loader.step(epoch)

            result = self.train_epoch(epoch)
            if self.do_validation:
                self.valid_data_loader.step(epoch)
                val_log = self.valid_epoch(epoch)
                result = {**result, **val_log}

            if self.lr_scheduler is not None:
                self.logger.info(
                    f"Learning rate: {self.lr_scheduler.get_lr()}")
                self.lr_scheduler.step(epoch=epoch)

            if get_global_rank() == 0:
                log = self._log_info(result, epoch)
                early_stop = self._check_early_stop(log, epoch)

                if early_stop:
                    break
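    # The early-stopping helper called above is not part of this snippet; the
    # sketch below is a plausible implementation, assuming the usual
    # mnt_mode / mnt_metric / mnt_best / early_stop attributes from the config.
    def _check_early_stop(self, log, epoch):
        if self.mnt_mode == 'off':
            return False
        current = log.get(self.mnt_metric)
        improved = ((self.mnt_mode == 'min' and current < self.mnt_best) or
                    (self.mnt_mode == 'max' and current > self.mnt_best))
        if improved:
            self.mnt_best = current
            self.not_improved_count = 0
            self.improved_since_last_save = True
        else:
            self.not_improved_count += 1
        return self.not_improved_count > self.early_stop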
    def step(self, epoch):
        super().step(epoch)
        self.enable_multithreading_if_possible()
        if not self.fixed_dataset:
            self.dataset.idx_offset = epoch * len(self.dataset)
            seed = epoch
            seed = seed * get_world_size() + get_global_rank()
            if self.valid_loader:
                seed = 2**32 - seed
                self.dataset.set_to_lognorm()
            else:
                self.dataset.set_to_default_sim()
            self.dataset.simulator.set_seed(seed)
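# Standalone illustration (not part of the original class) of the seeding
# scheme used in step() above: each (epoch, rank) pair yields a distinct seed,
# so replicas never simulate identical samples; validation counts down from
# 2**32 so its seeds stay disjoint from the training range.
def derive_seed(epoch, world_size, rank, validation=False):
    seed = epoch * world_size + rank
    if validation:
        seed = 2**32 - seed
    return seed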
    def __init__(self, args, options='', timestamp=True):
        # parse default and custom cli options
        for opt in options:
            args.add_argument(*opt.flags, default=None, type=opt.type)
        args = args.parse_args()

        self.resume = None
        self.cfg_fname = None
        if args.device:
            os.environ["CUDA_VISIBLE_DEVICES"] = args.device
        if args.resume:
            self.resume = Path(args.resume)
            self.cfg_fname = self.resume.parent / 'config.json'
        if args.config:
            self.cfg_fname = Path(args.config)
        msg_no_cfg = ("Configuration file need to be specified. "
                      "Add '-c config.json', for example.")
        assert self.cfg_fname is not None, msg_no_cfg

        # load config file and apply custom cli options
        config = read_json(self.cfg_fname)
        self.__config = _update_config(config, options, args)
        self.__raw = copy.deepcopy(self.__config)

        # set save_dir where trained model and log will be saved.
        save_dir = Path(
            parse_value(self.config['trainer']['extra_args']['save_dir']))
        timestamp = datetime.now().strftime(
            r'%m%d_%H%M%S') if timestamp else ''

        exper_name = self.config['name']
        self.__save_dir = save_dir / 'models' / exper_name / timestamp
        self.__log_dir = save_dir / 'log' / exper_name / timestamp

        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # save updated config file to the checkpoint dir
        if get_global_rank() == 0:
            write_json(self.config, self.save_dir / 'config.json')

        # configure logging module
        setup_logging(self.log_dir)
        self.log_levels = {
            0: logging.WARNING,
            1: logging.INFO,
            2: logging.DEBUG
        }
        logger = self.get_logger('config')
        logger.info(f"Experiment name: {exper_name}")
    def __init__(self, n_gpu):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.n_gpu = n_gpu

        self.logger.info("Initializing devices..")
        device, gpu_ids, n_gpu, n_processes = self.prepare_device()

        self.device = device
        self.gpu_ids = gpu_ids
        self.n_gpu = n_gpu
        self.n_processes = n_processes

        if get_global_rank() == 0:
            self.logger.info(
                f"Number of running processes: {self.n_processes}")
            self.logger.info(f"Number of usable GPUs: {self.n_gpu}")
    def __init__(self, indices, shuffle=True):
        # replica count and rank come straight from the distributed helpers,
        # so no explicit dist.is_available() check is needed here
        self.num_replicas = get_world_size()
        self.rank = get_global_rank()
        self.shuffle = shuffle
        self.indices = indices
        self.epoch = 0
        self.num_samples = int(
            math.ceil(len(self.indices) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
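    # The iteration side of this subset sampler is not shown above; the methods
    # below are an assumed completion, mirroring how torch's DistributedSampler
    # pads and strides the shuffled indices across replicas.
    def __iter__(self):
        if self.shuffle:
            g = torch.Generator()
            g.manual_seed(self.epoch)
            order = torch.randperm(len(self.indices), generator=g).tolist()
        else:
            order = list(range(len(self.indices)))
        # pad so every replica draws exactly num_samples indices
        order += order[:self.total_size - len(order)]
        # each rank takes a strided slice of the global ordering
        rank_order = order[self.rank:self.total_size:self.num_replicas]
        return iter([self.indices[i] for i in rank_order])

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch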
    def __init__(self, model, loss, metrics, optimizer, config, data_loader,
                 valid_data_loader, lr_scheduler, main_device):
        self._set_defaults(model, loss, metrics, optimizer, config,
                           data_loader, valid_data_loader, lr_scheduler,
                           main_device)
        cfg_trainer = config['trainer']['extra_args']

        self.checkpoint_dir = config.save_dir
        if get_global_rank() == 0:
            # setup visualization writer instance
            enable_board = cfg_trainer['tensorboardX']
        else:
            enable_board = False
        self.writer = WriterTensorboardX(config.log_dir, self.logger,
                                         enable_board)

        if config.resume is not None:
            self._resume_checkpoint(config.resume)
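    # _resume_checkpoint is referenced above but not defined in this snippet;
    # a minimal sketch, assuming the common checkpoint layout with 'epoch',
    # 'state_dict' and 'optimizer' keys (the real method may differ).
    def _resume_checkpoint(self, resume_path):
        self.logger.info(f"Loading checkpoint: {resume_path} ...")
        checkpoint = torch.load(str(resume_path), map_location='cpu')
        self.start_epoch = checkpoint['epoch'] + 1
        self.model.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.logger.info(f"Checkpoint loaded, resuming from epoch {self.start_epoch}")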