    def __init__(self, config):
        self.config = config
        self.time_stamp = self.config['checkpoint'].get(
            'time_stamp',
            datetime.datetime.now().strftime('%m%d_%H-%M'))
        """device parameters"""
        self.world_size = self.config['world_size']
        self.rank = self.config['rank']
        self.gpu = self.config['local_rank']
        self.distributed = self.config['distributed']
        """get the train parameters!"""
        self.total_epochs = self.config['optimizer']['total_epochs']
        self.warmup_epochs = self.config['optimizer']['warmup_epochs']

        self.train_batch_size = self.config['data']['train_batch_size']
        self.val_batch_size = self.config['data']['val_batch_size']
        self.global_batch_size = self.world_size * self.train_batch_size

        self.num_examples = self.config['data']['num_examples']
        self.warmup_steps = self.warmup_epochs * self.num_examples // self.global_batch_size
        self.total_steps = self.total_epochs * self.num_examples // self.global_batch_size

        # linear LR scaling rule: base_lr is defined per 256 samples and scaled by the global batch size
        base_lr = self.config['optimizer']['base_lr'] / 256
        self.max_lr = base_lr * self.global_batch_size

        # base EMA momentum for the BYOL target network (typically ramped toward 1.0 over training)
        self.base_mm = self.config['model']['base_momentum']
        """construct the whole network"""
        self.resume_path = self.config['checkpoint']['resume_path']
        if torch.cuda.is_available():
            self.device = torch.device(f'cuda:{self.gpu}')
            torch.cuda.set_device(self.device)
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')
        self.construct_model()
        """save oss path"""
        self.save_epoch = self.config['checkpoint']['save_epoch']
        self.ckpt_prefix = self.config['checkpoint']['ckpt_prefix'].format(
            self.time_stamp, self.config['model']['backbone']['type'], {})
        ckpt_endpoint = self.config['checkpoint']['ckpt_endpoint']
        ckpt_key_id = self.config['checkpoint']['ckpt_key_id']
        ckpt_secret_id = self.config['checkpoint']['ckpt_secret_id']
        ckpt_bucket = self.config['checkpoint']['ckpt_bucket']

        auth = oss.Auth(ckpt_key_id, ckpt_secret_id)
        try:
            self.ckpt_bucket = oss.Bucket(auth,
                                          ckpt_endpoint,
                                          ckpt_bucket,
                                          connect_timeout=300)
        except Exception as exc:
            raise ValueError("OSS server is unreachable!") from exc
        """log tools in the running phase"""
        self.log_step = self.config['log']['log_step']
        self.logger = eval_util.LogCollector()
        self.logging = logging_util.get_std_logging()
        self.steps = 0

        if self.rank == 0:
            self.setup_oss_log_files()
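
# For reference: a minimal config sketch covering the keys this __init__ reads.
# All values below are illustrative placeholders, not values from the original project.
config = {
    'world_size': 1, 'rank': 0, 'local_rank': 0, 'distributed': False,
    'optimizer': {'total_epochs': 300, 'warmup_epochs': 10, 'base_lr': 0.2},
    'data': {'train_batch_size': 256, 'val_batch_size': 256, 'num_examples': 1281167},
    'model': {'base_momentum': 0.996, 'backbone': {'type': 'resnet50'}},
    'checkpoint': {'resume_path': '', 'save_epoch': 10,
                   'ckpt_prefix': 'byol_{}_{}_ep{}.pth',  # filled with time_stamp, backbone type, '{}'
                   'ckpt_endpoint': '<oss-endpoint>', 'ckpt_key_id': '<access-key-id>',
                   'ckpt_secret_id': '<access-key-secret>', 'ckpt_bucket': '<bucket-name>'},
    'log': {'log_step': 10},
}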
Example #2
def run_task(config):
    logging = logging_util.get_std_logging()
    if config['distributed']:
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])
        local_rank = int(os.environ.get('LOCAL_RANK', '0'))
        config.update({
            'world_size': world_size,
            'rank': rank,
            'local_rank': local_rank
        })

        dist.init_process_group(backend="nccl",
                                world_size=world_size,
                                rank=rank)
        logging.info(
            f'world_size {world_size}, gpu {local_rank}, rank {rank} init done.'
        )
    else:
        config.update({'world_size': 1, 'rank': 0, 'local_rank': 0})

    trainer = BYOLTrainer(config)
    trainer.resume_model()
    start_epoch = trainer.start_epoch

    for epoch in range(start_epoch + 1, trainer.total_epochs + 1):
        trainer.train_epoch(epoch, printer=logging.info)
        trainer.save_checkpoint(epoch)
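
# Minimal launch sketch, assuming the config comes from a YAML file (the file name
# and loader are assumptions; run_task only needs a dict with the keys used above).
# Under torchrun, WORLD_SIZE/RANK/LOCAL_RANK are exported for every worker process.
import os
import yaml

with open('byol_config.yaml') as f:  # hypothetical config file
    config = yaml.safe_load(f)
config['distributed'] = int(os.environ.get('WORLD_SIZE', '1')) > 1
run_task(config)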
Example #3
    def __init__(self, config):
        self.config = config
        self.time_stamp = self.config['checkpoint'].get('time_stamp',
            datetime.datetime.now().strftime('%m%d_%H-%M'))

        """device parameters"""
        self.world_size = self.config['world_size']
        self.rank = self.config['rank']
        self.gpu = self.config['local_rank']
        self.distributed = self.config['distributed']

        """get the train parameters!"""
        self.total_epochs = self.config['optimizer']['total_epochs']
        self.warmup_epochs = self.config['optimizer']['warmup_epochs']

        self.train_batch_size = self.config['data']['train_batch_size']
        self.val_batch_size = self.config['data']['val_batch_size']
        self.global_batch_size = self.world_size * self.train_batch_size

        self.num_examples = self.config['data']['num_examples']
        self.warmup_steps = self.warmup_epochs * self.num_examples // self.global_batch_size
        self.total_steps = self.total_epochs * self.num_examples // self.global_batch_size

        base_lr = self.config['optimizer']['base_lr'] / 256
        self.max_lr = base_lr * self.global_batch_size

        self.base_mm = self.config['model']['base_momentum']

        """construct the whole network"""
        self.resume_path = self.config['checkpoint']['resume_path']
        if torch.cuda.is_available():
            self.device = torch.device(f'cuda:{self.gpu}')
            torch.cuda.set_device(self.device)
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')
        self.construct_model()

        """save checkpoint path"""
        self.save_epoch = self.config['checkpoint']['save_epoch']
        self.ckpt_path = self.config['checkpoint']['ckpt_path'].format(
            self.time_stamp, self.config['model']['backbone']['type'], {})

        """log tools in the running phase"""
        self.steps = 0
        self.log_step = self.config['log']['log_step']
        self.logging = logging_util.get_std_logging()
        if self.rank == 0:
            self.writer = SummaryWriter(self.config['log']['log_dir'])
def run_task(config):
    logger = get_std_logging(
        os.path.join(config.path, "{}.log".format(config.name)))
    config.logger = logger

    if config.dist:
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])
        local_rank = int(os.environ.get('LOCAL_RANK', '0'))

        config.world_size = world_size
        config.rank = rank
        config.local_rank = local_rank

        if config.local_rank == 0:
            config.print_params(logger.info)

        dist.init_process_group(backend='nccl',
                                world_size=world_size,
                                rank=rank)
        logger.info(
            f'world_size {world_size}, gpu {local_rank}, rank {rank} init done.'
        )
    else:
        config.world_size, config.rank, config.local_rank = 1, 0, 0
        config.print_params(logger.info)

    trainer = AugmentCellTrainer(config)
    trainer.resume_model()
    start_epoch = trainer.start_epoch

    best_top1 = 0
    for epoch in range(start_epoch, trainer.total_epochs):
        drop_prob = config.drop_path_prob * epoch / trainer.total_epochs
        trainer.model.module.drop_path_prob(drop_prob)
        trainer.train_epoch(epoch, printer=logger.info)
        top1 = trainer.val_epoch(epoch, printer=logger.info)
        trainer.lr_scheduler.step()

        if best_top1 < top1:
            best_top1, is_best = top1, True
        else:
            is_best = False
        trainer.save_checkpoint(epoch, is_best)
        if config.local_rank == 0:
            logger.info("Until now, best Prec@1 = {:.4%}".format(best_top1))
    if config.local_rank == 0:
        logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def run_task(config):
    logger = get_std_logging(
        os.path.join(config.path, "{}.log".format(config.name)))
    config.logger = logger

    config.print_params(logger.info)

    trainer = SearchDistributionTrainer(config)
    trainer.resume_model()
    start_epoch = trainer.start_epoch

    best_top1 = 0.
    for epoch in range(start_epoch, trainer.total_epochs):
        trainer.train_epoch(epoch, printer=logger.info)
        top1 = trainer.val_epoch(epoch, printer=logger.info)
        trainer.lr_scheduler.step()

        # plot macro architecture
        macro_arch = trainer.model.DAG()
        logger.info("DAG = {}".format(macro_arch))

        plot_path = os.path.join(config.DAG_path, "EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        plot2(macro_arch.DAG1, plot_path + '-DAG1', caption,
              macro_arch.DAG1_concat)
        plot2(macro_arch.DAG2, plot_path + '-DAG2', caption,
              macro_arch.DAG2_concat)
        plot2(macro_arch.DAG3, plot_path + '-DAG3', caption,
              macro_arch.DAG3_concat)

        if best_top1 < top1:
            best_top1, is_best = top1, True
            best_macro = macro_arch
        else:
            is_best = False
        logger.info("Until now, best Prec@1 = {:.4%}".format(best_top1))

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Final Best Genotype = {}".format(best_macro))
def run_task(config):
    logger = get_std_logging(
        os.path.join(config.path, "{}.log".format(config.name)))
    config.logger = logger

    config.print_params(logger.info)
    trainer = SearchCellTrainer(config)
    trainer.resume_model()
    start_epoch = trainer.start_epoch

    best_top1 = 0.
    for epoch in range(start_epoch, trainer.total_epochs):
        trainer.train_epoch(epoch, printer=logger.info)
        top1 = trainer.val_epoch(epoch, printer=logger.info)
        trainer.lr_scheduler.step()

        genotype = trainer.model.genotype()
        logger.info("genotype = {}".format(genotype))
        plot_path = os.path.join(config.plot_path,
                                 "EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        plot(genotype.normal1, plot_path + "-normal1", caption)
        plot(genotype.reduce1, plot_path + "-reduce1", caption)
        plot(genotype.normal2, plot_path + "-normal2", caption)
        plot(genotype.reduce2, plot_path + "-reduce2", caption)
        plot(genotype.normal3, plot_path + "-normal3", caption)
        if best_top1 < top1:
            best_top1, is_best = top1, True
            best_genotype = genotype
        else:
            is_best = False
        trainer.save_checkpoint(epoch, is_best=is_best)
        logger.info("Until now, best Prec@1 = {:.4%}".format(best_top1))

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Final Best Genotype = {}".format(best_genotype))
import os

import numpy as np
import torch

from utils.data_util import get_data
from tensorboardX import SummaryWriter
from config.augmentCell_config import AugmentCellConfig
from utils.logging_util import get_std_logging
from models.augment_cellcnn import AugmentCellCNN
from utils.eval_util import AverageMeter, accuracy

config = AugmentCellConfig()

device = torch.device("cuda")

# tensorboard
writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
writer.add_text('config', config.as_markdown(), 0)

logger = get_std_logging(
    os.path.join(config.path, "{}.log".format(config.name)))
config.print_params(logger.info)


def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True