Example #1
def get_model():
    # Get the specified model defined in recycle_model.py.
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # Move the model's parameters to GPU memory.
    model.cuda()

    # Watch the model in wandb.
    wandb.watch(model)

    # Print the number of trainable model parameters.
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If there is more than one GPU, enable DataParallel training.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Get the criterion defined in loss.py.
    criterion = create_criterion(CFG.criterion)

    # Get the optimizer defined in optimizer.py.
    optimizer_encoder = create_optimizer(
        CFG.optimizer, params=model.seg_model.encoder.parameters(), lr=1e-8)

    optimizer_decoder = create_optimizer(
        CFG.optimizer,
        params=[{
            "params": model.seg_model.decoder.parameters()
        }, {
            "params": model.seg_model.segmentation_head.parameters()
        }],
        lr=1e-8)

    # Get the scheduler defined in scheduler.py.
    scheduler_encoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_encoder,
                                         T_0=30,
                                         T_mult=2,
                                         eta_max=CFG.learning_rate * 0.1,
                                         T_up=5,
                                         gamma=0.3)

    scheduler_decoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_decoder,
                                         T_0=30,
                                         T_mult=2,
                                         eta_max=CFG.learning_rate,
                                         T_up=5,
                                         gamma=0.3)

    return model, criterion, optimizer_encoder, optimizer_decoder, scheduler_encoder, scheduler_decoder
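
None of these snippets include the create_optimizer/create_scheduler helpers they import. The sketch below is a minimal, hypothetical version of the name-based factory that such an optimizer.py/scheduler.py pair plausibly wraps; the registry contents are assumptions, and the T_up/eta_max kwargs above suggest a user-defined CosineAnnealingWarmUpRestarts class rather than a stock torch scheduler.

import torch.optim as optim
import torch.optim.lr_scheduler as lr_sched

# Hypothetical registries: a real project would register its own classes here,
# e.g. a custom CosineAnnealingWarmUpRestarts accepting T_up/eta_max.
_OPTIMIZERS = {"Adam": optim.Adam, "AdamW": optim.AdamW, "SGD": optim.SGD}
_SCHEDULERS = {
    "StepLR": lr_sched.StepLR,
    "CosineAnnealingWarmRestarts": lr_sched.CosineAnnealingWarmRestarts,
}

def create_optimizer(name, params, **kwargs):
    # Look up the optimizer class by its config name and instantiate it.
    return _OPTIMIZERS[name](params, **kwargs)

def create_scheduler(name, optimizer, **kwargs):
    # Same dispatch pattern for schedulers.
    return _SCHEDULERS[name](optimizer, **kwargs)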
Example #2
    def __init__(self,
                 model,
                 model_params,
                 load_data=True,
                 debug=False,
                 batch_size=64):
        super().__init__(model,
                         model_params,
                         load_data=load_data,
                         debug=debug,
                         batch_size=batch_size,
                         name="TaskLanguageModeling")

        prior_dist_params = get_param_val(
            self.model_params,
            "prior_distribution",
            allow_default=False,
            error_location="TaskLanguageModeling - init")
        self.prior_distribution = create_prior_distribution(prior_dist_params)

        self.beta_scheduler = create_scheduler(self.model_params["beta"],
                                               "beta")

        self.summary_dict = {
            "log_prob": list(),
            "ldj": list(),
            "z": list(),
            "beta": 0
        }
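
Note that create_scheduler here takes a parameter dict and a name rather than an optimizer: it schedules the scalar beta used for KL annealing, not a learning rate. A minimal sketch of such a scalar scheduler, assuming a linear warm-up schedule (the class and its fields are hypothetical):

class LinearScalarScheduler:
    # Hypothetical scalar scheduler: anneals a value such as beta over training.
    def __init__(self, start_value=0.0, end_value=1.0, num_steps=10000):
        self.start_value = start_value
        self.end_value = end_value
        self.num_steps = num_steps

    def get(self, step):
        # Interpolate linearly, then hold the final value.
        frac = min(step / self.num_steps, 1.0)
        return self.start_value + frac * (self.end_value - self.start_value)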
Example #3
File: main.py Project: SKholkin/EGA
def main_worker(config: EGAConfig):
    ega_state = EGA_state()
    criterio = Criterio(config.weight_matrix)
    scheduler = create_scheduler(ega_state, config)
    start_pop = launch_init(config)
    population = [encode(x) for x in start_pop]
    ega_state.add(('gen_overlap', config.get('gen_overlap', 0.5)))

    for i in range(config.get('max_iter', 10000)):
        ega_state.add(('pop_amount', len(population)))

        population = evolution_cycle(population, config, criterio, ega_state)

        print(f'iter: {i}')
        ega_state.add(
            ('max_criterio', 1 / criterio(max(population, key=criterio))))
        ega_state.add(('mean_criterio',
                       sum([1 / criterio(vector)
                            for vector in population]) / len(population)))
        mean_criterio_averagemetr.update(ega_state.mean_criterio)
        tb_logger.add_scalar('max_criterio', ega_state.max_criterio, i)
        tb_logger.add_scalar('mean_criterio', mean_criterio_averagemetr.value,
                             i)
        tb_logger.add_scalar('gen_overlap', ega_state.gen_overlap, i)
        print(f'Max criterio: {ega_state.max_criterio}')
        print(f'Mean criterio: {ega_state.mean_criterio}')
        print(f'Gen overlap: {ega_state.gen_overlap}')

        if config.get('scheduler', {}).get('mean_or_max', 'max') == 'mean':
            scheduler.step(mean_criterio_averagemetr.value)
        else:
            scheduler.step(ega_state.max_criterio)

        if ega_state.gen_overlap < 0.01:
            return
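
scheduler.step(value) receiving a criterio value suggests a plateau-style scheduler that adapts a GA parameter when progress stalls. A hedged sketch of that idea follows; the EGA_state add/gen_overlap interface is inferred from the loop above, and the decay rule is an assumption, not the project's actual logic.

class PlateauOverlapScheduler:
    # Hypothetical plateau scheduler: shrinks gen_overlap when the metric stalls.
    def __init__(self, state, factor=0.5, patience=10):
        self.state = state
        self.factor = factor
        self.patience = patience
        self.best = float("-inf")
        self.bad_steps = 0

    def step(self, metric):
        if metric > self.best:
            self.best = metric
            self.bad_steps = 0
        else:
            self.bad_steps += 1
        if self.bad_steps >= self.patience:
            # Reduce the generation overlap once progress has plateaued.
            self.state.add(("gen_overlap", self.state.gen_overlap * self.factor))
            self.bad_steps = 0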
Example #4
def get_model():
    '''
        Get the model defined in recycle_model.py.

        Returns:
            model: PyTorch model to be trained
            optimizer: PyTorch optimizer for gradient descent
            scheduler: PyTorch learning-rate scheduler
    '''
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=11)

    # move model to cuda memory
    model.cuda()

    # watch model in wandb
    # wandb.watch(model)

    # check the number of model parameters
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # if using multi-gpu, train model in parallel
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # apply a different weight_decay to each parameter group
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)]
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # get optimizer from optimizer.py
    optimizer = create_optimizer(CFG.optimizer,
                                 params=optimizer_grouped_parameters,
                                 lr=CFG.learning_rate,
                                 **CFG.optimizer_params)

    # get scheduler from scheduler.py
    scheduler = create_scheduler(CFG.scheduler,
                                 optimizer=optimizer,
                                 **CFG.scheduler_params)

    return model, optimizer, scheduler
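
The no-decay grouping above (biases and LayerNorm parameters excluded from weight decay) works with any torch optimizer; here is a self-contained sketch of the same pattern with an illustrative toy model and hyperparameters:

import torch

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(16, 16)
        self.LayerNorm = torch.nn.LayerNorm(16)

model = Tiny()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
params = list(model.named_parameters())
grouped = [
    # decayed group: everything not matching a no_decay pattern
    {"params": [p for n, p in params if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    # undecayed group: biases and LayerNorm parameters
    {"params": [p for n, p in params if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=3e-4)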
Example #5
import signal
from setproctitle import setproctitle
import logging
logging.basicConfig()

import scheduler
import server
import models


def receive_signal(signum, stack):
    store.close()
    my_scheduler.close()
    server.close()


def setup_process():
    setproctitle('aws-sns-scheduler')
    signal.signal(signal.SIGUSR1, receive_signal)
    signal.signal(signal.SIGUSR2, receive_signal)


if __name__ == '__main__':
    setup_process()
    store = models.create_store()
    my_scheduler = scheduler.create_scheduler()
    my_scheduler.start()
    server.run(my_scheduler, store)
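
The scheduler module itself is not shown. Since the object exposes start() and close(), it could be a thin wrapper around APScheduler's BackgroundScheduler; this is a sketch under that assumption, with close() as an alias added to match the signal handler above:

from apscheduler.schedulers.background import BackgroundScheduler

class _Scheduler:
    # Hypothetical wrapper so the signal handler can call close().
    def __init__(self):
        self._sched = BackgroundScheduler()

    def start(self):
        self._sched.start()

    def add_job(self, func, trigger, **trigger_args):
        return self._sched.add_job(func, trigger, **trigger_args)

    def close(self):
        # Stop the scheduler without waiting for running jobs.
        self._sched.shutdown(wait=False)

def create_scheduler():
    return _Scheduler()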
Example #6
    # Define optimizer and loss function
    person_id_criterion = CrossEntropyLabelSmooth(
        train_dataset.number_classes(), use_gpu=opt.cuda)
    attribute_criterion = AttributeCriterion(attribute_choices,
                                             CrossEntropyLabelSmooth)
    triplet_criterion = TripletLoss(opt.margin)
    optimizer = optim.Adam(model.parameters(), lr=opt.lr,
                           weight_decay=5e-4)  # Default lr = 3e-4

    print("Using triplet loss = ", triplet_criterion)
    print("Using person_id = ", person_id_criterion)
    print("Using Attribute loss = ", attribute_criterion)
    print("Optimizer = ", optimizer)

    # scheduler creation
    lr_scheduler, num_epochs = create_scheduler(opt, optimizer)

    if epoch > 0:
        lr_scheduler.step(epoch)
    print("Scheduled epochs: ", num_epochs)
    print("learning rates ",
          [lr_scheduler._get_lr(epoch) for epoch in range(num_epochs)])

    # Training routine
    while epoch < num_epochs:

        # Training procedure
        train_epoch(
            model,
            dataloader,
            optimizer,
Example #7
def get_model(train_iter):
    # Get the model from mask_model.py and instantiate it with parameters
    model_module = getattr(import_module("mask_model"), CFG.model)
    model = model_module()

    # Move the model to GPU memory
    model.cuda()
    
    # Print the number of trainable parameters (weights) of the model
    print('parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
    
    # If more than one GPU is available, use DataParallel training
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Get each criterion from loss.py and instantiate it with parameters
    criterion_mask = create_criterion(CFG.criterion, classes=3, smoothing=0.05)
    criterion_gender = create_criterion('cross_entropy')
    criterion_age = create_criterion(CFG.criterion, classes=3, smoothing=0.05)

    # Get each optimizer from optimizer.py and instantiate it with parameters
    optimizer_backbone = create_optimizer(
        CFG.optimizer,
        params=model.backbone.parameters(),
        lr = CFG.learning_rate * 0.1,
        momentum=0.9,
        weight_decay=1e-2
    )
    optimizer_classifier = create_optimizer(
        CFG.optimizer,
        params=[
            {"params": model.mask_layer.parameters()},
            {"params": model.gender_layer.parameters()},
            {"params": model.age_layer.parameters()},
        ],
        lr = CFG.learning_rate,
        momentum=0.9,
        weight_decay=1e-2
    )

    # Get each scheduler from scheduler.py and instantiate it with parameters
    scheduler_backbone = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_backbone,
        max_lr=CFG.learning_rate * 0.1,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5/CFG.nepochs,
        anneal_strategy='cos'
    )
    scheduler_classifier = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_classifier,
        max_lr=CFG.learning_rate,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5/CFG.nepochs,
        anneal_strategy='cos'
    )

    return model, criterion_mask, criterion_gender, criterion_age, optimizer_backbone, optimizer_classifier, scheduler_backbone, scheduler_classifier
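
Because both schedulers are built with epochs and steps_per_epoch (OneCycle-style), they are presumably stepped once per batch rather than once per epoch. A self-contained illustration with the stock torch OneCycleLR (the model and numbers are dummies):

import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
epochs, steps_per_epoch = 3, 5
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=0.1, epochs=epochs, steps_per_epoch=steps_per_epoch,
    anneal_strategy='cos')
for epoch in range(epochs):
    for step in range(steps_per_epoch):
        optimizer.zero_grad()
        loss = model(torch.randn(4, 10)).sum()  # dummy forward pass
        loss.backward()
        optimizer.step()
        scheduler.step()  # step per batch: OneCycle counts total steps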
Example #8
def run(args):

    setup_default_logging()
    #args = parser.parse_args()
    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            logging.warning(
                'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.'
            )
            args.num_gpu = 1

    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on %d GPUs.' %
                     args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel()
                                       for m in model.parameters()])))

    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)

    # optionally resume from a checkpoint
    optimizer_state = None
    resume_epoch = None
    if args.resume:
        optimizer_state, resume_epoch = resume_checkpoint(model, args.resume)

    if args.num_gpu > 1:
        if args.amp:
            logging.warning(
                'AMP does not work well with nn.DataParallel, disabling. Use distributed mode for multi-GPU AMP.'
            )
            args.amp = False
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state)

    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed',
            'on' if use_amp else 'off'))

    model_ema = None
    if args.model_ema:
        # create EMA model after cuda()
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                        model)
                if args.local_rank == 0:
                    logging.info(
                        'Converted model to use Synchronized BatchNorm.')
            except Exception as e:
                logging.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1'
                )
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info(
                    "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP."
                )
            model = DDP(model,
                        device_ids=[args.local_rank
                                    ])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing,
                                      args.num_classes)

    # Load dataset
    data_dir = os.path.join(args.data, 'img')
    if not os.path.exists(data_dir):
        logging.error('Training folder does not exist at: {}'.format(data_dir))
        exit(1)
    dataset_train = MultiViewDataSet(train_file,
                                     class_file,
                                     data_dir,
                                     transform=transform_train)
    dataset_eval = MultiViewDataSet(test_file,
                                    class_file,
                                    data_dir,
                                    transform=transform_eval)

    loader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=1)
    if 0:
        loader_train = create_loader(
            dataset_train,
            input_size=data_config['input_size'],
            batch_size=args.batch_size,
            is_training=True,
            use_prefetcher=args.prefetcher,
            rand_erase_prob=args.reprob,
            rand_erase_mode=args.remode,
            color_jitter=args.color_jitter,
            interpolation='random',
            mean=data_config['mean'],
            std=data_config['std'],
            num_workers=args.workers,
            distributed=args.distributed,
            collate_fn=collate_fn,
        )

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    metrics_history = OrderedDict()
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"), args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir,
                                decreasing=decreasing)

    try:
        for epoch in range(start_epoch, num_epochs):

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        use_amp=use_amp,
                                        model_ema=model_ema)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if model_ema is not None and not args.model_ema_force_cpu:
                ema_eval_metrics = validate(model_ema.ema,
                                            loader_eval,
                                            validate_loss_fn,
                                            args,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                metrics_history[epoch] = eval_metrics
                make_plots(metrics_history, output_dir)

                best_metric, best_epoch = saver.save_checkpoint(
                    model,
                    optimizer,
                    args,
                    epoch=epoch,
                    model_ema=model_ema,
                    metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
Example #9
def main():
    args = parser.parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            print(
                'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.'
            )
            args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    r = -1
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        r = torch.distributed.get_rank()

    if args.distributed:
        print(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (r, args.world_size))
    else:
        print('Training with a single process on %d GPUs.' % args.num_gpu)

    # FIXME seed handling for multi-process distributed?
    torch.manual_seed(args.seed)

    output_dir = ''
    if args.local_rank == 0:
        if args.output:
            output_base = args.output
        else:
            output_base = './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"), args.model,
            str(args.img_size)
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model,
                                      args,
                                      verbose=args.local_rank == 0)

    # optionally resume from a checkpoint
    start_epoch = 0
    optimizer_state = None
    if args.resume:
        optimizer_state, start_epoch = resume_checkpoint(
            model, args.resume, args.start_epoch)

    if args.num_gpu > 1:
        if args.amp:
            print(
                'Warning: AMP does not work well with nn.DataParallel, disabling. '
                'Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state)

    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
        print('AMP enabled')
    else:
        use_amp = False
        print('AMP disabled')

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)
    if args.local_rank == 0:
        print('Scheduled epochs: ', num_epochs)

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        print('Error: training folder does not exist at: %s' % train_dir)
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing,
                                      args.num_classes)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        rand_erase_prob=args.reprob,
        rand_erase_mode=args.remode,
        interpolation='random',  # FIXME cleanly resolve this? data_config['interpolation']
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
    )

    eval_dir = os.path.join(args.data, 'validation')
    if not os.path.isdir(eval_dir):
        print('Error: validation folder does not exist at: %s' % eval_dir)
        exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    saver = None
    if output_dir:
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir,
                                decreasing=decreasing)
    best_metric = None
    best_epoch = None
    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        use_amp=use_amp)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if lr_scheduler is not None:
                lr_scheduler.step(epoch, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                best_metric, best_epoch = saver.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'args': args,
                    },
                    epoch=epoch + 1,
                    metric=eval_metrics[eval_metric])

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        print('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
Example #10
def main():
    cfg, args = _parse_args()
    torch.manual_seed(args.seed)

    output_base = cfg.OUTPUT_DIR if len(cfg.OUTPUT_DIR) > 0 else './output'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"), cfg.MODEL.ARCHITECTURE,
        str(cfg.INPUT.IMG_SIZE)
    ])
    output_dir = get_outdir(output_base, exp_name)
    with open(os.path.join(output_dir, 'config.yaml'), 'w',
              encoding='utf-8') as file_writer:
        # cfg.dump(stream=file_writer, default_flow_style=False, indent=2, allow_unicode=True)
        file_writer.write(pyaml.dump(cfg))
    logger = setup_logger(file_name=os.path.join(output_dir, 'train.log'),
                          control_log=False,
                          log_level='INFO')

    # create model
    model = create_model(cfg.MODEL.ARCHITECTURE,
                         num_classes=cfg.MODEL.NUM_CLASSES,
                         pretrained=True,
                         in_chans=cfg.INPUT.IN_CHANNELS,
                         drop_rate=cfg.MODEL.DROP_RATE,
                         drop_connect_rate=cfg.MODEL.DROP_CONNECT,
                         global_pool=cfg.MODEL.GLOBAL_POOL)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    gpu_list = list(map(int, args.gpu.split(',')))
    device = 'cuda'
    if len(gpu_list) == 1:
        model.cuda()
        torch.backends.cudnn.benchmark = True
    elif len(gpu_list) > 1:
        model = nn.DataParallel(model, device_ids=gpu_list)
        model = convert_model(model).cuda()
        torch.backends.cudnn.benchmark = True
    else:
        device = 'cpu'
    logger.info('device: {}, gpu_list: {}'.format(device, gpu_list))

    optimizer = create_optimizer(cfg, model)

    # optionally initialize from a checkpoint
    if args.initial_checkpoint and os.path.isfile(args.initial_checkpoint):
        load_checkpoint(model, args.initial_checkpoint)

    # optionally resume from a checkpoint
    resume_state = None
    resume_epoch = None
    if args.resume and os.path.isfile(args.resume):
        resume_state, resume_epoch = resume_checkpoint(model, args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            optimizer.load_state_dict(resume_state['optimizer'])
            logger.info('Restoring optimizer state from [{}]'.format(
                args.resume))

    start_epoch = 0
    if args.start_epoch is not None:
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch

    model_ema = None
    if cfg.SOLVER.EMA:
        # Important to create EMA model after cuda()
        model_ema = ModelEma(model,
                             decay=cfg.SOLVER.EMA_DECAY,
                             device=device,
                             resume=args.resume)

    lr_scheduler, num_epochs = create_scheduler(cfg, optimizer)
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    # summary
    print('=' * 60)
    print(cfg)
    print('=' * 60)
    print(model)
    print('=' * 60)
    summary(model, (3, cfg.INPUT.IMG_SIZE, cfg.INPUT.IMG_SIZE))

    # dataset
    dataset_train = Dataset(cfg.DATASETS.TRAIN)
    dataset_valid = Dataset(cfg.DATASETS.TEST)
    train_loader = create_loader(dataset_train, cfg, is_training=True)
    valid_loader = create_loader(dataset_valid, cfg, is_training=False)

    # loss function
    if cfg.SOLVER.LABEL_SMOOTHING > 0:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=cfg.SOLVER.LABEL_SMOOTHING).to(device)
        validate_loss_fn = nn.CrossEntropyLoss().to(device)
    else:
        train_loss_fn = nn.CrossEntropyLoss().to(device)
        validate_loss_fn = train_loss_fn

    eval_metric = cfg.SOLVER.EVAL_METRIC
    best_metric = None
    best_epoch = None
    saver = CheckpointSaver(
        checkpoint_dir=output_dir,
        recovery_dir=output_dir,
        decreasing=True if eval_metric == 'loss' else False)
    try:
        for epoch in range(start_epoch, num_epochs):
            train_metrics = train_epoch(epoch,
                                        model,
                                        train_loader,
                                        optimizer,
                                        train_loss_fn,
                                        cfg,
                                        logger,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        device=device,
                                        model_ema=model_ema)

            eval_metrics = validate(epoch, model, valid_loader,
                                    validate_loss_fn, cfg, logger)

            if model_ema is not None:
                ema_eval_metrics = validate(epoch, model_ema.ema, valid_loader,
                                            validate_loss_fn, cfg, logger)
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    model,
                    optimizer,
                    cfg,
                    epoch=epoch,
                    model_ema=model_ema,
                    metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logger.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
Example #11
def get_model():
    # Get the specified model defined in recycle_model.py.
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # Move the model's parameters to GPU memory.
    model.cuda()

    # Watch the model in wandb.
    wandb.watch(model)

    # Print the number of trainable model parameters.
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # If there is more than one GPU, enable DataParallel training.
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Get the criterion defined in loss.py.
    criterion = create_criterion(CFG.criterion)

    # Get the optimizer defined in optimizer.py.
    if CFG.optimizer == "Adam":
        optimizer = create_optimizer(
            CFG.optimizer,
            params=[
                {
                    "params": model.seg_model.encoder.parameters(),
                    "lr": CFG.learning_rate * 0.1
                },
                {
                    "params": model.seg_model.decoder.parameters()
                },
                {
                    "params": model.seg_model.segmentation_head.parameters()
                },
            ],
            lr=CFG.learning_rate,
            weight_decay=1e-6)
    elif CFG.optimizer == "RAdam":
        optimizer = create_optimizer(
            CFG.optimizer,
            params=[
                {
                    "params": model.seg_model.encoder.parameters(),
                    "lr": CFG.learning_rate * 0.1
                },
                {
                    "params": model.seg_model.decoder.parameters()
                },
                {
                    "params": model.seg_model.segmentation_head.parameters()
                },
            ],
            lr=CFG.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=0)
    elif CFG.optimizer == "AdamP":
        optimizer = create_optimizer(
            CFG.optimizer,
            params=[
                {
                    "params": model.seg_model.encoder.parameters(),
                    "lr": CFG.learning_rate * 0.1
                },
                {
                    "params": model.seg_model.decoder.parameters()
                },
                {
                    "params": model.seg_model.segmentation_head.parameters()
                },
            ],
            lr=CFG.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=0)
    elif CFG.optimizer == "AdamW":
        optimizer = create_optimizer(
            CFG.optimizer,
            params=[
                {
                    "params": model.seg_model.encoder.parameters(),
                    "lr": CFG.learning_rate * 0.1
                },
                {
                    "params": model.seg_model.decoder.parameters()
                },
                {
                    "params": model.seg_model.segmentation_head.parameters()
                },
            ],
            lr=CFG.learning_rate,
            amsgrad=True)
    elif CFG.optimizer == "RMSprop":
        optimizer = create_optimizer(
            CFG.optimizer,
            params=[
                {
                    "params": model.seg_model.encoder.parameters(),
                    "lr": CFG.learning_rate * 0.1
                },
                {
                    "params": model.seg_model.decoder.parameters()
                },
                {
                    "params": model.seg_model.segmentation_head.parameters()
                },
            ],
            lr=CFG.learning_rate)

    # Get the scheduler defined in scheduler.py.
    if CFG.scheduler == "StepLR":
        scheduler = create_scheduler(CFG.scheduler,
                                     optimizer=optimizer,
                                     step_size=5,
                                     gamma=0.95)
    elif CFG.scheduler == "CosineAnnealingWarmupRestarts":
        scheduler = create_scheduler(
            CFG.scheduler,
            optimizer=optimizer,
            first_cycle_steps=5,
            cycle_mult=1.,
            max_lr=1e-4,
            min_lr=1e-7,
        )

    return model, criterion, optimizer, scheduler
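
Every optimizer branch above passes the same param groups; the "lr" key in the encoder group overrides the optimizer-level default, which is standard torch behavior and can be checked standalone (the toy model below is illustrative):

import torch

model = torch.nn.ModuleDict({
    "encoder": torch.nn.Linear(8, 8),
    "decoder": torch.nn.Linear(8, 8),
})
optimizer = torch.optim.Adam(
    [{"params": model["encoder"].parameters(), "lr": 1e-4},  # per-group override
     {"params": model["decoder"].parameters()}],             # inherits default lr
    lr=1e-3, weight_decay=1e-6)
print([g["lr"] for g in optimizer.param_groups])  # [0.0001, 0.001]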
Example #12
    async def test_05_monitor(self):
        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"help monitor TOTOTOTO"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await
             resp.json())["text"].startswith("Unknown command: TOTOTOTO"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"help monitor"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await resp.json())["text"].startswith("`/tromino monitor ["))

        resp = await self.client.request("POST",
                                         "/mattermost/",
                                         data={
                                             "command": "/tromino",
                                             "text": f"monitor"
                                         })
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await resp.json())["text"].startswith("`/tromino monitor ["))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"help monitor types_list"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await
             resp.json())["text"].startswith("`/tromino monitor types_list"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor types_list"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue((await
                         resp.json())["text"].startswith("Monitors types:"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"help monitor create_monitor"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await resp.json()
             )["text"].startswith("`/tromino monitor create_monitor"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor create_monitor dummytest"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await resp.json()
             )["text"].startswith("`/tromino monitor create_monitor"))

        # Create scheduler
        scheduler.clean_scheduler()
        scheduler.scheduler = scheduler.create_scheduler()
        scheduler.scheduler.start()

        Notifications.read_notifications()
        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor create_monitor dummytest dummytime 1",
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertEqual((await resp.json())["text"],
                         "Monitor `dummytest` created")

        await asyncio.sleep(5)
        notifications = Notifications.read_notifications()
        self.assertEqual(notifications[0]["text"], "First compare")
        self.assertIn(len(notifications), [4, 5, 6])
        for i in range(1, len(notifications)):
            self.assertTrue(
                notifications[i]["text"].startswith("Since last refresh"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"help monitor mon-dummytest"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue((await
                         resp.json())["text"].startswith("`/tromino monitor"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor mon-XXX"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue((await
                         resp.json())["text"].startswith("Unknown monitor"))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor mon-dummytest TOTOTO"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue((await
                         resp.json())["text"].startswith("Unknown command: "))

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor mon-dummytest set-channel dummychannel",
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await resp.json()
             )["text"].startswith("Monitor `dummytest` changed channel"))
        Notifications.read_notifications()  # Flush
        await asyncio.sleep(5)
        notifications = Notifications.read_notifications()
        self.assertIn(len(notifications), [4, 5, 6])
        for notification in notifications:
            self.assertEqual(notification["channel"], "dummychannel")

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor mon-dummytest set-channel"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertTrue(
            (await resp.json()
             )["text"].startswith("Monitor `dummytest` changed channel"))
        Notifications.read_notifications()  # Flush
        await asyncio.sleep(5)
        notifications = Notifications.read_notifications()
        self.assertIn(len(notifications), [4, 5, 6])
        for notification in notifications:
            self.assertNotIn("channel", notification)

        resp = await self.client.request(
            "POST",
            "/mattermost/",
            data={
                "command": "/tromino",
                "text": f"monitor mon-dummytest remove"
            },
        )
        self.assertEqual(resp.status, 200)
        self.assertEqual((await resp.json())["text"],
                         "Monitor `dummytest` removed")

        await asyncio.sleep(5)  # Wait for alive job to finish
        Notifications.read_notifications()
        await asyncio.sleep(5)
        notifications = Notifications.read_notifications()
        self.assertEqual(len(notifications), 0)

        # Clean scheduler
        scheduler.clean_scheduler()
Example #13
def main(args):
    seed_everything(21)
    load_dotenv()

    if WANDB:
        if args.ENCODER:
            run_name = args.MODEL + "_" + args.ENCODER
        else:
            run_name = args.MODEL

    if args.KFOLD > 1:
        if args.KFOLD != 5:
            print("Only 5 KFOLD is available")
            return

        # Create the folder for saving .pt checkpoints
        path_pair = args.MODEL_PATH.split(".")
        os.makedirs(path_pair[0], exist_ok=True)
        # Copy args for reuse
        args_origin = copy.deepcopy(args)

    for fold in range(args.KFOLD):
        # Configure the dataloader differently for hold-out vs. k-fold
        if args.KFOLD > 1:
            args = copy.deepcopy(args_origin)
            path_pair = args_origin.MODEL_PATH.split(".")
            # Update MODEL_PATH
            args.MODEL_PATH = (path_pair[0] + f"/kfold_{fold+1}." +
                               path_pair[1])
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name + f"_k{fold+1}",
                    config=args,
                    reinit=True,
                )
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE, fold_index=fold)
            print(f"\nfold {fold+1} start")
        else:
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name,
                    reinit=True,
                )
                wandb.config.update(args)
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE)
        print("Get loader")

        model = get_model(args.MODEL, args.ENCODER).to(args.device)
        print("Load model")

        if WANDB:
            wandb.watch(model)

        criterion = []
        if "+" in args.LOSS:
            criterion.append("+")
            criterion.append(create_criterion(args.LOSS.split("+")[0]))
            criterion.append(create_criterion(args.LOSS.split("+")[1]))
        elif "-" in args.LOSS:
            criterion.append("-")
            criterion.append(create_criterion(args.LOSS.split("-")[0]))
            criterion.append(create_criterion(args.LOSS.split("-")[1]))
        else:
            criterion.append("0")
            criterion.append(create_criterion(args.LOSS))
        optimizer = create_optimizer(args.OPTIMIZER, model, args.LEARNING_RATE)
        if args.SCHEDULER:
            scheduler = create_scheduler(args.SCHEDULER, optimizer)
        else:
            scheduler = None
        # optimizer = optim.Adam(params = model.parameters(), lr = args.LEARNING_RATE, weight_decay=1e-6)

        print("Run")
        run(args, model, criterion, optimizer, dataloader, fold, scheduler)
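
The criterion list is tagged with a "+", "-", or "0" marker; run() is not shown, but presumably it combines the two losses according to that marker. A hedged sketch of that consumption logic (the function name and reduction are assumptions):

def combine_losses(criterion, outputs, targets):
    # criterion is the list built above: [marker, loss_fn, optional second loss_fn]
    marker = criterion[0]
    if marker == "+":
        return criterion[1](outputs, targets) + criterion[2](outputs, targets)
    if marker == "-":
        return criterion[1](outputs, targets) - criterion[2](outputs, targets)
    return criterion[1](outputs, targets)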