def get_model():
    # get the specified model defined in recycle_model.py
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # move the model parameters to GPU memory
    model.cuda()

    # watch the model with wandb
    wandb.watch(model)

    # print the number of model parameters
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # if more than one GPU is available, enable DataParallel training
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # get the criterion defined in loss.py
    criterion = create_criterion(CFG.criterion)

    # get the optimizers defined in optimizer.py
    optimizer_encoder = create_optimizer(
        CFG.optimizer,
        params=model.seg_model.encoder.parameters(),
        lr=1e-8)
    optimizer_decoder = create_optimizer(
        CFG.optimizer,
        params=[{"params": model.seg_model.decoder.parameters()},
                {"params": model.seg_model.segmentation_head.parameters()}],
        lr=1e-8)

    # get the schedulers defined in scheduler.py
    scheduler_encoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_encoder,
                                         T_0=30, T_mult=2,
                                         eta_max=CFG.learning_rate * 0.1,
                                         T_up=5, gamma=0.3)
    scheduler_decoder = create_scheduler(CFG.scheduler,
                                         optimizer=optimizer_decoder,
                                         T_0=30, T_mult=2,
                                         eta_max=CFG.learning_rate,
                                         T_up=5, gamma=0.3)

    return (model, criterion, optimizer_encoder, optimizer_decoder,
            scheduler_encoder, scheduler_decoder)
def __init__(self, model, model_params, load_data=True, debug=False, batch_size=64):
    super().__init__(model,
                     model_params,
                     load_data=load_data,
                     debug=debug,
                     batch_size=batch_size,
                     name="TaskLanguageModeling")
    prior_dist_params = get_param_val(
        self.model_params,
        "prior_distribution",
        allow_default=False,
        error_location="TaskLanguageModeling - init")
    self.prior_distribution = create_prior_distribution(prior_dist_params)
    self.beta_scheduler = create_scheduler(self.model_params["beta"], "beta")
    self.summary_dict = {
        "log_prob": list(),
        "ldj": list(),
        "z": list(),
        "beta": 0
    }
def main_worker(config: EGAConfig):
    ega_state = EGA_state()
    criterio = Criterio(config.weight_matrix)
    scheduler = create_scheduler(ega_state, config)
    start_pop = launch_init(config)
    population = [encode(x) for x in start_pop]
    ega_state.add(('gen_overlap', config.get('gen_overlap', 0.5)))

    for i in range(config.get('max_iter', 10000)):
        ega_state.add(('pop_amount', len(population)))
        population = evolution_cycle(population, config, criterio, ega_state)
        print(f'iter: {i}')
        ega_state.add(
            ('max_criterio', 1 / criterio(max(population, key=criterio))))
        ega_state.add(('mean_criterio',
                       sum([1 / criterio(vector) for vector in population]) /
                       len(population)))
        mean_criterio_averagemetr.update(ega_state.mean_criterio)

        tb_logger.add_scalar('max_criterio', ega_state.max_criterio, i)
        tb_logger.add_scalar('mean_criterio', mean_criterio_averagemetr.value, i)
        tb_logger.add_scalar('gen_overlap', ega_state.gen_overlap, i)
        print(f'Max criterio: {ega_state.max_criterio}')
        print(f'Mean criterio: {ega_state.mean_criterio}')
        print(f'Gen overlap: {ega_state.gen_overlap}')

        if config.get('scheduler', {}).get('mean_or_max', 'max') == 'mean':
            scheduler.step(mean_criterio_averagemetr.value)
        else:
            scheduler.step(ega_state.max_criterio)
        if ega_state.gen_overlap < 0.01:
            return
def get_model():
    '''
    get the model defined in recycle_model.py

    Returns:
        model: pytorch model that will be trained
        optimizer: pytorch optimizer for gradient descent
        scheduler: pytorch lr scheduler
    '''
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=11)

    # move model to cuda memory
    model.cuda()

    # watch model in wandb
    # wandb.watch(model)

    # check the number of model parameters
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # if using multi-gpu, train model in parallel
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # set weight_decay differently per parameter group
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)]
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # get optimizer from optimizer.py
    optimizer = create_optimizer(CFG.optimizer,
                                 params=optimizer_grouped_parameters,
                                 lr=CFG.learning_rate,
                                 **CFG.optimizer_params)

    # get scheduler from scheduler.py
    scheduler = create_scheduler(CFG.scheduler,
                                 optimizer=optimizer,
                                 **CFG.scheduler_params)

    return model, optimizer, scheduler
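# The snippets above look up optimizers and schedulers by name through project-local
# create_optimizer / create_scheduler helpers (optimizer.py, scheduler.py), which are not
# shown here. The block below is a hypothetical, minimal sketch of such a factory that
# dispatches on torch.optim / torch.optim.lr_scheduler via getattr; it is illustrative
# only and not the actual code of any of these projects.
import torch


def create_optimizer(name, params, **kwargs):
    # look the optimizer class up by name, e.g. "Adam" -> torch.optim.Adam
    optimizer_cls = getattr(torch.optim, name)
    return optimizer_cls(params, **kwargs)


def create_scheduler(name, optimizer, **kwargs):
    # look the scheduler class up by name, e.g. "StepLR" -> torch.optim.lr_scheduler.StepLR
    scheduler_cls = getattr(torch.optim.lr_scheduler, name)
    return scheduler_cls(optimizer, **kwargs)


# example usage (hypothetical values):
# optimizer = create_optimizer("Adam", model.parameters(), lr=1e-3)
# scheduler = create_scheduler("StepLR", optimizer, step_size=5, gamma=0.95)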
import signal
from setproctitle import setproctitle
import logging
logging.basicConfig()

import scheduler
import server
import models


def receive_signal(signum, stack):
    store.close()
    my_scheduler.close()
    server.close()


def setup_process():
    setproctitle('aws-sns-scheduler')
    signal.signal(signal.SIGUSR1, receive_signal)
    signal.signal(signal.SIGUSR2, receive_signal)


if __name__ == '__main__':
    setup_process()
    store = models.create_store()
    my_scheduler = scheduler.create_scheduler()
    my_scheduler.start()
    server.run(my_scheduler, store)
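# The local scheduler module used above is not shown; all the signal handler relies on is a
# start()/close() pair. As a rough, hypothetical sketch (assuming APScheduler, which this
# project may or may not use), create_scheduler() could return a small wrapper like this;
# the SchedulerWrapper name is an illustrative assumption, not the project's actual code.
from apscheduler.schedulers.background import BackgroundScheduler


class SchedulerWrapper:
    def __init__(self):
        self._sched = BackgroundScheduler()

    def start(self):
        self._sched.start()

    def close(self):
        # shut down without waiting for running jobs, matching a signal-driven exit
        self._sched.shutdown(wait=False)


def create_scheduler():
    return SchedulerWrapper()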
# Define optimizer and loss function
person_id_criterion = CrossEntropyLabelSmooth(train_dataset.number_classes(),
                                              use_gpu=opt.cuda)
attribute_criterion = AttributeCriterion(attribute_choices,
                                         CrossEntropyLabelSmooth)
triplet_criterion = TripletLoss(opt.margin)
optimizer = optim.Adam(model.parameters(), lr=opt.lr,
                       weight_decay=5e-4)  # Default lr = 3e-4

print("Using triplet loss = ", triplet_criterion)
print("Using person_id = ", person_id_criterion)
print("Using Attribute loss = ", attribute_criterion)
print("Optimizer = ", optimizer)

# scheduler creation
lr_scheduler, num_epochs = create_scheduler(opt, optimizer)
if epoch > 0:
    lr_scheduler.step(epoch)
print("Scheduled epochs: ", num_epochs)
print("learning rates ",
      [lr_scheduler._get_lr(epoch) for epoch in range(num_epochs)])

# Training routine
while epoch < num_epochs:
    # Training procedure
    train_epoch(
        model,
        dataloader,
        optimizer,
def get_model(train_iter):
    # get model from mask_model.py and define with parameters
    model_module = getattr(import_module("mask_model"), CFG.model)
    model = model_module()

    # move the model to GPU memory
    model.cuda()

    # print the number of parameters (weights) of the defined model
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # if more than one GPU is available, use DataParallel training
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # get criterion from loss.py and define with parameters
    criterion_mask = create_criterion(CFG.criterion, classes=3, smoothing=0.05)
    criterion_gender = create_criterion('cross_entropy')
    criterion_age = create_criterion(CFG.criterion, classes=3, smoothing=0.05)

    # get optimizer from optimizer.py and define with parameters
    optimizer_backbone = create_optimizer(
        CFG.optimizer,
        params=model.backbone.parameters(),
        lr=CFG.learning_rate * 0.1,
        momentum=0.9,
        weight_decay=1e-2)
    optimizer_classifier = create_optimizer(
        CFG.optimizer,
        params=[
            {"params": model.mask_layer.parameters()},
            {"params": model.gender_layer.parameters()},
            {"params": model.age_layer.parameters()},
        ],
        lr=CFG.learning_rate,
        momentum=0.9,
        weight_decay=1e-2)

    # get scheduler from scheduler.py and define with parameters
    scheduler_backbone = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_backbone,
        max_lr=CFG.learning_rate * 0.1,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5 / CFG.nepochs,
        anneal_strategy='cos')
    scheduler_classifier = create_scheduler(
        CFG.scheduler,
        optimizer=optimizer_classifier,
        max_lr=CFG.learning_rate,
        epochs=CFG.nepochs,
        steps_per_epoch=len(train_iter),
        pct_start=5 / CFG.nepochs,
        anneal_strategy='cos')

    return (model, criterion_mask, criterion_gender, criterion_age,
            optimizer_backbone, optimizer_classifier,
            scheduler_backbone, scheduler_classifier)
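# The keyword set passed to create_scheduler above (max_lr, epochs, steps_per_epoch,
# pct_start, anneal_strategy) matches torch.optim.lr_scheduler.OneCycleLR, so CFG.scheduler
# presumably names that scheduler. A minimal sketch of the equivalent direct construction,
# with placeholder hyperparameters, stepped once per batch:
import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
steps_per_epoch, nepochs = 100, 20  # placeholders for len(train_iter) and CFG.nepochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                max_lr=1e-3,
                                                epochs=nepochs,
                                                steps_per_epoch=steps_per_epoch,
                                                pct_start=5 / nepochs,
                                                anneal_strategy='cos')
for epoch in range(nepochs):
    for step in range(steps_per_epoch):
        # forward / backward would go here
        optimizer.step()
        scheduler.step()  # OneCycleLR is stepped per batch, not per epoch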
def run(args):
    setup_default_logging()
    # args = parser.parse_args()
    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            logging.warning(
                'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
            args.num_gpu = 1

    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1  # 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on %d GPUs.' % args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)

    # optionally resume from a checkpoint
    optimizer_state = None
    resume_epoch = None
    if args.resume:
        optimizer_state, resume_epoch = resume_checkpoint(model, args.resume)

    if args.num_gpu > 1:
        if args.amp:
            logging.warning(
                'AMP does not work well with nn.DataParallel, disabling. Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state)

    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed',
            'on' if use_amp else 'off'))

    model_ema = None
    if args.model_ema:
        # create EMA model after cuda()
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    logging.info('Converted model to use Synchronized BatchNorm.')
            except Exception as e:
                logging.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info(
                    'Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.')
            model = DDP(model,
                        device_ids=[args.local_rank])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)

    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing,
                                      args.num_classes)

    # Load dataset
    data_dir = os.path.join(args.data, 'img')
    if not os.path.exists(data_dir):
        logging.error('Training folder does not exist at: {}'.format(data_dir))
        exit(1)

    dataset_train = MultiViewDataSet(train_file,
                                     class_file,
                                     data_dir,
                                     transform=transform_train)
    dataset_eval = MultiViewDataSet(test_file,
                                    class_file,
                                    data_dir,
                                    transform=transform_eval)

    loader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=1)
    if 0:
        loader_train = create_loader(
            dataset_train,
            input_size=data_config['input_size'],
            batch_size=args.batch_size,
            is_training=True,
            use_prefetcher=args.prefetcher,
            rand_erase_prob=args.reprob,
            rand_erase_mode=args.remode,
            color_jitter=args.color_jitter,
            interpolation='random',
            mean=data_config['mean'],
            std=data_config['std'],
            num_workers=args.workers,
            distributed=args.distributed,
            collate_fn=collate_fn,
        )

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    metrics_history = OrderedDict()
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"), args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir,
                                decreasing=decreasing)

    try:
        for epoch in range(start_epoch, num_epochs):
            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        use_amp=use_amp,
                                        model_ema=model_ema)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if model_ema is not None and not args.model_ema_force_cpu:
                ema_eval_metrics = validate(model_ema.ema,
                                            loader_eval,
                                            validate_loss_fn,
                                            args,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                metrics_history[epoch] = eval_metrics
                make_plots(metrics_history, output_dir)
                best_metric, best_epoch = saver.save_checkpoint(
                    model,
                    optimizer,
                    args,
                    epoch=epoch,
                    model_ema=model_ema,
                    metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
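# The training scripts above and below call create_scheduler(args, optimizer) and receive a
# (lr_scheduler, num_epochs) pair; the surrounding create_model / create_loader / ModelEma
# calls suggest (but do not confirm) that this is timm's scheduler factory. A minimal sketch
# of the per-epoch stepping contract, assuming timm is installed and using one of its
# schedulers directly with placeholder values:
import torch
from timm.scheduler import CosineLRScheduler

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# roughly what the factory would build for a cosine schedule with a short warmup
lr_scheduler = CosineLRScheduler(optimizer,
                                 t_initial=10,
                                 lr_min=1e-5,
                                 warmup_t=1,
                                 warmup_lr_init=1e-4)
num_epochs = 10
for epoch in range(num_epochs):
    # ... train_epoch() / validate() would run here ...
    eval_metric = 0.0  # placeholder for the monitored validation metric
    lr_scheduler.step(epoch + 1, eval_metric)  # timm schedulers step per epoch, optionally with a metric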
def main():
    args = parser.parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            print('Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
            args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    r = -1
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        r = torch.distributed.get_rank()

    if args.distributed:
        print('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
              % (r, args.world_size))
    else:
        print('Training with a single process on %d GPUs.' % args.num_gpu)

    # FIXME seed handling for multi-process distributed?
    torch.manual_seed(args.seed)

    output_dir = ''
    if args.local_rank == 0:
        if args.output:
            output_base = args.output
        else:
            output_base = './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"), args.model,
            str(args.img_size)
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args, verbose=args.local_rank == 0)

    # optionally resume from a checkpoint
    start_epoch = 0
    optimizer_state = None
    if args.resume:
        optimizer_state, start_epoch = resume_checkpoint(
            model, args.resume, args.start_epoch)

    if args.num_gpu > 1:
        if args.amp:
            print('Warning: AMP does not work well with nn.DataParallel, disabling. '
                  'Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state)

    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
        print('AMP enabled')
    else:
        use_amp = False
        print('AMP disabled')

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)
    if args.local_rank == 0:
        print('Scheduled epochs: ', num_epochs)

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        print('Error: training folder does not exist at: %s' % train_dir)
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing,
                                      args.num_classes)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        rand_erase_prob=args.reprob,
        rand_erase_mode=args.remode,
        interpolation='random',  # FIXME cleanly resolve this? data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
    )

    eval_dir = os.path.join(args.data, 'validation')
    if not os.path.isdir(eval_dir):
        print('Error: validation folder does not exist at: %s' % eval_dir)
        exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    saver = None
    if output_dir:
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir,
                                decreasing=decreasing)
    best_metric = None
    best_epoch = None
    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        use_amp=use_amp)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if lr_scheduler is not None:
                lr_scheduler.step(epoch, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                best_metric, best_epoch = saver.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'args': args,
                    },
                    epoch=epoch + 1,
                    metric=eval_metrics[eval_metric])

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        print('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
def main():
    cfg, args = _parse_args()
    torch.manual_seed(args.seed)

    output_base = cfg.OUTPUT_DIR if len(cfg.OUTPUT_DIR) > 0 else './output'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"), cfg.MODEL.ARCHITECTURE,
        str(cfg.INPUT.IMG_SIZE)
    ])
    output_dir = get_outdir(output_base, exp_name)
    with open(os.path.join(output_dir, 'config.yaml'), 'w',
              encoding='utf-8') as file_writer:
        # cfg.dump(stream=file_writer, default_flow_style=False, indent=2, allow_unicode=True)
        file_writer.write(pyaml.dump(cfg))
    logger = setup_logger(file_name=os.path.join(output_dir, 'train.log'),
                          control_log=False,
                          log_level='INFO')

    # create model
    model = create_model(cfg.MODEL.ARCHITECTURE,
                         num_classes=cfg.MODEL.NUM_CLASSES,
                         pretrained=True,
                         in_chans=cfg.INPUT.IN_CHANNELS,
                         drop_rate=cfg.MODEL.DROP_RATE,
                         drop_connect_rate=cfg.MODEL.DROP_CONNECT,
                         global_pool=cfg.MODEL.GLOBAL_POOL)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    gpu_list = list(map(int, args.gpu.split(',')))
    device = 'cuda'
    if len(gpu_list) == 1:
        model.cuda()
        torch.backends.cudnn.benchmark = True
    elif len(gpu_list) > 1:
        model = nn.DataParallel(model, device_ids=gpu_list)
        model = convert_model(model).cuda()
        torch.backends.cudnn.benchmark = True
    else:
        device = 'cpu'
    logger.info('device: {}, gpu_list: {}'.format(device, gpu_list))

    optimizer = create_optimizer(cfg, model)

    # optionally initialize from a checkpoint
    if args.initial_checkpoint and os.path.isfile(args.initial_checkpoint):
        load_checkpoint(model, args.initial_checkpoint)

    # optionally resume from a checkpoint
    resume_state = None
    resume_epoch = None
    if args.resume and os.path.isfile(args.resume):
        resume_state, resume_epoch = resume_checkpoint(model, args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            optimizer.load_state_dict(resume_state['optimizer'])
            logger.info('Restoring optimizer state from [{}]'.format(args.resume))

    start_epoch = 0
    if args.start_epoch is not None:
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch

    model_ema = None
    if cfg.SOLVER.EMA:
        # Important to create EMA model after cuda()
        model_ema = ModelEma(model,
                             decay=cfg.SOLVER.EMA_DECAY,
                             device=device,
                             resume=args.resume)

    lr_scheduler, num_epochs = create_scheduler(cfg, optimizer)
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    # summary
    print('=' * 60)
    print(cfg)
    print('=' * 60)
    print(model)
    print('=' * 60)
    summary(model, (3, cfg.INPUT.IMG_SIZE, cfg.INPUT.IMG_SIZE))

    # dataset
    dataset_train = Dataset(cfg.DATASETS.TRAIN)
    dataset_valid = Dataset(cfg.DATASETS.TEST)
    train_loader = create_loader(dataset_train, cfg, is_training=True)
    valid_loader = create_loader(dataset_valid, cfg, is_training=False)

    # loss function
    if cfg.SOLVER.LABEL_SMOOTHING > 0:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=cfg.SOLVER.LABEL_SMOOTHING).to(device)
        validate_loss_fn = nn.CrossEntropyLoss().to(device)
    else:
        train_loss_fn = nn.CrossEntropyLoss().to(device)
        validate_loss_fn = train_loss_fn

    eval_metric = cfg.SOLVER.EVAL_METRIC
    best_metric = None
    best_epoch = None
    saver = CheckpointSaver(
        checkpoint_dir=output_dir,
        recovery_dir=output_dir,
        decreasing=True if eval_metric == 'loss' else False)
    try:
        for epoch in range(start_epoch, num_epochs):
            train_metrics = train_epoch(epoch,
                                        model,
                                        train_loader,
                                        optimizer,
                                        train_loss_fn,
                                        cfg,
                                        logger,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        device=device,
                                        model_ema=model_ema)

            eval_metrics = validate(epoch, model, valid_loader,
                                    validate_loss_fn, cfg, logger)

            if model_ema is not None:
                ema_eval_metrics = validate(epoch, model_ema.ema, valid_loader,
                                            validate_loss_fn, cfg, logger)
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    model,
                    optimizer,
                    cfg,
                    epoch=epoch,
                    model_ema=model_ema,
                    metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logger.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
def get_model():
    # get the specified model defined in recycle_model.py
    model_module = getattr(import_module("recycle_model"), CFG.model)
    model = model_module(num_classes=12)

    # move the model parameters to GPU memory
    model.cuda()

    # watch the model with wandb
    wandb.watch(model)

    # print the number of model parameters
    print('parameters: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # if more than one GPU is available, enable DataParallel training
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # get the criterion defined in loss.py
    criterion = create_criterion(CFG.criterion)

    # shared parameter groups: the encoder uses a 10x smaller learning rate
    # than the decoder and segmentation head
    param_groups = [
        {"params": model.seg_model.encoder.parameters(),
         "lr": CFG.learning_rate * 0.1},
        {"params": model.seg_model.decoder.parameters()},
        {"params": model.seg_model.segmentation_head.parameters()},
    ]

    # get the optimizer defined in optimizer.py
    if CFG.optimizer == "Adam":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     weight_decay=1e-6)
    elif CFG.optimizer == "RAdam":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     betas=(0.9, 0.999),
                                     eps=1e-8,
                                     weight_decay=0)
    elif CFG.optimizer == "AdamP":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     betas=(0.9, 0.999),
                                     eps=1e-8,
                                     weight_decay=0)
    elif CFG.optimizer == "AdamW":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate,
                                     amsgrad=True)
    elif CFG.optimizer == "RMSprop":
        optimizer = create_optimizer(CFG.optimizer,
                                     params=param_groups,
                                     lr=CFG.learning_rate)

    # get the scheduler defined in scheduler.py
    if CFG.scheduler == "StepLR":
        scheduler = create_scheduler(CFG.scheduler,
                                     optimizer=optimizer,
                                     step_size=5,
                                     gamma=0.95)
    elif CFG.scheduler == "CosineAnnealingWarmupRestarts":
        scheduler = create_scheduler(CFG.scheduler,
                                     optimizer=optimizer,
                                     first_cycle_steps=5,
                                     cycle_mult=1.,
                                     max_lr=1e-4,
                                     min_lr=1e-7)

    return model, criterion, optimizer, scheduler
async def test_05_monitor(self):
    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"help monitor TOTOTOTO"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("Unknown command: TOTOTOTO"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"help monitor"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("`/tromino monitor ["))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("`/tromino monitor ["))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"help monitor types_list"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("`/tromino monitor types_list"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor types_list"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue((await resp.json())["text"].startswith("Monitors types:"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"help monitor create_monitor"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("`/tromino monitor create_monitor"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor create_monitor dummytest"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("`/tromino monitor create_monitor"))

    # Create scheduler
    scheduler.clean_scheduler()
    scheduler.scheduler = scheduler.create_scheduler()
    scheduler.scheduler.start()
    Notifications.read_notifications()

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={
            "command": "/tromino",
            "text": f"monitor create_monitor dummytest dummytime 1",
        },
    )
    self.assertEqual(resp.status, 200)
    self.assertEqual((await resp.json())["text"], "Monitor `dummytest` created")
    await asyncio.sleep(5)
    notifications = Notifications.read_notifications()
    self.assertEqual(notifications[0]["text"], "First compare")
    self.assertIn(len(notifications), [4, 5, 6])
    for i in range(1, len(notifications)):
        self.assertTrue(
            notifications[i]["text"].startswith("Since last refresh"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"help monitor mon-dummytest"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue((await resp.json())["text"].startswith("`/tromino monitor"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor mon-XXX"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue((await resp.json())["text"].startswith("Unknown monitor"))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor mon-dummytest TOTOTO"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue((await resp.json())["text"].startswith("Unknown command: "))

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={
            "command": "/tromino",
            "text": f"monitor mon-dummytest set-channel dummychannel",
        },
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("Monitor `dummytest` changed channel"))
    Notifications.read_notifications()  # Flush
    await asyncio.sleep(5)
    notifications = Notifications.read_notifications()
    self.assertIn(len(notifications), [4, 5, 6])
    for notification in notifications:
        self.assertEqual(notification["channel"], "dummychannel")

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor mon-dummytest set-channel"},
    )
    self.assertEqual(resp.status, 200)
    self.assertTrue(
        (await resp.json())["text"].startswith("Monitor `dummytest` changed channel"))
    Notifications.read_notifications()  # Flush
    await asyncio.sleep(5)
    notifications = Notifications.read_notifications()
    self.assertIn(len(notifications), [4, 5, 6])
    for notification in notifications:
        self.assertNotIn("channel", notification)

    resp = await self.client.request(
        "POST",
        "/mattermost/",
        data={"command": "/tromino", "text": f"monitor mon-dummytest remove"},
    )
    self.assertEqual(resp.status, 200)
    self.assertEqual((await resp.json())["text"], "Monitor `dummytest` removed")
    await asyncio.sleep(5)  # Wait for alive job to finish
    Notifications.read_notifications()
    await asyncio.sleep(5)
    notifications = Notifications.read_notifications()
    self.assertEqual(len(notifications), 0)

    # Clean scheduler
    scheduler.clean_scheduler()
def main(args):
    seed_everything(21)
    load_dotenv()

    if WANDB:
        if args.ENCODER:
            run_name = args.MODEL + "_" + args.ENCODER
        else:
            run_name = args.MODEL

    if args.KFOLD > 1:
        if args.KFOLD != 5:
            print("Only 5 KFOLD is available")
            return
        # create the folder where the .pt checkpoints are saved
        path_pair = args.MODEL_PATH.split(".")
        os.makedirs(path_pair[0], exist_ok=True)
        # copy args for reuse across folds
        args_origin = copy.deepcopy(args)

    for fold in range(args.KFOLD):
        # configure the dataloader differently for hold-out vs. k-fold
        if args.KFOLD > 1:
            args = copy.deepcopy(args_origin)
            path_pair = args_origin.MODEL_PATH.split(".")
            # change MODEL_PATH per fold
            args.MODEL_PATH = (path_pair[0] + f"/kfold_{fold+1}." + path_pair[1])
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name + f"_k{fold+1}",
                    config=args,
                    reinit=True,
                )
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE, fold_index=fold)
            print(f"\nfold {fold+1} start")
        else:
            # wandb
            if WANDB:
                wandb.init(
                    project=os.environ.get("WANDB_PROJECT_NAME"),
                    name=run_name,
                    reinit=True,
                )
                wandb.config.update(args)
                args = wandb.config
            # dataloader
            dataloader = get_dataloader(args.BATCH_SIZE)
        print("Get loader")

        model = get_model(args.MODEL, args.ENCODER).to(args.device)
        print("Load model")
        if WANDB:
            wandb.watch(model)

        criterion = []
        if "+" in args.LOSS:
            criterion.append("+")
            criterion.append(create_criterion(args.LOSS.split("+")[0]))
            criterion.append(create_criterion(args.LOSS.split("+")[1]))
        elif "-" in args.LOSS:
            criterion.append("-")
            criterion.append(create_criterion(args.LOSS.split("-")[0]))
            criterion.append(create_criterion(args.LOSS.split("-")[1]))
        else:
            criterion.append("0")
            criterion.append(create_criterion(args.LOSS))

        optimizer = create_optimizer(args.OPTIMIZER, model, args.LEARNING_RATE)
        if args.SCHEDULER:
            scheduler = create_scheduler(args.SCHEDULER, optimizer)
        else:
            scheduler = None
        # optimizer = optim.Adam(params=model.parameters(), lr=args.LEARNING_RATE, weight_decay=1e-6)
        print("Run")

        run(args, model, criterion, optimizer, dataloader, fold, scheduler)