def init_opt(args, model, logger):
    """Build the optimizer (and optional LR scheduler) selected by ``args``.

    Supported ``args.optimizer`` values are ``'adam'``, ``'radam'`` and
    ``'sgd'`` (asserted). Returns a ``(optimizer, scheduler)`` tuple;
    ``scheduler`` is ``None`` unless the transformer-style warmup schedule
    (``args.transformer_lr``) is requested for Adam or SGD.
    """
    scheduler = None
    if args.optimizer == 'adam':
        if args.transformer_lr:
            # With the transformer schedule the LambdaLR multiplier drives the
            # effective LR, so the base lr is just the multiply factor.
            opt = torch.optim.Adam(model.params,
                                   lr=args.transformer_lr_multiply,
                                   betas=(0.9, 0.98),
                                   eps=1e-9,
                                   weight_decay=args.weight_decay)
            warmup_fn = partial(get_transformer_learning_rate,
                                dimension=args.dimension,
                                warmup=args.warmup)
            scheduler = torch.optim.lr_scheduler.LambdaLR(opt, warmup_fn)
        else:
            opt = torch.optim.Adam(model.params,
                                   lr=args.lr_rate,
                                   betas=(args.beta0, 0.999),
                                   weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        import radam
        if args.transformer_lr:
            logger.warning('--transformer_lr has no effect with RAdam optimizer, warmup is never applied')
        opt = radam.RAdam(model.params,
                          lr=args.lr_rate,
                          betas=(args.beta0, 0.999),
                          weight_decay=args.weight_decay)
    else:
        assert args.optimizer == 'sgd'
        if args.transformer_lr:
            opt = torch.optim.SGD(model.params,
                                  lr=args.transformer_lr_multiply,
                                  weight_decay=args.weight_decay)
            warmup_fn = partial(get_sgd_learning_rate, warmup=args.warmup)
            scheduler = torch.optim.lr_scheduler.LambdaLR(opt, warmup_fn)
        else:
            opt = torch.optim.SGD(model.params,
                                  lr=args.lr_rate,
                                  weight_decay=args.weight_decay)
    return opt, scheduler
def init_opt(args, model, logger):
    """Create the optimizer named by ``args.optimizer`` and the learning-rate
    scheduler named by ``args.lr_schedule``.

    Returns ``(optimizer, scheduler)``. Raises ``ValueError`` when
    ``args.lr_schedule`` is not one of ``transformer``, ``constant``,
    ``linear``, ``cosine`` or ``sgd``.
    """
    if args.optimizer == 'adam':
        # Adam with transformer schedule has a different set of default hyperparameters:
        if args.lr_schedule == 'transformer':
            opt = torch.optim.Adam(model.params, lr=args.lr_multiply,
                                   betas=(0.9, 0.98), eps=1e-9,
                                   weight_decay=args.weight_decay)
        else:
            opt = torch.optim.Adam(model.params, lr=args.lr_multiply,
                                   betas=(args.beta0, 0.999),
                                   weight_decay=args.weight_decay)
    elif args.optimizer == 'adamw':
        opt = AdamW(model.params, lr=args.lr_multiply,
                    weight_decay=args.weight_decay)
    elif args.optimizer == 'radam':
        import radam
        if args.warmup > 1:
            logger.warning('With RAdam optimizer, warmup is never applied')
        opt = radam.RAdam(model.params, lr=args.lr_multiply,
                          betas=(args.beta0, 0.999),
                          weight_decay=args.weight_decay)
    else:
        assert args.optimizer == 'sgd'
        opt = torch.optim.SGD(model.params, lr=args.lr_multiply,
                              weight_decay=args.weight_decay)

    schedule = args.lr_schedule
    if schedule == 'transformer':
        decay_fn = partial(get_transformer_learning_rate,
                           dimension=args.dimension, warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, decay_fn)
    elif schedule == 'constant':
        scheduler = get_constant_schedule_with_warmup(
            opt, num_warmup_steps=args.warmup)
    elif schedule == 'linear':
        # Total optimizer steps, accounting for gradient accumulation.
        total_steps = sum(args.train_iterations) // args.gradient_accumulation_steps
        scheduler = get_linear_schedule_with_warmup(
            opt, num_training_steps=total_steps,
            num_warmup_steps=args.warmup)
    elif schedule == 'cosine':
        total_steps = sum(args.train_iterations) // args.gradient_accumulation_steps
        scheduler = get_cosine_schedule_with_warmup(
            opt, num_training_steps=total_steps,
            num_warmup_steps=args.warmup, num_cycles=0.5)
    elif schedule == 'sgd':
        decay_fn = partial(get_sgd_learning_rate, warmup=args.warmup)
        scheduler = torch.optim.lr_scheduler.LambdaLR(opt, decay_fn)
    else:
        raise ValueError('Invalid learning rate scheduler.')
    return opt, scheduler
def optimizer(net, args):
    """Return the optimizer named by ``args.optimizer`` for ``net``.

    Supported values (case-insensitive): ``sgd``, ``adam``, ``radam``.

    Fixes over the previous revision: input validation used ``assert``,
    which is stripped under ``python -O`` (the function would then silently
    return ``None``); we now raise ``ValueError`` instead. ``.lower()`` is
    also computed once instead of per comparison.
    """
    name = args.optimizer.lower()
    if name == "sgd":
        # args.beta1 doubles as the SGD momentum in this config scheme.
        return optim.SGD(net.parameters(), lr=args.lr, momentum=args.beta1,
                         nesterov=args.nesterov)
    elif name == "adam":
        return optim.Adam(net.parameters(), lr=args.lr,
                          betas=(args.beta1, args.beta2))
    elif name == "radam":
        return radam.RAdam(net.parameters(), lr=args.lr,
                           betas=(args.beta1, args.beta2))
    raise ValueError("Invalid Optimizer")
def init_fn(self):
    """Build the cycle-consistency UNet, its RAdam optimizer, the photometric
    losses and the train/validation event datasets.

    Channel counts depend on ``self.options.model``: 'flow' maps the raw
    event volume to a 2-channel flow field; 'recons' maps a single summed
    event channel plus the previous image to an image.
    """
    if self.options.model == 'flow':
        # Event volume carries n_time_bins bins for each of the 2 polarities.
        num_input_channels = self.options.n_time_bins * 2
        num_output_channels = 2
    elif self.options.model == 'recons':
        # For the reconstruction model, we sum the event volume across the time dimension, so
        # that the network only sees a single channel event input, plus the prev image.
        num_input_channels = 1 + self.options.n_image_channels
        num_output_channels = self.options.n_image_channels
    else:
        # NOTE(review): the message mentions EventGAN, but this branch only
        # accepts 'flow'/'recons' — presumably EventGAN is handled by a
        # sibling class; confirm.
        raise ValueError(
            "Class was initialized with an invalid model {}"
            ", only {EventGAN, flow, recons} are supported.".format(
                self.options.model))
    self.cycle_unet = UNet(num_input_channels=num_input_channels,
                           num_output_channels=num_output_channels,
                           skip_type='concat',
                           activation='tanh',
                           num_encoders=4,
                           base_num_channels=32,
                           num_residual_blocks=2,
                           norm='BN',
                           use_upsample_conv=True,
                           multi=True)
    self.models_dict = {"model": self.cycle_unet}
    model_params = self.cycle_unet.parameters()
    # NOTE(review): options.lr_decay is passed as Adam beta1 here — confirm
    # this is intentional and not a mixed-up hyperparameter name.
    optimizer = radam.RAdam(list(model_params),
                            lr=self.options.lrc,
                            weight_decay=self.options.wd,
                            betas=(self.options.lr_decay, 0.999))
    # Combined photometric loss: L1 minus SSIM (higher SSIM -> lower loss).
    self.ssim = pytorch_ssim.SSIM()
    self.l1 = nn.L1Loss(reduction="mean")
    self.image_loss = lambda x, y: self.l1(x, y) - self.ssim(x, y)
    self.optimizers_dict = {"optimizer": optimizer}
    # Event datasets; sampler and None-safe collate are handed to the
    # dataloader constructor elsewhere via self.cdl_kwargs.
    self.train_ds, self.train_sampler = event_loader.get_and_concat_datasets(
        self.options.train_file, self.options, train=True)
    self.validation_ds, self.validation_sampler = event_loader.get_and_concat_datasets(
        self.options.validation_file, self.options, train=False)
    self.cdl_kwargs["collate_fn"] = event_utils.none_safe_collate
    self.cdl_kwargs["sampler"] = self.train_sampler
def configure_optimizers(self):
    """Build the optimizer/scheduler pair (Lightning-style hook).

    ``self._optimizer`` and ``self._scheduler`` may each be either a kwargs
    dict — dispatched to the default constructors (RAdam and ExponentialLR
    respectively) — or a callable factory taking the params / optimizer.
    """
    trainable = self.parameters()

    opt_spec = self._optimizer
    if isinstance(opt_spec, dict):
        chosen_opt = radam.RAdam(trainable, **opt_spec)
    else:
        chosen_opt = opt_spec(trainable)

    sched_spec = self._scheduler
    if isinstance(sched_spec, dict):
        chosen_sched = torch.optim.lr_scheduler.ExponentialLR(chosen_opt,
                                                              **sched_spec)
    else:
        chosen_sched = sched_spec(chosen_opt)

    return {'optimizer': chosen_opt, 'lr_scheduler': chosen_sched}
def main(cfg):
    """Train SPRINSeg part segmentation on the ShapeNet-part HDF5 data.

    ``cfg`` is a hydra config providing ``fps_n``, ``lr``, ``weight_decay``,
    ``npoints``, ``batch_size``, ``max_epoch`` and an optional
    ``resume_path`` checkpoint to warm-start from.
    """
    net = SPRINSeg(6, cfg.fps_n).cuda()
    # Optionally resume from a previously saved state dict.
    if len(cfg.resume_path) > 0:
        net.load_state_dict(
            torch.load(hydra.utils.to_absolute_path(cfg.resume_path)))
    opt = radam.RAdam(net.parameters(), cfg.lr,
                      weight_decay=cfg.weight_decay)
    # train+val shards are used for training; test shards for evaluation.
    pcs_train, segs_centered_train, segs_train = read_data(
        hydra.utils.to_absolute_path('shapenet_part_seg_hdf5_data'),
        r'ply_data_(train|val).*\.h5')
    pcs_test, segs_centered_test, segs_test = read_data(
        hydra.utils.to_absolute_path('shapenet_part_seg_hdf5_data'),
        r'ply_data_test.*\.h5')
    print(len(pcs_train))
    print(len(pcs_test))
    # NOTE: range(1, max_epoch) runs max_epoch - 1 training epochs.
    for e in range(1, cfg.max_epoch):
        run_epoch(net, pcs_train, segs_centered_train, segs_train, opt, e,
                  ds=cfg.npoints, batchsize=cfg.batch_size)
        if e % 10 == 0:
            # Evaluate on the test split (with random rotations) every
            # 10th epoch.
            run_epoch(net, pcs_test, segs_centered_test, segs_test, opt, e,
                      train=False, ds=cfg.npoints, batchsize=cfg.batch_size,
                      rand_rot=True)
        # Checkpoint after every epoch.
        torch.save(net.state_dict(), 'epoch{}.pt'.format(e))
def main():
    """Train a MuZero agent on Tic-Tac-Toe, checkpointing on exit.

    Ctrl-C interrupts training gracefully; the (possibly partially trained)
    model is then validated and saved regardless.
    """
    # Training hyperparameters.
    discount = 0.995
    unroll_steps = 5
    replay_buffer_size = 1000
    batch_size = 128
    env = TicTacToeEnv()
    agent = MuZeroAgent(discount=discount)
    replay = ReplayBuffer(replay_buffer_size, batch_size, unroll_steps)
    # optimizer = torch.optim.SGD(agent.network.parameters(), lr=1e-4, momentum=0.9, weight_decay=1e-6, nesterov=True)
    optimizer = radam.RAdam(agent.network.parameters(), lr=1e-2,
                            weight_decay=1e-6)
    # Resume from the previous checkpoint before continuing training.
    agent.load_model("muzero_model.pth")
    try:
        writer = SummaryWriter("./logs/MuZero")
        muzero(env, agent, replay, optimizer, writer)
    except KeyboardInterrupt:
        # Allow manual interruption; fall through to validation + save.
        print("Keyboard interrupt")
    print("Train complete")
    validate(env, agent, True)
    agent.save_model("muzero_model.pth")
# Reproducibility: fix all RNG seeds (numpy + CPU/GPU torch).
seed = 20170705
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Full-size Criteo training set; swap to the small files + debug=True for
# quick local runs.
train_file = "train_large.txt"
feature_sizes_file = "feature_sizes_large.txt"
debug = False
#train_file = "train.txt"
#feature_sizes_file = "feature_sizes.txt"
#debug = True
# load data
train_data = CriteoDataset('./data', train=True, train_file=train_file)
# split train and valid set
train_idx, valid_idx = split_train_and_valid(train_data, debug)
# Loaders share one dataset; the index split drives the two samplers.
loader_train = DataLoader(train_data,
                          batch_size=256,
                          sampler=sampler.SubsetRandomSampler(train_idx),
                          num_workers=0)
loader_val = DataLoader(train_data,
                        batch_size=1000,
                        sampler=sampler.SubsetRandomSampler(valid_idx),
                        num_workers=0)
# Per-field cardinalities for the embedding layers, one CSV row of ints.
feature_sizes = np.loadtxt('./data/{}'.format(feature_sizes_file),
                           delimiter=',')
feature_sizes = [int(x) for x in feature_sizes]
print(feature_sizes)
model = DeepFM(feature_sizes, use_cuda=True, overfitting=debug)
#optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
optimizer = radam.RAdam(model.parameters(), lr=1e-3, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=1000, verbose=True,
          print_every=1000, checkpoint_dir="./chkp")
def train_fold(fold_idx, work_dir, train_filenames, test_filenames,
               batch_sampler, epoch, epochs_to_train):
    """Train and validate one epoch of one CV fold of the SIIM segmentation
    model, with SWA bookkeeping and checkpointing.

    NOTE(review): the epoch loop below ends in an unconditional ``break``,
    so each call processes a single epoch despite ``epochs_to_train``; the
    caller appears to drive the epoch counter and receives state back
    through the return dict — confirm against the call site.
    """
    os.makedirs(work_dir, exist_ok=True)
    fold_logger = kfold.FoldLogger(work_dir)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = 'cpu'
    batch_size = 4
    # Alternative architectures kept for experimentation:
    # model = models.UNet(6, 1)
    # model = models.MyResNetModel()
    model = models.ResNetUNet(n_classes=1, upsample=True)
    # model = models.ResNetUNetPlusPlus(n_classes=1)
    # model = models.EfficientUNet(n_classes=1)
    # model = models.HRNetWithClassifier()
    model.to(device)
    model = torch.nn.DataParallel(model)
    # model.to(device)
    # Scale the effective batch size by the number of available GPUs.
    data_patallel_multiplier = max(1, torch.cuda.device_count())
    # data_patallel_multiplier = 1
    print('data_parallel_multiplier =', data_patallel_multiplier)
    img_size = 1024
    train_dataset = datareader.SIIMDataset('data/dicom-images-train',
                                           'data/train-rle.csv',
                                           ([img_size], [img_size]),
                                           augment=True,
                                           filenames_whitelist=train_filenames)
    # Disabled online-hard-example batch sampler experiment:
    # if batch_sampler is None:
    #     batch_sampler = samplers.OnlineHardBatchSampler(
    #         train_dataset, batch_size * data_patallel_multiplier,
    #         drop_last=False)
    # train_dataloader = torch.utils.data.DataLoader(
    #     train_dataset, num_workers=os.cpu_count(),
    #     batch_sampler=batch_sampler)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size * data_patallel_multiplier,
        shuffle=True, num_workers=os.cpu_count())
    val_dataset = datareader.SIIMDataset('data/dicom-images-train',
                                         'data/train-rle.csv',
                                         ([img_size], [img_size]),
                                         filenames_whitelist=test_filenames)
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size * data_patallel_multiplier,
        shuffle=False, num_workers=os.cpu_count())
    trainable_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    # Learning rate is scaled linearly with the total (multi-GPU) batch size.
    lr_scaling_coefficient = (1 / 16) * data_patallel_multiplier * batch_size / 10
    # max_lr = 2e-3 * lr_scaling_coefficient
    # base_lr = 5e-5 * lr_scaling_coefficient
    # OHEM Limited loss works with that divided by 10
    max_lr = 2.5e-4 * lr_scaling_coefficient
    base_lr = 3.5e-5 * lr_scaling_coefficient
    # Earlier optimizer experiments (Adam/AdamW/SGD variants) removed;
    # see version control history.
    optim = radam.RAdam(params=trainable_params, lr=base_lr,
                        weight_decay=0.0001)
    # Wrap in Stochastic Weight Averaging; snapshots are taken manually
    # below via optim.update_swa().
    optim = torchcontrib.optim.SWA(optim)
    best_metric = 0.0
    _, loaded_best_metric = utils.try_load_checkpoint(work_dir, model, device,
                                                      optimizer=optim,
                                                      load_optimizer=True)
    if loaded_best_metric is not None:
        best_metric = loaded_best_metric
    # Experiments show that it often is good to set stepsize equal to 2-10
    # times the number of iterations in an epoch; stepsize = 8 * epoch only
    # slightly beats stepsize = 2 * epoch on CIFAR-10.
    # (https://arxiv.org/pdf/1506.01186.pdf)
    # cycle_len = 4 == stepsize = 2 in my implementation
    epochs_per_cycle = 20
    lr_scheduler = lr_utils.CyclicalLR(max_lr=max_lr, base_lr=base_lr,
                                       steps_per_epoch=len(train_dataloader),
                                       epochs_per_cycle=epochs_per_cycle,
                                       mode='cosine')
    # Resume the cyclical schedule at the current global step.
    lr_scheduler.step_value = epoch * len(train_dataloader)
    steps_per_epoch = len(train_dataloader)
    # Disabled torch-native CyclicLR and apex.amp experiments removed.
    writer = SummaryWriter(work_dir)
    for i in range(epochs_to_train):
        train_result_dict = train_one_epoch(model=model, optimizer=optim,
                                            data_loader=train_dataloader,
                                            device=device, epoch=epoch,
                                            lr_scheduler=lr_scheduler,
                                            summary_writer=writer,
                                            print_freq=100)
        val_result_dict = validate.validate(model, val_dataloader, device)
        # val_result_dict entries are (threshold, score) pairs.
        mask_thresh, mask_score = val_result_dict['best_mask_score']
        class_thresh, class_score = val_result_dict['best_class_score']
        global_step = epoch * len(train_dataloader)
        writer.add_scalar('dice', mask_score, global_step=global_step)
        writer.add_scalar('classification_accuracy', class_score,
                          global_step=global_step)
        writer.add_scalar('mean_epoch_loss', train_result_dict['loss'],
                          global_step=global_step)
        writer.add_scalar('epoch', epoch, global_step=global_step)
        log_data = {
            'score': val_result_dict['best_mask_score'][1],
            'mask_threshold': val_result_dict['best_mask_score'][0],
            'class_accuracy': val_result_dict['best_class_score'][1],
            'class_thresold': val_result_dict['best_class_score'][0]
        }
        # Take an SWA snapshot at the end of each LR cycle.
        if (epoch + 1) % epochs_per_cycle == 0 and epoch != 0:
            print('Updating SWA running average')
            optim.update_swa()
        epoch += 1
        break  # NOTE(review): unconditional — only one epoch per call.
    fold_logger.log_epoch(epoch - 1, log_data)
    utils.save_checkpoint(output_dir=work_dir, epoch=epoch - 1, model=model,
                          optimizer=optim, best_metric=best_metric)
    # At a cycle boundary: swap in the SWA-averaged weights, refresh the
    # BatchNorm statistics, then validate and save the SWA model separately.
    if (epoch) % epochs_per_cycle == 0 and epoch != 0:
        optim.swap_swa_sgd()
        print('Swapped SWA buffers')
        print('Updating BatchNorm statistics...')
        optim.bn_update(
            utils.dataloader_image_extract_wrapper(train_dataloader), model,
            device)
        print('Updated BatchNorm statistics')
        print('Validating SWA model...')
        val_result_dict = validate.validate(model, val_dataloader, device)
        log_data = {
            'score': val_result_dict['best_mask_score'][1],
            'mask_threshold': val_result_dict['best_mask_score'][0],
            'class_accuracy': val_result_dict['best_class_score'][1],
            'class_thresold': val_result_dict['best_class_score'][0]
        }
        fold_logger.log_epoch('swa', log_data)
        print('Saved SWA model')
        utils.save_checkpoint(output_dir=work_dir, epoch=None, name='swa',
                              model=model, optimizer=optim,
                              best_metric=best_metric)
    return {
        'mask_score': mask_score,
        'class_score': class_score,
        'global_step': global_step,
        'batch_sampler': batch_sampler
    }
def init_fn(self):
    """Build the EventGAN generator/discriminator, their RAdam optimizers,
    the optional pretrained cycle-consistency networks, the photometric
    losses and the train/validation event datasets.

    When not training, only ``models_dict`` is populated.
    """
    # build model
    self.generator, self.discriminator = build_gan(self.options)
    self.models_dict = {"gen": self.generator, "dis": self.discriminator}
    if not self.is_training:
        # Inference-only: no optimizers or datasets needed.
        self.optimizers_dict = {}
        return
    # Optional frozen event-to-image network for the reconstruction
    # cycle-consistency loss. Loads the last checkpoint in the folder
    # (os.listdir order — presumably a single file; confirm).
    if self.options.cycle_recons:
        model_folder = "EventGAN/pretrained_models/{}".format(
            self.options.cycle_recons_model)
        checkpoint = os.path.join(model_folder,
                                  os.listdir(model_folder)[-1])
        self.cycle_unet_recons = torch.load(checkpoint)
        self.cycle_unet_recons.eval()
        self.models_dict["e2i"] = self.cycle_unet_recons
    # Optional frozen event-to-flow network for the flow cycle loss.
    if self.options.cycle_flow:
        model_folder = "EventGAN/pretrained_models/{}".format(
            self.options.cycle_flow_model)
        checkpoint = os.path.join(model_folder,
                                  os.listdir(model_folder)[-1])
        self.cycle_unet_flow = torch.load(checkpoint)
        self.cycle_unet_flow.eval()
        self.models_dict["e2f"] = self.cycle_unet_flow
    # params for each part of the network
    dis_params = filter(lambda p: p.requires_grad,
                        self.discriminator.parameters())
    gen_params = filter(lambda p: p.requires_grad,
                        self.generator.parameters())
    # NOTE(review): this reassignment makes the requires_grad filter above
    # dead — all generator params are optimized. Confirm which is intended.
    gen_params = self.generator.parameters()
    optimizer_dis = radam.RAdam(dis_params,
                                lr=self.options.lrd,
                                weight_decay=0.,
                                betas=(0., 0.999))
    optimizer_gen = radam.RAdam(list(gen_params),
                                lr=self.options.lrg,
                                weight_decay=0.,
                                betas=(0., 0.999))
    # Combined photometric loss: L1 minus SSIM (higher SSIM -> lower loss).
    self.ssim = pytorch_ssim.SSIM()
    self.secondary_l1 = nn.L1Loss(reduction="mean")
    self.image_loss = lambda x, y: self.secondary_l1(x, y) - self.ssim(
        x, y)
    self.optimizers_dict = {
        "optimizer_gen": optimizer_gen,
        "optimizer_dis": optimizer_dis
    }
    # Event datasets; sampler and None-safe collate are handed to the
    # dataloader constructor elsewhere via self.cdl_kwargs.
    self.train_ds, self.train_sampler = event_loader.get_and_concat_datasets(
        self.options.train_file, self.options, train=True)
    self.validation_ds, self.validation_sampler = event_loader.get_and_concat_datasets(
        self.options.validation_file, self.options, train=False)
    self.cdl_kwargs["collate_fn"] = event_utils.none_safe_collate
    self.cdl_kwargs["sampler"] = self.train_sampler
    # Caches of the previous step's losses/outputs (used across steps by
    # the training loop elsewhere).
    self.prev_gen_losses = {}
    self.prev_dis_losses = {}
    self.prev_gen_outputs = {}
    self.prev_dis_outputs = {}
import torch import torch.nn.functional as F from utils import prepare_cifar import tqdm import radam from vgg import vgg13_bn from models import PreActResNet18 from aegleseeker import AegleSeeker from eval_model import eval_model_pgd device = 'cuda:0' model = vgg13_bn() model = AegleSeeker(model).to(device) train_loader, test_loader = prepare_cifar(100, 100) optim = radam.RAdam(model.parameters()) epsilon = 8 / 255 for epoch in range(100): with tqdm.tqdm(train_loader) as train: running_loss = 0.0 running_grad = 0.0 running_acc = 0.0 model.train() for i, (x, y) in enumerate(train): x, y = x.to(device), y.to(device) # x_bu = x.detach().clone() for _ in range(1): x_rg = x.detach().clone().requires_grad_(True) + \ torch.randn_like(x) * epsilon / 2 optim.zero_grad() pred = model(x_rg)
testdataloader = torch.utils.data.DataLoader(testdataset, batch_size=4, shuffle=True) nb_classes = len(traindataset.classes) viz = visdom.Visdom() m = model.Model(nb_classes, 64) m = m.cuda() m.load_state_dict(torch.load('classifier_model.pt')) print(m) initial_learning_rate = 100 / sum(p.numel() for p in m.parameters() if p.requires_grad) print("Initail Learning rate", initial_learning_rate) optim = radam.RAdam(m.parameters(), lr=initial_learning_rate) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, 'min', factor=0.5, verbose=True) #optim.load_state_dict(torch.load('classifier_optim.pt')) trainlm = lossManager.LossManager(displayEvery=100, win="Train Losses") testlm = lossManager.LossManager(displayEvery=1, win="Test Losses") def train(m, optim, dataset): dataloader = torch.utils.data.DataLoader(nonechucks.SafeDataset(dataset), batch_size=16, shuffle=True, num_workers=16) m.train()
print("ADVERSARIAL") adv_optimizer = torch.optim.SGD(adv_hidden.parameters(), lr=args.adv_lr, weight_decay=args.adv_wdecay) #optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr, t0=0, lambd=0., weight_decay=args.wdecay) # Ensure the optimizer is optimizing params, which includes both the model's weights as well as the criterion's weight (i.e. Adaptive Softmax) if args.optimizer == 'sgd': optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay) elif args.optimizer == 'adam': optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay) elif args.optimizer == 'radam': optimizer = radam.RAdam(params, lr=args.lr, weight_decay=args.wdecay) else: raise Exception("Bad value %s for optimizer type" % args.optimizer) epoch_start_time = time.time() epoch = 0 val_loss2 = evaluate(val_data) print('-' * 89) print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format( epoch, (time.time() - epoch_start_time), val_loss2, math.exp(val_loss2), val_loss2 / math.log(2))) print('-' * 89) print("MAX EPOCH = ", args.epochs + 1) for epoch in range(args.start, args.epochs + 1):
def train_cv(input_directory, output_directory):
    """Run k-fold cross-validated training of the configured ECG model.

    For each fold this builds the model (with its final FC layer resized to
    ``config.num_classes``), the dataloaders, a RAdam optimizer and a
    ReduceLROnPlateau scheduler, trains up to ``config.max_epoch`` epochs
    with early stopping (12 epochs without ``val_cm`` improvement), and
    saves checkpoints via ``save_ckpt_cv``.

    Fixes over the previous revision:
      * ``scheduler.step(val_cm)`` was called but the scheduler's creation
        was commented out -> NameError on the first epoch.
      * ``epoch_cum`` was incremented before ever being initialized.
    """
    # Checkpoint directory, tagged with model name and timestamp.
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name + "_cv",
                                   time.strftime("%Y%m%d%H%M"))
    for fold in range(config.kfold):
        print("***************************fold : {}***********************".
              format(fold))
        model = getattr(models, config.model_name)(fold=fold)
        # Replace the final FC layer to match our number of classes.
        num_ftrs = model.fc.in_features
        model.fc = nn.Linear(num_ftrs, config.num_classes)
        model = model.to(device)
        # data (same CSV per fold; train flag selects the split)
        train_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                   data_dir=input_directory,
                                   train=True)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=config.batch_size,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=6)
        val_dataset = ECGDataset(data_path=config.train_data_cv.format(fold),
                                 data_dir=input_directory,
                                 train=False)
        val_dataloader = DataLoader(val_dataset,
                                    batch_size=config.batch_size,
                                    drop_last=True,
                                    num_workers=4)
        print("fold_{}_train_datasize".format(fold), len(train_dataset),
              "fold_{}_val_datasize".format(fold), len(val_dataset))
        # optimizer and loss
        optimizer = radam.RAdam(model.parameters(), lr=config.lr)
        # Class-frequency weights for the multilabel loss.
        w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
        criterion = utils.WeightedMultilabel(w)
        # Fix: the scheduler was referenced below but never created.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         'max',
                                                         verbose=True,
                                                         factor=0.1,
                                                         patience=5,
                                                         min_lr=1e-06,
                                                         eps=1e-08)
        best_f1 = -1
        best_cm = -1
        lr = config.lr
        start_epoch = 1
        stage = 1
        # Fix: epochs since the last val_cm improvement (early stopping);
        # was never initialized before use.
        epoch_cum = 0
        logger = Logger(logdir=model_save_dir, flush_secs=2)
        # ---- training loop ----
        for epoch in range(start_epoch, config.max_epoch + 1):
            since = time.time()
            train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
                model, optimizer, criterion, train_dataloader,
                show_interval=100)
            val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
                model, criterion, val_dataloader)
            print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n \
                val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
                  % (epoch, stage, train_loss, train_acc, train_f1, train_f2,
                     train_g2, train_cm, val_loss, val_acc, val_f1, val_f2,
                     val_g2, val_cm, utils.print_time_cost(since)))
            logger.log_value('fold{}_train_loss'.format(fold), train_loss,
                             step=epoch)
            logger.log_value('fold{}_train_f1'.format(fold), train_f1,
                             step=epoch)
            logger.log_value('fold{}_val_loss'.format(fold), val_loss,
                             step=epoch)
            logger.log_value('fold{}_val_f1'.format(fold), val_f1, step=epoch)
            state = {
                "state_dict": model.state_dict(),
                "epoch": epoch,
                "loss": val_loss,
                'f1': val_f1,
                'lr': lr,
                'stage': stage
            }
            # Save (flagging whether this epoch is a new val_cm best).
            save_ckpt_cv(state, best_cm < val_cm, model_save_dir, fold,
                         output_directory)
            best_cm = max(best_cm, val_cm)
            scheduler.step(val_cm)
            # Early-stopping counter: reset on improvement, stop after 12
            # stagnant epochs.
            if val_cm < best_cm:
                epoch_cum += 1
            else:
                epoch_cum = 0
            if epoch_cum >= 12:
                print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
                break
def train(input_directory, output_directory):
    """Train the configured ECG model on the full (non-CV) split.

    Builds the model (with its final FC layer resized to
    ``config.num_classes``), the dataloaders, a RAdam optimizer and a
    ReduceLROnPlateau scheduler, then trains up to ``config.max_epoch``
    epochs with early stopping (12 epochs without ``val_cm`` improvement),
    checkpointing via ``save_ckpt``.

    Fix over the previous revision: ``epoch_cum`` was incremented before
    ever being initialized.
    """
    # model
    model = getattr(models, config.model_name)()
    # Replace the final FC layer to match our number of classes.
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, config.num_classes)
    model = model.to(device)
    # data
    train_dataset = ECGDataset(data_path=config.train_data,
                               data_dir=input_directory,
                               train=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  num_workers=6)
    val_dataset = ECGDataset(data_path=config.train_data,
                             data_dir=input_directory,
                             train=False)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=config.batch_size,
                                num_workers=4)
    print("train_datasize", len(train_dataset), "val_datasize",
          len(val_dataset))
    # optimizer and loss
    optimizer = radam.RAdam(model.parameters(), lr=config.lr,
                            weight_decay=1e-4)
    # Class-frequency weights for the multilabel loss.
    w = torch.tensor(train_dataset.wc, dtype=torch.float).to(device)
    criterion = utils.WeightedMultilabel(w)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     'max',
                                                     verbose=True,
                                                     factor=0.1,
                                                     patience=5,
                                                     min_lr=1e-06,
                                                     eps=1e-08)
    # Checkpoint directory, tagged with model name and timestamp.
    model_save_dir = '%s/%s_%s' % (config.ckpt, config.model_name,
                                   time.strftime("%Y%m%d%H%M"))
    best_f1 = -1
    best_cm = -1
    lr = config.lr
    start_epoch = 1
    stage = 1
    # Fix: epochs since the last val_cm improvement (early stopping);
    # was never initialized before use.
    epoch_cum = 0
    logger = Logger(logdir=model_save_dir, flush_secs=2)
    # ---- training loop ----
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_acc, train_f1, train_f2, train_g2, train_cm = train_epoch(
            model, optimizer, criterion, train_dataloader, show_interval=100)
        val_loss, val_acc, val_f1, val_f2, val_g2, val_cm = val_epoch(
            model, criterion, val_dataloader)
        print('#epoch:%02d, stage:%d, train_loss:%.3e, train_acc:%.3f, train_f1:%.3f, train_f2:%.3f, train_g2:%.3f,train_cm:%.3f,\n \
                val_loss:%0.3e, val_acc:%.3f, val_f1:%.3f, val_f2:%.3f, val_g2:%.3f, val_cm:%.3f,time:%s\n'
              % (epoch, stage, train_loss, train_acc, train_f1, train_f2,
                 train_g2, train_cm, val_loss, val_acc, val_f1, val_f2,
                 val_g2, val_cm, utils.print_time_cost(since)))
        logger.log_value('train_loss', train_loss, step=epoch)
        logger.log_value('train_f1', train_f1, step=epoch)
        logger.log_value('val_loss', val_loss, step=epoch)
        logger.log_value('val_f1', val_f1, step=epoch)
        state = {
            "state_dict": model.state_dict(),
            "epoch": epoch,
            "loss": val_loss,
            'f1': val_f1,
            'lr': lr,
            'stage': stage
        }
        # Save (flagging whether this epoch is a new val_cm best).
        save_ckpt(state, best_cm < val_cm, model_save_dir, output_directory)
        best_cm = max(best_cm, val_cm)
        scheduler.step(val_cm)
        # Early-stopping counter: reset on improvement, stop after 12
        # stagnant epochs.
        if val_cm < best_cm:
            epoch_cum += 1
        else:
            epoch_cum = 0
        if epoch_cum >= 12:
            print("*" * 20, "step into stage%02d lr %.3ef" % (stage, lr))
            break
ngpu = torch.cuda.device_count() device_ids = list(range(ngpu)) model = torch.nn.DataParallel(model, device_ids) model.cuda() else: model.to(DEVICE) model.apply(init_weight) model.train() max_lr = 1e-3 warmup_step = hp.warmup_step warmup_factor = hp.warmup_factor if hp.optimizer.lower() == 'radam': import radam optimizer = radam.RAdam(model.parameters(), lr=max_lr, betas=(0.9, 0.98), eps=1e-9) else: optimizer = torch.optim.Adam(model.parameters(), lr=max_lr, betas=(0.9, 0.98), eps=1e-9) save_dir = hp.save_dir # save dir name os.makedirs(save_dir, exist_ok=True) if hp_file != f'{save_dir}/hparams.py' and not filecmp.cmp(hp_file, f'{save_dir}/hparams.py'): shutil.copyfile(hp_file, f'{save_dir}/hparams.py') writer = SummaryWriter(f'{hp.log_dir}/logs/{hp.comment}') if hp.output_type == 'softmax': dataset_train = datasets.VQWav2vecTrainDatasets(hp.train_script) collate_fn_transformer = datasets.collate_fn_vqwav2vec else: dataset_train = datasets.get_dataset(hp.train_script) collate_fn_transformer = datasets.collate_fn