def build_optimizer(self):
    self.logger.write("Building optimizer")
    optimizer_name = self.config.training.optimizer
    if optimizer_name == "adam":
        self.optimizer = Adam(self.model.parameters(), lr=self.lr)
    elif optimizer_name == "adadelta":
        self.optimizer = Adadelta(self.model.parameters(), lr=self.lr)
    else:
        raise ValueError("{} optimizer is not supported.".format(optimizer_name))
def compile(self, optimizer='adam', initial_lr=0.002):
    if optimizer.lower() == 'adam':
        self.optimizer = Adam(self.parameters(), lr=initial_lr)
    elif optimizer.lower() == 'adadelta':
        self.optimizer = Adadelta(self.parameters(), lr=initial_lr, rho=0.95, eps=1e-08)
    else:
        raise NotImplementedError("The '{}' optimizer has not been implemented.".format(optimizer))
def _set_optimizer(self, lr, opt_conf):
    """Instantiate an optimizer according to ``self._optimizer``."""
    # ``adam``, ``sgd``, ``rmsprop``, ``adadelta`` and ``adagrad`` are assumed to be
    # module-level collections of accepted name aliases for each optimizer.
    if self._optimizer in adam:
        return Adam([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in sgd:
        return SGD([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in rmsprop:
        return RMSprop([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in adadelta:
        return Adadelta([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    elif self._optimizer in adagrad:
        return Adagrad([{'params': self.model.parameters()}], lr=lr, **opt_conf)
    else:
        raise ValueError(f'optimizer={self._optimizer} is not supported')
def build_optimizer(cfg, model):
    # params = [p for p in model.parameters() if p.requires_grad]
    # Build per-parameter groups: bias parameters may use different options
    # (and an extra learning-rate multiplier) than weight parameters.
    _params = []  # filter(lambda p: p.requires_grad, model.parameters())
    for n, p in dict(model.named_parameters()).items():
        if p.requires_grad:
            _args = deepcopy(cfg.OPTIMIZER.BIAS_PARAMS if "bias" in n else cfg.OPTIMIZER.WEIGHT_PARAMS)
            _args.pop("data")
            _params += [{"params": [p], "lr": cfg.INIT_LR, **_args}]
            if "bias" in n:
                _params[-1]["lr"] *= cfg.OPTIMIZER.BIAS_LR_MULTIPLIER or 1.0

    opt_type = cfg.OPTIMIZER.TYPE.lower()
    if opt_type == "sgd":
        # torch.optim.SGD(params, lr=0.001, momentum=0, dampening=0, weight_decay=0, nesterov=False)
        optimizer = SGD(_params)
    elif opt_type == "adam":
        # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
        optimizer = Adam(_params)
    elif opt_type == "adamw":
        # torch.optim.AdamW(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
        optimizer = AdamW(_params)
    elif opt_type == "adadelta":
        # torch.optim.Adadelta(params, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)
        optimizer = Adadelta(_params)
    elif opt_type == 'rmsprop':
        # torch.optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
        optimizer = RMSprop(_params)
    else:
        raise ValueError(
            "Unsupported optimizer type: {}. Expected one of [SGD, Adam, AdamW, Adadelta, RMSprop]".format(
                cfg.OPTIMIZER.TYPE))
    return optimizer
def build(self, params):
    from torch.optim import Adadelta

    return Adadelta(
        params,
        lr=self.lr,
        rho=self.rho,
        eps=self.eps,
        weight_decay=self.weight_decay,
    )
def init_optimizer(self):
    if self.args.optimizer.lower() == 'adam':
        self.optimizer = Adam(self.parameters(), lr=self.args.lr, weight_decay=1e-3)
    elif self.args.optimizer.lower() == 'sgd':
        self.optimizer = SGD(self.parameters(), lr=self.args.lr, weight_decay=0.99999)
    elif self.args.optimizer.lower() == 'adad':
        self.optimizer = Adadelta(self.parameters(), lr=self.args.lr)
    else:
        raise ValueError('No such optimizer implemented.')
def adadelta(parameters):
    # Fill in Adadelta defaults for any hyperparameters that were not specified.
    if "rho" not in parameters["optimizer"]:
        parameters["optimizer"]["rho"] = 0.9
    if "eps" not in parameters["optimizer"]:
        parameters["optimizer"]["eps"] = 1e-6
    if "weight_decay" not in parameters["optimizer"]:
        parameters["optimizer"]["weight_decay"] = 0
    return Adadelta(
        parameters["model_parameters"],
        lr=parameters["learning_rate"],
        rho=parameters["optimizer"]["rho"],
        eps=parameters["optimizer"]["eps"],
        weight_decay=parameters["optimizer"]["weight_decay"],
    )
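# A minimal, hypothetical usage sketch for the helper above. `tiny_net` and the
# `parameters` dict are illustration-only; an empty "optimizer" sub-dict makes the
# helper fall back to rho=0.9, eps=1e-6, weight_decay=0.
import torch.nn as nn
from torch.optim import Adadelta

tiny_net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
parameters = {
    "model_parameters": tiny_net.parameters(),
    "learning_rate": 1.0,
    "optimizer": {},
}
optimizer = adadelta(parameters)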
def get_optimizer(net):
    if args.optimizer == 'sgd':
        optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=args.weight_decay)
    elif args.optimizer == 'nesterov':
        optimizer = SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                        weight_decay=args.weight_decay, nesterov=True)
    elif args.optimizer == 'adagrad':
        optimizer = Adagrad(net.parameters(), weight_decay=args.weight_decay)
    elif args.optimizer == 'adadelta':
        optimizer = Adadelta(net.parameters(), weight_decay=args.weight_decay)
    elif args.optimizer == 'adam':
        optimizer = Adam(net.parameters(), weight_decay=args.weight_decay)
    else:
        raise Exception('Invalid optimizer specified.')
    return optimizer
def main(cfg):  # DIFF
    print(cfg.pretty())
    use_cuda = not cfg.no_cuda and torch.cuda.is_available()  # DIFF
    torch.manual_seed(cfg.seed)  # DIFF
    device = torch.device("cuda" if use_cuda else "cpu")

    train_kwargs = {"batch_size": cfg.batch_size}  # DIFF
    test_kwargs = {"batch_size": cfg.test_batch_size}  # DIFF
    if use_cuda:
        cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = Adadelta(
        lr=cfg.adadelta.lr,
        rho=cfg.adadelta.rho,
        eps=cfg.adadelta.eps,
        weight_decay=cfg.adadelta.weight_decay,
        params=model.parameters(),
    )  # DIFF
    scheduler = StepLR(
        step_size=cfg.steplr.step_size,
        gamma=cfg.steplr.gamma,
        last_epoch=cfg.steplr.last_epoch,
        optimizer=optimizer,
    )  # DIFF

    for epoch in range(1, cfg.epochs + 1):  # DIFF
        train(cfg, model, device, train_loader, optimizer, epoch)  # DIFF
        test(model, device, test_loader)
        scheduler.step()

    if cfg.save_model:  # DIFF
        torch.save(model.state_dict(), cfg.checkpoint_name)  # DIFF
def get_optimizer(params, cfg):
    if cfg.optimizer == 'SGD':
        return SGD(params, lr=cfg.lr, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == 'Adadelta':
        return Adadelta(params, lr=cfg.lr, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == 'Adagrad':
        return Adagrad(params, lr=cfg.lr, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == 'Adam':
        return Adam(params, lr=cfg.lr, weight_decay=cfg.weight_decay)
    elif cfg.optimizer == 'RMSprop':
        return RMSprop(params, lr=cfg.lr, momentum=cfg.momentum, weight_decay=cfg.weight_decay)
    else:
        raise Exception('Unknown optimizer : {}'.format(cfg.optimizer))
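# Hypothetical usage of get_optimizer above. The SimpleNamespace stands in for
# whatever config object the surrounding project uses, and the field values are
# illustrative only; the optimizer classes are assumed to be imported from torch.optim.
from types import SimpleNamespace
import torch.nn as nn

cfg = SimpleNamespace(optimizer='Adadelta', lr=1.0, momentum=0.9, weight_decay=0.0)
net = nn.Linear(10, 2)
opt = get_optimizer(net.parameters(), cfg)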
def generate_optimizer(opt, lr, params):
    # `betas`, `momentum`, `rho`, `eps` and `gamma` (used as weight decay) are
    # assumed to be defined at module level alongside this helper.
    if opt == 'adam':
        return Adam(params, lr=lr, betas=betas, weight_decay=gamma, eps=eps)
    elif opt == 'sgd':
        return SGD(params, lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    elif opt == 'adadelta':
        return Adadelta(params, lr=lr, rho=rho, weight_decay=gamma, eps=eps)
    else:
        raise ValueError('Unknown optimization algorithm: %s' % opt)
def DICTAVAILOPTIMIZERS(option, model_params, lr):
    list_optimizer_avail = ['SGD', 'SGDmom', 'Adagrad', 'RMSprop', 'Adadelta', 'Adam']
    if option == 'SGD':
        return SGD(model_params, lr=lr)
    elif option == 'SGDmom':
        return SGD(model_params, lr=lr, momentum=0.9)
    elif option == 'Adagrad':
        return Adagrad(model_params, lr=lr)
    elif option == 'RMSprop':
        return RMSprop(model_params, lr=lr)
    elif option == 'Adadelta':
        return Adadelta(model_params, lr=lr)
    elif option == 'Adam':
        return Adam(model_params, lr=lr)
    else:
        message = 'Optimizer chosen not found. Optimizers available: (%s)' % (
            ', '.join(list_optimizer_avail))
        CatchErrorException(message)
        return NotImplemented
def initialize_optimizer(params, cfg):
    """
    Create an optimizer for the given params based on the given cfg.

    :param params: The parameters of the model we optimize.
    :param cfg: The config from which we configure the optimizer.
    :returns: An optimizer for given `params` based on the `cfg`.
    """
    optimizer = cfg.optimizer.lower()
    assert optimizer in ["adam", "adadelta", "adamax", "rmsprop", "adagrad"]
    if optimizer == "adam":
        return Adam(params, lr=cfg.learning_rate)
    if optimizer == "adadelta":
        return Adadelta(params, lr=cfg.learning_rate)
    if optimizer == "adamax":
        return Adamax(params, lr=cfg.learning_rate)
    if optimizer == "rmsprop":
        return RMSprop(params, lr=cfg.learning_rate)
    if optimizer == "adagrad":
        return Adagrad(params, lr=cfg.learning_rate,
                       initial_accumulator_value=cfg.adagrad_init_acc)
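# Illustration only: the torch.optim imports this helper relies on, plus a one-line
# smoke test. `cfg` is a stand-in config; `adagrad_init_acc` is only read on the
# adagrad branch.
from types import SimpleNamespace
from torch.optim import Adadelta, Adagrad, Adam, Adamax, RMSprop
import torch.nn as nn

model = nn.Linear(8, 3)
cfg = SimpleNamespace(optimizer="adadelta", learning_rate=1.0, adagrad_init_acc=0.1)
opt = initialize_optimizer(model.parameters(), cfg)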
def train(train_loader, validation_loader, net, embeddings):
    """
    Trains the network with a given training data loader and validation data loader.
    """
    # `batch_size`, `params_file` and `logger` are assumed to be defined at module level.
    optimizer = Adadelta(net.parameters())
    evaluate(validation_loader, net, 'validation', log=False)
    prev_best_acc = 0
    for epoch in range(10):
        print('Epoch:', epoch)
        net.train()
        avg_loss = 0
        avg_acc = 0
        for i, (vectors, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            logits = net(vectors)
            loss = F.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()
            corrects = float((torch.max(logits, 1)[1].view(targets.size()).data == targets.data).sum())
            accuracy = 100.0 * corrects / batch_size
            avg_loss += float(loss)
            avg_acc += accuracy
        avg_loss /= i + 1
        avg_acc /= i + 1
        logger('training', 'loss', avg_loss)
        logger('training', 'accuracy', avg_acc)
        acc = evaluate(validation_loader, net, 'validation')
        if acc > prev_best_acc:
            torch.save(net.state_dict(), params_file)
            prev_best_acc = acc
def main(args): # first, we define some pre-processing data_transforms = transforms.Compose([ # extra augmentations # transforms.ColorJitter(brightness=0.3), # necessary transformations transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ]) # build our data loaders train_dataset = MNIST('../data', train=True, download=True, transform=data_transforms) test_dataset = MNIST('../data', train=False, transform=data_transforms) train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size) test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size) # create the model model = Net() # build an optimizer for optimizing the parameters of our model optimizer = Adadelta(model.parameters(), lr=args.lr) # if we want to use cuda, we have to copy all parameters to the GPU model.to(args.device) # build object that handles updating routine updater = MNISTUpdater( iterators={'images': train_dataloader}, networks={'net': model}, optimizers={'main': optimizer}, device=args.device, copy_to_device=True, ) # build the trainer trainer = Trainer( updater, stop_trigger=get_trigger((args.epochs, 'epoch')) ) # prepare logging logger = TensorboardLogger( args.log_dir, args, {}, Path(__file__).resolve().parent, trigger=get_trigger((100, 'iteration')) ) # make sure we are evaluating trainer.extend(Evaluator( test_dataloader, logger, MNISTEvaluator(model), args.device, )) # make sure we are saving the trained models to disk, including the optimizer. This allows us to resume training. snapshotter = Snapshotter( { 'network': model, 'optimizer': optimizer }, args.log_dir ) trainer.extend(snapshotter) # add learning rate scheduling, in this case Cosine Annealing schedulers = { "encoder": CosineAnnealingLR(optimizer, trainer.num_epochs * trainer.iterations_per_epoch, eta_min=1e-8) } lr_scheduler = LRScheduler(schedulers, trigger=get_trigger((1, 'iteration'))) trainer.extend(lr_scheduler) trainer.extend(logger) trainer.train()
def main(args): # ================================================ # Preparation # ================================================ if not torch.cuda.is_available(): raise Exception('At least one gpu must be available.') gpu = torch.device('cuda:0') # create result directory (if necessary) if not os.path.exists(args.result_dir): os.makedirs(args.result_dir) for phase in ['phase_1', 'phase_2', 'phase_3']: if not os.path.exists(os.path.join(args.result_dir, phase)): os.makedirs(os.path.join(args.result_dir, phase)) # load dataset trnsfm = transforms.Compose([ transforms.Resize(args.cn_input_size), transforms.RandomCrop((args.cn_input_size, args.cn_input_size)), transforms.ToTensor(), ]) print('loading dataset... (it may take a few minutes)') train_dset = ImageDataset(os.path.join(args.data_dir, 'train'), trnsfm, recursive_search=args.recursive_search) test_dset = ImageDataset(os.path.join(args.data_dir, 'test'), trnsfm, recursive_search=args.recursive_search) train_loader = DataLoader(train_dset, batch_size=(args.bsize // args.bdivs), shuffle=True) # compute mpv (mean pixel value) of training dataset if args.mpv is None: mpv = np.zeros(shape=(1, )) pbar = tqdm(total=len(train_dset.imgpaths), desc='computing mean pixel value of training dataset...') for imgpath in train_dset.imgpaths: img = Image.open(imgpath) x = np.array(img) / 255. mpv += x.mean(axis=(0, 1)) pbar.update() mpv /= len(train_dset.imgpaths) pbar.close() else: mpv = np.array(args.mpv) # save training config mpv_json = [] for i in range(1): mpv_json.append(float(mpv[i])) args_dict = vars(args) # args_dict['mpv'] = mpv_json with open(os.path.join(args.result_dir, 'config.json'), mode='w') as f: json.dump(args_dict, f) # make mpv & alpha tensors mpv = torch.tensor(mpv.reshape(1, 1, 1, 1), dtype=torch.float32).to(gpu) alpha = torch.tensor(args.alpha, dtype=torch.float32).to(gpu) # ================================================ # Training Phase 1 # ================================================ # load completion network model_cn = CompletionNetwork() if args.init_model_cn is not None: model_cn.load_state_dict( torch.load(args.init_model_cn, map_location='cpu')) if args.data_parallel: model_cn = DataParallel(model_cn) model_cn = model_cn.to(gpu) opt_cn = Adadelta(model_cn.parameters()) # training cnt_bdivs = 0 pbar = tqdm(total=args.steps_1) while pbar.n < args.steps_1: for x in train_loader: # forward x = x.to(gpu) mask = gen_input_mask(shape=(x.shape[0], 1, x.shape[2], x.shape[3]), ).to(gpu) x_mask = x - x * mask + mpv * mask input = torch.cat((x_mask, mask), dim=1) output = model_cn(input) loss = completion_network_loss(x, output, mask) # backward loss.backward() cnt_bdivs += 1 if cnt_bdivs >= args.bdivs: cnt_bdivs = 0 # optimize opt_cn.step() opt_cn.zero_grad() pbar.set_description('phase 1 | train loss: %.5f' % loss.cpu()) pbar.update() # test if pbar.n % args.snaperiod_1 == 0: model_cn.eval() with torch.no_grad(): x = sample_random_batch( test_dset, batch_size=args.num_test_completions).to(gpu) mask = gen_input_mask(shape=(x.shape[0], 1, x.shape[2], x.shape[3]), ).to(gpu) x_mask = x - x * mask + mpv * mask input = torch.cat((x_mask, mask), dim=1) output = model_cn(input) completed = rejoiner(x_mask, output, mask) imgs = torch.cat( (x.cpu(), x_mask.cpu(), completed.cpu()), dim=0) imgpath = os.path.join(args.result_dir, 'phase_1', 'step%d.png' % pbar.n) model_cn_path = os.path.join( args.result_dir, 'phase_1', 'model_cn_step%d' % pbar.n) save_image(imgs, imgpath, nrow=len(x)) if args.data_parallel: 
torch.save(model_cn.module.state_dict(), model_cn_path) else: torch.save(model_cn.state_dict(), model_cn_path) model_cn.train() if pbar.n >= args.steps_1: break pbar.close() # ================================================ # Training Phase 2 # ================================================ # load context discriminator model_cd = ContextDiscriminator( local_input_shape=(1, args.ld_input_size, args.ld_input_size), global_input_shape=(1, args.cn_input_size, args.cn_input_size), ) if args.init_model_cd is not None: model_cd.load_state_dict( torch.load(args.init_model_cd, map_location='cpu')) if args.data_parallel: model_cd = DataParallel(model_cd) model_cd = model_cd.to(gpu) opt_cd = Adadelta(model_cd.parameters(), lr=0.1) bceloss = BCELoss() # training cnt_bdivs = 0 pbar = tqdm(total=args.steps_2) while pbar.n < args.steps_2: for x in train_loader: # fake forward x = x.to(gpu) hole_area_fake = gen_hole_area( (args.ld_input_size, args.ld_input_size), (x.shape[3], x.shape[2])) mask = gen_input_mask(shape=(x.shape[0], 1, x.shape[2], x.shape[3]), ).to(gpu) fake = torch.zeros((len(x), 1)).to(gpu) x_mask = x - x * mask + mpv * mask input_cn = torch.cat((x_mask, mask), dim=1) output_cn = model_cn(input_cn) input_gd_fake = output_cn.detach() input_ld_fake = crop(input_gd_fake, hole_area_fake) output_fake = model_cd( (input_ld_fake.to(gpu), input_gd_fake.to(gpu))) loss_fake = bceloss(output_fake, fake) # real forward hole_area_real = gen_hole_area( (args.ld_input_size, args.ld_input_size), (x.shape[3], x.shape[2])) real = torch.ones((len(x), 1)).to(gpu) input_gd_real = x input_ld_real = crop(input_gd_real, hole_area_real) output_real = model_cd((input_ld_real, input_gd_real)) loss_real = bceloss(output_real, real) # reduce loss = (loss_fake + loss_real) / 2. 
# backward loss.backward() cnt_bdivs += 1 if cnt_bdivs >= args.bdivs: cnt_bdivs = 0 # optimize opt_cd.step() opt_cd.zero_grad() pbar.set_description('phase 2 | train loss: %.5f' % loss.cpu()) pbar.update() # test if pbar.n % args.snaperiod_2 == 0: model_cn.eval() with torch.no_grad(): x = sample_random_batch( test_dset, batch_size=args.num_test_completions).to(gpu) mask = gen_input_mask(shape=(x.shape[0], 1, x.shape[2], x.shape[3]), ).to(gpu) x_mask = x - x * mask + mpv * mask input = torch.cat((x_mask, mask), dim=1) output = model_cn(input) completed = rejoiner(x_mask, output, mask) imgs = torch.cat( (x.cpu(), x_mask.cpu(), completed.cpu()), dim=0) imgpath = os.path.join(args.result_dir, 'phase_2', 'step%d.png' % pbar.n) model_cd_path = os.path.join( args.result_dir, 'phase_2', 'model_cd_step%d' % pbar.n) save_image(imgs, imgpath, nrow=len(x)) if args.data_parallel: torch.save(model_cd.module.state_dict(), model_cd_path) else: torch.save(model_cd.state_dict(), model_cd_path) model_cn.train() if pbar.n >= args.steps_2: break pbar.close() # ================================================ # Training Phase 3 # ================================================ cnt_bdivs = 0 pbar = tqdm(total=args.steps_3) while pbar.n < args.steps_3: for x in train_loader: # forward model_cd x = x.to(gpu) hole_area_fake = gen_hole_area( (args.ld_input_size, args.ld_input_size), (x.shape[3], x.shape[2])) mask = gen_input_mask(shape=(x.shape[0], 1, x.shape[2], x.shape[3]), ).to(gpu) # fake forward fake = torch.zeros((len(x), 1)).to(gpu) x_mask = x - x * mask + mpv * mask input_cn = torch.cat((x_mask, mask), dim=1) output_cn = model_cn(input_cn) input_gd_fake = output_cn.detach() input_ld_fake = crop(input_gd_fake, hole_area_fake) output_fake = model_cd((input_ld_fake, input_gd_fake)) loss_cd_fake = bceloss(output_fake, fake) # real forward hole_area_real = gen_hole_area( (args.ld_input_size, args.ld_input_size), (x.shape[3], x.shape[2])) real = torch.ones((len(x), 1)).to(gpu) input_gd_real = x input_ld_real = crop(input_gd_real, hole_area_real) output_real = model_cd((input_ld_real, input_gd_real)) loss_cd_real = bceloss(output_real, real) # reduce loss_cd = (loss_cd_fake + loss_cd_real) * alpha / 2. # backward model_cd loss_cd.backward() cnt_bdivs += 1 if cnt_bdivs >= args.bdivs: # optimize opt_cd.step() opt_cd.zero_grad() # forward model_cn loss_cn_1 = completion_network_loss(x, output_cn, mask) input_gd_fake = output_cn input_ld_fake = crop(input_gd_fake, hole_area_fake) output_fake = model_cd((input_ld_fake, (input_gd_fake))) loss_cn_2 = bceloss(output_fake, real) # reduce loss_cn = (loss_cn_1 + alpha * loss_cn_2) / 2. 
# backward model_cn loss_cn.backward() if cnt_bdivs >= args.bdivs: cnt_bdivs = 0 # optimize opt_cn.step() opt_cn.zero_grad() pbar.set_description( 'phase 3 | train loss (cd): %.5f (cn): %.5f' % (loss_cd.cpu(), loss_cn.cpu())) pbar.update() # test if pbar.n % args.snaperiod_3 == 0: model_cn.eval() with torch.no_grad(): x = sample_random_batch( test_dset, batch_size=args.num_test_completions).to(gpu) mask = gen_input_mask(shape=(x.shape[0], 1, x.shape[2], x.shape[3]), ).to(gpu) x_mask = x - x * mask + mpv * mask input = torch.cat((x_mask, mask), dim=1) output = model_cn(input) completed = rejoiner(x_mask, output, mask) imgs = torch.cat( (x.cpu(), x_mask.cpu(), completed.cpu()), dim=0) imgpath = os.path.join(args.result_dir, 'phase_3', 'step%d.png' % pbar.n) model_cn_path = os.path.join( args.result_dir, 'phase_3', 'model_cn_step%d' % pbar.n) model_cd_path = os.path.join( args.result_dir, 'phase_3', 'model_cd_step%d' % pbar.n) save_image(imgs, imgpath, nrow=len(x)) if args.data_parallel: torch.save(model_cn.module.state_dict(), model_cn_path) torch.save(model_cd.module.state_dict(), model_cd_path) else: torch.save(model_cn.state_dict(), model_cn_path) torch.save(model_cd.state_dict(), model_cd_path) model_cn.train() if pbar.n >= args.steps_3: break pbar.close()
def main(rank, args): # Distributed setup if args.distributed: setup_distributed(rank, args.world_size) not_main_rank = args.distributed and rank != 0 logging.info("Start time: %s", datetime.now()) # Explicitly set seed to make sure models created in separate processes # start from same random weights and biases torch.manual_seed(args.seed) # Empty CUDA cache torch.cuda.empty_cache() # Change backend for flac files torchaudio.set_audio_backend("soundfile") # Transforms melkwargs = { "n_fft": args.win_length, "n_mels": args.n_bins, "hop_length": args.hop_length, } sample_rate_original = 16000 if args.type == "mfcc": transforms = torch.nn.Sequential( torchaudio.transforms.MFCC( sample_rate=sample_rate_original, n_mfcc=args.n_bins, melkwargs=melkwargs, ), ) num_features = args.n_bins elif args.type == "waveform": transforms = torch.nn.Sequential(UnsqueezeFirst()) num_features = 1 else: raise ValueError("Model type not supported") if args.normalize: transforms = torch.nn.Sequential(transforms, Normalize()) augmentations = torch.nn.Sequential() if args.freq_mask: augmentations = torch.nn.Sequential( augmentations, torchaudio.transforms.FrequencyMasking( freq_mask_param=args.freq_mask), ) if args.time_mask: augmentations = torch.nn.Sequential( augmentations, torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask), ) # Text preprocessing char_blank = "*" char_space = " " char_apostrophe = "'" labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) # Dataset training, validation = split_process_librispeech( [args.dataset_train, args.dataset_valid], [transforms, transforms], language_model, root=args.dataset_root, folder_in_archive=args.dataset_folder_in_archive, ) # Decoder if args.decoder == "greedy": decoder = GreedyDecoder() else: raise ValueError("Selected decoder not supported") # Model model = Wav2Letter( num_classes=language_model.length, input_type=args.type, num_features=num_features, ) if args.jit: model = torch.jit.script(model) if args.distributed: n = torch.cuda.device_count() // args.world_size devices = list(range(rank * n, (rank + 1) * n)) model = model.to(devices[0]) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices) else: devices = ["cuda" if torch.cuda.is_available() else "cpu"] model = model.to(devices[0], non_blocking=True) model = torch.nn.DataParallel(model) n = count_parameters(model) logging.info("Number of parameters: %s", n) # Optimizer if args.optimizer == "adadelta": optimizer = Adadelta( model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay, eps=args.eps, rho=args.rho, ) elif args.optimizer == "sgd": optimizer = SGD( model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay, ) elif args.optimizer == "adam": optimizer = Adam( model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay, ) elif args.optimizer == "adamw": optimizer = AdamW( model.parameters(), lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay, ) else: raise ValueError("Selected optimizer not supported") if args.scheduler == "exponential": scheduler = ExponentialLR(optimizer, gamma=args.gamma) elif args.scheduler == "reduceonplateau": scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3) else: raise ValueError("Selected scheduler not supported") criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) # Data Loader 
collate_fn_train = collate_factory(model_length_function, augmentations) collate_fn_valid = collate_factory(model_length_function) loader_training_params = { "num_workers": args.workers, "pin_memory": True, "shuffle": True, "drop_last": True, } loader_validation_params = loader_training_params.copy() loader_validation_params["shuffle"] = False loader_training = DataLoader( training, batch_size=args.batch_size, collate_fn=collate_fn_train, **loader_training_params, ) loader_validation = DataLoader( validation, batch_size=args.batch_size, collate_fn=collate_fn_valid, **loader_validation_params, ) # Setup checkpoint best_loss = 1.0 load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint) if args.distributed: torch.distributed.barrier() if load_checkpoint: logging.info("Checkpoint: loading %s", args.checkpoint) checkpoint = torch.load(args.checkpoint) args.start_epoch = checkpoint["epoch"] best_loss = checkpoint["best_loss"] model.load_state_dict(checkpoint["state_dict"]) optimizer.load_state_dict(checkpoint["optimizer"]) scheduler.load_state_dict(checkpoint["scheduler"]) logging.info("Checkpoint: loaded '%s' at epoch %s", args.checkpoint, checkpoint["epoch"]) else: logging.info("Checkpoint: not found") save_checkpoint( { "epoch": args.start_epoch, "state_dict": model.state_dict(), "best_loss": best_loss, "optimizer": optimizer.state_dict(), "scheduler": scheduler.state_dict(), }, False, args.checkpoint, not_main_rank, ) if args.distributed: torch.distributed.barrier() torch.autograd.set_detect_anomaly(False) for epoch in range(args.start_epoch, args.epochs): logging.info("Epoch: %s", epoch) train_one_epoch( model, criterion, optimizer, scheduler, loader_training, decoder, language_model, devices[0], epoch, args.clip_grad, not_main_rank, not args.reduce_lr_valid, ) loss = evaluate( model, criterion, loader_validation, decoder, language_model, devices[0], epoch, not_main_rank, ) if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau): scheduler.step(loss) is_best = loss < best_loss best_loss = min(loss, best_loss) save_checkpoint( { "epoch": epoch + 1, "state_dict": model.state_dict(), "best_loss": best_loss, "optimizer": optimizer.state_dict(), "scheduler": scheduler.state_dict(), }, is_best, args.checkpoint, not_main_rank, ) logging.info("End time: %s", datetime.now()) if args.distributed: torch.distributed.destroy_process_group()
def train_model(args): # Read and process data train, dev, test, batch_size, test_batch_size, train_ques_to_para,\ dev_ques_to_para, test_ques_to_para, train_tokenized_paras,\ dev_tokenized_paras, test_tokenized_paras, train_order, dev_order, test_order,\ train_data, dev_data, test_data, train_tokenized_paras_chars,\ dev_tokenized_paras_chars, test_tokenized_paras_chars = read_and_process_data(args) # Build model model, config = build_model(args, train_data.dictionary.size(), train_data.dictionary.index_to_word, train_data.dictionary.word_to_index, train_data.dictionary.char_to_index, train_data.dictionary.index_to_char) if not os.path.exists(args.model_dir): os.mkdir(args.model_dir) #------------------------------ Train System ----------------------------------# # Should we resume running from an existing checkpoint? last_done_epoch = config['ckpt'] if last_done_epoch > 0: model = model.load(args.model_dir, last_done_epoch) print "Loaded model." if not args.disable_glove: print "Embedding shape:", model.embedding.shape start_time = time.time() print "Starting training." # Decide which optimizer to use. if args.optimizer == "SGD": print "Using SGD optimizer." optimizer = SGD(model.parameters(), lr = args.learning_rate) elif args.optimizer == "Adamax": print "Using Adamax optimizer." optimizer = Adamax(model.parameters(), lr= args.learning_rate) elif args.optimizer == "Adadelta": print "Using Adadelta optimizer." optimizer = Adadelta(model.parameters(), lr=args.learning_rate, rho=0.95) else: assert False, "Unrecognized optimizer." if last_done_epoch > 0: if os.path.exists(args.model_dir + "/optim_%d.pt" % last_done_epoch): optimizer = torch.load(args.model_dir + "/optim_%d.pt" % last_done_epoch) else: print "Optimizer saved state not found. Not loading optimizer." # Model summary. print(model) for EPOCH in range(last_done_epoch+1, args.epochs): start_t = time.time() train_loss_sum = 0.0 model.train() for i, num in enumerate(train_order): print "\rTrain epoch %d, %.2f s - (Done %d of %d)" %\ (EPOCH, (time.time()-start_t)*(len(train_order)-i-1)/(i+1), i+1, len(train_order)), # Create next batch by getting lengths and padding train_batch = train[num:num+batch_size] passage_input_f, passage_input_b, question_input_f, question_input_b,\ passage_input_lens, question_input_lens, passage_input_chars_f,\ passage_input_chars_b, question_input_chars_f, question_input_chars_b,\ passage_input_chars_lens, question_input_chars_lens, answer_input =\ get_minibatch_input(train_batch, train_tokenized_paras, train_tokenized_paras_chars, train_ques_to_para) # Zero previous gradient. model.zero_grad() model((passage_input_chars_f, passage_input_chars_lens),\ (passage_input_chars_b, passage_input_chars_lens),\ (question_input_chars_f, question_input_chars_lens),\ (question_input_chars_b, question_input_chars_lens),\ (passage_input_f, passage_input_lens),\ (passage_input_b, passage_input_lens),\ (question_input_f, question_input_lens),\ (question_input_b, question_input_lens),\ answer_input) model.loss.backward() optimizer.step() train_loss_sum += model.loss.data[0] model.free_memory() print "Loss: %.5f (in time %.2fs)" % \ (train_loss_sum/(i+1), time.time() - start_t), sys.stdout.flush() print "\nLoss: %.5f (in time %.2fs)" % \ (train_loss_sum/len(train_order), time.time() - start_t) # End of epoch. 
random.shuffle(train_order) model.zero_grad() model.save(args.model_dir, EPOCH) # Updating LR for optimizer for param in optimizer.param_groups: param['lr'] *= config['decay'] torch.save(optimizer, args.model_dir + "/optim_%d.pt" % EPOCH) # Run pass over dev data. dev_start_t = time.time() dev_loss_sum = 0.0 all_predictions = {} print "\nRunning on Dev." model.eval() for i, num in enumerate(dev_order): print "\rDev: %.2f s (Done %d of %d)" %\ ((time.time()-dev_start_t)*(len(dev_order)-i-1)/(i+1), i+1, len(dev_order)), dev_batch = dev[num:num+test_batch_size] passage_input_f, passage_input_b, question_input_f, question_input_b,\ passage_input_lens, question_input_lens, passage_input_chars_f,\ passage_input_chars_b, question_input_chars_f, question_input_chars_b,\ passage_input_chars_lens, question_input_chars_lens, answer_input =\ get_minibatch_input(dev_batch, dev_tokenized_paras, dev_tokenized_paras_chars, dev_ques_to_para) # distributions[{0,1}].shape = (batch, max_passage_len) distributions = \ model((passage_input_chars_f, passage_input_chars_lens),\ (passage_input_chars_b, passage_input_chars_lens),\ (question_input_chars_f, question_input_chars_lens),\ (question_input_chars_b, question_input_chars_lens),\ (passage_input_f, passage_input_lens),\ (passage_input_b, passage_input_lens),\ (question_input_f, question_input_lens),\ (question_input_b, question_input_lens),\ answer_input) distributions[0] = distributions[0].data.cpu().numpy() distributions[1] = distributions[1].data.cpu().numpy() # Add all batch qids to predictions dict, if they don't already exist. qids = [ example[2] for example in dev_batch ] for qid in qids: if not qid in all_predictions: all_predictions[qid] = [] best_idxs = [] for idx in range(len(dev_batch)): best_prob = -1 best = [0, 0] max_end = passage_input_lens[idx] for j, start_prob in enumerate(distributions[0][idx][:max_end]): cur_end_idx = min(j + args.max_answer_span, max_end) end_idx = np.argmax(distributions[1][idx][j:cur_end_idx]) prob = distributions[1][idx][j+end_idx] * start_prob if prob > best_prob: best_prob = prob best = [j, j+end_idx] best_idxs.append(best) tokenized_paras = dev_data.tokenized_paras answers = [ tokenized_paras[dev_ques_to_para[qids[idx]]][start:end+1] \ for idx, (start, end) in enumerate(best_idxs) ] answers = [ " ".join([ dev_data.dictionary.get_word(idx) for idx in ans ]) \ for ans in answers ] for qid, answer in zip(qids, answers): all_predictions[qid] = answer dev_loss_sum += model.loss.data[0] model.free_memory() print "[Average loss : %.5f]" % (dev_loss_sum/(i+1)), sys.stdout.flush() # Print dev stats for epoch print "\nDev Loss: %.4f (in time: %.2f s)" %\ (dev_loss_sum/len(dev_order), (time.time() - dev_start_t)) # Dump the results json in the required format print "Dumping prediction results." json.dump( all_predictions, open(args.model_dir + "/dev_predictions_" + str(EPOCH) + ".json", "w")) print "Done."
# Create a loader for the training set
train_loader = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=4)

# Load the test set, note that train is set to False
test_set = CIFAR10(root="./data", train=False, transform=transformations, download=True)

# Create a loader for the test set, note that shuffle is set to False for the test loader
test_loader = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=4)

# Create model, optimizer and loss function
# model = AlexNet(num_classes=10)
model = SqueezeNet(version=1.1, num_classes=10)
# model = SqueezeNet(version=1.2, num_classes=10)
# model = SqueezeNetPReLu(version=1.1, num_classes=10)
# model = SqueezeNetPReLu(version=1.2, num_classes=10)
if torch.cuda.is_available():
    model.cuda()

optimizer = Adadelta(model.parameters())
loss_fn = nn.CrossEntropyLoss()

# Print the number of model parameters
print(count_parameters(model))

# Train the model for 100 epochs
train(100)
def train_predict(batch_size=100, epochs=10, topk=30, L2=1e-8): patients = getTrainData(4000000) # patients × visits × medical_code patients_num = len(patients) train_patient_num = int(patients_num * 0.8) patients_train = patients[0:train_patient_num] test_patient_num = patients_num - train_patient_num patients_test = patients[train_patient_num:] train_batch_num = int(np.ceil(float(train_patient_num) / batch_size)) test_batch_num = int(np.ceil(float(test_patient_num) / batch_size)) model = Dipole(input_dim=3393, day_dim=200, rnn_hiddendim=300, output_dim=283) params = list(model.parameters()) k = 0 for i in params: l = 1 print("该层的结构:" + str(list(i.size()))) for j in i.size(): l *= j print("该层参数和:" + str(l)) k = k + l print("总参数数量和:" + str(k)) optimizer = Adadelta(model.parameters(), lr=1, weight_decay=L2) loss_mce = nn.BCELoss(reduction='sum') model = model.cuda(device=1) for epoch in range(epochs): starttime = time.time() # 训练 model.train() all_loss = 0.0 for batch_index in range(train_batch_num): patients_batch = patients_train[batch_index * batch_size:(batch_index + 1) * batch_size] patients_batch_reshape, patients_lengths = model.padTrainMatrix( patients_batch) # maxlen × n_samples × inputDimSize batch_x = patients_batch_reshape[0:-1] # 获取前n-1个作为x,来预测后n-1天的值 # batch_y = patients_batch_reshape[1:] batch_y = patients_batch_reshape[1:, :, :283] # 取出药物作为y optimizer.zero_grad() # h0 = model.initHidden(batch_x.shape[1]) batch_x = torch.tensor(batch_x, device=torch.device('cuda:1')) batch_y = torch.tensor(batch_y, device=torch.device('cuda:1')) y_hat = model(batch_x) mask = out_mask2(y_hat, patients_lengths) # 生成mask,用于将padding的部分输出置0 # 通过mask,将对应序列长度外的网络输出置0 y_hat = y_hat.mul(mask) batch_y = batch_y.mul(mask) # (seq_len, batch_size, out_dim)->(seq_len*batch_size*out_dim, 1)->(seq_len*batch_size*out_dim, ) y_hat = y_hat.view(-1, 1).squeeze() batch_y = batch_y.view(-1, 1).squeeze() loss = loss_mce(y_hat, batch_y) loss.backward() optimizer.step() all_loss += loss.item() print("Train:Epoch-" + str(epoch) + ":" + str(all_loss) + " Train Time:" + str(time.time() - starttime)) # 测试 model.eval() NDCG = 0.0 RECALL = 0.0 DAYNUM = 0.0 all_loss = 0.0 gbert_pred = [] gbert_true = [] gbert_len = [] for batch_index in range(test_batch_num): patients_batch = patients_test[batch_index * batch_size:(batch_index + 1) * batch_size] patients_batch_reshape, patients_lengths = model.padTrainMatrix( patients_batch) batch_x = patients_batch_reshape[0:-1] batch_y = patients_batch_reshape[1:, :, :283] batch_x = torch.tensor(batch_x, device=torch.device('cuda:1')) batch_y = torch.tensor(batch_y, device=torch.device('cuda:1')) y_hat = model(batch_x) mask = out_mask2(y_hat, patients_lengths) loss = loss_mce(y_hat.mul(mask), batch_y.mul(mask)) all_loss += loss.item() y_hat = y_hat.detach().cpu().numpy() ndcg, recall, daynum = validation(y_hat, patients_batch, patients_lengths, topk) NDCG += ndcg RECALL += recall DAYNUM += daynum gbert_pred.append(y_hat) gbert_true.append(batch_y.cpu()) gbert_len.append(patients_lengths) avg_NDCG = NDCG / DAYNUM avg_RECALL = RECALL / DAYNUM y_pred_all, y_true_all = batch_squeeze(gbert_pred, gbert_true, gbert_len) acc_container = metric_report(y_pred_all, y_true_all, 0.2) print("Test:Epoch-" + str(epoch) + " Loss:" + str(all_loss) + " Test Time:" + str(time.time() - starttime)) print("Test:Epoch-" + str(epoch) + " NDCG:" + str(avg_NDCG) + " RECALL:" + str(avg_RECALL)) print("Test:Epoch-" + str(epoch) + " Jaccard:" + str(acc_container['jaccard']) + " f1:" + str(acc_container['f1']) + " 
prauc:" + str(acc_container['prauc']) + " roauc:" + str(acc_container['auc'])) print("")
def train(training_data_file, valid_data_file, super_batch_size, tokenizer, mode, kw, p_key, model1, device, model2, model3, \ batch_size, num_epoch, gradient_accumulation_steps, lr1, lr2, lambda_, valid_critic, early_stop): '''Train three models Train models through bundles Args: training_data_file (list) : training data json file, raw json file used to load data super_batch_size (int) : how many samples will be loaded into memory at once tokenizer : SentencePiece tokenizer used to obtain the token ids mode (str): mode of the passage format, coule be a list (processed) or a long string (unprocessed). kw (str) : the key word map to the passage in each data dictionary. Defaults to 'abstract' p_key (str) : the key word to search for specific passage. Default to 'title' model1 (nn.DataParallel) : local dependency encoder device (torch.device): The device which models and data are on. model2 (nn.Module): global coherence encoder model3 (nn.Module): attention decoder batch_size (int): Defaults to 4. num_epoch (int): Defaults to 1. gradient_accumulation_steps (int): Defaults to 1. lr (float): Defaults to 1e-4. The Start learning rate. lambda_ (float): Defaults to 0.01. Balance factor for param nomalization. valid_critic (bool) : what critic to use when early stop evaluation. Default to 5 early_stop (int) : set the early stop boundary. Default to 5 ''' # Prepare optimizer for Sys1 param_optimizer_bert = list(model1.named_parameters()) param_optimizer_others = list(model2.named_parameters()) + list( model3.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # We tend to fix the embedding. Temeporarily we doesn't find the embedding layer optimizer_grouped_parameters_bert = [{ 'params': [ p for n, p in param_optimizer_bert if not any(nd in n for nd in no_decay) ], 'weight_decay': lambda_ }, { 'params': [ p for n, p in param_optimizer_bert if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer_grouped_parameters_others = [{ 'params': [ p for n, p in param_optimizer_others if not any(nd in n for nd in no_decay) ], 'weight_decay': lambda_ }, { 'params': [ p for n, p in param_optimizer_others if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] # We shall adda module to count the num of parameters here critic = nn.NLLLoss(reduction='none') line_num = int(os.popen("wc -l " + training_data_file).read().split()[0]) global_step = 0 # global step opt1 = BertAdam(optimizer_grouped_parameters_bert, lr=lr1, warmup=0.1, t_total=line_num / batch_size * num_epoch) # optimizer 1 # opt = Adam(optimizer_grouped_parameter, lr=lr) opt2 = Adadelta(optimizer_grouped_parameters_others, lr=lr2, rho=0.95) model1.to(device) # model1.train() # model2.to(device) # model2.train() # model3.to(device) # model3.train() # warmed = True for epoch in trange(num_epoch, desc='Epoch'): smooth_mean = WindowMean() opt1.zero_grad() opt2.zero_grad() for superbatch, line_num in load_superbatch(training_data_file, super_batch_size): bundles = [] for data in superbatch: try: bundles.append( convert_passage_to_samples_bundle( tokenizer, data, mode, kw, p_key)) except: print_exc() num_batch, dataloader = homebrew_data_loader(bundles, batch_size=batch_size) tqdm_obj = tqdm(dataloader, total=num_batch) num_steps = line_num # for step, batch in enumerate(tqdm_obj): try: #batch[0] = batch[0].to(device) #batch[1] = batch[1].to(device) #batch[2] = batch[2].to(device) batch = tuple(t for t in batch) log_prob_loss, pointers_output, ground_truth = calculate_loss( batch, model1, model2, model3, device, 
critic) # here we need to add code to cal rouge-w and acc rouge_ws = [] accs = [] ken_taus = [] pmrs = [] for pred, true in zip(pointers_output, ground_truth): rouge_ws.append(rouge_w(pred, true)) accs.append(acc(pred, true)) ken_taus.append(kendall_tau(pred, true)) pmrs.append(pmr(pred, true)) log_prob_loss.backward() # ******** In the following code we gonna edit it and made early stop ************ if (step + 1) % gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses. From BERT pytorch examples lr_this_step = lr1 * warmup_linear( global_step / num_steps, warmup=0.1) for param_group in opt1.param_groups: param_group['lr'] = lr_this_step global_step += 1 opt2.step() opt2.zero_grad() smooth_mean_loss = smooth_mean.update( log_prob_loss.item()) tqdm_obj.set_description( '{}: {:.4f}, {}: {:.4f}, smooth_mean_loss: {:.4f}'. format('accuracy', np.mean(accs), 'rough-w', np.mean(rouge_ws), smooth_mean_loss)) # During warming period, model1 is frozen and model2 is trained to normal weights if smooth_mean_loss < 1.0 and step > 100: # ugly manual hyperparam warmed = True if warmed: opt1.step() opt1.zero_grad() if step % 1000 == 0: output_model_file = './models/bert-base-cased.bin.tmp' saved_dict = { 'params1': model1.module.state_dict() } saved_dict['params2'] = model2.state_dict() saved_dict['params3'] = model3.state_dict() torch.save(saved_dict, output_model_file) except Exception as err: traceback.print_exc() exit() # if mode == 'list': # print(batch._id) if epoch < 5: best_score = 0 continue with torch.no_grad(): print('valid..............') valid_critic_dict = { 'rouge-w': rouge_w, 'acc': acc, 'ken-tau': kendall_tau, 'pmr': pmr } for superbatch, _ in load_superbatch(valid_data_file, super_batch_size): bundles = [] for data in superbatch: try: bundles.append( convert_passage_to_samples_bundle( tokenizer, data, mode, kw, p_key)) except: print_exc() num_batch, valid_dataloader = homebrew_data_loader( bundles, batch_size=1) valid_value = [] for step, batch in enumerate(valid_dataloader): try: batch = tuple(t for idx, t in enumerate(batch)) pointers_output, ground_truth \ = dev_test(batch, model1, model2, model3, device) valid_value.append(valid_critic_dict[valid_critic]( pointers_output, ground_truth)) except Exception as err: traceback.print_exc() # if mode == 'list': # print(batch._id) score = np.mean(valid_value) print('epc:{}, {} : {:.2f} best : {:.2f}\n'.format( epoch, valid_critic, score, best_score)) if score > best_score: best_score = score best_iter = epoch print('Saving model to {}'.format( output_model_file)) # save model structure saved_dict = { 'params1': model1.module.state_dict() } # save parameters saved_dict['params2'] = model2.state_dict() # save parameters saved_dict['params3'] = model3.state_dict() torch.save(saved_dict, output_model_file) # # print('save best model at epc={}'.format(epc)) # checkpoint = {'model': model.state_dict(), # 'args': args, # 'loss': best_score} # torch.save(checkpoint, '{}/{}.best.pt'.format(args.model_path, args.model)) if early_stop and (epoch - best_iter) >= early_stop: print('early stop at epc {}'.format(epoch)) break
        self.map3 = Linear(hidden_size, output_size)

    def forward(self, x):
        x = leaky_relu(self.map1(x), 0.1)
        x = leaky_relu(self.map2(x), 0.1)
        return sigmoid(self.map3(x))


generator = SimpleMLP(input_size=z_dim, hidden_size=50, output_size=DIMENSION)
discriminator = SimpleMLP(input_size=DIMENSION, hidden_size=100, output_size=1)
if GPU_NUMS > 0:
    generator.cuda()
    discriminator.cuda()
criterion = BCELoss()
d_optimizer = Adadelta(discriminator.parameters(), lr=1)
g_optimizer = Adadelta(generator.parameters(), lr=1)

progBar = ProgressBar(1, iterations, "D Loss:(real/fake) %.3f/%.3f,G Loss:%.3f")
for train_iter in range(1, iterations + 1):
    for d_index in range(3):
        # 1. Train D on real+fake
        discriminator.zero_grad()

        # 1A: Train D on real
        real_samples = sample_2d(lut_2d, bs)
        d_real_data = Variable(torch.Tensor(real_samples))
        if GPU_NUMS > 0:
            d_real_data = d_real_data.cuda()
        d_real_decision = discriminator(d_real_data)
        labels = Variable(torch.ones(bs))
def main(): args = read_args(default_config="confs/kim_cnn_sst2.json") set_seed(args.seed) try: os.makedirs(args.workspace) except: pass torch.cuda.deterministic = True dataset_cls = find_dataset(args.dataset_name) training_iter, dev_iter, test_iter = dataset_cls.iters(args.dataset_path, args.vectors_file, args.vectors_dir, batch_size=args.batch_size, device=args.device, train=args.train_file, dev=args.dev_file, test=args.test_file) args.dataset = training_iter.dataset args.words_num = len(training_iter.dataset.TEXT_FIELD.vocab) model = mod.SiameseRNNModel(args).to(args.device) ckpt_attrs = mod.load_checkpoint(model, args.workspace, best=args.load_best_checkpoint) if args.load_last_checkpoint or args.load_best_checkpoint else {} offset = ckpt_attrs.get("epoch_idx", -1) + 1 args.epochs -= offset training_pbar = tqdm(total=len(training_iter), position=2) training_pbar.set_description("Training") dev_pbar = tqdm(total=args.epochs, position=1) dev_pbar.set_description("Dev") criterion = nn.CrossEntropyLoss() kd_criterion = nn.KLDivLoss(reduction="batchmean") params = list(filter(lambda x: x.requires_grad, model.parameters())) optimizer = Adadelta(params, lr=args.lr, rho=0.95) increment_fn = mod.make_checkpoint_incrementer(model, args.workspace, save_last=True, best_loss=ckpt_attrs.get("best_dev_loss", 10000)) non_embedding_params = model.non_embedding_params() if args.use_data_parallel: model = nn.DataParallel(model) if args.eval_test_only: test_acc, _ = evaluate(model, test_iter, criterion, export_eval_labels=args.export_eval_labels) print(test_acc) return if args.epochs == 0: print("No epochs left from loaded model.", file=sys.stderr) return for epoch_idx in tqdm(range(args.epochs), position=0): training_iter.init_epoch() model.train() training_pbar.n = 0 training_pbar.refresh() for batch in training_iter: training_pbar.update(1) optimizer.zero_grad() logits = model(batch.sentence1, batch.sentence2) kd_logits = torch.stack((batch.logits_0, batch.logits_1, batch.logits_2), 1) kd = args.distill_lambda * kd_criterion(F.log_softmax(logits / args.distill_temperature, 1), F.softmax(kd_logits / args.distill_temperature, 1)) loss = args.ce_lambda * criterion(logits, batch.gold_label) + kd loss.backward() clip_grad_norm_(non_embedding_params, args.clip_grad) optimizer.step() acc = ((logits.max(1)[1] == batch.gold_label).float().sum() / batch.gold_label.size(0)).item() training_pbar.set_postfix(accuracy=f"{acc:.2}") model.eval() dev_acc, dev_loss = evaluate(model, dev_iter, criterion) dev_pbar.update(1) dev_pbar.set_postfix(accuracy=f"{dev_acc:.4}") is_best_dev = increment_fn(dev_loss, dev_acc=dev_acc, epoch_idx=epoch_idx + offset) if is_best_dev: dev_pbar.set_postfix(accuracy=f"{dev_acc:.4} (best loss)") test_acc, _ = evaluate(model, test_iter, criterion, export_eval_labels=args.export_eval_labels) training_pbar.close() dev_pbar.close() print(f"Test accuracy of the best model: {test_acc:.4f}", file=sys.stderr) print(test_acc)
model = MwanModel( num_class=len(data_info.vocabs[Const.TARGET]), EmbLayer=StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=False, normalize=False), ElmoLayer=None, args_of_imm={ "input_size": 300, "hidden_size": arg.hidden_size, "dropout": arg.dropout, "use_allennlp": False, }, ) optimizer = Adadelta(lr=arg.lr, params=model.parameters()) scheduler = StepLR(optimizer, step_size=10, gamma=0.5) callbacks = [ LRScheduler(scheduler), ] if arg.task in ['snli']: callbacks.append( FitlogCallback(data_info.datasets[arg.testset_name], verbose=1)) elif arg.task == 'mnli': callbacks.append( FitlogCallback( { 'dev_matched': data_info.datasets['dev_matched'], 'dev_mismatched': data_info.datasets['dev_mismatched']
#lesion_model = UNet3D_2(input_size=len(options['input_data']), output_size=2) lesion_model.cuda() input_tensor = torch.rand(5, 3, 2, 160, 200).cuda() pred = lesion_model(input_tensor) options['model_name'] = lesion_model.__class__.__name__ model_name = 'ms_lesion_segmentation' # define the torch.device device = torch.device('cuda') if options['gpu_use'] else torch.device('cpu') # define the optimizer if options['optimizer'] == "adam": optimizer = Adam(lesion_model.parameters()) elif options['optimizer'] == "adadelta": optimizer = Adadelta(lesion_model.parameters()) # send the model to the device lesion_model = lesion_model.to(device) early_stopping = EarlyStopping(patience=options['patience'], verbose=True) train_losses = [] val_losses = [] train_jaccs = [] val_jaccs = [] # training loop training = True train_complete = False epoch = 1
                            args.get('min_length'), args.get('max_length'))
val_dataloader = TextDataLoader(dataset=val_dataset, dictionary=dictionary,
                                batch_size=args.get('batch_size'),
                                shuffle=not args.get('sort_dataset'),
                                num_workers=args.get('num_workers'))
# test_dataset = TextDataset(test_data, dictionary, args.get('sort_dataset'), args.get('min_length'), args.get('max_length'))
# test_dataloader = TextDataLoader(dataset=test_dataset, dictionary=dictionary, batch_size=args.get('batch_size'), shuffle = not args.get('sort_dataset'))

logger.info("Training...")

trainable_params = [p for p in model.parameters() if p.requires_grad]
if args.get('optimizer') == 'Adam':
    optimizer = Adam(model.parameters(), lr=args.get('initial_lr'))
elif args.get('optimizer') == 'Adadelta':
    # note: 0.95 is passed as weight_decay here; Adadelta's rho keeps its default of 0.9
    optimizer = Adadelta(params=trainable_params, lr=args.get('initial_lr'), weight_decay=0.95)
else:
    raise NotImplementedError()

lr_plateau = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.2, patience=5)
criterion = nn.CrossEntropyLoss()  # instantiate the loss (was previously passed as a class)
trainer = Trainer(model, train_dataloader, val_dataloader, criterion=criterion,
                  optimizer=optimizer, lr_schedule=args.get('lr_schedule'),
                  lr_scheduler=lr_plateau, use_gpu=args.get('use_gpu'), logger=logger)
def main(): args = read_args(default_config="confs/kim_cnn_sst2.json") set_seed(args.seed) try: os.makedirs(args.workspace) except: pass torch.cuda.deterministic = True dataset_cls = find_dataset(args.dataset_name) training_iter, dev_iter, test_iter = dataset_cls.iters( args.dataset_path, args.vectors_file, args.vectors_dir, batch_size=args.batch_size, device=args.device, train=args.train_file, dev=args.dev_file, test=args.test_file) args.dataset = training_iter.dataset args.words_num = len(training_iter.dataset.TEXT_FIELD.vocab) model = mod.SiameseRNNModel(args).to(args.device) sd = torch.load('sst.pt')['state_dict'] del sd['static_embed.weight'] del sd['non_static_embed.weight'] del sd['fc1.weight'] del sd['fc1.bias'] del sd['fc2.weight'] del sd['fc2.bias'] model.load_state_dict(sd, strict=False) mod.init_embedding(model, args) # embs, field_src = torch.load('embs_tmp.pt') # field_mappings = list_field_mappings(dataset_cls.TEXT_FIELD, field_src) # replace_embeds(model.non_static_embed, embs, field_mappings) model.to(args.device) ckpt_attrs = mod.load_checkpoint( model, args.workspace, best=args.load_best_checkpoint ) if args.load_last_checkpoint or args.load_best_checkpoint else {} torch.save((model.non_static_embed, dataset_cls.TEXT_FIELD.vocab), 'qqp-embs.pt') return offset = ckpt_attrs.get("epoch_idx", -1) + 1 args.epochs -= offset training_pbar = tqdm(total=len(training_iter), position=2) training_pbar.set_description("Training") dev_pbar = tqdm(total=args.epochs, position=1) dev_pbar.set_description("Dev") criterion = nn.CrossEntropyLoss() kd_criterion = nn.MSELoss() # KLDivLoss(reduction="batchmean") filter_params = [(n, p) for n, p in model.named_parameters() if p.requires_grad and 'fc' in n] params = list(map(lambda x: x[1], filter_params)) # print([x[0] for x in filter_params]) optimizer = Adadelta(params, lr=args.lr, rho=0.95) #optimizer = Adam(params, lr=args.lr) increment_fn = mod.make_checkpoint_incrementer(model, args.workspace, save_last=True, best_loss=ckpt_attrs.get( "best_dev_loss", 10000)) non_embedding_params = model.non_embedding_params() if args.use_data_parallel: model = nn.DataParallel(model) if args.eval_test_only: test_acc, _ = evaluate(model, test_iter, criterion, export_eval_labels=args.export_eval_labels) print(test_acc) return if args.epochs == 0: print("No epochs left from loaded model.", file=sys.stderr) return for epoch_idx in tqdm(range(args.epochs), position=0): training_iter.init_epoch() model.train() training_pbar.n = 0 training_pbar.refresh() for batch in training_iter: training_pbar.update(1) optimizer.zero_grad() logits = model(batch.question1, batch.question2) # kd_logits = torch.stack((batch.logits_0, batch.logits_1), 1) #kd = args.distill_lambda * kd_criterion(F.log_softmax(logits / args.distill_temperature, 1), # F.softmax(kd_logits / args.distill_temperature, 1)) # kd = args.distill_lambda * kd_criterion(logits, kd_logits) loss = criterion(logits, batch.is_duplicate) loss.backward() clip_grad_norm_(non_embedding_params, args.clip_grad) optimizer.step() acc = ((logits.max(1)[1] == batch.is_duplicate).float().sum() / batch.is_duplicate.size(0)).item() training_pbar.set_postfix(accuracy=f"{acc:.2}") model.eval() dev_acc, dev_loss = evaluate(model, dev_iter, criterion) dev_pbar.update(1) dev_pbar.set_postfix(accuracy=f"{dev_acc:.4}") is_best_dev = increment_fn(dev_loss, dev_acc=dev_acc, epoch_idx=epoch_idx + offset) if is_best_dev: dev_pbar.set_postfix(accuracy=f"{dev_acc:.4} (best loss)") # test_acc, _ = evaluate(model, test_iter, criterion, 
export_eval_labels=args.export_eval_labels) training_pbar.close() dev_pbar.close() print(f"Test accuracy of the best model: {test_acc:.4f}", file=sys.stderr) print(test_acc)
def train(pretrain=PRETRAIN): logging.debug('pretrain:{}'.format(pretrain)) if DEVICE == 'cuda': if torch.cuda.is_available() == False: logging.error("can't find a GPU device") pdb.set_trace() #model=DenseLSTM(NUM_CLASS) #model=VGGLSTM(NUM_CLASS) #model=DenseCNN(NUM_CLASS) #model=VGGFC(NUM_CLASS) model = ResNetLSTM(NUM_CLASS) if os.path.exists(MODEL_PATH) == False: os.makedirs(MODEL_PATH) if os.path.exists(PATH + DICTIONARY_NAME) == False: logging.error("can't find the dictionary") pdb.set_trace() with open(PATH + DICTIONARY_NAME, 'r') as f: dictionary = json.load(f) if pretrain == True: model.load_state_dict( torch.load(MODEL_PATH + MODEL_NAME, map_location=DEVICE)) model.to(DEVICE).train() model.register_backward_hook(backward_hook) #transforms.Resize((32,400)) dataset = ICDARRecTs_2DataSet(IMAGE_PATH, dictionary, BATCH_SIZE, img_transform=transforms.Compose([ transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.3), transforms.ToTensor(), transforms.Normalize( (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ])) dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=False) #collate_fn=dataset.collate #optimizer=Adam(model.parameters(),lr=LR,betas=(0.9,0.999),weight_decay=0) optimizer = Adadelta(model.parameters(), lr=0.01, rho=0.9, weight_decay=0) criterion = CTCLoss(blank=0) length = len(dataloader) max_accuracy = 0 if os.path.exists('max_accuracy.txt') == True: with open('max_accuracy.txt', 'r') as f: max_accuracy = float(f.read()) for epoch in range(EPOCH): epoch_time = datetime.now() epoch_correct = 0 epoch_loss = 0 min_loss = 100 for step, data in enumerate(dataloader): step_time = datetime.now() imgs, names, label_size, img_name = data #print(names,label_size) logging.debug("imgs' size:{}".format(imgs.size())) imgs = Variable(imgs, requires_grad=True).to(DEVICE) label, batch_label = dataset.transform_label(batch_name=names) label = Variable(label).to(DEVICE) label_size = Variable(label_size).to(DEVICE) preds = model(imgs) logging.debug("preds size:{}".format(preds.size())) preds_size = Variable( torch.LongTensor([preds.size(0)] * BATCH_SIZE)).to(DEVICE) loss = criterion(preds, label, preds_size, label_size) epoch_loss += loss.item() optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) optimizer.step() if min_loss > loss.item(): min_loss = loss.item() torch.save(model.state_dict(), MODEL_PATH + MODEL_NAME) num_same = if_same(preds.cpu().data, batch_label) epoch_correct += num_same logging.debug( "Epoch:{}|length:{}|step:{}|num_same:{}|loss:{:.4f}|min loss:{:.4f}" .format(epoch, length, step, num_same, loss.item(), min_loss)) logging.debug("the time of one step:{}".format(datetime.now() - step_time)) if step % 100 == 0: clear_output(wait=True) accuracy = epoch_correct / (length) * BATCH_SIZE if accuracy > max_accuracy: max_accuracy = accuracy with open('max_accuracy.txt', 'w') as f: f.write(str(max_accuracy)) torch.save(model.state_dict(), MODEL_PATH + MODEL_NAME) torch.save(model.state_dict(), MODEL_PATH + 'optimal' + str(max_accuracy) + MODEL_NAME) mean_loss = epoch_loss / length logging.info( 'Epoch:{}|accuracy:{}|mean loss:{}|the time of one epoch:{}|max accuracy:{}' .format(epoch, accuracy, mean_loss, datetime.now() - epoch_time, max_accuracy)) with open('accuracy.txt', 'a+') as f: f.write( 'Epoch:{}|accuracy:{}|mean loss:{}|the time of one epoch:{}|max accuracy:{}\n' .format(epoch, accuracy, mean_loss, datetime.now() - epoch_time, max_accuracy))
# static : pre-trained vectors are kept pass else: # MODE == 'nonstatic' if isinstance(U, torch.Tensor): U = nn.Parameter(U) else: LOGGER.info("Type error, U should be torch.Tensor. ") model = TextCNN(cfg.hidden_units[0] * 3, cfg.hidden_units[0], cfg.filter_hs) model.train() LR = 1e-04 #optimizer = Adam(model.parameters(), lr=LR) # 0.001 optimizer = Adadelta(model.parameters(), lr=1.) ################################### ### Training ################################### best_loss = float('inf') best_acc = float('-inf') for epoch in range(cfg.n_epochs): epoch_loss = 0.0 steps = 0 num = 0 pbar = tqdm(train_loader) pbar_eval = tqdm(test_loader) for field in pbar: # check shape if DEBUG:
def train(self): os.environ["CUDA_VISIBLE_DEVICES"] = '0' device_ids = [0] self.classifier = classifier() get_paprams(self.classifier) get_paprams(self.classifier.base) # data_set_eval = my_dataset(eval=True) # data_set = my_dataset_10s() # data_set_test = my_dataset_10s() data_set = my_dataset_10s_smote() data_set_test = my_dataset_10s_smote(test=True, all_data=data_set.all_data, all_label=data_set.all_label, index_=data_set.index) # data_set_eval = my_dataset_10s(eval=True) # data_set_combine = my_dataset(combine=True) batch = 300 totoal_epoch = 2000 print('batch:{}'.format(batch)) # self.evaluation = evaluation data_loader = DataLoader(data_set, batch, shuffle=True, collate_fn=detection_collate) data_loader_test = DataLoader(data_set_test, batch, False, collate_fn=detection_collate) # data_loader_eval = DataLoader(data_set_eval, batch, False, collate_fn=detection_collate) self.classifier = self.classifier.cuda() self.classifier = DataParallel(self.classifier, device_ids=device_ids) optim = Adadelta(self.classifier.parameters(), 0.1, 0.9, weight_decay=1e-5) self.cretion = smooth_focal_weight() self.classifier.apply(weights_init) start_time = time.time() count = 0 epoch = -1 while 1: epoch += 1 runing_losss = [0] * 5 for data in data_loader: loss = [0] * 5 y = data[1].cuda() x = data[0].cuda() optim.zero_grad() weight = torch.Tensor([0.5, 2, 0.5, 2]).cuda() inputs, targets_a, targets_b, lam = mixup_data(x, y) predict = self.classifier(x) ############################3 loss_func = mixup_criterion(targets_a, targets_b, lam, weight) loss5 = loss_func(self.cretion, predict[0]) loss4 = loss_func(self.cretion, predict[1]) * 0.4 loss3 = loss_func(self.cretion, predict[2]) * 0.3 loss2 = loss_func(self.cretion, predict[3]) * 0.2 loss1 = loss_func(self.cretion, predict[4]) * 0.1 tmp = loss5 + loss4 + loss3 + loss2 + loss1 # tmp = sum(loss) tmp.backward() optim.step() for i in range(5): # runing_losss[i] += (loss[i].item()) runing_losss[i] += (tmp.item()) count += 1 # torch.cuda.empty_cache() end_time = time.time() print( "epoch:{a}: loss:{b} spend_time:{c} time:{d}".format(a=epoch, b=sum(runing_losss), c=int(end_time - start_time), d=time.asctime())) start_time = end_time # vis.line(np.asarray([optim.param_groups[0]['lr']]), np.asarray([epoch]), win="lr", update='append', # opts=dict(title='lr')) # if (epoch > 20): # runing_losss = np.asarray(runing_losss).reshape(1, 5) # vis.line(runing_losss, # np.asarray([epoch] * 5).reshape(1, 5), win="loss-epoch", update='append', # opts=dict(title='loss', legend=['loss1', 'loss2', 'loss3', 'loss4', 'loss5', 'loss6'])) save(self.classifier.module.base.state_dict(), str(epoch) + 'base_c2.p') save(self.classifier.module.state_dict(), str(epoch) + 'base_all_c2.p') # print('eval:{}'.format(time.asctime(time.localtime(time.time())))) self.classifier.eval() # self.evaluation(self.classifier, data_loader_eval) # print('test:{}'.format(time.asctime(time.localtime(time.time())))) # self.evaluation(self.classifier, data_loader_eval, epoch) self.evaluation(self.classifier, data_loader_test, epoch) # self.evaluation(self.classifier, data_loader, epoch) # print('combine:{}'.format(time.asctime(time.localtime(time.time())))) # evaluation(self.classifier, data_loader_combine) self.classifier.train() if epoch % 10 == 0: adjust_learning_rate(optim, 0.9, epoch, totoal_epoch, 0.1)