def train(model, model_name, train_loader, valid_loader, epochs=1000):
    # Create callbacks and checkpoints
    lrscheduler = ReduceLROnPlateau(patience=3, verbose=True)
    early_stopping = EarlyStopping(patience=10, min_delta=1e-4, verbose=True)

    model_path = './models/'
    os.makedirs(model_path, exist_ok=True)
    ckpt_best = ModelCheckpoint(model_path + 'best_' + model_name + '.torch',
                                save_best_only=True,
                                restore_best=True,
                                temporary_filename=model_path + 'temp_best_' + model_name + '.torch',
                                verbose=True)
    ckpt_last = ModelCheckpoint(model_path + 'last_' + model_name + '.torch',
                                temporary_filename=model_path + 'temp_last_' + model_name + '.torch')

    logger_path = './train_logs/'
    os.makedirs(logger_path, exist_ok=True)
    csv_logger = CSVLogger(logger_path + model_name + '.csv')

    callbacks = [lrscheduler, ckpt_best, ckpt_last, early_stopping, csv_logger]

    # Fit the model
    model.fit_generator(train_loader, valid_loader, epochs=epochs, callbacks=callbacks)
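# A minimal usage sketch for train() above, assuming the callbacks come from
# pytoune.framework.callbacks and that `model` is a pytoune Model wrapping a
# network, optimizer, and loss. The network and data below are illustrative
# placeholders, not part of the original code.
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pytoune.framework import Model
from pytoune.framework.callbacks import (
    ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, CSVLogger
)

network = nn.Linear(10, 2)  # placeholder network
optimizer = torch.optim.SGD(network.parameters(), lr=0.01)
model = Model(network, optimizer, nn.CrossEntropyLoss())

train_loader = DataLoader(TensorDataset(torch.randn(64, 10), torch.randint(2, (64,))),
                          batch_size=8)
valid_loader = DataLoader(TensorDataset(torch.randn(16, 10), torch.randint(2, (16,))),
                          batch_size=8)

train(model, 'linear_baseline', train_loader, valid_loader, epochs=5)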
def fit(self, x_train, y_train, x_valid, y_valid, n_epochs=100, batch_size=32,
        log_filename=None, checkpoint_filename=None, with_early_stopping=True):
    """
    :param x_train: training set examples
    :param y_train: training set labels
    :param x_valid: validation set examples
    :param y_valid: validation set labels
    :param n_epochs: int, number of epochs (default 100)
    :param batch_size: int, size of the batch (default 32), must be a multiple of 2
    :param log_filename: optional, file in which to log the training information
    :param checkpoint_filename: optional, file in which to save the model
    :param with_early_stopping: whether to activate early stopping
    :return: self, the fitted model
    """
    callbacks = []
    if with_early_stopping:
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0)
        callbacks += [early_stopping]

    reduce_lr = ReduceLROnPlateau(monitor='loss', patience=2, factor=1 / 10, min_lr=1e-6)
    best_model_restore = BestModelRestore()
    callbacks += [reduce_lr, best_model_restore]

    if log_filename:
        logger = CSVLogger(log_filename, batch_granularity=False, separator='\t')
        callbacks += [logger]
    if checkpoint_filename:
        checkpointer = ModelCheckpoint(checkpoint_filename, monitor='val_loss', save_best_only=True)
        callbacks += [checkpointer]

    # self.model.fit(x_train, y_train, x_valid, y_valid,
    #                batch_size=batch_size, epochs=n_epochs,
    #                callbacks=callbacks)
    nb_steps_train = int(len(x_train) / batch_size)
    nb_steps_valid = int(len(x_valid) / batch_size)
    self.model.fit_generator(
        generator(x_train, y_train, batch_size),
        steps_per_epoch=nb_steps_train,
        valid_generator=generator(x_valid, y_valid, batch_size),
        validation_steps=nb_steps_valid,
        epochs=n_epochs,
        callbacks=callbacks,
    )
    return self
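# fit() above relies on a `generator` helper that is not shown here. A minimal
# sketch of what such a helper could look like, assuming x and y are numpy
# arrays and that it yields (inputs, targets) batches indefinitely so that
# steps_per_epoch controls the epoch length (an illustration, not the original
# implementation):
import numpy as np
import torch


def generator(x, y, batch_size):
    while True:
        indices = np.random.permutation(len(x))
        for start in range(0, len(x) - batch_size + 1, batch_size):
            batch = indices[start:start + batch_size]
            yield torch.as_tensor(x[batch]), torch.as_tensor(y[batch])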
def test_reduce_lr_on_plateau_integration(self):
    train_gen = some_data_generator(OptimizerCheckpointTest.batch_size)
    valid_gen = some_data_generator(OptimizerCheckpointTest.batch_size)
    reduce_lr = ReduceLROnPlateau(monitor='loss', patience=3)
    checkpointer = LRSchedulerCheckpoint(reduce_lr, self.checkpoint_filename, period=1)
    self.model.fit_generator(train_gen, valid_gen,
                             epochs=OptimizerCheckpointTest.epochs,
                             steps_per_epoch=5,
                             callbacks=[checkpointer])
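# The test above uses a `some_data_generator` helper defined elsewhere in the
# test suite. Helpers of this kind are usually infinite generators of small
# random batches; an illustrative sketch (an assumption, not the original
# definition):
import torch


def some_data_generator(batch_size):
    while True:
        x = torch.rand(batch_size, 1)
        y = torch.rand(batch_size, 1)
        yield x, y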
def fit(self, meta_train, meta_valid, n_epochs=100, steps_per_epoch=100,
        log_filename=None, checkpoint_filename=None, tboard_folder=None):
    if hasattr(self.model, 'is_eval'):
        self.model.is_eval = False
    self.is_eval = False
    self.steps_per_epoch = steps_per_epoch

    callbacks = [
        EarlyStopping(patience=10, verbose=False),
        ReduceLROnPlateau(patience=2, factor=1 / 2, min_lr=1e-6, verbose=True),
        BestModelRestore()
    ]
    if log_filename:
        callbacks += [CSVLogger(log_filename, batch_granularity=False, separator='\t')]
    if checkpoint_filename:
        callbacks += [
            ModelCheckpoint(checkpoint_filename, monitor='val_loss', save_best_only=True,
                            temporary_filename=checkpoint_filename + 'temp')
        ]
    if tboard_folder is not None:
        self.writer = SummaryWriter(tboard_folder)

    self.fit_generator(meta_train, meta_valid,
                       epochs=n_epochs,
                       steps_per_epoch=steps_per_epoch,
                       validation_steps=steps_per_epoch,
                       callbacks=callbacks,
                       verbose=True)
    self.is_fitted = True
    return self
def test_reduce_lr_on_plateau_integration(self):
    reduce_lr = ReduceLROnPlateau(monitor='loss', patience=3)
    self._fit_with_callback_integration(reduce_lr)
def test_reduce_lr_checkpoints(self):
    reduce_lr = ReduceLROnPlateau(monitor='loss', patience=3)
    checkpointer = LRSchedulerCheckpoint(reduce_lr, self.checkpoint_filename, period=1)
    self._test_checkpointer(checkpointer, reduce_lr)
def main():
    randomhash = ''.join(str(time.time()).split('.'))

    parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='data/penn/',
                        help='location of the data corpus')
    parser.add_argument('--model', type=str, default='LSTM',
                        help='type of recurrent net (LSTM, QRNN, GRU)')
    parser.add_argument('--emsize', type=int, default=400,
                        help='size of word embeddings')
    parser.add_argument('--nhid', type=int, default=1150,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=3,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=30,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=8000,
                        help='upper epoch limit')
    parser.add_argument('--batch_size', type=int, default=80, metavar='N',
                        help='batch size')
    parser.add_argument('--bptt', type=int, default=70,
                        help='sequence length')
    parser.add_argument('--dropout', type=float, default=0.4,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--dropouth', type=float, default=0.3,
                        help='dropout for rnn layers (0 = no dropout)')
    parser.add_argument('--dropouti', type=float, default=0.65,
                        help='dropout for input embedding layers (0 = no dropout)')
    parser.add_argument('--dropoute', type=float, default=0.1,
                        help='dropout to remove words from embedding layer (0 = no dropout)')
    parser.add_argument('--wdrop', type=float, default=0.5,
                        help='amount of weight dropout to apply to the RNN hidden to hidden matrix')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--nonmono', type=int, default=5,
                        help='non-monotone interval before triggering ASGD')
    parser.add_argument('--cuda', action='store_false',
                        help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str, default=randomhash + '.pt',
                        help='path to save the final model')
    parser.add_argument('--alpha', type=float, default=2,
                        help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
    parser.add_argument('--beta', type=float, default=1,
                        help='beta slowness regularization applied on RNN activation (beta = 0 means no regularization)')
    parser.add_argument('--wdecay', type=float, default=1.2e-6,
                        help='weight decay applied to all weights')
    parser.add_argument('--resume', type=str, default='',
                        help='path of model to resume')
    parser.add_argument('--optimizer', type=str, default='sgd',
                        help='optimizer to use (sgd, adam)')
    parser.add_argument('--when', nargs="+", type=int, default=[-1],
                        help='When (which epochs) to divide the learning rate by 10 - accepts multiple')
    args = parser.parse_args()
    args.tied = True

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    ###############################################################################
    # Load data
    ###############################################################################
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    eval_batch_size = 20
    test_batch_size = 1
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    train_loader = SentenceLoader(train_data, args.bptt)
    valid_loader = SentenceLoader(val_data, args.bptt, False)
    test_loader = SentenceLoader(test_data, args.bptt, False)

    ntokens = len(corpus.dictionary)
    model = m.RNNModel(
        args.model, ntokens, args.emsize, args.nhid, args.nlayers,
        args.dropout, args.dropouth, args.dropouti, args.dropoute,
        args.wdrop, args.tied, args.alpha, args.beta, args.batch_size
    )
    if args.model == 'QRNN':
        model.reset()

    params = list(model.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
                       for x in params if x.size())
    print('Args:', args)
    print('Model total parameters:', total_params)

    optimizer = None
    if args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(params, lr=args.lr, weight_decay=args.wdecay)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)

    device = None
    device_id = 0
    if torch.cuda.is_available():
        # Fix bug where memory is allocated on GPU0 when asked to take GPU1.
        torch.cuda.set_device(device_id)
        device = torch.device('cuda:%d' % device_id)
        logging.info("Training on GPU %d" % device_id)
    else:
        logging.info("Training on CPU")

    dataset = args.data.split('/')[-1]
    model_name = "AWD_{}_{}".format(args.model, dataset)
    expt_name = './expt_{}'.format(model_name)
    expt_dir = get_experiment_directory(expt_name)
    expt = PytouneExperiment(
        expt_dir, model,
        device=device,
        optimizer=optimizer,
        monitor_metric='val_loss',
        monitor_mode='min'
    )

    callbacks = [
        HiddenInitCallback(args.batch_size, eval_batch_size),
        HiddenRepackagingCallback(),
        ClipNorm(params, args.clip),
        # EvaluationCallback(),
        ASGDOptimizerSwitchCallback(args),
        ReduceLROnPlateau(monitor='val_loss', mode='min', patience=20, factor=0.5,
                          threshold_mode='abs', threshold=1e-3, verbose=True),
        AdaptativeLRSchedulerCallback(train_loader),
    ]

    try:
        expt.train(train_loader, valid_loader, callbacks=callbacks, seed=args.seed)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    print("Testing on test set...")
    expt.test(test_loader)
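# main() above relies on a `batchify` helper that is not shown in this excerpt.
# In the upstream awd-lstm-lm code it trims the flat token stream to a multiple
# of the batch size and reshapes it into `bsz` columns; a sketch along those
# lines (illustrative, may differ from this repository's exact version):
def batchify(data, bsz, args):
    # Work out how many full columns of size `bsz` fit in the data.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that would not cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the `bsz` batches (columns).
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data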