def initialize(self, embeddings):
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
    # TODO: make this optional
    if 'test_file' in self.dataset:
        vocab_sources.append(self.dataset['test_file'])
    vocab1, vocab2 = self.reader.build_vocabs(vocab_sources,
                                              min_f=Task._get_min_f(self.config_params),
                                              vocab_file=self.dataset.get('vocab_file'))
    # To keep the config file simple, share a list between source and destination (tgt)
    features_src = []
    features_tgt = None
    for feature in self.config_params['features']:
        if feature['name'] == 'tgt':
            features_tgt = feature
        else:
            features_src += [feature]

    self.src_embeddings, self.feat2src = self._create_embeddings(embeddings_set, vocab1, features_src)
    # For now, don't allow multiple vocabs of output
    baseline.save_vocabs(self.get_basedir(), self.feat2src)

    self.tgt_embeddings, self.feat2tgt = self._create_embeddings(embeddings_set, {'tgt': vocab2}, [features_tgt])
    baseline.save_vocabs(self.get_basedir(), self.feat2tgt)
    self.tgt_embeddings = self.tgt_embeddings['tgt']
    self.feat2tgt = self.feat2tgt['tgt']

def initialize(self, embeddings):
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    vocabs = self.reader.build_vocab(
        [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']],
        min_f=Task._get_min_f(self.config_params),
    )
    self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
    baseline.save_vocabs(self.get_basedir(), self.feat2index)

def initialize(self, embeddings):
    embeddings_set = mead.utils.index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache, True).download()
    print("[train file]: {}\n[valid file]: {}\n[test file]: {}\n[vocab file]: {}".format(
        self.dataset['train_file'],
        self.dataset['valid_file'],
        self.dataset['test_file'],
        self.dataset.get('vocab_file', "None")))
    vocab_file = self.dataset.get('vocab_file', None)
    if vocab_file is not None:
        vocab1, vocab2 = self.reader.build_vocabs([vocab_file])
    else:
        vocab1, vocab2 = self.reader.build_vocabs(
            [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']])
    self.embeddings1, self.feat2index1 = self._create_embeddings(embeddings_set, {'word': vocab1})
    self.embeddings2, self.feat2index2 = self._create_embeddings(embeddings_set, {'word': vocab2})

def initialize(self, embeddings):
    embeddings_set = mead.utils.index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print("[train file]: {}\n[valid file]: {}\n[test file]: {}".format(
        self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']))
    vocab, self.num_words = self.reader.build_vocab(
        [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']])
    self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocab)

def initialize(self, embeddings):
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print_dataset_info(self.dataset)
    vocab_sources = [self.dataset['train_file'], self.dataset['valid_file']]
    # TODO: make this optional
    if 'test_file' in self.dataset:
        vocab_sources.append(self.dataset['test_file'])
    vocabs = self.reader.build_vocab(vocab_sources,
                                     min_f=Task._get_min_f(self.config_params),
                                     vocab_file=self.dataset.get('vocab_file'))
    self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs, self.config_params['features'])
    baseline.save_vocabs(self.get_basedir(), self.feat2index)

def initialize(self, embeddings):
    self.dataset = DataDownloader(self.dataset, self.data_download_cache).download()
    print("[train file]: {}\n[valid file]: {}\n[test file]: {}".format(
        self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']))
    embeddings = read_config_file_or_json(embeddings, 'embeddings')
    embeddings_set = index_by_label(embeddings)
    vocabs = self.reader.build_vocab(
        [self.dataset['train_file'], self.dataset['valid_file'], self.dataset['test_file']])
    self.embeddings, self.feat2index = self._create_embeddings(embeddings_set, vocabs)

def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key", type=str, default='wikitext-2', help="key from DATASETS global")
    parser.add_argument("--train_file", type=str, help='Optional file path to use for train file')
    parser.add_argument("--valid_file", type=str, help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache", type=str, default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model", type=int, default=410, help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads", type=int, default=10, help="Number of heads")
    parser.add_argument("--num_layers", type=int, default=16, help="Number of layers")
    parser.add_argument("--nctx", type=int, default=256, help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens", choices=["words", "chars", "bpe", "wordpiece"], default="wordpiece",
                        help="What tokens to use")
    parser.add_argument("--subword_model_file", type=str, help="If using subwords, pass this", default='bert-base-cased')
    parser.add_argument("--subword_vocab_file", type=str, help="If using subwords with separate vocab file, pass here")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr", type=float, default=4.0e-4, help="Learning rate")
    parser.add_argument("--clip", type=float, default=0.25, help="Clipping gradient norm")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay")
    parser.add_argument("--epochs", type=int, default=20, help="Num training epochs")
    parser.add_argument("--restart_from", type=str, help="Option allows you to restart from a previous checkpoint")
    parser.add_argument("--warmup_steps", type=int, default=1000, help="Num warmup steps")
    parser.add_argument("--mlm", type=str2bool, default=False, help="Use Masked Language Model (MLM) objective")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed", type=str2bool, default=False, help="Are we doing distributed training?")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1 means use the environment variables to find)")
    parser.add_argument("--chars_per_word", type=int, default=40, help="How many max characters per word")
    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error("If you provide a train_file, you must provide a valid_file")
        return
    if not args.train_file and args.valid_file:
        logger.error("If you provide a valid_file, you must also provide a train_file")
        return
    if args.tokens == "chars" and args.mlm:
        logger.error("Character composition cannot currently be used with the MLM objective")
        return

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key, args.tokens, os.getpid())
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE", 1)) > 1
    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    if args.train_file:
        dataset = {'train_file': args.train_file, 'valid_file': args.valid_file}
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key], args.dataset_cache).download()

    reader = create_reader(args.tokens, args.nctx, args.chars_per_word, args.subword_model_file, args.subword_vocab_file)
    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset, args.dataset_key, args.d_model, args.cache_features)
    vocabs = preproc_data['vocabs']
    if args.mlm:
        mask_from = vocabs['x']
        vocab_size = len(mask_from)
        mask_value = mask_from.get("[MASK]", mask_from.get("<MASK>", -1))
        if mask_value == -1:
            logger.error("We could not find a suitable masking token in the vocab")
            return

    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs, args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs, args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]", valid_set.tensors[-1].numel(), valid_num_words)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set) if args.distributed else None
    train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=args.batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False)
    logger.info("Loaded datasets")

    model = TransformerLanguageModel.create(embeddings,
                                            hsz=args.d_model,
                                            d_ff=args.d_ff,
                                            tie_weights=(args.tokens != 'chars'),
                                            dropout=args.dropout,
                                            gpu=False,
                                            num_heads=args.num_heads,
                                            layers=args.num_layers,
                                            src_keys=['x'],
                                            tgt_key=tgt_key)
    model.to(args.device)
    loss_function = model.create_loss()
    loss_function.to(args.device)
    logger.info("Loaded model and loss")

    steps_per_epoch = len(train_loader)
    update_on = steps_per_epoch // 10
    cosine_decay = CosineDecaySchedulerPyTorch(len(train_loader) * args.epochs, lr=args.lr)
    linear_warmup = WarmupLinearSchedulerPyTorch(args.warmup_steps, lr=args.lr)
    lr_sched = CompositeLRScheduler(linear_warmup, cosine_decay, lr=args.lr)

    global_step = 0
    start_epoch = 0
    if args.restart_from:
        model.load_state_dict(torch.load(args.restart_from))
        start_epoch = int(args.restart_from.split("-")[-1].split(".")[0]) - 1
        global_step = (start_epoch + 1) * steps_per_epoch
        logger.info("Restarting from a previous checkpoint %s.\n\tStarting at global_step=%d, epoch=%d",
                    args.restart_from, global_step, start_epoch + 1)
    optimizer = OptimizerManager(model, global_step, optim='adam', lr=args.lr, lr_function=lr_sched,
                                 weight_decay=args.weight_decay)
    logger.info("Model has {:,} parameters".format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)
    # This is the training loop
    for epoch in range(start_epoch, args.epochs):
        avg_loss = Average('average_train_loss')
        metrics = {}
        optimizer.zero_grad()

        if args.distributed:
            train_sampler.set_epoch(epoch)

        start = time.time()
        model.train()
        for i, batch in enumerate(train_loader):
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device)

            if args.mlm:
                # Replace 15% of tokens
                masked_indices = torch.bernoulli(torch.full(labels.shape, 0.15)).byte()
                # Anything not masked is 0 so no loss
                labels[~masked_indices] = 0
                # Of the masked items, mask 80% of them with [MASK]
                indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
                inputs['x'][indices_replaced] = mask_value
                # Replace 10% of them with random words, rest preserved for auto-encoding
                indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
                random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long, device=args.device)
                inputs['x'][indices_random] = random_words[indices_random]

            labels = labels.transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            if args.mlm:
                loss = loss_function(logits, labels)
            else:
                shift_logits = logits[:-1]
                shift_labels = labels[1:]
                loss = loss_function(shift_logits, shift_labels)
            loss.backward()
            avg_loss.update(loss.item())

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            optimizer.step()
            optimizer.zero_grad()
            if (i + 1) % update_on == 0:
                logging.info(avg_loss)

        # How much time elapsed in minutes
        elapsed = (time.time() - start) / 60
        train_token_loss = avg_loss.avg  # This is the average training token-level loss across all machines
        # This is the token-level training perplexity
        train_token_ppl = math.exp(train_token_loss)
        metrics['train_elapsed_min'] = elapsed
        metrics['average_train_loss'] = train_token_loss
        metrics['train_ppl'] = train_token_ppl

        model_base = os.path.join(args.basedir, 'checkpoint')
        avg_valid_loss = Average('average_valid_loss')
        start = time.time()
        model.eval()
        for batch in valid_loader:
            with torch.no_grad():
                x, y = batch
                inputs = {'x': x.to(args.device)}
                labels = y.to(args.device)

                if args.mlm:
                    # Replace 15% of tokens
                    masked_indices = torch.bernoulli(torch.full(labels.shape, 0.15)).byte()
                    # Anything not masked is 0 so no loss
                    labels[~masked_indices] = 0
                    # Of the masked items, mask 80% of them with [MASK]
                    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).byte() & masked_indices
                    inputs['x'][indices_replaced] = mask_value
                    # Replace 10% of them with random words
                    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).byte() & masked_indices & ~indices_replaced
                    random_words = torch.randint(vocab_size, labels.shape, dtype=torch.long, device=args.device)
                    inputs['x'][indices_random] = random_words[indices_random]

                labels = labels.transpose(0, 1).contiguous()
                logits = model(inputs, None)[0].transpose(0, 1).contiguous()
                if args.mlm:
                    loss = loss_function(logits, labels)
                else:
                    shift_logits = logits[:-1]
                    shift_labels = labels[1:]
                    loss = loss_function(shift_logits, shift_labels)
                avg_valid_loss.update(loss.item())

        valid_token_loss = avg_valid_loss.avg
        valid_token_ppl = math.exp(valid_token_loss)

        elapsed = (time.time() - start) / 60
        metrics['valid_elapsed_min'] = elapsed
        metrics['average_valid_loss'] = valid_token_loss
        if args.tokens in ['bpe', 'wordpiece']:
            metrics['valid_token_ppl'] = valid_token_ppl
            metrics['average_valid_word_ppl'] = math.exp(valid_token_loss * valid_set.tensors[-1].numel() / valid_num_words)
        else:
            metrics['average_valid_word_ppl'] = valid_token_ppl
        logger.info(metrics)

        if args.local_rank < 1:
            # Should probably do this more often
            checkpoint_name = checkpoint_for(model_base, epoch + 1)
            logger.info("Creating checkpoint: %s", checkpoint_name)
            if args.distributed:
                torch.save(model.module.state_dict(), checkpoint_name)
            else:
                torch.save(model.state_dict(), checkpoint_name)
            rm_old_checkpoints(model_base, epoch + 1)

parser.add_argument('--datasets', help='json library of dataset labels', default='config/datasets.json', type=convert_path)
parser.add_argument('--embeddings', help='json library of embeddings', default='config/embeddings.json', type=convert_path)
args = parser.parse_args()

datasets = read_json(args.datasets)
datasets = index_by_label(datasets)
for name, d in datasets.items():
    print(name)
    try:
        DataDownloader(d, args.cache).download()
    except Exception as e:
        print(e)

emb = read_json(args.embeddings)
emb = index_by_label(emb)
for name, e in emb.items():
    print(name)
    try:
        EmbeddingDownloader(e['file'], e['dsz'], e.get('sha1'), args.cache).download()
    except Exception as e:
        print(e)

def train():
    parser = ArgumentParser()
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--dataset_key", type=str, default='wikitext-2', help="key from DATASETS global")
    parser.add_argument("--train_file", type=str, help='Optional file path to use for train file')
    parser.add_argument("--valid_file", type=str, help='Optional file path to use for valid file')
    parser.add_argument("--dataset_cache", type=str, default=os.path.expanduser('~/.bl-data'),
                        help="Path or url of the dataset cache")
    parser.add_argument("--cache_features", type=str2bool, default=True)
    parser.add_argument("--d_model", type=int, default=410, help="Model dimension (and embedding dsz)")
    parser.add_argument("--d_ff", type=int, default=2100, help="FFN dimension")
    parser.add_argument("--num_heads", type=int, default=10, help="Number of heads")
    parser.add_argument("--num_layers", type=int, default=8, help="Number of layers")
    parser.add_argument("--nctx", type=int, default=256, help="Max input length")
    parser.add_argument("--batch_size", type=int, default=8, help="Batch Size")
    parser.add_argument("--tokens", choices=["words", "chars", "subwords"], default="subwords",
                        help="What tokens to use")
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout")
    parser.add_argument("--lr", type=float, default=4.0e-4, help="Learning rate")
    parser.add_argument("--clip", type=float, default=0.25, help="Clipping gradient norm")
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay")
    parser.add_argument("--epochs", type=int, default=20, help="Num training epochs")
    parser.add_argument("--warmup_steps", type=int, default=1000, help="Num warmup steps")
    parser.add_argument("--eval_every", type=int, default=-1, help="Evaluate every X steps (-1 => end of epoch)")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--distributed", type=str2bool, default=False, help="Are we doing distributed training?")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1 means use the environment variables to find)")
    parser.add_argument("--chars_per_word", type=int, default=40, help="How many max characters per word")
    parser.add_argument("--accum_grad_steps", type=int, default=1,
                        help="Create effective batch size by accumulating grads without updates")
    args = parser.parse_args()

    if args.train_file and not args.valid_file:
        logger.error("If you provide a train_file, you must provide a valid_file")
        return
    if not args.train_file and args.valid_file:
        logger.error("If you provide a valid_file, you must also provide a train_file")
        return

    if args.basedir is None:
        args.basedir = 'transformer-{}-{}-{}'.format(args.dataset_key, args.tokens, os.getpid())
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("Cache directory [%s]", args.dataset_cache)

    args.distributed = args.distributed or int(os.environ.get("WORLD_SIZE", 1)) > 1
    if args.distributed:
        if args.local_rank == -1:
            # https://github.com/kubeflow/pytorch-operator/issues/128
            # https://github.com/pytorch/examples/blob/master/imagenet/main.py
            logger.info("Setting local rank to RANK env variable")
            args.local_rank = int(os.environ['RANK'])
        logger.warning("Local rank (%d)", args.local_rank)
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    if args.train_file:
        dataset = {'train_file': args.train_file, 'valid_file': args.valid_file}
    else:
        dataset = DataDownloader(DATASETS[args.dataset_key], args.dataset_cache).download()

    reader = create_reader(args.tokens, args.nctx, args.chars_per_word)
    preproc_data = load_embed_and_vocab(args.tokens, reader, dataset, args.dataset_key, args.d_model, args.cache_features)
    vocabs = preproc_data['vocabs']
    os.makedirs(args.basedir, exist_ok=True)
    # We want to make sure to save our input vocab into the basedir for reuse later
    write_json(vocabs['x'], os.path.join(args.basedir, 'vocabs.json'))
    embeddings = preproc_data['embeddings']
    valid_num_words = preproc_data['valid_num_words']
    tgt_key = preproc_data['tgt_key']
    logger.info("Loaded embeddings")

    train_set = load_data(args.tokens, reader, dataset, 'train_file', vocabs, args.cache_features)
    valid_set = load_data(args.tokens, reader, dataset, 'valid_file', vocabs, args.cache_features)
    logger.info("valid. tokens [%s], valid. words [%s]", valid_set.tensors[-1].numel(), valid_num_words)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_set) if args.distributed else None
    train_loader = DataLoader(train_set, sampler=train_sampler, batch_size=args.batch_size,
                              shuffle=(not args.distributed))
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_set) if args.distributed else None
    valid_loader = DataLoader(valid_set, sampler=valid_sampler, batch_size=args.batch_size, shuffle=False)
    logger.info("Loaded datasets")

    model = TransformerLanguageModel.create(embeddings,
                                            hsz=args.d_model,
                                            d_ff=args.d_ff,
                                            tie_weights=(args.tokens != 'chars'),
                                            dropout=args.dropout,
                                            gpu=False,
                                            num_heads=args.num_heads,
                                            layers=args.num_layers,
                                            src_keys=['x'],
                                            tgt_key=tgt_key)
    model.to(args.device)
    train_loss = model.create_loss()
    train_loss.to(args.device)
    logger.info("Loaded model and loss")

    optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
        logger.info("Model located on %d", args.local_rank)

    def update(engine, batch):
        model.train()
        x, y = batch
        inputs = {'x': x.to(args.device)}
        labels = y.to(args.device).transpose(0, 1).contiguous()
        logits = model(inputs, None)[0].transpose(0, 1).contiguous()
        shift_logits = logits[:-1]
        shift_labels = labels[1:]
        loss = train_loss(shift_logits, shift_labels)
        loss = loss / args.accum_grad_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if engine.state.iteration % args.accum_grad_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(_, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            inputs = {'x': x.to(args.device)}
            labels = y.to(args.device).transpose(0, 1).contiguous()
            logits = model(inputs, None)[0].transpose(0, 1).contiguous()
            shift_logits = logits[:-1]
            shift_labels = labels[1:]
            return shift_logits.view(-1, logits.size(-1)), shift_labels.view(-1)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(valid_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(Events.ITERATION_COMPLETED,
                                  lambda engine: evaluator.run(valid_loader)
                                  if engine.state.iteration % args.eval_every == 0 else None)

    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED,
                                  lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED,
                                    lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    cos_scheduler = CosineAnnealingScheduler(optimizer, 'lr', args.lr, 0.0, len(train_loader) * args.epochs)
    scheduler = create_lr_scheduler_with_warmup(cos_scheduler, 0.0, args.lr, args.warmup_steps)
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    if args.tokens == 'subwords':
        # If we compute subwords, need to renormalize for num words
        metrics["average_subword_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
        metrics["average_word_ppl"] = MetricsLambda(
            lambda x: math.exp(x * valid_set.tensors[-1].numel() / valid_num_words), metrics["average_nll"])
    else:
        metrics["average_word_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    if args.local_rank < 1:
        RunningAverage(output_transform=lambda x: x).attach(trainer, "valid_loss")
        trainer.add_event_handler(Events.EPOCH_COMPLETED,
                                  lambda _: print("Epoch[{}] Training Loss: {:.2f}, Perplexity {:.2f}".format(
                                      trainer.state.epoch, trainer.state.output, np.exp(trainer.state.output))))
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: print("Validation: %s" % pformat(evaluator.state.metrics)))
        checkpoint_handler = ModelCheckpoint(args.basedir, 'checkpoint', save_interval=1, n_saved=3, create_dir=False)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
                                  {'mymodel': getattr(model, 'module', model)})

    trainer.run(train_loader, max_epochs=args.epochs)