len(train_batched) // args.target_seq_len) optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta) train_( lm, train_data, optim, logger, args.clip, use_ivecs=False, custom_batches=False, ) val_loss = evaluate_( lm, valid_data, use_ivecs=False, custom_batches=False, ) print( epoch_summary(epoch, logger.nb_updates(), logger.time_since_creation(), val_loss)) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss: torch.save(lm, args.save) best_val_loss = val_loss else: lr /= 2.0 pass
def main(args):
    """Fine-tune a pre-trained LM on corrupted input/target streams.

    Loads the model from ``args.load``, trains with plain SGD, evaluates on a
    clean (uncorrupted) validation set, and halves the learning rate after
    ``args.patience`` epochs without validation improvement. The best model
    (by validation loss) is saved to ``args.save``.
    """
    print(args)
    logging.basicConfig(level=logging.INFO, format='[%(levelname)s::%(name)s] %(message)s')
    init_seeds(args.seed, args.cuda)

    print("loading model...")
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    # Override the label-smoothing strength of the loaded model's loss.
    lm.decoder.core_loss.amount = args.label_smoothing
    print(lm.model)
    print('Label smoothing power', lm.decoder.core_loss.amount)

    tokenize_regime = 'words'

    print("preparing training data...")
    train_ids = tokens_from_fn(args.train, lm.vocab, randomize=False, regime=tokenize_regime)
    train_streams = form_input_targets(train_ids)
    # Corrupt both inputs and targets (substitutions / deletions / insertions),
    # but never touch the sentence-end token.
    corrupted_provider = InputTargetCorruptor(
        train_streams,
        args.subs_rate, args.target_subs_rate,
        len(lm.vocab),
        args.del_rate, args.ins_rate,
        protected=[lm.vocab['</s>']],
    )
    batch_former = LazyBatcher(args.batch_size, corrupted_provider)
    train_data = TemplSplitterClean(args.target_seq_len, batch_former)
    train_data_stream = OndemandDataProvider(TransposeWrapper(train_data), args.cuda)

    print("preparing validation data...")
    evaluator = EnblockEvaluator(lm, args.valid, 10, args.target_seq_len)
    # NOTE: evaluating (de facto LR scheduling) with input corruption
    # (SubstitutionalEnblockEvaluator) did not help during the CHiME-6
    # evaluation, so plain enblock evaluation is used on purpose.

    def val_loss_fn():
        return evaluator.evaluate().loss_per_token

    print("computing initial PPL...")
    initial_val_loss = val_loss_fn()
    print('Initial perplexity {:.2f}'.format(math.exp(initial_val_loss)))

    print("training...")
    lr = args.lr
    best_val_loss = None
    val_watcher = ValidationWatcher(val_loss_fn, initial_val_loss, args.val_interval, args.workdir, lm)

    optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta)
    # BUGFIX: initialize before the loop; previously `patience_ticks` was only
    # set inside the improvement branch, relying on the first epoch improving.
    patience_ticks = 0
    for epoch in range(1, args.epochs + 1):
        # NOTE(review): len(list(train_data)) materializes the split stream to
        # count updates per epoch — assumes train_data is re-iterable; confirm.
        logger = ProgressLogger(epoch, args.log_interval, lr,
                                len(list(train_data)) // args.target_seq_len)
        hidden = None
        for X, targets in train_data_stream:
            if hidden is None:
                hidden = lm.model.init_hidden(args.batch_size)
            # Detach hidden state so BPTT does not extend across batches.
            hidden = repackage_hidden(hidden)

            lm.train()
            output, hidden = lm.model(X, hidden)
            loss, nb_words = lm.decoder.neg_log_prob(output, targets)
            loss /= nb_words
            val_watcher.log_training_update(loss.data, nb_words)

            optim.zero_grad()
            loss.backward()
            # Use the in-place, non-deprecated variant of gradient clipping.
            torch.nn.utils.clip_grad_norm_(lm.parameters(), args.clip)
            optim.step()
            logger.log(loss.data)

        val_loss = val_loss_fn()
        print(epoch_summary(epoch, logger.nb_updates(), logger.time_since_creation(), val_loss))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            torch.save(lm, args.save)
            best_val_loss = val_loss
            patience_ticks = 0
        else:
            patience_ticks += 1
            if patience_ticks > args.patience:
                lr /= 2.0
                # BUGFIX: propagate the decayed LR to the optimizer. Previously
                # only the local `lr` variable was halved while SGD kept running
                # at the original learning rate, making the schedule a no-op.
                for group in optim.param_groups:
                    group['lr'] = lr
                patience_ticks = 0
def main():
    """Train a loaded LM on independent lines (argparse-driven script).

    Parses CLI arguments, loads the model from ``--load``, trains with SGD on
    shuffled independent lines, and halves the learning rate whenever the
    validation loss fails to improve. The best model is saved to ``--save``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=str, required=True,
                        help='location of the train corpus')
    parser.add_argument('--valid', type=str, required=True,
                        help='location of the valid corpus')
    parser.add_argument('--shuffle-lines', action='store_true',
                        help='shuffle lines before every epoch')
    parser.add_argument('--max-batch-size', type=int, default=20,
                        help='maximal batch size')
    parser.add_argument('--max-softmaxes', type=int, default=1000,
                        help='maximal number of softmaxes in a single batch')
    parser.add_argument('--lr', type=float, default=20,
                        help='initial learning rate')
    parser.add_argument('--beta', type=float, default=0,
                        help='L2 regularization penalty')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='report interval')
    parser.add_argument('--val-interval', type=int, default=1000000, metavar='N',
                        help='validation interval in number of tokens')
    parser.add_argument('--workdir',
                        help='where to put models, logs etc.')
    parser.add_argument('--load', type=str, required=True,
                        help='where to load a model from')
    parser.add_argument('--save', type=str, required=True,
                        help='path to save the final model')
    args = parser.parse_args()
    print(args)

    init_seeds(args.seed, args.cuda)

    print("loading model...")
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("preparing training data...")
    with open(args.train) as f:
        train_lines = get_independent_lines(f, lm.vocab)

    # Report OOV statistics of the training corpus.
    nb_train_tokens = sum(len(ids) for ids in train_lines)
    nb_oovs = sum(sum(ids == lm.vocab.unk_ind).detach().item() for ids in train_lines)
    print('Nb oovs: {} / {} ({:.2f} %)\n'.format(
        nb_oovs, nb_train_tokens, 100.0 * nb_oovs / nb_train_tokens))

    evaluator = IndependentLinesEvaluator(lm, args.valid, args.max_batch_size, args.max_softmaxes)

    print("computing initial PPL...")
    initial_evaluation = evaluator.evaluate('')
    print('Initial perplexity {:.2f}'.format(math.exp(initial_evaluation.loss_per_token)))

    print("training...")
    lr = args.lr
    best_val_loss = None
    val_watcher = ValidationWatcher(
        lambda: evaluator.evaluate('').loss_per_token,
        initial_evaluation.loss_per_token,
        args.val_interval, args.workdir, lm,
    )

    optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta)
    for epoch in range(1, args.epochs + 1):
        logger = InfinityLogger(epoch, args.log_interval, lr)
        nb_batches = 0
        nb_tokens = 0
        running_loss = 0.0
        t0 = time.time()

        # NOTE(review): lines are shuffled unconditionally even though a
        # --shuffle-lines flag exists; the flag is never consulted — confirm
        # whether shuffling should be gated on args.shuffle_lines.
        random.shuffle(train_lines)
        train_data_stream = OndemandDataProvider(
            Batcher(train_lines, args.max_batch_size, args.max_softmaxes),
            cuda=False,
        )
        for batch in train_data_stream:
            nb_batches += 1
            lm.train()
            loss = lm.batch_nll_idxs(batch).sum()
            running_loss += loss.detach().item()
            nb_words = sum(len(s) for s in batch)
            nb_tokens += nb_words
            loss /= nb_words
            val_watcher.log_training_update(loss.data, nb_words)

            optim.zero_grad()
            loss.backward()
            # Use the in-place, non-deprecated variant of gradient clipping.
            torch.nn.utils.clip_grad_norm_(lm.parameters(), args.clip)
            optim.step()
            logger.log(loss.data)

        val_loss = evaluator.evaluate('').loss_per_token
        print(
            f'epoch {epoch}: {nb_batches} batches, train loss {running_loss:.1f}, running PPL {math.exp(running_loss/nb_tokens):.2f}, val PPL {math.exp(val_loss):.2f}, {time.time() - t0:.1f} sec'
        )
        print(epoch_summary(epoch, logger.nb_updates(), logger.time_since_creation(), val_loss))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            torch.save(lm, args.save)
            best_val_loss = val_loss
        else:
            lr /= 2.0
            # BUGFIX: propagate the decayed LR to the optimizer. Previously only
            # the local `lr` variable was halved while SGD kept running at the
            # original learning rate, making the decay a no-op.
            for group in optim.param_groups:
                group['lr'] = lr
def main(args):
    """Train a loaded LM on an enblock token stream with patience-based LR decay.

    Training data come either from a YAML spec (``args.train_yaml``) or a plain
    corpus file. The learning rate is halved after ``args.patience`` epochs
    without validation improvement and training stops once it falls below
    ``args.min_lr``. The best model (by validation loss) is saved to
    ``args.save``.
    """
    print(args)
    init_seeds(args.seed, args.cuda)

    print("loading model...")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    lm = torch.load(args.load).to(device)
    print(lm.model)

    print("preparing training data...")
    if args.train_yaml:
        train_data_stream, single_stream_len = yaml_factory(args.train_yaml, lm, device)
    else:
        train_data_stream, single_stream_len = plain_factory(
            data_fn=args.train,
            lm=lm,
            tokenize_regime=args.tokenize_regime,
            batch_size=args.batch_size,
            device=device,
            target_seq_len=args.target_seq_len,
        )

    print("preparing validation data...")
    evaluator = EnblockEvaluator(lm, args.valid, 10, args.target_seq_len,
                                 tokenize_regime=args.tokenize_regime)

    def val_loss_fn():
        return evaluator.evaluate().loss_per_token

    print("computing initial PPL...")
    initial_val_loss = val_loss_fn()
    print('Initial perplexity {:.2f}'.format(math.exp(initial_val_loss)))

    print("training...")
    lr = args.lr
    val_watcher = ValidationWatcher(val_loss_fn, initial_val_loss,
                                    args.val_interval, args.workdir, lm)
    # The loaded model must beat its own initial loss before being saved.
    # (Removed a dead `best_val_loss = None` that was immediately overwritten.)
    best_val_loss = initial_val_loss

    optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta)
    patience_ticks = 0
    for epoch in range(1, args.epochs + 1):
        logger = ProgressLogger(epoch, args.log_interval, lr,
                                single_stream_len // args.target_seq_len)
        hidden = None
        for X, targets in train_data_stream:
            if hidden is None:
                hidden = lm.model.init_hidden(args.batch_size)
            # Detach hidden state so BPTT does not extend across batches.
            hidden = repackage_hidden(hidden)

            lm.train()
            output, hidden = lm.model(X, hidden)
            loss, nb_words = lm.decoder.neg_log_prob(output, targets)
            loss /= nb_words
            val_watcher.log_training_update(loss.data, nb_words)

            optim.zero_grad()
            loss.backward()
            # Use the in-place, non-deprecated variant of gradient clipping.
            torch.nn.utils.clip_grad_norm_(lm.parameters(), args.clip)
            optim.step()
            logger.log(loss.data)

        val_loss = val_loss_fn()
        print(epoch_summary(epoch, logger.nb_updates(), logger.time_since_creation(), val_loss))

        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_val_loss:
            torch.save(lm, args.save)
            best_val_loss = val_loss
            patience_ticks = 0
        else:
            patience_ticks += 1
            if patience_ticks > args.patience:
                lr /= 2.0
                if lr < args.min_lr:
                    print(
                        f"Learning has reached {lr}, training was supposed to stop at {args.min_lr}, stopping."
                    )
                    break
                # Propagate the decayed LR into the running optimizer.
                for p in optim.param_groups:
                    p['lr'] = lr
                patience_ticks = 0