                    help='batch size')
parser.add_argument('--target-seq-len', type=int, default=35,
                    help='sequence length')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--load', type=str, required=True,
                    help='where to load a model from')
args = parser.parse_args()
print(args)

init_seeds(args.seed, args.cuda)

print("loading model...")
lm = torch.load(args.load, map_location='cpu')
if args.cuda:
    lm.cuda()
print(lm)

print("preparing data...")
tokenize_regime = 'words'
if args.characters:
    tokenize_regime = 'chars'

ids = tokens_from_fn(args.data, lm.vocab, randomize=False,
                     regime=tokenize_regime)
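# The remainder of this evaluation script is not reproduced here. Judging by
# the data pipeline the training script below builds for its validation set,
# it presumably continues roughly as follows (a sketch, not the original code;
# the batching and reporting details are assumptions):
#
#     batched = batchify(ids, args.batch_size, args.cuda)
#     data_tb = TemporalSplits(batched,
#                              nb_inputs_necessary=lm.model.in_len,
#                              nb_targets_parallel=args.target_seq_len)
#     data = TransposeWrapper(data_tb)
#     loss = evaluate_(lm, data, use_ivecs=False, custom_batches=False)
#     print('Perplexity {:.2f}'.format(math.exp(loss)))
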
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=str, required=True,
                        help='location of the train corpus')
    parser.add_argument('--valid', type=str, required=True,
                        help='location of the valid corpus')
    parser.add_argument('--characters', action='store_true',
                        help='work on character level, whitespace is significant')
    parser.add_argument('--shuffle-lines', action='store_true',
                        help='shuffle lines before every epoch')
    parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                        help='batch size')
    parser.add_argument('--target-seq-len', type=int, default=35,
                        help='sequence length')
    parser.add_argument('--lr', type=float, default=20,
                        help='initial learning rate')
    parser.add_argument('--beta', type=float, default=0,
                        help='L2 regularization penalty')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='report interval')
    parser.add_argument('--val-interval', type=int, default=1000000, metavar='N',
                        help='validation interval in number of tokens')
    parser.add_argument('--val-report',
                        help='where to put validation report')
    parser.add_argument('--load', type=str, required=True,
                        help='where to load a model from')
    parser.add_argument('--save', type=str, required=True,
                        help='path to save the final model')
    args = parser.parse_args()
    print(args)

    init_seeds(args.seed, args.cuda)

    print("loading model...")
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("preparing data...")
    tokenize_regime = 'words'
    if args.characters:
        tokenize_regime = 'chars'

    # Tokenize both corpora with the vocabulary of the loaded model and wrap
    # them so they yield (input, target) pairs of the requested sequence length.
    train_ids = tokens_from_fn(args.train, lm.vocab, randomize=False,
                               regime=tokenize_regime)
    train_batched = batchify(train_ids, args.batch_size, args.cuda)
    train_data_tb = TemporalSplits(train_batched,
                                   nb_inputs_necessary=lm.model.in_len,
                                   nb_targets_parallel=args.target_seq_len)
    train_data = TransposeWrapper(train_data_tb)

    valid_ids = tokens_from_fn(args.valid, lm.vocab, randomize=False,
                               regime=tokenize_regime)
    valid_batched = batchify(valid_ids, 10, args.cuda)
    valid_data_tb = TemporalSplits(valid_batched,
                                   nb_inputs_necessary=lm.model.in_len,
                                   nb_targets_parallel=args.target_seq_len)
    valid_data = TransposeWrapper(valid_data_tb)

    def val_loss_fn(lm):
        return evaluate_(lm, valid_data, use_ivecs=False, custom_batches=False)

    initial_val_loss = val_loss_fn(lm)
    print('Initial perplexity {:.2f}'.format(math.exp(initial_val_loss)))

    print("training...")
    lr = args.lr
    best_val_loss = None

    if args.val_report is not None:
        val_report_f = open(args.val_report, 'w', buffering=1)
    else:
        val_report_f = sys.stdout

    val_watcher = ValidationWatcher(lambda: val_loss_fn(lm), initial_val_loss,
                                    args.val_interval, val_report_f)

    optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta)

    for epoch in range(1, args.epochs + 1):
        logger = ProgressLogger(epoch, args.log_interval, lr,
                                len(train_batched) // args.target_seq_len)
        hidden = None

        for X, targets in train_data:
            X = X.t()
            targets = targets.t().contiguous()

            if hidden is None:
                hidden = lm.model.init_hidden(args.batch_size)
            # Detach the hidden state from the previous step's graph so that
            # backpropagation is truncated to the current sequence.
            hidden = repackage_hidden(hidden)

            lm.train()
            output, hidden = lm.model(X, hidden)
            loss, nb_words = lm.decoder.neg_log_prob(output, targets)
            loss /= nb_words

            val_watcher.log_training_update(loss.data, nb_words)

            optim.zero_grad()
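            # Backpropagate the per-token negative log-likelihood, clip the
            # gradient norm to --clip, and take one SGD step.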
            loss.backward()
            torch.nn.utils.clip_grad_norm(lm.parameters(), args.clip)
            optim.step()

            logger.log(loss.data)

        val_loss = val_loss_fn(lm)
        print(epoch_summary(epoch, logger.nb_updates(),
                            logger.time_since_creation(), val_loss))

        # Save the model if the validation loss is the best we've seen so far,
        # otherwise halve the learning rate.
        if not best_val_loss or val_loss < best_val_loss:
            torch.save(lm, args.save)
            best_val_loss = val_loss
        else:
            lr /= 2.0
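
# The listing does not show an entry point; presumably the script ends with
# the usual guard (an assumption, not part of the listing above):
if __name__ == '__main__':
    main()

# Example invocation (script and file names are placeholders):
#   python train.py --train train.txt --valid valid.txt \
#       --load model.init.lm --save model.best.lm --cuda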