def __init__(self, params):
    self.trainer, self.early_stopping = None, None

    m = LM(params['emb_dim'], params['hid_dim'], d,
           num_layers=params['num_layers'], cell=params['cell'],
           dropout=params['dropout'], train_init=params['train_init'],
           deepout_layers=params['deepout_layers'],
           maxouts=params['maxouts'],
           word_dropout=params['word_dropout'])
    u.initialize_model(m)

    optimizer = getattr(optim, args.optim)(m.parameters(), lr=args.lr)
    self.early_stopping = EarlyStopping(5, patience=3)

    def early_stop_hook(trainer, epoch, batch_num, num_checkpoints):
        valid_loss = trainer.validate_model()
        self.early_stopping.add_checkpoint(sum(valid_loss.pack()))

    trainer = Trainer(
        m, {"train": train, "test": test, "valid": valid},
        optimizer, max_norm=args.max_norm)
    trainer.add_hook(early_stop_hook, hooks_per_epoch=5)
    trainer.add_loggers(StdLogger())
    self.trainer = trainer
def from_lm(cls, lm, embeddings=None, **kwargs):
    if embeddings is not None:
        if embeddings.weight.size(1) != lm.embeddings.weight.size(1):
            raise ValueError("Incompatible embedding matrices")
        # Initialize embeddings to random values to account for OOVs,
        # or use the unknown embedding from the LM instead if available
        vocab, unk = len(embeddings.d), lm.embeddings.d.get_unk()
        if unk is not None:
            embeddings.weight.data.copy_(
                lm.embeddings.weight.data[unk].unsqueeze(0).expand(
                    vocab, embeddings.embedding_dim))
        else:
            import seqmod.utils as u
            u.initialize_model(embeddings)
        found, target = 0, {w: idx for idx, w in enumerate(lm.embeddings.d.vocab)}
        for idx, w in enumerate(embeddings.d.vocab):
            if w not in target:
                continue
            found += 1
            embeddings.weight.data[idx].copy_(
                lm.embeddings.weight.data[target[w]])
        logging.warning("Initialized [%d/%d] embs from LM" % (found, vocab))
    else:
        logging.warning("Reusing LM embedding vocabulary. This vocabulary might not "
                        "correspond to the input data if it wasn't processed with "
                        "the same Dict")
        embeddings = lm.embeddings

    hid_dim, layers = lm.rnn.hidden_size, lm.rnn.num_layers
    cell, bidi = type(lm.rnn).__name__, kwargs.pop('bidi', False)
    if bidi:
        logging.warning('Cannot initialize bidirectional layers from a sequential LM. '
                        'The bidirectional option will be ignored')

    inst = cls(embeddings, hid_dim, layers, cell, bidi=False, **kwargs)
    for param, weight in inst.rnn.named_parameters():
        weight.data.copy_(getattr(lm.rnn, param).data)

    return inst
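# Hedged sketch (illustration only, not repo code): the parameter-copy loop at
# the end of `from_lm` relies on the source and target RNNs having identically
# named parameters, which holds when both use the same cell type, sizes and
# number of layers. The same pattern in plain PyTorch, standalone:
import torch.nn as nn

src_rnn = nn.GRU(10, 20, num_layers=1)
tgt_rnn = nn.GRU(10, 20, num_layers=1)
for name, weight in tgt_rnn.named_parameters():
    # getattr works because RNN-style modules expose their parameters
    # (weight_ih_l0, bias_hh_l0, ...) as attributes under those same names
    weight.data.copy_(getattr(src_rnn, name).data)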
    reuse_hidden=args.att_type.lower() != 'none',
    dropout=args.dropout, input_feed=args.input_feed,
    word_dropout=args.word_dropout, deepout_layers=args.deepout_layers,
    tie_weights=args.tie_weights, reverse=args.reverse)

# model.freeze_submodule('encoder')
# model.encoder.register_backward_hook(u.log_grad)
# model.decoder.register_backward_hook(u.log_grad)

u.initialize_model(
    model, rnn={'type': 'orthogonal', 'args': {'gain': 1.0}})

optimizer = getattr(optim, args.optim)(model.parameters(), lr=args.lr)

print(model)
print()
print('* number of parameters: {}'.format(model.n_params()))

model.to(device=args.device)

early_stopping = EarlyStopping(args.patience)
trainer = Trainer(model, {
    'train': train,
print("Building model...") model = SequenceVAE(args.emb_dim, args.hid_dim, args.z_dim, train.d['src'], num_layers=args.num_layers, cell=args.cell, dropout=args.dropout, add_z=args.add_z, word_dropout=args.word_dropout, tie_weights=args.tie_weights, project_init=args.project_init, inflection_point=args.inflection_point) print(model) u.initialize_model(model) if args.load_embeddings: weight = load_embeddings(train.d['src'].vocab, args.flavor, args.suffix, '~/data/word_embeddings') model.init_embeddings(weight) if args.gpu: model.cuda() def on_lr_update(old_lr, new_lr): trainer.log("info", "Resetting lr [%g -> %g]" % (old_lr, new_lr)) optimizer = Optimizer( model.parameters(), args.optim,
    data, d, args.batch_size, args.bptt, device=args.device,
).splits(test=args.test_split, dev=args.dev_split)

print(' * vocabulary size. {}'.format(len(d)))
print(' * number of train batches. {}'.format(len(train)))

print('Building model...')
m = LM(args.emb_dim, args.hid_dim, d, exposure_rate=args.schedule_init,
       num_layers=args.num_layers, cell=args.cell, dropout=args.dropout,
       att_dim=args.att_dim, tie_weights=args.tie_weights,
       mixtures=args.mixtures, deepout_layers=args.deepout_layers,
       train_init=args.train_init, deepout_act=args.deepout_act,
       maxouts=args.maxouts, sampled_softmax=args.sampled_softmax,
       word_dropout=args.word_dropout)

u.initialize_model(
    m,
    rnn={'type': 'orthogonal_', 'args': {'gain': 1.0}},
    emb={'type': 'uniform_', 'args': {'a': -0.05, 'b': 0.05}})

m.to(device=args.device)

print(m)
print('* number of parameters: {}'.format(m.n_params()))

if args.optim == 'Adam':
    optimizer = getattr(optim, args.optim)(
        m.parameters(), lr=args.lr, betas=(0., 0.99), eps=1e-5)
else:
    optimizer = getattr(optim, args.optim)(m.parameters(), lr=args.lr)

# create trainer
loss_type = 'bpc' if args.level == 'char' else 'ppl'
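# Hedged sketch (an assumption, not the actual seqmod implementation): the
# rnn=/emb= specs passed to `initialize_model` above use `torch.nn.init`
# function names ('orthogonal_', 'uniform_'), so a spec can be read as a
# by-name dispatch roughly like this; `apply_init_spec` is hypothetical.
import torch
import torch.nn.init as init

def apply_init_spec(tensor, spec):
    # e.g. spec = {'type': 'uniform_', 'args': {'a': -0.05, 'b': 0.05}}
    getattr(init, spec['type'])(tensor, **spec['args'])

weight = torch.empty(100, 50)
apply_init_spec(weight, {'type': 'uniform_', 'args': {'a': -0.05, 'b': 0.05}})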
    word_dropout=args.word_dropout, reuse_hidden=False, input_feed=False,
    att_type=None, cond_dims=cond_dims, cond_vocabs=cond_vocabs,
    add_init_jitter=True, train_init=args.train_init, reverse=args.reverse)

losses, weights = ('ppl',), None

print(m)
print('* number of params: ', sum(p.nelement() for p in m.parameters()))

u.initialize_model(m)

if args.init_embeddings:
    m.encoder.embeddings.init_embeddings_from_file(
        args.embeddings_path, verbose=True)

m.to(args.device)

optimizer = getattr(optim, args.optimizer)(m.parameters(), lr=args.lr)
# Decrease lr by a fixed factor every `lr_schedule_epochs` epochs
scheduler = optim.lr_scheduler.StepLR(
    optimizer, args.lr_schedule_epochs, args.lr_schedule_factor)

model_name = 'AE.GRL{}.C{}'.format(str(args.grl), str(conditional))

trainer = Trainer(m, {
    'train': train,
    'valid': valid
    args.mode, cell=args.cell, hid_dim=args.hid_dim,
    num_layers=args.num_layers, summary=args.summary,
    softmax=args.softmax, dropout=args.dropout)

print("Initializing parameters ...")
utils.initialize_model(
    m,
    rnn={'type': 'rnn_orthogonal', 'args': {'forget_bias': True}},
    emb={'type': 'uniform_', 'args': {'a': -0.1, 'b': 0.1}})

if args.init_embeddings:
    embeddings.init_embeddings_from_file(args.embeddings_path, verbose=True)

m.to(device=args.device)

optimizer = getattr(optim, args.optim)(m.parameters(), lr=args.lr)

losses = [{
def main():
    parser = argparse.ArgumentParser()
    # dataset
    parser.add_argument('--input', type=str, default='data')
    parser.add_argument('--min_item_freq', type=int, default=50)
    parser.add_argument('--max_vocab_size', type=int, default=20000)
    parser.add_argument('--min_len', default=1, type=int)
    parser.add_argument('--max_len', default=15, type=int)
    parser.add_argument('--dev', default=0.1, type=float)
    parser.add_argument('--rnd_seed', default=12345, type=int)
    parser.add_argument('--max_items', default=None, type=int)
    parser.add_argument('--task', default='sentences', type=str)
    parser.add_argument('--level', default='word', type=str)
    parser.add_argument('--focus_size', default=15, type=int)
    parser.add_argument('--left_size', default=15, type=int)
    parser.add_argument('--right_size', default=15, type=int)
    parser.add_argument('--shingle_stride', default=None, type=int)
    parser.add_argument('--allow_overlap', action='store_true', default=False)
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--tokenize', action='store_true', default=False)
    parser.add_argument('--grow', action='store_true')
    parser.add_argument('--grow_n_epochs', default=1, type=int)
    # training
    parser.add_argument('--epochs', default=5, type=int)
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--batch_size', default=30, type=int)
    parser.add_argument('--optim', default='Adam', type=str)
    parser.add_argument('--lr', default=0.0003, type=float)
    parser.add_argument('--max_norm', default=10., type=float)
    parser.add_argument('--dropout', default=0.25, type=float)
    parser.add_argument('--word_dropout', default=0.0, type=float)
    parser.add_argument('--use_schedule', action='store_true')
    parser.add_argument('--patience', default=10, type=int)
    parser.add_argument('--reverse', action='store_true')
    parser.add_argument('--batches_for_checkpoint', default=50, type=int)
    parser.add_argument('--checkpoints_for_hooks', default=1, type=int)
    parser.add_argument('--target', default='Ze was', type=str)
    parser.add_argument('--bidi', action='store_true')
    parser.add_argument('--beam', action='store_true')
    parser.add_argument('--plot', action='store_true')
    parser.add_argument('--json', type=str, default='history.json')
    # model
    parser.add_argument('--model_path', default='./model_storage', type=str)
    parser.add_argument('--num_layers', default=1, type=int)
    parser.add_argument('--emb_dim', default=64, type=int)
    parser.add_argument('--hid_dim', default=150, type=int)
    parser.add_argument('--cell', default='GRU')
    parser.add_argument('--train_init', action='store_true')
    parser.add_argument('--add_init_jitter', action='store_true')
    parser.add_argument('--encoder-summary', default='inner-attention')
    parser.add_argument('--deepout_layers', type=int, default=0)
    parser.add_argument('--att_type', type=str, default='none')
    args = parser.parse_args()

    if args.task == 'sentences' and args.level == 'word':
        args.target = [t.lower() for t in word_tokenize(args.target)]
    elif args.task == 'sentences' and args.level == 'char':
        args.target = tuple(args.target.lower())

    train, valid, vocab_dict = uz.shingle_dataset(args, vocab_dict=None)
    print(f' * vocabulary size {len(vocab_dict)}')
    print(f' * number of train batches {len(train)}')
    print(f' * number of dev batches {len(valid)}')
    print(f' * maximum batch size {args.batch_size}')

    model = make_skipthoughts_model(
        num_layers=args.num_layers, emb_dim=args.emb_dim,
        hid_dim=args.hid_dim, src_dict=vocab_dict, cell=args.cell,
        bidi=args.bidi, encoder_summary=args.encoder_summary,
        att_type=args.att_type, task=args.task, tie_weights=False)

    u.initialize_model(
        model, rnn={'type': 'orthogonal', 'args': {'gain': 1.0}})

    optimizer = getattr(optim, args.optim)(
        model.parameters(), lr=args.lr)  # , amsgrad=True)

    print(model)
    print('* number of parameters: {}'.format(model.n_params()))

    if args.gpu:
        model.cuda()

    early_stopping = EarlyStopping(patience=args.patience, maxsize=1)
    trainer = SkipthoughtsTrainer(
        model, {'train': train, 'valid': valid}, optimizer,
        early_stopping=early_stopping, max_norm=args.max_norm)

    if args.json:
        logger = JsonLogger(json_file=args.json)
    else:
        logger = StdLogger()
    trainer.add_loggers(logger)
    trainer.set_additional_params(args, vocab_dict)

    hook = make_translation_hook(args.target, args.gpu, beam=args.beam,
                                 max_len=args.right_size)
    trainer.add_hook(hook, num_checkpoints=args.checkpoints_for_hooks)

    # hook = u.make_schedule_hook(
    #     inflection_sigmoid(len(train) * 2, 1.75, inverse=True))
    # trainer.add_hook(hook, num_checkpoints=args.checkpoints_for_hooks)

    (best_model, valid_loss), test_loss = trainer.train(
        args.epochs, args.batches_for_checkpoint, shuffle=True,
        use_schedule=args.use_schedule)

    u.save_checkpoint(args.model_path, best_model, vars(args), d=vocab_dict,
                      ppl=valid_loss, suffix='final')