def get_best_model(model_type: str) -> nn.Module:
    model: nn.Module = None
    if model_type == 'RNN':
        model = RNN(emb_size=200, hidden_size=1500, seq_len=35, batch_size=20,
                    vocab_size=vocab_size, num_layers=2, dp_keep_prob=0.35)
        model.load_state_dict(
            torch.load('./4_1_a/best_params.pt', map_location=device))
    elif model_type == 'GRU':
        model = GRU(emb_size=200, hidden_size=1500, seq_len=35, batch_size=20,
                    vocab_size=vocab_size, num_layers=2, dp_keep_prob=0.35)
        model.load_state_dict(
            torch.load('./4_1_b/best_params.pt', map_location=device))
    elif model_type == 'TRANSFORMER':
        model = TRANSFORMER(vocab_size=vocab_size, n_units=512, n_blocks=6,
                            dropout=1. - 0.9)
        model.batch_size = 128
        model.seq_len = 35
        model.vocab_size = vocab_size
        model.load_state_dict(
            torch.load('./4_1_c/best_params.pt', map_location=device))
    return model
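# Usage sketch for get_best_model (assumes `device`, `vocab_size`, and the
# checkpoint directories above already exist in this script; the model type
# string must be one of 'RNN', 'GRU', or 'TRANSFORMER'):
#
#     best_gru = get_best_model('GRU').to(device)
#     best_gru.eval()  # disable dropout before evaluation or generation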
def make_my_model(model_name, device, seq_len=35, batch_size=20, pt=None):
    # --model=RNN --optimizer=ADAM --initial_lr=0.0001 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
    # --model=GRU --optimizer=SGD_LR_SCHEDULE --initial_lr=10 --batch_size=20 --seq_len=35 --hidden_size=1500 --num_layers=2 --dp_keep_prob=0.35 --save_best
    # --model=TRANSFORMER --optimizer=SGD_LR_SCHEDULE --initial_lr=20 --batch_size=128 --seq_len=35 --hidden_size=512 --num_layers=6 --dp_keep_prob=0.9 --save_best
    if model_name == 'RNN':
        model = RNN(emb_size=200, hidden_size=1500, seq_len=seq_len,
                    batch_size=batch_size, vocab_size=vocab_size,
                    num_layers=2, dp_keep_prob=0.35)
    elif model_name == 'GRU':
        model = GRU(emb_size=200, hidden_size=1500, seq_len=seq_len,
                    batch_size=batch_size, vocab_size=vocab_size,
                    num_layers=2, dp_keep_prob=0.35)
    elif model_name == 'TRANSFORMER':
        model = TRANSFORMER(vocab_size=vocab_size, n_units=512, n_blocks=6,
                            dropout=1. - 0.9)
        # these 3 attributes don't affect the Transformer's computations;
        # they are only used in run_epoch
        model.batch_size = 128
        model.seq_len = 35
        model.vocab_size = vocab_size
    else:
        print("ERROR: Model type not recognized.")
        return

    # Model to device
    model = model.to(device)

    # Load pretrained parameters if a checkpoint path is given
    if pt is not None:
        model.load_state_dict(torch.load(pt, map_location=device))

    return model
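# Usage sketch for make_my_model (assumes `device` and `vocab_size` are
# defined in this script; the checkpoint path reuses the GRU directory from
# get_best_model above):
#
#     gru = make_my_model('GRU', device, pt='./4_1_b/best_params.pt')
#     gru.eval()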
def load_model(model_info, device, vocab_size, emb_size=200, load_on_device=True):
    params_path = model_info.get_params_path()

    if model_info.model == 'RNN':
        model = RNN(emb_size=emb_size, hidden_size=model_info.hidden_size,
                    seq_len=model_info.seq_len, batch_size=model_info.batch_size,
                    vocab_size=vocab_size, num_layers=model_info.num_layers,
                    dp_keep_prob=model_info.dp_keep_prob)
    elif model_info.model == 'GRU':
        model = GRU(emb_size=emb_size, hidden_size=model_info.hidden_size,
                    seq_len=model_info.seq_len, batch_size=model_info.batch_size,
                    vocab_size=vocab_size, num_layers=model_info.num_layers,
                    dp_keep_prob=model_info.dp_keep_prob)
    else:
        model = TRANSFORMER(vocab_size=vocab_size, n_units=model_info.hidden_size,
                            n_blocks=model_info.num_layers,
                            dropout=1. - model_info.dp_keep_prob)
        model.batch_size = model_info.batch_size
        model.seq_len = model_info.seq_len
        model.vocab_size = vocab_size

    if load_on_device:
        model = model.to(device)

    model.load_state_dict(torch.load(params_path, map_location=device))
    return model
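# Usage sketch for load_model. `model_info` is assumed to be a small config
# object exposing .model, .hidden_size, .seq_len, .batch_size, .num_layers,
# .dp_keep_prob and .get_params_path(), as accessed above:
#
#     model = load_model(model_info, device, vocab_size)
#     model.eval()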
    if args.debug:
        # use a very small model
        model = TRANSFORMER(vocab_size=vocab_size, n_units=16, n_blocks=2)
    else:
        # Note that we're using num_layers and hidden_size to mean slightly
        # different things here than in the RNNs.
        # The Transformer also has other hyperparameters (such as the number
        # of attention heads) which can change its behavior.
        model = TRANSFORMER(vocab_size=vocab_size, n_units=args.hidden_size,
                            n_blocks=args.num_layers,
                            dropout=1. - args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size = args.batch_size
    model.seq_len = args.seq_len
    model.vocab_size = vocab_size
else:
    print("Model type not recognized.")

model = model.to(device)

# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE SCHEDULE
lr = args.initial_lr
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0  # we will not touch lr for the first m_flat_lr epochs
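# Sketch of how lr_decay_base and m_flat_lr are typically applied inside the
# epoch loop for the SGD_LR_SCHEDULE optimizer (an assumption about the usual
# formulation, not copied from this script): the learning rate stays flat for
# the first m_flat_lr epochs, then decays geometrically by lr_decay_base.
#
#     for epoch in range(num_epochs):
#         if args.optimizer == 'SGD_LR_SCHEDULE':
#             lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
#             lr = lr * lr_decay
#         # ... run training for this epoch with the current lr ...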