def train_model(device,
                model,
                train_sampler,
                val_sampler,
                test_sampler,
                num_epochs,
                lr,
                exp_name,
                partial_run_epoch,
                partial_decode_single,
                lr_annealing_rate=1.0,
                n_teacher_forcing=1,
                save_folder=''):
    """
    Training pipeline for model

    Arguments:
        device {torch.device} -- Device on which data is stored/operated
        model {nn.Module} -- Model (with `forward` function)
        train_sampler {BatchSampler} -- Sampler to produce tensor batches of training data
        val_sampler {BatchSampler} -- Sampler to produce tensor batches of validation data
        test_sampler {BatchSampler} -- Sampler to produce tensor batches of testing data
        num_epochs {int} -- Maximum # epochs
        lr {float} -- Starting learning rate
        exp_name {str} -- Experiment name
        partial_run_epoch {func} -- `run_epoch` function, with train/epoch-invariant
            arguments already filled in via `functools.partial`
        partial_decode_single {func} -- `decode_single` function, with train/epoch-invariant
            arguments already filled in via `functools.partial`

    Keyword Arguments:
        lr_annealing_rate {float} -- Factor by which to scale the learning rate when
            validation perplexity increases (default: {1.0})
        n_teacher_forcing {int} -- Number of epochs to conduct teacher forcing (default: {1})
        save_folder {str} -- Location to which to save models (default: {''})

    Returns:
        list -- validation perplexities across epochs
        float -- final test perplexity
    """
    start_train = datetime.now()
    current_lr = lr

    # Making sure the folder exists
    if save_folder and not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Model serialization
    model_loc = os.path.join(save_folder, 'model_{}'.format(exp_name))
    print(
        '{} - ******************* Training model with {} epochs of teacher forcing, starting LR {}, saving to {}'
        .format(datetime.now() - start_train, n_teacher_forcing, current_lr,
                model_loc))

    # optionally add label smoothing; see the Annotated Transformer
    criterion = nn.NLLLoss(reduction="sum", ignore_index=PAD_INDEX)
    optim = torch.optim.Adam(model.parameters(),
                             lr=current_lr,
                             weight_decay=1e-5)

    dev_perplexities = []
    best_perplexity_thus_far = float('inf')
    best_model_save_loc = ''

    for epoch in range(num_epochs):
        model.train()

        # curriculum learning with teacher forcing
        force_teaching = epoch < n_teacher_forcing
        print('[{} - Epoch {}] START EPOCH{} | {} model has {:,} parameters'.
              format(
                  datetime.now() - start_train, epoch,
                  ' (Teacher Forcing for {} epochs)'.format(n_teacher_forcing)
                  if force_teaching else '', exp_name,
                  count_parameters(model)))
        train_perplexity = partial_run_epoch(device=device,
                                             model=model,
                                             sampler=train_sampler,
                                             loss_compute=SimpleLossCompute(
                                                 criterion, optim),
                                             teacher_forcing=force_teaching)
        print('***************** Train perplexity: %f' % train_perplexity)

        # Save candidate model for this epoch
        candidate_loc = model_loc + 'CANDIDATE.pt'
        torch.save(model.state_dict(), candidate_loc)
        print('[{} -^^^^^^^^^^^^^^^^^ Epoch {}] Saved candidate model to {}'.
              format(datetime.now() - start_train, epoch, candidate_loc))

        # VALIDATION
        model.eval()
        with torch.no_grad():
            dev_perplexity = partial_run_epoch(device=device,
                                               model=model,
                                               sampler=val_sampler,
                                               loss_compute=SimpleLossCompute(
                                                   criterion, None),
                                               teacher_forcing=True)

        # Early stopping - compare with prior perplexity
        prior_perplexity = dev_perplexities[
            -1] if dev_perplexities else float('inf')
        print("Validation perplexity: %f" % dev_perplexity)
        dev_perplexities.append(dev_perplexity)

        # Pick the first recipe and decode it as an example
        decode_output = partial_decode_single(device=device,
                                              model=model,
                                              sampler=val_sampler)
        recipe_str = decode_output[-1]
        print('[{} - Epoch {}] Decoded recipe from validation:'.format(
            datetime.now() - start_train, epoch))
        print(recipe_str)

        # If validation perplexity doesn't go down, we either anneal or stop
        if dev_perplexity > prior_perplexity:
            if lr_annealing_rate == 0.0 or current_lr < 1e-12:
                # Early stoppage
                print('[{} - Epoch {}] EARLY STOPPAGE'.format(
                    datetime.now() - start_train, epoch))
                break
            elif lr_annealing_rate != 1.0:
                # No annealing if 1.0
                new_lr = current_lr * lr_annealing_rate
                print(
                    '[{} - Epoch {}] Annealing: changed LR from {:.5f} to {:.5f}'
                    .format(datetime.now() - start_train, epoch, current_lr,
                            new_lr))
                current_lr = new_lr
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                continue

        # Save improved model
        if dev_perplexity < best_perplexity_thus_far:
            best_perplexity_thus_far = dev_perplexity
            best_model_save_loc = model_loc + '_e{}.pt'.format(epoch)
            torch.save(model.state_dict(), best_model_save_loc)
            print(
                '[{} -*********** Epoch {}] Saved model to {} ****************'
                .format(datetime.now() - start_train, epoch,
                        best_model_save_loc))

    # TESTING
    model.load_state_dict(torch.load(best_model_save_loc))
    model = model.to(device)
    print('{} - Loaded best model from {}'.format(
        datetime.now() - start_train, best_model_save_loc))

    model.eval()
    with torch.no_grad():
        test_perplexity = partial_run_epoch(device=device,
                                            model=model,
                                            sampler=test_sampler,
                                            loss_compute=SimpleLossCompute(
                                                criterion, None),
                                            teacher_forcing=True)
    print("Test perplexity: {:.4f}".format(test_perplexity))

    # Pick the first recipe and decode it as an example
    decode_output = partial_decode_single(device=device,
                                          model=model,
                                          sampler=test_sampler)
    recipe_str = decode_output[-1]
    print(
        '[{} - *********** Testing ******************** Epoch {}] Decoded recipe from test set:'
        .format(datetime.now() - start_train, epoch))
    print(recipe_str)

    return dev_perplexities, test_perplexity
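

# Hedged sketch (not the repo's actual implementations): `count_parameters` and
# `SimpleLossCompute` are assumed to be defined elsewhere in this project; the
# underscore-prefixed stand-ins below only illustrate the contract `train_model`
# relies on. `_SketchSimpleLossCompute` loosely follows the Annotated Transformer's
# loss-compute pattern: summed NLL over non-pad tokens, with an optional optimizer
# step when an optimizer is supplied (it is omitted during validation/testing).
def _sketch_count_parameters(model):
    # Trainable parameter count, as printed in the per-epoch banner above.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


class _SketchSimpleLossCompute:
    """Illustrative stand-in for `SimpleLossCompute` (exact signature is an assumption)."""

    def __init__(self, criterion, opt=None):
        self.criterion = criterion  # e.g. nn.NLLLoss(reduction='sum', ignore_index=PAD_INDEX)
        self.opt = opt              # None during validation / testing

    def __call__(self, log_probs, targets):
        # log_probs: (N, vocab) log-probabilities; targets: (N,) gold token ids
        loss = self.criterion(log_probs, targets)
        if self.opt is not None:
            loss.backward()
            self.opt.step()
            self.opt.zero_grad()
        return loss.item()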
def create_model(
    vocab_emb_dim,
    calorie_emb_dim,
    n_items_w_pad,
    hidden_size,
    n_layers,
    dropout=0.0,
    max_ingr=20,
    max_ingr_tok=20,
    use_cuda=True,
    state_dict_path=None,
    decode_name=False,
    ingr_gru=False,
    ingr_emb=False,
    num_ingr=None,
    ingr_emb_dim=None,
    shared_projection=False,
    item_emb=False,
    item_emb_dim=None,
):
    """
    Instantiates a model

    Arguments:
        vocab_emb_dim {int} -- Embedding dimension for vocabulary (ingredients, steps)
        calorie_emb_dim {int} -- Embedding dimension for calorie levels
        n_items_w_pad {int} -- Number of unique items, including padding items
        hidden_size {int} -- Size of hidden layers
        n_layers {int} -- Number of decoder RNN layers

    Keyword Arguments:
        dropout {float} -- Dropout rate (default: {0.0})
        max_ingr {int} -- Maximum # ingredients/recipe (default: {20})
        max_ingr_tok {int} -- Maximum # tokens/ingredient (default: {20})
        use_cuda {bool} -- Whether to use CUDA (default: {True})
        state_dict_path {str} -- If provided, loads pretrained model weights from here (default: {None})
        decode_name {bool} -- Build a separate NameDecoder for recipe names (default: {False})
        ingr_gru {bool} -- Encode ingredient token sequences with a GRU (default: {False})
        ingr_emb {bool} -- Use a learned embedding over ingredient IDs (default: {False})
        num_ingr {int} -- Number of unique ingredients; required if `ingr_emb` (default: {None})
        ingr_emb_dim {int} -- Embedding dimension for ingredient IDs; required if `ingr_emb` (default: {None})
        shared_projection {bool} -- Use the same projection layer for name & steps (default: {False})
        item_emb {bool} -- Use a learned embedding over item IDs in the decoder (default: {False})
        item_emb_dim {int} -- Embedding dimension for item IDs; required if `item_emb` (default: {None})

    Returns:
        nn.Module -- Loaded Encoder-Decoder model
    """
    start = datetime.now()

    # Create the model
    proj_layer = nn.Linear(hidden_size, VOCAB_SIZE, bias=False)
    vocab_emb_layer = nn.Embedding(VOCAB_SIZE, embedding_dim=vocab_emb_dim)
    calorie_emb_layer = nn.Embedding(5, embedding_dim=calorie_emb_dim)
    if ingr_emb:
        ingr_emb_layer = nn.Embedding(num_ingr, embedding_dim=ingr_emb_dim)
    else:
        ingr_emb_layer = None
    if item_emb:
        item_emb_layer = nn.Embedding(n_items_w_pad,
                                      embedding_dim=item_emb_dim)
    else:
        item_emb_layer = None

    # Encoder
    encoder = Encoder(
        vocab_embedding_layer=vocab_emb_layer,
        calorie_embedding_layer=calorie_emb_layer,
        ingr_embedding_layer=ingr_emb_layer,
        hidden_size=hidden_size,
        max_ingrs=max_ingr,
        max_ingr_tokens=max_ingr_tok,
        dropout=dropout,
        ingr_gru=ingr_gru,
        gru_layers=n_layers,
    )

    # Decoder
    decoder = PersonalItemDecoder(
        vocab_embedding_layer=vocab_emb_layer,
        item_embedding_layer=item_emb_layer,
        hidden_size=hidden_size,
        gru_layers=n_layers,
        dropout=dropout,
        ingr_encoded_size=encoder.ingr_encoded_size,
        calorie_encoded_size=encoder.calorie_encoded_size,
        name_encoded_size=encoder.name_encoded_size,
        proj_layer=proj_layer,
    )

    # Total model
    if decode_name:
        name_decoder = NameDecoder(
            vocab_embedding_layer=vocab_emb_layer,
            hidden_size=hidden_size,
            gru_layers=n_layers,
            dropout=dropout,
            proj_layer=proj_layer if shared_projection else nn.Linear(
                hidden_size, VOCAB_SIZE, bias=False),
        )
        model = PersonalItemEncoderDecoder(encoder,
                                           decoder,
                                           name_decoder=name_decoder)
    else:
        model = PersonalItemEncoderDecoder(encoder, decoder)

    if use_cuda:
        model = model.cuda()
    print('{} - Constructed model skeleton'.format(datetime.now() - start))

    if state_dict_path is not None:
        # Load pretrained model state dictionary
        state_dict = torch.load(state_dict_path)
        model.load_state_dict(state_dict, strict=True)

    print('{} - Created {} model with {:,} parameters'.format(
        datetime.now() - start, model.__class__.__name__,
        count_parameters(model)))
    print(model)
    print('\n\n')

    return model
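

# Illustrative usage sketch: the hyperparameter values below are placeholders
# (assumptions), not the experiment's actual settings; the function only shows the
# full call shape of `create_model`, including the leading positional arguments.
def _example_create_model():
    return create_model(
        vocab_emb_dim=256,       # token embedding size (placeholder)
        calorie_emb_dim=16,      # calorie-level embedding size (placeholder)
        n_items_w_pad=10000,     # unique items incl. padding (placeholder)
        hidden_size=512,
        n_layers=2,
        dropout=0.1,
        use_cuda=False,          # keep the example on CPU
        decode_name=True,        # also build the name decoder
        shared_projection=True,  # share the vocab projection between name & steps
    )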
    hidden_size=hidden_size,
    n_layers=n_layers,
    dropout=dropout,
    max_ingr=MAX_INGR,
    max_ingr_tok=MAX_INGR_TOK,
    use_cuda=USE_CUDA,
    state_dict_path=checkpoint_loc,
    ingr_gru=ingr_gru,
    decode_name=decode_name,
    ingr_emb=ingr_emb,
    num_ingr=N_INGREDIENTS,
    ingr_emb_dim=ingr_emb_dim,
    shared_projection=shared_proj,
)
print('{} - {} Model defined with {:,} parameters'.format(
    datetime.now() - start, exp_name, count_parameters(model)))

''' TRAIN MODEL '''
partial_run_epoch = partial(run_epoch,
                            user_items_df=user_items_df,
                            print_every=print_every,
                            max_len=MAX_STEP_TOK,
                            max_name_len=MAX_NAME,
                            clip=clip,
                            **memory_tensor_map)
partial_decode_single = partial(decode_single,
                                user_items_df=user_items_df,
                                max_len=MAX_STEP_TOK,
                                max_name_len=MAX_NAME,
                                ingr_map=ingr_map,