Example #1
def train_model(device,
                model,
                train_sampler,
                val_sampler,
                test_sampler,
                num_epochs,
                lr,
                exp_name,
                partial_run_epoch,
                partial_decode_single,
                lr_annealing_rate=1.0,
                n_teacher_forcing=1,
                save_folder=''):
    """
    Training pipeline for model

    Arguments:
        device {torch.device} -- Device on which data is stored/operated
        model {nn.Module} -- Model (with `forward` function)
        train_sampler {BatchSampler} -- Sampler to produce tensor batches of training data
        val_sampler {BatchSampler} -- Sampler to produce tensor batches of validation data
        test_sampler {BatchSampler} -- Sampler to produce tensor batches of testing data
        num_epochs {int} -- Maximum # epochs
        lr {float} -- Starting learning rate
        exp_name {str} -- Experiment name
        partial_run_epoch {func} -- `run_epoch` function, with train/epoch-invariant arguments
            already filled in via `functools.partial`
        partial_decode_single {func} -- `decode_single` function, with train/epoch-invariant
            arguments already filled in via `functools.partial`

    Keyword Arguments:
        lr_annealing_rate {float} -- Factor by which to scale the learning rate when validation perplexity increases (default: {1.0})
        n_teacher_forcing {int} -- Number of epochs to conduct teacher forcing (default: {1})
        save_folder {str} -- Location to which to save models (default: {''})

    Returns:
        list -- validation perplexities across epochs
        float -- final test perplexity
    """
    start_train = datetime.now()
    current_lr = lr

    # Make sure the save folder exists before writing checkpoints
    if save_folder and not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Model serialization
    model_loc = os.path.join(save_folder, 'model_{}'.format(exp_name))
    print(
        '{} - *******************  Training model with {} epochs of teacher forcing, starting LR {}, saving to {}'
        .format(datetime.now() - start_train, n_teacher_forcing, current_lr,
                model_loc))

    # optionally add label smoothing; see the Annotated Transformer
    criterion = nn.NLLLoss(reduction="sum", ignore_index=PAD_INDEX)
    optim = torch.optim.Adam(model.parameters(),
                             lr=current_lr,
                             weight_decay=1e-5)

    dev_perplexities = []
    best_perplexity_thus_far = float('inf')
    best_model_save_loc = ''
    for epoch in range(num_epochs):
        model.train()
        # Curriculum learning: teacher-force for the first n_teacher_forcing epochs
        force_teaching = epoch < n_teacher_forcing
        print('[{} - Epoch {}] START EPOCH{} | {} model has {:,} parameters'.
              format(
                  datetime.now() - start_train, epoch,
                  ' (Teacher Forcing for {} epochs)'.format(n_teacher_forcing)
                  if force_teaching else '', exp_name,
                  count_parameters(model)))
        train_perplexity = partial_run_epoch(device=device,
                                             model=model,
                                             sampler=train_sampler,
                                             loss_compute=SimpleLossCompute(
                                                 criterion, optim),
                                             teacher_forcing=force_teaching)

        print('***************** Train perplexity: %f' % train_perplexity)

        # Save a candidate checkpoint after every epoch
        candidate_loc = model_loc + 'CANDIDATE.pt'
        torch.save(model.state_dict(), candidate_loc)
        print('[{} -^^^^^^^^^^^^^^^^^  Epoch {}] Saved candidate model to {}'.
              format(datetime.now() - start_train, epoch, candidate_loc))

        # VALIDATION
        model.eval()
        with torch.no_grad():
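            # No optimizer is passed to SimpleLossCompute here, so the validation
            # pass only computes the loss and performs no parameter updates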
            dev_perplexity = partial_run_epoch(device=device,
                                               model=model,
                                               sampler=val_sampler,
                                               loss_compute=SimpleLossCompute(
                                                   criterion, None),
                                               teacher_forcing=True)

            # Early stopping: compare with the previous epoch's validation perplexity
            prior_perplexity = (dev_perplexities[-1]
                                if dev_perplexities else float('inf'))
            print("Validation perplexity: %f" % dev_perplexity)
            dev_perplexities.append(dev_perplexity)

            # Pick the first recipe and decode it as an example
            decode_output = partial_decode_single(device=device,
                                                  model=model,
                                                  sampler=val_sampler)
            recipe_str = decode_output[-1]
            print('[{} - Epoch {}] Decoded recipe from validation:'.format(
                datetime.now() - start_train, epoch))
            print(recipe_str)

            # If validation perplexity doesn't go down, we either anneal or stop
            if dev_perplexity > prior_perplexity:
                if lr_annealing_rate == 0.0 or current_lr < 1e-12:  # Early stoppage
                    print('[{} - Epoch {}] EARLY STOPPAGE'.format(
                        datetime.now() - start_train, epoch))
                    break
                elif lr_annealing_rate != 1.0:  # No annealing if 1.0
                    new_lr = current_lr * lr_annealing_rate
                    print(
                        '[{} - Epoch {}] Annealing: changed LR from {:.5f} to {:.5f}'
                        .format(datetime.now() - start_train, epoch,
                                current_lr, new_lr))
                    current_lr = new_lr
                    for param_group in optim.param_groups:
                        param_group['lr'] = current_lr
                    continue

            # Save improved model
            if dev_perplexity < best_perplexity_thus_far:
                best_perplexity_thus_far = dev_perplexity
                best_model_save_loc = model_loc + '_e{}.pt'.format(epoch)
                torch.save(model.state_dict(), best_model_save_loc)
                print(
                    '[{} -***********  Epoch {}] Saved model to {} ****************'
                    .format(datetime.now() - start_train, epoch,
                            best_model_save_loc))

    # TESTING
    model.load_state_dict(torch.load(best_model_save_loc))
    model = model.to(device)
    print('{} - Loaded best model from {}'.format(datetime.now() - start_train,
                                                  best_model_save_loc))
    model.eval()
    with torch.no_grad():
        test_perplexity = partial_run_epoch(device=device,
                                            model=model,
                                            sampler=test_sampler,
                                            loss_compute=SimpleLossCompute(
                                                criterion, None),
                                            teacher_forcing=True)
        print("Test perplexity: {:.4f}".format(test_perplexity))

    # Pick the first recipe and decode it as an example
    decode_output = partial_decode_single(device=device,
                                          model=model,
                                          sampler=test_sampler)
    recipe_str = decode_output[-1]
    print(
        '[{} - *********** Testing ********************] Decoded recipe from test set:'
        .format(datetime.now() - start_train))
    print(recipe_str)

    return dev_perplexities, test_perplexity
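
A minimal wiring sketch (not part of the original example): it assumes `model`, the three samplers, and the `run_epoch` / `decode_single` helpers from the other examples already exist, and only illustrates how the `functools.partial` wrappers and hyperparameters might be passed to `train_model`. All literal values below are placeholders.

from functools import partial

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fill in the train/epoch-invariant arguments once, as the docstring describes
# (add whatever other invariant kwargs run_epoch / decode_single need, as in Example #3;
# the values here are illustrative only).
partial_run_epoch = partial(run_epoch, print_every=100)
partial_decode_single = partial(decode_single, max_len=200)

dev_perplexities, test_perplexity = train_model(
    device=device,
    model=model,
    train_sampler=train_sampler,
    val_sampler=val_sampler,
    test_sampler=test_sampler,
    num_epochs=30,
    lr=1e-3,
    exp_name='baseline',
    partial_run_epoch=partial_run_epoch,
    partial_decode_single=partial_decode_single,
    lr_annealing_rate=0.5,   # halve the LR when validation perplexity rises
    n_teacher_forcing=2,     # teacher-force the first two epochs
    save_folder='checkpoints',
)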
Example #2
def create_model(
    vocab_emb_dim,
    calorie_emb_dim,
    n_items_w_pad,
    hidden_size,
    n_layers,
    dropout=0.0,
    max_ingr=20,
    max_ingr_tok=20,
    use_cuda=True,
    state_dict_path=None,
    decode_name=False,
    ingr_gru=False,
    ingr_emb=False,
    num_ingr=None,
    ingr_emb_dim=None,
    shared_projection=False,
    item_emb=False,
    item_emb_dim=None,
):
    """
    Instantiates a model

    Arguments:
        vocab_emb_dim {int} -- Embedding dimension for vocabulary (ingredients, steps)
        calorie_emb_dim {int} -- Embedding dimension for calorie levels
        n_items_w_pad {int} -- Number of unique items, including padding items
        hidden_size {int} -- Size of hidden layers
        n_layers {int} -- Number of decoder RNN layers

    Keyword Arguments:
        dropout {float} -- Dropout rate (default: {0.0})
        max_ingr {int} -- Maximum # ingredients per recipe (default: {20})
        max_ingr_tok {int} -- Maximum # tokens per ingredient (default: {20})
        use_cuda {bool} -- Whether to move the model to CUDA (default: {True})
        state_dict_path {str} -- If provided, loads pretrained model weights from here (default: {None})
        decode_name {bool} -- Whether to add a separate NameDecoder for item names (default: {False})
        ingr_gru {bool} -- Whether the encoder uses a GRU over ingredient tokens (default: {False})
        ingr_emb {bool} -- Whether to use a learned ingredient embedding (default: {False})
        num_ingr {int} -- Number of unique ingredients; required if `ingr_emb` (default: {None})
        ingr_emb_dim {int} -- Ingredient embedding dimension; required if `ingr_emb` (default: {None})
        shared_projection {bool} -- Use the same output projection layer for name & steps (default: {False})
        item_emb {bool} -- Whether the decoder uses a learned item embedding (default: {False})
        item_emb_dim {int} -- Item embedding dimension; required if `item_emb` (default: {None})

    Returns:
        nn.Module -- Loaded Encoder-Decoder model
    """
    start = datetime.now()

    # Create the model
    proj_layer = nn.Linear(hidden_size, VOCAB_SIZE, bias=False)
    vocab_emb_layer = nn.Embedding(VOCAB_SIZE, embedding_dim=vocab_emb_dim)
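    # Calorie levels are bucketed into 5 classes, hence the 5-row embedding below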
    calorie_emb_layer = nn.Embedding(5, embedding_dim=calorie_emb_dim)
    if ingr_emb:
        ingr_emb_layer = nn.Embedding(num_ingr, embedding_dim=ingr_emb_dim)
    else:
        ingr_emb_layer = None
    if item_emb:
        item_emb_layer = nn.Embedding(n_items_w_pad,
                                      embedding_dim=item_emb_dim)
    else:
        item_emb_layer = None

    # Encoder
    encoder = Encoder(
        vocab_embedding_layer=vocab_emb_layer,
        calorie_embedding_layer=calorie_emb_layer,
        ingr_embedding_layer=ingr_emb_layer,
        hidden_size=hidden_size,
        max_ingrs=max_ingr,
        max_ingr_tokens=max_ingr_tok,
        dropout=dropout,
        ingr_gru=ingr_gru,
        gru_layers=n_layers,
    )

    # Decoder
    decoder = PersonalItemDecoder(
        vocab_embedding_layer=vocab_emb_layer,
        item_embedding_layer=item_emb_layer,
        hidden_size=hidden_size,
        gru_layers=n_layers,
        dropout=dropout,
        ingr_encoded_size=encoder.ingr_encoded_size,
        calorie_encoded_size=encoder.calorie_encoded_size,
        name_encoded_size=encoder.name_encoded_size,
        proj_layer=proj_layer,
    )

    # Total model
    if decode_name:
        name_decoder = NameDecoder(
            vocab_embedding_layer=vocab_emb_layer,
            hidden_size=hidden_size,
            gru_layers=n_layers,
            dropout=dropout,
            proj_layer=proj_layer if shared_projection else nn.Linear(
                hidden_size, VOCAB_SIZE, bias=False),
        )
        model = PersonalItemEncoderDecoder(encoder,
                                           decoder,
                                           name_decoder=name_decoder)
    else:
        model = PersonalItemEncoderDecoder(encoder, decoder)
    if use_cuda:
        model = model.cuda()

    print('{} - Constructed model skeleton'.format(datetime.now() - start))

    if state_dict_path is not None:
        # Load pretrained weights from the checkpoint
        state_dict = torch.load(state_dict_path)
        model.load_state_dict(state_dict, strict=True)

    print('{} - Created {} model with {:,} parameters'.format(
        datetime.now() - start, model.__class__.__name__,
        count_parameters(model)))

    print(model)
    print('\n\n')

    return model
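
A small instantiation sketch (dimensions and flags are illustrative, not from the source); it assumes `VOCAB_SIZE` and the encoder/decoder classes referenced above are in scope, and shows which optional embeddings require their companion size arguments.

import torch

model = create_model(
    vocab_emb_dim=256,
    calorie_emb_dim=32,
    n_items_w_pad=10000,
    hidden_size=512,
    n_layers=2,
    dropout=0.1,
    use_cuda=torch.cuda.is_available(),
    decode_name=True,        # adds the separate NameDecoder
    ingr_emb=True,           # requires num_ingr and ingr_emb_dim
    num_ingr=5000,
    ingr_emb_dim=64,
    item_emb=True,           # requires item_emb_dim
    item_emb_dim=64,
    shared_projection=True,  # name decoder reuses the steps projection layer
)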
Example #3
     hidden_size=hidden_size,
     n_layers=n_layers,
     dropout=dropout,
     max_ingr=MAX_INGR,
     max_ingr_tok=MAX_INGR_TOK,
     use_cuda=USE_CUDA,
     state_dict_path=checkpoint_loc,
     ingr_gru=ingr_gru,
     decode_name=decode_name,
     ingr_emb=ingr_emb,
     num_ingr=N_INGREDIENTS,
     ingr_emb_dim=ingr_emb_dim,
     shared_projection=shared_proj,
 )
 print('{} - {} Model defined with {:,} parameters'.format(
     datetime.now() - start, exp_name, count_parameters(model)))
 '''
 TRAIN MODEL
 '''
 partial_run_epoch = partial(run_epoch,
                             user_items_df=user_items_df,
                             print_every=print_every,
                             max_len=MAX_STEP_TOK,
                             max_name_len=MAX_NAME,
                             clip=clip,
                             **memory_tensor_map)
 partial_decode_single = partial(decode_single,
                                 user_items_df=user_items_df,
                                 max_len=MAX_STEP_TOK,
                                 max_name_len=MAX_NAME,
                                 ingr_map=ingr_map,