Example #1
def example():
    """One-off captioning demo: caption ./image.jpg and report elapsed time.

    Returns a dict with the generated caption under the 'caption' key.
    """
    t0 = time.time()

    train_dir = './datasets/Flickr8k_text/Flickr_8k.trainImages.txt'
    token_dir = './datasets/Flickr8k_text/Flickr8k.token.txt'
    # the current best trained model
    model_dir = './model-params/current_best.h5'

    # Build the vocabulary from the training captions.
    tokenizer = create_tokenizer(
        train_dir, token_dir, start_end=True, use_all=True)

    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    max_len = 24  # maximum caption length used when the model was trained

    # Greedy inference model with pretrained weights.
    NIC_inference = greedy_inference_model(vocab_size, max_len)
    NIC_inference.load_weights(model_dir, by_name=True, skip_mismatch=True)

    # Encoder: CNN feature vector for the input image.
    img_feature = extract_feature_from_image('./image.jpg')
    # Decoder: generate the caption token by token.
    caption = decoder(NIC_inference, tokenizer, img_feature, True)

    print(caption[0])
    print('It took', time.time() - t0, 'seconds.')
    return {'caption': caption[0]}
Example #2
def generate_caption_from_image_beam_search(img_arr, beam_width=5, alpha=0.7):
    """Caption an image array using beam search over the NIC decoder.

    img_arr    -- raw image array fed to the CNN encoder.
    beam_width -- number of candidate sequences kept at each step.
    alpha      -- length-normalization exponent used in scoring.
    Returns the highest-scoring caption as a string.
    """
    # Encoder: CNN feature vector for the input image.
    img_feature = extract_feature_from_image_arr(img_arr)

    # Rebuild the training vocabulary.
    tokenizer = create_tokenizer(
        train_dir, token_dir, start_end=True, use_all=True)

    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    max_len = 24  # use 24 as maximum sentence's length when training the model

    # Inference sub-models with pretrained weights.
    NIC_text_emb_lstm = text_emb_lstm(vocab_size)
    NIC_text_emb_lstm.load_weights(model_dir, by_name=True, skip_mismatch=True)
    NIC_image_dense_lstm = image_dense_lstm()
    NIC_image_dense_lstm.load_weights(
        model_dir, by_name=True, skip_mismatch=True)

    # Seed the LSTM state (a0, c0) from the image feature; initial
    # hidden/cell states are zero vectors of width 512.
    a0, c0 = NIC_image_dense_lstm.predict(
        [img_feature, np.zeros([1, 512]), np.zeros([1, 512])])

    res = beam_search(NIC_text_emb_lstm, a0, c0, tokenizer, beam_width,
                      max_len, alpha)
    best = np.argmax(res['scores'])
    return tokenizer.sequences_to_texts([res['routes'][best]])[0]
Example #3
def generate_caption_from_image_greedy(img_arr):
    """Caption an image array using greedy (argmax) decoding."""
    # Encoder: CNN feature vector for the input image.
    img_feature = extract_feature_from_image_arr(img_arr)

    # Rebuild the training vocabulary.
    tokenizer = create_tokenizer(
        train_dir, token_dir, start_end=True, use_all=True)

    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    max_len = 24  # use 24 as maximum sentence's length when training the model

    # Greedy inference model with pretrained weights.
    NIC_inference = greedy_inference_model(vocab_size, max_len)
    NIC_inference.load_weights(model_dir, by_name=True, skip_mismatch=True)

    # Decoder: generate the caption token by token.
    return decoder(NIC_inference, tokenizer, img_feature, True)
Example #4
def training(dirs_dict,
             lr,
             decay,
             reg,
             batch_size,
             epochs,
             max_len,
             initial_epoch,
             previous_model=None):
    """Train the NIC captioning model with progressive (generator) loading.

    dirs_dict maps 'dict_dir', 'token_dir', 'train_dir', 'dev_dir' and
    'params_dir' to their filesystem paths.  lr/decay feed the Adam
    optimizer, reg the model's regularization, and previous_model (path
    to an .h5 file) optionally resumes training from saved weights.
    """
    dict_dir = dirs_dict['dict_dir']
    token_dir = dirs_dict['token_dir']
    train_dir = dirs_dict['train_dir']
    dev_dir = dirs_dict['dev_dir']
    params_dir = dirs_dict['params_dir']

    # Vocabulary built from the training captions.
    tokenizer = create_tokenizer(train_dir, token_dir, start_end=True)
    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)

    # Progressive loading: batches are produced on the fly.
    # 30000 training sentences -> 30000 // batch_size steps per epoch;
    # 5000 dev sentences at batch size 50 -> 100 validation steps.
    generator_train = batch_generator(batch_size, max_len, tokenizer, dict_dir,
                                      train_dir, token_dir)
    generator_dev = batch_generator(50, max_len, tokenizer, dict_dir, dev_dir,
                                    token_dir)

    # NIC model structure.
    NIC_model = model(vocab_size, max_len, reg)

    if previous_model:
        # Resume from an earlier checkpoint.
        NIC_model.load_weights(previous_model,
                               by_name=True,
                               skip_mismatch=True)
    else:
        # Fresh run: print and plot the architecture once.
        NIC_model.summary()
        plot_model(NIC_model, to_file='./model.png', show_shapes=True)

    # Checkpoint the weights every epoch, tagged with train/val loss.
    file_path = params_dir + '/model-ep{epoch:03d}-loss{loss:.4f}-val_loss{val_loss:.4f}.h5'
    checkpoint = ModelCheckpoint(file_path,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_weights_only=True,
                                 period=1)
    # Write sample captions to TensorBoard during training.
    tbc = TensorBoardCaption(tokenizer,
                             vocab_size,
                             max_len,
                             log_dir='./logs',
                             feed_pics_dir='./put-your-image-here',
                             model_params_dir=params_dir)

    NIC_model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=lr, decay=decay),
                      metrics=['accuracy'])

    NIC_model.fit_generator(generator_train,
                            steps_per_epoch=30000 // batch_size,
                            epochs=epochs,
                            verbose=2,
                            callbacks=[checkpoint, tbc],
                            validation_data=generator_dev,
                            validation_steps=100,
                            initial_epoch=initial_epoch)
from keras.models import load_model


from preprocessing.image import extract_features, extract_feature_from_image
from preprocessing.text import create_tokenizer
from NIC import greedy_inference_model, image_dense_lstm, text_emb_lstm
from evaluate import decoder, beam_search



train_dir = './datasets/Flickr8k_text/Flickr_8k.trainImages.txt'
token_dir = './datasets/Flickr8k_text/Flickr8k.token.txt'
# the current best trained model
model_dir = './model-params/current_best.h5'

# Vocabulary shared by the captioning helpers below.
tokenizer = create_tokenizer(train_dir, token_dir, start_end=True, use_all=True)

# Relevant decoding parameters.
vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
max_len = 24

# Greedy inference model, loaded once at import time.
NIC_inference = greedy_inference_model(vocab_size, max_len)
NIC_inference.load_weights(model_dir, by_name=True, skip_mismatch=True)


def generate_caption_from_file(file_dir):
    # Encoder
    img_feature = extract_feature_from_image(file_dir)
    # Decoder
    caption = decoder(NIC_inference, tokenizer, img_feature, True)