def example():
    """Generate a caption for the hard-coded image './image.jpg' and time the run.

    Loads the Flickr8k training vocabulary, builds the greedy inference model,
    restores the current best weights, encodes the image, and decodes a caption.
    Prints the caption and the elapsed time.

    Returns:
        dict: {'caption': <generated caption string>}.
    """
    # `time` is not imported at module level anywhere in the visible file;
    # a local import keeps this fix self-contained and backward-compatible.
    import time

    start = time.time()
    train_dir = './datasets/Flickr8k_text/Flickr_8k.trainImages.txt'
    token_dir = './datasets/Flickr8k_text/Flickr8k.token.txt'
    # the current best trained model
    model_dir = './model-params/current_best.h5'

    tokenizer = create_tokenizer(train_dir, token_dir, start_end=True, use_all=True)
    # set relevant parameters; fall back to word_index size when num_words is unset
    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    max_len = 24  # maximum sentence length used when the model was trained

    NIC_inference = greedy_inference_model(vocab_size, max_len)
    # by_name/skip_mismatch: load only layers whose names match the checkpoint
    NIC_inference.load_weights(model_dir, by_name=True, skip_mismatch=True)

    # Encoder: image file -> feature vector
    img_feature = extract_feature_from_image('./image.jpg')
    # Decoder: feature vector -> caption (greedy decoding)
    caption = decoder(NIC_inference, tokenizer, img_feature, True)

    print(caption[0])
    print('It took', time.time() - start, 'seconds.')
    return {'caption': caption[0]}
def generate_caption_from_image_beam_search(img_arr, beam_width=5, alpha=0.7):
    """Generate a caption for an image array using beam-search decoding.

    Uses the module-level globals `train_dir`, `token_dir`, and `model_dir`
    for the vocabulary files and checkpoint path.

    Args:
        img_arr: image array accepted by `extract_feature_from_image_arr`.
        beam_width: number of candidate sequences kept per decoding step.
        alpha: length-normalization exponent applied to beam scores.

    Returns:
        str: the highest-scoring decoded caption.
    """
    # `np` is used below, but numpy is not imported in the visible file;
    # a local import keeps this fix self-contained.
    import numpy as np

    # Encoder: image array -> feature vector
    img_feature = extract_feature_from_image_arr(img_arr)

    # Load vocabulary
    tokenizer = create_tokenizer(train_dir, token_dir, start_end=True, use_all=True)
    # set relevant parameters; fall back to word_index size when num_words is unset
    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    max_len = 24  # use 24 as maximum sentence's length when training the model

    # Prepare the two inference sub-models and load shared weights by layer name
    NIC_text_emb_lstm = text_emb_lstm(vocab_size)
    NIC_text_emb_lstm.load_weights(model_dir, by_name=True, skip_mismatch=True)
    NIC_image_dense_lstm = image_dense_lstm()
    NIC_image_dense_lstm.load_weights(model_dir, by_name=True, skip_mismatch=True)

    # Decoder: initial LSTM state (a0, c0) from the image, then beam search.
    # The zero tensors are the initial hidden/cell states (assumed 512-dim LSTM).
    a0, c0 = NIC_image_dense_lstm.predict(
        [img_feature, np.zeros([1, 512]), np.zeros([1, 512])])
    res = beam_search(NIC_text_emb_lstm, a0, c0, tokenizer, beam_width, max_len, alpha)

    # Pick the route with the best (length-normalized) score and detokenize it
    best_idx = np.argmax(res['scores'])
    caption = tokenizer.sequences_to_texts([res['routes'][best_idx]])[0]
    return caption
def generate_caption_from_image_greedy(img_arr):
    """Generate a caption for an image array using greedy decoding.

    Relies on the module-level globals `train_dir`, `token_dir`, and
    `model_dir` for vocabulary files and the weight checkpoint.

    Args:
        img_arr: image array accepted by `extract_feature_from_image_arr`.

    Returns:
        the caption produced by `decoder` for this image.
    """
    # Vocabulary first: the tokenizer fixes the model's vocabulary size.
    tokenizer = create_tokenizer(train_dir, token_dir, start_end=True, use_all=True)
    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)
    max_len = 24  # sentence-length cap used when the model was trained

    # Inference model with the current best weights (matched by layer name).
    inference_model = greedy_inference_model(vocab_size, max_len)
    inference_model.load_weights(model_dir, by_name=True, skip_mismatch=True)

    # Encoder, then greedy decoder.
    feature = extract_feature_from_image_arr(img_arr)
    return decoder(inference_model, tokenizer, feature, True)
def training(dirs_dict, lr, decay, reg, batch_size, epochs, max_len, initial_epoch, previous_model=None):
    """Train the NIC captioning model with progressive (generator-based) loading.

    Args:
        dirs_dict: dict with keys 'dict_dir', 'token_dir', 'train_dir',
            'dev_dir', 'params_dir' (paths for features, captions, splits,
            and checkpoint output).
        lr: Adam learning rate.
        decay: Adam learning-rate decay.
        reg: regularization strength passed to the model builder.
        batch_size: training batch size (dev batch size is fixed at 50).
        epochs: final epoch number to train to.
        max_len: maximum caption length in tokens.
        initial_epoch: epoch to resume from (for continued training).
        previous_model: optional path to weights to warm-start from; when
            None, the model is built fresh and its structure is summarized
            and plotted instead.
    """
    dict_dir = dirs_dict['dict_dir']
    token_dir = dirs_dict['token_dir']
    train_dir = dirs_dict['train_dir']
    dev_dir = dirs_dict['dev_dir']
    params_dir = dirs_dict['params_dir']

    # Use Tokenizer to create vocabulary
    tokenizer = create_tokenizer(train_dir, token_dir, start_end=True)

    # Progressive loading:
    # if batch size of training set is 30 and total 30000 sentences, then 1000 steps.
    # if batch size of dev set is 50 and total 5000 sentences, then 100 steps.
    generator_train = batch_generator(batch_size, max_len, tokenizer, dict_dir, train_dir, token_dir)
    generator_dev = batch_generator(50, max_len, tokenizer, dict_dir, dev_dir, token_dir)

    # Fall back to word_index size when the tokenizer has no num_words cap.
    vocab_size = tokenizer.num_words or (len(tokenizer.word_index) + 1)

    # Define NIC model structure
    NIC_model = model(vocab_size, max_len, reg)
    if not previous_model:
        # Fresh run: document the architecture (summary + diagram).
        NIC_model.summary()
        plot_model(NIC_model, to_file='./model.png', show_shapes=True)
    else:
        # Warm start: load compatible layers by name from the previous run.
        NIC_model.load_weights(previous_model, by_name=True, skip_mismatch=True)

    # Define checkpoint callback: one weights file per epoch, tagged with losses.
    file_path = params_dir + '/model-ep{epoch:03d}-loss{loss:.4f}-val_loss{val_loss:.4f}.h5'
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_weights_only=True, period=1)
    # Custom callback: logs sample captions to TensorBoard each epoch.
    tbc = TensorBoardCaption(tokenizer, vocab_size, max_len, log_dir='./logs', feed_pics_dir='./put-your-image-here', model_params_dir=params_dir)

    # Compile the model
    NIC_model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr, decay=decay), metrics=['accuracy'])

    # Train. steps_per_epoch assumes ~30000 training sentences (see note above);
    # validation_steps=100 matches the dev generator's batch size of 50 (~5000 sentences).
    NIC_model.fit_generator(generator_train,
                            steps_per_epoch=30000 // batch_size,
                            epochs=epochs,
                            verbose=2,
                            callbacks=[checkpoint, tbc],
                            validation_data=generator_dev,
                            validation_steps=100,
                            initial_epoch=initial_epoch)
from keras.models import load_model
from preprocessing.image import extract_features, extract_feature_from_image
from preprocessing.text import create_tokenizer
from NIC import greedy_inference_model, image_dense_lstm, text_emb_lstm
from evaluate import decoder, beam_search

# Flickr8k split list and raw caption tokens used to rebuild the vocabulary.
train_dir = './datasets/Flickr8k_text/Flickr_8k.trainImages.txt'
token_dir = './datasets/Flickr8k_text/Flickr8k.token.txt'
# the current best trained model
model_dir = './model-params/current_best.h5'

# Module-level setup: build the vocabulary and the greedy inference model once
# at import time so per-call captioning only runs the encoder + decoder.
tokenizer = create_tokenizer(train_dir, token_dir, start_end = True, use_all=True)
# set relevant parameters; fall back to word_index size when num_words is unset
vocab_size = tokenizer.num_words or (len(tokenizer.word_index)+1)
max_len = 24  # maximum sentence length used when the model was trained
NIC_inference = greedy_inference_model(vocab_size, max_len)
# by_name/skip_mismatch: load only layers whose names match the checkpoint
NIC_inference.load_weights(model_dir, by_name = True, skip_mismatch=True)

def generate_caption_from_file(file_dir):
    """Generate a caption for the image at `file_dir` using the module-level
    tokenizer and greedy inference model."""
    # Encoder: image file -> feature vector
    img_feature = extract_feature_from_image(file_dir)
    # Decoder: feature vector -> caption (greedy decoding)
    caption = decoder(NIC_inference, tokenizer, img_feature, True)
    # NOTE(review): no return statement is visible here, so `caption` is
    # currently discarded — the rest of this definition may lie beyond this
    # chunk; confirm a `return caption` exists (or add one) in the full file.