Ejemplo n.º 1
0
def main():
    """Prepare the COCO training records and instantiate the VGG16 model.

    Builds a timestamped log-directory path, makes sure the COCO data is
    available, reports how many training images were found and prints a
    summary of the pre-trained VGG16 network.
    """
    # Timestamped log directory path under /tmp/image_captioning/.
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    logdir = "/tmp/image_captioning/{}/".format(stamp)

    # Point the coco helper at the data directory and fetch the data
    # if needed.
    coco.set_data_dir("../Datasets/coco/")
    coco.maybe_download_and_extract()

    # Only the filenames are used below; the captions are loaded alongside.
    _, train_filenames, train_captions = coco.load_records(train=True)

    num_images_train = len(train_filenames)
    print("Number of training images =", num_images_train)

    # Pre-trained VGG16 including its top (classification) layers.
    vgg_model = VGG16(include_top=True, weights='imagenet')
    vgg_model.summary()
Ejemplo n.º 2
0
 def __init__(self, model_path):
     """Set up the image encoder, the trained decoder and the tokenizer.

     Args:
         model_path: Path handed to ``tf.keras.models.load_model`` to
             restore the decoder model.
     """
     print('='*10+' Initiating the caption model '+10*'=')
     self.img_size = (224, 224)
     num_words = 10000

     # Image encoder: VGG16 cut at the layer named 'fc2', whose output is
     # used as the transfer values.
     image_model = VGG16(include_top=True, weights='imagenet')
     transfer_layer = image_model.get_layer('fc2')
     self.image_model_transfer = Model(inputs=image_model.input,
                                       outputs=transfer_layer.output)

     # Decoder restored from disk.
     self.decoder_model = tf.keras.models.load_model(model_path)

     # Start/end markers wrapped around every caption.
     self.mark_start = 'ssss '
     self.mark_end = ' eeee'
     # NOTE(review): hard-coded token ids; presumably these are the
     # tokenizer indices of the two markers -- confirm against
     # self.tokenizer.word_index.
     self.token_start = 2
     self.token_end = 3

     # Build the tokenizer from the marked, flattened training captions.
     _, _, captions_train = coco.load_records(train=True)
     captions_train_marked = self.mark_captions(captions_train)
     captions_train_flat = self.flatten(captions_train_marked)
     self.tokenizer = TokenizerWrap(texts=captions_train_flat,
                                    num_words=num_words)

     # summary() prints the table itself and returns None, so wrapping it
     # in print() emitted a stray "None" line.
     self.decoder_model.summary()
Ejemplo n.º 3
0
from PIL import Image
from cache import cache
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import coco  #导入coco包

coco.set_data_dir("/home/ljk/ic/data/coco/")  # Set the folder that holds the COCO data (the path can be changed)

_, filenames_train, captions_train = coco.load_records(
    train=True)  # Load the training-set records with the coco module's loader; per the original note the records are also saved to a new file for fast reloading
num_images_train = len(filenames_train)  # Number of images in the training set
_, filenames_val, captions_val = coco.load_records(train=False)  # Load the records for the validation set


def load_image(path, size=None):  # Image-loading helper
    # Load the image at `path` with PIL, optionally resize it to `size`,
    # and scale the pixel values by 1/255.
    # NOTE(review): this excerpt is cut off after the last comment below;
    # the grayscale-to-RGB conversion and the return statement are not
    # visible here.

    # Load the image using PIL
    img = Image.open(path)
    # Resize the image if a size was given
    if not size is None:
        img = img.resize(size=size, resample=Image.LANCZOS)
    # Convert the image to a numpy array
    img = np.array(img)
    # Normalize the pixel values by dividing by 255
    img = img / 255.0
    # Convert a 2-dim grayscale array to a 3-dim RGB array
Ejemplo n.º 4
0
    # Path for the image-file.
    path = os.path.join(data_dir, filename)

    # Use the model to generate a caption of the image.
    generate_caption(image_path=path)

    # Print the true captions from the data-set.
    print("True captions:")
    for caption in captions:
        print(caption)


# Point the coco helper at the data directory and fetch the data if needed.
coco.set_data_dir('data/coco/')
coco.maybe_download_and_extract()

# Training-set records: only the filenames and captions are kept.
_, filenames_train, captions_train = coco.load_records(train=True)

num_images_train = len(filenames_train)

# Validation-set records.
_, filenames_val, captions_val = coco.load_records(train=False)

# show_image is defined elsewhere; presumably displays training example 1
# together with its captions -- verify against its definition.
show_image(idx=1, train=True)

# Pre-trained VGG16 including its top layers.
image_model = VGG16(include_top=True, weights='imagenet')
image_model.summary()

# Model that maps the VGG16 input to the output of the layer named 'fc2'.
transfer_layer = image_model.get_layer('fc2')
image_model_transfer = Model(inputs=image_model.input,
                             outputs=transfer_layer.output)

# Elements 1 and 2 of the model's input shape; presumably the expected
# image height and width -- TODO confirm.
img_size = K.int_shape(image_model.input)[1:3]
Ejemplo n.º 5
0
import os
from PIL import Image
from cache import cache

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

import coco  ### A manually written py file to process coco data

# Load the validation-set records (train=False).
ids, filenames_val, captions_val = coco.load_records(train=False)

# Number of validation images.
num_images_val = len(filenames_val)
print(num_images_val)
# Output reported by the original author: 5000

### Helper Functions


def load_image(path, size=None):
    """
    Load the image from the given file-path and resize it
    to the given size if not None.
    """

    # Load the image using PIL.
    # NOTE(review): the implementation is missing from this excerpt; as
    # written the body is only the docstring, so the function returns None.
def generate_caption(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).

    Args:
        image_path: Path of the image file, passed to load_image().
        max_tokens: Maximum number of tokens to sample.

    Returns:
        The sampled words as one string, each word preceded by a single
        space. If the end-marker word is sampled it is included as the
        last word.
    """

    # Load and resize the image.
    image = load_image(image_path, size=img_size)

    # Expand the 3-dim numpy array to 4-dim
    # because the image-model expects a whole batch as input,
    # so we give it a batch with just one image.
    image_batch = np.expand_dims(image, axis=0)

    # Process the image with the pre-trained image-model
    # to get the transfer-values.
    with graph.as_default():
        transfer_values = image_model_transfer.predict(image_batch)

    # Pre-allocate the 2-dim array used as input to the decoder.
    # This holds just a single sequence of integer-tokens,
    # but the decoder-model expects a batch of sequences.
    # The builtin int is used as dtype: np.int was only an alias for it
    # and has been removed from NumPy (AttributeError since 1.24).
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # Prepare the tokenizer from the marked, flattened training captions.
    # NOTE(review): this is rebuilt on every call; consider building it
    # once outside the function if captioning many images.
    _, _, captions_train = coco.load_records(train=True)
    captions_train_marked = mark_captions(captions_train)
    captions_train_flat = flatten(captions_train_marked)
    tokenizer = TokenizerWrap(texts=captions_train_flat, num_words=num_words)

    # Integer tokens of the start- and end-markers.
    token_start = tokenizer.word_index[mark_start.strip()]
    token_end = tokenizer.word_index[mark_end.strip()]

    # The first input-token is the special start-token for 'ssss '.
    token_int = token_start

    # Initialize an empty output-text.
    output_text = ''

    # Initialize the number of tokens we have processed.
    count_tokens = 0

    # While we haven't sampled the special end-token for ' eeee'
    # and we haven't processed the max number of tokens.
    while token_int != token_end and count_tokens < max_tokens:
        # Update the input-sequence to the decoder
        # with the last token that was sampled.
        # In the first iteration this will set the
        # first element to the start-token.
        decoder_input_data[0, count_tokens] = token_int

        # Wrap the input-data in a dict for clarity and safety,
        # so we are sure we input the data in the right order.
        x_data = {
            'transfer_values_input': transfer_values,
            'decoder_input': decoder_input_data,
        }

        # Note that we input the entire sequence of tokens
        # to the decoder. This wastes a lot of computation
        # because we are only interested in the last input
        # and output. We could modify the code to return
        # the GRU-states when calling predict() and then
        # feeding these GRU-states as well the next time
        # we call predict(), but it would make the code
        # much more complicated.

        # Input this data to the decoder and get the predicted output.
        with graph2.as_default():
            decoder_output = decoder_model.predict(x_data)

        # Get the last predicted token as a one-hot encoded array.
        # Note that this is not limited by softmax, but we just
        # need the index of the largest element so it doesn't matter.
        token_onehot = decoder_output[0, count_tokens, :]

        # Convert to an integer-token.
        token_int = np.argmax(token_onehot)

        # Lookup the word corresponding to this integer-token.
        sampled_word = tokenizer.token_to_word(token_int)

        # Append the word to the output-text.
        output_text += " " + sampled_word

        # Increment the token-counter.
        count_tokens += 1

    # Return the predicted caption.
    return output_text
Ejemplo n.º 7
0
import sys
import os
import coco
from PIL import Image
from cache import cache
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.applications import VGG16
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

coco.set_data_dir("/home/ljk/ic/data/coco/")  # Remember to change this path
# Training-set records; only the filenames and captions are kept.
_, filenames_train, captions_train = coco.load_records(train=True)
# Pre-trained VGG16 including its top layers.
image_model = VGG16(include_top=True, weights='imagenet')


def load_image(path, size=None):  #定义加载图像函数
    # 使用PIL加载图像
    img = Image.open(path)
    # 重新定义图像大小
    if not size is None:
        img = img.resize(size=size, resample=Image.LANCZOS)
    # 将图像转换到np数组里面
    img = np.array(img)
    # 归一化像素
    img = img / 255.0
    # 将2-dim灰度数组转换成3-dimRGB数组
    if (len(img.shape) == 2):