Example #1
0
 def __init__(self,
              dataset,
              image_feature_root,
              image_root="",
              preload_all_features=False,
              image_mean="imagenet",
              holding_raw_captions=False):
     """Store the dataset, build the caption->image lookup and shuffle the order.

     Args:
         dataset: mapping with a "captions" list (each entry is read for
             "idx" and "image_idx") and an "images" list (each entry is
             read for "file_path").
         image_feature_root: directory of preprocessed image features; a
             feature file has the image's name with the extension replaced
             by ".npz".
         image_root: directory that holds the image files.
         preload_all_features: when true, every feature file is loaded
             into memory right away.
         image_mean: forwarded to Image_loader as its ``mean`` argument.
         holding_raw_captions: stored as-is for later use by the loader.
     """
     caption_entries = dataset["captions"]

     # Plain configuration values.
     self.holding_raw_captions = holding_raw_captions
     self.preload_all_features = preload_all_features
     self.image_feature_root = image_feature_root + "/"
     self.image_root = image_root + "/"
     self.image_loader = Image_loader(mean=image_mean)

     # Dataset contents and the caption idx -> image idx lookup table.
     self.captions = caption_entries
     self.num_captions = len(caption_entries)
     self.images = dataset["images"]
     lookup = {}
     for entry in caption_entries:
         lookup[entry["idx"]] = entry["image_idx"]
     self.caption2image = lookup

     # Iteration state: a random visiting order, a cursor and an epoch counter.
     self.random_indicies = np.random.permutation(len(self.captions))
     self.index_count = 0
     self.epoch = 1

     if self.preload_all_features:
         loaded = []
         for image in self.images:
             stem = os.path.splitext(image["file_path"])[0]
             archive = np.load("%s/%s.npz" % (self.image_feature_root, stem))
             loaded.append(archive['arr_0'])
         self.image_features = np.array(loaded)
    def __init__(self,
                 rnn_model_place,
                 cnn_model_place,
                 dictonary_place,
                 beamsize=3,
                 depth_limit=50,
                 gpu_id=-1,
                 first_word="<sos>",
                 hidden_dim=512,
                 mean="imagenet"):
        """Load the vocabulary and both models, then pick the array backend.

        Args:
            rnn_model_place: HDF5 file with the decoder weights; loading is
                skipped when this is an empty string.
            cnn_model_place: HDF5 file with the ResNet weights.
            dictonary_place: vocabulary file handed to ``parse_dic``.
            beamsize: stored for the beam search.
            depth_limit: stored for the beam search.
            gpu_id: a value >= 0 selects that CUDA device and cupy;
                anything else keeps the models on the CPU with numpy.
            first_word: token stored as the start-of-sentence word.
            hidden_dim: forwarded to Image2CaptionDecoder.
            mean: forwarded to Image_loader.

        Side effects:
            Rebinds the module-level ``xp`` to ``cuda.cupy`` or ``np``.
        """
        global xp

        # Search settings.
        self.gpu_id = gpu_id
        self.beamsize = beamsize
        self.depth_limit = depth_limit
        self.first_word = first_word

        self.image_loader = Image_loader(mean)
        # parse_dic must run before the decoder is built: the vocabulary
        # size below is read from self.token2index.
        self.index2token = self.parse_dic(dictonary_place)

        # Image encoder.
        encoder = ResNet()
        serializers.load_hdf5(cnn_model_place, encoder)
        encoder.train = False
        self.cnn_model = encoder

        # Caption decoder.
        vocabulary_size = len(self.token2index)
        decoder = Image2CaptionDecoder(vocabulary_size, hidden_dim=hidden_dim)
        if len(rnn_model_place) > 0:
            serializers.load_hdf5(rnn_model_place, decoder)
        decoder.train = False
        self.rnn_model = decoder

        # Array backend selection.
        if self.gpu_id < 0:
            xp = np
            return
        xp = cuda.cupy
        cuda.get_device(gpu_id).use()
        self.cnn_model.to_gpu()
        self.rnn_model.to_gpu()
Example #3
0
class CaptionDataLoader(object):
    """Serve shuffled mini-batches of images (or image features) and captions.

    Captions are visited in a random order that is re-drawn at the end of
    every epoch. ``epoch`` starts at 1 and ``index_count`` is the cursor into
    the current random order.
    """

    def __init__(self,dataset,image_feature_root,image_root="",preload_all_features=False,image_mean="imagenet",holding_raw_captions=False):
        """Store the dataset and prepare the first random visiting order.

        Args:
            dataset: mapping with a "captions" list (each entry is read for
                "idx", "image_idx" and "caption") and an "images" list (each
                entry is read for "file_path").
            image_feature_root: directory of preprocessed image features. A
                feature file is expected to have the image's name with the
                extension replaced by ".npz".
            image_root: directory that holds the image files.
            preload_all_features: when true, every feature file is loaded
                into memory up front.
            image_mean: forwarded to Image_loader as its ``mean`` argument.
            holding_raw_captions: when true, ``get_batch`` returns captions
                exactly as stored instead of converting them to int32 arrays.
        """
        self.holding_raw_captions=holding_raw_captions
        self.image_loader=Image_loader(mean=image_mean)
        self.captions=dataset["captions"]
        self.num_captions=len(self.captions)
        self.images=dataset["images"]
        self.caption2image={caption["idx"]:caption["image_idx"] for caption in dataset["captions"]}
        self.image_feature_root=image_feature_root+"/"
        self.image_root=image_root+"/"
        self.random_indicies = np.random.permutation(len(self.captions))
        self.index_count=0
        self.epoch=1
        self.preload_all_features=preload_all_features
        if self.preload_all_features:
            self.image_features=np.array([self._load_feature(image) for image in self.images])

    def _load_feature(self,image):
        """Load the 'arr_0' array of the .npz feature file that belongs to ``image``."""
        stem=os.path.splitext(image["file_path"])[0]
        return np.load("%s/%s.npz"%(self.image_feature_root, stem))['arr_0']

    def get_batch(self,batch_size,raw_image=False):
        """Return the next mini-batch and advance the epoch bookkeeping.

        Args:
            batch_size: number of captions to take from the current order.
                The last batch of an epoch may be shorter.
            raw_image: when true, images are loaded through the image loader
                (the original author notes this gives Batch x 3 x 224 x 224);
                otherwise preprocessed features are returned.

        Returns:
            (batch_images, batch_word_indices): a numpy array of images or
            features, and a list with one caption per batch element (raw
            captions, or int32 numpy arrays when ``holding_raw_captions`` is
            false).
        """
        batch_caption_indicies=self.random_indicies[self.index_count:self.index_count+batch_size]
        self.index_count+=batch_size
        # ">=" rather than ">": when batch_size divides the number of
        # captions the cursor lands exactly on the end, and the next call
        # would otherwise slice past the end and return an empty batch.
        if self.index_count >= len(self.captions):
            self.epoch+=1
            self.suffle_data()
            self.index_count=0

        # Image index for every caption in this batch.
        image_indices=[self.caption2image[i] for i in batch_caption_indicies]
        if raw_image:
            batch_images=np.array([self.image_loader.load(self.image_root+self.images[j]["file_path"],expand_batch_dim=False) for j in image_indices])
        elif self.preload_all_features:
            batch_images=self.image_features[image_indices]
        else:
            batch_images=np.array([self._load_feature(self.images[j]) for j in image_indices])

        if self.holding_raw_captions:
            batch_word_indices=[self.captions[i]["caption"] for i in batch_caption_indicies]
        else:
            batch_word_indices=[np.array(self.captions[i]["caption"],dtype=np.int32) for i in batch_caption_indicies]

        return batch_images,batch_word_indices

    def suffle_data(self):
        """Draw a fresh random visiting order over all captions."""
        self.random_indicies = np.random.permutation(len(self.captions))
class CaptionGenerator(object):
    """Generate captions for an image with a ResNet encoder, an LSTM decoder
    and beam search over the decoder's next-word distributions."""

    def __init__(self,
                 rnn_model_place,
                 cnn_model_place,
                 dictonary_place,
                 beamsize=3,
                 depth_limit=50,
                 gpu_id=-1,
                 first_word="<sos>",
                 hidden_dim=512,
                 mean="imagenet"):
        '''Load the vocabulary and both models, then pick the array backend.

        Args:
            rnn_model_place: HDF5 file with the decoder weights; loading is
                skipped when this is an empty string.
            cnn_model_place: HDF5 file with the ResNet weights.
            dictonary_place: JSON vocabulary file, see parse_dic.
            beamsize: number of candidates kept per search step and number
                of finished captions to collect.
            depth_limit: path length at which a path counts as finished.
            gpu_id: a value >= 0 selects that CUDA device and cupy;
                anything else keeps everything on the CPU with numpy.
            first_word: token whose index starts every path.
            hidden_dim: forwarded to Image2CaptionDecoder.
            mean: forwarded to Image_loader.

        Side effects:
            Rebinds the module-level ``xp`` to ``cuda.cupy`` or ``np``.
        '''
        self.gpu_id = gpu_id
        self.beamsize = beamsize
        self.depth_limit = depth_limit
        self.image_loader = Image_loader(mean)
        # parse_dic also sets self.token2index, which is needed below.
        self.index2token = self.parse_dic(dictonary_place)

        self.cnn_model = ResNet()
        serializers.load_hdf5(cnn_model_place, self.cnn_model)
        self.cnn_model.train = False

        self.rnn_model = Image2CaptionDecoder(len(self.token2index),
                                              hidden_dim=hidden_dim)
        if len(rnn_model_place) > 0:
            serializers.load_hdf5(rnn_model_place, self.rnn_model)
        self.rnn_model.train = False

        self.first_word = first_word

        #Gpu Setting
        global xp
        if self.gpu_id >= 0:
            xp = cuda.cupy
            cuda.get_device(gpu_id).use()
            self.cnn_model.to_gpu()
            self.rnn_model.to_gpu()
        else:
            xp = np

    def parse_dic(self, dictonary_place):
        '''Read the vocabulary file, set self.token2index, return the inverse.

        Two layouts are accepted. A JSON object with fewer than 10 top-level
        entries is treated as the new format and its "words" list of
        {"word": ..., "idx": ...} entries is used. Anything else is treated
        as the old format, i.e. the object itself is the token -> index map.

        Args:
            dictonary_place: path of the JSON vocabulary file.

        Returns:
            dict mapping index -> token.
        '''
        with open(dictonary_place, 'r') as f:
            json_file = json.load(f)
        # Ad hoc format detection kept from the original: a key-based test
        # could misfire on an old-format vocabulary that contains the token
        # "words".
        if len(json_file) < 10:
            self.token2index = {
                word['word']: word['idx']
                for word in json_file["words"]
            }
        else:
            self.token2index = json_file

        return {v: k for k, v in self.token2index.items()}

    def successor(self, current_state):
        '''Advance the decoder one step and return the best continuations.

        Args:
            current_state: a state, python dict with the keys
                hidden: hidden states of LSTM
                cell: cell states of LSTM
                path: word indices so far as a python list, e.g. initial is [self.token2index["<sos>"]]
                cost: negative log likelihood

        Returns:
            (hy, cy, k_best_next_states): the new hidden and cell states and
            a python list of self.beamsize states in the same dict format as
            current_state, most probable next word first.
        '''

        word = [xp.array([current_state["path"][-1]], dtype=xp.int32)]
        hx = current_state["hidden"]
        cx = current_state["cell"]
        hy, cy, next_words = self.rnn_model(hx, cx, word)

        word_dist = F.softmax(
            next_words[0]).data[0]  #possible next word distributions
        k_best_next_sentences = []
        for i in range(self.beamsize):
            next_word_idx = int(xp.argmax(word_dist))
            k_best_next_sentences.append({
                "hidden": hy,
                "cell": cy,
                # list + list already builds a new list, so the parent's
                # path is never shared with the child.
                "path": current_state["path"] + [next_word_idx],
                "cost": current_state["cost"] - xp.log(word_dist[next_word_idx]),
            })
            # Zero out the chosen word so the next argmax picks the runner-up.
            word_dist[next_word_idx] = 0

        return hy, cy, k_best_next_sentences

    def beam_search(self, initial_state):
        '''
        Beam search is a graph search algorithm! So I use graph search abstraction

        Every round expands all states on the beam, keeps the self.beamsize
        cheapest successors, and moves those that end in "<eos>" or have
        reached self.depth_limit words into the result.

        Args:
            initial_state: an initial state, python dict with the keys
                hidden: hidden states
                cell: cell states
                path: word indices so far as a python list, e.g. initial is [self.token2index["<sos>"]]
                cost: negative log likelihood

        Returns:
            captions (states) sorted by the cost (i.e. negative log likelihood)
        '''
        found_paths = []
        top_k_states = [initial_state]
        # Also stop when the beam is empty: nothing is left to expand, and
        # looping on would never add another finished path.
        while len(found_paths) < self.beamsize and top_k_states:
            #forward one step for all top k states, then only select top k after that
            new_top_k_states = []
            for state in top_k_states:
                #examine the next possible states
                hy, cy, k_best_next_states = self.successor(state)
                for next_state in k_best_next_states:
                    new_top_k_states.append(next_state)
            selected_top_k_states = heapq.nsmallest(self.beamsize,
                                                    new_top_k_states,
                                                    key=lambda x: x["cost"])

            #within the selected states, let's check if it is terminal or not.
            top_k_states = []
            for state in selected_top_k_states:
                #is goal state? -> yes, then end the search
                if state["path"][-1] == self.token2index["<eos>"] or len(
                        state["path"]) == self.depth_limit:
                    found_paths.append(state)
                else:
                    top_k_states.append(state)

        return sorted(found_paths, key=lambda x: x["cost"])

    def beam_search0(self, initial_state):
        #original one. This takes much memory
        '''
        Beam search is a graph search algorithm! So I use graph search abstraction

        Older variant that keeps every generated state in one priority queue
        and pops up to self.beamsize states per round.

        Args:
            initial_state: an initial state, python dict with the keys
                hidden: hidden states
                cell: cell states
                path: word indices so far as a python list, e.g. initial is [self.token2index["<sos>"]]
                cost: negative log likelihood
        Returns:
            captions (states) sorted by the cost (i.e. negative log likelihood)
        '''
        found_paths = []
        q = Q.PriorityQueue()
        # Queue entries are (cost, insertion order, state). The insertion
        # counter breaks ties so that two entries with the same cost are
        # never ordered by comparing the state dicts, which raises TypeError
        # on Python 3.
        pushed = 0
        q.put((float(initial_state["cost"]), pushed, initial_state))
        while (len(found_paths) < self.beamsize):
            if q.empty():
                # Nothing left to expand: return what was found instead of
                # spinning forever.
                break
            i = 0
            while not q.empty():
                if i == self.beamsize:
                    break
                state = q.get()[-1]
                #is goal state? -> yes, then end the search
                if state["path"][-1] == self.token2index["<eos>"] or len(
                        state["path"]) == self.depth_limit:
                    found_paths.append(state)
                    continue
                #examine the next possible states and add to priority queue
                hy, cy, k_best_next_states = self.successor(state)
                for next_state in k_best_next_states:
                    pushed += 1
                    # Prioritise by the successor's own cost, not the
                    # parent's, so the cheapest path is popped first.
                    q.put((float(next_state["cost"]), pushed, next_state))
                i += 1

        return sorted(found_paths, key=lambda x: x["cost"])

    def generate(self, image_file_path):
        '''Generate captions for an image file.

        Args:
            image_file_path: path handed to the image loader.

        Returns:
            whatever generate_from_img returns for the loaded image.
        '''
        img = self.image_loader.load(image_file_path)
        return self.generate_from_img(img)

    def generate_from_img_feature(self, image_feature):
        '''Generate captions from an already extracted image feature.

        Args:
            image_feature: feature array fed to the decoder's
                input_cnn_feature; moved to the GPU when gpu_id >= 0.

        Returns:
            list of {"sentence": list of tokens, "log_likelihood": float},
            in the order returned by beam_search (ascending cost).
        '''
        if self.gpu_id >= 0:
            image_feature = cuda.to_gpu(image_feature)

        # Zero initial LSTM states for a single image.
        batch_size = 1
        hx = xp.zeros(
            (self.rnn_model.n_layers, batch_size, self.rnn_model.hidden_dim),
            dtype=xp.float32)
        cx = xp.zeros(
            (self.rnn_model.n_layers, batch_size, self.rnn_model.hidden_dim),
            dtype=xp.float32)

        hy, cy = self.rnn_model.input_cnn_feature(hx, cx, image_feature)

        initial_state = {
            "hidden": hy,
            "cell": cy,
            "path": [self.token2index[self.first_word]],
            "cost": 0,
        }

        captions = self.beam_search(initial_state)

        caption_candidates = []
        for caption in captions:
            sentence = [
                self.index2token[word_idx] for word_idx in caption["path"]
            ]
            log_likelihood = -float(
                caption["cost"])  #cost is the negative log likelihood
            caption_candidates.append({
                "sentence": sentence,
                "log_likelihood": log_likelihood
            })

        return caption_candidates

    def generate_from_img(self, image_array):
        '''Generate Caption for an Numpy Image array

        Args:
            image_array: numpy array of image

        Returns:
            list of generated captions, sorted by the cost (i.e. negative log likelihood)

            The structure is [caption,caption,caption,...]
            Where caption = {"sentence": a generated sentence as a python list of word, "log_likelihood": The log likelihood of the generated sentence}

        '''
        if self.gpu_id >= 0:
            image_array = cuda.to_gpu(image_array)
        # There is one extra dimension because NStepLSTM treats its input as
        # a sequence, i.e. (sequence size, batch size, feature dim).
        image_feature = self.cnn_model(image_array, "feature").data.reshape(
            1, 1, 2048)

        return self.generate_from_img_feature(image_feature)
# Remaining command-line options of a feature-extraction script. `parser` is
# created above this fragment, as is the option that provides args.img_dir.
parser.add_argument('--out-dir',
                    type=str,
                    help='The directory that the features will be saved')
parser.add_argument('--model',
                    type=str,
                    default='../data/ResNet50.model',
                    help='place of the ResNet model')
parser.add_argument('--gpu',
                    '-g',
                    type=int,
                    default=-1,
                    help='GPU ID (negative value indicates CPU)')
args = parser.parse_args()

#setup image loader
image_loader = Image_loader(mean="imagenet")

#set up and load the model
model = ResNet()
serializers.load_hdf5(args.model, model)
# Flag the model as not training; presumably this selects inference behaviour
# inside ResNet -- confirm against the ResNet class.
model.train = False

#GPU preparation
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Walk over every entry of the image directory; `i` is a counter initialised
# here for the loop below (its use is outside this fragment).
image_files = os.listdir(args.img_dir)
i = 0
for path in image_files:
    # Split "name.ext" so the base name can be reused; the rest of the loop
    # body is not visible in this fragment.
    name, ext = os.path.splitext(path)
Example #6
0
#print(chainer.functions.Linear(1,1).type_check_enable)

import cv2
import argparse
import numpy as np
import math
from chainer import cuda
import chainer.functions as F
from chainer import cuda, Function, FunctionSet, gradient_check, Variable, optimizers
from chainer import serializers

sys.path.append('../code/')
from CaptionGenerator import CaptionGenerator
from image_loader import Image_loader

# NOTE(review): this used to pass mean="imaganet", which looks like a typo.
# Every other Image_loader call in the project passes "imagenet", so the
# spelling is aligned here -- confirm against Image_loader's accepted values.
image_loader = Image_loader(mean="imagenet")

app = Flask(__name__)

# app.config['UPLOAD_FOLDER'] = '/home/stsutsui/chainer-caption/webapi/uploads'

# Keep Japanese text in JSON responses from being converted to ASCII escapes
# (this is only for readability with curl; converting to ASCII would not
# cause any real problem).
app.config['JSON_AS_ASCII'] = False

# This is the path to the upload directory
app.config['UPLOAD_FOLDER'] = 'uploads/'
# These are the extensions that we accept for upload
app.config['ALLOWED_EXTENSIONS'] = {'png', 'jpg', 'jpeg', 'gif'}


# For a given file, return whether it's an allowed type or not
Example #7
0
    help='path to the vocaburary json')
# Remaining command-line options of a captioning script. `parser` and the
# options that provide args.vocab and args.gpu are defined above this fragment.
parser.add_argument('--img',
                    default='./sample_imgs/dog.jpg',
                    type=str,
                    help='path to the image')
parser.add_argument('--cnn-model',
                    type=str,
                    default='./data/ResNet50.model',
                    help='place of the ResNet model')
parser.add_argument('--rnn-model',
                    type=str,
                    default='./data/caption_model.model',
                    help='place of the caption model')
args = parser.parse_args()

image_loader = Image_loader(mean='imagenet')
# The vocabulary file is read as a token -> index mapping and inverted so
# that predicted indices can be turned back into tokens.
with open(args.vocab, 'r') as f:
    token2index = json.load(f)
index2token = {v: k for k, v in token2index.items()}

# Image encoder: load the weights and flag the model as not training.
cnn_model = ResNet()
serializers.load_hdf5(args.cnn_model, cnn_model)
cnn_model.train = False
# Caption decoder, sized by the number of vocabulary entries.
rnn_model = Image2CaptionDecoder(len(token2index))
serializers.load_hdf5(args.rnn_model, rnn_model)
rnn_model.train = False

# With a GPU id >= 0 use cupy as the array module and move the encoder to
# that device; the rest of this branch is not visible in this fragment.
if args.gpu >= 0:
    xp = cuda.cupy
    cuda.get_device(args.gpu).use()
    cnn_model.to_gpu()
Example #8
0
                    type=str,
                    default="../sample_imgs/dog.jpg",
                    help='place of a image that you want to predict')
# Remaining command-line options of a ResNet prediction script. `parser` and
# the option that provides args.img are defined above this fragment.
parser.add_argument('--model',
                    type=str,
                    default='../data/ResNet50.model',
                    help='place of the ResNet model')
parser.add_argument('--gpu',
                    '-g',
                    type=int,
                    default=-1,
                    help='GPU ID (negative value indicates CPU)')
args = parser.parse_args()

#setup image loader
image_loader = Image_loader("imagenet")

#set up and load the model
model = ResNet()
serializers.load_hdf5(args.model, model)
# Flag the model as not training; presumably this selects inference behaviour
# inside ResNet -- confirm against the ResNet class.
model.train = False

#load image
img = image_loader.load(args.img)

#GPU preparation
# Both the model and the loaded image are moved to the selected device.
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()
    img = cuda.to_gpu(img, device=args.gpu)
Example #9
0
    type=str,
    help='path to the predicted json file')
# Remaining command-line options of a script that renders predicted captions
# as an HTML page. `parser` and the option that provides args.predicted are
# defined above this fragment.
parser.add_argument('--output',
                    default="../output1",
                    type=str,
                    help='output directory name')
args = parser.parse_args()

# Create the output directory and its "images" sub-directory when missing.
if not os.path.isdir(args.output):
    os.makedirs(args.output)
    print("made the save directory", args.output)
image_dir = args.output + "/images/"
if not os.path.isdir(image_dir):
    os.makedirs(image_dir)

image_loader = Image_loader()

# The predictions file is a JSON object iterated below as
# image filename -> caption.
with open(args.predicted, 'r') as f:
    predictions = json.load(f)

html = HTMLPrinter(file_path=args.output + "/captions.html")

i = 0
html.write('<html><body><table border="1">')

for image_filename, caption in predictions.items():
    # if image_filename[-4:]=='.npz':
    #     image_filename=image_filename[0:-4].split("/")[-1]
    # Progress counter rewritten in place on one terminal line ("\r").
    sys.stdout.write("\r%d" % i)
    sys.stdout.flush()
    i += 1