def __init__(self, dataset, image_feature_root, image_root="", preload_all_features=False, image_mean="imagenet", holding_raw_captions=False):
    """Index the caption dataset and prepare the first shuffled pass.

    Args:
        dataset: mapping with a "captions" list (each entry is read for its
            "idx" and "image_idx" keys) and an "images" list (each entry is
            read for its "file_path" key).
        image_feature_root: directory of preprocessed image features. Each
            feature is expected under the image's file path with the
            extension replaced by ``.npz``.
        image_root: directory that holds the raw images.
        preload_all_features: when true, every feature file is loaded into
            memory right away.
        image_mean: forwarded to ``Image_loader`` as ``mean``.
        holding_raw_captions: stored as-is for later use by the loader.
    """
    self.holding_raw_captions = holding_raw_captions
    self.image_loader = Image_loader(mean=image_mean)

    caption_records = dataset["captions"]
    self.captions = caption_records
    self.num_captions = len(caption_records)
    self.images = dataset["images"]

    # caption idx -> image idx lookup table
    caption_to_image = {}
    for record in caption_records:
        caption_to_image[record["idx"]] = record["image_idx"]
    self.caption2image = caption_to_image

    # Both roots always carry a trailing slash.
    self.image_feature_root = image_feature_root + "/"
    self.image_root = image_root + "/"

    # Iteration state: visiting order, cursor into it, and epoch counter.
    self.random_indicies = np.random.permutation(self.num_captions)
    self.index_count = 0
    self.epoch = 1

    self.preload_all_features = preload_all_features
    if not self.preload_all_features:
        return

    # Eagerly read the 'arr_0' array of every feature file, in image order.
    loaded_features = []
    for image in self.images:
        stem = os.path.splitext(image["file_path"])[0]
        archive = np.load("%s/%s.npz" % (self.image_feature_root, stem))
        loaded_features.append(archive['arr_0'])
    self.image_features = np.array(loaded_features)
def __init__(self, rnn_model_place, cnn_model_place, dictonary_place, beamsize=3, depth_limit=50, gpu_id=-1, first_word="<sos>", hidden_dim=512, mean="imagenet"):
    """Load the vocabulary and both models, then select the array backend.

    Args:
        rnn_model_place: path to the serialized caption decoder; an empty
            string skips loading and leaves the decoder freshly initialized.
        cnn_model_place: path to the serialized ResNet weights.
        dictonary_place: path to the vocabulary JSON given to ``parse_dic``.
        beamsize: stored as ``self.beamsize``.
        depth_limit: stored as ``self.depth_limit``.
        gpu_id: device id; a negative value selects numpy (CPU).
        first_word: stored as ``self.first_word``.
        hidden_dim: forwarded to ``Image2CaptionDecoder``.
        mean: forwarded to ``Image_loader``.
    """
    # The module-level ``xp`` (numpy or cupy) is rebound at the end.
    global xp

    self.gpu_id = gpu_id
    self.beamsize = beamsize
    self.depth_limit = depth_limit
    self.first_word = first_word
    self.image_loader = Image_loader(mean)

    # NOTE(review): self.token2index is read below but not assigned here;
    # presumably parse_dic sets it as a side effect - confirm.
    self.index2token = self.parse_dic(dictonary_place)

    # Image encoder, switched to inference mode.
    encoder = ResNet()
    serializers.load_hdf5(cnn_model_place, encoder)
    encoder.train = False
    self.cnn_model = encoder

    # Caption decoder sized to the vocabulary, switched to inference mode.
    vocabulary_size = len(self.token2index)
    decoder = Image2CaptionDecoder(vocabulary_size, hidden_dim=hidden_dim)
    if len(rnn_model_place) != 0:
        serializers.load_hdf5(rnn_model_place, decoder)
    decoder.train = False
    self.rnn_model = decoder

    # Backend selection; on GPU both models are moved to the device.
    use_gpu = self.gpu_id >= 0
    xp = cuda.cupy if use_gpu else np
    if use_gpu:
        cuda.get_device(gpu_id).use()
        self.cnn_model.to_gpu()
        self.rnn_model.to_gpu()
class CaptionDataLoader(object):
    """Serve shuffled mini-batches of (image or image feature, caption) pairs.

    The loader walks a random permutation of caption positions. When the
    permutation is exhausted the epoch counter is incremented and a fresh
    permutation is drawn.

    NOTE(review): captions are fetched by list position (``self.captions[i]``)
    while ``caption2image`` is keyed by each caption's "idx" field, so the
    code assumes "idx" equals the caption's position in the list - confirm.
    """

    def __init__(self, dataset, image_feature_root, image_root="",
                 preload_all_features=False, image_mean="imagenet",
                 holding_raw_captions=False):
        """Index the dataset and prepare the first shuffled pass.

        Args:
            dataset: mapping with a "captions" list (entries provide "idx",
                "image_idx" and "caption") and an "images" list (entries
                provide "file_path").
            image_feature_root: directory of preprocessed image features.
                Features are expected under the image's file path with the
                extension replaced by ``.npz``.
            image_root: directory that holds the raw images.
            preload_all_features: load every feature file into memory now
                instead of reading them per batch.
            image_mean: forwarded to ``Image_loader`` as ``mean``.
            holding_raw_captions: when true, ``get_batch`` returns captions
                exactly as stored instead of converting them to int32 arrays.
        """
        self.holding_raw_captions = holding_raw_captions
        self.image_loader = Image_loader(mean=image_mean)
        self.captions = dataset["captions"]
        self.num_captions = len(self.captions)
        self.images = dataset["images"]
        self.caption2image = {
            caption["idx"]: caption["image_idx"]
            for caption in dataset["captions"]
        }
        self.image_feature_root = image_feature_root + "/"
        self.image_root = image_root + "/"
        self.random_indicies = np.random.permutation(len(self.captions))
        self.index_count = 0
        self.epoch = 1
        self.preload_all_features = preload_all_features
        if self.preload_all_features:
            self.image_features = np.array(
                [self._load_feature(image["file_path"]) for image in self.images]
            )

    def _load_feature(self, file_path):
        """Return the 'arr_0' array of the .npz feature file for *file_path*."""
        feature_path = "%s/%s.npz" % (
            self.image_feature_root, os.path.splitext(file_path)[0])
        return np.load(feature_path)['arr_0']

    def get_batch(self, batch_size, raw_image=False):
        """Return the next mini-batch as ``(batch_images, batch_word_indices)``.

        Args:
            batch_size: number of captions to draw. The last batch of an
                epoch is shorter when the dataset size is not a multiple of
                ``batch_size``.
            raw_image: when true, images are loaded through the image loader
                (per the original note: Batch x 3 x 224 x 224); otherwise
                precomputed features are returned.

        Returns:
            A tuple of the stacked images/features and a list of captions
            (raw, or int32 arrays unless ``holding_raw_captions`` is set).
        """
        start = self.index_count
        batch_caption_indicies = self.random_indicies[start:start + batch_size]
        self.index_count += batch_size
        # ">=" rather than ">": once the cursor reaches the end exactly, the
        # pass is finished. With ">" the following call would slice past the
        # end and hand back an empty batch before reshuffling.
        if self.index_count >= len(self.captions):
            self.epoch += 1
            self.suffle_data()
            self.index_count = 0

        # Load the raw images or the image features for this batch.
        if raw_image:
            batch_images = np.array([
                self.image_loader.load(
                    self.image_root
                    + self.images[self.caption2image[i]]["file_path"],
                    expand_batch_dim=False)
                for i in batch_caption_indicies
            ])
        elif self.preload_all_features:
            batch_images = self.image_features[
                [self.caption2image[i] for i in batch_caption_indicies]]
        else:
            batch_images = np.array([
                self._load_feature(
                    self.images[self.caption2image[i]]["file_path"])
                for i in batch_caption_indicies
            ])

        # Captions: untouched, or converted to int32 index arrays.
        if self.holding_raw_captions:
            batch_word_indices = [
                self.captions[i]["caption"] for i in batch_caption_indicies]
        else:
            batch_word_indices = [
                np.array(self.captions[i]["caption"], dtype=np.int32)
                for i in batch_caption_indicies
            ]
        return batch_images, batch_word_indices

    def suffle_data(self):
        """Draw a new random visiting order over all captions."""
        self.random_indicies = np.random.permutation(len(self.captions))
class CaptionGenerator(object):
    """Generate captions for an image with a CNN encoder, an LSTM decoder
    and beam search over the decoder's next-word distributions.

    A search *state* is a dict with the keys:
        "hidden": hidden states of the LSTM
        "cell":   cell states of the LSTM
        "path":   word indices so far, as a python list
        "cost":   negative log likelihood of the path so far
    """

    def __init__(self, rnn_model_place, cnn_model_place, dictonary_place,
                 beamsize=3, depth_limit=50, gpu_id=-1, first_word="<sos>",
                 hidden_dim=512, mean="imagenet"):
        """Load the vocabulary and both models, then select the array backend.

        Args:
            rnn_model_place: path to the serialized caption decoder; an empty
                value skips loading.
            cnn_model_place: path to the serialized ResNet weights.
            dictonary_place: path to the vocabulary JSON (see ``parse_dic``).
            beamsize: number of candidates kept per search step.
            depth_limit: maximum path length before a state counts as final.
            gpu_id: device id; a negative value selects numpy (CPU).
            first_word: token that starts every caption.
            hidden_dim: forwarded to ``Image2CaptionDecoder``.
            mean: forwarded to ``Image_loader``.
        """
        # The module-level ``xp`` (numpy or cupy) is rebound below and used
        # by the other methods.
        global xp

        self.gpu_id = gpu_id
        self.beamsize = beamsize
        self.depth_limit = depth_limit
        self.image_loader = Image_loader(mean)
        # parse_dic also sets self.token2index, which is needed just below.
        self.index2token = self.parse_dic(dictonary_place)

        self.cnn_model = ResNet()
        serializers.load_hdf5(cnn_model_place, self.cnn_model)
        self.cnn_model.train = False

        self.rnn_model = Image2CaptionDecoder(
            len(self.token2index), hidden_dim=hidden_dim)
        if rnn_model_place:
            serializers.load_hdf5(rnn_model_place, self.rnn_model)
        self.rnn_model.train = False

        self.first_word = first_word

        # GPU setting
        if self.gpu_id >= 0:
            xp = cuda.cupy
            cuda.get_device(gpu_id).use()
            self.cnn_model.to_gpu()
            self.rnn_model.to_gpu()
        else:
            xp = np

    def parse_dic(self, dictonary_place):
        """Read the vocabulary JSON, set ``self.token2index`` and return the
        inverse (index -> token) mapping.

        Two layouts are accepted:
            new: ``{"words": [{"word": ..., "idx": ...}, ...], ...}``
            old: a flat ``{token: index}`` mapping
        The new layout is recognised by a list under the "words" key, which
        is more reliable than guessing from the number of top-level keys.
        """
        with open(dictonary_place, 'r') as f:
            json_file = json.load(f)
        words = json_file.get("words") if isinstance(json_file, dict) else None
        if isinstance(words, list):
            self.token2index = {word['word']: word['idx'] for word in words}
        else:
            self.token2index = json_file
        return {v: k for k, v in self.token2index.items()}

    def _is_terminal(self, state):
        """Return True when *state* ends in "<eos>" or hit the depth limit."""
        # ">=" so that a depth_limit of 1 or less still terminates the search.
        return (state["path"][-1] == self.token2index["<eos>"]
                or len(state["path"]) >= self.depth_limit)

    def successor(self, current_state):
        '''Expand a state by one decoder step.

        Args:
            current_state: a state dict with the keys
                "hidden": hidden states of LSTM
                "cell": cell states of LSTM
                "path": word indices so far as a python list
                    e.g. initial is [self.token2index["<sos>"]]
                "cost": negative log likelihood so far

        Returns:
            (hy, cy, k_best_next_states): the new hidden and cell states and
            a python list, of length beam size, of successor state dicts
            built from the most probable next words.
        '''
        word = [xp.array([current_state["path"][-1]], dtype=xp.int32)]
        hx = current_state["hidden"]
        cx = current_state["cell"]
        hy, cy, next_words = self.rnn_model(hx, cx, word)

        # possible next word distribution
        word_dist = F.softmax(next_words[0]).data[0]
        k_best_next_sentences = []
        for _ in range(self.beamsize):
            next_word_idx = int(xp.argmax(word_dist))
            k_best_next_sentences.append({
                "hidden": hy,
                "cell": cy,
                # list concatenation already yields a fresh list
                "path": current_state["path"] + [next_word_idx],
                "cost": current_state["cost"] - xp.log(word_dist[next_word_idx]),
            })
            # zero out the chosen word so the next argmax picks the runner-up
            word_dist[next_word_idx] = 0
        return hy, cy, k_best_next_sentences

    def beam_search(self, initial_state):
        '''Beam search, written as a graph search.

        Args:
            initial_state: the starting state dict ("hidden", "cell",
                "path", "cost"); its path is e.g. [self.token2index["<sos>"]]

        Returns:
            finished states sorted by cost (i.e. negative log likelihood)
        '''
        found_paths = []
        top_k_states = [initial_state]
        while len(found_paths) < self.beamsize:
            # forward one step for all top k states, then keep only the best k
            new_top_k_states = []
            for state in top_k_states:
                _, _, k_best_next_states = self.successor(state)
                new_top_k_states.extend(k_best_next_states)
            selected_top_k_states = heapq.nsmallest(
                self.beamsize, new_top_k_states, key=lambda x: x["cost"])

            # finished states are collected, the others are expanded next round
            top_k_states = []
            for state in selected_top_k_states:
                if self._is_terminal(state):
                    found_paths.append(state)
                else:
                    top_k_states.append(state)
        return sorted(found_paths, key=lambda x: x["cost"])

    def beam_search0(self, initial_state):
        '''Original, priority-queue based search. This takes much memory.

        Args:
            initial_state: the starting state dict ("hidden", "cell",
                "path", "cost"); its path is e.g. [self.token2index["<sos>"]]

        Returns:
            finished states sorted by cost (i.e. negative log likelihood)
        '''
        found_paths = []
        q = Q.PriorityQueue()
        # Entries are (cost, insertion number, state). The insertion number
        # breaks ties so that two state dicts are never compared.
        order = 0
        q.put((0, order, initial_state))
        # The emptiness check stops the search instead of spinning if the
        # queue ever runs dry before enough paths are found.
        while len(found_paths) < self.beamsize and not q.empty():
            expanded = 0
            while not q.empty() and expanded < self.beamsize:
                state = q.get()[-1]
                if self._is_terminal(state):
                    found_paths.append(state)
                    continue
                # examine the next possible states and add them to the queue,
                # each under its own cost
                _, _, k_best_next_states = self.successor(state)
                for next_state in k_best_next_states:
                    order += 1
                    q.put((float(next_state["cost"]), order, next_state))
                expanded += 1
        return sorted(found_paths, key=lambda x: x["cost"])

    def generate(self, image_file_path):
        '''Generate captions for the image stored at *image_file_path*.

        Args:
            image_file_path: path of the image file

        Returns:
            same as ``generate_from_img``
        '''
        img = self.image_loader.load(image_file_path)
        return self.generate_from_img(img)

    def generate_from_img_feature(self, image_feature):
        '''Generate captions from an already extracted image feature.

        Args:
            image_feature: feature array fed to the decoder's
                ``input_cnn_feature``

        Returns:
            list of {"sentence": list of words, "log_likelihood": float},
            sorted by cost
        '''
        if self.gpu_id >= 0:
            image_feature = cuda.to_gpu(image_feature)

        batch_size = 1
        state_shape = (
            self.rnn_model.n_layers, batch_size, self.rnn_model.hidden_dim)
        hx = xp.zeros(state_shape, dtype=xp.float32)
        cx = xp.zeros(state_shape, dtype=xp.float32)

        hy, cy = self.rnn_model.input_cnn_feature(hx, cx, image_feature)

        initial_state = {
            "hidden": hy,
            "cell": cy,
            "path": [self.token2index[self.first_word]],
            "cost": 0,
        }

        captions = self.beam_search(initial_state)

        caption_candidates = []
        for caption in captions:
            sentence = [
                self.index2token[word_idx] for word_idx in caption["path"]
            ]
            # cost is the negative log likelihood
            log_likelihood = -float(caption["cost"])
            caption_candidates.append({
                "sentence": sentence,
                "log_likelihood": log_likelihood
            })
        return caption_candidates

    def generate_from_img(self, image_array):
        '''Generate captions for a numpy image array.

        Args:
            image_array: numpy array of image

        Returns:
            list of generated captions, sorted by the cost
            (i.e. negative log likelihood).
            The structure is [caption, caption, caption, ...]
            where caption = {"sentence": a generated sentence as a python
            list of words, "log_likelihood": the log likelihood of the
            generated sentence}
        '''
        if self.gpu_id >= 0:
            image_array = cuda.to_gpu(image_array)
        # The extra leading dimension is there because NStepLSTM treats its
        # input as a sequence: (sequence size, batch size, feature dim).
        image_feature = self.cnn_model(image_array, "feature").data.reshape(
            1, 1, 2048)
        return self.generate_from_img_feature(image_feature)
parser.add_argument('--out-dir', type=str, help='The directory that the features will be saved') parser.add_argument('--model', type=str, default='../data/ResNet50.model', help='place of the ResNet model') parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)') args = parser.parse_args() #setup image loader image_loader = Image_loader(mean="imagenet") #set up and load the model model = ResNet() serializers.load_hdf5(args.model, model) model.train = False #GPU preparation if args.gpu >= 0: cuda.get_device(args.gpu).use() model.to_gpu() image_files = os.listdir(args.img_dir) i = 0 for path in image_files: name, ext = os.path.splitext(path)
#print(chainer.functions.Linear(1,1).type_check_enable)
import cv2
import argparse
import numpy as np
import math
from chainer import cuda
import chainer.functions as F
from chainer import cuda, Function, FunctionSet, gradient_check, Variable, optimizers
from chainer import serializers

sys.path.append('../code/')
from CaptionGenerator import CaptionGenerator
from image_loader import Image_loader

# The mean identifier was misspelled "imaganet"; the other visible call sites
# pass "imagenet".
# NOTE(review): confirm how Image_loader treats an unknown mean name.
image_loader = Image_loader(mean="imagenet")

app = Flask(__name__)

# app.config['UPLOAD_FOLDER'] = '/home/stsutsui/chainer-caption/webapi/uploads'

# Do not escape Japanese text in JSON output to ASCII (this keeps curl output
# readable; escaping it would not cause any real problem either).
app.config['JSON_AS_ASCII'] = False

# This is the path to the upload directory
app.config['UPLOAD_FOLDER'] = 'uploads/'
# These are the extension that we are accepting to be uploaded
app.config['ALLOWED_EXTENSIONS'] = set(['png', 'jpg', 'jpeg', 'gif'])

# For a given file, return whether it's an allowed type or not
help='path to the vocaburary json') parser.add_argument('--img', default='./sample_imgs/dog.jpg', type=str, help='path to the image') parser.add_argument('--cnn-model', type=str, default='./data/ResNet50.model', help='place of the ResNet model') parser.add_argument('--rnn-model', type=str, default='./data/caption_model.model', help='place of the caption model') args = parser.parse_args() image_loader = Image_loader(mean='imagenet') with open(args.vocab, 'r') as f: token2index = json.load(f) index2token = {v: k for k, v in token2index.items()} cnn_model = ResNet() serializers.load_hdf5(args.cnn_model, cnn_model) cnn_model.train = False rnn_model = Image2CaptionDecoder(len(token2index)) serializers.load_hdf5(args.rnn_model, rnn_model) rnn_model.train = False if args.gpu >= 0: xp = cuda.cupy cuda.get_device(args.gpu).use() cnn_model.to_gpu()
type=str, default="../sample_imgs/dog.jpg", help='place of a image that you want to predict') parser.add_argument('--model', type=str, default='../data/ResNet50.model', help='place of the ResNet model') parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)') args = parser.parse_args() #setup image loader image_loader = Image_loader("imagenet") #set up and load the model model = ResNet() serializers.load_hdf5(args.model, model) model.train = False #load image img = image_loader.load(args.img) #GPU preparation if args.gpu >= 0: cuda.get_device(args.gpu).use() model.to_gpu() img = cuda.to_gpu(img, device=args.gpu)
type=str, help='path to the predicted json file') parser.add_argument('--output', default="../output1", type=str, help='output directory name') args = parser.parse_args() if not os.path.isdir(args.output): os.makedirs(args.output) print("made the save directory", args.output) image_dir = args.output + "/images/" if not os.path.isdir(image_dir): os.makedirs(image_dir) image_loader = Image_loader() with open(args.predicted, 'r') as f: predictions = json.load(f) html = HTMLPrinter(file_path=args.output + "/captions.html") i = 0 html.write('<html><body><table border="1">') for image_filename, caption in predictions.items(): # if image_filename[-4:]=='.npz': # image_filename=image_filename[0:-4].split("/")[-1] sys.stdout.write("\r%d" % i) sys.stdout.flush() i += 1