def __init__(self,rnn_model_place,cnn_model_place,dictionary_place,beamsize=3,depth_limit=50,gpu_id=-1,first_word="<sos>",hidden_dim=512,mean="imagenet"): self.gpu_id=gpu_id self.beamsize=beamsize self.depth_limit=depth_limit #self.image_loader=Image_loader(mean) self.index2token=self.parse_dic(dictionary_place) self.cnn_model=ResNet() serializers.load_hdf5(cnn_model_place, self.cnn_model) self.cnn_model.train = False self.rnn_model=Image2CaptionDecoder(len(self.token2index),hidden_dim=hidden_dim) if len(rnn_model_place) > 0: serializers.load_hdf5(rnn_model_place, self.rnn_model) self.rnn_model.train = False self.first_word=first_word mean_image = np.ndarray((3, 224, 224), dtype=np.float32) mean_image[0] = 103.939 mean_image[1] = 116.779 mean_image[2] = 123.68 self.mean_image = mean_image #Gpu Setting global xp if self.gpu_id >= 0: xp = cuda.cupy cuda.get_device(gpu_id).use() self.cnn_model.to_gpu() self.rnn_model.to_gpu() else: xp=np
def __init__(self, rnn_model_place, cnn_model_place, dictonary_place, beamsize=3, depth_limit=50, gpu_id=-1, first_word="<sos>", hidden_dim=512, mean="imagenet"): self.gpu_id = gpu_id self.beamsize = beamsize self.depth_limit = depth_limit self.image_loader = Image_loader(mean) self.index2token = self.parse_dic(dictonary_place) self.cnn_model = ResNet() serializers.load_hdf5(cnn_model_place, self.cnn_model) self.cnn_model.train = False self.rnn_model = Image2CaptionDecoder(len(self.token2index), hidden_dim=hidden_dim) if len(rnn_model_place) > 0: serializers.load_hdf5(rnn_model_place, self.rnn_model) self.rnn_model.train = False self.first_word = first_word #Gpu Setting global xp if self.gpu_id >= 0: xp = cuda.cupy cuda.get_device(gpu_id).use() self.cnn_model.to_gpu() self.rnn_model.to_gpu() else: xp = np
class CaptionGenerator(object): def __init__(self, rnn_model_place, cnn_model_place, dictonary_place, beamsize=3, depth_limit=50, gpu_id=-1, first_word="<sos>", hidden_dim=512, mean="imagenet"): self.gpu_id = gpu_id self.beamsize = beamsize self.depth_limit = depth_limit self.image_loader = Image_loader(mean) self.index2token = self.parse_dic(dictonary_place) self.cnn_model = ResNet() serializers.load_hdf5(cnn_model_place, self.cnn_model) self.cnn_model.train = False self.rnn_model = Image2CaptionDecoder(len(self.token2index), hidden_dim=hidden_dim) if len(rnn_model_place) > 0: serializers.load_hdf5(rnn_model_place, self.rnn_model) self.rnn_model.train = False self.first_word = first_word #Gpu Setting global xp if self.gpu_id >= 0: xp = cuda.cupy cuda.get_device(gpu_id).use() self.cnn_model.to_gpu() self.rnn_model.to_gpu() else: xp = np def parse_dic(self, dictonary_place): with open(dictonary_place, 'r') as f: json_file = json.load(f) if len( json_file ) < 10: #this is ad-hock. I need to distinguish new format and old format... self.token2index = { word['word']: word['idx'] for word in json_file["words"] } else: self.token2index = json_file return {v: k for k, v in self.token2index.items()} def successor(self, current_state): ''' Args: current_state: a stete, python tuple (hx,cx,path,cost) hidden: hidden states of LSTM cell: cell states LSTM path: word indicies so far as a python list e.g. initial is self.token2index["<sos>"] cost: negative log likelihood Returns: k_best_next_states: a python list whose length is the beam size. possible_sentences[i] = {"indicies": list of word indices,"cost":negative log likelihood so far} ''' word = [xp.array([current_state["path"][-1]], dtype=xp.int32)] hx = current_state["hidden"] cx = current_state["cell"] hy, cy, next_words = self.rnn_model(hx, cx, word) word_dist = F.softmax( next_words[0]).data[0] #possible next word distributions k_best_next_sentences = [] for i in range(self.beamsize): next_word_idx = int(xp.argmax(word_dist)) k_best_next_sentences.append(\ {\ "hidden":hy,\ "cell":cy,\ "path":deepcopy(current_state["path"])+[next_word_idx],\ "cost":current_state["cost"]-xp.log(word_dist[next_word_idx]) }\ ) word_dist[next_word_idx] = 0 return hy, cy, k_best_next_sentences def beam_search(self, initial_state): ''' Beam search is a graph search algorithm! So I use graph search abstraction Args: initial state: an initial stete, python tuple (hx,cx,path,cost) each state has hx: hidden states cx: cell states path: word indicies so far as a python list e.g. initial is self.token2index["<sos>"] cost: negative log likelihood Returns: captions sorted by the cost (i.e. negative log llikelihood) ''' found_paths = [] top_k_states = [initial_state] while (len(found_paths) < self.beamsize): #forward one step for all top k states, then only select top k after that new_top_k_states = [] for state in top_k_states: #examine to next five possible states hy, cy, k_best_next_states = self.successor(state) for next_state in k_best_next_states: new_top_k_states.append(next_state) selected_top_k_states = heapq.nsmallest(self.beamsize, new_top_k_states, key=lambda x: x["cost"]) #within the selected states, let's check if it is terminal or not. top_k_states = [] for state in selected_top_k_states: #is goal state? -> yes, then end the search if state["path"][-1] == self.token2index["<eos>"] or len( state["path"]) == self.depth_limit: found_paths.append(state) else: top_k_states.append(state) return sorted(found_paths, key=lambda x: x["cost"]) def beam_search0(self, initial_state): #original one. This takes much memory ''' Beam search is a graph search algorithm! So I use graph search abstraction Args: initial state: an initial stete, python tuple (hx,cx,path,cost) each state has hx: hidden states cx: cell states path: word indicies so far as a python list e.g. initial is self.token2index["<sos>"] cost: negative log likelihood Returns: captions sorted by the cost (i.e. negative log llikelihood) ''' found_paths = [] q = Q.PriorityQueue() q.put((0, initial_state)) while (len(found_paths) < self.beamsize): i = 0 # this is just a one step ahead? while not q.empty(): if i == self.beamsize: break state = q.get()[1] #is goal state? -> yes, then end the search if state["path"][-1] == self.token2index["<eos>"] or len( state["path"]) == self.depth_limit: found_paths.append(state) continue #examine to next five possible states and add to priority queue hy, cy, k_best_next_states = self.successor(state) for next_state in k_best_next_states: q.put((state["cost"], next_state)) i += 1 return sorted(found_paths, key=lambda x: x["cost"]) def generate(self, image_file_path): ''' Args: image_file_path: image_file_path ''' img = self.image_loader.load(image_file_path) return self.generate_from_img(img) def generate_from_img_feature(self, image_feature): if self.gpu_id >= 0: image_feature = cuda.to_gpu(image_feature) batch_size = 1 hx = xp.zeros( (self.rnn_model.n_layers, batch_size, self.rnn_model.hidden_dim), dtype=xp.float32) cx = xp.zeros( (self.rnn_model.n_layers, batch_size, self.rnn_model.hidden_dim), dtype=xp.float32) hy, cy = self.rnn_model.input_cnn_feature(hx, cx, image_feature) initial_state={\ "hidden":hy,\ "cell":cy,\ "path":[self.token2index[self.first_word]],\ "cost":0,\ }\ captions = self.beam_search(initial_state) caption_candidates = [] for caption in captions: sentence = [ self.index2token[word_idx] for word_idx in caption["path"] ] log_likelihood = -float( caption["cost"]) #cost is the negative log likelihood caption_candidates.append({ "sentence": sentence, "log_likelihood": log_likelihood }) return caption_candidates def generate_from_img(self, image_array): '''Generate Caption for an Numpy Image array Args: image_array: numpy array of image Returns: list of generated captions, sorted by the cost (i.e. negative log llikelihood) The structure is [caption,caption,caption,...] Where caption = {"sentence": a generated sentence as a python list of word, "log_likelihood": The log llikelihood of the generated sentence} ''' if self.gpu_id >= 0: image_array = cuda.to_gpu(image_array) image_feature = self.cnn_model(image_array, "feature").data.reshape( 1, 1, 2048 ) #次元が一つ多いのは、NstepLSTMはsequaenceとみなすから。(sequence size, batch size, feature dim)ということ return self.generate_from_img_feature(image_feature)
parser.add_argument('--model', type=str, default='../data/ResNet50.model', help='place of the ResNet model') parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)') args = parser.parse_args() #setup image loader image_loader = Image_loader(mean="imagenet") #set up and load the model model = ResNet() serializers.load_hdf5(args.model, model) model.train = False #GPU preparation if args.gpu >= 0: cuda.get_device(args.gpu).use() model.to_gpu() image_files = os.listdir(args.img_dir) i = 0 for path in image_files: name, ext = os.path.splitext(path) print(i, path) img = image_loader.load(args.img_dir + '/' + path) if args.gpu >= 0:
parser.add_argument('--cnn-model', type=str, default='./data/ResNet50.model', help='place of the ResNet model') parser.add_argument('--rnn-model', type=str, default='./data/caption_model.model', help='place of the caption model') args = parser.parse_args() image_loader = Image_loader(mean='imagenet') with open(args.vocab, 'r') as f: token2index = json.load(f) index2token = {v: k for k, v in token2index.items()} cnn_model = ResNet() serializers.load_hdf5(args.cnn_model, cnn_model) cnn_model.train = False rnn_model = Image2CaptionDecoder(len(token2index)) serializers.load_hdf5(args.rnn_model, rnn_model) rnn_model.train = False if args.gpu >= 0: xp = cuda.cupy cuda.get_device(args.gpu).use() cnn_model.to_gpu() rnn_model.to_gpu() else: xp = np batch_size = 1
caption_generator=CaptionGenerator( rnn_model_place=args.rnn_model, cnn_model_place=args.cnn_model, dictonary_place=dictonary_place, beamsize=1, depth_limit=args.depth, gpu_id=args.gpu, first_word= "<sos>", ) #Model Preparation print("preparing caption generation models and training process") model=chainer.Chain() model.rnn=Image2CaptionDecoder(vocaburary_size=len(caption_generator.index2token),hidden_dim=args.hidden,n_layers=args.layers) model.cnn=ResNet() model.rnn.train=True model.cnn.train=True serializers.load_hdf5(args.cnn_model, model.cnn) if not len(args.rnn_model) == 0: serializers.load_hdf5(args.rnn_model, model.rnn) #To GPU if args.gpu >= 0: model.cnn.to_gpu() model.rnn.to_gpu() #set up optimizers optimizer = optimizers.Adam() optimizer.setup(model.rnn) optimizer.alpha=args.rnn_lr