rawim = np.copy(im).astype('uint8') # Shuffle axes to c01 im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1) # Convert to BGR im = im[::-1, :, :] im = im - MEAN_VALUES return rawim.transpose(2, 0, 1).astype(np.float32) #Model Preparation model = FunctionSet() model.img_feature2vec=F.Linear(image_feature_dim, n_units) model.embed=F.EmbedID(len(vocab), n_units) model.l1_x=F.Linear(n_units, 4 * n_units)#parameter W,b model.l1_h=F.Linear(n_units, 4 * n_units)#parameter W,b model.out=F.Linear(n_units, len(vocab))#parameter W,b serializers.load_hdf5(model_place, model) #To GPU if gpu_id >= 0: model.to_gpu() #Define Newtowork (Forward) #forward_one_step is after the CNN layer, #h0 is n_units dimensional vector (embedding) def forward_one_step(cur_word, state, volatile='on'):
rawim = np.copy(im).astype('uint8') # Shuffle axes to c01 im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1) # Convert to BGR im = im[::-1, :, :] im = im - MEAN_VALUES return rawim.transpose(2, 0, 1).astype(np.float32) #Model Preparation print "preparing caption generation models" model = FunctionSet() model.img_feature2vec=F.Linear(image_feature_dim, n_units)#CNN(I)の最後のレイヤーに相当。#parameter W,b model.embed=F.EmbedID(len(vocab), n_units)#W_e*S_tに相当 #parameter W model.l1_x=F.Linear(n_units, 4 * n_units)#parameter W,b model.l1_h=F.Linear(n_units, 4 * n_units)#parameter W,b model.out=F.Linear(n_units, len(vocab))#parameter W,b serializers.load_hdf5(model_place, model) #To GPU if gpu_id >= 0: model.to_gpu() print "done" #Define Newtowork (Forward) #forward_one_step is after the CNN layer, #h0 is n_units dimensional vector (embedding)
class Caption_generator(object):
    """Generate captions for an image with a pickled CNN and an LSTM decoder.

    The CNN (loaded from ``cnn_model_place``) produces an image feature that
    is fed to the LSTM as its first input; subsequent inputs are word indexes
    looked up through an embedding layer.  Captions are decoded either with
    beam search (``generate`` / ``get_top_sentence``) or greedily
    (``generate_temp``).
    """

    def __init__(self, caption_model_place, cnn_model_place, index2word_place, gpu_id=-1, beamsize=3):
        """Load the vocabulary, the CNN and the pre-trained caption model.

        Args:
            caption_model_place: path of the HDF5 file holding the caption
                model parameters.
            cnn_model_place: path of the pickled CNN function.
            index2word_place: path of the pickled index -> word mapping.
            gpu_id: GPU device id; use -1 to run on the CPU.
            beamsize: beam width used by ``generate``.
        """
        self.gpu_id = gpu_id
        self.beamsize = beamsize

        # The array module is kept as a module-level global, as the rest of
        # the class reads it through that name.
        global xp
        if self.gpu_id >= 0:
            xp = cuda.cupy
            cuda.get_device(gpu_id).use()
        else:
            xp = np

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only load vocabulary/CNN files from a trusted source.
        # Pickle data is binary, so the files are opened in 'rb' mode.
        with open(index2word_place, 'rb') as f:
            self.index2word = pickle.load(f)
        vocab = self.index2word

        with open(cnn_model_place, 'rb') as f:
            self.func = pickle.load(f)

        image_feature_dim = 1024  # dimension of the image feature
        self.n_units = 512  # number of units per layer
        n_units = self.n_units

        self.model = FunctionSet()
        # Corresponds to the last layer of CNN(I); parameters W, b.
        self.model.img_feature2vec = F.Linear(image_feature_dim, n_units)
        # Corresponds to W_e * S_t; parameter W.
        self.model.embed = F.EmbedID(len(vocab), n_units)
        self.model.l1_x = F.Linear(n_units, 4 * n_units)  # parameters W, b
        self.model.l1_h = F.Linear(n_units, 4 * n_units)  # parameters W, b
        self.model.out = F.Linear(n_units, len(vocab))  # parameters W, b
        serializers.load_hdf5(caption_model_place, self.model)  # pre-trained weights

        if gpu_id >= 0:
            # Both networks live on the instance; move those to the GPU.
            self.model.to_gpu()
            self.func.to_gpu()

        # Warm-up run kept from the original author, who reported that the
        # model overflows on its very first CPU run only; running it once
        # here is meant to keep that out of real calls.
        if gpu_id < 0:
            numpy_image = np.ones((3, 224, 224), dtype=np.float32)
            self.generate(numpy_image)

    @staticmethod
    def _log_probability(probability):
        """Return log(probability), or -inf when the probability is not positive."""
        probability = float(probability)
        if probability > 0.0:
            return math.log(probability)
        # math.log(0.0) raises ValueError; an impossible word simply gets
        # the lowest possible score instead of aborting the search.
        return float('-inf')

    def feature_exractor(self, x_chainer_variable):
        """Run the CNN on an image batch and return its 'pool5/7x7_s1' output."""
        y, = self.func(inputs={'data': x_chainer_variable},
                       outputs=['pool5/7x7_s1'],
                       disable=['loss1/ave_pool', 'loss2/ave_pool', 'loss3/classifier'],
                       train=False)
        return y

    def forward_one_step_for_image(self, img_feature, state, volatile='on'):
        """Advance the LSTM one step using the image feature as input.

        Args:
            img_feature: chainer variable holding the CNN image feature.
            state: dict with the LSTM variables 'c1' and 'h1'.
            volatile: unused; kept for interface compatibility.

        Returns:
            (new state dict, softmax over the vocabulary).
        """
        h0 = self.model.img_feature2vec(img_feature)
        # Dropout is always applied in non-training mode here.
        h1_in = self.model.l1_x(F.dropout(h0, train=False)) + self.model.l1_h(state['h1'])
        c1, h1 = F.lstm(state['c1'], h1_in)
        y = self.model.out(F.dropout(h1, train=False))
        state = {'c1': c1, 'h1': h1}
        return state, F.softmax(y)

    def forward_one_step(self, cur_word, state, volatile='on'):
        """Advance the LSTM one step using a word index as input.

        Args:
            cur_word: array of word indexes (one per batch element).
            state: dict with the LSTM variables 'c1' and 'h1'.
            volatile: volatile flag passed to chainer.Variable.

        Returns:
            (new state dict, softmax over the vocabulary).
        """
        x = chainer.Variable(cur_word, volatile)
        h0 = self.model.embed(x)
        h1_in = self.model.l1_x(F.dropout(h0, train=False)) + self.model.l1_h(state['h1'])
        c1, h1 = F.lstm(state['c1'], h1_in)
        y = self.model.out(F.dropout(h1, train=False))
        state = {'c1': c1, 'h1': h1}
        return state, F.softmax(y)

    def beam_search(self, sentence_candidates, final_sentences, depth=1, beamsize=3):
        """Extend the candidate sentences until enough of them end with '<EOS>'.

        Args:
            sentence_candidates: list of tuples
                (word index list, LSTM state, log-likelihood).
            final_sentences: list that finished sentences are appended to, as
                tuples (word index list, probability, log-likelihood).
            depth: current search depth; the search stops at depth 50.
            beamsize: number of words expanded per candidate and number of
                candidates kept per step.

        Returns:
            ``final_sentences`` (the same list object that was passed in).
        """
        volatile = True
        while True:
            # Expand every live candidate with its `beamsize` most likely words.
            expanded = []
            for candidate in sentence_candidates:
                sentence, state, log_likely = candidate[0], candidate[1], candidate[2]
                last_word = xp.array([sentence[-1]], dtype=np.int32)
                next_state, predicted_word = self.forward_one_step(
                    last_word, state, volatile=volatile)
                probabilities = cuda.to_cpu(predicted_word.data)[0]
                for index in (-probabilities).argsort()[:beamsize]:
                    index = int(index)
                    expanded.append((
                        list(sentence) + [index],
                        next_state,
                        log_likely + self._log_probability(probabilities[index]),
                    ))

            # Keep the best `beamsize` expansions; those ending in <EOS> are done.
            scores = np.array([candidate[2] for candidate in expanded])
            survivors = []
            for i in (-scores).argsort()[:beamsize]:
                candidate = expanded[i]
                if self.index2word[candidate[0][-1]] == '<EOS>':
                    final_likely = candidate[2]
                    final_sentences.append(
                        (candidate[0], math.exp(final_likely), final_likely))
                else:
                    survivors.append(candidate)

            # With no survivors left nothing more can ever be produced, so
            # stop instead of idling until the depth limit.
            if len(final_sentences) >= beamsize or depth >= 50 or not survivors:
                return final_sentences
            depth += 1
            sentence_candidates = survivors

    def _encode_image(self, numpy_image):
        """Feed the image to the CNN and take the first LSTM step.

        Returns:
            (LSTM state dict, softmax over the vocabulary for the first word).
        """
        volatile = True
        batchsize = 1
        x_batch = np.empty((batchsize, 3, 224, 224), dtype=np.float32)
        x_batch[0] = numpy_image
        if self.gpu_id >= 0:
            x_batch = cuda.to_gpu(x_batch)
        x_batch_chainer = chainer.Variable(x_batch, volatile=volatile)

        state = {name: chainer.Variable(
                     xp.zeros((batchsize, self.n_units), dtype=np.float32), volatile)
                 for name in ('c1', 'h1')}
        img_feature = self.feature_exractor(x_batch_chainer)
        return self.forward_one_step_for_image(img_feature, state, volatile=volatile)

    def generate(self, numpy_image):
        """Generate captions for a numpy image array with beam search.

        Args:
            numpy_image: image array assignable to a (3, 224, 224) float32 slot.

        Returns:
            List of captions, one per hypothesis that reached '<EOS>'.  Each
            caption is a dict with the keys 'sentence' (list of words with the
            first and last token removed), 'probability' and
            'log_probability'.  The list is empty when no hypothesis finished.
        """
        state, predicted_word = self._encode_image(numpy_image)
        probabilities = cuda.to_cpu(predicted_word.data)[0]
        index = int(probabilities.argmax())
        # The beam accumulates log-likelihoods, so the first word must be
        # seeded with its log-probability rather than the raw probability.
        initial_sentence_candidates = [
            ([index], state, self._log_probability(probabilities[index]))]

        final_sentences = list()
        generated_sentence_candidates = self.beam_search(
            initial_sentence_candidates, final_sentences, beamsize=self.beamsize)

        # Convert word indexes to strings.
        return [
            {
                'sentence': [self.index2word[i] for i in sentence_tuple[0]][1:-1],
                'probability': sentence_tuple[1],
                'log_probability': sentence_tuple[2],
            }
            for sentence_tuple in generated_sentence_candidates
        ]

    def generate_temp(self, numpy_image):
        """Generate one caption greedily (always taking the most likely word).

        Args:
            numpy_image: image array assignable to a (3, 224, 224) float32 slot.

        Returns:
            The caption as a space-separated string; decoding stops at
            '<EOS>' or after 50 words.
        """
        volatile = True
        state, predicted_word = self._encode_image(numpy_image)
        # The word predicted from the image itself is not emitted (the
        # original code notes that it is <SOS>).
        words = []
        for _ in range(50):
            cur_word = predicted_word.data.argmax(1).astype(np.int32)
            state, predicted_word = self.forward_one_step(cur_word, state, volatile=volatile)
            index = int(cuda.to_cpu(predicted_word.data.argmax(1))[0])
            word = self.index2word[index]
            if word == '<EOS>':
                break
            words.append(word)
        return ' '.join(words)

    def get_top_sentence(self, numpy_image):
        """Return the highest-scoring beam-search caption as a string.

        Args:
            numpy_image: image array assignable to a (3, 224, 224) float32 slot.

        Returns:
            The words of the best caption joined by spaces, or '' when beam
            search produced no finished caption.
        """
        candidates = self.generate(numpy_image)
        if not candidates:
            return ''
        scores = [caption['log_probability'] for caption in candidates]
        top_caption = candidates[int(np.argmax(scores))]['sentence']
        return ' '.join(top_caption).strip()
# Shuffle axes to c01 im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1) # Convert to BGR im = im[::-1, :, :] im = im - MEAN_VALUES return rawim.transpose(2, 0, 1).astype(np.float32) #Model Preparation print "preparing caption generation models" model = FunctionSet() model.img_feature2vec = F.Linear(image_feature_dim, n_units) #CNN(I)の最後のレイヤーに相当。#parameter W,b model.embed = F.EmbedID(len(vocab), n_units) #W_e*S_tに相当 #parameter W model.l1_x = F.Linear(n_units, 4 * n_units) #parameter W,b model.l1_h = F.Linear(n_units, 4 * n_units) #parameter W,b model.out = F.Linear(n_units, len(vocab)) #parameter W,b serializers.load_hdf5(model_place, model) #To GPU if gpu_id >= 0: model.to_gpu() print "done" #Define Newtowork (Forward) #forward_one_step is after the CNN layer,
class Caption_generator(object):
    """Generate captions for an image with a pickled CNN and an LSTM decoder.

    The CNN (loaded from ``cnn_model_place``) produces an image feature that
    is fed to the LSTM as its first input; subsequent inputs are word indexes
    looked up through an embedding layer.  Captions are decoded either with
    beam search (``generate`` / ``get_top_sentence``) or greedily
    (``generate_temp``).
    """

    def __init__(self, caption_model_place, cnn_model_place, index2word_place, gpu_id=-1, beamsize=3):
        """Load the vocabulary, the CNN and the pre-trained caption model.

        Args:
            caption_model_place: path of the HDF5 file holding the caption
                model parameters.
            cnn_model_place: path of the pickled CNN function.
            index2word_place: path of the pickled index -> word mapping.
            gpu_id: GPU device id; use -1 to run on the CPU.
            beamsize: beam width used by ``generate``.
        """
        self.gpu_id = gpu_id
        self.beamsize = beamsize

        # The array module is kept as a module-level global, as the rest of
        # the class reads it through that name.
        global xp
        if self.gpu_id >= 0:
            xp = cuda.cupy
            cuda.get_device(gpu_id).use()
        else:
            xp = np

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file; only load vocabulary/CNN files from a trusted source.
        # Pickle data is binary, so the files are opened in 'rb' mode.
        with open(index2word_place, 'rb') as f:
            self.index2word = pickle.load(f)
        vocab = self.index2word

        with open(cnn_model_place, 'rb') as f:
            self.func = pickle.load(f)

        image_feature_dim = 1024  # dimension of the image feature
        self.n_units = 512  # number of units per layer
        n_units = self.n_units

        self.model = FunctionSet()
        # Corresponds to the last layer of CNN(I); parameters W, b.
        self.model.img_feature2vec = F.Linear(image_feature_dim, n_units)
        # Corresponds to W_e * S_t; parameter W.
        self.model.embed = F.EmbedID(len(vocab), n_units)
        self.model.l1_x = F.Linear(n_units, 4 * n_units)  # parameters W, b
        self.model.l1_h = F.Linear(n_units, 4 * n_units)  # parameters W, b
        self.model.out = F.Linear(n_units, len(vocab))  # parameters W, b
        serializers.load_hdf5(caption_model_place, self.model)  # pre-trained weights

        if gpu_id >= 0:
            self.model.to_gpu()
            self.func.to_gpu()

        # Warm-up run kept from the original author, who reported that the
        # model overflows on its very first CPU run only; running it once
        # here is meant to keep that out of real calls.
        if gpu_id < 0:
            numpy_image = np.ones((3, 224, 224), dtype=np.float32)
            self.generate(numpy_image)

    @staticmethod
    def _log_probability(probability):
        """Return log(probability), or -inf when the probability is not positive."""
        probability = float(probability)
        if probability > 0.0:
            return math.log(probability)
        # math.log(0.0) raises ValueError; an impossible word simply gets
        # the lowest possible score instead of aborting the search.
        return float('-inf')

    def feature_exractor(self, x_chainer_variable):
        """Run the CNN on an image batch and return its 'pool5/7x7_s1' output."""
        y, = self.func(inputs={'data': x_chainer_variable},
                       outputs=['pool5/7x7_s1'],
                       disable=['loss1/ave_pool', 'loss2/ave_pool', 'loss3/classifier'],
                       train=False)
        return y

    def forward_one_step_for_image(self, img_feature, state, volatile='on'):
        """Advance the LSTM one step using the image feature as input.

        Args:
            img_feature: chainer variable holding the CNN image feature.
            state: dict with the LSTM variables 'c1' and 'h1'.
            volatile: unused; kept for interface compatibility.

        Returns:
            (new state dict, softmax over the vocabulary).
        """
        h0 = self.model.img_feature2vec(img_feature)
        # Dropout is always applied in non-training mode here.
        h1_in = self.model.l1_x(F.dropout(h0, train=False)) + self.model.l1_h(state['h1'])
        c1, h1 = F.lstm(state['c1'], h1_in)
        y = self.model.out(F.dropout(h1, train=False))
        state = {'c1': c1, 'h1': h1}
        return state, F.softmax(y)

    def forward_one_step(self, cur_word, state, volatile='on'):
        """Advance the LSTM one step using a word index as input.

        Args:
            cur_word: array of word indexes (one per batch element).
            state: dict with the LSTM variables 'c1' and 'h1'.
            volatile: volatile flag passed to chainer.Variable.

        Returns:
            (new state dict, softmax over the vocabulary).
        """
        x = chainer.Variable(cur_word, volatile)
        h0 = self.model.embed(x)
        h1_in = self.model.l1_x(F.dropout(h0, train=False)) + self.model.l1_h(state['h1'])
        c1, h1 = F.lstm(state['c1'], h1_in)
        y = self.model.out(F.dropout(h1, train=False))
        state = {'c1': c1, 'h1': h1}
        return state, F.softmax(y)

    def beam_search(self, sentence_candidates, final_sentences, depth=1, beamsize=3):
        """Extend the candidate sentences until enough of them end with '<EOS>'.

        Args:
            sentence_candidates: list of tuples
                (word index list, LSTM state, log-likelihood).
            final_sentences: list that finished sentences are appended to, as
                tuples (word index list, probability, log-likelihood).
            depth: current search depth; the search stops at depth 50.
            beamsize: number of words expanded per candidate and number of
                candidates kept per step.

        Returns:
            ``final_sentences`` (the same list object that was passed in).
        """
        volatile = True
        while True:
            # Expand every live candidate with its `beamsize` most likely words.
            expanded = []
            for candidate in sentence_candidates:
                sentence, state, log_likely = candidate[0], candidate[1], candidate[2]
                last_word = xp.array([sentence[-1]], dtype=np.int32)
                next_state, predicted_word = self.forward_one_step(
                    last_word, state, volatile=volatile)
                probabilities = cuda.to_cpu(predicted_word.data)[0]
                for index in (-probabilities).argsort()[:beamsize]:
                    index = int(index)
                    expanded.append((
                        list(sentence) + [index],
                        next_state,
                        log_likely + self._log_probability(probabilities[index]),
                    ))

            # Keep the best `beamsize` expansions; those ending in <EOS> are done.
            scores = np.array([candidate[2] for candidate in expanded])
            survivors = []
            for i in (-scores).argsort()[:beamsize]:
                candidate = expanded[i]
                if self.index2word[candidate[0][-1]] == '<EOS>':
                    final_likely = candidate[2]
                    final_sentences.append(
                        (candidate[0], math.exp(final_likely), final_likely))
                else:
                    survivors.append(candidate)

            # With no survivors left nothing more can ever be produced, so
            # stop instead of idling until the depth limit.
            if len(final_sentences) >= beamsize or depth >= 50 or not survivors:
                return final_sentences
            depth += 1
            sentence_candidates = survivors

    def _encode_image(self, numpy_image):
        """Feed the image to the CNN and take the first LSTM step.

        Returns:
            (LSTM state dict, softmax over the vocabulary for the first word).
        """
        volatile = True
        batchsize = 1
        x_batch = np.empty((batchsize, 3, 224, 224), dtype=np.float32)
        x_batch[0] = numpy_image
        if self.gpu_id >= 0:
            x_batch = cuda.to_gpu(x_batch)
        x_batch_chainer = chainer.Variable(x_batch, volatile=volatile)

        state = {name: chainer.Variable(
                     xp.zeros((batchsize, self.n_units), dtype=np.float32), volatile)
                 for name in ('c1', 'h1')}
        img_feature = self.feature_exractor(x_batch_chainer)
        return self.forward_one_step_for_image(img_feature, state, volatile=volatile)

    def generate(self, numpy_image):
        """Generate captions for a numpy image array with beam search.

        Args:
            numpy_image: image array assignable to a (3, 224, 224) float32 slot.

        Returns:
            List of captions, one per hypothesis that reached '<EOS>'.  Each
            caption is a dict with the keys 'sentence' (list of words with the
            first and last token removed), 'probability' and
            'log_probability'.  The list is empty when no hypothesis finished.
        """
        state, predicted_word = self._encode_image(numpy_image)
        probabilities = cuda.to_cpu(predicted_word.data)[0]
        index = int(probabilities.argmax())
        # The beam accumulates log-likelihoods, so the first word must be
        # seeded with its log-probability rather than the raw probability.
        initial_sentence_candidates = [
            ([index], state, self._log_probability(probabilities[index]))]

        final_sentences = list()
        generated_sentence_candidates = self.beam_search(
            initial_sentence_candidates, final_sentences, beamsize=self.beamsize)

        # Convert word indexes to strings.
        return [
            {
                'sentence': [self.index2word[i] for i in sentence_tuple[0]][1:-1],
                'probability': sentence_tuple[1],
                'log_probability': sentence_tuple[2],
            }
            for sentence_tuple in generated_sentence_candidates
        ]

    def generate_temp(self, numpy_image):
        """Generate one caption greedily (always taking the most likely word).

        Args:
            numpy_image: image array assignable to a (3, 224, 224) float32 slot.

        Returns:
            The caption as a space-separated string; decoding stops at
            '<EOS>' or after 50 words.
        """
        volatile = True
        state, predicted_word = self._encode_image(numpy_image)
        # The word predicted from the image itself is not emitted (the
        # original code notes that it is <SOS>).
        words = []
        for _ in range(50):
            cur_word = predicted_word.data.argmax(1).astype(np.int32)
            state, predicted_word = self.forward_one_step(cur_word, state, volatile=volatile)
            index = int(cuda.to_cpu(predicted_word.data.argmax(1))[0])
            word = self.index2word[index]
            if word == '<EOS>':
                break
            words.append(word)
        return ' '.join(words)

    def get_top_sentence(self, numpy_image):
        """Return the highest-scoring beam-search caption as a string.

        Args:
            numpy_image: image array assignable to a (3, 224, 224) float32 slot.

        Returns:
            The words of the best caption joined by spaces, or '' when beam
            search produced no finished caption.
        """
        candidates = self.generate(numpy_image)
        if not candidates:
            return ''
        scores = [caption['log_probability'] for caption in candidates]
        top_caption = candidates[int(np.argmax(scores))]['sentence']
        return ' '.join(top_caption).strip()