# NOTE(review): this chunk is a fragment of a separate inference script —
# the first statement is the tail of a loop building an id->word lookup
# whose header is not visible here, and generate() below is truncated.
id_to_word[v] = k

# Load the CNN feature extractor and the caption decoder weights from disk.
image_model = VGG19()
image_model.load(args.image_model)
caption_net = ImageCaption(len(word_ids), feature_num, hidden_num)
serializers.load_hdf5(args.model, caption_net)

# Default to CPU arrays (numpy); switch to CuPy and move both models to the
# requested GPU when args.gpu >= 0.
xp = np
if args.gpu >= 0:
    cuda.check_cuda_available()
    gpu_device = args.gpu
    cuda.get_device(gpu_device).use()
    xp = cuda.cupy
    image_model.to_gpu(gpu_device)
    caption_net.to_gpu(gpu_device)

# Sentence-boundary token ids (this vocabulary uses upper-case markers,
# unlike the lower-case '<s>'/'</s>' used elsewhere in this file).
bos = word_ids['<S>']
eos = word_ids['</S>']

# One image path per non-empty line of the list file.
with open(args.list) as f:
    paths = filter(bool, f.read().split('\n'))


def generate(net, image_model, image_path):
    # Beam-search caption generation; a candidate is (net state, token ids,
    # score) — presumably the same convention as the beam search in main().
    # NOTE(review): body truncated in this chunk — only the feature
    # initialisation, the loop header and the reset of the next-candidate
    # list survive; max_length comes from an enclosing scope not visible here.
    feature = image_model.feature(image_path)
    net.initialize(feature)
    candidates = [(net, [bos], 0)]
    for i in range(max_length):
        next_candidates = []
def format_question(qa_id, tokens):
    """Build one caption-style result dict from a generated token list.

    Tokens are joined with single spaces, a question mark is appended, and
    the sentence is passed through str.capitalize() (upper-cases the first
    character and lower-cases the rest). The original code concatenated an
    empty string to every word (``word + ''``) — a no-op, removed here.
    """
    question = (' '.join(tokens) + '?').capitalize()
    return {'id': qa_id, 'image_id': qa_id, 'caption': question}


def main(args):
    """Generate a question for every test sample and dump the results as JSON.

    Loads the word->id vocabulary, preprocessed QA data and image features,
    restores the trained ImageCaption model from MODEL_PATH, decodes each
    test item with beam search until every hypothesis ends in '</s>' (or
    max_length steps pass), and writes a list of {id, image_id, caption}
    dicts to SAVE_PATH.
    """
    model_path = args.MODEL_PATH
    all_data_path = args.ALL_DATA_PATH
    all_feature_path = args.ALL_FEATURE_PATH
    save_path = args.SAVE_PATH

    with open(args.WORD2ID_PATH, 'rb') as f:
        word_id_dic = pickle.load(f)
    bos = word_id_dic['<s>']
    eos = word_id_dic['</s>']
    # Inverse vocabulary for decoding token ids back to words.
    id2word = {v: k for k, v in word_id_dic.items()}

    print('data loading...')
    with open(all_data_path, 'rb') as f:
        all_data = pickle.load(f)
    with open(all_feature_path, 'rb') as f:
        all_features = pickle.load(f)
    print('data loaded!')
    test_features, test_qa_ids, test_target_vecs = create_data(
        all_data, all_features, 'test')

    # These hyper-parameters must match the checkpoint being restored.
    feature_num = 2005
    hidden_num = 1024
    vocab_num = len(word_id_dic)
    attr_num = 5
    CaptionNet = ImageCaption(vocab_num, attr_num, feature_num, hidden_num)
    serializers.load_hdf5(model_path, CaptionNet)
    CaptionNet.to_gpu(gpu_device)

    max_length = 100
    question_list = []
    for i in tqdm(range(len(test_qa_ids))):
        qa_id = test_qa_ids[i]
        target_var = Variable(xp.array([test_target_vecs[i]], dtype=xp.float32))
        feature_var = Variable(xp.array([test_features[i]], dtype=xp.float32))
        # Inference only: disable train-mode behaviour and graph construction.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            CaptionNet.image_init(feature_var)
            candidates = [(CaptionNet, [bos], 0)]
            next_candidates = beam_search(candidates, target_var, norm=True)
            # Expand the beam until every surviving hypothesis ends in </s>
            # or the length cap is reached.
            for j in range(max_length):
                next_candidates = beam_search(next_candidates, target_var,
                                              norm=True)
                if all(x[1][-1] == eos for x in next_candidates):
                    break
        # Best hypothesis first; strip the leading <s> and the final token
        # (assumed to be </s> — NOTE(review): when max_length is hit without
        # </s>, the last real token is dropped too; confirm this is intended).
        best_ids = next_candidates[0][1][1:-1]
        tokens = [id2word[token_id] for token_id in best_ids]
        question_list.append([qa_id, tokens])

    all_list = [format_question(qa_id, tokens)
                for qa_id, tokens in question_list]
    with open(save_path, 'w') as f:
        json.dump(all_list, f)
# NOTE(review): fragment of yet another training script; `sentence_dataset`
# and `gpu_device` come from scope not visible in this chunk, and
# random_batches() below is truncated mid-body.
image_dataset = scipy.io.loadmat(args.image)
# (feature_dim, n_images) -> (n_images, feature_dim)
images = image_dataset['feats'].transpose((1, 0))
train_image_ids = sentence_dataset['images']['train']
train_sentences = sentence_dataset['sentences']['train']
test_image_ids = sentence_dataset['images']['test']
test_sentences = sentence_dataset['sentences']['test']
word_ids = sentence_dataset['word_ids']
feature_num = images.shape[1]
hidden_num = 512
batch_size = 128
# NOTE(review): Python 2 print statement — the rest of this file uses
# Python 3 print(); this line is a SyntaxError under Python 3 and needs
# porting before this fragment can run.
print 'word count: ', len(word_ids)
caption_net = ImageCaption(len(word_ids), feature_num, hidden_num)
if gpu_device is not None:
    caption_net.to_gpu(gpu_device)
optimizer = optimizers.Adam()
optimizer.setup(caption_net)
# Optionally resume from an existing checkpoint pair (<model>.model/.state).
if args.model is not None:
    serializers.load_hdf5(args.model + '.model', caption_net)
    serializers.load_hdf5(args.model + '.state', optimizer)
# Special-token ids (upper-case markers in this vocabulary).
bos = word_ids['<S>']
eos = word_ids['</S>']
unknown = word_ids['<UNK>']


def random_batches(image_groups, sentence_groups):
    # NOTE(review): truncated in this chunk — only the start of the body is
    # visible; presumably builds shuffled minibatches per length group.
    batches = []
    for image_ids, sentences in zip(image_groups, sentence_groups):
        length = len(sentences)
def main(args):
    """Train the ImageCaption question-generation network.

    Loads the word->id vocabulary, preprocessed QA data and image features,
    builds the ImageCaption model on the GPU, and runs `epoch_num` epochs of
    Adam minibatch training, reporting train and validation loss/accuracy
    each epoch and checkpointing model + optimizer every 10 epochs into
    args.MODEL_PATH.
    """
    model_save_path = args.MODEL_PATH
    all_data_path = args.ALL_DATA_PATH
    all_feature_path = args.ALL_FEATURE_PATH
    if not os.path.exists(model_save_path):
        print('make model save directory')
        os.mkdir(model_save_path)

    with open(args.WORD2ID_PATH, 'rb') as f:
        word_id_dic = pickle.load(f)
    eos = word_id_dic['</s>']

    print('all data loading...')
    with open(all_data_path, 'rb') as f:
        all_data = pickle.load(f)
    print('all data loaded!')
    print('all feature loading...')
    with open(all_feature_path, 'rb') as f:
        all_features = pickle.load(f)
    print('all feature loaded!')

    train_features, train_questions, train_target_vecs = create_data(
        all_data, all_features, 'train')
    valid_features, valid_questions, valid_target_vecs = create_data(
        all_data, all_features, 'valid')

    # Network / training hyper-parameters (feature_num = 2005 must match the
    # concatenated feature vectors produced by create_data).
    feature_num = 2005
    hidden_num = 1024
    vocab_num = len(word_id_dic)
    attr_num = 5
    epoch_num = 100
    batch_size = 100

    CaptionNet = ImageCaption(vocab_num, attr_num, feature_num, hidden_num)
    CaptionNet.to_gpu(gpu_device)
    optimizer = optimizers.Adam(alpha=4.0e-4)
    optimizer.setup(CaptionNet)

    N_train = len(train_target_vecs)
    N_valid = len(valid_target_vecs)
    for epoch in tqdm(range(epoch_num)):
        train_perm = np.random.permutation(N_train)
        sum_loss = 0
        sum_acc = 0
        sum_size = 0
        for index in range(0, N_train, batch_size):
            # cleargrads() supersedes the deprecated zerograds(); the file
            # already uses the Chainer v2+ config API, so it is available.
            CaptionNet.cleargrads()
            index_array = train_perm[index:index + batch_size]
            feature_var = Variable(xp.array(train_features[index_array]))
            question_var = Variable(xp.array(train_questions[index_array]))
            target_var = Variable(xp.array(train_target_vecs[index_array]))
            loss, acc, size = forward(target_var, feature_var, question_var,
                                      CaptionNet, eos)
            loss.backward()
            optimizer.update()
            sum_loss += loss.data.tolist()
            sum_acc += acc.tolist()
            sum_size += size.tolist()
            # NOTE(review): per-minibatch progress print; placement inferred
            # from the original (whitespace-mangled) source — confirm it was
            # not meant to be per-epoch.
            print('loss:', sum_loss / sum_size, 'acc:', sum_acc / sum_size)

        # Per-epoch validation pass (no parameter updates).
        valid_sum_loss = 0
        valid_sum_acc = 0
        valid_sum_size = 0
        for valid_index in range(0, N_valid, batch_size):
            valid_batch_features = valid_features[
                valid_index:valid_index + batch_size]
            valid_batch_questions = valid_questions[
                valid_index:valid_index + batch_size]
            valid_batch_targets = valid_target_vecs[
                valid_index:valid_index + batch_size]
            valid_feature_var = Variable(
                xp.array(valid_batch_features, dtype=xp.float32))
            valid_question_var = Variable(
                xp.array(valid_batch_questions, dtype=xp.float32))
            valid_target_var = Variable(
                xp.array(valid_batch_targets, dtype=xp.float32))
            # Evaluation mode: no dropout/BN updates, no graph construction.
            with chainer.using_config('train', False), \
                    chainer.no_backprop_mode():
                valid_loss, valid_acc, valid_size = forward(
                    valid_target_var, valid_feature_var, valid_question_var,
                    CaptionNet, eos)
            valid_sum_loss += valid_loss.data.tolist()
            valid_sum_acc += valid_acc.tolist()
            valid_sum_size += valid_size.tolist()

        print(
            "{:3d} epoch train loss : {:.5f}, acc : {:.5f} | valid loss : {:.5f}, acc : {:.5f}\n"
            .format(epoch, sum_loss / sum_size, sum_acc / sum_size,
                    valid_sum_loss / valid_sum_size,
                    valid_sum_acc / valid_sum_size))

        if (epoch > 0) and (epoch % 10 == 0):
            # os.path.join fixes the silent filename concatenation the raw
            # '+' produced when MODEL_PATH lacked a trailing separator
            # (e.g. 'modelsmodel_10.h5').
            serializers.save_hdf5(
                os.path.join(model_save_path, 'model_' + str(epoch) + '.h5'),
                CaptionNet)
            serializers.save_hdf5(
                os.path.join(model_save_path,
                             'optimizer_' + str(epoch) + '.h5'),
                optimizer)