# --- Caption-generation inference setup (script fragment) ---
# NOTE(review): this fragment is truncated — the body of the final `with` block
# is not visible here. Relies on `args`, `np`, `cuda`, `pickle`, `serializers`,
# `VGG19` and `ImageCaption` being defined/imported elsewhere in the file.

# Model hyper-parameters: VGG19 feature width and LSTM hidden size —
# presumably matched to the trained checkpoint; confirm against training code.
feature_num = 4096
hidden_num = 512
beam_width = 20   # beam width used during caption search
max_length = 60   # hard cap on generated caption length (tokens)

# Load the pickled sentence dataset and pull out the word -> id vocabulary.
with open(args.sentence, 'rb') as f:
    sentence_dataset = pickle.load(f)
word_ids = sentence_dataset['word_ids']

# Build the inverse vocabulary (id -> word) for decoding generated ids.
id_to_word = {}
for k, v in word_ids.items():
    id_to_word[v] = k

# Image encoder (VGG19) and the caption decoder, restored from HDF5 checkpoints.
image_model = VGG19()
image_model.load(args.image_model)
caption_net = ImageCaption(len(word_ids), feature_num, hidden_num)
serializers.load_hdf5(args.model, caption_net)

# Array module: numpy on CPU, cupy when a GPU id (>= 0) is given.
xp = np
if args.gpu >= 0:
    cuda.check_cuda_available()
    gpu_device = args.gpu
    cuda.get_device(gpu_device).use()
    xp = cuda.cupy
    image_model.to_gpu(gpu_device)
    caption_net.to_gpu(gpu_device)

# Sentence boundary token ids.
bos = word_ids['<S>']
eos = word_ids['</S>']

# Truncated below: iterates over the image list file (body not visible here).
with open(args.list) as f:
# --- Caption-model training setup (script fragment) ---
# NOTE(review): this fragment is truncated — `random_batches` has no visible
# body. The bare `print` statement below is Python 2 syntax; the surrounding
# file is presumably Python 2. `gpu_device`, `args`, `scipy`, `optimizers`,
# `serializers` and `ImageCaption` come from elsewhere in the file.

# Sentence dataset: pickled dict with train/test image ids, sentences and vocab.
with open(args.sentence, 'rb') as f:
    sentence_dataset = pickle.load(f)
# Pre-extracted image features from a MATLAB file; transposed so that
# rows index images and columns index feature dimensions.
image_dataset = scipy.io.loadmat(args.image)
images = image_dataset['feats'].transpose((1, 0))
train_image_ids = sentence_dataset['images']['train']
train_sentences = sentence_dataset['sentences']['train']
test_image_ids = sentence_dataset['images']['test']
test_sentences = sentence_dataset['sentences']['test']
word_ids = sentence_dataset['word_ids']

# Feature width is taken from the loaded matrix rather than hard-coded.
feature_num = images.shape[1]
hidden_num = 512
batch_size = 128
print 'word count: ', len(word_ids)

caption_net = ImageCaption(len(word_ids), feature_num, hidden_num)
if gpu_device is not None:
    caption_net.to_gpu(gpu_device)
optimizer = optimizers.Adam()
optimizer.setup(caption_net)

# Optionally resume training: restore both model weights and optimizer state.
if args.model is not None:
    serializers.load_hdf5(args.model + '.model', caption_net)
    serializers.load_hdf5(args.model + '.state', optimizer)

# Special vocabulary token ids.
bos = word_ids['<S>']
eos = word_ids['</S>']
unknown = word_ids['<UNK>']

# Truncated below: groups data into shuffled mini-batches (body not visible here).
def random_batches(image_groups, sentence_groups):
    batches = []
def main(args):
    """Generate a question for every test QA pair via beam search, dump as JSON.

    Loads the vocabulary, pickled QA data and pre-extracted features, restores
    the trained ImageCaption network from ``args.MODEL_PATH``, decodes one
    question per test item with beam search, and writes COCO-caption-style
    records (``id`` / ``image_id`` / ``caption``) to ``args.SAVE_PATH``.

    NOTE(review): depends on module-level ``gpu_device`` and ``xp`` and the
    helpers ``create_data`` / ``beam_search`` defined elsewhere in this file.
    """
    model_path = args.MODEL_PATH
    all_data_path = args.ALL_DATA_PATH
    all_feature_path = args.ALL_FEATURE_PATH
    save_path = args.SAVE_PATH

    with open(args.WORD2ID_PATH, 'rb') as f:
        word_id_dic = pickle.load(f)
    bos = word_id_dic['<s>']
    eos = word_id_dic['</s>']
    unk = word_id_dic['<unk>']
    # Inverse vocabulary for decoding generated token ids back to words.
    id2word = {v: k for k, v in word_id_dic.items()}

    print('data loading...')
    with open(all_data_path, 'rb') as f:
        all_data = pickle.load(f)
    with open(all_feature_path, 'rb') as f:
        all_features = pickle.load(f)
    print('data loaded!')

    test_features, test_qa_ids, test_target_vecs = create_data(
        all_data, all_features, 'test')

    # Network hyper-parameters — must match the trained checkpoint.
    feature_num = 2005
    hidden_num = 1024
    vocab_num = len(word_id_dic)
    attr_num = 5
    CaptionNet = ImageCaption(vocab_num, attr_num, feature_num, hidden_num)
    serializers.load_hdf5(model_path, CaptionNet)
    CaptionNet.to_gpu(gpu_device)

    beam_width = 3
    max_length = 100
    question_list = []
    for i in tqdm(range(len(test_qa_ids))):
        qa_id = test_qa_ids[i]
        target_vec = test_target_vecs[i]
        target_var = Variable(xp.array([target_vec], dtype=xp.float32))
        concat_feature = test_features[i]
        feature_var = Variable(xp.array([concat_feature], dtype=xp.float32))
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            CaptionNet.image_init(feature_var)
            # Seed the beam with <s>, then expand one step outside the loop
            # and up to max_length further steps, stopping early once every
            # candidate has emitted </s>.
            candidates = [(CaptionNet, [bos], 0)]
            next_candidates = beam_search(candidates, target_var, norm=True)
            for j in range(max_length):
                next_candidates = beam_search(next_candidates, target_var,
                                              norm=True)
                if all([x[1][-1] == eos for x in next_candidates]):
                    break
        result = [k[1] for k in next_candidates]
        # Best candidate's token ids; strip the leading <s> and trailing </s>.
        tokens = [id2word[token_id] for token_id in result[0][1:-1]]
        question_list.append([qa_id, tokens])

    # Convert to COCO-caption-style records for the downstream evaluator.
    all_list = []
    for qa_id, question in question_list:
        # Original code joined `word + ''` — a no-op concatenation; a plain
        # join produces the identical string.
        join_question = (' '.join(question) + '?').capitalize()
        each_dic = {}
        each_dic['id'] = qa_id
        each_dic['image_id'] = qa_id
        each_dic['caption'] = join_question
        all_list.append(each_dic)

    with open(save_path, 'w') as f:
        json.dump(all_list, f)
def main():
    """Export the trained caption model as WebDNN graph descriptors.

    Writes vocabulary metadata, optional example I/O and IR visualization,
    then generates a descriptor per requested backend for both the
    image-feature graph and the caption-generation graph.
    """
    # Deep-copying the large computation graph can exceed the default limit.
    sys.setrecursionlimit(10000)

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--backend", default="webgpu,webassembly")
    arg_parser.add_argument("--encoding", default="eightbit")
    arg_parser.add_argument('--out', '-o', default='webdnn/image-caption-model',
                            help='Directory to output the graph descriptor')
    arg_parser.add_argument('--sentence', '-s', required=True, type=str,
                            help='sentence dataset file path')
    arg_parser.add_argument('--model', '-m', required=True, type=str,
                            help='input model file path')
    arg_parser.add_argument("--example_image",
                            help="example image for comparing output")
    arg_parser.add_argument("--visualize_ir", action="store_true")
    opts = arg_parser.parse_args()

    os.makedirs(opts.out, exist_ok=True)
    feature_out_dir = os.path.join(opts.out, "image-feature")
    caption_out_dir = os.path.join(opts.out, "caption-generation")

    hidden_num = 512
    with open(opts.sentence, 'rb') as f:
        dataset = pickle.load(f)
    vocab = dataset['word_ids']
    word_num = len(vocab)

    # Inverse vocabulary as a list indexed by word id.
    inverse_vocab = [""] * word_num
    for word, idx in vocab.items():
        inverse_vocab[idx] = word

    # Persist the metadata the JavaScript runtime needs to decode captions.
    word_data = {
        "id_to_word": inverse_vocab,
        "bos_id": vocab["<S>"],
        "eos_id": vocab["</S>"],
        "word_num": word_num,
        "hidden_num": hidden_num
    }
    with open(os.path.join(opts.out, "word_data.json"), "w") as f:
        json.dump(word_data, f)

    caption_net = ImageCaption(word_num=word_num, feature_num=2048,
                               hidden_num=hidden_num)
    chainer.serializers.load_hdf5(opts.model, caption_net)
    graph1 = generate_graph_model1(caption_net)
    graph2 = generate_graph_model2(caption_net, hidden_num=hidden_num)

    if opts.example_image:
        example_io = generate_example_io(caption_net, vocab, opts.example_image)
        with open(os.path.join(opts.out, "example_io.json"), "w") as f:
            json.dump(example_io, f)

    if opts.visualize_ir:
        ir_dot_path = os.path.join(opts.out, "ir.dot")
        with open(ir_dot_path, "w") as f:
            f.write(dump_dot(graph2))
        console.stderr(
            f"IR graph can be visualized with graphviz command: 'dot {ir_dot_path} -T png -o output.png'"
        )

    # Try every requested backend; remember failures and re-raise the last
    # one only after all backends have been attempted.
    any_backend_failed = False
    last_backend_exception = None
    for backend in opts.backend.split(","):
        try:
            descriptor = generate_descriptor(
                backend, graph1, constant_encoder_name=opts.encoding)
            descriptor.save(feature_out_dir)
            descriptor = generate_descriptor(
                backend, graph2, constant_encoder_name=opts.encoding)
            descriptor.save(caption_out_dir)
        except Exception as ex:
            any_backend_failed = True
            last_backend_exception = ex
            console.error(
                f"Failed generating descriptor for backend {backend}: {str(ex)}\n"
            )
    if any_backend_failed:
        raise last_backend_exception
def main(args):
    """Train the ImageCaption question-generation network.

    Loads the vocabulary, pickled QA data and pre-extracted features, then
    runs mini-batch Adam training for ``epoch_num`` epochs, evaluating on the
    validation split each epoch and checkpointing model + optimizer every
    10 epochs into ``args.MODEL_PATH``.

    NOTE(review): depends on module-level ``gpu_device`` and ``xp`` and the
    helpers ``create_data`` / ``forward`` defined elsewhere in this file.
    """
    model_save_path = args.MODEL_PATH
    all_data_path = args.ALL_DATA_PATH
    all_feature_path = args.ALL_FEATURE_PATH
    if not os.path.exists(model_save_path):
        print('make model save directory')
        # makedirs (vs mkdir) also creates missing parent directories.
        os.makedirs(model_save_path)

    with open(args.WORD2ID_PATH, 'rb') as f:
        word_id_dic = pickle.load(f)
    eos = word_id_dic['</s>']

    print('all data loading...')
    with open(all_data_path, 'rb') as f:
        all_data = pickle.load(f)
    print('all data loaded!')
    print('all feature loading...')
    with open(all_feature_path, 'rb') as f:
        all_features = pickle.load(f)
    print('all feature loaded!')

    train_features, train_questions, train_target_vecs = create_data(
        all_data, all_features, 'train')
    valid_features, valid_questions, valid_target_vecs = create_data(
        all_data, all_features, 'valid')

    # Hyper-parameters.
    feature_num = 2005
    hidden_num = 1024
    vocab_num = len(word_id_dic)
    attr_num = 5
    epoch_num = 100
    batch_size = 100

    CaptionNet = ImageCaption(vocab_num, attr_num, feature_num, hidden_num)
    CaptionNet.to_gpu(gpu_device)
    optimizer = optimizers.Adam(alpha=4.0e-4)
    optimizer.setup(CaptionNet)

    N_train = len(train_target_vecs)
    N_valid = len(valid_target_vecs)
    for epoch in tqdm(range(epoch_num)):
        # Fresh random mini-batch order each epoch.
        train_perm = np.random.permutation(N_train)
        sum_loss = 0
        sum_acc = 0
        sum_size = 0
        for index in range(0, N_train, batch_size):
            CaptionNet.zerograds()
            index_array = train_perm[index:index + batch_size]
            batch_features = xp.array(train_features[index_array])
            batch_questions = xp.array(train_questions[index_array])
            batch_targets = xp.array(train_target_vecs[index_array])
            feature_var = Variable(batch_features)
            question_var = Variable(batch_questions)
            target_var = Variable(batch_targets)
            loss, acc, size = forward(target_var, feature_var, question_var,
                                      CaptionNet, eos)
            loss.backward()
            optimizer.update()
            sum_loss += loss.data.tolist()
            sum_acc += acc.tolist()
            sum_size += size.tolist()
            # Running (cumulative) averages, printed once per mini-batch.
            print('loss:', sum_loss / sum_size, 'acc:', sum_acc / sum_size)

        # Validation pass: no dropout/batch-norm updates, no gradients.
        valid_sum_loss = 0
        valid_sum_acc = 0
        valid_sum_size = 0
        for valid_index in range(0, N_valid, batch_size):
            valid_batch_features = valid_features[valid_index:valid_index + batch_size]
            valid_batch_questions = valid_questions[valid_index:valid_index + batch_size]
            valid_batch_targets = valid_target_vecs[valid_index:valid_index + batch_size]
            valid_feature_var = Variable(
                xp.array(valid_batch_features, dtype=xp.float32))
            valid_question_var = Variable(
                xp.array(valid_batch_questions, dtype=xp.float32))
            valid_target_var = Variable(
                xp.array(valid_batch_targets, dtype=xp.float32))
            with chainer.using_config('train', False), chainer.no_backprop_mode():
                valid_loss, valid_acc, valid_size = forward(
                    valid_target_var, valid_feature_var, valid_question_var,
                    CaptionNet, eos)
            valid_sum_loss += valid_loss.data.tolist()
            valid_sum_acc += valid_acc.tolist()
            valid_sum_size += valid_size.tolist()

        print(
            "{:3d} epoch train loss : {:.5f}, acc : {:.5f} | valid loss : {:.5f}, acc : {:.5f}\n"
            .format(epoch, sum_loss / sum_size, sum_acc / sum_size,
                    valid_sum_loss / valid_sum_size,
                    valid_sum_acc / valid_sum_size))

        if (epoch > 0) and (epoch % 10 == 0):
            # os.path.join (vs string concatenation) keeps checkpoints inside
            # MODEL_PATH even when it lacks a trailing separator; plain
            # concatenation silently wrote e.g. "dirmodel_10.h5".
            serializers.save_hdf5(
                os.path.join(model_save_path, 'model_' + str(epoch) + '.h5'),
                CaptionNet)
            serializers.save_hdf5(
                os.path.join(model_save_path, 'optimizer_' + str(epoch) + '.h5'),
                optimizer)