Example #1
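# Not shown here: this snippet assumes pickle and numpy (as np) are
# imported, chainer's cuda and serializers modules are available, and
# VGG19, ImageCaption, and the parsed `args` namespace are defined.

# Model sizes and beam-search decoding settings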
feature_num = 4096
hidden_num = 512
beam_width = 20
max_length = 60

with open(args.sentence, 'rb') as f:
    sentence_dataset = pickle.load(f)
word_ids = sentence_dataset['word_ids']
id_to_word = {}
for k, v in word_ids.items():
    id_to_word[v] = k

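# Load the pretrained VGG19 image feature extractor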
image_model = VGG19()
image_model.load(args.image_model)

caption_net = ImageCaption(len(word_ids), feature_num, hidden_num)
serializers.load_hdf5(args.model, caption_net)

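# Default to NumPy; switch to CuPy and move both models to the GPU when requested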
xp = np
if args.gpu >= 0:
    cuda.check_cuda_available()
    gpu_device = args.gpu
    cuda.get_device(gpu_device).use()
    xp = cuda.cupy
    image_model.to_gpu(gpu_device)
    caption_net.to_gpu(gpu_device)

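# Token IDs marking the start and end of a generated caption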
bos = word_ids['<S>']
eos = word_ids['</S>']

with open(args.list) as f:
Example #2
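# Assumes pickle, scipy.io, and chainer's optimizers/serializers are
# imported, and that `args` and `gpu_device` were set by argument
# parsing not shown here.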
with open(args.sentence, 'rb') as f:
    sentence_dataset = pickle.load(f)
image_dataset = scipy.io.loadmat(args.image)
images = image_dataset['feats'].transpose((1, 0))

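# Train/test splits of image IDs and tokenized sentences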
train_image_ids = sentence_dataset['images']['train']
train_sentences = sentence_dataset['sentences']['train']
test_image_ids = sentence_dataset['images']['test']
test_sentences = sentence_dataset['sentences']['test']
word_ids = sentence_dataset['word_ids']
feature_num = images.shape[1]
hidden_num = 512
batch_size = 128

print('word count:', len(word_ids))
caption_net = ImageCaption(len(word_ids), feature_num, hidden_num)
if gpu_device is not None:
    caption_net.to_gpu(gpu_device)
optimizer = optimizers.Adam()
optimizer.setup(caption_net)

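# Optionally resume from a saved model/optimizer snapshot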
if args.model is not None:
    serializers.load_hdf5(args.model + '.model', caption_net)
    serializers.load_hdf5(args.model + '.state', optimizer)

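# Special token IDs: sentence start, sentence end, unknown word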
bos = word_ids['<S>']
eos = word_ids['</S>']
unknown = word_ids['<UNK>']

def random_batches(image_groups, sentence_groups):
    batches = []
Example #3
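# Assumes pickle, json, tqdm, and chainer (Variable, serializers) are
# imported, `xp`/`gpu_device` are set up for CPU or GPU execution, and
# ImageCaption, create_data(), and beam_search() are defined elsewhere.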
def main(args):
    model_path = args.MODEL_PATH
    all_data_path = args.ALL_DATA_PATH
    all_feature_path = args.ALL_FEATURE_PATH
    save_path = args.SAVE_PATH

    with open(args.WORD2ID_PATH, 'rb') as f:
        word_id_dic = pickle.load(f)

    bos = word_id_dic['<s>']
    eos = word_id_dic['</s>']
    unk = word_id_dic['<unk>']


    id2word = {v: k for k, v in word_id_dic.items()}

    print('data loading...')
    with open(all_data_path, 'rb') as f:
        all_data = pickle.load(f)
    with open(all_feature_path, 'rb') as f:
        all_features = pickle.load(f)

    print('data loaded!')

    test_features, test_qa_ids, test_target_vecs = create_data(all_data, all_features, 'test')

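    # Model hyperparameters; must match the checkpoint loaded below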
    feature_num = 2005
    hidden_num = 1024
    vocab_num = len(word_id_dic)
    attr_num = 5

    CaptionNet = ImageCaption(vocab_num, attr_num, feature_num, hidden_num)
    serializers.load_hdf5(model_path, CaptionNet)
    CaptionNet.to_gpu(gpu_device)

    beam_width = 3
    max_length = 100

    question_list = []

    for i in tqdm(range(len(test_qa_ids))):
        qa_id = test_qa_ids[i]
        target_vec = test_target_vecs[i]

        target_var = Variable(xp.array([target_vec], dtype=xp.float32))
        concat_feature = test_features[i]
        feature_var = Variable(xp.array([concat_feature], dtype=xp.float32))

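        # Initialise the decoder with the image features, in inference mode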
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            CaptionNet.image_init(feature_var)

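        # Beam search: each candidate is (model state, token sequence, score);
        # stop early once every candidate ends with the </s> token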
        candidates = [(CaptionNet, [bos], 0)]
        next_candidates = beam_search(candidates, target_var, norm=True)
        for j in range(max_length):
            next_candidates = beam_search(next_candidates, target_var, norm=True)
            if all([x[1][-1] == eos for x in next_candidates]):
                break
        result = [k[1] for k in next_candidates]
        tokens = [id2word[token_id] for token_id in result[0][1:-1]]
        question_list.append([qa_id, tokens])

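    # Format each generated question as a caption-style JSON record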
    all_list = []
    for each_question in question_list:
        each_dic = {}
        qa_id = each_question[0]
        question = each_question[1]
        join_question = (' '.join(question) + '?').capitalize()
        each_dic['id'] = qa_id
        each_dic['image_id'] = qa_id
        each_dic['caption'] = join_question
        all_list.append(each_dic)

    with open(save_path, 'w') as f:
        json.dump(all_list, f)
Example #4
def main():
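    # Assumes the module imports argparse, os, sys, json, and chainer,
    # plus webdnn's generate_descriptor, dump_dot, and console utilities
    # and the graph-building helpers used below.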
    sys.setrecursionlimit(10000)  # workaround for deep copying large graph

    parser = argparse.ArgumentParser()
    parser.add_argument("--backend", default="webgpu,webassembly")
    parser.add_argument("--encoding", default="eightbit")
    parser.add_argument('--out',
                        '-o',
                        default='webdnn/image-caption-model',
                        help='Directory to output the graph descriptor')
    parser.add_argument('--sentence',
                        '-s',
                        required=True,
                        type=str,
                        help='sentence dataset file path')
    parser.add_argument('--model',
                        '-m',
                        required=True,
                        type=str,
                        help='input model file path')
    parser.add_argument("--example_image",
                        help="example image for comparing output")
    parser.add_argument("--visualize_ir", action="store_true")

    args = parser.parse_args()

    os.makedirs(args.out, exist_ok=True)
    out_dir_graph1 = os.path.join(args.out, "image-feature")
    out_dir_graph2 = os.path.join(args.out, "caption-generation")

    hidden_num = 512
    with open(args.sentence, 'rb') as f:
        sentence_dataset = pickle.load(f)
    word_ids = sentence_dataset['word_ids']
    word_num = len(word_ids)
    id_to_word = [""] * word_num
    for k, v in word_ids.items():
        id_to_word[v] = k

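    # Save vocabulary metadata so the decoder running in the browser can map token IDs back to words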
    with open(os.path.join(args.out, "word_data.json"), "w") as f:
        json.dump(
            {
                "id_to_word": id_to_word,
                "bos_id": word_ids["<S>"],
                "eos_id": word_ids["</S>"],
                "word_num": word_num,
                "hidden_num": hidden_num
            }, f)

    caption_net = ImageCaption(word_num=word_num,
                               feature_num=2048,
                               hidden_num=hidden_num)
    chainer.serializers.load_hdf5(args.model, caption_net)
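    # graph1: image feature extraction; graph2: one caption-generation step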
    graph1 = generate_graph_model1(caption_net)
    graph2 = generate_graph_model2(caption_net, hidden_num=hidden_num)

    if args.example_image:
        example_io = generate_example_io(caption_net, word_ids,
                                         args.example_image)
        with open(os.path.join(args.out, "example_io.json"), "w") as f:
            json.dump(example_io, f)

    if args.visualize_ir:
        ir_dot_path = os.path.join(args.out, "ir.dot")
        with open(ir_dot_path, "w") as f:
            f.write(dump_dot(graph2))
        console.stderr(
            f"IR graph can be visualized with graphviz command: 'dot {ir_dot_path} -T png -o output.png'"
        )

    any_backend_failed = False
    last_backend_exception = None
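    # Generate descriptors for each requested backend; re-raise the last
    # failure only after all backends have been attempted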
    for backend in args.backend.split(","):
        try:
            graph_exec_data = generate_descriptor(
                backend, graph1, constant_encoder_name=args.encoding)
            graph_exec_data.save(out_dir_graph1)
            graph_exec_data = generate_descriptor(
                backend, graph2, constant_encoder_name=args.encoding)
            graph_exec_data.save(out_dir_graph2)
        except Exception as ex:
            any_backend_failed = True
            last_backend_exception = ex
            console.error(
                f"Failed generating descriptor for backend {backend}: {str(ex)}\n"
            )

    if any_backend_failed:
        raise last_backend_exception
Example #5
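# Assumes os, pickle, numpy as np, tqdm, and chainer (Variable,
# optimizers, serializers) are imported, `xp`/`gpu_device` are set up,
# and ImageCaption, create_data(), and forward() are defined elsewhere.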
def main(args):
    model_save_path = args.MODEL_PATH
    all_data_path = args.ALL_DATA_PATH
    all_feature_path = args.ALL_FEATURE_PATH

    if not os.path.exists(model_save_path):
        print('make model save directory')
        os.mkdir(model_save_path)

    with open(args.WORD2ID_PATH, 'rb') as f:
        word_id_dic = pickle.load(f)

    eos = word_id_dic['</s>']

    print('all data loading...')
    with open(all_data_path, 'rb') as f:
        all_data = pickle.load(f)
    print('all data loaded!')
    print('all feature loading...')
    with open(all_feature_path, 'rb') as f:
        all_features = pickle.load(f)
    print('all feature loaded!')

    train_features, train_questions, train_target_vecs = create_data(
        all_data, all_features, 'train')
    valid_features, valid_questions, valid_target_vecs = create_data(
        all_data, all_features, 'valid')

    feature_num = 2005
    hidden_num = 1024
    vocab_num = len(word_id_dic)
    attr_num = 5

    epoch_num = 100
    batch_size = 100

    CaptionNet = ImageCaption(vocab_num, attr_num, feature_num, hidden_num)
    CaptionNet.to_gpu(gpu_device)

    optimizer = optimizers.Adam(alpha=4.0e-4)

    optimizer.setup(CaptionNet)

    N_train = len(train_target_vecs)
    N_valid = len(valid_target_vecs)

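    # One training epoch: shuffle the training set, then iterate over mini-batches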
    for epoch in tqdm(range(epoch_num)):
        train_perm = np.random.permutation(N_train)
        sum_loss = 0
        sum_acc = 0
        sum_size = 0

        for index in range(0, N_train, batch_size):
            CaptionNet.cleargrads()
            index_array = train_perm[index:index + batch_size]
            batch_features = xp.array(train_features[index_array])
            batch_questions = xp.array(train_questions[index_array])
            batch_targets = xp.array(train_target_vecs[index_array])

            feature_var = Variable(batch_features)
            question_var = Variable(batch_questions)
            target_var = Variable(batch_targets)

            loss, acc, size = forward(target_var, feature_var, question_var,
                                      CaptionNet, eos)

            loss.backward()
            optimizer.update()
            sum_loss += loss.data.tolist()
            sum_acc += acc.tolist()
            sum_size += size.tolist()

        print('loss:', sum_loss / sum_size, 'acc:', sum_acc / sum_size)

        valid_sum_loss = 0
        valid_sum_acc = 0
        valid_sum_size = 0

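        # Validation pass with train mode and backprop disabled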
        for valid_index in range(0, N_valid, batch_size):
            valid_batch_features = valid_features[valid_index:valid_index +
                                                  batch_size]
            valid_batch_questions = valid_questions[valid_index:valid_index +
                                                    batch_size]
            valid_batch_targets = valid_target_vecs[valid_index:valid_index +
                                                    batch_size]

            valid_feature_var = Variable(
                xp.array(valid_batch_features, dtype=xp.float32))
            valid_question_var = Variable(
                xp.array(valid_batch_questions, dtype=xp.float32))
            valid_target_var = Variable(
                xp.array(valid_batch_targets, dtype=xp.float32))

            with chainer.using_config('train',
                                      False), chainer.no_backprop_mode():
                valid_loss, valid_acc, valid_size = forward(
                    valid_target_var, valid_feature_var, valid_question_var,
                    CaptionNet, eos)

            valid_sum_loss += valid_loss.data.tolist()
            valid_sum_acc += valid_acc.tolist()
            valid_sum_size += valid_size.tolist()

        print(
            "{:3d} epoch train loss : {:.5f}, acc : {:.5f} | valid loss : {:.5f}, acc : {:.5f}\n"
            .format(epoch, sum_loss / sum_size, sum_acc / sum_size,
                    valid_sum_loss / valid_sum_size,
                    valid_sum_acc / valid_sum_size))

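        # Checkpoint the model and optimizer every 10 epochs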
        if (epoch > 0) and (epoch % 10 == 0):
            serializers.save_hdf5(
                os.path.join(model_save_path, 'model_{}.h5'.format(epoch)),
                CaptionNet)
            serializers.save_hdf5(
                os.path.join(model_save_path, 'optimizer_{}.h5'.format(epoch)),
                optimizer)