Code Example #1
File: infer.py  Project: katha-shah/iksha
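# NOTE: excerpt only. It assumes the original file's module-level imports,
# e.g. numpy as np, os, keras.preprocessing.sequence, plus project modules
# (CaptioningData, Encoder, Decoder, load_vocab, beam_search_predictions, cfg).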
def main():
    # Optional custom image (used only by the commented-out encode call below)
    infer_img_path = "C:/Users/Crypto/PycharmProjects/segmented_style_transfer/data"
    infer_img_name = 'two_boys.jpg'
    # Read and Process Raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    vocab = load_vocab(vocab_path=cfg.data_path, vocab_name=cfg.vocab_name)
    # print(vocab.word2idx)
    inception_encoding = Encoder()

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    decoder_model.load_weights('model/best_weights.97-0.95.hdf5')

    # Pick an arbitrary validation image to caption
    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[19]

    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)
    # enc_img = inception_encoding.encode_single_img(file_path=infer_img_path, img_name=infer_img_name)

    # Greedy decoding: repeatedly feed the partial caption back into the
    # decoder and append the most probable next word until <end> (or the
    # length cap) is reached.
    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        # Pad the partial caption to the input length the decoder was built with
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=caption_maxlen,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)

        if word_pred == "<end>" or len(caption) > caption_maxlen:
            break

    # Compare against the ground-truth captions for this image
    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    # Drop <start>; drop <end> only if decoding actually emitted it
    generated = caption[1:-1] if caption[-1] == "<end>" else caption[1:]
    print(' '.join(generated))

    # Re-decode the same image with beam search at several beam widths
    for beam_size in [3, 5, 7]:
        caption = beam_search_predictions(vocab,
                                          enc_img,
                                          decoder_model,
                                          caption_maxlen,
                                          beam_index=beam_size)
        print(beam_size, caption)
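A note on beam_search_predictions: it is defined elsewhere in the project and is not shown in this excerpt. As a rough illustration only, a minimal beam search decoder over the same decoder_model interface might look like the sketch below; the name beam_search_sketch and all implementation details are assumptions, not the project's actual code.

import numpy as np
from keras.preprocessing import sequence

def beam_search_sketch(vocab, enc_img, decoder_model, max_len, beam_index=3):
    # Hypothetical sketch, not the project's beam_search_predictions.
    # Each beam is a (token-id sequence, cumulative log-probability) pair.
    beams = [([vocab("<start>")], 0.0)]
    while len(beams[0][0]) < max_len:
        candidates = []
        for seq, score in beams:
            padded = sequence.pad_sequences([seq], maxlen=max_len,
                                            padding='post')
            preds = decoder_model.predict(
                [np.array([enc_img]), np.array(padded)])[0]
            # Expand this beam with its beam_index most probable next tokens
            for idx in np.argsort(preds)[-beam_index:]:
                candidates.append((seq + [int(idx)],
                                   score + np.log(preds[idx] + 1e-12)))
        # Keep only the beam_index highest-scoring partial captions
        beams = sorted(candidates, key=lambda c: c[1])[-beam_index:]
    words = [vocab.idx2word[i] for i in beams[-1][0]]
    return ' '.join(w for w in words if w not in ("<start>", "<end>"))

A fuller implementation would also stop extending beams that have already emitted <end>; this sketch keeps them in the pool for brevity.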
Code Example #2
File: main.py  Project: katha-shah/iksha
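# NOTE: excerpt only. It assumes the original file's module-level imports,
# e.g. numpy as np, os, keras.preprocessing.sequence, keras's RMSprop and
# ModelCheckpoint, plus project modules (CaptioningData, Encoder, Decoder,
# ImgCaptionPair, build_vocab, load_vocab, data_generator, cfg).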
def main():
    construct_vocab = False  # rebuild the vocabulary instead of loading it
    encode_images = False    # re-encode training images instead of loading cached encodings
    train = True             # fit the decoder before running inference

    # Read and Process Raw data
    data = CaptioningData()
    # Finding image files as data
    data.set_all_images(cfg.images_path)
    captions_dict = data.get_captions(cfg.token_file)
    caption_maxlen = data.get_caption_maxlen()

    # Construct vocabulary
    if construct_vocab:
        # get all caption to construct Vocab
        all_captions = data.get_all_captions()
        vocab = build_vocab(vocab_path=cfg.data_path,
                            vocab_name=cfg.vocab_name,
                            captions=all_captions,
                            threshold=2)
    else:
        vocab = load_vocab(vocab_path=cfg.data_path, vocab_name=cfg.vocab_name)
    # print(vocab.word2idx)
    inception_encoding = Encoder()

    # train data
    if train:
        train_images = data.get_train_images(cfg.train_image_files)
        train_pairs = [
            ImgCaptionPair(img_id, captions_dict[img_id])
            for img_id in train_images
        ]

        # Image Encoding

        if encode_images:
            train_img_encoding = inception_encoding.encode_images(
                file_path=cfg.images_path,
                image_list=train_images,
                encoding_file=cfg.train_img_encoding_file)
        else:
            train_img_encoding = inception_encoding.load_image_encoding(
                encoding_file=cfg.train_img_encoding_file)

        train_data_generator = data_generator(vocab,
                                              train_pairs,
                                              train_img_encoding,
                                              batch_size=1800,
                                              max_len=caption_maxlen)
        # next(g)

    # Decoder model
    decoder = Decoder(vocab_size=len(vocab),
                      embedding_size=300,
                      input_shape=2048,
                      caption_max_len=caption_maxlen)
    decoder_model = decoder.get_model()
    # Warm-start from a previously saved checkpoint
    decoder_model.load_weights('best_weights.97-0.95.hdf5')

    if train:
        decoder_model.compile(loss='categorical_crossentropy',
                              optimizer=RMSprop(),
                              metrics=['accuracy'])
        # Periodic snapshot every 30 epochs, regardless of loss
        ckpt = ModelCheckpoint('weights.{epoch:02d}-{loss:.2f}.hdf5',
                               monitor='loss',
                               verbose=0,
                               save_best_only=False,
                               save_weights_only=False,
                               mode='auto',
                               period=30)
        # Best-so-far checkpoint (lowest training loss), checked every epoch
        best_ckpt = ModelCheckpoint('best_weights.{epoch:02d}-{loss:.2f}.hdf5',
                                    monitor='loss',
                                    verbose=0,
                                    save_best_only=True,
                                    save_weights_only=False,
                                    mode='auto',
                                    period=1)
        decoder_model.fit_generator(train_data_generator,
                                    steps_per_epoch=30,
                                    epochs=100,
                                    callbacks=[ckpt, best_ckpt])

    decoder_model.save('decoder_model.h5')

    # Pick an arbitrary validation image to caption
    img_ids = data.get_val_images(cfg.val_image_files)
    img_name = img_ids[9]

    enc_img = inception_encoding.encode_single_img(file_path=cfg.images_path,
                                                   img_name=img_name)

    # Greedy decoding: repeatedly feed the partial caption back into the
    # decoder and append the most probable next word until <end> (or the
    # length cap) is reached.
    caption = ["<start>"]
    while True:
        par_caps = [vocab(i) for i in caption]
        # Pad the partial caption to the input length the decoder was built with
        par_caps = sequence.pad_sequences([par_caps],
                                          maxlen=caption_maxlen,
                                          padding='post')
        preds = decoder_model.predict(
            [np.array([enc_img]), np.array(par_caps)])
        word_pred = vocab.idx2word[np.argmax(preds[0])]
        caption.append(word_pred)

        if word_pred == "<end>" or len(caption) > caption_maxlen:
            break

    # Compare against the ground-truth captions for this image
    full_img_path = os.path.join(cfg.images_path, img_name)
    print(captions_dict[img_name])
    print(full_img_path)
    # Drop <start>; drop <end> only if decoding actually emitted it
    generated = caption[1:-1] if caption[-1] == "<end>" else caption[1:]
    print(' '.join(generated))
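A note on data_generator: like the encoder and decoder classes, it is project code not shown in this excerpt. The standard pattern for training a captioning decoder with categorical_crossentropy, as main() compiles it above, is to expand every (image, caption) pair into prefix / next-word samples. The sketch below illustrates that pattern under stated assumptions (it treats each pair as an (img_id, list-of-captions) tuple, whereas the project wraps them in ImgCaptionPair objects); it is not the project's actual generator.

import numpy as np
from keras.preprocessing import sequence
from keras.utils import to_categorical

def data_generator_sketch(vocab, pairs, img_encodings, batch_size, max_len):
    # Hypothetical sketch: yields ([image_features, padded_prefixes], next_words)
    # batches indefinitely, as Keras fit_generator expects.
    while True:
        imgs, prefixes, targets = [], [], []
        for img_id, captions in pairs:
            for cap in captions:
                ids = [vocab(w) for w in ("<start> " + cap + " <end>").split()]
                for i in range(1, len(ids)):
                    imgs.append(img_encodings[img_id])  # 2048-d image features
                    prefixes.append(ids[:i])            # tokens seen so far
                    targets.append(to_categorical(ids[i],
                                                  num_classes=len(vocab)))
                    if len(imgs) == batch_size:
                        padded = sequence.pad_sequences(prefixes, maxlen=max_len,
                                                        padding='post')
                        yield [np.array(imgs), padded], np.array(targets)
                        imgs, prefixes, targets = [], [], []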