def _process_parts_of_speech(images, vocab, tagger):
    """Processes a list of images.
    Args:
        images: a list containing CategoryMetadata objects.
        vocab: a Vocabulary object.
        tagger: a Tagger object from nltk.
    Returns:
        a list of PartOfSpeechMetadata objects.
    """
    POS_map = get_parts_of_speech()
    POS_images = []
    for image in images:
        
        caption = image.captions[0]
        parts_of_speech_ids = POS_map.word_to_id(caption, tagger)
        POS_images.append(PartOfSpeechMetadata(
            image_id=image.image_id, 
            filename=image.filename, 
            captions=image.captions, 
            image_features=image.image_features, 
            object_features=image.object_features,
            running_ids=image.running_ids, 
            running_ids_splits=image.running_ids_splits,
            word_ids=image.word_ids, 
            pointer_ids=image.pointer_ids,
            attributes=image.attributes,
            coarse=image.coarse,
            fine=image.fine,
            plurality=image.plurality,
            parts_of_speech_ids=parts_of_speech_ids))
            
    return POS_images
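
A minimal usage sketch (not part of the original file), assuming `images` and `vocab` come from the earlier preprocessing steps and that NLTK's averaged-perceptron tagger is an acceptable `tagger` for the part-of-speech map:

from nltk.tag import PerceptronTagger

# Hypothetical wiring: `images` (a list of CategoryMetadata) and `vocab` are
# assumed to already exist from the preprocessing pipeline.
tagger = PerceptronTagger()
pos_images = _process_parts_of_speech(images, vocab, tagger)
print(pos_images[0].parts_of_speech_ids)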
Example 2
def main(unused_argv):

    image = load_image_from_path("images/image.jpg")[np.newaxis, ...]

    vocab, pretrained_matrix = load_glove(vocab_size=100, embedding_size=50)
    pos, pos_embeddings = get_parts_of_speech(), np.random.normal(
        0, 0.1, [15, 50])
    with tf.Graph().as_default():

        inputs = tf.placeholder(tf.float32, shape=image.shape)
        box_extractor = BoxExtractor(get_faster_rcnn_config(), top_k_boxes=16)
        boxes, scores, cropped_inputs = box_extractor(inputs)
        feature_extractor = FeatureExtractor()
        mean_image_features = tf.reduce_mean(feature_extractor(inputs), [1, 2])
        mean_object_features = tf.reshape(
            tf.reduce_mean(feature_extractor(cropped_inputs), [1, 2]),
            [1, 16, 2048])
        image_captioner = PartOfSpeechImageCaptioner(
            UpDownCell(50), vocab, pretrained_matrix,
            UpDownCell(50), UpDownCell(50), pos, pos_embeddings)
        pos_logits, pos_logits_ids, word_logits, word_logits_ids = image_captioner(
            mean_image_features=mean_image_features,
            mean_object_features=mean_object_features)

        with tf.Session() as sess:

            box_saver = tf.train.Saver(var_list=box_extractor.variables)
            resnet_saver = tf.train.Saver(var_list=feature_extractor.variables)

            box_saver.restore(sess, get_faster_rcnn_checkpoint())
            resnet_saver.restore(sess, get_resnet_v2_101_checkpoint())
            sess.run(tf.variables_initializer(image_captioner.variables))

            results = sess.run(
                [pos_logits, pos_logits_ids, word_logits, word_logits_ids],
                feed_dict={inputs: image})

            assert (results[2].shape[0] == 1 and results[2].shape[1] == 3
                    and results[2].shape[3] == 100)
            tf.logging.info("Successfully passed test.")
def _to_sequence_example(image, vocab):
    """Builds a SequenceExample proto for an image-caption pair.
    Args:
        image: A PreextractedMetadata object.
        vocab: A Vocabulary object.
    Returns:
        A SequenceExample proto.
    """
    with tf.gfile.FastGFile(image.filename, "rb") as f:
        encoded_image = f.read()

    context = tf.train.Features(feature={
        "image/image_id": _int64_feature(image.image_id),
        "image/data": _bytes_feature(encoded_image),
    })
    assert len(image.captions) == 1
    POS_map = get_parts_of_speech()
    caption = image.captions[0]
    caption_ids = [vocab.start_id] + vocab.word_to_id(caption) + [vocab.end_id]
    parts_of_speech_ids = [POS_map.start_id] + image.parts_of_speech_ids + [POS_map.end_id]
    feature_lists = tf.train.FeatureLists(feature_list={
        "image/caption": _bytes_feature_list([bytes(c, "utf-8") for c in caption]),
        "image/caption_ids": _int64_feature_list(caption_ids),
        "image/image_features": _float_feature_list(image.image_features.flatten().tolist()),
        "image/image_features_shape": _int64_feature_list(image.image_features.shape),
        "image/object_features": _float_feature_list(image.object_features.flatten().tolist()),
        "image/object_features_shape": _int64_feature_list(image.object_features.shape),
        "image/running_ids": _int64_feature_list(image.running_ids),
        "image/running_ids_splits": _int64_feature_list(image.running_ids_splits),
        "image/word_ids": _int64_feature_list(image.word_ids),
        "image/pointer_ids": _int64_feature_list(image.pointer_ids),
        "image/attributes": _int64_feature_list(image.attributes),
        "image/coarse": _int64_feature_list(image.coarse),
        "image/fine": _int64_feature_list(image.fine),
        "image/plurality": _int64_feature_list(image.plurality),
        "image/parts_of_speech_ids": _int64_feature_list(parts_of_speech_ids),
    })
    sequence_example = tf.train.SequenceExample(
        context=context, feature_lists=feature_lists)

    return sequence_example
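
A minimal sketch (not from the original repo) of how one of these protos could be read back with the TF 1.x parsing ops; the function name and the subset of feature keys shown here are illustrative:

def _parse_sequence_example(serialized):
    """Parses a serialized SequenceExample written by _to_sequence_example."""
    context, sequence = tf.parse_single_sequence_example(
        serialized,
        context_features={
            "image/image_id": tf.FixedLenFeature([], dtype=tf.int64),
            "image/data": tf.FixedLenFeature([], dtype=tf.string)},
        sequence_features={
            "image/caption_ids": tf.FixedLenSequenceFeature([], dtype=tf.int64),
            "image/parts_of_speech_ids": tf.FixedLenSequenceFeature([], dtype=tf.int64)})
    return (context["image/image_id"],
            sequence["image/caption_ids"],
            sequence["image/parts_of_speech_ids"])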
from detailed_captioning.inputs.mean_image_and_object_features_and_parts_of_speech_only import import_mscoco


PRINT_STRING = """({3:.2f} img/sec) iteration: {0:05d}\n    caption: {1}\n    label: {2}"""
tf.logging.set_verbosity(tf.logging.INFO)
tf.flags.DEFINE_integer("batch_size", 1, "")
tf.flags.DEFINE_integer("beam_size", 3, "")
tf.flags.DEFINE_boolean("is_mini", False, "")
tf.flags.DEFINE_string("mode", "eval", "")
FLAGS = tf.flags.FLAGS


if __name__ == "__main__":
    
    vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300)
    pos, pos_embeddings = get_parts_of_speech(), np.random.normal(0, 0.1, [15, 300])
    with tf.Graph().as_default():

        (image_id, image_features, object_features, input_seq, target_seq, indicator, 
                pos_input_seq, pos_target_seq, pos_indicator) = import_mscoco(
            mode=FLAGS.mode, batch_size=FLAGS.batch_size, num_epochs=1, is_mini=FLAGS.is_mini)
        up_down_caption_cell = UpDownCell(300, name="up_down_caption_cell")
        up_down_decoder_cell = UpDownCell(300, name="up_down_decoder_cell")
        up_down_encoder_cell = UpDownCell(300, name="up_down_encoder_cell")
        image_captioner = PartOfSpeechImageCaptioner(
            up_down_caption_cell, vocab, pretrained_matrix,
            up_down_decoder_cell, up_down_encoder_cell, pos, pos_embeddings)
        pos_logits, pos_ids, word_logits, word_ids = image_captioner(
            mean_image_features=image_features, 
            mean_object_features=object_features)
        
Example 5
def main(unused_argv):
    
    vocab, pretrained_matrix = load_glove(vocab_size=100000, embedding_size=300)
    pos, pos_embeddings = get_parts_of_speech(), np.random.normal(0, 0.1, [15, 300])
    with tf.Graph().as_default():

        (image_id, image_features, object_features, input_seq, target_seq, indicator, 
                pos_input_seq, pos_target_seq, pos_indicator) = import_mscoco(
            mode="train", batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs, is_mini=FLAGS.is_mini)
        up_down_caption_cell = UpDownCell(300, name="up_down_caption_cell")
        up_down_decoder_cell = UpDownCell(300, name="up_down_decoder_cell")
        up_down_encoder_cell = UpDownCell(300, name="up_down_encoder_cell")
        image_captioner = PartOfSpeechImageCaptioner(
            up_down_caption_cell, vocab, pretrained_matrix,
            up_down_decoder_cell, up_down_encoder_cell, pos, pos_embeddings)
        pos_logits, pos_ids, word_logits, word_ids = image_captioner(
            mean_image_features=image_features, 
            mean_object_features=object_features,
            word_seq_inputs=input_seq, word_lengths=tf.reduce_sum(indicator, axis=1),
            pos_seq_inputs=pos_input_seq, pos_seq_outputs=pos_target_seq, 
            pos_lengths=tf.reduce_sum(pos_indicator, axis=1))
        tf.losses.sparse_softmax_cross_entropy(target_seq, word_logits, weights=indicator)
        tf.losses.sparse_softmax_cross_entropy(pos_target_seq, pos_logits, weights=pos_indicator)
        loss = tf.losses.get_total_loss()
        
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer()
        learning_step = optimizer.minimize(loss, var_list=image_captioner.variables, global_step=global_step)

        captioner_saver = tf.train.Saver(var_list=image_captioner.variables + [global_step])
        captioner_ckpt, captioner_ckpt_name = get_up_down_part_of_speech_checkpoint()
        with tf.Session() as sess:
            
            sess.run(tf.variables_initializer(optimizer.variables()))
            if captioner_ckpt is not None:
                captioner_saver.restore(sess, captioner_ckpt)
            else:
                sess.run(tf.variables_initializer(image_captioner.variables + [global_step]))
            captioner_saver.save(sess, captioner_ckpt_name, global_step=global_step)
            last_save = time.time()
            
            for i in itertools.count():
                
                time_start = time.time()
                try:
                    _target, _ids, _loss, _learning_step = sess.run([target_seq, word_ids, 
                        loss, learning_step])
                except tf.errors.OutOfRangeError:
                    # the input pipeline raises OutOfRangeError once num_epochs is exhausted
                    break
                    
                iteration = sess.run(global_step)
                    
                print(PRINT_STRING.format(
                    iteration, _loss, 
                    list_of_ids_to_string(_ids[0, :].tolist(), vocab), 
                    list_of_ids_to_string(_target[0, :].tolist(), vocab), 
                    FLAGS.batch_size / (time.time() - time_start)))
                
                new_save = time.time()
                if new_save - last_save > 3600: # save the model every hour
                    captioner_saver.save(sess, captioner_ckpt_name, global_step=global_step)
                    last_save = new_save
                    
            captioner_saver.save(sess, captioner_ckpt_name, global_step=global_step)
            print("Finishing training.")