Example #1
    def beam_search(self, sess, batch, vocabulary):
        def get_copy_word(sent, vocab2_size):
            # vocab2_size is the boundary, e.g. the index of the word "copy";
            # only referenced by the commented-out copy-mechanism code below
            for w in sent:
                if w > vocab2_size - 1:
                    return w
            return vocab2_size - 1

        config = self.config

        (a, b, c), (m_a, m_b, m_c), (l_a_, l_b_, l_c_), dst, m_dst, l_dst_ = batch
        feed_dict = {
            self.sentences1: a,
            self.sequence_length1: l_a_,
            self.sentences2: b,
            self.sequence_length2: l_b_,
            self.sentences3: c,
            self.sequence_length3: l_c_
        }

        cont1, cont2, cont3, initial_memory, initial_output = sess.run(
            [
                self.contexts1, self.contexts2, self.contexts3,
                self.initial_memory, self.initial_output
            ],
            feed_dict=feed_dict)

        #encode_state1_memory, encode_state1_output = encode_state1
        #encode_state2_memory, encode_state2_output = encode_state2

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = LogicData(sentence=[],
                                     memory=initial_memory[k],
                                     output=initial_output[k],
                                     score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        for idx in range(config.max_output_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()  # current top beams
                partial_caption_data_lists.append(data)  # one list per batch item
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)  # len(last_word): batch_size

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)  # shape: (batch_size, state_size)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)  # shape: (batch_size, state_size)

                # scores: batch_size * vocab2_size; scores[k]: vocab2_size
                # scores3: batch_size * max_input_length; scores3[k]: max_input_length
                memory, output, scores, scores2, scores3 = sess.run(
                    [
                        self.memory, self.output, self.probs, self.probs2,
                        self.probs3
                    ],
                    feed_dict={
                        self.b_ctx1: cont1,
                        self.b_ctx2: cont2,
                        self.b_ctx3: cont3,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })
                # Optional encoder-state feeds, currently disabled:
                #     self.encode_state_a_memory: encode_state1_memory,
                #     self.encode_state_a_output: encode_state1_output,
                #     self.encode_state_b_memory: encode_state2_memory,
                #     self.encode_state_b_output: encode_state2_output

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    # pair each word id with its probability: (word_id, prob)
                    words_and_scores = list(enumerate(scores[k]))

                    # Copy mechanism (disabled): overwrite the lowest-ranked
                    # candidate with a copied source word, scored by the
                    # probability of the "copy" token. Two variants were tried:
                    #words_and_scores[-1] = (get_copy_word(c[k], config.vocab2_size), scores[k][-1])
                    #words_and_scores[-1] = (c[k][np.argmax(scores3[k])], scores[k][-1])

                    # sort by probability, descending; keep one extra candidate
                    # in case one of them turns out to be the end token
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[:config.beam_size + 1]

                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = LogicData(
                            sentence,
                            memory[k],  # new memory
                            output[k],  # new output
                            score)
                        #if w >= config.vocab2_size:
                        #    print(w, s)
                        if vocabulary.words[w] == 'stop':  # mark the end
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
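These examples all lean on a TopN container and a LogicData / CaptionData record that are defined elsewhere in their repositories. A minimal sketch of what those helpers would have to look like, reconstructed from how they are used above; the real implementations may differ:

import heapq
from collections import namedtuple

# CaptionData in the image-captioning examples plays the same role.
LogicData = namedtuple('LogicData', ['sentence', 'memory', 'output', 'score'])

class TopN:
    """Keeps the n highest-scoring items pushed into it."""

    def __init__(self, n):
        self._n = n
        self._heap = []

    def size(self):
        return len(self._heap)

    def push(self, item):
        # heapq is a min-heap: the lowest score sits at index 0, so
        # heappushpop evicts the weakest beam once the heap is full.
        # id(item) breaks score ties without comparing the items themselves.
        entry = (item.score, id(item), item)
        if len(self._heap) < self._n:
            heapq.heappush(self._heap, entry)
        else:
            heapq.heappushpop(self._heap, entry)

    def extract(self, sort=False):
        items = [entry[-1] for entry in self._heap]
        if sort:
            items.sort(key=lambda x: -x.score)
        return items

    def reset(self):
        self._heap = []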
Example #2
    def beam_search(self, sess, batch, vocabulary):
        config = self.config

        (a, b, c), (m_a, m_b, m_c), (l_a_, l_b_, l_c_), dst, m_dst, l_dst_ = batch
        # this variant conditions only on the third input; the decode step
        # below runs without attention-context feeds
        feed_dict = {self.sentences3: c, self.sequence_length3: l_c_}

        initial_memory, initial_output = sess.run(
            [self.initial_memory, self.initial_output], feed_dict=feed_dict)

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = LogicData(sentence=[],
                                     memory=initial_memory[k],
                                     output=initial_output[k],
                                     score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        for idx in range(config.max_output_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()  # current top beams
                partial_caption_data_lists.append(data)  # one list per batch item
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)  # len(last_word): batch_size

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)  # shape: (batch_size, state_size)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)  # shape: (batch_size, state_size)

                # scores: batch_size * vocab2_size; scores[k]: vocab2_size
                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    # pair each word id with its probability: (word_id, prob)
                    words_and_scores = list(enumerate(scores[k]))

                    words_and_scores.sort(
                        key=lambda x: -x[1])  # x[1]: prob; x[0]:idx
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = LogicData(
                            sentence,
                            memory[k],  # new memory
                            output[k],  # new output
                            score)
                        #if w >= config.vocab2_size:
                        #    print(w, s)
                        if vocabulary.words[w] == 'stop':  # mark the end
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
Example #3
    def beam_search(self, image):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        images = np.array([self.preprocess(image)], np.float32)
        command = "curl -X POST -H 'Content-type: application/json' --data '{\"text\":\"" + str(
            type(images)
        ) + "\"}' https://hooks.slack.com/services/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"
        os.system(command)

        contexts, initial_memory, initial_output = self.sess.run(
            [
                self.model.conv_feats, self.model.initial_memory,
                self.model.initial_output
            ],
            feed_dict={self.model.images: images})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(self.config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(self.config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(self.config.beam_size))

        # Run beam search
        for idx in range(self.config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(self.config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else self.config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((self.config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = self.sess.run(
                    [self.model.memory, self.model.output, self.model.probs],
                    feed_dict={
                        self.model.contexts: contexts,
                        self.model.last_word: last_word,
                        self.model.last_memory: last_memory,
                        self.model.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(self.config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:self.config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if self.vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(self.config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
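The beams in results still hold word ids. A small sketch of turning the best beam per image into a readable caption, assuming vocabulary.words maps ids to word strings as the code above does:

def best_captions(results, vocabulary):
    # results[k] is a list of CaptionData beams, sorted best-first by
    # extract(sort=True); sentence is a list of ids into vocabulary.words.
    captions = []
    for beams in results:
        best = beams[0]
        captions.append(' '.join(vocabulary.words[w] for w in best.sentence))
    return captions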
Example #4
def main(argv):
    restore_path = argv.get('restore_path', None)
    save_path = argv['save_path']
    # producer = data_producer.DataProducer(argv['json_path'], argv['batch_size'], argv['max_step'])

    attention_model = attention.AttentionModel(argv)
    _probs, _last_output, _last_memory = attention_model.init_inference()

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())

        if restore_path:
            saver = tf.train.Saver()
            saver.restore(sess, restore_path)

        batch_size = 1
        beam_size = 3
        max_caption_length = 40
        images = cv2.imread('images/test1.jpg')  # one test image; batched as [images] below

        feed_dict = attention_model.feed_dict([images])
        initial_memory, initial_output = sess.run(
            [attention_model.initial_memory, attention_model.initial_output],
            feed_dict)

        partial_caption_data = []
        complete_caption_data = []
        for k in range(batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(beam_size))

        # Run beam search
        for idx in range(max_caption_length):
            partial_caption_data_lists = []
            for k in range(batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                feed_dict = attention_model.feed_dict([images],
                                                      last_word=last_word,
                                                      last_memory=last_memory,
                                                      last_output=last_output)
                scores, output, memory = sess.run(
                    [_probs, _last_output, _last_memory], feed_dict)

                # Find the beam_size most probable next words
                for k in range(batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if w == 3581:  # hard-coded id of the end-of-sentence token
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        for r in results:
            for i in r:
                print(i.sentence)
        return results
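Note that main takes a plain dict rather than sys.argv. A hypothetical invocation, covering only the keys the function itself reads ('save_path', 'restore_path'); AttentionModel may require further configuration keys not shown here, and the checkpoint paths are made up:

if __name__ == '__main__':
    main({
        'save_path': 'checkpoints/attention',           # read by main()
        'restore_path': 'checkpoints/attention-40000',  # optional checkpoint to restore
    })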
Example #5
    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
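A sketch of how this variant would be called, assuming model is an instance of the class this method belongs to, sess is a live session with weights restored, and config.batch_size matches the number of files (the paths are hypothetical):

image_files = ['images/cat.jpg', 'images/dog.jpg']
results = model.beam_search(sess, image_files, vocabulary)
for beams in results:
    best = beams[0]  # beams come back sorted best-first
    print(' '.join(vocabulary.words[w] for w in best.sentence))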
Example #6
    def beam_sample(self, sess, cont1, cont2, cont3, initial_memory,
                    initial_output, vocabulary):
        config = self.config

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = LogicData(sentence=[],
                                     memory=initial_memory[k],
                                     output=initial_output[k],
                                     score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        for idx in range(config.max_output_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                if idx > 0:
                    assert partial_caption_data[k].size() == config.beam_size
                data = partial_caption_data[k].extract()  # current top beams
                partial_caption_data_lists.append(data)  # one list per batch item
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)  # shape: (batch_size, state_size)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)  # shape: (batch_size, state_size)

                # scores: batch_size * vocab2_size; scores[k]: vocab2_size
                # scores3: batch_size * max_input_length; scores3[k]: max_input_length
                memory, output, scores, scores2, scores3, sample_word = sess.run(
                    [
                        self.memory, self.output, self.probs, self.probs2,
                        self.probs3, self.sample_word
                    ],
                    feed_dict={
                        self.b_ctx1: cont1,
                        self.b_ctx2: cont2,
                        self.b_ctx3: cont3,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })
                # Optional encoder-state feeds, currently disabled:
                #     self.encode_state_a_memory: encode_state1_memory,
                #     self.encode_state_a_output: encode_state1_output,
                #     self.encode_state_b_memory: encode_state2_memory,
                #     self.encode_state_b_output: encode_state2_output

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    # Unlike the other examples, the candidate words here come
                    # from self.sample_word rather than a top-k over the full
                    # distribution; the disabled alternatives are kept below.
                    #words_and_scores[-1] = (get_copy_word(c[k], config.vocab2_size), scores[k][-1])
                    #words_and_scores[-1] = (c[k][np.argmax(scores3[k])], scores[k][-1])
                    #words_and_scores = sorted(enumerate(scores[k]), key=lambda x: -x[1])[:config.beam_size + 1]
                    samples = sample_word[k]
                    words_and_scores = [(_word, scores[k][_word])
                                        for _word in samples]

                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = LogicData(
                            sentence,
                            memory[k],  # new memory
                            output[k],  # new output
                            score)
                        #if w >= config.vocab2_size:
                        #    print(w, s)
                        if vocabulary.words[w] == 'stop':  # mark the end
                            complete_caption_data[k].push(beam)
                            # keep the finished beam in the partial set too, so
                            # it stays at beam_size entries (see assert above)
                            partial_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
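One caveat that applies to every example above: scores are accumulated by multiplying raw probabilities (score = caption_data.score * s), which underflows on long outputs and biases the search toward short sentences. A common refinement, not used in any of these examples, is to accumulate log-probabilities and length-normalize when ranking finished beams; a sketch using the GNMT length penalty (Wu et al., 2016):

import numpy as np

def step_score(prev_log_score, word_prob):
    # accumulate in log space instead of multiplying probabilities
    return prev_log_score + np.log(max(word_prob, 1e-12))

def normalized_score(beam, alpha=0.7):
    # GNMT length penalty; alpha in [0, 1] trades off caption length
    penalty = ((5.0 + len(beam.sentence)) / 6.0) ** alpha
    return beam.score / penalty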