Example #1
    def init_search_sentence(self, initial_memory, initial_output):
        """Seed one beam per image in the batch with the start token."""
        self.partial_caption_data = []
        self.complete_caption_data = []
        for k in range(self.batch_size):
            initial_beam = CaptionData(sentence=[self.start_id],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=0,
                                       alphas=[])
            self.partial_caption_data.append(TopN(self.beam_size))
            self.partial_caption_data[-1].push(initial_beam)
            self.complete_caption_data.append(TopN(self.beam_size))
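All four examples lean on two helpers, CaptionData and TopN, that this page does not show. A minimal sketch of what they plausibly look like; the heap-based TopN follows the pattern of Google's im2txt beam search, and the CaptionData fields match how the examples construct it, but both are assumptions rather than the projects' actual code:

import heapq

class CaptionData(object):
    """One beam hypothesis: a partial sentence plus the LSTM state behind it."""
    def __init__(self, sentence, memory, output, score, alphas=None):
        self.sentence = sentence  # list of word ids so far
        self.memory = memory      # LSTM cell state for this hypothesis
        self.output = output      # LSTM hidden state for this hypothesis
        self.score = score        # accumulated (log-)probability
        self.alphas = alphas      # attention weights, when tracked

    def __lt__(self, other):
        # heapq compares hypotheses by score
        return self.score < other.score

class TopN(object):
    """Keeps the n highest-scoring items pushed so far."""
    def __init__(self, n):
        self._n = n
        self._data = []

    def size(self):
        return len(self._data)

    def push(self, x):
        if len(self._data) < self._n:
            heapq.heappush(self._data, x)
        else:
            heapq.heappushpop(self._data, x)  # evict the current minimum

    def extract(self, sort=False):
        data = self._data
        self._data = []
        if sort:
            data.sort(reverse=True)  # best-scoring hypothesis first
        return data

    def reset(self):
        self._data = []

One caveat: a max-keeping TopN like this fits the multiplicative probability scores of Examples #3 and #4, where higher is better; Example #2 accumulates negative log-probabilities, where lower is better, so its TopN would need the opposite ordering or negated scores.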
Example #2
    def add_result(self, beam_idx, memory, output, scores, alpha):
        # Find the beam_size most probable next words
        for k in range(self.batch_size):
            caption_data = self.partial_caption_data_lists[k][beam_idx]
            words_and_scores = list(enumerate(scores[k]))
            words_and_scores.sort(key=lambda x: -x[1])
            # Take one spare candidate in case the end token is among the top picks
            words_and_scores = words_and_scores[0:self.beam_size + 1]

            # Append each of these words to the current partial caption
            for w, s in words_and_scores:
                sentence = caption_data.sentence + [w]
                # Accumulate negative log-probabilities, so lower scores are better
                score = caption_data.score - np.log2(s)
                alphas = caption_data.alphas + [alpha[k]]
                beam = CaptionData(sentence, memory[k], output[k], score,
                                   alphas)
                if w == self.end_id:
                    self.complete_caption_data[k].push(beam)
                else:
                    self.partial_caption_data[k].push(beam)
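Unlike Examples #3 and #4, which multiply raw probabilities, this version accumulates scores in log space (score - np.log2(s)), which avoids numerical underflow on long captions. A quick runnable check that the two scorings rank hypotheses identically, just in opposite directions:

import numpy as np

probs = [0.5, 0.25, 0.125]          # per-step word probabilities

product = np.prod(probs)            # 0.015625; underflows for long captions
neg_log = -np.sum(np.log2(probs))   # 1 + 2 + 3 = 6.0

# Same ranking, opposite direction: a higher product is a lower neg-log score.
assert np.isclose(2.0 ** -neg_log, product)
print(product, neg_log)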
Example #3
    def beam_search(self, sess, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output])

        partial_caption_data = []
        complete_caption_data = []
        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            # Only one live hypothesis exists before the first word is generated
            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    last_word = vocabulary.start_id * np.ones(
                        (config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    # The extra candidate covers the case where the end token is among them
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        if w == vocabulary.end_id:
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
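A sketch of how this method might be driven end to end. Config, CaptionModel, and Vocabulary are hypothetical placeholders for whatever the surrounding project defines; only beam_search and the CaptionData fields come from the example above:

import tensorflow as tf  # TF 1.x graph-and-session API, as used above

config = Config()             # hypothetical: holds batch_size, beam_size, ...
model = CaptionModel(config)  # hypothetical: builds the graph used above
vocabulary = Vocabulary()     # hypothetical: maps word ids to strings

with tf.Session() as sess:
    # In practice the trained weights would be restored from a checkpoint here.
    sess.run(tf.global_variables_initializer())
    results = model.beam_search(sess, vocabulary)

# results[k] lists the surviving beams for image k, best-scoring first.
for k, beams in enumerate(results):
    best = beams[0]
    caption = ' '.join(vocabulary.words[w] for w in best.sentence)
    print('image %d: %s (score %.4f)' % (k, caption, best.score))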
Example #4
    def beam_search(self, sess, image_files, vocabulary):
        """Use beam search to generate the captions for a batch of images."""
        # Feed in the images to get the contexts and the initial LSTM states
        config = self.config
        images = self.image_loader.load_images(image_files)
        contexts, initial_memory, initial_output = sess.run(
            [self.conv_feats, self.initial_memory, self.initial_output],
            feed_dict={self.images: images})

        partial_caption_data = []
        complete_caption_data = []

        alpha_all = []
        last_output_all = []

        for k in range(config.batch_size):
            initial_beam = CaptionData(sentence=[],
                                       memory=initial_memory[k],
                                       output=initial_output[k],
                                       score=1.0)
            partial_caption_data.append(TopN(config.beam_size))
            partial_caption_data[-1].push(initial_beam)
            complete_caption_data.append(TopN(config.beam_size))

        # Run beam search
        for idx in range(config.max_caption_length):
            partial_caption_data_lists = []
            for k in range(config.batch_size):
                data = partial_caption_data[k].extract()
                partial_caption_data_lists.append(data)
                partial_caption_data[k].reset()

            num_steps = 1 if idx == 0 else config.beam_size
            for b in range(num_steps):
                if idx == 0:
                    # Word id 0 serves as the start token in this variant
                    last_word = np.zeros((config.batch_size), np.int32)
                else:
                    last_word = np.array([
                        pcl[b].sentence[-1]
                        for pcl in partial_caption_data_lists
                    ], np.int32)

                last_memory = np.array(
                    [pcl[b].memory for pcl in partial_caption_data_lists],
                    np.float32)
                last_output = np.array(
                    [pcl[b].output for pcl in partial_caption_data_lists],
                    np.float32)

                memory, output, scores = sess.run(
                    [self.memory, self.output, self.probs],
                    feed_dict={
                        self.contexts: contexts,
                        self.last_word: last_word,
                        self.last_memory: last_memory,
                        self.last_output: last_output
                    })

                # Find the beam_size most probable next words
                for k in range(config.batch_size):
                    caption_data = partial_caption_data_lists[k][b]
                    words_and_scores = list(enumerate(scores[k]))
                    words_and_scores.sort(key=lambda x: -x[1])
                    words_and_scores = words_and_scores[0:config.beam_size + 1]

                    # Append each of these words to the current partial caption
                    for w, s in words_and_scores:
                        sentence = caption_data.sentence + [w]
                        score = caption_data.score * s
                        beam = CaptionData(sentence, memory[k], output[k],
                                           score)
                        # A period marks the end of a caption in this variant
                        if vocabulary.words[w] == '.':
                            complete_caption_data[k].push(beam)
                        else:
                            partial_caption_data[k].push(beam)

            # Fetch the attention weights for the current state (dumped to disk below)
            alpha_status = sess.run(self.alpha,
                                    feed_dict={
                                        self.contexts: contexts,
                                        self.last_memory: last_memory,
                                        self.last_output: last_output
                                    })

            alpha_all.append(alpha_status)
            last_output_all.append(last_output)

        with open('ResultProcess/alpha.pickle', 'wb') as f:
            pickle.dump(alpha_all, f)
        # with open('ResultProcess/last_output_all.pickle', 'wb') as f:
        #     pickle.dump(last_output_all, f)

        results = []
        for k in range(config.batch_size):
            if complete_caption_data[k].size() == 0:
                complete_caption_data[k] = partial_caption_data[k]
            results.append(complete_caption_data[k].extract(sort=True))

        return results
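Because this variant also dumps the recorded attention weights, a downstream script can reload them; a minimal sketch, assuming the file was written by the method above (alpha_all holds one entry per decoding step, each covering the whole batch):

import pickle

with open('ResultProcess/alpha.pickle', 'rb') as f:
    alpha_all = pickle.load(f)

# One entry per decoding step; each entry holds the batch's attention
# weights from that step (the exact shape depends on the model's alpha tensor).
print('decoding steps recorded:', len(alpha_all))
print('first step:', alpha_all[0].shape)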