Example #3
def load_latest_imgcap(checkpoint_path, ckpt_index=-1):
    embedding_dim = 256
    units = 512
    vocab_size = TOP_K + 1

    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)
    optimizer = tf.keras.optimizers.Adam()

    ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
    ckpt_man = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=None)
    ckpt.restore(ckpt_man.checkpoints[ckpt_index])

    return encoder, decoder
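
# Hedged usage sketch (not part of the original snippet; the checkpoint directory is a
# placeholder): ckpt_index selects which retained checkpoint to restore, e.g. the
# oldest one still on disk rather than the newest.
#
#     encoder, decoder = load_latest_imgcap('./checkpoints/train', ckpt_index=0)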
Example #4
def load_decoder(fname,
                 embedding_dim,
                 units,
                 batch_size=BATCH_SIZE,
                 vocab_size=vocab_size):

    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    input_shape = [(batch_size, 1),
                   (batch_size, attention_features_shape, embedding_dim),
                   (batch_size, units)]
    decoder.build(input_shape)
    decoder.load_weights(fname)

    return decoder
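
# Hedged note (an inference from the build() call above, not from the original): the
# three shapes correspond to the decoder's expected inputs --
#   (batch_size, 1)                                        previous token id,
#   (batch_size, attention_features_shape, embedding_dim)  encoded image features,
#   (batch_size, units)                                     GRU hidden state.
# A minimal call after loading might look like (file name is a placeholder):
#
#     decoder = load_decoder('decoder_weights.h5', embedding_dim=256, units=512)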
Example #5
def main(args):

	# image processing
	transform = transforms.Compose([
		transforms.ToTensor(),
		transforms.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225))])

	# Load vocabulary wrapper
	with open(args.vocab_path, 'rb') as f:
		vocab = pickle.load(f)

	# Build models
	encoder = CNN_Encoder(args.embed_size).eval() # eval mode (batchnorm uses moving mean/variance)
	encoder = encoder.to(device)
	decoder = RNN_Decoder(args.embed_size,args.hidden_dims,len(vocab),args.num_layers)
	decoder = decoder.to(device)

	# Load the trained model parameters
	encoder.load_state_dict(torch.load(args.encoder_path))
	decoder.load_state_dict(torch.load(args.decoder_path))

	# Prepare an image
	image = load_image(args.image,transform)
	image_tensor = image.to(device)

	# Generate a caption from the image
	feature = encoder(image_tensor)
	sampled_ids = decoder.sample(feature)
	sampled_ids = sampled_ids[0].cpu().numpy()

	# Convert word_ids to words
	sampled_caption = []
	for word_id in sampled_ids:
		word = vocab.idx2word[word_id]
		sampled_caption.append(word)
		if word == '<end>':
			break
	sentence =' '.join(sampled_caption)

	# Print out the image and the generated caption
	print(sentence)
	image = Image.open(args.image)
	plt.imshow(np.asarray(image))
Example #6
img_name_train, img_name_val, cap_train, cap_val = train_test_split(
    img_name_vector, cap_vector, test_size=0.1, random_state=0)

vocab_size = len(tokenizer.word_index) + 1
print('vocab_size:' + str(vocab_size))
num_steps = len(img_name_train) // BATCH_SIZE
val_num_steps = len(img_name_val) // BATCH_SIZE
# shape of the vector extracted from InceptionV3 is (64, 2048)
# these two variables represent that
features_shape = 2048
attention_features_shape = 64

dataset = load_batch(img_name_train, cap_train, BATCH_SIZE, BUFFER_SIZE)
val_dataset = load_batch(img_name_val, cap_val, BATCH_SIZE, BUFFER_SIZE)

encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

optimizer = tf.train.AdamOptimizer()

checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
ckpt_manager = tf.train.CheckpointManager(ckpt,
                                          checkpoint_path,
                                          max_to_keep=50)

start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

# adding this in a separate cell because if you run the training cell
# many times, the loss_plot array will be reset
loss_plot = []
Example #7
class InstgramCaptioner:
    def __init__(self, checkpoint_path, tokenizer_path, CONFIG):
        """Load weights of encoder-decoder model from checkpoint. Load saved tokenizer.

        Args:
            checkpoint_path (str): path to directory containing checkpoints
            tokenizer_path (str): path to pickle file storing tokenizer
            CONFIG (CONFIG object): an object storing the configuration for package
        """

        self.cnn_backbone = model_config_dict[CONFIG.CNN_BACKBONE]['model']
        self.cnn_feature_model = self._reconfigure_cnn()

        self.encoder = CNN_Encoder(CONFIG.EMBEDDING_SIZE)
        self.decoder = RNN_Decoder(CONFIG.EMBEDDING_SIZE, CONFIG.UNITS,
                                   CONFIG.VOCAB_SIZE)

        ckpt = tf.train.Checkpoint(encoder=self.encoder, decoder=self.decoder)

        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_path,
                                                  max_to_keep=5)
        #chosen_checkpoint = ckpt_manager.checkpoints[2]
        chosen_checkpoint = ckpt_manager.latest_checkpoint
        ckpt.restore(chosen_checkpoint)

        if ckpt_manager.latest_checkpoint:
            print("******** Restored from {}".format(chosen_checkpoint))
        else:
            print("******** Initializing from scratch.")

        self.tokens_manager = pickle.load(open(tokenizer_path, 'rb'))

    @timer
    def generate_caption(self, image_path):
        """Use a CNN-GRU model to predict the caption to an image.

        Args:
            image_path (str): the path to the serialized image - png/jpg/jpeg.

        Returns:
            result: a list of strings in a sequence representing predicted caption.
        """

        # max_length = 47 on this dataset
        max_length = self.tokens_manager.max_length
        print('MAX LENGTH: ', max_length)

        attention_plot = np.zeros(
            (max_length,
             model_config_dict[CONFIG.CNN_BACKBONE]['attention_features_shape']
             ))
        # hidden.shape = [1, 512]
        # features.shape = [1, 49, 256]
        # decoder_input.shape = [1, 1]
        hidden = self.decoder.reset_state(batch_size=1)

        img = self._load_image(image_path)
        features = self._create_img_encoding(img)
        decoder_input = tf.expand_dims(
            [self.tokens_manager.tokenizer.word_index['<start>']], 0)

        result = []
        for i in range(max_length):
            # we could use the code below instead to generate randomness in sentence creation - useful for production
            # but not the testing here: tf.random.categorical(predictions, 1, seed=42)[0][0].numpy()
            predictions, hidden, attention_weights = self.decoder(
                decoder_input, features, hidden)

            attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

            predicted_id = np.argmax(predictions)
            result.append(
                self.tokens_manager.tokenizer.index_word[predicted_id])

            if self.tokens_manager.tokenizer.index_word[
                    predicted_id] == '<end>':
                return result, attention_plot

            decoder_input = tf.expand_dims([predicted_id], 0)

        attention_plot = attention_plot[:len(result), :]
        return result, attention_plot

    def _create_img_encoding(self, img):
        """Encode the image using the CNN (e.g. MobileNetV2) and pass through a fully connected layer to embed the image's features.

        Args:
            img (tf.Tensor or np.ndarray): the decoded, preprocessed image returned by _load_image.

        Returns:
            features: a tensorflow Tensor object of dim [batch_size, cnn_feature_shape, embedding_dim] (e.g. [1, 49, 256])
        """

        temp_input = tf.expand_dims(img,
                                    0)  # this is like saying batch_size = 1
        cnn_output = self.cnn_feature_model(temp_input)
        cnn_output = tf.reshape(cnn_output,
                                (cnn_output.shape[0], -1, cnn_output.shape[3]))
        features = self.encoder(cnn_output)

        return features

    def _reconfigure_cnn(self):
        """Reconfigures the CNN architecture, removing the final layer (and ImageNet classification layer).

        Returns:
            tf.keras.Model: the reconfigured architecture (e.g. MobileNetV2).
        """

        model = self.cnn_backbone(include_top=False, weights='imagenet')
        new_input = model.input
        remaining_desired_architecture = model.layers[-1].output
        reconfigured_cnn = tf.keras.Model(new_input,
                                          remaining_desired_architecture)
        return reconfigured_cnn

    def _load_image(self, image_path):
        """load_image function following the convention of keras preprocessing operations for consistency with training code.

        Args:
            image_path (str or np.ndarray): path to a serialized img (png/jpg/jpeg), or an already-decoded image array

        Returns:
            img: Tensor of image resized to e.g. (224, 224)
        """

        if isinstance(image_path, str):
            img = tf.io.read_file(image_path)
            img = tf.image.decode_jpeg(img, channels=3)

        elif isinstance(image_path, np.ndarray):
            img = image_path

        img = tf.image.resize(
            img, model_config_dict[CONFIG.CNN_BACKBONE]['input_shape'])
        img = tf.keras.applications.imagenet_utils.preprocess_input(img)
        return img

    def _plot_attention(self, img, attention_plot, result):

        fig, ax = plt.subplots(figsize=(10, 10))

        len_result = len(result)
        for l in range(len_result):
            temp_att = np.resize(attention_plot[l], (8, 8))
            ax = fig.add_subplot(len_result // 2, len_result // 2, l + 1)
            ax.set_title(result[l])
            matplotlib_img = ax.imshow(img)
            ax.imshow(temp_att,
                      cmap='gray',
                      alpha=0.6,
                      extent=matplotlib_img.get_extent())

        plt.tight_layout()
        plt.savefig('attention_plot.png')
        plt.show()

    @timer
    def test_img_from_mscoco(self,
                             idx,
                             caption_filename_tuple_path,
                             output_file='current_img.png'):
        """Test the model on an image from the downloaded dataset. This requires the caption_filename_tuple to have
            been generated and pickled using utils.organise_data(). 

            Example:

                train_captions, img_name_vector = utils.organise_data()
                caption_filename_tuple = list(zip(train_captions, img_name_vector))

                with open(os.path.join(captions_dir,'caption_filename_tuple.pkl'), 'wb') as pickle_file:
                    pickle.dump(caption_filename_tuple, pickle_file)

                tokenizer_path = os.path.join(CONFIG.CACHE_DIR_ROOT, f'{CONFIG.CNN_BACKBONE}_captions', 'coco_tokenizer.pkl') 
                checkpoint_path = '/mnt/pythonfiles/models/mobilenet_v2_bahdanau/checkpoints/train/02012021-183517'
                #model 31122020-180918 shows the best results so far

                caption_bot = InstgramCaptioner(checkpoint_path, tokenizer_path, CONFIG)
                caption_filename_tuple_path = os.path.join(CONFIG.CACHE_DIR_ROOT, f'{CONFIG.CNN_BACKBONE}_captions', 'caption_filename_tuple.pkl')

                idx = int(sys.argv[1])
                caption_bot.test_img_from_mscoco(idx, caption_filename_tuple_path)
            
        Args:
            idx (int): index into the caption_filename_tuple to select an image for inference.
            caption_filename_tuple_path (str): path to the caption_filename_tuple.
            output_file (str, optional): path to output_file location. Defaults to 'current_img.png'.
        """

        caption_filename_tuple = pickle.load(
            open(caption_filename_tuple_path, 'rb'))
        current_img_path = caption_filename_tuple[idx][1]

        # remove <start> and <end> tokens and convert to string
        ground_truth_caption = ' '.join(
            caption_filename_tuple[idx][0].split(' ')[1:-1])

        # forward pass on the model
        result, attention_plot = self.generate_caption(current_img_path)
        gen_caption = ' '.join(result[:-1])

        logger.info(f' The caption PREDICTED by caption_bot '.center(80, '*'))
        logger.info(gen_caption)
        logger.info(f' The LABELLED ground truth caption '.center(80, '*'))
        logger.info(ground_truth_caption)

        # cv2 operations to annotate the image with predicted and ground-truth captions
        current_img = cv2.imread(current_img_path)
        cv2.rectangle(current_img, (15, 25), (current_img.shape[1] - 15, 85),
                      (95, 95, 95), cv2.FILLED)
        cv2.putText(current_img, gen_caption, (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (60, 30, 255), 1,
                    cv2.LINE_AA)
        cv2.putText(current_img, ground_truth_caption, (50, 80),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (20, 240, 10), 1,
                    cv2.LINE_AA)
        cv2.imwrite(output_file, current_img)

        self._plot_attention(current_img, attention_plot, result)
Example #8
    train_dataset = train_dataset.shuffle(CONFIG.BUFFER_SIZE).batch(
        CONFIG.BATCH_SIZE)
    train_dataset = train_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    val_dataset = val_dataset.shuffle(CONFIG.BUFFER_SIZE).batch(
        CONFIG.EVAL_BATCH_SIZE)
    val_dataset = val_dataset.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    # ************ Model ************

    # mirrored_strategy = tf.distribute.MirroredStrategy()
    # with mirrored_strategy.scope():
    encoder = CNN_Encoder(CONFIG.EMBEDDING_SIZE,
                          include_cnn_backbone=CONFIG.INCLUDE_CNN_IN_TRAINING)
    decoder = RNN_Decoder(CONFIG.EMBEDDING_SIZE, CONFIG.UNITS,
                          CONFIG.VOCAB_SIZE)

    # ************ Optimizer ************

    initial_learning_rate = CONFIG.LEARNING_RATE
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=200,
        decay_rate=0.96,
        staircase=True)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule,
                                         beta_1=0.9,
                                         beta_2=0.999,
                                         epsilon=1e-7)
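
    # Hedged illustration (not in the original snippet): with staircase=True the
    # schedule holds the learning rate constant for each block of 200 steps, e.g.
    # assuming CONFIG.LEARNING_RATE == 1e-3:
    #   lr_schedule(0)   -> 0.001        (steps 0-199)
    #   lr_schedule(200) -> 0.00096      (decayed once: 0.001 * 0.96)
    #   lr_schedule(400) -> 0.0009216    (decayed twice: 0.001 * 0.96**2)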
Example #9
class EvaluationHandler:
    def __init__(self, loss_object, tokenizer, checkpoint_path=None):

        print('Setting up Evaluation Handler')

        self.tokenizer = tokenizer
        self.loss_object = loss_object
        self.special_tokens = ['<unk>', '<pad>', '<end>', '<start>']       
        
        self.checkpoint_path = checkpoint_path
        if self.checkpoint_path is not None:
            self.encoder = CNN_Encoder(CONFIG.EMBEDDING_SIZE)
            self.decoder = RNN_Decoder(
                CONFIG.EMBEDDING_SIZE, CONFIG.UNITS, CONFIG.VOCAB_SIZE)
            ckpt = tf.train.Checkpoint(encoder=self.encoder,
                                       decoder=self.decoder)

            ckpt_manager = tf.train.CheckpointManager(
                ckpt, self.checkpoint_path, max_to_keep=5)
            #chosen_checkpoint = ckpt_manager.checkpoints[2]
            chosen_checkpoint = ckpt_manager.latest_checkpoint
            ckpt.restore(chosen_checkpoint)



    @timer
    def evaluate_data(self, validation_dataset, val_steps, encoder=None, decoder=None):

        if self.checkpoint_path is None:
            assert encoder is not None
            assert decoder is not None
            self.encoder = encoder
            self.decoder = decoder

        print('Begin evaluation')
        avg_bleu = np.array([0, 0, 0, 0], dtype=float)
        avg_rouge = 0.0
        for batch_idx, (img_tensor, target) in enumerate(validation_dataset):
            score = self._evaluate_batch(img_tensor, target)
            avg_bleu += np.array(score['BLEU'], dtype=float)/float(val_steps)
            avg_rouge += score['ROUGE']/float(val_steps)

        avg_bleu = avg_bleu.round(2)
        avg_rouge = avg_rouge.round(2)
        avg_scores = {'BLEU':avg_bleu, 'ROUGE': avg_rouge}
        print('The average scores:', avg_scores)
        return avg_scores


    def _evaluate_batch(self, img_tensor, target):
        
        self.loss, self.total_loss, predicted_ids = self._forward_pass(img_tensor, target)
        self.loss = self.loss/(int(target.shape[1]))

        predicted_ids = np.array(predicted_ids).reshape(-1) #TODO: remove 46 hardcoding

        cleaned_target = self._tokens_to_captions(target, self.special_tokens)
        cleaned_predicted_tokens = self._tokens_to_captions(predicted_ids, self.special_tokens)
    
        ground_truth_captions = {f'{k}':v for (k, v) in enumerate(cleaned_target)}
        predicted_captions = {f'{k}':v for (k, v) in enumerate(cleaned_predicted_tokens)}

        score, scores = compute_scores(ground_truth_captions, predicted_captions)
        score = self._clean_coco_scores_output(score)
               
        # for gt, pred in zip(ground_truth_captions.values(), predicted_captions.values()):            
        #     score = self.bleu_score(gt, pred, verbose=False)
        
        return score
    
    @tf.function
    def _forward_pass(self, img_tensor, target):
        """Training step as tf.function to allow for gradient updates in tensorflow.

        Args:
            img_tensor -- this is output of CNN
            target -- caption vectors of dim (units, max_length) where units is num GRUs and max_length is size of caption with most tokens
        """
        loss = 0

        hidden = self.decoder.reset_state(batch_size=target.shape[0])
        dec_input = tf.expand_dims([self.tokenizer.word_index['<start>']] * target.shape[0], 1)

        features = self.encoder(img_tensor)
        result_ids = []
        for i in range(1, target.shape[1]):
            predictions, hidden, _ = self.decoder(dec_input, features, hidden)
            predicted_ids = tf.math.argmax(predictions, axis=1)
            result_ids.append(predicted_ids) 
            loss += loss_function(target[:, i], predictions, self.loss_object)
            dec_input = tf.expand_dims(predicted_ids, 1) # feed the predicted id back in (greedy decoding); no teacher forcing at evaluation time

        total_loss = (loss / int(target.shape[1]))

        return loss, total_loss, result_ids

    def _tokens_to_captions(self, tokens_batch, tokens_to_remove):

        predicted_captions = self.tokenizer.sequences_to_texts(np.array(tokens_batch))

        cleaned_captions_batch =[]
        for caption in predicted_captions:
            # 47 is the max sequence length for this (6000-example) dataset
            clean_caption = caption.split(' ')[:47]
            if '<end>' in clean_caption:
                clean_caption = [item for i, item in enumerate(clean_caption) if '<end>' in clean_caption[i:]]
            clean_caption = [item for item in clean_caption if item not in tokens_to_remove]
            if clean_caption == []:
                clean_caption = [' ']
            clean_caption_str = ' '.join(clean_caption)
            cleaned_captions_batch.append([clean_caption_str])            

        return cleaned_captions_batch


    def _clean_coco_scores_output(self, scores_dict):

        score_names = ['BLEU', 'ROUGE']

        cleaned_scores_dict = {}
        for i, (key, val) in enumerate(scores_dict.items()):
            cleaned_scores_dict[score_names[i]] = val

        return cleaned_scores_dict
            
    def bleu_score(self, predicted, actual, verbose=False):

        b1 = corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))
        b2 = corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))
        b3 = corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))
        b4 = corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))

        if verbose:
            print('BLEU-1: %f' % b1)
            print('BLEU-2: %f' % b2)
            print('BLEU-3: %f' % b3)
            print('BLEU-4: %f' % b4)

        return np.array([round(b1, 5), round(b2, 5), round(b3, 5), round(b4, 5)])
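
# Hedged usage sketch (names and the checkpoint directory below are placeholders,
# not from the original snippet):
#
#     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
#         from_logits=True, reduction='none')
#     eval_handler = EvaluationHandler(loss_object, tokenizer,
#                                      checkpoint_path='./checkpoints/train')
#     avg_scores = eval_handler.evaluate_data(val_dataset, val_steps)
#     # -> {'BLEU': array([b1, b2, b3, b4]), 'ROUGE': rouge_l}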
Example #10
def main(_):
    # Load the weights of the Inception v3 model pre-trained on the ImageNet dataset and,
    # using tf.keras.Model, declare image_features_extract_model, whose output is the
    # hidden layer just before the softmax layer that produces an 8x8x2048 feature map.
    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output

    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    # If the do_caching flag is True, perform bottleneck caching for the images.
    if FLAGS.do_caching == True:
        cache_bottlenecks(img_name_vector, image_features_extract_model)
    else:
        print('Already bottleneck cached !')

    # Build the vocabulary set from the 5,000 most frequent words;
    # words not in the vocabulary set are mapped to <unk>.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=top_k,
        oov_token="<unk>",
        filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
    tokenizer.fit_on_texts(train_captions)
    # Sentences shorter than the longest one have their remaining positions padded with <pad>.
    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'

    # Split each caption sentence on whitespace and tokenize it.
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    # Pad the shorter sentences.
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs,
                                                               padding='post')
    # Store the length of the longest sentence for the attention weights.
    max_length = calc_max_length(train_seqs)

    # Split the data into 80% training and 20% validation.
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(
        img_name_vector, cap_vector, test_size=0.2, random_state=0)

    print('train image size:', len(img_name_train), 'train caption size:',
          len(cap_train))
    print('validation image size:', len(img_name_val),
          'validation caption size:', len(cap_val))

    num_steps = len(img_name_train) // BATCH_SIZE

    # Read the numpy feature files cached on disk.
    def map_func(img_name, cap):
        img_tensor = np.load(img_name.decode('utf-8') + '.npy')
        return img_tensor, cap

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    # Load the numpy files in parallel.
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Use the tf.data API to shuffle the data and group it into batches (batch size = 64).
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    # Declare the encoder and decoder.
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    # Specify the path where checkpoint data will be saved.
    checkpoint_path = "./checkpoints/train"
    ckpt = tf.train.Checkpoint(encoder=encoder,
                               decoder=decoder,
                               optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=5)

    start_epoch = 0
    if ckpt_manager.latest_checkpoint:
        start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
        # Restore the most recent checkpoint from checkpoint_path.
        ckpt.restore(ckpt_manager.latest_checkpoint)

    loss_plot = []

    # Run the optimization for the specified number of epochs.
    for epoch in range(start_epoch + 1, EPOCHS + 1):
        start = time.time()
        total_loss = 0

        for (batch, (img_tensor, target)) in enumerate(dataset):
            batch_loss, t_loss = train_step(img_tensor, target, tokenizer,
                                            encoder, decoder)
            total_loss += t_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch, batch,
                    batch_loss.numpy() / int(target.shape[1])))
        # Store the per-epoch loss values to plot later.
        loss_plot.append(total_loss / num_steps)

        # Save the parameter values every 5 epochs.
        if epoch % 5 == 0:
            ckpt_manager.save(checkpoint_number=epoch)

        print('Epoch {} Loss {:.6f}'.format(epoch, total_loss / num_steps))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    print('Training Finished !')
    plt.plot(loss_plot)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Plot')
    plt.savefig('Loss plot.png')
    plt.show()

    # Randomly pick one image from the validation set and run captioning on it.
    rid = np.random.randint(0, len(img_name_val))
    image = img_name_val[rid]
    real_caption = ' '.join(
        [tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])
    result, attention_plot = evaluate(image, max_length,
                                      attention_features_shape, encoder,
                                      decoder, image_features_extract_model,
                                      tokenizer)

    print('Real Caption:', real_caption)
    print('Prediction Caption:', ' '.join(result))
    plot_attention(image, result, attention_plot)

    # For testing, download a surfer image and run captioning on it.
    image_url = 'https://tensorflow.org/images/surf.jpg'
    image_extension = image_url[-4:]
    image_path = tf.keras.utils.get_file('image' + image_extension,
                                         origin=image_url)

    result, attention_plot = evaluate(image_path, max_length,
                                      attention_features_shape, encoder,
                                      decoder, image_features_extract_model,
                                      tokenizer)
    print('Prediction Caption:', ' '.join(result))
    plot_attention(image_path, result, attention_plot)
Example #11
def train(hparams, models_path='./'):
    """Train the encoder-decoder captioning model with the given hyperparameters.

    Args:
        hparams: dict
            nested hyperparameter dict with 'encoder', 'decoder', 'optimizer'
            and 'train' sections
        models_path: str or Path
            directory in which the trained encoder/decoder weights are saved

    Returns:
        results: dict
            dictionary containing the model identifier, elapsed time per epoch,
            and the learning curve with loss and metrics
        models: tuple of keras Models
            the trained encoder and decoder networks
    """

    model_id = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    encoder = CNN_Encoder(**hparams['encoder'])
    decoder = RNN_Decoder(**hparams['decoder'], vocab_size=vocab_size)

    optimizer = make_optimizer(**hparams['optimizer'])

    lambda_reg = hparams['train']['lambda_reg']

    # ckpt = tf.train.Checkpoint(encoder=encoder,
    #                            decoder=decoder,
    #                            optimizer = optimizer)
    # ckpt_manager = tf.train.CheckpointManager(ckpt, CHECKPOINT_PATH, max_to_keep=5)


    start_epoch = 0
    # if ckpt_manager.latest_checkpoint:
    #   start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    #   # restoring the latest checkpoint in checkpoint_path
    #   ckpt.restore(ckpt_manager.latest_checkpoint)

    @tf.function
    def train_step(img_tensor, target):
        loss = 0
        losses = {}

        batch_size, caption_length = target.shape

        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size = batch_size)

        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)
        # attention_plot = tf.Variable(tf.zeros((batch_size,
        #                                      caption_length,
        #                                      attention_features_shape)))


        with tf.GradientTape() as tape:

            features = encoder(img_tensor, training = True)
            attention_sum = tf.zeros((batch_size, attention_features_shape, 1))

            for i in range(1, caption_length):
                # passing the features through the decoder
                predictions, hidden, attention_weights = decoder((dec_input, features, hidden), training = True)
                attention_sum += attention_weights

                # accumulate the cross-entropy loss at every time step
                loss += loss_function(target[:, i], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

            losses['cross_entropy'] = loss/caption_length

            # attention regularization loss
            loss_attn_reg = lambda_reg * tf.reduce_sum((1 - attention_sum)**2)
            losses['attention_reg'] = loss_attn_reg/caption_length
            loss += loss_attn_reg

            # Weight decay losses
            loss_weight_decay = tf.add_n(encoder.losses) + tf.add_n(decoder.losses)
            losses['weight_decay'] = loss_weight_decay/caption_length
            loss += loss_weight_decay



        losses['total'] = loss/ caption_length

        trainable_variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, trainable_variables)

        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return loss, losses

    num_steps = num_examples // BATCH_SIZE

    loss_plots = {'cross_entropy':[], 'attention_reg':[], 'weight_decay':[],
                  'total':[]}
    metrics = {'cross-entropy':[], 'bleu-1':[],'bleu-2':[],'bleu-3':[],
               'bleu-4':[], 'meteor':[]}
    epoch_times = []
    val_epoch_times = []

    start = time.time()
    logging.info('Training start for model ' + model_id)
    logging.info('hparams: ' + str(hparams))
    for epoch in range(start_epoch, EPOCHS):
        epoch_start = time.time()
        total_loss = {'cross_entropy':0, 'attention_reg':0, 'weight_decay':0,
                      'total':0}

        for (batch, (img_tensor, target)) in enumerate(dataset_train):
            batch_loss, t_loss = train_step(img_tensor, target)
            for key in total_loss.keys():
                total_loss[key] += float(t_loss[key])

            if batch % 100 == 0:
                logging.info('Epoch {} Batch {} Loss {:.4f}'.format(
                  epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))

        # storing the epoch end loss value to plot later
        for key in loss_plots.keys():
            loss_plots[key].append(total_loss[key] / num_steps)


        # Evaluate on validation
        val_epoch_start = time.time()
        epoch_scores = validation_scores(dataset_val, (encoder, decoder), tokenizer)
        val_epoch_stop = time.time() - val_epoch_start
        val_epoch_times.append(val_epoch_stop)

        for name, score in epoch_scores.items():
            metrics[name].append(score)

        epoch_stop = time.time() - epoch_start
        epoch_times.append(epoch_stop)

        # if epoch % 1 == 0:
        #   ckpt_manager.save()

        logging.info('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                             total_loss['total']/num_steps))


        logging.info('Time taken for 1 epoch {} sec\n'.format(epoch_stop))

    total_time = time.time() - start
    logging.info('Total training time: {}'.format(total_time))

    results = { 'id':model_id,
                'losses':loss_plots,
                'epoch_times':epoch_times,
                'total_time':total_time,
                'encoder_params': encoder.count_params(),
                'decoder_params': decoder.count_params(),
                'instances_train': num_examples,
                'instances_valid': num_examples_val,
                'batch_size': BATCH_SIZE,
                'epochs': EPOCHS,
                'vocabulary': vocab_size,
                'valid_batch_size': VALID_BATCH_SIZE,
                'valid_epoch_times':val_epoch_times,
                'metrics_val': metrics}

    encoder.save_weights(str(models_path) + ('encoder_' + model_id + '.h5'))
    decoder.save_weights(str(models_path) + ('decoder_' + model_id + '.h5'))
    models = (encoder, decoder)

    return results, models
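
# Hedged usage sketch: one possible hparams layout matching the keys consumed by
# train() above (the inner keyword names for CNN_Encoder, RNN_Decoder and
# make_optimizer are assumptions -- they depend on those definitions):
#
#     hparams = {
#         'encoder':   {'embedding_dim': 256},
#         'decoder':   {'embedding_dim': 256, 'units': 512},
#         'optimizer': {'name': 'adam', 'learning_rate': 1e-3},
#         'train':     {'lambda_reg': 1.0},
#     }
#     results, (encoder, decoder) = train(hparams, models_path='./')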
Example #12
def main():

    # The sample size, drawn from the total of 414,803 examples, is defined here
    total_size = 400000
    img_name_vector, train_captions = preprocess(total_size)

    # The CNN encoder model is initialised here
    image_features_extract_model = image_features_model()

    # This function feeds all the images through the CNN model in batches of 16
    # and stores the activation output of the convolution layer directly to disk.
    # Run it only once to create the cache; keep it commented out the rest of the time.
    # enc_len = batch_feature_processing(img_name_vector)
    #print("Encoded length", enc_len, "len", len(img_name_vector))

    # This function takes in the captions, preprocesses them and
    # returns sequences of numbers, each representing a sequence of words
    # from the vocabulary
    cap_vector, tokenizer, max_length = proc_caption(train_captions)

    # Create training and validation sets using an 80-20 split
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(
        img_name_vector, cap_vector, test_size=0.2, random_state=0)

    print(len(img_name_train), len(cap_train), len(img_name_val), len(cap_val),
          "\n")

    # Training parameters according to the system's configuration
    top_k = 10000
    BATCH_SIZE = 64
    BUFFER_SIZE = 1000
    embedding_dim = 256
    units = 512

    # vocabulary of words
    vocab_size = top_k + 1
    num_steps = len(img_name_train) // BATCH_SIZE
    # Shape of the vector extracted from InceptionV3 is (64, 2048)
    # These two variables represent that vector shape
    features_shape = 2048
    attention_features_shape = 64

    loss_plot = []

    # Tensorflow data pipeline, similar to the documented example
    # Taking tensor slices from both the image name list
    # and corresponding caption
    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

    #Use map to load the numpy files in parallel
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Shuffle and batch
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    #MODEL STARTS HERE
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    # OPTIMIZER AND LOSS
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    # Masked loss function following the paper (padding tokens are ignored)
    def loss_function(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        return tf.reduce_mean(loss_)
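
    # Worked example of the masking above (illustrative values, not from the original
    # code): for real = [5, 0] (the second position is <pad>) and per-token losses
    # loss_ = [2.0, 3.0], the mask is [1.0, 0.0], so loss_ * mask = [2.0, 0.0] and
    # tf.reduce_mean returns 1.0 -- padded positions contribute nothing to the gradient,
    # although the mean is still taken over all positions, padded ones included.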

    #CHECKPOINTS
    checkpoint_path = "./checkpoints/train400000"
    ckpt = tf.train.Checkpoint(encoder=encoder,
                               decoder=decoder,
                               optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=10)

    start_epoch = 0
    if ckpt_manager.latest_checkpoint:
        start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
        # restoring the latest checkpoint in checkpoint_path
        print("LATEST CHECKPOINT:", ckpt_manager.latest_checkpoint)
        ckpt.restore(ckpt_manager.latest_checkpoint)

    @tf.function  # this declaration is important, without this you will see errors
    def train_step(img_tensor, target):
        loss = 0

        # initializing the hidden state for each batch
        # because the captions are not related from image to image
        hidden = decoder.reset_state(batch_size=target.shape[0])

        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] *
                                   target.shape[0], 1)

        # Newer TensorFlow versions use GradientTape to perform backpropagation
        with tf.GradientTape() as tape:
            # final image features are generated
            features = encoder(img_tensor)

            # target is the sequence of caption word numbers, iterating through its length
            for i in range(1, target.shape[1]):
                # passing the features through the decoder
                # no need to have the attention weights here
                predictions, hidden, _ = decoder(dec_input, features, hidden)

                # cumulating the loss from every time step
                loss += loss_function(target[:, i], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(target[:, i], 1)

        total_loss = (loss / int(target.shape[1]))
        # taking all the trainable parameters
        trainable_variables = encoder.trainable_variables + decoder.trainable_variables

        # The gradients are calculated in this step and updated
        gradients = tape.gradient(loss, trainable_variables)

        optimizer.apply_gradients(zip(gradients, trainable_variables))

        return loss, total_loss

    # The total no of epochs
    EPOCHS = 30

    for epoch in range(start_epoch, EPOCHS):
        # Each epoch is recorded in time
        start = time.time()
        total_loss = 0

        # using the prebuilt tensorflow data pipeline dataset and iterating with each batch
        for (batch, (img_tensor, target)) in enumerate(dataset):
            # calculating each batch loss
            batch_loss, t_loss = train_step(img_tensor, target)
            total_loss += t_loss

            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss {:.4f}'.format(
                    epoch + 1, batch,
                    batch_loss.numpy() / int(target.shape[1])))
        # storing the epoch end loss value to plot later
        loss_plot.append(total_loss / num_steps)

        # saving the model checkpoints
        if epoch % 5 == 0:
            ckpt_manager.save()

        print('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss / num_steps))
        print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    plt.plot(loss_plot)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Plot')
    plt.savefig('training100000.jpg')
    plt.show()
Example #13
def main():

    #Training parameters
    total_size = 100000
    top_k = 5000
    BATCH_SIZE = 64
    BUFFER_SIZE = 1000
    embedding_dim = 256
    units = 512
    vocab_size = top_k + 1
    features_shape = 512
    attention_features_shape = 49

    # loading the training caption sequences and image name vectors
    train_captions, img_name_vector = np.load('traincaption_imgname.npy')
    # taking a subset of these and converting to list
    img_name_vector = img_name_vector[:total_size].tolist()
    train_captions = train_captions[:total_size].tolist()
    #Enc_len = batch_feature_processing(img_name_vector)

    # creating an instance of the CNN model
    image_features_extract_model = image_features_model()

    max_length = 51
    # processing the captions
    cap_vector, tokenizer, max_length = proc_caption(train_captions)

    # Create training and validation sets using an 80-20 split
    img_name_train, img_name_val, cap_train, cap_val = train_test_split(
        img_name_vector, cap_vector, test_size=0.2, random_state=0)

    #MODEL STARTS HERE
    encoder = CNN_Encoder(embedding_dim)
    decoder = RNN_Decoder(embedding_dim, units, vocab_size)

    # OPTIMIZER AND LOSS
    optimizer = tf.keras.optimizers.Adam()
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')

    #CHECKPOINTS
    checkpoint_path = "\checkpoints\train100000"
    ckpt = tf.train.Checkpoint(encoder=encoder,
                               decoder=decoder,
                               optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              checkpoint_path,
                                              max_to_keep=5)

    start_epoch = 0
    if ckpt_manager.latest_checkpoint:
        start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
        # restoring the latest checkpoint in checkpoint_path
        ckpt.restore(ckpt_manager.latest_checkpoint)

    #EVALUATION
    def evaluate(image):
        attention_plot = np.zeros((max_length, attention_features_shape))

        # resetting the hidden state of decoder
        hidden = decoder.reset_state(batch_size=1)

        temp_input = tf.expand_dims(load_image(image)[0], 0)
        # extract the image features
        img_tensor_val = image_features_extract_model(temp_input)
        img_tensor_val = tf.reshape(
            img_tensor_val,
            (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

        # passing the image features through an FC and ReLU layer
        features = encoder(img_tensor_val)

        dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
        result = []

        # running loop for max length
        for i in range(max_length):
            # The decoder takes the image features, hidden state and initial input
            predictions, hidden, attention_weights = decoder(
                dec_input, features, hidden)

            # The attention weights are stored every time step to show the change in attention
            attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

            # The predicted ids are important as they are the keys whose values are words,
            # i.e. the model generates a number which corresponds to a word in the vocabulary
            predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
            print("predicted id:", predicted_id)
            # adding all the words together to make the caption
            result.append(tokenizer.index_word[predicted_id])

            # The loop ends when the model generates the end token
            if tokenizer.index_word[predicted_id] == '<end>':
                return result, attention_plot

            # reinitialising the decoder input
            dec_input = tf.expand_dims([predicted_id], 0)

        attention_plot = attention_plot[:len(result), :]
        return result, attention_plot

    # captions on the validation set
    #ACTUAL EVALUATION
    print("len of image name val", len(img_name_val))
    # Taking a random number in the test dataset
    rid = np.random.randint(0, len(img_name_val))

    # taking the corresponding image from validation set
    image = img_name_val[rid]
    imageid = image[-10:-4]
    imageid = int(imageid)
    ref = ref_create(imageid)
    references = []
    for i in ref:
        l = i.split()
        references.append(l)

    real_caption = ' '.join(
        [tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])

    # testing from some random image
    #image = '/home/cis/Documents/Vijay/guy.jpeg'

    # BLEU score evaluation using NLTK library
    print("Reference Sentences:", references, "\n\n")
    print("Current real caption", real_caption, "\n\n")
    result, attention_plot = evaluate(image)
    print('Prediction Caption:', ' '.join(result), "\n\n")
    print('Cumulative 1-gram BLEU-1: %f' %
          sentence_bleu(references, result, weights=(1, 0, 0, 0)))
    print('Cumulative 2-gram BLEU-2: %f' %
          sentence_bleu(references, result, weights=(0.5, 0.5, 0, 0)))
    print('Cumulative 3-gram BLEU-3: %f' %
          sentence_bleu(references, result, weights=(0.33, 0.33, 0.33, 0)))
    print(
        'Cumulative 4-gram BLEU-4: %f' %
        sentence_bleu(references, result, weights=(0.25, 0.25, 0.25, 0.25)),
        "\n\n")

    # PLotting the attention weights along with words
    plot_attention(image, result, attention_plot)