# increase mini batch counter
        mini_batch_count += 1

        # determine if GPU training is enabled: place the mini batch on the
        # GPU only when cuDNN is present AND the user opted in via USE_CUDA.
        # Idiom fix: `is not None` / plain truthiness replace the
        # non-Pythonic `!= None` and `== True` comparisons.
        if (torch.backends.cudnn.version() is not None) and USE_CUDA:

            # convert mini batch to a CUDA float tensor
            mini_batch_torch = torch.cuda.FloatTensor(mini_batch_data)

        else:

            # convert mini batch to a CPU float tensor
            mini_batch_torch = torch.FloatTensor(mini_batch_data)

        # reset the gradients of all three networks before this step's
        # forward/backward pass
        encoder_train.zero_grad()
        decoder_train.zero_grad()
        discriminator_train.zero_grad()

        # =================== reconstruction phase =====================

        # encode the mini batch into the latent space, then decode it back
        # to obtain the reconstruction
        z_sample = encoder_train(mini_batch_torch)
        mini_batch_reconstruction = decoder_train(z_sample)

        # split the input data into its categorical and numerical parts:
        # the first `num_categ_cols` columns hold the (transformed)
        # categorical attributes, the remaining columns are numerical
        num_categ_cols = ori_dataset_categ_transformed.shape[1]
        batch_cat = mini_batch_torch[:, :num_categ_cols]
        batch_num = mini_batch_torch[:, num_categ_cols:]
        tic = time.time()

        # hoist the loop-invariant device check out of the loop — CUDA
        # availability cannot change mid-epoch
        cuda_available = torch.cuda.is_available()

        for i in range(num_captions):

            # fetch the image belonging to the i-th shuffled caption
            image_id = shuffled_images[i]
            image = dataloader.get_image(image_id)
            image = image.unsqueeze(0)  # add the batch dimension

            # move the sample to the GPU when available
            # NOTE(review): `Variable` is deprecated in modern PyTorch
            # (tensors carry autograd directly); kept for compatibility
            # with the rest of this codebase
            if cuda_available:
                image = Variable(image).cuda()
                caption = torch.cuda.LongTensor(shuffled_captions[i])
            else:
                image = Variable(image)
                caption = torch.LongTensor(shuffled_captions[i])

            # decoder input drops the final token (teacher forcing); the
            # loss below compares against the full target caption
            caption_train = caption[:-1]

            # reset gradients accumulated by the previous sample
            encoder.zero_grad()
            decoder.zero_grad()

            # forward pass: encode the image, decode a caption prediction
            encod_out = encoder(image)
            decoder_output = decoder(encod_out, caption_train)

            # compute the loss and backpropagate
            loss = criterion(decoder_output, caption)
            loss.backward()

            optimizer.step()

            # BUGFIX: detach before storing — appending the live `loss`
            # tensor retains the entire autograd graph of every iteration,
            # causing unbounded memory growth over the epoch; the scalar
            # value folded into `torch.Tensor(...)` below is unchanged
            loss_list.append(loss.detach())

        toc = time.time()

        # average training loss over all per-sample losses collected so far
        avg_loss = torch.mean(torch.Tensor(loss_list))