def _map_fn(filename, annotation):
    ## read image
    image = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    ## data augmentation for image only  0.02s
    image = tf.image.random_brightness(image, max_delta=63)
    image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
    # subtract off the mean and divide by the variance of the pixels. (optional)
    # img = tf.image.per_image_standardization(img)
    ## data augmentation for image and bounding box
    image, annotation = tf.numpy_function(_data_aug_fn, [image, annotation], [tf.float32, tf.string])
    return image, annotation
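
The NumPy-side `_data_aug_fn` is defined elsewhere in this project; below is a minimal sketch of the contract it has to satisfy (hypothetical body: the real function would also transform the annotation to match the augmented image). tf.numpy_function hands it plain NumPy/bytes values, and the return values must match the declared Tout dtypes (tf.float32, tf.string).

import numpy as np

def _data_aug_fn(image, annotation):
    # image arrives as a float32 np.ndarray, annotation as a bytes object.
    # Hypothetical augmentation: random horizontal flip of the image only;
    # the real function would also adjust the serialized annotation.
    if np.random.rand() < 0.5:
        image = image[:, ::-1, :]
    # Return values must match Tout: a float32 array and a bytes/string value.
    return image.astype(np.float32), annotation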
Example No. 2
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64


# Load the numpy files
def map_func(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8') + '.npy')
    return img_tensor, cap


dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
    map_func, [item1, item2], [tf.float32, tf.int32]),
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
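
Note that tf.numpy_function erases static shape information, so later layers may see unknown shapes. A common follow-up map restores them; this is a sketch, assuming the (64, 2048) InceptionV3 feature shape noted above and a caption padded to a fixed length elsewhere in the pipeline.

def set_shapes(img_tensor, cap):
    # Shapes are assumptions: (attention_features_shape, features_shape) = (64, 2048)
    # for the cached features, and a 1-D padded caption of unknown length.
    img_tensor.set_shape([attention_features_shape, features_shape])
    cap.set_shape([None])
    return img_tensor, cap

# dataset = dataset.map(set_shapes)  # apply before shuffle/batch if static shapes are needed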


class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
Example No. 3
 def _norm_mean_std_tf(self, x, mean, std):
     x = tf.numpy_function(self._norm_mean_std, [x, mean, std], tf.float32)
     return x
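
For reference, the wrapped helper is presumably a plain NumPy standardization; a minimal sketch of the method (shown without its class, float32 cast matching the tf.float32 Tout above):

import numpy as np

def _norm_mean_std(self, x, mean, std):
    # Plain NumPy standardization, run outside the graph.
    return ((x - mean) / std).astype(np.float32)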
Example No. 4
 def aug_process(self, image, label):
     """Creates tensorflow function with related augmentation functions"""
     aug_img = tf.numpy_function(func=self.aug_func, inp=[image], Tout=tf.float32)
     return aug_img, label
Example No. 5
    def create_dataset(self):
        if self.ques_type in ['c4', 'overall']:
            ques_id_ds = tf.data.Dataset.from_tensor_slices(
                self.df['Question_Id'])

            image_path_ds = tf.data.Dataset.from_tensor_slices(
                self.df['image'])
            image_ds = image_path_ds.map(
                self.load_and_preprocess_image,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

            # create question feature dataset
            ques_path_ds = tf.data.Dataset.from_tensor_slices(
                self.df['question'])
            ques_ds = ques_path_ds.map(lambda x: tf.numpy_function(
                DataLoader.load_question_features, inp=[x], Tout=tf.float32),
                                       num_parallel_calls=tf.data.experimental.
                                       AUTOTUNE)

            answers = self.df['Answers'].map(
                lambda x: DataLoader.process_answer(x))
            # use tokenizer for string and one-hot mapping, counting vocab, max length
            tokenizer = tf.keras.preprocessing.text.Tokenizer(
                filters="", oov_token="<unk>", lower=True)
            tokenizer.fit_on_texts(answers)
            answers = tokenizer.texts_to_sequences(answers)
            # use 0 as padding
            tokenizer.word_index['<pad>'] = 0
            tokenizer.index_word[0] = '<pad>'
            answers = tf.keras.preprocessing.sequence.pad_sequences(
                answers, padding='post')
            ans_ds = tf.data.Dataset.from_tensor_slices(answers)

            return tf.data.Dataset.zip(
                ((image_ds, ques_ds), ans_ds, ques_id_ds)), tokenizer

        else:
            ques_id_ds = tf.data.Dataset.from_tensor_slices(
                self.df['Question_Id'])

            image_path_ds = tf.data.Dataset.from_tensor_slices(
                self.df['image'])
            image_ds = image_path_ds.map(
                self.load_and_preprocess_image,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

            # create question feature dataset
            ques_path_ds = tf.data.Dataset.from_tensor_slices(
                self.df['question'])
            ques_ds = ques_path_ds.map(lambda x: tf.numpy_function(
                DataLoader.load_question_features, inp=[x], Tout=tf.float32),
                                       num_parallel_calls=tf.data.experimental.
                                       AUTOTUNE)

            vocab = self.df['Answers'].unique()
            tokenizer = OnehotManager(vocab)
            answers = tf.data.Dataset.from_tensor_slices(self.df['Answers'])
            ans_ds = answers.map(
                lambda x: tf.numpy_function(
                    tokenizer.get_index, inp=[x], Tout=tf.int32),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

            return tf.data.Dataset.zip(
                ((image_ds, ques_ds), ans_ds, ques_id_ds)), tokenizer
Example No. 6
def eval_as_np(fn, y_true, y_pred):
    return tf.numpy_function(fn, [y_true, tf.round(y_pred)], tf.double)
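
This wrapper style is handy for reusing NumPy/scikit-learn metrics inside Keras; a minimal sketch, assuming scikit-learn is installed (tf.double matches the float64 scalar that f1_score returns):

import tensorflow as tf
from sklearn.metrics import f1_score

def f1_metric(y_true, y_pred):
    # Round probabilities to hard labels before leaving the graph, as in eval_as_np above.
    return tf.numpy_function(
        lambda t, p: f1_score(t.ravel().astype("int64"),
                              p.ravel().astype("int64"), average="micro"),
        [y_true, tf.round(y_pred)], tf.double)

# model.compile(optimizer="adam", loss="binary_crossentropy", metrics=[f1_metric])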
Example No. 7
    def initialize(self, inputs, input_lengths, embed_targets, mel_targets=None,
                   stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
                   global_step=None, is_training=False, is_evaluating=False, split_infos=None):
        """
        Initializes the model for inference sets "mel_outputs" and "alignments" fields.
        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the
            lengths of each sequence in inputs.
            - embed_targets: float32 Tensor with shape [N, E] where E is the speaker
            embedding size.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size,
            T_out is number of steps in the output time series, M is num_mels, and values are
            entries in the mel spectrogram. Only needed for training.
        """
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError("no multi targets were provided but token_targets were given")
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError("Mel targets are provided without corresponding token_targets")
        if not gta and self._hparams.predict_linear and linear_targets is None and is_training:
            raise ValueError(
                "Model is set to use post processing to predict linear spectrograms in training "
                "but no linear targets given!")
        if gta and linear_targets is not None:
            raise ValueError("Linear spectrogram prediction is not supported in GTA mode!")
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError(
                "Model set to mask paddings but no targets lengths provided for the mask!")
        if is_training and is_evaluating:
            raise RuntimeError(
                "Model can not be in training and evaluation modes at the same time!")

        split_device = ("/cpu:0" if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu
                        else "/gpu:{}".format(self._hparams.tacotron_gpu_start_idx))
        with tf.device(split_device):
            hp = self._hparams
            lout_int = [tf.int32] * hp.tacotron_num_gpus
            lout_float = [tf.float32] * hp.tacotron_num_gpus

            tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus,
                                           axis=0)
            tower_targets_lengths = (
                tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
                if targets_lengths is not None else targets_lengths)

            ### SV2TTS ###

            tower_embed_targets = tf.split(embed_targets, num_or_size_splits=hp.tacotron_num_gpus,
                                           axis=0)

            ##############

            p_inputs = tf.numpy_function(split_func, [inputs, split_infos[:, 0]], lout_int)
            p_mel_targets = (tf.numpy_function(split_func, [mel_targets, split_infos[:, 1]], lout_float)
                             if mel_targets is not None else mel_targets)
            p_stop_token_targets = (tf.numpy_function(split_func, [stop_token_targets, split_infos[:, 2]], lout_float)
                                    if stop_token_targets is not None else stop_token_targets)

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            for i in range(hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(
                        tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(
                        tf.reshape(p_stop_token_targets[i], [batch_size, -1]))

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_cond_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in
                range(hp.tacotron_gpu_start_idx, hp.tacotron_gpu_start_idx + hp.tacotron_num_gpus)]
        for i in range(hp.tacotron_num_gpus):
            with tf.device(tf.compat.v1.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0",
                                                                    worker_device=gpus[i])):
                with tf.compat.v1.variable_scope("inference") as scope:
                    assert hp.tacotron_teacher_forcing_mode in ("constant", "scheduled")
                    if hp.tacotron_teacher_forcing_mode == "scheduled" and is_training:
                        assert global_step is not None

                    # GTA is only used for predicting mels to train Wavenet vocoder, so we omit
                    # post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.compat.v1.get_variable(
                        "inputs_embedding", [len(symbols), hp.embedding_dim], dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])

                    # Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training, hparams=hp, scope="encoder_convolutions"),
                        EncoderRNN(is_training, size=hp.encoder_lstm_units,
                                   zoneout=hp.tacotron_zoneout_rate, scope="encoder_LSTM"))

                    encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

                    # For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape


                    ### SV2TTS ###

                    # Append the speaker embedding to the encoder output at each timestep
                    tileable_shape = [-1, 1, self._hparams.speaker_embedding_size]
                    tileable_embed_targets = tf.reshape(tower_embed_targets[i], tileable_shape)
                    tiled_embed_targets = tf.tile(tileable_embed_targets,
                                                       [1, tf.shape(encoder_outputs)[1], 1])
                    encoder_cond_outputs = tf.concat((encoder_outputs, tiled_embed_targets), 2)

                    ##############


                    # Decoder Parts
                    # Attention Decoder Prenet
                    prenet = Prenet(is_training, layers_sizes=hp.prenet_layers,
                                    drop_rate=hp.tacotron_dropout_rate, scope="decoder_prenet")
                    # Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(hp.attention_dim,
                                                                     encoder_cond_outputs,
                                                                     hparams=hp,
                                                                     mask_encoder=hp.mask_encoder,
                                                                     memory_sequence_length=tf.reshape(
                                                                         tower_input_lengths[i],
                                                                         [-1]),
                                                                     smoothing=hp.smoothing,
                                                                     cumulate_weights=hp.cumulative_weights)
                    # Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                                              size=hp.decoder_lstm_units,
                                              zoneout=hp.tacotron_zoneout_rate,
                                              scope="decoder_LSTM")
                    # Frames Projection layer
                    frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step,
                                                       scope="linear_transform_projection")
                    # <stop_token> projection layer
                    stop_projection = StopProjection(is_training or is_evaluating,
                                                     shape=hp.outputs_per_step,
                                                     scope="stop_token_projection")

                    # Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(
                        prenet,
                        attention_mechanism,
                        decoder_lstm,
                        frame_projection,
                        stop_projection)

                    # Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta,
                                                         is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    # initial decoder state
                    decoder_init_state = decoder_cell.zero_state(batch_size=batch_size,
                                                                 dtype=tf.float32)

                    # Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                    # Decode
                    (frames_prediction, stop_token_prediction,
                     _), final_decoder_state, _ = dynamic_decode(
                        CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                        impute_finished=False,
                        maximum_iterations=max_iters,
                        swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    # ==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

                    # Postnet
                    postnet = Postnet(is_training, hparams=hp, scope="postnet_convolutions")

                    # Compute residual using post-net ==> [batch_size, decoder_steps * r,
                    # postnet_channels]
                    residual = postnet(decoder_output)

                    # Project residual to same dimension as mel spectrogram
                    # ==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(hp.num_mels, scope="postnet_projection")
                    projected_residual = residual_projection(residual)

                    # Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features
                        # from mels before projection to linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size,
                                         [hp.cbhg_projection, hp.num_mels],
                                         hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
                                         hp.cbhg_highway_units, hp.cbhg_rnn_units, is_training,
                                         name="CBHG_postnet")

                        # [batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        # Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(hp.num_freq,
                                                                  scope="cbhg_linear_specs_projection")

                        # [batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                    # Grab alignments from the final decoder state
                    alignments = tf.transpose(final_decoder_state.alignment_history.stack(),
                                              [1, 2, 0])

                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_cond_outputs.append(encoder_cond_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log("initialisation done {}".format(gpus[i]))

        if is_training:
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        # self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.compat.v1.trainable_variables()

        log("Initialized Tacotron model. Dimensions (? = dynamic shape): ")
        log("  Train mode:               {}".format(is_training))
        log("  Eval mode:                {}".format(is_evaluating))
        log("  GTA mode:                 {}".format(gta))
        log("  Synthesis mode:           {}".format(not (is_training or is_evaluating)))
        log("  Input:                    {}".format(inputs.shape))
        for i in range(hp.tacotron_num_gpus + hp.tacotron_gpu_start_idx):
            log("  device:                   {}".format(i))
            log("  embedding:                {}".format(tower_embedded_inputs[i].shape))
            log("  enc conv out:             {}".format(tower_enc_conv_output_shape[i]))
            log("  encoder out (cond):       {}".format(tower_encoder_cond_outputs[i].shape))
            log("  decoder out:              {}".format(self.tower_decoder_output[i].shape))
            log("  residual out:             {}".format(tower_residual[i].shape))
            log("  projected residual out:   {}".format(tower_projected_residual[i].shape))
            log("  mel out:                  {}".format(self.tower_mel_outputs[i].shape))
            if post_condition:
                log("  linear out:               {}".format(self.tower_linear_outputs[i].shape))
            log("  <stop_token> out:         {}".format(self.tower_stop_token_prediction[i].shape))

            # 1_000_000 is causing syntax problems for some people?! Python please :)
            log("  Tacotron Parameters       {:.3f} Million.".format(
                np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
Example No. 8
 def trimBlackMarginsTF(imageTensor):
     withoutMargins = tf.numpy_function(TrimBlackPaddings, [imageTensor],
                                        Tout=tf.uint8)
     return withoutMargins
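
TrimBlackPaddings comes from the surrounding project; below is a minimal NumPy sketch of the idea, assuming "black margins" means border rows/columns that are entirely zero.

import numpy as np

def TrimBlackPaddings(image):
    # image: (H, W, C) uint8 array. Keep the bounding box of non-zero pixels.
    mask = image.max(axis=-1) > 0
    rows = np.where(mask.any(axis=1))[0]
    cols = np.where(mask.any(axis=0))[0]
    if rows.size == 0 or cols.size == 0:
        return image  # fully black image: nothing to trim
    cropped = image[rows[0]:rows[-1] + 1, cols[0]:cols[-1] + 1]
    return cropped.astype(np.uint8)  # must match Tout=tf.uint8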
Example No. 9
    def __init__(self, batch_size=48, train_model='mobilenetv2'):
        _, self.train_filenames, self.train_captions = load_records()
        _, self.val_filenames, self.val_captions = load_records(False)
        del _

        self.batch_size = batch_size

        if train_model == 'mobilenetv2':
            load_fn = self.load_image_mobilenet

        self.transfer_train_dataset = tf.data.Dataset.from_tensor_slices(
            list(self.train_filenames))
        self.transfer_train_dataset = self.transfer_train_dataset.map(
            load_fn,
            num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)

        self.transfer_val_dataset = tf.data.Dataset.from_tensor_slices(
            list(self.val_filenames))
        self.transfer_val_dataset = self.transfer_val_dataset.map(
            load_fn,
            num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)

        # Create a Tokenizer
        self.tokenizer = TokenizerWrapper(self.get_texts())

        # Create a list of filenames each mapped with its corresponding caption
        _train_filenames, _train_captions = [], []
        for i, captions in enumerate(self.train_captions, start=0):
            # get the path of train transfer features
            train_path = os.path.join(
                PATHS.TRAIN_TRANSFER_DIR,
                os.path.basename(self.train_filenames[i])) + '.npy'
            for cap in captions:
                _train_filenames.append(train_path)
                _train_captions.append(cap)
        _train_captions = self.tokenizer.texts_to_sequences(_train_captions)
        _train_captions = tf.keras.preprocessing.sequence.pad_sequences(
            _train_captions, padding='post')
        max_len = max([len(cap) for cap in _train_captions])
        self.train_dataset = tf.data.Dataset.from_tensor_slices((_train_filenames, _train_captions)).map(
            lambda item1, item2: tf.numpy_function(self.map_func, [item1, item2], [tf.float32, tf.int32]),
            num_parallel_calls=tf.data.experimental.AUTOTUNE) \
            .shuffle(1000, reshuffle_each_iteration=True) \
            .batch(self.batch_size) \
            .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        # Free the memory
        del _train_filenames
        del _train_captions

        _val_filenames, _val_captions = [], []
        for i, captions in enumerate(self.val_captions, start=0):
            # get the path of train transfer features
            val_path = os.path.join(PATHS.VAL_TRANSFER_DIR,
                                    os.path.basename(
                                        self.val_filenames[i])) + '.npy'
            for cap in captions:
                _val_filenames.append(val_path)
                _val_captions.append(cap)
        _val_captions = self.tokenizer.texts_to_sequences(_val_captions)
        _val_captions = tf.keras.preprocessing.sequence.pad_sequences(
            _val_captions, padding='post')
        max_len = max([len(cap) for cap in _val_captions])

        self.val_dataset = tf.data.Dataset.from_tensor_slices((_val_filenames, _val_captions)) \
            .map(lambda item1, item2: tf.numpy_function(self.map_func, [item1, item2], [tf.float32, tf.int32]),
                 num_parallel_calls=tf.data.experimental.AUTOTUNE) \
            .shuffle(1000, reshuffle_each_iteration=True) \
            .batch(self.batch_size) \
            .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        # Free the memory
        del _val_filenames
        del _val_captions
Example No. 10
def loadImagePackTF(pathTensor):
    y = tf.numpy_function(loadImagePackNp, [pathTensor], (tf.uint8))
    return y
Example No. 11
def my_iou_metric(label, pred):
    # Tensorflow version
    return tf.numpy_function(get_iou_vector, [label, pred > 0.5], tf.float64)
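
get_iou_vector is project code (a Kaggle-style segmentation metric); a simplified NumPy sketch that returns a float64 scalar, matching the tf.float64 Tout above:

import numpy as np

def get_iou_vector(label, pred):
    # label, pred: (batch, H, W, 1) binary masks as NumPy arrays.
    label = label > 0.5
    inter = np.logical_and(label, pred).sum(axis=(1, 2, 3))
    union = np.logical_or(label, pred).sum(axis=(1, 2, 3))
    iou = np.where(union > 0, inter / np.maximum(union, 1), 1.0)
    return np.mean(iou)  # np.float64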
Example No. 12
    def read_parse_single_example(self, serialized_sample, is_training=False):
        """
        parse tensor
        :param image_sample:
        :return:
        """
        # construct feature description
        keys_to_features = {
            'image/filename': tf.FixedLenFeature([], tf.string, default_value=''),
            'image/encoded': tf.FixedLenFeature([], tf.string, default_value=''),
            'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
            'image/height': tf.FixedLenFeature([], tf.int64),
            'image/width': tf.FixedLenFeature([], tf.int64),
            'image/channels': tf.FixedLenFeature([], tf.int64),
            'image/shape': tf.FixedLenFeature([3], tf.int64),
            'image/object/num_object': tf.FixedLenFeature([], tf.int64),
            'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
            'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64),
            'image/object/bbox/difficult': tf.VarLenFeature(dtype=tf.int64),
            'image/object/bbox/truncated': tf.VarLenFeature(dtype=tf.int64)
        }
        features = tf.io.parse_single_example(serialized=serialized_sample, features=keys_to_features)

        # parse feature
        image_name = tf.cast(features['image/filename'], dtype=tf.string)
        num_objects = tf.cast(features['image/object/num_object'], dtype=tf.int32)

        height = tf.cast(features['image/height'], dtype=tf.int32)
        width = tf.cast(features['image/width'], dtype=tf.int32)
        depth = tf.cast(features['image/channels'], dtype=tf.int32)

        # shape = tf.cast(feature['shape'], tf.int32)

        # actual data shape
        image_shape = [height, width, depth]
        bbox_shape = [num_objects, 1]

        image = tf.decode_raw(features['image/encoded'], out_type=tf.uint8)
        image = tf.reshape(image, image_shape)

        # parse gtbox
        x_min = tf.sparse_tensor_to_dense(features['image/object/bbox/xmin'], default_value=0)
        y_min = tf.sparse_tensor_to_dense(features['image/object/bbox/ymin'], default_value=0)
        x_max = tf.sparse_tensor_to_dense(features['image/object/bbox/xmax'], default_value=0)
        y_max = tf.sparse_tensor_to_dense(features['image/object/bbox/ymax'], default_value=0)
        label = tf.sparse_tensor_to_dense(features['image/object/bbox/label'],default_value=0)

        x_min = tf.reshape(x_min, bbox_shape)
        y_min = tf.reshape(y_min, bbox_shape)
        x_max = tf.reshape(x_max, bbox_shape)
        y_max = tf.reshape(y_max, bbox_shape)
        label = tf.reshape(label, bbox_shape)

        # bboxes = tf.concat([x_min[:, tf.newaxis], y_min[:, tf.newaxis], x_max[:, tf.newaxis], y_max[:, tf.newaxis], tf.cast(label[:, tf.newaxis], dtype=tf.float32)], axis=-1)
        bboxes = tf.concat([x_min, y_min, x_max, y_max, tf.cast(label, dtype=tf.float32)], axis=-1)
        bboxes = tf.reshape(bboxes, shape=[-1, 5])

        self.train_output_sizes = self.train_input_size // self.strides

        image, bboxes = tf.numpy_function(self.image_processing, inp=[image, bboxes, is_training], Tout=[tf.float32, tf.float32])
        image = tf.reshape(image, shape=(self.train_input_size, self.train_input_size, 3))
        bboxes = tf.reshape(bboxes, shape=(-1, 5))
        label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = tf.numpy_function(self.preprocess_true_boxes,
                                                                                             inp=[bboxes],
                                                                                             Tout=[tf.float32,
                                                                                                   tf.float32,
                                                                                                   tf.float32,
                                                                                                   tf.float32,
                                                                                                   tf.float32,
                                                                                                   tf.float32])

        label_sbbox = tf.reshape(label_sbbox, shape=(self.train_output_sizes[0], self.train_output_sizes[0],
                                                     self.anchor_per_scale, 5 + self.num_classes))
        label_mbbox = tf.reshape(label_mbbox, shape=(self.train_output_sizes[1], self.train_output_sizes[1],
                                                     self.anchor_per_scale, 5 + self.num_classes))
        label_lbbox = tf.reshape(label_lbbox, shape=(self.train_output_sizes[2], self.train_output_sizes[2],
                                                     self.anchor_per_scale, 5 + self.num_classes))
        sbboxes = tf.reshape(sbboxes, shape=(self.max_bbox_per_scale, 4))
        mbboxes = tf.reshape(mbboxes, shape=(self.max_bbox_per_scale, 4))
        lbboxes = tf.reshape(lbboxes, shape=(self.max_bbox_per_scale, 4))


        return image, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
Example No. 13
 def grad(dheights):
     return tf.numpy_function(
         numpy_grad_func,
         [heights, dheights],
         DEFAULT_FLOAT_DTYPE_TF,
     )
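
A grad closure like this normally lives inside tf.custom_gradient, so that both the forward pass and its gradient run as NumPy code while TF chains dy through the graph. Below is a self-contained sketch of that wiring, with a softplus stand-in for the real heights computation and DEFAULT_FLOAT_DTYPE_TF assumed to be tf.float32; as above, the input tensor is captured by the closure.

import numpy as np
import tensorflow as tf

@tf.custom_gradient
def numpy_softplus(x):
    y = tf.numpy_function(
        lambda a: np.log1p(np.exp(a)).astype(np.float32), [x], tf.float32)

    def grad(dy):
        # The backward pass is also a numpy_function; dy is chained inside TF.
        sig = tf.numpy_function(
            lambda a: (1.0 / (1.0 + np.exp(-a))).astype(np.float32), [x], tf.float32)
        return dy * sig

    return y, grad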
Example No. 14
 def eval_update(gt, pred):
     tf.numpy_function(evaluator.update_state,
                       [gt, postprocess.transform_detections(pred)], [])
Example No. 15
vocab_size = top_k + 1
num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

# Load the numpy files
def map_func(img_name, cap):
    img_tensor = np.load(img_name.decode('utf-8') + '.npy')
    return img_tensor, cap

dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(lambda item1, item2 : tf.numpy_function(map_func, [item1, item2],
        [tf.float32, tf.int32]), num_parallel_calls = tf.data.AUTOTUNE)
# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size = tf.data.AUTOTUNE)

# Model: the decoder is supposedly identical to the decoder in the neural machine translation example... but at the code level it differs.
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)
        # hidden shape == (batch_size, hidden_size)
Example No. 16
    def parse_example_ctc_attention(self, serial_example):
        norm_h = self.norm_h
        expand_rate = self.expand_rate
        debug = False

        feat_dict = tf.io.parse_single_example(serial_example, features={
            'img_raw': tf.io.FixedLenFeature([], tf.string),\
            'height': tf.io.FixedLenFeature([], tf.int64),\
            'width': tf.io.FixedLenFeature([], tf.int64),\
            'channel': tf.io.FixedLenFeature([], tf.int64),\
            'img_path': tf.io.FixedLenFeature([], tf.string),\
            'coord': tf.io.FixedLenFeature([], tf.string),\
            'label': tf.io.FixedLenFeature([], tf.string)})

        img_raw = feat_dict['img_raw']
        height = feat_dict['height']
        width = feat_dict['width']
        channel = feat_dict['channel']
        img_path = feat_dict['img_path']
        coord = feat_dict['coord']
        img_text = feat_dict['label']
        ctc_idx, ctc_len, att_idx, att_len = tf.numpy_function(
            self.get_idlst_by_charstr, [img_text],
            [tf.int64, tf.int64, tf.int64, tf.int64])
        ctc_idx = tf.cast(tf.reshape(ctc_idx, [-1]), tf.int32)
        att_idx = tf.cast(tf.reshape(att_idx, [-1]), tf.int32)
        ctc_len = tf.cast(ctc_len, tf.int32)
        att_len = tf.cast(att_len, tf.int32)

        coord_val = tf.strings.split([coord], ',').values
        coord_val = tf.strings.to_number(coord_val, out_type=tf.int32)

        img_raw = tf.io.decode_raw(img_raw, tf.uint8)
        orig_img = tf.reshape(img_raw, (height, width, channel))

        prob = tf.random.uniform([])
        invert_flg = tf.logical_and(tf.greater(prob, 0.75),
                                    tf.equal(self.mode, 'train'))
        orig_img = tf.cond(
            invert_flg,
            true_fn=lambda: tf.cast(255 - orig_img, dtype=tf.uint8),
            false_fn=lambda: orig_img)

        prob = tf.random.uniform([])
        noise_flg = tf.logical_and(tf.greater(prob, 0.75),
                                   tf.equal(self.mode, 'train'))
        noise_idx = tf.random.shuffle(tf.range(2))[0]
        orig_img = tf.cond(
            noise_flg,
            true_fn=lambda: random_noise_static(orig_img, noise_idx),
            false_fn=lambda: random_noise_static(orig_img, -1))

        prob = tf.random.uniform([])
        encode_flg = tf.logical_and(tf.greater(prob, 0.75),
                                    tf.equal(self.mode, 'train'))
        encode_idx = tf.random.shuffle(tf.range(4))[0]

        orig_img = tf.cond(
            encode_flg,
            true_fn=lambda: encode_decode_static(orig_img, encode_idx),
            false_fn=lambda: encode_decode_static(orig_img, -1))

        prob = tf.random.uniform([])
        color_flg = tf.logical_and(tf.greater(prob, 0.75),
                                   tf.equal(self.mode, 'train'))
        color_idx = tf.random.shuffle(tf.range(6))[0]
        orig_img = tf.cond(
            color_flg,
            true_fn=lambda: distort_color_static(orig_img, color_idx),
            false_fn=lambda: distort_color_static(orig_img, -1))

        prob = tf.random.uniform([])
        coord_flg = tf.logical_and(tf.greater(prob, 0.4),
                                   tf.equal(self.mode, 'train'))
        coord_val1 = tf.cond(
            coord_flg,
            true_fn=lambda: coord_augmentation(coord_val, width, height),
            false_fn=lambda:
            (coord_val[0], coord_val[1], coord_val[2], coord_val[3]))

        offset_w = coord_val1[0]
        offset_h = coord_val1[1]
        target_w = coord_val1[2] - coord_val1[0]
        target_h = coord_val1[3] - coord_val1[1]
        crop_img = tf.image.crop_to_bounding_box(orig_img, offset_h, offset_w,
                                                 target_h, target_w)

        ratio = tf.cast(norm_h, tf.float32) / tf.cast(target_h, tf.float32)
        norm_w = tf.cast(
            tf.cast(target_w, tf.float32) * expand_rate * ratio, tf.int32)
        norm_img = tf.image.resize(crop_img, (norm_h, norm_w))

        if debug:
            norm_img = tf.cast(norm_img, tf.uint8)
        else:
            # convert RGB-->BGR
            mean = [127.5, 127.5, 127.5]
            norm_img = norm_img[:, :, ::-1]
            norm_img = (norm_img - mean) / 127.5
        return img_path, norm_img, img_text, ctc_idx, ctc_len, att_idx, att_len, coord, norm_w
Example No. 17
        def map_fn(prob): return tf.numpy_function(self.perform_greedy, inp=[prob], Tout=tf.string)

        return tf.map_fn(map_fn, probs, fn_output_signature=tf.TensorSpec([], dtype=tf.string))
Example No. 18
def load_image(image_path, seq0, seq1, seq2, seq3, seq4, seq5, seq6, seq7,
               seq8, matrix_shapes):
    """
    Load image from image_path, resizing it to match the inputs required for
    InceptionV3 - notably a width and height of 299 pixels.
    """
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, seq0, seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8, \
           matrix_shapes

train_dataset = train_dataset.map(
    lambda item1, item2, item3, item4, item5, item6, item7, item8, \
         item9, item10, item11:
    tf.numpy_function(load_image,
                    [item1, item2, item3, item4, item5, item6, item7, item8,
                     item9, item10, item11],
                    [tf.float32, tf.int32, tf.int32, tf.int32, tf.int32,
                     tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
                     tf.int32]),
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)

validation_dataset = validation_dataset.map(
    lambda item1, item2, item3, item4, item5, item6, item7, item8, \
         item9, item10, item11:
    tf.numpy_function(load_image,
                    [item1, item2, item3, item4, item5, item6, item7, item8,
                     item9, item10, item11],
                    [tf.float32, tf.int32, tf.int32, tf.int32, tf.int32,
                     tf.int32, tf.int32, tf.int32, tf.int32, tf.int32,
                     tf.int32]),
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
Example No. 19
        def map_fn(prob): return tf.numpy_function(self.perform_beam_search, inp=[prob, lm], Tout=tf.string)

        return tf.map_fn(map_fn, probs, dtype=tf.string)
Example No. 20
 def __call__(self, inputs):
   return tf.map_fn(
     lambda i: tf.numpy_function(self._policy, [i], tf.int64),
     inputs,
     fn_output_signature=tf.int64,
   )
Example No. 21
 def attack_as_tf(img, y_true):
     return tf.numpy_function(attack, [img, y_true], tf.double)
Example No. 22
 def tf_env_step(self, action: tf.Tensor) -> List[tf.Tensor]:
     return tf.numpy_function(self.env_step, [action],
                              [tf.float32, tf.float32, tf.float32])
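
This is the pattern used to step a Python environment from inside a tf.function; below is a sketch of the NumPy side, assuming a classic Gym-style env whose step() returns (obs, reward, done, info), with all three outputs cast to float32 to match the Tout list above.

import numpy as np

def env_step(self, action):
    # action arrives as a NumPy integer scalar.
    state, reward, done, _ = self.env.step(int(action))
    return (np.asarray(state, dtype=np.float32),
            np.asarray(reward, dtype=np.float32),
            np.asarray(done, dtype=np.float32))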
Example No. 23
def preprocessData(raw):
    print("---preprocessData---")
    global MAX_LENGTH
    global image_features_extract_model

    image_paths, image_path_to_caption = raw

    # Before pre-processing, each image corresponds to multiple captions.
    # We will duplicate the images so that we have (image, caption) pairs.

    train_captions = []

    img_name_vector = []
    for image_path in image_paths:
        caption_list = image_path_to_caption[image_path]
        train_captions.extend(caption_list)
        img_name_vector.extend([image_path] * len(caption_list))
    encode_train = sorted(set(img_name_vector))

    image_dataset = tf.data.Dataset.from_tensor_slices(image_paths)
    image_dataset = image_dataset.map(
        load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)

    # pretrained InceptionV3 to extract features from images
    image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                    weights='imagenet')
    new_input = image_model.input
    hidden_layer = image_model.layers[-1].output
    image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

    # cache the features of image extracted by InceptionV3 to the disk
    # because the memory in RAM is not sufficient to store these features for all images
    # Note: only need to run the first time. Haozhe 11/25/20
    '''for img, path in tqdm(image_dataset):
        batch_features = image_features_extract_model(img)
        batch_features = tf.reshape(batch_features,
                                    (batch_features.shape[0], -1, batch_features.shape[3]))
        for bf, p in zip(batch_features, path):
            path_of_feature = p.numpy().decode("utf-8")
            np.save(path_of_feature, bf.numpy())'''

    # img_name_vector is a list of image file paths
    # train_captions is a list of corresponding captions
    # we need to split the training and testing set from this
    # return img_name_vector, train_captions

    # Preprocess and tokenize the captions

    # Choose the top 5000 words from the vocabulary

    tokenizer.fit_on_texts(train_captions)
    train_seqs = tokenizer.texts_to_sequences(train_captions)

    tokenizer.word_index['<pad>'] = 0
    tokenizer.index_word[0] = '<pad>'

    # Create the tokenized vectors
    train_seqs = tokenizer.texts_to_sequences(train_captions)
    print("padding: ")
    print(train_seqs[:5])
    # Pad each vector to the max_length of the captions
    # If you do not provide a max_length value, pad_sequences calculates it automatically
    cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs,
                                                               padding='post')

    # Calculates the max_length, which is used to store the attention weights
    MAX_LENGTH = calc_max_length(train_seqs)

    # Split the data into training and testing

    img_to_cap_vector = collections.defaultdict(list)
    for img, cap in zip(img_name_vector, cap_vector):
        img_to_cap_vector[img].append(cap)

    # Create training and validation sets using an 80-20 split randomly.
    img_keys = list(img_to_cap_vector.keys())
    random.shuffle(img_keys)

    slice_index = int(len(img_keys) * 0.8)
    img_name_train_keys, img_name_val_keys = img_keys[:slice_index], img_keys[
        slice_index:]
    print("parsing dataset")

    img_name_train = []
    cap_train = []
    for imgt in img_name_train_keys:
        capt_len = len(img_to_cap_vector[imgt])
        img_name_train.extend([imgt] * capt_len)
        cap_train.extend(img_to_cap_vector[imgt])

    img_name_val = []
    cap_val = []
    for imgv in img_name_val_keys:
        capv_len = len(img_to_cap_vector[imgv])
        img_name_val.extend([imgv] * capv_len)
        cap_val.extend(img_to_cap_vector[imgv])

    # Create a tf.data dataset for training
    num_steps = len(img_name_train) // BATCH_SIZE
    # Shape of the vector extracted from InceptionV3 is (64, 2048)
    # These two variables represent that vector shape
    features_shape = 2048
    attention_features_shape = 64

    dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    # Use map to load the numpy files in parallel
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(
        map_func, [item1, item2], [tf.float32, tf.int32]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Shuffle and batch
    dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE,
                                                 drop_remainder=False)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset, (img_name_val, cap_val)
Example No. 24
def tf_augment_sample(depthmap, targets):
    depthmap_aug = tf.numpy_function(augmentation, [depthmap, CONFIG.DATA_AUGMENTATION_MODE], tf.float32)
    depthmap_aug.set_shape((CONFIG.IMAGE_TARGET_HEIGHT, CONFIG.IMAGE_TARGET_WIDTH, CONFIG.N_ARTIFACTS))
    targets.set_shape((len(CONFIG.TARGET_INDEXES),))

    return depthmap_aug, targets
Example No. 25
def parse_function(image_filename, label_filename):
    # print('--------------{}-------------------'.format(tf.as_string(image_filename)))  # has filename already become a tensor here?
    img = tf.numpy_function(read_img, [image_filename], tf.float32)
    label = tf.numpy_function(read_label, [label_filename], tf.float32)
    return img, label
Example No. 26
def print_string(batch: tf.Tensor):
    tf.numpy_function(lambda x: print(*bytes_to_string(x), sep="\n"), [batch],
                      [])
Example No. 27
def process_data(image, label):
    aug_img = tf.numpy_function(func=aug_fn, inp=[image], Tout=tf.float32)

    return aug_img, label
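
aug_fn is usually a thin wrapper around a CPU-side augmentation library; a purely NumPy stand-in that satisfies the Tout=tf.float32 contract might look like this.

import numpy as np

def aug_fn(image):
    # Hypothetical augmentation: random left-right flip plus brightness jitter,
    # assuming the image is already scaled to [0, 1].
    img = np.asarray(image, dtype=np.float32)
    if np.random.rand() < 0.5:
        img = img[:, ::-1, :]
    img = np.clip(img + np.random.uniform(-0.1, 0.1), 0.0, 1.0)
    return img.astype(np.float32)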
Example No. 28
def tf_average_by_duration(x, durs):
    outs = tf.numpy_function(average_by_duration, [x, durs], tf.float32)
    return outs
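
average_by_duration is project-specific (FastSpeech-style averaging of frame-level values over per-token durations); a NumPy sketch under that assumption:

import numpy as np

def average_by_duration(x, durs):
    # x: frame-level values, shape (T,); durs: frames per token, shape (N,).
    durs = durs.astype(np.int64)
    ends = np.cumsum(durs)
    starts = ends - durs
    out = np.zeros(len(durs), dtype=np.float32)
    for i, (s, e) in enumerate(zip(starts, ends)):
        out[i] = x[s:e].mean() if e > s else 0.0
    return out  # float32, matching the tf.float32 Tout above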
Example No. 29
def generate_detections(params,
                        cls_outputs,
                        box_outputs,
                        image_scales,
                        image_ids,
                        flip=False):
  """A legacy interface for generating [id, x, y, w, h, score, class]."""
  _, width = utils.parse_image_size(params['image_size'])

  original_image_widths = tf.expand_dims(image_scales, -1) * width

  if params['nms_configs'].get('pyfunc', True):
    # NumPy-based soft-NMS gives better accuracy than the TensorFlow builtin;
    # the reason why is unknown.
    detections_bs = []
    boxes, scores, classes = pre_nms(params, cls_outputs, box_outputs)
    for index in range(boxes.shape[0]):
      nms_configs = params['nms_configs']
      detections = tf.numpy_function(
          functools.partial(nms_np.per_class_nms, nms_configs=nms_configs), [
              boxes[index],
              scores[index],
              classes[index],
              tf.slice(image_ids, [index], [1]),
              tf.slice(image_scales, [index], [1]),
              params['num_classes'],
              nms_configs['max_output_size'],
          ], tf.float32)

      if flip:
        detections = tf.stack([
            detections[:, 0],
            # the mirrored location of the left edge is the image width
            # minus the position of the right edge
            original_image_widths[index] - detections[:, 3],
            detections[:, 2],
            # the mirrored location of the right edge is the image width
            # minus the position of the left edge
            original_image_widths[index] - detections[:, 1],
            detections[:, 4],
            detections[:, 5],
            detections[:, 6],
        ], axis=-1)
      detections_bs.append(detections)
    return tf.stack(detections_bs, axis=0, name='detections')

  nms_boxes_bs, nms_scores_bs, nms_classes_bs, _ = postprocess_per_class(
      params, cls_outputs, box_outputs, image_scales)

  image_ids_bs = tf.cast(tf.expand_dims(image_ids, -1), nms_scores_bs.dtype)
  if flip:
    detections_bs = [
        image_ids_bs * tf.ones_like(nms_scores_bs),
        # the mirrored location of the left edge is the image width
        # minus the position of the right edge
        original_image_widths - nms_boxes_bs[:, :, 3],
        nms_boxes_bs[:, :, 0],
        # the mirrored location of the right edge is the image width
        # minus the position of the left edge
        original_image_widths - nms_boxes_bs[:, :, 1],
        nms_boxes_bs[:, :, 2],
        nms_scores_bs,
        nms_classes_bs,
    ]
  else:
    detections_bs = [
        image_ids_bs * tf.ones_like(nms_scores_bs),
        nms_boxes_bs[:, :, 1],
        nms_boxes_bs[:, :, 0],
        nms_boxes_bs[:, :, 3],
        nms_boxes_bs[:, :, 2],
        nms_scores_bs,
        nms_classes_bs,
    ]
  return tf.stack(detections_bs, axis=-1, name='detections')
Example No. 30
def load_tensor_from_kaldi_archive(ark_key):
    return tf.numpy_function(_kaldiio_load, [ark_key], tf.float32)
Example No. 31
    def parse_example_ctc_attention(self, line):
        norm_h = self.norm_h
        expand_rate = self.expand_rate
        debug = False
        field_delim = ' '
        use_quote_delim = False
        record_defaults = ['', '', '']
        img_path, img_text, coord = tf.io.decode_csv(line, record_defaults,
                                                     field_delim,
                                                     use_quote_delim)
        ctc_idx, ctc_len, att_idx, att_len = tf.numpy_function(
            self.get_idlst_by_charstr, [img_text],
            [tf.int64, tf.int64, tf.int64, tf.int64])
        ctc_idx = tf.cast(tf.reshape(ctc_idx, [-1]), tf.int32)
        att_idx = tf.cast(tf.reshape(att_idx, [-1]), tf.int32)
        ctc_len = tf.cast(ctc_len, tf.int32)
        att_len = tf.cast(att_len, tf.int32)
        coord_val = tf.strings.split([coord], ',').values
        coord_val = tf.strings.to_number(coord_val, out_type=tf.int32)
        orig_img = tf.image.decode_image(tf.io.read_file(img_path))
        img_shape = tf.shape(orig_img)
        width = img_shape[1]
        height = img_shape[0]

        prob = tf.random.uniform([])
        invert_flg = tf.logical_and(tf.greater(prob, 0.0),
                                    tf.equal(self.mode, 'train'))
        orig_img = tf.cond(
            invert_flg,
            true_fn=lambda: tf.cast(255 - orig_img, dtype=tf.uint8),
            false_fn=lambda: orig_img)

        prob = tf.random.uniform([])
        noise_flg = tf.logical_and(tf.greater(prob, 0.0),
                                   tf.equal(self.mode, 'train'))
        noise_idx = tf.random.shuffle(tf.range(2))[0]
        orig_img = tf.cond(
            noise_flg,
            true_fn=lambda: random_noise_static(orig_img, noise_idx),
            false_fn=lambda: random_noise_static(orig_img, -1))

        prob = tf.random.uniform([])
        encode_flg = tf.logical_and(tf.greater(0.3, prob),
                                    tf.equal(self.mode, 'train'))
        encode_idx = tf.random.shuffle(tf.range(4))[0]

        orig_img = tf.cond(
            encode_flg,
            true_fn=lambda: encode_decode_static(orig_img, encode_idx),
            false_fn=lambda: encode_decode_static(orig_img, -1))

        prob = tf.random.uniform([])
        color_flg = tf.logical_and(tf.greater(prob, 0),
                                   tf.equal(self.mode, 'train'))
        color_idx = tf.random.shuffle(tf.range(6))[0]
        orig_img = tf.cond(
            color_flg,
            true_fn=lambda: distort_color_static(orig_img, color_idx),
            false_fn=lambda: distort_color_static(orig_img, -1))

        prob = tf.random.uniform([])
        coord_flg = tf.logical_and(tf.greater(prob, 0),
                                   tf.equal(self.mode, 'train'))
        coord_val1 = tf.cond(
            coord_flg,
            true_fn=lambda: coord_augmentation(coord_val, width, height),
            false_fn=lambda:
            (coord_val[0], coord_val[1], coord_val[2], coord_val[3]))

        offset_w = coord_val1[0]
        offset_h = coord_val1[1]
        target_w = coord_val1[2] - coord_val1[0]
        target_h = coord_val1[3] - coord_val1[1]
        crop_img = tf.image.crop_to_bounding_box(orig_img, offset_h, offset_w,
                                                 target_h, target_w)

        ratio = tf.cast(norm_h, tf.float32) / tf.cast(target_h, tf.float32)
        norm_w = tf.cast(
            tf.cast(target_w, tf.float32) * expand_rate * ratio, tf.int32)
        norm_img = tf.image.resize(crop_img, (norm_h, norm_w))
        if debug:
            norm_img = tf.cast(norm_img, tf.uint8)
        else:
            # convert RGB-->BGR
            mean = [127.5, 127.5, 127.5]
            norm_img = norm_img[:, :, ::-1]
            norm_img = norm_img - mean

        return img_path, norm_img, img_text, ctc_idx, ctc_len, att_idx, att_len, coord, norm_w