Example #1
    def encoded_to_compressed_tensor(encoded):
        """
        Transform a binary-encoded image string into its compressed tensor.

        Parameters
        ----------
        encoded: bytes
            The binary representation of the image

        Returns
        -------
        numpy.ndarray
        """
        return io_ops.decode_raw(encoded, 'uint8')
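A minimal usage sketch (image values and names are illustrative, assuming `io_ops` aliases `tf.io` from TensorFlow 2.x):

    import numpy as np
    import tensorflow as tf

    # Hypothetical 2x2 grayscale image serialized to raw bytes
    img = np.array([[0, 255], [128, 64]], dtype=np.uint8)
    encoded = img.tobytes()

    # decode_raw yields a flat uint8 tensor, one element per byte
    flat = tf.io.decode_raw(encoded, tf.uint8)
    restored = tf.reshape(flat, img.shape)  # recover the original layout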
Example #2
    def decode(self, serialized_example):
        """ Load final output to neural network training procedure

        :param serialized_example: compressed example recorded
        on TFRecordWriter

        :return: image, label as one hot encoder
        """

        features = io.parse_single_example(serialized_example,
                                           self.get_template())
        image = io.decode_raw(features['image'], 'uint8')
        image = reshape(image, self.shape + (3, ))
        image = cast(image, dtype='float32') / 255
        return image, one_hot(features['label'], self.labels)
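For context, a self-contained sketch of the same decoder wired into an input pipeline; the feature template (what `get_template()` presumably returns), the image shape, and the label count are assumptions:

    import tensorflow as tf

    # Hypothetical template; get_template() in the example above presumably
    # returns a spec of this shape
    template = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64),
    }

    def decode(serialized_example, shape=(32, 32), num_labels=10):
        features = tf.io.parse_single_example(serialized_example, template)
        image = tf.io.decode_raw(features["image"], tf.uint8)
        image = tf.reshape(image, shape + (3,))
        image = tf.cast(image, tf.float32) / 255
        return image, tf.one_hot(features["label"], num_labels)

    dataset = tf.data.TFRecordDataset("train.tfrecord").map(decode).batch(32)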
Example #3
def bytes_sequence_to_encoding_bilstm(feature_tensor, feature_info, file_io: FileIO):
    """
    Encode a string tensor into an encoding.
    Works by converting the string into a bytes sequence and then generating
    a categorical/char embedding for each of the 256 bytes. The char/byte embeddings
    are then combined using a biLSTM

    Args:
        feature_tensor: String feature tensor that is to be encoded
        feature_info: Dictionary representing the feature_config for the input feature
        file_io: FileIO handler object for reading and writing

    Returns:
        Encoded feature tensor

    Args under feature_layer_info:
        max_length: int; max length of bytes sequence
        embedding_size: int; dimension size of the embedding;
                        if null, then the tensor is just converted to its one-hot representation
        encoding_size: int; dimension size of the sequence encoding computed using a biLSTM

    NOTE:
        The input dimension for the embedding is fixed to 256 because the string is
        converted into a bytes sequence.
    """
    args = feature_info["feature_layer_info"]["args"]

    # Decode string tensor to bytes
    feature_tensor = io.decode_raw(
        feature_tensor, out_type=tf.uint8, fixed_length=args.get("max_length", None),
    )

    feature_tensor = tf.squeeze(feature_tensor, axis=1)
    if "embedding_size" in args:
        char_embedding = layers.Embedding(
            name="{}_bytes_embedding".format(
                feature_info.get("node_name", feature_info.get("name"))
            ),
            input_dim=256,
            output_dim=args["embedding_size"],
            mask_zero=True,
            input_length=args.get("max_length", None),
        )(feature_tensor)
    else:
        char_embedding = tf.one_hot(feature_tensor, depth=256)

    encoding = get_bilstm_encoding(char_embedding, int(args["encoding_size"] / 2))

    return encoding
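`get_bilstm_encoding` is not shown in this example; a minimal sketch of what such a helper could look like, which also explains why the call above halves `encoding_size` (the Bidirectional wrapper concatenates forward and backward states, doubling the unit count):

    from tensorflow.keras import layers

    def get_bilstm_encoding(embedding, units):
        # Forward and backward final states are concatenated,
        # so the output dimension is 2 * units
        return layers.Bidirectional(
            layers.LSTM(units, return_sequences=False), merge_mode="concat"
        )(embedding)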
Example #4
    def __init__(self, observation_size, net_arch, initializer, activation,
                 clip_range, value_coef, entropy_coef, learning_rate,
                 pre_training_learning_rate, action_bounds, policy):
        """
        :param observation_size:
        :param net_arch:
        :param initializer:
        :param activation:
        :param clip_range:
        :param value_coef:
        :param entropy_coef:
        :param learning_rate:
        :param pre_training_learning_rate:
        :param action_bounds:
        :param policy:
        """
        """Set class constants"""
        self.observation_size = observation_size
        self.net_arch = net_arch
        self.initializer = initializer
        self.activation = activation
        self.clip_range = clip_range
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

        if action_bounds is None:
            action_bounds = [0.0, 1.5]
        self.action_bounds = action_bounds
        self.learning_rate = learning_rate
        self.pre_training_learning_rate = pre_training_learning_rate

        if policy is None:
            policy = GaussFull()
        self.policy = policy
        """Set up the tensorflow graph"""
        self.graph = Graph()

        with self.graph.as_default():
            self.sess = Session(graph=self.graph)
            """ core """
            # place holders
            self.observation_string_ph = placeholder(
                shape=(None, 1), dtype=string, name="observation_string_ph")
            self.action_ph = placeholder(dtype=float32,
                                         shape=(None, 1),
                                         name="action_ph")
            self.old_neg_logits = placeholder(dtype=float32,
                                              shape=(None, 1),
                                              name="old_neg_logits")
            self.advantage_ph = placeholder(dtype=float32,
                                            shape=(None, 1),
                                            name="advantage_ph")
            self.value_target_ph = placeholder(dtype=float32,
                                               shape=(None, 1),
                                               name="value_target_ph")
            # learning rate tensors
            self.learning_rate_ph = placeholder_with_default(
                input=self.learning_rate, shape=())
            self.pre_training_learning_rate_ph = placeholder_with_default(
                input=self.pre_training_learning_rate, shape=())

            # observation tensor
            replaced1 = regex_replace(self.observation_string_ph, "/", "_")
            replaced2 = regex_replace(replaced1, r"\+", "-")
            byte_tensor = decode_base64(replaced2)
            decoded = decode_raw(byte_tensor, out_type=float32)
            squeezed = squeeze(decoded, axis=1)
            self.observation_input = ensure_shape(
                squeezed,
                shape=(None, self.observation_size),
                name="observation_input")

            # policy net
            latent_policy = net_core(self.observation_input, self.net_arch,
                                     self.initializer, self.activation)
            self.policy.construct(latent_policy=latent_policy)

            self.clipped_action = clip_by_value(
                cast(self.policy.action, float32), self.action_bounds[0],
                self.action_bounds[1], "clipped_action")

            # value net
            latent_value = net_core(self.observation_input, self.net_arch,
                                    self.initializer, self.activation)
            self.value = identity(
                input=Dense(units=1,
                            activation=None,
                            kernel_initializer=self.initializer)(latent_value),
                name="value")
            """loss calculation"""
            # policy loss
            self.neg_logits = self.policy.neg_logits_from_actions(
                self.action_ph)
            ratio = exp(self.old_neg_logits - self.neg_logits)

            standardized_adv = (self.advantage_ph - reduce_mean(
                self.advantage_ph)) / (reduce_std(self.advantage_ph) + 1e-8)
            raw_policy_loss = -standardized_adv * ratio
            clipped_policy_loss = -standardized_adv * clip_by_value(
                ratio, 1 - self.clip_range, 1 + self.clip_range)
            self.policy_loss = reduce_mean(
                maximum(raw_policy_loss, clipped_policy_loss))

            self.value_loss = mean_squared_error(self.value_target_ph,
                                                 self.value)

            # entropy loss
            self.entropy_loss = -reduce_mean(self.policy.entropy)

            # total loss
            self.total_loss = (self.policy_loss +
                               self.value_coef * self.value_loss +
                               self.entropy_coef * self.entropy_loss)

            # optimizer
            optimizer = AdamOptimizer(learning_rate=self.learning_rate_ph)

            # training ops
            self.training_op = optimizer.minimize(self.total_loss)

            # pre training
            self.dist_param_target_ph = placeholder(
                dtype=float32,
                shape=(None, self.policy.dist_params.shape[1]),
                name="dist_param_label_ph")
            self.pre_training_loss = mean_squared_error(
                self.dist_param_target_ph, self.policy.dist_params)
            pre_training_optimizer = GradientDescentOptimizer(
                learning_rate=self.pre_training_learning_rate_ph)
            self.pre_training_op = pre_training_optimizer.minimize(
                self.pre_training_loss)
            """utility nodes"""
            # inspect model weights
            self.trainable_variables = trainable_variables()

            # saver
            self.saver = Saver()

            # tensorboard summaries
            self.summary = merge([
                histogram("values", self.value),
                histogram("advantages", standardized_adv),
                histogram("actions", self.clipped_action),
                histogram("det_actions",
                          replace_nan(self.policy.det_action, 0.0)),
                histogram("value_targets", self.value_target_ph),
                scalar("policy_loss", self.policy_loss),
                scalar("value_loss", self.value_loss),
                scalar("entropy_loss", self.entropy_loss)
            ])

            self.pre_summary = merge([
                histogram("pretraining_actions", self.clipped_action),
                scalar("pretraining_loss", self.pre_training_loss)
            ])

            # initialization
            init = global_variables_initializer()
            self.sess.run(init)
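The observation plumbing above expects web-safe base64: `tf.io.decode_base64` uses the "-"/"_" alphabet, and the two `regex_replace` ops convert standard base64 into it. A hedged client-side sketch of producing a matching feed value (the NumPy encoding step is an assumption, not part of the example):

    import base64
    import numpy as np

    observation_size = 8  # must match the value passed to __init__
    obs = np.random.rand(observation_size).astype(np.float32)

    # Standard base64; the graph rewrites "/" -> "_" and "+" -> "-" before
    # decode_base64, then decode_raw reinterprets the bytes as float32
    encoded = base64.b64encode(obs.tobytes()).decode("ascii")
    feed = np.array([[encoded]])  # matches observation_string_ph, shape (None, 1)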
Example #5
def bytes_sequence_to_encoding_bilstm(feature_tensor, feature_info,
                                      file_io: FileIO):
    """
    Encode a string tensor into an encoding.
    Works by converting the string into a bytes sequence and then generating
    a categorical/char embedding for each of the 256 bytes. The char/byte embeddings
    are then combined using a biLSTM

    Parameters
    ----------
    feature_tensor : Tensor object
        String feature tensor that is to be encoded
    feature_info : dict
        Dictionary representing the feature_config for the input feature
    file_io : FileIO object
        FileIO handler object for reading and writing

    Returns
    -------
    Tensor object
        Encoded feature tensor

    Notes
    -----
    Args under `feature_layer_info`:
        max_length : int
            max length of bytes sequence
        embedding_size : int
            dimension size of the embedding;
            if null, then the tensor is just converted to its one-hot representation
        encoding_size : int
            dimension size of the sequence encoding computed using a biLSTM

    The input dimension for the embedding is fixed to 256 because the string is
    converted into a bytes sequence.
    """
    args = feature_info["feature_layer_info"]["args"]

    # Decode string tensor to bytes
    feature_tensor = io.decode_raw(
        feature_tensor,
        out_type=tf.uint8,
        fixed_length=args.get("max_length", None),
    )

    feature_tensor = tf.squeeze(feature_tensor, axis=1)
    if "embedding_size" in args:
        char_embedding = layers.Embedding(
            name="{}_bytes_embedding".format(
                feature_info.get("node_name", feature_info.get("name"))),
            input_dim=256,
            output_dim=args["embedding_size"],
            mask_zero=True,
            input_length=args.get("max_length", None),
        )(feature_tensor)
    else:
        char_embedding = tf.one_hot(feature_tensor, depth=256)

    kernel_initializer = args.get("lstm_kernel_initializer", "glorot_uniform")
    encoding = get_bilstm_encoding(
        embedding=char_embedding,
        lstm_units=int(args["encoding_size"] / 2),
        kernel_initializer=kernel_initializer,
    )
    return encoding
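For reference, a hedged example of the `feature_info` dictionary this function consumes; the key names follow the lookups in the code, while the concrete values are illustrative:

    feature_info = {
        "name": "query_text",
        "node_name": "query_text",
        "feature_layer_info": {
            "args": {
                "max_length": 20,       # bytes sequence is padded/truncated to this
                "embedding_size": 128,  # omit to fall back to one-hot over 256 bytes
                "encoding_size": 512,   # biLSTM output size; 256 units per direction
                "lstm_kernel_initializer": "glorot_uniform",
            }
        },
    }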
Example #6
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.SequenceExample` proto using the features_spec

        Args:
            sequence_example_proto: tfrecord SequenceExample protobuf data

        Returns:
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        context_features, sequence_features = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features_dict = dict()

        # Explode context features into all records
        for feature_info in feature_config.get_context_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])
            feature_layer_info = feature_info.get("feature_layer_info")

            feature_tensor = context_features.get(feature_node_name)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)
            feature_tensor = tf.tile(feature_tensor,
                                     multiples=[max_num_records])

            # If feature is a string, then decode into numbers
            if feature_layer_info["type"] == FeatureTypeKey.STRING:
                feature_tensor = io.decode_raw(
                    feature_tensor,
                    out_type=tf.uint8,
                    fixed_length=feature_layer_info["max_length"],
                )
                feature_tensor = tf.cast(feature_tensor, tf.float32)

            features_dict[feature_node_name] = feature_tensor

        # Pad sequence features to max_num_records
        for feature_info in feature_config.get_sequence_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])
            feature_layer_info = feature_info["feature_layer_info"]

            feature_tensor = sequence_features.get(feature_node_name)

            if isinstance(feature_tensor, sparse.SparseTensor):
                if feature_node_name == feature_config.get_rank(
                        key="node_name"):
                    # Add mask for identifying padded records
                    mask = tf.ones_like(
                        sparse.to_dense(sparse.reset_shape(feature_tensor)))
                    mask = tf.expand_dims(mask, axis=2)

                    def crop_fn():
                        tf.print(
                            "\n[WARN] Bad query found. Number of records : ",
                            tf.shape(mask)[1])
                        return image.crop_to_bounding_box(
                            mask,
                            offset_height=0,
                            offset_width=0,
                            target_height=1,
                            target_width=max_num_records,
                        )

                    mask = tf.cond(
                        tf.shape(mask)[1] < max_num_records,
                        # Pad if there are missing records
                        lambda: image.pad_to_bounding_box(
                            mask,
                            offset_height=0,
                            offset_width=0,
                            target_height=1,
                            target_width=max_num_records,
                        ),
                        # Crop if there are extra records
                        crop_fn,
                    )
                    mask = tf.squeeze(mask)

                    # Check validity of mask
                    tf.debugging.assert_greater(
                        tf.cast(tf.reduce_sum(mask), tf.float32),
                        tf.constant(0.0))

                    features_dict["mask"] = mask

                feature_tensor = sparse.reset_shape(
                    feature_tensor, new_shape=[1, max_num_records])
                feature_tensor = sparse.to_dense(feature_tensor)
                feature_tensor = tf.squeeze(feature_tensor)

                # If feature is a string, then decode into numbers
                if feature_layer_info["type"] == FeatureTypeKey.STRING:
                    feature_tensor = io.decode_raw(
                        feature_tensor,
                        out_type=tf.uint8,
                        fixed_length=feature_layer_info["max_length"],
                    )
                    feature_tensor = tf.cast(feature_tensor, tf.float32)
            else:
                raise ValueError("Invalid input : {}".format(feature_node_name))

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        # Check if label is one-hot and correctly masked
        tf.debugging.assert_equal(tf.cast(tf.reduce_sum(labels), tf.float32),
                                  tf.constant(1.0))

        return features_dict, labels
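The pad-or-crop branch above normalizes every query to exactly `max_num_records` entries; a standalone sketch of the same mechanism with illustrative values:

    import tensorflow as tf

    max_num_records = 5
    mask = tf.ones([1, 3, 1])  # a query with 3 real records

    mask = tf.cond(
        tf.shape(mask)[1] < max_num_records,
        # Pad if there are missing records
        lambda: tf.image.pad_to_bounding_box(mask, 0, 0, 1, max_num_records),
        # Crop if there are extra records
        lambda: tf.image.crop_to_bounding_box(mask, 0, 0, 1, max_num_records),
    )
    print(tf.squeeze(mask))  # [1. 1. 1. 0. 0.] -> padded records are masked out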
Example #7
    def feature_layer(inputs):
        train_features = list()
        metadata_features = dict()

        numeric_tile_shape = tf.shape(
            tf.expand_dims(tf.gather(inputs["mask"], indices=0), axis=0))

        for feature_info in feature_config.get_all_features(
                include_label=False):
            feature_name = feature_info["name"]
            feature_node_name = feature_info.get("node_name", feature_name)
            feature_layer_info = feature_info["feature_layer_info"]

            if feature_layer_info["type"] == FeatureTypeKey.NUMERIC:
                # Numeric input features
                if feature_info["dtype"] in (tf.float32, tf.int64):
                    dense_feature = inputs[feature_node_name]

                    if feature_info[
                            "tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                        dense_feature = tf.tile(dense_feature,
                                                numeric_tile_shape)

                    if "fn" in feature_layer_info:
                        dense_feature = feature_layer_map.get_fn(
                            feature_layer_info["fn"])(dense_feature,
                                                      feature_info)
                    elif feature_info["trainable"]:
                        dense_feature = tf.expand_dims(tf.cast(
                            dense_feature, tf.float32),
                                                       axis=-1)

                    if feature_info["trainable"]:
                        train_features.append(dense_feature)
                    else:
                        metadata_features[feature_node_name] = tf.cast(
                            dense_feature, tf.float32)

                # String input features
                elif feature_info["dtype"] in (tf.string, ):
                    if feature_info["trainable"]:
                        decoded_string_tensor = io.decode_raw(
                            inputs[feature_node_name],
                            out_type=tf.uint8,
                            fixed_length=feature_layer_info["args"]
                            ["max_length"],
                        )
                        if "fn" in feature_layer_info:
                            dense_feature = feature_layer_map.get_fn(
                                feature_layer_info["fn"])(
                                    decoded_string_tensor, feature_info)
                        """
                        Creating a tensor [1, sequence_size, 1] dynamically
                        NOTE:
                        Tried multiple methods using `convert_to_tensor`, `concat`, with no results
                        """
                        if feature_info[
                                "tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                            tile_dims = tf.shape(
                                tf.expand_dims(
                                    tf.expand_dims(tf.gather(inputs["mask"],
                                                             indices=0),
                                                   axis=0),
                                    axis=-1,
                                ))
                            dense_feature = tf.tile(dense_feature, tile_dims)

                        train_features.append(dense_feature)

            elif feature_layer_info["type"] == FeatureTypeKey.STRING:
                if feature_info["trainable"]:
                    raise ValueError(
                        "Can not train on string tensors directly. Please use a feature layer"
                    )
                else:
                    metadata_features[feature_node_name] = inputs[
                        feature_node_name]
            elif feature_layer_info["type"] == FeatureTypeKey.CATEGORICAL:
                if feature_info["trainable"]:
                    if "fn" in feature_layer_info:
                        dense_feature = feature_layer_map.get_fn(
                            feature_layer_info["fn"])(
                                inputs[feature_node_name], feature_info)

                    if feature_info[
                            "tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                        tile_dims = tf.shape(
                            tf.expand_dims(
                                tf.expand_dims(tf.gather(inputs["mask"],
                                                         indices=0),
                                               axis=0),
                                axis=-1,
                            ))
                        dense_feature = tf.tile(dense_feature, tile_dims)

                    train_features.append(dense_feature)
                else:
                    raise NotImplementedError
            else:
                raise Exception(
                    "Unknown feature type {} for feature : {}".format(
                        feature_layer_info["type"], feature_name))

        return train_features, metadata_features
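The nested `expand_dims`/`gather` calls above build a dynamic `[1, sequence_size, 1]` multiples tensor for `tf.tile`; a standalone sketch with illustrative shapes:

    import tensorflow as tf

    mask = tf.ones([2, 25])  # 2 queries, 25 records each
    # One query's mask, reshaped to [1, sequence_size, 1]; tf.shape reads the
    # tile multiples off it at graph-execution time
    tile_dims = tf.shape(
        tf.expand_dims(tf.expand_dims(tf.gather(mask, indices=0), axis=0), axis=-1)
    )  # -> [1 25 1]

    context_feature = tf.random.uniform([1, 1, 8])  # one context vector per query
    tiled = tf.tile(context_feature, tile_dims)     # -> shape [1, 25, 8]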
Example #8
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.SequenceExample` proto using the features_spec

        Args:
            sequence_example_proto: tfrecord SequenceExample protobuf data

        Returns:
            TODO(ashish): note - "features" is not a Features object.  It's a {feat_name: tf.Tensor} mapping
            (so perhaps a bad name?)
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        context, examples = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features = dict()

        # Explode context features into all records
        for feat, t in context.items():
            t = tf.expand_dims(t, axis=0)
            t = tf.tile(t, multiples=[max_num_records])

            # If feature is a string, then decode into numbers
            if feature_config.get_dict(
            )[feat]["type"] == FeatureTypeKey.STRING:
                t = io.decode_raw(
                    t,
                    out_type=tf.uint8,
                    fixed_length=feature_config.get_dict()[feat]["max_length"],
                )
                t = tf.cast(t, tf.float32)

            features[feat] = t

        # Pad sequence features to max_num_records
        for feat, t in examples.items():
            if isinstance(t, sparse.SparseTensor):
                if feat == "pos":
                    # Add mask for identifying padded records
                    mask = tf.ones_like(sparse.to_dense(sparse.reset_shape(t)))
                    mask = tf.expand_dims(mask, axis=2)
                    mask = image.pad_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_num_records,
                    )
                    features["mask"] = tf.squeeze(mask)

                t = sparse.reset_shape(t, new_shape=[1, max_num_records])
                t = sparse.to_dense(t)
                t = tf.squeeze(t)

                # If feature is a string, then decode into numbers
                if feature_config.get_dict(
                )[feat]["type"] == FeatureTypeKey.STRING:
                    t = io.decode_raw(
                        t,
                        out_type=tf.uint8,
                        fixed_length=feature_config.get_dict()[feat]
                        ["max_length"],
                    )
                    t = tf.cast(t, tf.float32)
            else:
                #
                # Handle dense tensors
                #
                # if len(t.shape) == 1:
                #     t = tf.expand_dims(t, axis=0)
                # if len(t.shape) == 2:
                #     t = tf.pad(t, paddings=[[0, 0], [0, max_num_records]])
                #     t = tf.squeeze(t)
                # else:
                #     raise Exception('Invalid input : {}'.format(feat))
                raise ValueError("Invalid input : {}".format(feat))

            features[feat] = t

        labels = features.pop(feature_config.label)
        return features, labels
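The mask derived from the "pos" feature marks real records with ones before padding to `max_num_records`; a standalone sketch of that sparse-to-mask step with illustrative values:

    import tensorflow as tf

    # A sparse "pos" feature for a query with 3 real records
    t = tf.sparse.SparseTensor(indices=[[0, 0], [0, 1], [0, 2]],
                               values=[1.0, 2.0, 3.0],
                               dense_shape=[1, 3])
    max_num_records = 5

    mask = tf.ones_like(tf.sparse.to_dense(tf.sparse.reset_shape(t)))
    mask = tf.expand_dims(mask, axis=2)  # [1, 3, 1]
    mask = tf.image.pad_to_bounding_box(mask, 0, 0, 1, max_num_records)
    print(tf.squeeze(mask))  # [1. 1. 1. 0. 0.]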