def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a Discretization for our integer indices.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that only yields our feature.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layer so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
Exemple #2
0
 def getCategoryEncodingLayer(self, name, dataset, dtype, max_tokens=None):
     if dtype == 'string':
         index = preprocessing.StringLookup(max_tokens=max_tokens)
     else:
         index = preprocessing.IntegerLookup(max_tokens=max_tokens)
     feature_ds = dataset.map(lambda x, y: x[name])
     index.adapt(feature_ds)
     encoder = preprocessing.CategoryEncoding(
         num_tokens=index.vocabulary_size())
     return lambda feature: encoder(index(feature))
Exemple #3
0
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
    feature_ds = feature_ds.map(index)
    encoder.adapt(feature_ds)

    return lambda feature: encoder(index(feature))
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Creates everything that's needed for a categorical encoding input pipeline.

    Args:
        name (string): name of the feature
        dataset (tf.DataSet): tensorflow dataset
        dtype (string): datatype
        max_tokens (int, optional): maximum number of tokens. Defaults to None.

    Returns:
        lambda function: categorical input pipeline
    """
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = exp_preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = exp_preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a Discretization for our integer indices.
    encoder = exp_preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that only yields our feature.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layer so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
Exemple #5
0
string `""`), and index 1 is reserved for out-of-vocabulary values (values that were not
seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token`
constructor arguments  of `StringLookup`.

You can see the `StringLookup` and `CategoryEncoding` layers in action in the example
[structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/).
"""
"""
### Encoding integer categorical features via one-hot encoding
"""

# Define some toy data
data = tf.constant([10, 20, 20, 10, 30, 0])

# Use IntegerLookup to build an index of the feature values
indexer = preprocessing.IntegerLookup()
indexer.adapt(data)

# Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))

# Convert new test data (which includes unknown feature values)
test_data = tf.constant([10, 10, 20, 50, 60, 0])
encoded_data = encoder(indexer(test_data))
print(encoded_data)
"""
Note that index 0 is reserved for missing values (which you should specify as the value
0), and index 1 is reserved for out-of-vocabulary values (values that were not seen
during `adapt()`). You can configure this by using the `mask_value` and `oov_value`
constructor arguments  of `IntegerLookup`.
Exemple #6
0
(values that were not seen during `adapt()`).

You can see the `StringLookup` in action in the
[Structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/)
example.
"""

"""
### Encoding integer categorical features via one-hot encoding
"""

# Define some toy data
data = tf.constant([[10], [20], [20], [10], [30], [0]])

# Use IntegerLookup to build an index of the feature values and encode output.
lookup = preprocessing.IntegerLookup(output_mode="one_hot")
lookup.adapt(data)

# Convert new test data (which includes unknown feature values)
test_data = tf.constant([[10], [10], [20], [50], [60], [0]])
encoded_data = lookup(test_data)
print(encoded_data)

"""
Note that index 0 is reserved for missing values (which you should specify as the value
0), and index 1 is reserved for out-of-vocabulary values (values that were not seen
during `adapt()`). You can configure this by using the `mask_token` and `oov_token`
constructor arguments  of `IntegerLookup`.

You can see the `IntegerLookup` in action in the example
[structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/).
seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token`
constructor arguments  of `StringLookup`.

You can see the `StringLookup` in action in the
[Structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/)
example.
"""
"""
### Encoding integer categorical features via one-hot encoding
"""

# Define some toy data
data = tf.constant([[10], [20], [20], [10], [30], [0]])

# Use IntegerLookup to build an index of the feature values and encode output.
lookup = preprocessing.IntegerLookup(output_mode="binary")
lookup.adapt(data)

# Convert new test data (which includes unknown feature values)
test_data = tf.constant([[10], [10], [20], [50], [60], [0]])
encoded_data = lookup(test_data)
print(encoded_data)
"""
Note that index 0 is reserved for missing values (which you should specify as the value
0), and index 1 is reserved for out-of-vocabulary values (values that were not seen
during `adapt()`). You can configure this by using the `mask_token` and `oov_token`
constructor arguments  of `IntegerLookup`.

You can see the `IntegerLookup` in action in the example
[structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/).
"""