def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup or IntegerLookup layer which will turn the raw values into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding layer to one-hot encode the integer indices.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that yields the feature values mapped to integer indices.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
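
A minimal usage sketch of the returned encoder; the dataset `train_ds` and the feature name 'thal' below are hypothetical placeholders, not taken from the snippet above:

# Hypothetical usage: 'thal' is a string feature in a tf.data.Dataset `train_ds`
# that yields (feature_dict, label) pairs.
thal_input = tf.keras.Input(shape=(1,), name='thal', dtype='string')
encoding_layer = get_category_encoding_layer('thal', train_ds, dtype='string', max_tokens=5)
encoded_thal = encoding_layer(thal_input)
# `encoded_thal` can be concatenated with other encoded features in a functional model.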
Example #2
def getCategoryEncodingLayer(self, name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(
        num_tokens=index.vocabulary_size())
    return lambda feature: encoder(index(feature))
Example #3
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
    feature_ds = feature_ds.map(index)
    encoder.adapt(feature_ds)

    return lambda feature: encoder(index(feature))
Example #4
def processcsv(featurecsv, csv, preprocess):
    from tensorflow.keras.layers.experimental import preprocessing

    inputs = {}
    for name, column in featurecsv.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype)

    numericInputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }

    x = layers.Concatenate()(list(numericInputs.values()))
    if preprocess:
        norm = preprocessing.Normalization()
        norm.adapt(np.array(csv[numericInputs.keys()]))
        allNumericInputs = norm(x)
        preprocessedInputs = [allNumericInputs]
    else:
        preprocessedInputs = [x]

    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue

        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(featurecsv[name]))
        oneHot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

        x = lookup(input)
        x = oneHot(x)
        preprocessedInputs.append(x)

    preprocessedInputsCat = layers.Concatenate()(preprocessedInputs)
    # Use a distinct name so the `preprocessing` module is not shadowed.
    preprocessing_model = tf.keras.Model(inputs, preprocessedInputsCat)

    featuresDict = {
        name: np.array(value)
        for name, value in featurecsv.items()
    }

    return inputs, preprocessing_model, featuresDict
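
A sketch of how the three return values could be wired into a trainable model; the DataFrame `df` and its 'label' column are illustrative assumptions, not part of the original function:

# Hypothetical usage: `df` is a pandas DataFrame whose 'label' column is the target.
features = df.copy()
labels = features.pop('label')

inputs, preprocessing_model, features_dict = processcsv(features, df, preprocess=True)

# Stack a small head on top of the preprocessing model and train on the feature dict.
body = tf.keras.Sequential([layers.Dense(64, activation='relu'), layers.Dense(1)])
model = tf.keras.Model(inputs, body(preprocessing_model(inputs)))
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer='adam')
model.fit(x=features_dict, y=np.array(labels), epochs=5)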
Example #5
def build_nn(q, initialize_cl, cl_df, dat_x=None):
    if dat_x is not None and dat_x.shape[1] == 5:
        features = Input(shape=(5, ), dtype="int32")

        encoders = []
        encoded = []
        for var_idx in range(5):
            if var_idx in [0, 1, 4]:
                current_encoder = preprocessing.CategoryEncoding(
                    output_mode="binary", sparse=True)
            else:
                current_encoder = preprocessing.Normalization()
            encoders.append(current_encoder)
            encoders[var_idx].adapt(dat_x[:, var_idx])
            encoded.append(encoders[var_idx](features[:, var_idx]))

        features_encoded = concatenate(encoded)
        hidden_layer = Dense(units=q, activation='tanh')(features_encoded)
    elif dat_x is None or dat_x.shape[1] > 5:
        features = Input(shape=(dat_x.shape[1], ))
        hidden_layer = Dense(units=q, activation='tanh')(features)

    if not initialize_cl:
        output_layer = Dense(units=1, activation=backend.exp)(hidden_layer)
    else:
        output_layer = Dense(units=1,
                             activation=backend.exp,
                             bias_initializer=Constant(value=cl_df),
                             kernel_initializer=Zeros())(hidden_layer)

    volumes = Input(shape=(1, ))
    offset_layer = Dense(units=1,
                         activation='linear',
                         use_bias=False,
                         trainable=False,
                         kernel_initializer=Ones())(volumes)

    merged = Multiply()([output_layer, offset_layer])

    model = Model(inputs=[features, volumes], outputs=merged)
    model.compile(loss='mse', optimizer='rmsprop', metrics=["mse"])

    return model
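
A hedged usage sketch; `dat_x` (an (n, 5) int32 matrix with categorical codes in columns 0, 1 and 4), `volumes` (an (n, 1) exposure array) and `claims` (the target) are assumptions, and the bias start value is just one plausible choice:

# Hypothetical training call for the two-input model returned above.
model = build_nn(q=20, initialize_cl=True,
                 cl_df=np.log(claims.sum() / volumes.sum()), dat_x=dat_x)
model.fit([dat_x, volumes], claims, batch_size=1000, epochs=50, verbose=0)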
Example #6
def feats_encoding(df):
    # encode numerical variables
    inputs = {}
    for name, column in df.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32

        inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype)

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }

    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(df[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)

    preprocessed_inputs = [all_numeric_inputs]  # all_numeric_inputs

    # encode categorical variables
    for feature in ["directors", "kinds"]:  #'movie_id',
        lookup = preprocessing.StringLookup(vocabulary=np.unique(df[feature]))
        one_hot = preprocessing.CategoryEncoding(
            max_tokens=lookup.vocab_size())

        x = lookup(inputs[feature])
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
    return tf.keras.Model(inputs, preprocessed_inputs_cat), inputs
Example #7
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Creates everything that's needed for a categorical encoding input pipeline.

    Args:
        name (string): name of the feature
        dataset (tf.data.Dataset): TensorFlow dataset
        dtype (string): datatype
        max_tokens (int, optional): maximum number of tokens. Defaults to None.

    Returns:
        lambda function: categorical input pipeline
    """
    # Create a StringLookup or IntegerLookup layer which will turn the raw values into integer indices
    if dtype == 'string':
        index = exp_preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = exp_preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding layer to one-hot encode the integer indices.
    encoder = exp_preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that yields the feature values mapped to integer indices.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
Example #8
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
norm.adapt(np.array(titanic[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)

all_numeric_inputs

preprocessed_inputs = [all_numeric_inputs]

for name, input in inputs.items():
    if input.dtype == tf.float32:
        continue

    lookup = preprocessing.StringLookup(
        vocabulary=np.unique(titanic_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    x = lookup(input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# tf.keras.utils.plot_model(model = titanic_preprocessing , rankdir="LR", dpi=72, show_shapes=True)

titanic_features_dict = {
    name: np.array(value)
    for name, value in titanic_features.items()
}
Example #9
# Train the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.fit(x_train, y_train)
"""
### Encoding string categorical features via one-hot encoding
"""

# Define some toy data
data = tf.constant(["a", "b", "c", "b", "c", "a"])

# Use StringLookup to build an index of the feature values
indexer = preprocessing.StringLookup()
indexer.adapt(data)

# Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))

# Convert new test data (which includes unknown feature values)
test_data = tf.constant(["a", "b", "c", "d", "e", ""])
encoded_data = encoder(indexer(test_data))
print(encoded_data)
"""
Note that index 0 is reserved for missing values (which you should specify as the empty
string `""`), and index 1 is reserved for out-of-vocabulary values (values that were not
seen during `adapt()`). You can configure this by using the `mask_token` and `oov_token`
constructor arguments of `StringLookup`.

You can see the `StringLookup` and `CategoryEncoding` layers in action in the example
[structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/).
"""
Example #10
If you have a categorical feature that can take many different values (on the order of
10e3 or higher), where each value only appears a few times in the data,
it becomes impractical and ineffective to index and one-hot encode the feature values.
Instead, it can be a good idea to apply the "hashing trick": hash the values to a vector
of fixed size. This keeps the size of the feature space manageable, and removes the need
for explicit indexing.
"""

# Sample data: 10,000 random integers with values between 0 and 100,000
data = np.random.randint(0, 100000, size=(10000, 1))

# Use the Hashing layer to hash the values into 64 bins (indices in the range [0, 64))
hasher = preprocessing.Hashing(num_bins=64, salt=1337)

# Use the CategoryEncoding layer to multi-hot encode the hashed values
encoder = preprocessing.CategoryEncoding(num_tokens=64, output_mode="multi_hot")
encoded_data = encoder(hasher(data))
print(encoded_data.shape)
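
"""
An alternative sketch (one possible variation, not shown above): feed the hashed bin
indices directly into an `Embedding` layer instead of multi-hot encoding them.
"""

# Embed the 64 hash bins; shapes assume the `data` and `hasher` defined above.
embedder = tf.keras.layers.Embedding(input_dim=64, output_dim=8)
embedded_data = embedder(hasher(data))
print(embedded_data.shape)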

"""
### Encoding text as a sequence of token indices

This is how you should preprocess text to be passed to an `Embedding` layer.
"""

# Define some text data to adapt the layer
adapt_data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
Example #11
def processInput(filename):

  heart_data = pd.read_csv(filename, usecols=range(1, 11))

  heart_features = heart_data.copy()
  heart_labels = heart_features.pop('chd')

  # Preprocessing
  inputs = {}

  for name, column in heart_features.items():
    dtype = column.dtype
    if dtype == object:
      dtype = tf.string
    else:
      dtype = tf.float32

    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

  numeric_inputs = {name:input for name, input in inputs.items() if input.dtype==tf.float32}

  x = layers.Concatenate()(list(numeric_inputs.values()))
  norm = preprocessing.Normalization()
  norm.adapt(np.array(heart_data[numeric_inputs.keys()]))
  all_numeric_inputs = norm(x)

  preprocessed_inputs = [all_numeric_inputs]

  for name, input in inputs.items():
    if input.dtype == tf.float32:
      continue

    lookup = preprocessing.StringLookup(vocabulary=np.unique(heart_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    x = lookup(input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

  preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

  heart_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

  heart_features_dict = {name: np.array(value) for name, value in heart_features.items()}

  def heart_model(preprocessing_head, inputs):
    body = tf.keras.Sequential([
      layers.Dense(512, kernel_regularizer=regularizers.l2(0.001), activation='elu'),
      layers.Dense(512, activation='elu'),
      layers.Dropout(0.3),
      layers.Dense(1)
    ])

    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)

    model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True), optimizer=tf.optimizers.Adam(), metrics=['accuracy'])
    return model

  heart_model = heart_model(heart_preprocessing, inputs)

  return heart_features_dict, heart_labels, heart_model
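
A usage sketch; the file name 'heart.csv' is a hypothetical placeholder for wherever the heart-disease CSV actually lives:

# Hypothetical call: build and train the model returned by processInput.
features_dict, labels, model = processInput('heart.csv')
model.fit(x=features_dict, y=labels, epochs=10)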
Example #12
def main():
    # In memory data
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv'
    abalone_train = pd.read_csv(url,
                                names=[
                                    'Length', 'Diameter', 'Height',
                                    'Whole weight', 'Shucked weight',
                                    'Viscera weight', 'Shell weight', 'Age'
                                ])

    print(abalone_train.head())

    abalone_features = abalone_train.copy()
    abalone_labels = abalone_features.pop('Age')

    abalone_features = np.array(abalone_features)
    print(f'Features: {abalone_features}')

    abalone_model = tf.keras.Sequential([layers.Dense(64), layers.Dense(1)])

    abalone_model.compile(loss=tf.losses.MeanSquaredError(),
                          optimizer=tf.optimizers.Adam())

    # Basic preprocessing
    normalize = preprocessing.Normalization()

    normalize.adapt(abalone_features)

    norm_abalone_model = tf.keras.Sequential(
        [normalize, layers.Dense(64),
         layers.Dense(1)])

    norm_abalone_model.compile(loss=tf.losses.MeanSquaredError(),
                               optimizer=tf.optimizers.Adam())
    norm_abalone_model.fit(abalone_features, abalone_labels, epochs=10)

    # Mixed data types
    url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
    titanic = pd.read_csv(url)
    print(titanic.head())

    titanic_features = titanic.copy()
    titanic_labels = titanic_features.pop('survived')

    # Create a symbolic input
    input = tf.keras.Input(shape=(), dtype=tf.float32)

    # Do a calculation using it
    result = 2 * input + 1

    # The result doesn't have a value
    print(f'Result: {result}')

    calc = tf.keras.Model(inputs=input, outputs=result)

    print(f'calc(1) = {calc(1).numpy()}')
    print(f'calc(2) = {calc(2).numpy()}')

    inputs = {}
    for name, column in titanic_features.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32

        inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype)

    inputs

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }

    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(titanic[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)

    all_numeric_inputs

    preprocessed_inputs = [all_numeric_inputs]

    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue

        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(titanic_features[name]))
        one_hot = preprocessing.CategoryEncoding(
            max_tokens=lookup.vocab_size())

        x = lookup(input)
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

    titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

    tf.keras.utils.plot_model(model=titanic_preprocessing,
                              rankdir='LR',
                              dpi=72,
                              show_shapes=True)

    titanic_features_dict = {
        name: np.array(value)
        for name, value in titanic_features.items()
    }

    features_dict = {
        name: values[:1]
        for name, values in titanic_features_dict.items()
    }
    titanic_preprocessing(features_dict)

    titanic_model = get_titanic_model(titanic_preprocessing, inputs)

    titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10)

    titanic_model.save('test')
    reloaded = tf.keras.models.load_model('test')

    features_dict = {
        name: values[:1]
        for name, values in titanic_features_dict.items()
    }

    before = titanic_model(features_dict)
    after = reloaded(features_dict)
    assert (before - after) < 1e-3
    print(f'Before: {before}')
    print(f'After: {after}')

    # Using tf.data
    # On in memory datasets
    for example in slices(titanic_features_dict):
        for name, value in example.items():
            print(f'{name:19s}: {value}')
        break

    titanic_ds = tf.data.Dataset.from_tensor_slices(
        (titanic_features_dict, titanic_labels))

    titanic_batches = titanic_ds.shuffle(len(titanic_labels)).batch(32)

    titanic_model.fit(titanic_batches, epochs=5)

    # From a single file
    url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
    titanic_file_path = tf.keras.utils.get_file('train.csv', url)

    titanic_csv_ds = tf.data.experimental.make_csv_dataset(
        titanic_file_path,
        batch_size=5,  # Artificially small to make examples easier to show.
        label_name='survived',
        num_epochs=1,
        ignore_errors=True,
    )

    for batch, label in titanic_csv_ds.take(1):
        for key, value in batch.items():
            print(f'{key:20s}: {value}')
        print()
        print(f'{"label":20s}: {label}')

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz'
    traffic_volume_csv_gz = tf.keras.utils.get_file(
        'Metro_Interstate_Traffic_Volume.csv.gz',
        url,
        cache_dir='.',
        cache_subdir='traffic')

    traffic_volume_csv_gz_ds = tf.data.experimental.make_csv_dataset(
        traffic_volume_csv_gz,
        batch_size=256,
        label_name='traffic_volume',
        num_epochs=1,
        compression_type='GZIP')

    for batch, label in traffic_volume_csv_gz_ds.take(1):
        for key, value in batch.items():
            print(f'{key:20s}: {value[:5]}')
        print()
        print(f'{"label":20s}: {label[:5]}')

    #Caching
    start = time.time()
    for i, (batch, label) in enumerate(traffic_volume_csv_gz_ds.repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    caching = traffic_volume_csv_gz_ds.cache().shuffle(1000)

    start = time.time()
    for i, (batch, label) in enumerate(caching.shuffle(1000).repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    start = time.time()
    snapshot = tf.data.experimental.snapshot('titanic.tfsnap')
    snapshotting = traffic_volume_csv_gz_ds.apply(snapshot).shuffle(1000)

    for i, (batch, label) in enumerate(snapshotting.shuffle(1000).repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    # Multiple files
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00417/fonts.zip'
    _ = tf.keras.utils.get_file('fonts.zip',
                                url,
                                cache_dir='.',
                                cache_subdir='fonts',
                                extract=True)

    fonts_csvs = sorted(str(p) for p in pathlib.Path('fonts').glob('*.csv'))

    print(f'Fonts: {fonts_csvs[:10]}')
    print(f'Fonts len: {len(fonts_csvs)}')

    fonts_ds = tf.data.experimental.make_csv_dataset(
        file_pattern='fonts/*.csv',
        batch_size=10,
        num_epochs=1,
        num_parallel_reads=20,
        shuffle_buffer_size=10000)

    for features in fonts_ds.take(1):
        for i, (name, value) in enumerate(features.items()):
            if i > 15:
                break
            print(f'{name:20s}: {value}')
    print('...')
    print(f'[total: {len(features)} features]')

    # Optional: Packing fields
    fonts_image_ds = fonts_ds.map(make_images)

    for features in fonts_image_ds.take(1):
        break

    plt.figure(figsize=(6, 6), dpi=120)

    for n in range(9):
        plt.subplot(3, 3, n + 1)
        plt.imshow(features['image'][..., n])
        plt.title(chr(features['m_label'][n]))
        plt.axis('off')

    plt.show()

    # Lower level functions
    # `tf.io.decode_csv`
    text = pathlib.Path(titanic_file_path).read_text()
    lines = text.split('\n')[1:-1]

    all_strings = [str()] * 10
    print(f'{all_strings}')

    features = tf.io.decode_csv(lines, record_defaults=all_strings)

    for f in features:
        print(f'type: {f.dtype.name}, shape: {f.shape}')

    print(f'Sample record: {lines[0]}')

    titanic_types = [
        int(),
        str(),
        float(),
        int(),
        int(),
        float(),
        str(),
        str(),
        str(),
        str()
    ]
    print(f'Data types: {titanic_types}')

    features = tf.io.decode_csv(lines, record_defaults=titanic_types)

    for f in features:
        print(f'type: {f.dtype.name}, shape: {f.shape}')

    # `tf.data.experimental.CsvDataset`
    simple_titanic = tf.data.experimental.CsvDataset(
        titanic_file_path, record_defaults=titanic_types, header=True)

    for example in simple_titanic.take(1):
        print(f'Sample record: {[e.numpy() for e in example]}')

    def decode_titanic_line(line):
        return tf.io.decode_csv(line, titanic_types)

    manual_titanic = (
        # Load the lines of text
        tf.data.TextLineDataset(titanic_file_path)
        # Skip the header row
        .skip(1)
        # Decode the line
        .map(decode_titanic_line))

    for example in manual_titanic.take(1):
        print(f'Sample record: {[e.numpy() for e in example]}')

    # Multiple files
    font_line = pathlib.Path(fonts_csvs[0]).read_text().splitlines()[1]
    print(f'Sample: {font_line}')

    num_font_features = font_line.count(',') + 1
    font_column_types = [str(), str()] + [float()] * (num_font_features - 2)

    print(f'Fonts[0]: {fonts_csvs[0]}')

    simple_font_ds = tf.data.experimental.CsvDataset(
        fonts_csvs, record_defaults=font_column_types, header=True)

    for row in simple_font_ds.take(10):
        print(f'CSV first column: {row[0].numpy()}')

    font_files = tf.data.Dataset.list_files('fonts/*.csv')

    print('Epoch 1:')
    for f in list(font_files)[:5]:
        print(f'    {f.numpy()}')
    print('    ...')
    print()
    print('Epoch 2:')
    for f in list(font_files)[:5]:
        print(f'    {f.numpy()}')
    print('    ...')

    def make_font_csv_ds(path):
        return tf.data.experimental.CsvDataset(
            path, record_defaults=font_column_types, header=True)

    font_rows = font_files.interleave(make_font_csv_ds, cycle_length=3)

    fonts_dict = {'font_name': [], 'character': []}

    for row in font_rows.take(10):
        fonts_dict['font_name'].append(row[0].numpy().decode())
        fonts_dict['character'].append(chr(row[2].numpy()))

    print(pd.DataFrame(fonts_dict))

    # Performance
    BATCH_SIZE = 2048
    font_ds = tf.data.experimental.make_csv_dataset(file_pattern='fonts/*.csv',
                                                    batch_size=BATCH_SIZE,
                                                    num_epochs=1,
                                                    num_parallel_reads=100)

    start = time.time()
    for i, batch in enumerate(font_ds.take(20)):
        print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')