def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding layer for our integer indices.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Prepare a Dataset that only yields our feature.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers so we can use them, or include them in the functional model later.
    return lambda feature: encoder(index(feature))
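# A minimal usage sketch for the function above (names here are assumptions, not
# from the original snippet): `train_ds` would be a tf.data.Dataset of
# (feature-dict, label) pairs and 'thal' a hypothetical string-valued column.
thal_input = tf.keras.Input(shape=(1,), name='thal', dtype='string')
thal_encoding_layer = get_category_encoding_layer('thal', train_ds,
                                                  dtype='string', max_tokens=5)
encoded_thal = thal_encoding_layer(thal_input)
# `encoded_thal` can then be concatenated with other encoded inputs and fed into
# a functional Keras model built on the same Input objects.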
def getCategoryEncodingLayer(self, name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(
        num_tokens=index.vocabulary_size())
    return lambda feature: encoder(index(feature))
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)
    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
    feature_ds = feature_ds.map(index)
    encoder.adapt(feature_ds)
    return lambda feature: encoder(index(feature))
def processcsv(featurecsv, csv, preprocess):
    from tensorflow.keras.layers.experimental import preprocessing
    inputs = {}
    for name, column in featurecsv.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

    numericInputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }
    x = layers.Concatenate()(list(numericInputs.values()))
    if preprocess:
        norm = preprocessing.Normalization()
        norm.adapt(np.array(csv[numericInputs.keys()]))
        allNumericInputs = norm(x)
        preprocessedInputs = [allNumericInputs]
    else:
        preprocessedInputs = [x]

    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue
        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(featurecsv[name]))
        oneHot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
        x = lookup(input)
        x = oneHot(x)
        preprocessedInputs.append(x)

    preprocessedInputsCat = layers.Concatenate()(preprocessedInputs)
    # Named preprocessingModel to avoid shadowing the `preprocessing` module.
    preprocessingModel = tf.keras.Model(inputs, preprocessedInputsCat)
    featuresDict = {
        name: np.array(value)
        for name, value in featurecsv.items()
    }
    return inputs, preprocessingModel, featuresDict
def build_nn(q, initialize_cl, cl_df, dat_x=None):
    if dat_x is not None and dat_x.shape[1] == 5:
        features = Input(shape=(5,), dtype="int32")
        encoders = []
        encoded = []
        for var_idx in range(5):
            # Columns 0, 1 and 4 are treated as categorical, the rest as numeric.
            if var_idx in [0, 1, 4]:
                current_encoder = preprocessing.CategoryEncoding(
                    output_mode="binary", sparse=True)
            else:
                current_encoder = preprocessing.Normalization()
            encoders.append(current_encoder)
            encoders[var_idx].adapt(dat_x[:, var_idx])
            encoded.append(encoders[var_idx](features[:, var_idx]))
        features_encoded = concatenate(encoded)
        hidden_layer = Dense(units=q, activation='tanh')(features_encoded)
    elif dat_x is None or dat_x.shape[1] > 5:
        # Note: this branch still reads dat_x.shape, so dat_x must be provided.
        features = Input(shape=(dat_x.shape[1],))
        hidden_layer = Dense(units=q, activation='tanh')(features)

    if not initialize_cl:
        output_layer = Dense(units=1, activation=backend.exp)(hidden_layer)
    else:
        output_layer = Dense(units=1,
                             activation=backend.exp,
                             bias_initializer=Constant(value=cl_df),
                             kernel_initializer=Zeros())(hidden_layer)

    volumes = Input(shape=(1,))
    offset_layer = Dense(units=1,
                         activation='linear',
                         use_bias=False,
                         trainable=False,
                         kernel_initializer=Ones())(volumes)
    merged = Multiply()([output_layer, offset_layer])
    model = Model(inputs=[features, volumes], outputs=merged)
    model.compile(loss='mse', optimizer='rmsprop', metrics=["mse"])
    return model
def feats_encoding(df):
    # encode numerical variables
    inputs = {}
    for name, column in df.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }
    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(df[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)
    preprocessed_inputs = [all_numeric_inputs]

    # encode categorical variables
    for feature in ["directors", "kinds"]:  # 'movie_id',
        lookup = preprocessing.StringLookup(vocabulary=np.unique(df[feature]))
        one_hot = preprocessing.CategoryEncoding(
            max_tokens=lookup.vocab_size())
        x = lookup(inputs[feature])
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
    return tf.keras.Model(inputs, preprocessed_inputs_cat), inputs
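# A rough usage sketch for feats_encoding (illustrative only): `movies_df` is a
# hypothetical DataFrame containing the 'directors' and 'kinds' string columns
# plus the numeric columns the function normalizes.
encoding_model, inputs = feats_encoding(movies_df)
features_dict = {name: np.array(value) for name, value in movies_df.items()}
# Run a single example through the preprocessing model to check the output shape.
encoded_sample = encoding_model(
    {name: values[:1] for name, values in features_dict.items()})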
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None): """Creates everything that's needed for a categorical encoding input pipeline. Args: name (string): name of the feature dataset (tf.DataSet): tensorflow dataset dtype (string): datatype max_tokens (int, optional): maximum number of tokens. Defaults to None. Returns: lambda function: categorical input pipeline """ # Create a StringLookup layer which will turn strings into integer indices if dtype == 'string': index = exp_preprocessing.StringLookup(max_tokens=max_tokens) else: index = exp_preprocessing.IntegerLookup(max_values=max_tokens) # Prepare a Dataset that only yields our feature feature_ds = dataset.map(lambda x, y: x[name]) # Learn the set of possible values and assign them a fixed integer index. index.adapt(feature_ds) # Create a Discretization for our integer indices. encoder = exp_preprocessing.CategoryEncoding(max_tokens=index.vocab_size()) # Prepare a Dataset that only yields our feature. feature_ds = feature_ds.map(index) # Learn the space of possible indices. encoder.adapt(feature_ds) # Apply one-hot encoding to our indices. The lambda function captures the # layer so we can use them, or include them in the functional model later. return lambda feature: encoder(index(feature))
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
norm.adapt(np.array(titanic[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)
all_numeric_inputs

preprocessed_inputs = [all_numeric_inputs]

for name, input in inputs.items():
    if input.dtype == tf.float32:
        continue
    lookup = preprocessing.StringLookup(
        vocabulary=np.unique(titanic_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
    x = lookup(input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)
# tf.keras.utils.plot_model(model=titanic_preprocessing, rankdir="LR", dpi=72, show_shapes=True)

titanic_features_dict = {
    name: np.array(value)
    for name, value in titanic_features.items()
}
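# Quick sanity check (a sketch, mirroring the pattern used in the fuller version
# of this tutorial code further below): run the first example through the
# assembled preprocessing model.
features_dict = {name: values[:1] for name, values in titanic_features_dict.items()}
print(titanic_preprocessing(features_dict))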
# Train the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.fit(x_train, y_train)

"""
### Encoding string categorical features via one-hot encoding
"""

# Define some toy data
data = tf.constant(["a", "b", "c", "b", "c", "a"])

# Use StringLookup to build an index of the feature values
indexer = preprocessing.StringLookup()
indexer.adapt(data)

# Use CategoryEncoding to encode the integer indices to a one-hot vector
encoder = preprocessing.CategoryEncoding(output_mode="binary")
encoder.adapt(indexer(data))

# Convert new test data (which includes unknown feature values)
test_data = tf.constant(["a", "b", "c", "d", "e", ""])
encoded_data = encoder(indexer(test_data))
print(encoded_data)

"""
Note that index 0 is reserved for missing values (which you should specify as the
empty string `""`), and index 1 is reserved for out-of-vocabulary values (values
that were not seen during `adapt()`). You can configure this by using the
`mask_token` and `oov_token` constructor arguments of `StringLookup`.

You can see the `StringLookup` and `CategoryEncoding` layers in action in the example
[structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/).
"""
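"""
For example (an illustrative sketch, not part of the original guide text), the
reserved tokens can be set explicitly through those constructor arguments:
"""

# Keep "" as the mask token but use a custom out-of-vocabulary token
custom_indexer = preprocessing.StringLookup(mask_token="", oov_token="[OOV]")
custom_indexer.adapt(data)
# "" maps to the mask index and "d" (never seen during adapt()) maps to the OOV index
print(custom_indexer(tf.constant(["a", "d", ""])))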
"""
If you have a categorical feature that can take many different values (on the order
of 10e3 or higher), where each value only appears a few times in the data,
it becomes impractical and ineffective to index and one-hot encode the feature values.
Instead, it can be a good idea to apply the "hashing trick": hash the values to a
vector of fixed size. This keeps the size of the feature space manageable, and
removes the need for explicit indexing.
"""

# Sample data: 10,000 random integers with values between 0 and 100,000
data = np.random.randint(0, 100000, size=(10000, 1))

# Use the Hashing layer to hash the values to the range [0, 64)
hasher = preprocessing.Hashing(num_bins=64, salt=1337)

# Use the CategoryEncoding layer to multi-hot encode the hashed values
encoder = preprocessing.CategoryEncoding(num_tokens=64, output_mode="multi_hot")
encoded_data = encoder(hasher(data))
print(encoded_data.shape)

"""
### Encoding text as a sequence of token indices

This is how you should preprocess text to be passed to an `Embedding` layer.
"""

# Define some text data to adapt the layer
adapt_data = tf.constant(
    [
        "The Brain is wider than the Sky",
        "For put them side by side",
        "The one the other will contain",
def processInput(filename):
    heart_data = pd.read_csv(filename, usecols=range(1, 11))
    heart_features = heart_data.copy()
    heart_labels = heart_features.pop('chd')

    # Preprocessing
    inputs = {}
    for name, column in heart_features.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }
    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(heart_data[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)
    preprocessed_inputs = [all_numeric_inputs]

    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue
        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(heart_features[name]))
        one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
        x = lookup(input)
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
    heart_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)
    heart_features_dict = {
        name: np.array(value)
        for name, value in heart_features.items()
    }

    def heart_model(preprocessing_head, inputs):
        body = tf.keras.Sequential([
            layers.Dense(512,
                         kernel_regularizer=regularizers.l2(0.001),
                         activation='elu'),
            layers.Dense(512, activation='elu'),
            layers.Dropout(0.3),
            layers.Dense(1)
        ])
        preprocessed_inputs = preprocessing_head(inputs)
        result = body(preprocessed_inputs)
        model = tf.keras.Model(inputs, result)
        model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
                      optimizer=tf.optimizers.Adam(),
                      metrics=['accuracy'])
        return model

    heart_model = heart_model(heart_preprocessing, inputs)
    return heart_features_dict, heart_labels, heart_model
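# A short usage sketch (the CSV filename is a placeholder assumption):
# processInput returns the feature dict, labels, and an already-compiled model.
heart_features_dict, heart_labels, heart_model = processInput('heart.csv')
heart_model.fit(x=heart_features_dict, y=heart_labels, epochs=10)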
def main():
    # In memory data
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv'
    abalone_train = pd.read_csv(url,
                                names=[
                                    'Length', 'Diameter', 'Height',
                                    'Whole weight', 'Viscera weight',
                                    'Shell weight', 'Age'
                                ])
    print(abalone_train.head())
    abalone_features = abalone_train.copy()
    abalone_labels = abalone_features.pop('Age')
    abalone_features = np.array(abalone_features)
    print(f'Features: {abalone_features}')
    abalone_model = tf.keras.Sequential([layers.Dense(64), layers.Dense(1)])
    abalone_model.compile(loss=tf.losses.MeanSquaredError(),
                          optimizer=tf.optimizers.Adam())

    # Basic preprocessing
    normalize = preprocessing.Normalization()
    normalize.adapt(abalone_features)
    norm_abalone_model = tf.keras.Sequential(
        [normalize, layers.Dense(64), layers.Dense(1)])
    norm_abalone_model.compile(loss=tf.losses.MeanSquaredError(),
                               optimizer=tf.optimizers.Adam())
    norm_abalone_model.fit(abalone_features, abalone_labels, epochs=10)

    # Mixed data types
    url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
    titanic = pd.read_csv(url)
    print(titanic.head())
    titanic_features = titanic.copy()
    titanic_labels = titanic_features.pop('survived')

    # Create a symbolic input
    input = tf.keras.Input(shape=(), dtype=tf.float32)
    # Do a calculation using it
    result = 2 * input + 1
    # The result doesn't have a value
    print(f'Result: {result}')
    calc = tf.keras.Model(inputs=input, outputs=result)
    print(f'calc(1) = {calc(1).numpy()}')
    print(f'calc(2) = {calc(2).numpy()}')

    inputs = {}
    for name, column in titanic_features.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)
    inputs

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }
    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(titanic[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)
    all_numeric_inputs

    preprocessed_inputs = [all_numeric_inputs]
    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue
        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(titanic_features[name]))
        one_hot = preprocessing.CategoryEncoding(
            max_tokens=lookup.vocab_size())
        x = lookup(input)
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
    titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)
    tf.keras.utils.plot_model(model=titanic_preprocessing,
                              rankdir='LR',
                              dpi=72,
                              show_shapes=True)

    titanic_features_dict = {
        name: np.array(value)
        for name, value in titanic_features.items()
    }
    features_dict = {
        name: values[:1]
        for name, values in titanic_features_dict.items()
    }
    titanic_preprocessing(features_dict)

    # (`get_titanic_model` is assumed to be defined elsewhere in this script.)
    titanic_model = get_titanic_model(titanic_preprocessing, inputs)
    titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10)

    titanic_model.save('test')
    reloaded = tf.keras.models.load_model('test')
    features_dict = {
        name: values[:1]
        for name, values in titanic_features_dict.items()
    }
    before = titanic_model(features_dict)
    after = reloaded(features_dict)
    assert (before - after) < 1e-3
    print(f'Before: {before}')
    print(f'After: {after}')

    # Using tf.data
    # On in memory datasets
    # (`slices` is assumed to be a helper, defined elsewhere, that yields one
    #  example dict at a time from a dict of arrays.)
    for example in slices(titanic_features_dict):
        for name, value in example.items():
            print(f'{name:19s}: {value}')
        break

    titanic_ds = tf.data.Dataset.from_tensor_slices(
        (titanic_features_dict, titanic_labels))
    titanic_batches = titanic_ds.shuffle(len(titanic_labels)).batch(32)
    titanic_model.fit(titanic_batches, epochs=5)

    # From a single file
    url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
    titanic_file_path = tf.keras.utils.get_file('train.csv', url)
    titanic_csv_ds = tf.data.experimental.make_csv_dataset(
        titanic_file_path,
        batch_size=5,  # Artificially small to make examples easier to show.
        label_name='survived',
        num_epochs=1,
        ignore_errors=True,
    )
    for batch, label in titanic_csv_ds.take(1):
        for key, value in batch.items():
            print(f'{key:20s}: {value}')
        print()
        print(f'{"label":20s}: {label}')

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz'
    traffic_volume_csv_gz = tf.keras.utils.get_file(
        'Metro_Interstate_Traffic_Volume.csv.gz',
        url,
        cache_dir='.',
        cache_subdir='traffic')
    traffic_volume_csv_gz_ds = tf.data.experimental.make_csv_dataset(
        traffic_volume_csv_gz,
        batch_size=256,
        label_name='traffic_volume',
        num_epochs=1,
        compression_type='GZIP')
    for batch, label in traffic_volume_csv_gz_ds.take(1):
        for key, value in batch.items():
            print(f'{key:20s}: {value[:5]}')
        print()
        print(f'{"label":20s}: {label[:5]}')

    # Caching
    start = time.time()
    for i, (batch, label) in enumerate(traffic_volume_csv_gz_ds.repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    caching = traffic_volume_csv_gz_ds.cache().shuffle(1000)
    start = time.time()
    for i, (batch, label) in enumerate(caching.shuffle(1000).repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    start = time.time()
    snapshot = tf.data.experimental.snapshot('titanic.tfsnap')
    snapshotting = traffic_volume_csv_gz_ds.apply(snapshot).shuffle(1000)
    for i, (batch, label) in enumerate(snapshotting.shuffle(1000).repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    # Multiple files
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00417/fonts.zip'
    _ = tf.keras.utils.get_file('fonts.zip',
                                url,
                                cache_dir='.',
                                cache_subdir='fonts',
                                extract=True)
    fonts_csvs = sorted(str(p) for p in pathlib.Path('fonts').glob('*.csv'))
    print(f'Fonts: {fonts_csvs[:10]}')
    print(f'Fonts len: {len(fonts_csvs)}')
    fonts_ds = tf.data.experimental.make_csv_dataset(
        file_pattern='fonts/*.csv',
        batch_size=10,
        num_epochs=1,
        num_parallel_reads=20,
        shuffle_buffer_size=10000)
    for features in fonts_ds.take(1):
        for i, (name, value) in enumerate(features.items()):
            if i > 15:
                break
            print(f'{name:20s}: {value}')
    print('...')
    print(f'[total: {len(features)} features]')

    # Optional: Packing fields
    # (`make_images` is assumed to be a helper, defined elsewhere, that packs
    #  the per-pixel columns of each batch into an image tensor.)
    fonts_image_ds = fonts_ds.map(make_images)
    for features in fonts_image_ds.take(1):
        break
    plt.figure(figsize=(6, 6), dpi=120)
    for n in range(9):
        plt.subplot(3, 3, n + 1)
        plt.imshow(features['image'][..., n])
        plt.title(chr(features['m_label'][n]))
        plt.axis('off')
    plt.show()

    # Lower level functions
    # `tf.io.decode_csv`
    text = pathlib.Path(titanic_file_path).read_text()
    lines = text.split('\n')[1:-1]
    all_strings = [str()] * 10
    print(f'{all_strings}')
    features = tf.io.decode_csv(lines, record_defaults=all_strings)
    for f in features:
        print(f'type: {f.dtype.name}, shape: {f.shape}')
    print(f'Sample record: {lines[0]}')

    titanic_types = [
        int(), str(), float(), int(), int(),
        float(), str(), str(), str(), str()
    ]
    print(f'Data types: {titanic_types}')
    features = tf.io.decode_csv(lines, record_defaults=titanic_types)
    for f in features:
        print(f'type: {f.dtype.name}, shape: {f.shape}')

    # `tf.data.experimental.CsvDataset`
    simple_titanic = tf.data.experimental.CsvDataset(
        titanic_file_path, record_defaults=titanic_types, header=True)
    for example in simple_titanic.take(1):
        print(f'Sample record: {[e.numpy() for e in example]}')

    def decode_titanic_line(line):
        return tf.io.decode_csv(line, titanic_types)

    manual_titanic = (
        # Load the lines of text
        tf.data.TextLineDataset(titanic_file_path)
        # Skip the header row
        .skip(1)
        # Decode the line
        .map(decode_titanic_line))

    for example in manual_titanic.take(1):
        print(f'Sample record: {[e.numpy() for e in example]}')

    # Multiple files
    font_line = pathlib.Path(fonts_csvs[0]).read_text().splitlines()[1]
    print(f'Sample: {font_line}')
    num_font_features = font_line.count(',') + 1
    font_column_types = [str(), str()] + [float()] * (num_font_features - 2)
    print(f'Fonts[0]: {fonts_csvs[0]}')
    simple_font_ds = tf.data.experimental.CsvDataset(
        fonts_csvs, record_defaults=font_column_types, header=True)
    for row in simple_font_ds.take(10):
        print(f'CSV first column: {row[0].numpy()}')

    font_files = tf.data.Dataset.list_files('fonts/*.csv')
    print('Epoch 1:')
    for f in list(font_files)[:5]:
        print(f'  {f.numpy()}')
    print('  ...')
    print()
    print('Epoch 2:')
    for f in list(font_files)[:5]:
        print(f'  {f.numpy()}')
    print('  ...')

    def make_font_csv_ds(path):
        return tf.data.experimental.CsvDataset(
            path, record_defaults=font_column_types, header=True)

    font_rows = font_files.interleave(make_font_csv_ds, cycle_length=3)
    fonts_dict = {'font_name': [], 'character': []}
    for row in font_rows.take(10):
        fonts_dict['font_name'].append(row[0].numpy().decode())
        fonts_dict['character'].append(chr(row[2].numpy()))
    print(pd.DataFrame(fonts_dict))

    # Performance
    BATCH_SIZE = 2048
    font_ds = tf.data.experimental.make_csv_dataset(
        file_pattern='fonts/*.csv',
        batch_size=BATCH_SIZE,
        num_epochs=1,
        num_parallel_reads=100)
    start = time.time()
    for i, batch in enumerate(font_ds.take(20)):
        print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')