Example #1
def main(save_filename=None,
         load_filename="simple_rnn_custom_model_weights.h5",
         do_train=False,
         num_epochs=2,
         cell_type='gru'):
    """ Entry point """
    if do_train:
        print("Training and saving model...")
        (model, vocab) = train_model(file_name=save_filename,
                                     num_epochs=num_epochs,
                                     cell_type=cell_type)
        ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
        vocab_size = len(ids_from_chars.get_vocabulary())
    else:
        if load_filename is None:
            print(
                "ERROR: no load file name provided and do_train is False; no model can be used"
            )
            return 1
        # TODO Somehow this vocab should be accessible without needing to read and process this data
        data = open('./archive/drake_lyrics.txt').read()
        print('Length of text: {} characters'.format(len(data)))
        vocab = sorted(set(data))
        ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
        vocab_size = len(ids_from_chars.get_vocabulary())
        print("Loading model from disk...")
        #cell = custom_models.MyRNNCell(vocab_size)
        cell = custom_models.MyGRUCell(vocab_size)
        model = custom_models.MyCellModelWrapper(cell)
        utils.load_weights(load_filename, model,
                           tf.TensorShape([1, seq_length, vocab_size]))
    print("Generating Bars...please wait")
    seed_texts = [
        "[Verse]", "you", "love", "boy", "I love", "I love you", "Kiki, ",
        "Swanging"
    ]
    for seed in seed_texts:
        num_chars = 400
        output_text = utils.generate_text_one_h(seed,
                                                model,
                                                seq_length,
                                                ids_from_chars,
                                                chars_to_gen=num_chars)
        print(">>>>>>>>>>>>>>>>>>>>")
        print("Input seed: %s" % (seed))
        print("%d character generated sequence:\n%s\n" %
              (num_chars, output_text))
        print("<<<<<<<<<<<<<<<<<<<<")
        print("End of output for seed: %s" % (seed))
    #Hope you enjoyed :)
    return 0
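
# The TODO in the load branch above notes that the vocabulary has to be rebuilt by
# re-reading drake_lyrics.txt just to reload a model. One way around that (a sketch,
# not part of the original project) is to persist the vocab list next to the saved
# weights and rebuild the StringLookup layer from it:
import json

from tensorflow.keras.layers.experimental import preprocessing


def save_vocab(vocab, path="vocab.json"):
    """Persist the sorted character vocabulary built during training."""
    with open(path, "w") as f:
        json.dump(list(vocab), f)


def load_lookup(path="vocab.json"):
    """Rebuild the StringLookup layer without re-reading the lyrics corpus."""
    with open(path) as f:
        vocab = json.load(f)
    return preprocessing.StringLookup(vocabulary=vocab)
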
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding layer to one-hot encode the integer indices.
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Map the lookup layer over the Dataset so it yields integer indices.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers so we can reuse them or include them in the functional model later.
    return lambda feature: encoder(index(feature))
Example #3
def get_data(data_file):
    # Read, then decode for py2 compat.
    text = open(data_file, 'rb').read().decode(encoding='utf-8')
    vocab = sorted(set(text))

    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab),
                                                mask_token=None)
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(),
        invert=True,
        mask_token=None)

    all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
    ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

    seq_length = 100

    sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)

    dataset = sequences.map(split_input_target)

    dataset = (dataset.shuffle(BUFFER_SIZE).batch(
        BATCH_SIZE,
        drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE))

    return dataset, ids_from_chars, chars_from_ids
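
# get_data() above maps a split_input_target helper that the snippet does not show.
# In the TensorFlow text-generation tutorial this pipeline follows, the helper simply
# shifts each sequence by one character; a sketch of that assumed definition:
def split_input_target(sequence):
    # e.g. "tensorflow" -> input "tensorflo", target "ensorflow"
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text
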
Example #4
def create_alphabet_data(seq_length=30):
    """
    Creates a dataset from the alphabet text file

    @return Tuple of (xs, ys, vocab_size) as a training set from the alaphabet sample file
    """
    data = open('./archive/alphabet2.txt').read()
    #print('Length of text: {} characters'.format(len(data)))
    vocab = sorted(set(data))
    # This function-as-variable setup is weird to me, but whatever
    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True)

    # Preprocess the text into characters
    all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8'))
    vocab_size = len(ids_from_chars.get_vocabulary())

    # Sanity tests
    vocab_sample = list(range(0,vocab_size))
    tf_vocab = tf.convert_to_tensor(vocab_sample)
    mapped_vocab = chars_from_ids(tf_vocab).numpy()

    # Warning: This is an untested function used as a test dependency
    (xs, ys) = utils.split_data_new(all_ids.numpy(), vocab_size, seq_length)
    return (xs, ys, vocab_size, ids_from_chars)
Example #5
def train_model(file_name=None, debug=False, num_epochs=2, cell_type='gru'):
    """ Codepath to process input and train (as opposed to load up and generate)"""
    # Load Data
    data = open('./archive/drake_lyrics.txt').read()
    print('Length of text: {} characters'.format(len(data)))
    vocab = sorted(set(data))
    # Preprocess the text into integers
    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(), invert=True)
    all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8'))
    vocab_size = len(ids_from_chars.get_vocabulary())

    # Sanity Check: output vocab mapping
    vocab_sample = list(range(0, vocab_size))
    tf_vocab = tf.convert_to_tensor(vocab_sample)
    mapped_vocab = chars_from_ids(tf_vocab).numpy()
    print(vocab_sample)
    print(mapped_vocab)

    # Creating dataset from pre-processed text
    print("Splitting file into dataset")
    (split_xs, split_ys) = utils.split_data_new(all_ids.numpy(),
                                                vocab_size,
                                                seq_length,
                                                total_splits=char_to_process)

    # Create the Model
    if cell_type == 'gru':
        cell = custom_models.MyGRUCell(vocab_size)
        model = custom_models.MyCellModelWrapper(cell)
    elif cell_type == 'rnn' or cell_type == 'simple':
        cell = custom_models.MyRNNCell(vocab_size)
        model = custom_models.MyCellModelWrapper(cell)
    elif cell_type == 'keras' or cell_type == 'keras_gru':
        cell = keras.layers.SimpleRNNCell(150)
        model = custom_models.KerasRNNCellWrapper(cell, vocab_size)
    else:
        print(
            "Fatal ERROR: cell_type provided does not match supported options, terminating."
        )
        return -1
    my_loss = tf.losses.CategoricalCrossentropy(from_logits=True)
    model.compile(loss=my_loss,
                  optimizer=keras.optimizers.Adam(lr=0.001),
                  metrics=['accuracy'],
                  run_eagerly=True)
    # Train the model
    # TODO run this in a gradient tape loop and play with batch randomization
    model.fit(x=split_xs,
              y=split_ys,
              epochs=num_epochs,
              verbose=1,
              batch_size=64)

    print(model.summary())
    if file_name is not None:
        utils.save_model(file_name, model)
    return (model, vocab)
Example #6
 def getCategoryEncodingLayer(self, name, dataset, dtype, max_tokens=None):
     if dtype == 'string':
         index = preprocessing.StringLookup(max_tokens=max_tokens)
     else:
         index = preprocessing.IntegerLookup(max_tokens=max_tokens)
     feature_ds = dataset.map(lambda x, y: x[name])
     index.adapt(feature_ds)
     encoder = preprocessing.CategoryEncoding(
         num_tokens=index.vocabulary_size())
     return lambda feature: encoder(index(feature))
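
# Unlike most snippets on this page, the method above already uses the newer argument
# names (IntegerLookup(max_tokens=...), vocabulary_size(), CategoryEncoding(num_tokens=...)).
# For reference, a small standalone sketch of the same lookup-plus-one-hot pattern with
# the non-experimental layers that replaced keras.layers.experimental.preprocessing in
# later TensorFlow releases (assuming TF 2.6+):
import tensorflow as tf

index = tf.keras.layers.StringLookup()
index.adapt(tf.constant(["red", "green", "blue", "green"]))
encoder = tf.keras.layers.CategoryEncoding(
    num_tokens=index.vocabulary_size(), output_mode="one_hot")
# Unseen strings map to the OOV index and still get a valid one-hot row.
print(encoder(index(tf.constant(["green", "purple"]))))
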
Example #7
 def __init__(self, encoding, **kwargs):
     super().__init__(**kwargs)
     self.encoding = encoding
     self.encoding_layers = []
     for encoding in self.encoding:
         if encoding == NONE:
             self.encoding_layers.append(None)
         elif encoding == INT:
             self.encoding_layers.append(preprocessing.StringLookup())
         elif encoding == ONE_HOT:
             self.encoding_layers.append(None)
Example #8
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_values=max_tokens)

    feature_ds = dataset.map(lambda x, y: x[name])
    index.adapt(feature_ds)
    encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())
    feature_ds = feature_ds.map(index)
    encoder.adapt(feature_ds)

    return lambda feature: encoder(index(feature))
Example #9
def processcsv(featurecsv, csv, preprocess):
    from tensorflow.keras.layers.experimental import preprocessing

    inputs = {}
    for name, column in featurecsv.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype)

    numericInputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }

    x = layers.Concatenate()(list(numericInputs.values()))
    if preprocess:
        norm = preprocessing.Normalization()
        norm.adapt(np.array(csv[numericInputs.keys()]))
        allNumericInputs = norm(x)
        preprocessedInputs = [allNumericInputs]
    else:
        preprocessedInputs = [x]

    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue

        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(featurecsv[name]))
        oneHot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

        x = lookup(input)
        x = oneHot(x)
        preprocessedInputs.append(x)

    preprocessedInputsCat = layers.Concatenate()(preprocessedInputs)
    # Renamed from "preprocessing" to avoid shadowing the imported preprocessing module.
    preprocessing_model = tf.keras.Model(inputs, preprocessedInputsCat)

    featuresDict = {
        name: np.array(value)
        for name, value in featurecsv.items()
    }

    return inputs, preprocessing_model, featuresDict
Example #10
def train_model(file_name=None, debug=False):
    """ Codepath to process input and train (as opposed to load up and generate)"""
    # Load Data
    data = open('./archive/drake_lyrics.txt').read()
    print('Length of text: {} characters'.format(len(data)))
    vocab = sorted(set(data))
    # Preprocess the text into integers
    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(), invert=True)
    all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8'))
    vocab_size = len(ids_from_chars.get_vocabulary())

    # Sanity Check: output vocab mapping
    vocab_sample = list(range(0, vocab_size))
    tf_vocab = tf.convert_to_tensor(vocab_sample)
    mapped_vocab = chars_from_ids(tf_vocab).numpy()
    print(vocab_sample)
    print(mapped_vocab)

    # Creating dataset from pre-processed text
    print("Splitting file into dataset")
    # TODO Try new split data method
    (split_xs, split_ys) = utils.split_data(all_ids.numpy(),
                                            vocab_size,
                                            seq_length,
                                            total_splits=char_to_process)

    # Create the Model
    my_model = DrakeGRUSequential(vocab_size, embedding_dim)
    my_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    my_model.compile(loss=my_loss,
                     optimizer=keras.optimizers.Adam(lr=0.001),
                     metrics=['accuracy'],
                     run_eagerly=debug)

    # Train the model
    # TODO run this in a gradient tape loop and play with batch randomization
    my_model.fit(x=split_xs, y=split_ys, epochs=2, verbose=1, batch_size=64)

    print(my_model.summary())
    if file_name is not None:
        utils.save_model(file_name, my_model)
    return (my_model, vocab)
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Creates everything that's needed for a categorical encoding input pipeline.

    Args:
        name (string): name of the feature
        dataset (tf.data.Dataset): TensorFlow dataset
        dtype (string): datatype
        max_tokens (int, optional): maximum number of tokens. Defaults to None.

    Returns:
        lambda function: categorical input pipeline
    """
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = exp_preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = exp_preprocessing.IntegerLookup(max_values=max_tokens)

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a CategoryEncoding layer to one-hot encode the integer indices.
    encoder = exp_preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

    # Map the lookup layer over the Dataset so it yields integer indices.
    feature_ds = feature_ds.map(index)

    # Learn the space of possible indices.
    encoder.adapt(feature_ds)

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layers so we can reuse them or include them in the functional model later.
    return lambda feature: encoder(index(feature))
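
# For context, a rough sketch (not from the original code) of how the returned encoder
# is typically wired into a functional model; the 'thal' feature name and the tiny
# stand-in dataset below are made-up placeholders:
import tensorflow as tf

train_ds = tf.data.Dataset.from_tensor_slices(
    ({'thal': [['fixed'], ['normal'], ['reversible'], ['normal']]}, [0, 1, 0, 1])
).batch(2)

thal_input = tf.keras.Input(shape=(1,), name='thal', dtype='string')
encoding_layer = get_category_encoding_layer('thal', train_ds, 'string', max_tokens=5)
encoded_thal = encoding_layer(thal_input)  # symbolic one-hot tensor for the model
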
Example #12
def feats_encoding(df):
    # encode numerical variables
    inputs = {}
    for name, column in df.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32

        inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype)

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }

    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(df[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)

    preprocessed_inputs = [all_numeric_inputs]  # all_numeric_inputs

    # encode categorial variables
    for feature in ["directors", "kinds"]:  #'movie_id',
        lookup = preprocessing.StringLookup(vocabulary=np.unique(df[feature]))
        one_hot = preprocessing.CategoryEncoding(
            max_tokens=lookup.vocab_size())

        x = lookup(inputs[feature])
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)
    return tf.keras.Model(inputs, preprocessed_inputs_cat), inputs
Example #13
    def __init__(self, config):
        super(QuerySchemaEncoder, self).__init__()
        self.nnlm_embedder = hub.load(config['tf_hub_model'])
        self.margin = config['contrastive_loss_margin']
        dim = config['dim']

        self.use_char_embed = config['use_char_embedding']
        if self.use_char_embed:
            self.char_vocab = config['character_vocab']
            self.char_embedding_dim = config['char_embedding_dim']
            self.ids_from_chars = preprocessing.StringLookup(vocabulary=self.char_vocab, mask_token=None)

            self.char_encoder = tf.keras.models.Sequential()
            self.char_encoder.add(tf.keras.layers.Embedding(len(self.char_vocab), self.char_embedding_dim))
            self.char_encoder.add(tf.keras.layers.LSTM(self.char_embedding_dim, activation='relu'))

            self.char_word_combiner = tf.keras.models.Sequential()
            self.char_word_combiner.add(tf.keras.layers.InputLayer(input_shape=(dim + self.char_embedding_dim,)))
            self.char_word_combiner.add(tf.keras.layers.Dense(dim, activation='relu'))

        self.table_encoder_dense = tf.keras.models.Sequential()
        self.table_encoder_dense.add(tf.keras.layers.InputLayer(input_shape=(2 * dim,)))
        self.table_encoder_dense.add(tf.keras.layers.Dense(1.5 * dim, activation='relu'))
        self.table_encoder_dense.add(tf.keras.layers.Dense(dim, activation='relu'))

        self.use_lstm_query_encoder = config['use_lstm_query_encoder']

        if self.use_lstm_query_encoder:
            self.query_encoder = tf.keras.models.Sequential()
            self.query_encoder.add(tf.keras.layers.LSTM(dim, activation='relu'))

        else:
            self.query_encoder = tf.keras.models.Sequential()
            self.query_encoder.add(tf.keras.layers.InputLayer(input_shape=(dim,)))
            self.query_encoder.add(tf.keras.layers.Dense(dim, activation='relu'))
            self.query_encoder.add(tf.keras.layers.Dense(dim, activation='relu'))
Example #14
"""
In addition, adaptable layers always expose an option to directly set state via
constructor arguments or weight assignment. If the intended state values are known at
layer construction time, or are calculated outside of the `adapt()` call, they can be set
without relying on the layer's internal computation. For instance, if external vocabulary
files for the `TextVectorization`, `StringLookup`, or `IntegerLookup` layers already
exist, those can be loaded directly into the lookup tables by passing a path to the
vocabulary file in the layer's constructor arguments.

Here's an example where we instantiate a `StringLookup` layer with precomputed vocabulary:
"""

vocab = ["a", "b", "c", "d"]
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer = preprocessing.StringLookup(vocabulary=vocab)
vectorized_data = layer(data)
print(vectorized_data)
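
# As noted in the paragraph above, the lookup layers also accept a path to a vocabulary
# file instead of an in-memory list (one token per line). A small sketch of that variant;
# the "vocab.txt" filename here is just an illustration, not part of the original example:
with open("vocab.txt", "w") as f:
    f.write("\n".join(vocab))

file_layer = preprocessing.StringLookup(vocabulary="vocab.txt")
print(file_layer(data))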

"""
## Preprocessing data before the model or inside the model

There are two ways you could be using preprocessing layers:

**Option 1:** Make them part of the model, like this:

```python
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = rest_of_the_model(x)
model = keras.Model(inputs, outputs)
Example #15
    variable_partitioner = (
        tf.distribute.experimental.partitioners.FixedShardsPartitioner(
            num_shards=NUM_PS))

    strategy = tf.distribute.experimental.ParameterServerStrategy(
        cluster_resolver, variable_partitioner=variable_partitioner)

    # Setup Data
    feature_vocab = [
        "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong",
        "wonder_woman"
    ]
    label_vocab = ["yes", "no"]

    with strategy.scope():
        feature_lookup_layer = kpl.StringLookup(vocabulary=feature_vocab)

        label_lookup_layer = kpl.StringLookup(vocabulary=label_vocab,
                                              num_oov_indices=0,
                                              mask_token=None)

        raw_feature_input = keras.layers.Input(shape=(3, ),
                                               dtype=tf.string,
                                               name="feature")
        feature_id_input = feature_lookup_layer(raw_feature_input)
        feature_preprocess_stage = keras.Model({"features": raw_feature_input},
                                               feature_id_input)

        raw_label_input = keras.layers.Input(shape=(1, ),
                                             dtype=tf.string,
                                             name="label")
Example #16

# Process the text
print("---------- Processing text 2/2 ----------")
vocab = set()
for text in texts:
  vocab = vocab.union(set(text.split()))
vocab = sorted(vocab)

vectorizer = TextVectorization(standardize=None)
text_ds = tf.data.Dataset.from_tensor_slices(vocab).batch(128)
vectorizer.adapt(text_ds)



tokens_from_words = preprocessing.StringLookup(
    vocabulary=vectorizer.get_vocabulary())

words_from_tokens = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=tokens_from_words.get_vocabulary(), invert=True)

def text_from_tokens(ids):
    return tf.strings.reduce_join(words_from_tokens(ids), axis=-1)

print(len(tokens_from_words.get_vocabulary()))


voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

test = ["pan", "jest", "tu", "i", "tam", "."]
[word_index[w] for w in test]
Example #17
def processInput(filename):

  heart_data = pd.read_csv(filename, usecols=range(1, 11))

  heart_features = heart_data.copy()
  heart_labels = heart_features.pop('chd')

  # Preprocessing
  inputs = {}

  for name, column in heart_features.items():
    dtype = column.dtype
    if dtype == object:
      dtype = tf.string
    else:
      dtype = tf.float32

    inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

  numeric_inputs = {name:input for name, input in inputs.items() if input.dtype==tf.float32}

  x = layers.Concatenate()(list(numeric_inputs.values()))
  norm = preprocessing.Normalization()
  norm.adapt(np.array(heart_data[numeric_inputs.keys()]))
  all_numeric_inputs = norm(x)

  preprocessed_inputs = [all_numeric_inputs]

  for name, input in inputs.items():
    if input.dtype == tf.float32:
      continue

    lookup = preprocessing.StringLookup(vocabulary=np.unique(heart_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    x = lookup(input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

  preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

  heart_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

  heart_features_dict = {name: np.array(value) for name, value in heart_features.items()}

  def heart_model(preprocessing_head, inputs):
    body = tf.keras.Sequential([
      layers.Dense(512, kernel_regularizer=regularizers.l2(0.001), activation='elu'),
      layers.Dense(512, activation='elu'),
      layers.Dropout(0.3),
      layers.Dense(1)
    ])

    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)

    model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True), optimizer=tf.optimizers.Adam(), metrics=['accuracy'])
    return model

  heart_model = heart_model(heart_preprocessing, inputs)

  return heart_features_dict, heart_labels, heart_model
Example #18
print(vectorized_text)
"""
In addition, adaptable layers always expose an option to directly set state via
constructor arguments or weight assignment. If the intended state values are known at
layer construction time, or are calculated outside of the `adapt()` call, they can be set
without relying on the layer's internal computation. For instance, if external vocabulary
files for the `TextVectorization`, `StringLookup`, or `IntegerLookup` layers already
exist, those can be loaded directly into the lookup tables by passing a path to the
vocabulary file in the layer's constructor arguments.

Here's an example where we instantiate a `StringLookup` layer with precomputed vocabulary:
"""

vocab = ["a", "b", "c", "d"]
data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
layer = preprocessing.StringLookup(vocabulary=vocab)
vectorized_data = layer(data)
print(vectorized_data)
"""
## Preprocessing data before the model or inside the model

There are two ways you could be using preprocessing layers:

**Option 1:** Make them part of the model, like this:

```python
inputs = keras.Input(shape=input_shape)
x = preprocessing_layer(inputs)
outputs = rest_of_the_model(x)
model = keras.Model(inputs, outputs)
```
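
The excerpt stops before the second option; in the Keras preprocessing-layers guide this
text comes from, the alternative is to apply the layer inside the `tf.data` pipeline, so
the model receives batches that are already preprocessed, roughly:

```python
dataset = dataset.map(lambda x, y: (preprocessing_layer(x), y))
```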
Example #20
import json

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from . import models

path_to_file = "./DS_1.txt"
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')

vocab = sorted(set(text))
VOCAB_SIZE = len(vocab)

# CHAR ENCODING TO ID
chars = tf.strings.unicode_split(text, input_encoding='UTF-8')
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
ids = ids_from_chars(chars)

# INVERSION TO CHAR
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

with open('./model_arch.json') as f:
    model_archs = json.load(f)


def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


def generate_lyrics(model_name, temp, length, input_text):
Example #21
# Some other parameters.
BATCH_SIZE = 64
BUFFER_SIZE = 10000
EPOCHS = 5

# Download and load the text. Define its vocabulary.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
)
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
vocab = sorted(set(text))

# Text Vectorization.
# Create the preprocessing layers which can convert chars and IDs.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab),
                                            mask_token=None)
chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)


def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


# Define the dataset in terms of IDs.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Batch the dataset: each element is a sequence of seq_length + 1 character IDs.

sequences = ids_dataset.batch(seq_length + 1, drop_remainder=True)
Example #22
 def input_layer(self):
     return preprocessing.StringLookup(**self.feature_params)(self.inputs)
Example #23
numeric_inputs = {name: input for name, input in inputs.items()
                  if input.dtype == tf.float32}

x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
norm.adapt(np.array(ti[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)

ppi = [all_numeric_inputs]

for name, input in inputs.items():
    if input.dtype == tf.float32:
        continue

    lookup = preprocessing.StringLookup(
        vocabulary=np.unique(tif[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    x = lookup(input)
    x = one_hot(x)
    ppi.append(x)

ppic = layers.Concatenate()(ppi)
tip = tf.keras.Model(inputs, ppic)
# tf.keras.utils.plot_model(model=tip, rankdir='LR', dpi=72, show_shapes=True)

tifd = {name: np.array(value)
        for name, value in tif.items()}

# fd = {name: values[:1] for name, values in tifd.items()}
# print(tip(fd))
Example #24
    print(f"--------- {i}/{len(texts)} ----------")
    text = text.translate(
        str.maketrans(unwanted_whitespaces, " " * len(unwanted_whitespaces)))
    text = text.translate(str.maketrans("", "", unwanted_chars))

# Process the text
print("---------- Processing text 2/2 ----------")
vocab = set()
for text in texts:
    vocab = vocab.union(set(text))
vocab = sorted(vocab)

ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)


def text_from_ids(ids):
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)


print("---------- Tokenizacja tekstu ----------")

all_ids = []
ids_datasets = []
for i, text in enumerate(texts):
    print(f"Tokenizacja tekstu {i}/{len(texts)}")
Example #25
def main():
    # In memory data
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv'
    abalone_train = pd.read_csv(url,
                                names=[
                                    'Length', 'Diameter', 'Height',
                                    'Whole weight', 'Viscera weight',
                                    'Shell weight', 'Age'
                                ])

    print(abalone_train.head())

    abalone_features = abalone_train.copy()
    abalone_labels = abalone_features.pop('Age')

    abalone_features = np.array(abalone_features)
    print(f'Features: {abalone_features}')

    abalone_model = tf.keras.Sequential([layers.Dense(64), layers.Dense(1)])

    abalone_model.compile(loss=tf.losses.MeanSquaredError(),
                          optimizer=tf.optimizers.Adam())

    # Basic preprocessing
    normalize = preprocessing.Normalization()

    normalize.adapt(abalone_features)

    norm_abalone_model = tf.keras.Sequential(
        [normalize, layers.Dense(64),
         layers.Dense(1)])

    norm_abalone_model.compile(loss=tf.losses.MeanSquaredError(),
                               optimizer=tf.optimizers.Adam())
    norm_abalone_model.fit(abalone_features, abalone_labels, epochs=10)

    # Mixed data types
    url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
    titanic = pd.read_csv(url)
    print(titanic.head())

    titanic_features = titanic.copy()
    titanic_labels = titanic_features.pop('survived')

    # Create a symbolic input
    input = tf.keras.Input(shape=(), dtype=tf.float32)

    # Do a calculation using it
    result = 2 * input + 1

    # The result doesn't have a value
    print(f'Result: {result}')

    calc = tf.keras.Model(inputs=input, outputs=result)

    print(f'calc(1) = {calc(1).numpy()}')
    print(f'calc(2) = {calc(2).numpy()}')

    inputs = {}
    for name, column in titanic_features.items():
        dtype = column.dtype
        if dtype == object:
            dtype = tf.string
        else:
            dtype = tf.float32

        inputs[name] = tf.keras.Input(shape=(1, ), name=name, dtype=dtype)

    inputs

    numeric_inputs = {
        name: input
        for name, input in inputs.items() if input.dtype == tf.float32
    }

    x = layers.Concatenate()(list(numeric_inputs.values()))
    norm = preprocessing.Normalization()
    norm.adapt(np.array(titanic[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)

    all_numeric_inputs

    preprocessed_inputs = [all_numeric_inputs]

    for name, input in inputs.items():
        if input.dtype == tf.float32:
            continue

        lookup = preprocessing.StringLookup(
            vocabulary=np.unique(titanic_features[name]))
        one_hot = preprocessing.CategoryEncoding(
            max_tokens=lookup.vocab_size())

        x = lookup(input)
        x = one_hot(x)
        preprocessed_inputs.append(x)

    preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

    titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

    tf.keras.utils.plot_model(model=titanic_preprocessing,
                              rankdir='LR',
                              dpi=72,
                              show_shapes=True)

    titanic_features_dict = {
        name: np.array(value)
        for name, value in titanic_features.items()
    }

    features_dict = {
        name: values[:1]
        for name, values in titanic_features_dict.items()
    }
    titanic_preprocessing(features_dict)

    titanic_model = get_titanic_model(titanic_preprocessing, inputs)

    titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10)

    titanic_model.save('test')
    reloaded = tf.keras.models.load_model('test')

    features_dict = {
        name: values[:1]
        for name, values in titanic_features_dict.items()
    }

    before = titanic_model(features_dict)
    after = reloaded(features_dict)
    assert (before - after) < 1e-3
    print(f'Before: {before}')
    print(f'After: {after}')

    # Using tf.data
    # On in memory datasets
    for example in slices(titanic_features_dict):
        for name, value in example.items():
            print(f'{name:19s}: {value}')
        break

    titanic_ds = tf.data.Dataset.from_tensor_slices(
        (titanic_features_dict, titanic_labels))

    titanic_batches = titanic_ds.shuffle(len(titanic_labels)).batch(32)

    titanic_model.fit(titanic_batches, epochs=5)

    # From a single file
    url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
    titanic_file_path = tf.keras.utils.get_file('train.csv', url)

    titanic_csv_ds = tf.data.experimental.make_csv_dataset(
        titanic_file_path,
        batch_size=5,  # Artificially small to make examples easier to show.
        label_name='survived',
        num_epochs=1,
        ignore_errors=True,
    )

    for batch, label in titanic_csv_ds.take(1):
        for key, value in batch.items():
            print(f'{key:20s}: {value}')
        print()
        print(f'{"label":20s}: {label}')

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz'
    traffic_volume_csv_gz = tf.keras.utils.get_file(
        'Metro_Interstate_Traffic_Volume.csv.gz',
        url,
        cache_dir='.',
        cache_subdir='traffic')

    traffic_volume_csv_gz_ds = tf.data.experimental.make_csv_dataset(
        traffic_volume_csv_gz,
        batch_size=256,
        label_name='traffic_volume',
        num_epochs=1,
        compression_type='GZIP')

    for batch, label in traffic_volume_csv_gz_ds.take(1):
        for key, value in batch.items():
            print(f'{key:20s}: {value[:5]}')
        print()
        print(f'{"label":20s}: {label[:5]}')

    #Caching
    start = time.time()
    for i, (batch, label) in enumerate(traffic_volume_csv_gz_ds.repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    caching = traffic_volume_csv_gz_ds.cache().shuffle(1000)

    start = time.time()
    for i, (batch, label) in enumerate(caching.shuffle(1000).repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    start = time.time()
    snapshot = tf.data.experimental.snapshot('titanic.tfsnap')
    snapshotting = traffic_volume_csv_gz_ds.apply(snapshot).shuffle(1000)

    for i, (batch, label) in enumerate(snapshotting.shuffle(1000).repeat(20)):
        if i % 40 == 0:
            print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')

    # Multiple files
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00417/fonts.zip'
    _ = tf.keras.utils.get_file('fonts.zip',
                                url,
                                cache_dir='.',
                                cache_subdir='fonts',
                                extract=True)

    fonts_csvs = sorted(str(p) for p in pathlib.Path('fonts').glob('*.csv'))

    print(f'Fonts: {fonts_csvs[:10]}')
    print(f'Fonts len: {len(fonts_csvs)}')

    fonts_ds = tf.data.experimental.make_csv_dataset(
        file_pattern='fonts/*.csv',
        batch_size=10,
        num_epochs=1,
        num_parallel_reads=20,
        shuffle_buffer_size=10000)

    for features in fonts_ds.take(1):
        for i, (name, value) in enumerate(features.items()):
            if i > 15:
                break
            print(f'{name:20s}: {value}')
    print('...')
    print(f'[total: {len(features)} features]')

    # Optional: Packing fields
    fonts_image_ds = fonts_ds.map(make_images)

    for features in fonts_image_ds.take(1):
        break

    plt.figure(figsize=(6, 6), dpi=120)

    for n in range(9):
        plt.subplot(3, 3, n + 1)
        plt.imshow(features['image'][..., n])
        plt.title(chr(features['m_label'][n]))
        plt.axis('off')

    plt.show()

    # Lower level functions
    # `tf.io.decode_csv`
    text = pathlib.Path(titanic_file_path).read_text()
    lines = text.split('\n')[1:-1]

    all_strings = [str()] * 10
    print(f'{all_strings}')

    features = tf.io.decode_csv(lines, record_defaults=all_strings)

    for f in features:
        print(f'type: {f.dtype.name}, shape: {f.shape}')

    print(f'Sample record: {lines[0]}')

    titanic_types = [
        int(),
        str(),
        float(),
        int(),
        int(),
        float(),
        str(),
        str(),
        str(),
        str()
    ]
    print(f'Data types: {titanic_types}')

    features = tf.io.decode_csv(lines, record_defaults=titanic_types)

    for f in features:
        print(f'type: {f.dtype.name}, shape: {f.shape}')

    # `tf.data.experimental.CsvDataset`
    simple_titanic = tf.data.experimental.CsvDataset(
        titanic_file_path, record_defaults=titanic_types, header=True)

    for example in simple_titanic.take(1):
        print(f'Sample record: {[e.numpy() for e in example]}')

    def decode_titanic_line(line):
        return tf.io.decode_csv(line, titanic_types)

    manual_titanic = (
        # Load the lines of text
        tf.data.TextLineDataset(titanic_file_path)
        # Skip the header row
        .skip(1)
        # Decode the line
        .map(decode_titanic_line))

    for example in manual_titanic.take(1):
        print(f'Sample record: {[e.numpy() for e in example]}')

    # Multiple files
    font_line = pathlib.Path(fonts_csvs[0]).read_text().splitlines()[1]
    print(f'Sample: {font_line}')

    num_font_features = font_line.count(',') + 1
    font_column_types = [str(), str()] + [float()] * (num_font_features - 2)

    print(f'Fonts[0]: {fonts_csvs[0]}')

    simple_font_ds = tf.data.experimental.CsvDataset(
        fonts_csvs, record_defaults=font_column_types, header=True)

    for row in simple_font_ds.take(10):
        print(f'CSV first column: {row[0].numpy()}')

    font_files = tf.data.Dataset.list_files('fonts/*.csv')

    print('Epoch 1:')
    for f in list(font_files)[:5]:
        print(f'    {f.numpy()}')
    print('    ...')
    print()
    print('Epoch 2:')
    for f in list(font_files)[:5]:
        print(f'    {f.numpy()}')
    print('    ...')

    def make_font_csv_ds(path):
        return tf.data.experimental.CsvDataset(
            path, record_defaults=font_column_types, header=True)

    font_rows = font_files.interleave(make_font_csv_ds, cycle_length=3)

    fonts_dict = {'font_name': [], 'character': []}

    for row in font_rows.take(10):
        fonts_dict['font_name'].append(row[0].numpy().decode())
        fonts_dict['character'].append(chr(row[2].numpy()))

    print(pd.DataFrame(fonts_dict))

    # Performance
    BATCH_SIZE = 2048
    font_ds = tf.data.experimental.make_csv_dataset(file_pattern='fonts/*.csv',
                                                    batch_size=BATCH_SIZE,
                                                    num_epochs=1,
                                                    num_parallel_reads=100)

    start = time.time()
    for i, batch in enumerate(font_ds.take(20)):
        print('.', end='')
    print(f'Total time: {time.time() - start:.3f}')
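
# Example #25 above calls get_titanic_model(), which is not shown in this excerpt.
# Judging by the analogous heart_model() helper in Example #17, it presumably looks
# roughly like this sketch (a reconstruction, not the original author's code):
def get_titanic_model(preprocessing_head, inputs):
    # Small dense head stacked on top of the preprocessing model.
    body = tf.keras.Sequential([
        layers.Dense(64, activation='relu'),
        layers.Dense(1),
    ])
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)
    model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.optimizers.Adam(),
                  metrics=['accuracy'])
    return model
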
Example #26
}

x = layers.Concatenate()(list(numeric_inputs.values()))
norm = preprocessing.Normalization()
norm.adapt(np.array(titanic[numeric_inputs.keys()]))
all_numeric_inputs = norm(x)

all_numeric_inputs

preprocessed_inputs = [all_numeric_inputs]

for name, input in inputs.items():
    if input.dtype == tf.float32:
        continue

    lookup = preprocessing.StringLookup(
        vocabulary=np.unique(titanic_features[name]))
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())

    x = lookup(input)
    x = one_hot(x)
    preprocessed_inputs.append(x)

preprocessed_inputs_cat = layers.Concatenate()(preprocessed_inputs)

titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# tf.keras.utils.plot_model(model = titanic_preprocessing , rankdir="LR", dpi=72, show_shapes=True)

titanic_features_dict = {
    name: np.array(value)
    for name, value in titanic_features.items()
Example #27
def main(do_train=True):
    ## Open and pre-process the data
    # Verified this works with alphabet now lets make things more interesting
    # data = open('./archive/alphabet.txt').read()
    data = open('./archive/drake_lyrics.txt').read()
    print('Length of text: {} characters'.format(len(data)))

    vocab = sorted(set(data))

    # This function-as-variable setup is weird to me, but whatever
    ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
    chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=ids_from_chars.get_vocabulary(), invert=True)

    # Preprocess the text into characters
    all_ids = ids_from_chars(tf.strings.unicode_split(data, 'UTF-8'))
    vocab_size = len(ids_from_chars.get_vocabulary())

    # Output vocab mapping for sanity
    vocab_sample = list(range(0, vocab_size))
    tf_vocab = tf.convert_to_tensor(vocab_sample)
    mapped_vocab = chars_from_ids(tf_vocab).numpy()
    print(vocab_sample)
    print(mapped_vocab)

    if do_train:
        (split_xs, split_ys) = split_data(all_ids.numpy(), vocab_size,
                                          seq_length, char_to_process)

        ## Build the model
        model = tf.keras.models.Sequential()
        model.add(
            tf.keras.layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=seq_length))
        model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150)))
        model.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))
        print(model.summary())

        adam = tf.keras.optimizers.Adam(lr=0.01)
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam,
                      metrics=['accuracy'])

        ## Train the model
        #earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
        history = model.fit(x=split_xs, y=split_ys, epochs=10, verbose=1)
        # Uncomment to show dope graph
        #plot_graphs(history, 'accuracy')

        saved_model_dir = "./models/simple_model/"
        model_filename = 'simple_bars.h5'
        model.save(saved_model_dir + model_filename)
    else:
        print("Loading model from file.")
        model = tf.keras.models.load_model(
            "./models/simple_model/simple_bars.h5")

    ## Generate text with model
    # Dank, so we trained for 10 epochs on a slice of the data
    # Let's see what this model generates for a few seeds
    num_chars = 100
    seed_text = '[Verse]\n'
    output_text = generate_text(seed_text,
                                model,
                                ids_from_chars,
                                chars_to_gen=num_chars)
    print("Input sequence was: %s" % (seed_text))
    print("%d character generated sequence:\n%s" % (num_chars, output_text))

    seed_text = 'boy'
    output_text = generate_text(seed_text,
                                model,
                                ids_from_chars,
                                chars_to_gen=num_chars)
    print("Input sequence was: %s" % (seed_text))
    print("%d character generated sequence:\n%s" % (num_chars, output_text))

    seed_text = 'you'
    output_text = generate_text(seed_text,
                                model,
                                ids_from_chars,
                                chars_to_gen=num_chars)
    print("Input sequence was: %s" % (seed_text))
    print("%d character generated sequence:\n%s" % (num_chars, output_text))

    seed_text = 'love'
    output_text = generate_text(seed_text,
                                model,
                                ids_from_chars,
                                chars_to_gen=num_chars)
    print("Input sequence was: %s" % (seed_text))
    print("%d character generated sequence:\n%s" % (num_chars, output_text))