Example 1
def build_classifier(text):
    MAX_VOCAB_SIZE = 20000
    encoder = TextVectorization(max_tokens=MAX_VOCAB_SIZE)
    encoder.adapt(text)
    vocabset = set(encoder.get_vocabulary())
    vocab_size = len(encoder.get_vocabulary())

    word2idx, weights = get_glove_embeddings(vocabset)
    embedding_matrix = np.zeros((vocab_size, weights.shape[1]))
    for i, word in enumerate(encoder.get_vocabulary()):
        vec = word2idx.get(word)
        if vec is not None:
            embedding_matrix[i] = weights[vec]
    
    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            input_dim=embedding_matrix.shape[0],
            output_dim=embedding_matrix.shape[1],
            weights=[embedding_matrix],  # Keras expects a list of weight arrays here
            mask_zero=True,
            trainable=True
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model
Example 2
def prepare_data_set_for_training(raw_train_ds, raw_val_ds, raw_test_ds):
    max_features = 10000
    sequence_length = 250

    # create vectorization layer
    vectorize_layer = TextVectorization(standardize=custom_standardization,
                                        max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda x, y: x)
    vectorize_layer.adapt(train_text)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # retrieve a batch (of 32 reviews and labels) from the dataset
    text_batch, label_batch = next(iter(raw_train_ds))
    first_review, first_label = text_batch[0], label_batch[0]
    print("Review", first_review)
    print("Label", raw_train_ds.class_names[first_label])
    print("Vectorized review", vectorize_text(first_review, first_label))

    #explore the vocabulary
    print("1287 ---> ", vectorize_layer.get_vocabulary()[1287])
    print(" 313 ---> ", vectorize_layer.get_vocabulary()[313])
    print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)
    return max_features, train_ds, val_ds, test_ds, vectorize_layer
Example 3
    def __init__(self, model_dir):
        # load the artifacts
        self.artifacts = pickle.load(
            open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb'))

        # create the vectorizers
        train_src = np.load(os.path.join(model_dir, 'train_src.npy'),
                            allow_pickle=True)
        train_tgt = np.load(os.path.join(model_dir, 'train_tgt.npy'),
                            allow_pickle=True)

        vectorizer_src = TextVectorization()
        vectorizer_src.adapt(train_src)
        train_seq = vectorizer_src(train_src)
        self.vectorizer_src = vectorizer_src

        vectorizer_tgt = TextVectorization()
        vectorizer_tgt.adapt(train_tgt)
        self.vectorizer_tgt = vectorizer_tgt

        # load models
        vocab_src = vectorizer_src.get_vocabulary()
        self.encoder = MyEncoder(
            len(vocab_src),
            embedding_dim=self.artifacts['embedding_size'],
            enc_units=self.artifacts['bottleneck_units'],
            batch_size=self.artifacts['batch_size'])

        # call the model first to create the variables
        sample_hidden = self.encoder.initialize_hidden_state()
        sample_output, sample_hidden = self.encoder(
            tf.zeros(
                (self.artifacts['batch_size'], train_seq.numpy().shape[1])),
            sample_hidden)
        self.encoder.load_weights(
            os.path.join(model_dir,
                         f'encoder_weights_e{self.artifacts["epochs"]}.h5'))
        print(self.encoder.summary())

        vocab_tgt = vectorizer_tgt.get_vocabulary()
        self.decoder = MyDecoder(
            len(vocab_tgt),
            embedding_dim=self.artifacts['embedding_size'],
            dec_units=self.artifacts['bottleneck_units'],
            batch_size=self.artifacts['batch_size'])

        # call the model first to create the variables
        _ = self.decoder(tf.random.uniform((self.artifacts['batch_size'], 1)),
                         sample_hidden, sample_output)
        self.decoder.load_weights(
            os.path.join(model_dir,
                         f'decoder_weights_e{self.artifacts["epochs"]}.h5'))
        print(self.decoder.summary())
Example 4
    def get_base_embedder(self, vocab):

        # text-to-index-list layer
        encoder = TextVectorization(
            max_tokens=self.max_tokens,
            output_mode="int",
            output_sequence_length=self.max_sentence_length,
            vocabulary=vocab,
        )

        # embedder model
        # text input layer
        input_layer = Input(shape=(1,), dtype=tf.string)

        # text to index layer
        vectorize_layer = encoder(input_layer)

        # embedding layer
        embedding_layer = Embedding(input_dim=len(encoder.get_vocabulary()),
                                    output_dim=32,
                                    mask_zero=True)(vectorize_layer)

        # bidirectional lstm layer
        bi_lstm_layer = Bidirectional(
            LSTM(32, name="lstm-layer"),
            name="bidirectional-layer")(embedding_layer)

        # normalization layer
        norm_layer = BatchNormalization()(bi_lstm_layer)

        # final embedding layer
        embedding = Dense(self.embedding_size,
                          name="embedding-layer")(norm_layer)

        return Model(inputs=input_layer, outputs=embedding)
Example 5
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build Text vectorization layer

    Args:
      texts (list): List of strings, i.e., input texts.
      vocab_size (int): Vocabulary size.
      max_seq (int): Maximum sequence length.
      special_tokens (list, optional): List of special tokens. Defaults to ['[MASK]'].

    Returns:
        layers.Layer: The adapted TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
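A minimal usage sketch for the helper above, in the spirit of the Keras masked-language-modeling example this function resembles; the toy sentences and the custom_standardization stub are assumptions, not part of the original snippet.

import re
import tensorflow as tf

def custom_standardization(input_data):
    # assumed stub: lowercase and strip punctuation except square brackets, so "[mask]" survives
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(
        lowercase, "[%s]" % re.escape("!\"#$%&'()*+,-./:;<=>?@\\^_`{|}~"), "")

layer = get_vectorize_layer(
    ["ask and it will be given to you", "seek and you will find"],
    vocab_size=100, max_seq=16)

# "[mask]" is appended as the last vocabulary entry, so its id equals len(vocab) - 1
mask_token_id = layer(["[mask]"]).numpy()[0][0]
print(mask_token_id == len(layer.get_vocabulary()) - 1)  # expected: True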
Example 6
    def get_vectorize_layer(self, texts, special_tokens=["mask"]):
        """Build Text vectorization layer

        Args:
          texts (list): List of strings, i.e., input texts.
          special_tokens (list, optional): List of special tokens. Defaults to ['mask'].
          (Vocabulary size and maximum sequence length are taken from self.config.)

        Returns:
            layers.Layer: The adapted TextVectorization Keras layer.
        """
        vectorize_layer = TextVectorization(
            max_tokens=self.config.VOCAB_SIZE,
            output_mode="int",
            ngrams=None,
            standardize="lower_and_strip_punctuation",
            output_sequence_length=self.config.MAX_LEN,
        )
        vectorize_layer.adapt(texts)

        # Insert mask token in vocabulary
        vocab = vectorize_layer.get_vocabulary()
        vocab = vocab[2:self.config.VOCAB_SIZE -
                      len(special_tokens)] + ["mask"]
        vectorize_layer.set_vocabulary(vocab)
        return vectorize_layer
Example 7
    def load_dataset(self):
        raw_train_ds, raw_val_ds, raw_test_ds = self.load_raw_data()
        max_features = 10000
        sequence_length = 250

        vectorize_layer = TextVectorization(
            standardize=self.custom_standardization,
            max_tokens=self.max_tokens,
            output_mode='int',
            output_sequence_length=sequence_length)

        # Make a text-only dataset (without labels), then call adapt
        train_text = raw_train_ds.map(lambda x, y: x)
        vectorize_layer.adapt(train_text)
        self.vocab = vectorize_layer.get_vocabulary()

        train_ds = raw_train_ds.map(
            lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
        val_ds = raw_val_ds.map(lambda x, y:
                                (vectorize_layer(tf.expand_dims(x, -1)), y))
        test_ds = raw_test_ds.map(lambda x, y:
                                  (vectorize_layer(tf.expand_dims(x, -1)), y))

        train_ds = train_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
        val_ds = val_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
        test_ds = test_ds.cache().prefetch(buffer_size=self.AUTOTUNE)

        return train_ds, val_ds, test_ds
Example 8
def get_vectorize_layer(texts, vocab_size, max_seq):
    """Build Text vectorization layer

    Args:
      texts (list): List of string, i.e., input texts
      vocab_size (int): vocab size
      max_seq (int): Maximum sequence length.

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token in the vocabulary.
    # adapt() builds the vocabulary with index 0 reserved for the padding token ''
    # and index 1 for the OOV token '[UNK]'; all entries are lower-cased.
    vocab = vectorize_layer.get_vocabulary()
    # Drop the two reserved entries and the least frequent word (to make room), then append '[mask]'.
    vocab = vocab[2:len(vocab) - 1] + ["[mask]"]
    # set_vocabulary() re-inserts '' and '[UNK]', so '[mask]' ends up as the last
    # (least frequent) entry of the final vocabulary.
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
Example 9
class TFVectTokenizer:
    def __init__(self, seqlen, step, freq_threshold):
        self.freq_threshold = freq_threshold
        self.seqlen = seqlen
        self.step = step
        self.vocab_size = 20000
        self.vectorize_layer = TextVectorization(
            standardize=custom_standardization,
            max_tokens=self.vocab_size - 1,
            output_mode="int",
            output_sequence_length=self.seqlen + 1,
        )

    def tokenize(self, text_ds):
        # Create a vectorization layer and adapt it to the text

        self.vectorize_layer.adapt(text_ds)
        vocab = self.vectorize_layer.get_vocabulary()  # to get words back from token indices
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        return text_ds, vocab, reverse_token_map

    def get_input_sequences(self, text_ds, reverse_token_map):
        text_ds = text_ds.map(
            lambda text: prepare_lm_inputs_labels(text, self.vectorize_layer))
        text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
        return text_ds
Example 10
def build_vocab(directories, batch_size, vocab_size, maxlen):
    global vectorize_layer

    # Create a list of all files
    filenames = []
    for dir in directories:
        for f in os.listdir(dir):
            filenames.append(os.path.join(dir, f))

    print(f"{len(filenames)} files")

    # Create dataset from text files
    random.shuffle(filenames)
    text_ds = tf.data.TextLineDataset(filenames)
    text_ds = text_ds.shuffle(buffer_size=256)
    text_ds = text_ds.batch(batch_size)

    # Create a vectorization layer and adapt it to the text
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size - 1,
        output_mode="int",
        output_sequence_length=maxlen + 1,
    )
    vectorize_layer.adapt(text_ds)
    vocab = vectorize_layer.get_vocabulary()

    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index[word] = index

    text_ds = text_ds.map(prepare_lm_inputs_labels)
    text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)

    return (text_ds, vocab, word_to_index)
Example 11
def build_model(train_dataset: PrefetchDataset) -> Sequential:
    """
    Initializes a Sequential model and adds text vectorization, word embedding, LSTM, and densely connected layers.
    :param train_dataset: The dataset to adapt the vocabulary on.
    :return: A Sequential object.
    """

    # Initialize the TextVectorization layer which assigns integers to each token
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)

    # Set the vocabulary for the encoding layer. This will be used to initialize a lookup table of word embeddings.
    # The code for this and subsequent layers adapted from:
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn#create_the_text_encoder
    encoder.adapt(train_dataset.map(lambda text, label: text))

    model = Sequential()
    model.add(encoder)
    # Next we add our word embedding layer which converts token indices into dense vectors
    model.add(Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=8, activity_regularizer=l2(0.001),
                        mask_zero=True))
    # Bidirectional wrapper for LSTM allows data to be processed forwards and backwards and then concatenated into
    # one output
    model.add(Bidirectional(LSTM(8)))
    # Densely connected layers with L2 regularization to reduce over-fitting
    model.add(Dense(8, activation="relu", kernel_regularizer=l2(0.001), activity_regularizer=l2(0.001)))
    model.add(Dense(1, activation="sigmoid"))

    return model
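A possible compile-and-train step for the model returned above (not part of the original snippet); the loss and optimizer choices follow the TensorFlow tutorial cited in the comments, and train_dataset / validation_dataset are assumed to yield (text, label) batches with binary labels.

import tensorflow as tf

model = build_model(train_dataset)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),  # sigmoid output above
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=["accuracy"])
history = model.fit(train_dataset, epochs=10, validation_data=validation_dataset)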
Example 12
def initialize_vectorizer_layer(text, pad_length, max_tokens=None):
    # Create vectorizer
    vectorizer = TextVectorization(output_sequence_length=pad_length,
                                   standardize=None,
                                   max_tokens=max_tokens)
    vectorizer.adapt(text)
    vocab = vectorizer.get_vocabulary()
    return vectorizer, vocab


def vectorizer(raw_train_ds):
    vectorizer = TextVectorization(standardize=custom_standardization,
                                   max_tokens=max_tokens,
                                   output_sequence_length=sequence_length)
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorizer.adapt(text_ds)
    np.savetxt('voc.out', vectorizer.get_vocabulary(), fmt='%s')
    return vectorizer
Example 14
def main():
    train_dataset, test_dataset = generate_data()
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    vocab = np.array(encoder.get_vocabulary())
    #print(vocab[:20])
    
    LSTM_model(train_dataset, test_dataset, encoder)
Example 15
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary.
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:vocab_size - len(special_tokens)] + ["[mask]"]
    #vocab = vocab[2 : vocab_size - len(special_tokens)] + special_tokens
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
Example 16
class DataPrepare:
    def __init__(self):
        vocab_size = 50000
        batch_size = 128
        maxlen = 5
        filenames = ["test.txt"]
        self.text_ds = tf.data.TextLineDataset(filenames)
        self.text_ds = self.text_ds.shuffle(buffer_size=256)
        self.text_ds = self.text_ds.batch(batch_size)
        self.vectorize_layer = TextVectorization(
            standardize=self.custom_standardization,
            max_tokens=vocab_size - 1,
            output_mode="int",
            output_sequence_length=maxlen + 1,
        )
        self.vectorize_layer.adapt(self.text_ds)
        self.vocab = self.vectorize_layer.get_vocabulary()  # to get words back from token indices
        self.text_ds = self.text_ds.map(self.prepare_lm_inputs_labels)
        self.text_ds = self.text_ds.prefetch(tf.data.experimental.AUTOTUNE)

    def custom_standardization(self, input_string):
        """ Remove html line-break tags and handle punctuation """
        lowercased = tf.strings.lower(input_string)
        stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
        return tf.strings.regex_replace(stripped_html,
                                        f"([{string.punctuation}])", r" \1")

    def prepare_lm_inputs_labels(self, text):
        """
        Shift word sequences by 1 position so that the target for position (i) is
        word at position (i+1). The model will use all words up till position (i)
        to predict the next word.
        """
        text = tf.expand_dims(text, -1)
        tokenized_sentences = self.vectorize_layer(text)
        x = tokenized_sentences[:, :-1]
        y = tokenized_sentences[:, 1:]
        return x, y

    def get_dataset(self):
        return self.text_ds

    def get_vocab(self):
        return self.vocab

    def get_vectorize_layer(self):
        return self.vectorize_layer
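To make the shift described in prepare_lm_inputs_labels concrete, a toy tensor standing in for the output of vectorize_layer (the token ids are invented):

import tensorflow as tf

tokenized = tf.constant([[5, 12, 7, 3, 9, 0]])  # one "sentence" of token ids, 0 = padding
x = tokenized[:, :-1]  # [[5, 12, 7, 3, 9]]  -> inputs: all words up to position i
y = tokenized[:, 1:]   # [[12, 7, 3, 9, 0]]  -> targets: the word at position i + 1
print(x.numpy(), y.numpy())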
Example 17
    def vectorize(self):
        vectorizer = TextVectorization(max_tokens=20000,
                                       output_sequence_length=200)
        text_ds = tf.data.Dataset.from_tensor_slices(
            self.train_samples).batch(128)
        vectorizer.adapt(text_ds)

        self.train_X = vectorizer(np.array([[s] for s in self.train_samples])).numpy()
        self.test_X = vectorizer(np.array([[s] for s in self.val_samples])).numpy()

        self.train_y = np.array(self.train_labels)
        self.test_y = np.array(self.val_labels)

        return vectorizer.get_vocabulary()
Example 18
def get_model() -> Tuple[Sequential, Callable[[str, bool], Optional[list]]]:
    importedModel = tf.keras.models.load_model('savedModel/data')

    max_features = 10000  # number of max distinct words to be extracted from a dataset
    sequence_length = 250  # size of output sequence, constant regardless of number of tokens extracted from a sample

    batch_size = 32
    seed = 42

    raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
        'aclImdb/train',
        batch_size=batch_size,
        validation_split=0.2,
        subset='training',
        seed=seed)

    vectorization_layer = TextVectorization(
        standardize=lowercase_and_html_escape,
        max_tokens=max_features,
        output_mode='int',
        output_sequence_length=sequence_length)

    train_text_without_labels = raw_train_ds.map(lambda text, label: text)
    vectorization_layer.adapt(
        train_text_without_labels
    )  # build the vocabulary of distinct words from the training set

    weights = importedModel.layers[0].get_weights()[0]
    vocabulary = vectorization_layer.get_vocabulary()
    dense_layer_weights = importedModel.layers[4].get_weights()[0]
    get_most_influential_words = most_influential_words_factory(
        weights, vocabulary, dense_layer_weights)

    model = tf.keras.Sequential(
        [vectorization_layer, importedModel,
         layers.Activation('sigmoid')])

    model.compile(loss=losses.BinaryCrossentropy(from_logits=False),
                  optimizer="adam",
                  metrics=['accuracy'])

    return model, get_most_influential_words
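Because the vectorization layer is the first layer of the exported model, it can be called on raw strings directly. A hypothetical sanity check (the sample reviews are made up, and the saved model and aclImdb folder referenced above must exist locally):

import tensorflow as tf

model, get_most_influential_words = get_model()
examples = tf.constant([
    "The movie was great!",
    "The movie was terrible...",
])
print(model.predict(examples))  # per-review probabilities in [0, 1] from the sigmoid activation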
Example 19
class BiLSTMModel(BaseModel):
    def __init__(self):
        self.vocab_size = 2000
        self.encoder = TextVectorization(max_tokens=self.vocab_size)
        self.le = LabelEncoder()
        self.le.fit(['neu', 'neg', 'pos'])

    def train(self, X_train, Y_train):
        Y_train = to_categorical(self.le.transform(Y_train), 3)
        self.encoder.adapt(X_train)
        self.model = Sequential([
            self.encoder,
            Embedding(input_dim=len(self.encoder.get_vocabulary()),
                      output_dim=64,
                      mask_zero=True),
            Bidirectional(LSTM(64, return_sequences=True)),
            Bidirectional(LSTM(32)),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(3, activation='softmax')
        ])
        self.model.compile(loss=CategoricalCrossentropy(),
                           optimizer=Adam(1e-4),
                           metrics=['accuracy'])
        self.model.fit(x=X_train, y=Y_train, epochs=10)

    def analyze(self, X_test, Y_test):
        self.model.summary()
        Y_test = to_categorical(self.le.transform(Y_test), 3)
        test_loss, test_acc = self.model.evaluate(x=X_test, y=Y_test)

        print(
            f'{self.__class__.__name__} Accuracy: {test_acc} Loss: {test_loss}'
        )

    def predict(self, texts):
        texts = [text_cleaner(text) for text in texts]
        p = self.model.predict(texts)
        y_classes = [np.argmax(y, axis=None, out=None) for y in p]

        return self.le.inverse_transform(y_classes)
Example 20
def load_cnn_dailymail_experiment(batch_size=1,
                                  max_vocab=5000,
                                  max_sequence=400):

    (_, _, ds_val), ds_info = tfds.load("cnn_dailymail",
                                        split=['train', 'test', 'validation'],
                                        shuffle_files=True,
                                        as_supervised=True,
                                        with_info=True)

    int_vectorize = TextVectorization(max_tokens=max_vocab,
                                      output_mode='int',
                                      output_sequence_length=max_sequence,
                                      standardize=standardize)

    ds_val = ds_val.map(remove_newline,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    #ds_val = ds_val.map(add_EOS_BOS, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # "Train" vectorization layer on validate articles. *for debugging purposes only!!*
    int_vectorize.adapt(ds_val.map(lambda article, highlights: article))

    def int_vectorize_map(article, highlights, original_article,
                          original_highlights):
        article = tf.expand_dims(article, -1)
        highlights = tf.expand_dims(highlights, -1)
        return int_vectorize(article), int_vectorize(
            highlights), original_article, original_highlights

    ds_val = ds_val.map(duplicate_originals,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(int_vectorize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(standardize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.shuffle(ds_info.splits['validation'].num_examples)
    ds_val = ds_val.batch(batch_size=batch_size)
    ds_val = ds_val.prefetch(tf.data.experimental.AUTOTUNE)

    return ds_val, None, None, int_vectorize.get_vocabulary()
Example 21
def gen_vocab(data, max_tokens = 200000):
    """
    helper function to generate the vocab for embedding. 
    by default this will limit to the top 20000 tokens

    Parameters
    ----------
    data : dataset from the pipeline.

    Returns
    -------
    vocab : 

    vectorizer : vectorizer for encoding x_train and y_train words
    

    """
    vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=200)
    text_ds = tf.data.Dataset.from_tensor_slices(data).batch(128)
    vectorizer.adapt(text_ds)
    vocab = vectorizer.get_vocabulary()
    return vocab, vectorizer
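A sketch of how the returned vocab might feed an embedding layer, mirroring the GloVe lookup in Example 1; embeddings_index is a hypothetical word-to-vector dict (e.g. parsed from a GloVe file) and the training sentences are invented.

import numpy as np

vocab, vectorizer = gen_vocab(["first training sentence", "second training sentence"])
word_index = dict(zip(vocab, range(len(vocab))))

embeddings_index = {}  # stand-in: maps word -> np.ndarray of shape (embedding_dim,)
embedding_dim = 100    # assumed pretrained vector size
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for word, i in word_index.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector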
Example 22
def get_vectorize_layer(texts, max_seq=20, special_tokens=["x"]):
    """Build Text vectorization layer

    Args:
      texts (list): List of strings, i.e., input texts.
      max_seq (int): Maximum sequence length.
      special_tokens (list, optional): List of special tokens. Defaults to ['x'].

    Returns:
        layers.Layer: The adapted TextVectorization Keras layer.
    """
    vectorize_layer = TextVectorization(
        output_mode="int",
        split=char_split,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2:] + ["x"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
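char_split is not shown in this excerpt; below is a plausible character-level splitter plus a usage sketch, both assumptions rather than the original implementation.

import tensorflow as tf

def char_split(input_strings):
    # hypothetical splitter: break each string into individual unicode characters
    return tf.strings.unicode_split(input_strings, "UTF-8")

layer = get_vectorize_layer(["abc ab", "aabbcc"], max_seq=10)
print(layer(["abc"]).numpy())       # per-character ids, padded to max_seq
print(layer.get_vocabulary()[-1])   # 'x' was appended as the last vocabulary entry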
Example 23
    def vectorize_text(self, text_dataset, debug=False):
        """

        :param text_dataset:
        :return:
        """
        vectorize_layer = TextVectorization(standardize=self.custom_standardization,
                                            max_tokens=self.vocab_size,
                                            output_mode='int',
                                            output_sequence_length=self.sequence_len)
        vectorize_layer.adapt(text_dataset.batch(self.batch_size))
        self.vocab = vectorize_layer.get_vocabulary()

        text_vector_ds = text_dataset.batch(self.batch_size).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

        sequences = list(text_vector_ds.as_numpy_iterator())

        if debug:
            print(f"====>>>> length of sequences: {len(sequences)}")

            for seq in sequences[:5]:
                print(f"====>>>> {seq} => {[self.vocab[i] for i in seq]}")

        return sequences
Example 24
vectorize_layer.adapt(train_text)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

print("1287 ---> ", vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ", vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

embedding_dim = 16
Example 25
Our layer will only consider the top 20,000 words, and will truncate or pad sequences so
that they are exactly 200 tokens long.
"""

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)
"""
You can retrieve the computed vocabulary used via `vectorizer.get_vocabulary()`. Let's
print the top 5 words:
"""

vectorizer.get_vocabulary()[:5]
"""
Let's vectorize a test sentence:
"""

output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]
"""
As you can see, "the" gets represented as "2". Why not 0, given that "the" was the first
word in the vocabulary? That's because index 0 is reserved for padding and index 1 is
reserved for "out of vocabulary" tokens.

Here's a dict mapping words to their indices:
"""

voc = vectorizer.get_vocabulary()
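The word-to-index dict promised above, completed along the lines of the Keras pretrained-embeddings guide this excerpt follows; the test list is illustrative.

word_index = dict(zip(voc, range(len(voc))))

test = ["the", "cat", "sat", "on", "the", "mat"]
print([word_index.get(w, 1) for w in test])  # 1 = [UNK] fallback; "the" maps to 2, as noted above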
Example 26
        return {'batch_size': self.batch_size, 'dec_units': self.dec_units}


# test
if __name__ == '__main__':
    # target text that we will eventually want to decode to
    # see previous task for the source text
    spanish_text = [
        'Pidan, y se les dará', 'busquen, y encontrarán',
        'llamen, y se les abrirá.'
    ]

    texts_delimited = [f'{START_TOKEN} {t} {END_TOKEN}' for t in spanish_text]
    vectorizer = TextVectorization()
    vectorizer.adapt(texts_delimited)
    vocab = vectorizer.get_vocabulary()
    print('Vocabulary', vocab)
    print('Vocabulary size', len(vocab))

    print('========================')
    print('Vectorized texts')
    sequences = vectorizer(texts_delimited)
    print(sequences)

    sample_encoder_output = np.array(
        [[-0.00256194], [-0.00898881], [-0.00391034]], dtype=np.float32)
    sample_encoder_hidden = np.array(
        [[-0.00156194], [0.00020050], [-0.00095034]], dtype=np.float32)

    decoder = MyDecoder(len(vocab),
                        embedding_dim=EMBEDDING_SIZE,
Example 27
def load_cnn_dailymail_deep(batch_size=1, max_vocab=5000, max_sequence=400):

    (ds_train, ds_test,
     ds_val), ds_info = tfds.load("cnn_dailymail",
                                  split=['train', 'test', 'validation'],
                                  shuffle_files=False,
                                  as_supervised=True,
                                  with_info=True)
    ds_train = ds_train.take(20000)
    ds_test = ds_test.take(1000)
    ds_val = ds_val.take(100)

    int_vectorize = TextVectorization(max_tokens=max_vocab,
                                      output_mode='int',
                                      output_sequence_length=max_sequence,
                                      standardize=standardize)

    ds_train = ds_train.map(remove_newline,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # "Train" vectorization layer on training articles.
    int_vectorize.adapt(ds_train.map(lambda article, highlights: article))

    def int_vectorize_map(article, highlights, original_article,
                          original_highlights):
        article = tf.expand_dims(article, -1)
        highlights = tf.expand_dims(highlights, -1)
        return int_vectorize(article), int_vectorize(
            highlights), original_article, original_highlights

    ds_train = ds_train.map(duplicate_originals,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.map(int_vectorize_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.map(standardize_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples)
    ds_train = ds_train.batch(batch_size=batch_size)
    ds_train = ds_train.prefetch(tf.data.experimental.AUTOTUNE)

    ds_test = ds_test.map(remove_newline,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(duplicate_originals,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(int_vectorize_map,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.map(standardize_map,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_test = ds_test.batch(batch_size=batch_size)
    ds_test = ds_test.cache()
    ds_test = ds_test.prefetch(tf.data.experimental.AUTOTUNE)

    ds_val = ds_val.map(remove_newline,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(duplicate_originals,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(int_vectorize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.map(standardize_map,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds_val = ds_val.shuffle(ds_info.splits['validation'].num_examples)
    ds_val = ds_val.batch(batch_size=batch_size)
    ds_val = ds_val.prefetch(tf.data.experimental.AUTOTUNE)

    return ds_train, ds_val, ds_test, int_vectorize.get_vocabulary()
Example 28
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=cross_entropy)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)
"""
## Check sample predictions
"""

vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())


def generate_caption():
    # Select a random image from the validation dataset
    sample_img = np.random.choice(valid_images)

    # Read the image from the disk
    sample_img = read_image(sample_img)
    img = sample_img.numpy().astype(np.uint8)
    plt.imshow(img)
    plt.show()
Example 29
    return int_vectorize_layer(text), label


# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", first_label)

print("'binary' vectorized question:",
      binary_vectorize_text(first_question, first_label)[0])

print("'int' vectorized question:",
      int_vectorize_text(first_question, first_label)[0])

print("1289 ---> ", int_vectorize_layer.get_vocabulary()[1289])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[313])
print("Vocabulary size: {}".format(len(int_vectorize_layer.get_vocabulary())))

binary_train_ds = raw_train_ds.map(binary_vectorize_text)
binary_val_ds = raw_val_ds.map(binary_vectorize_text)
binary_test_ds = raw_test_ds.map(binary_vectorize_text)

int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)

AUTOTUNE = tf.data.experimental.AUTOTUNE


def configure_dataset(dataset):
    # cache and prefetch, mirroring the dataset handling earlier in these examples
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

    def prepare_model_input(self):
        """
        Reads and parses MIDI files, then saves the note sequences as text files in the
        train and val directories.

        Note that each song is treated as 1 long sentence, which has implications for the
        max_sequence_length argument passed to the transformer.  In the future, it may be worth
        exploring how to break up the notes into sentence-like chunks, perhaps by using bars
        like MuseGAN.

        :return: TF TextLineDataset object with x and y mapped.
        """
        # if train and val directories do not exist, create them
        cwd = os.getcwd()
        data_dir = cwd + "/maestro-v3.0.0-midi/maestro-v3.0.0/"
        Path(cwd + "/clean/train").mkdir(parents=True, exist_ok=True)
        Path(cwd + "/clean/val").mkdir(parents=True, exist_ok=True)

        # read Maestro metadata to Pandas, shuffle, then determine train/test split
        metadata = pd.read_csv(data_dir + "maestro-v3.0.0.csv")
        metadata = metadata.sample(frac=1.0)
        train_val_split_index = int(len(metadata) * 0.8)

        # only parse MIDI files if the clean train or test dirs are empty
        if len(os.listdir('clean/train')) == 0 or len(
                os.listdir('clean/val')) == 0:
            # use the Pandas df to determine which parsed files to store in each folder
            for file_idx, file in enumerate(metadata.midi_filename.to_list()):
                # Pre-pend the data directory to the file path
                file = data_dir + file
                print("Parsing MIDI file:", file_idx, "/", len(metadata))
                notes_list, offsets = self.parse_midi_file(file)
                offsets_from_prior_note = self.offsets_relative_to_prior_note(
                    offsets=offsets)
                if file_idx <= train_val_split_index:
                    path_prefix = "clean/train/"
                else:
                    path_prefix = "clean/val/"
                # save each parsed sequence of notes as a string in a txt file
                with open(
                        path_prefix +
                        file[file.rindex("/") + 1:file.rindex(".")] + ".txt",
                        "w") as text_file:
                    text_file.write(' '.join(notes_list))

        # walk through the directories
        filenames = []
        directories = [
            "clean/train",
            "clean/val",
        ]
        for dir in directories:
            for f in os.listdir(dir):
                filenames.append(os.path.join(dir, f))

        print(f"{len(filenames)} files")

        # Create a dataset from text files
        random.shuffle(filenames)
        text_ds = tf.data.TextLineDataset(filenames)
        text_ds = text_ds.shuffle(buffer_size=256)
        text_ds = text_ds.batch(self.batch_size)

        # Create a vectorization layer and adapt it to the text
        vectorize_layer = TextVectorization(
            standardize=None,  # do not perform any pre-processing or cleaning
            max_tokens=self.vocab_size - 1,
            output_mode="int",
            output_sequence_length=self.max_sequence_len + 1,
        )
        vectorize_layer.adapt(text_ds)
        self.vocab = vectorize_layer.get_vocabulary()

        def create_x_and_y(text):
            """
            Shift word sequences by 1 position so that the target for position (i) is
            word at position (i+1). The model will use all words up till position (i)
            to predict the next word.
            """
            text = tf.expand_dims(text, -1)
            tokenized_sentences = vectorize_layer(text)
            x = tokenized_sentences[:, :-1]
            y = tokenized_sentences[:, 1:]

            return x, y

        return text_ds.map(create_x_and_y)