def init_vectorize_layer(self, text_dataset: np.ndarray) -> TextVectorization:
    text_vectorizer = TextVectorization(max_tokens=self.max_features,
                                        standardize=self.custom_preprocessing,
                                        output_mode='int',
                                        output_sequence_length=self.max_len)
    text_vectorizer.adapt(text_dataset)
    return text_vectorizer
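A minimal, self-contained usage sketch of a helper like the one above. The corpus and the max_features/max_len values are illustrative assumptions, the default standardization stands in for self.custom_preprocessing, and the import path assumes TF >= 2.6 where TextVectorization lives under tf.keras.layers:

import numpy as np
from tensorflow.keras.layers import TextVectorization

max_features, max_len = 10000, 50  # hypothetical stand-ins for self.max_features / self.max_len
corpus = np.array(["the cat sat on the mat", "dogs and cats living together"])

vectorizer = TextVectorization(max_tokens=max_features,
                               output_mode='int',
                               output_sequence_length=max_len)
vectorizer.adapt(corpus)            # build the vocabulary from the corpus
print(vectorizer(corpus).shape)     # (2, 50): each string becomes max_len token ids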
Example #2
    def construct(self, text_ds, label_num):
        max_features = 20000
        embedding_dim = 128
        sequence_length = 200

        vectorize_layer = TextVectorization(
            standardize=custom_standardization,
            max_tokens=max_features,
            output_mode="int",
            output_sequence_length=sequence_length,
        )
        vectorize_layer.adapt(text_ds)

        inputs = tf.keras.Input(shape=(1, ), dtype="string")
        indices = vectorize_layer(inputs)

        x = layers.Embedding(max_features + 1, embedding_dim)(indices)
        x = layers.Dropout(0.5)(x)

        # global max pooling
        x = layers.GlobalMaxPooling1D()(x)
        predictions = layers.Dense(label_num,
                                   activation="sigmoid",
                                   name="predictions")(x)
        model = tf.keras.Model(inputs, predictions)
        model.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["accuracy"])

        return model
    def trainWordVectorEncoder(trainText, VOCAB_SIZE=None):
        #https://www.tensorflow.org/tutorials/text/text_classification_rnn
        encoder = (TextVectorization() if VOCAB_SIZE is None
                   else TextVectorization(max_tokens=VOCAB_SIZE))
        encoder.adapt(tf.data.Dataset.from_tensor_slices(trainText))

        return encoder
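A hedged usage sketch for the helper above, assuming it is available as a module-level function; the sample sentences are assumptions:

train_text = ["good movie", "bad movie", "good plot"]
encoder = trainWordVectorEncoder(train_text, VOCAB_SIZE=1000)
print(encoder.get_vocabulary()[:4])    # e.g. ['', '[UNK]', 'good', 'movie'] (frequency order)
print(encoder(["good plot"]).numpy())  # integer token ids for each word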
Example #4
def makePrediction(messages_as_string):
    print("Running prediction function...")
    messages = list(messages_as_string.split('s3cur!tywh@l3'))

    vocab_size = 12612
    sequence_length = 1000

    embedding_layer = tf.keras.layers.Embedding(vocab_size, sequence_length)

    # Use the text vectorization layer to normalize, split, and map strings to
    # integers (with the default standardization, since no custom one is passed here).
    # output_sequence_length is set because the samples are not all the same length.
    vectorizer = TextVectorization(max_tokens=vocab_size,
                                   output_sequence_length=sequence_length)
    text_ds = tf.data.Dataset.from_tensor_slices(messages).batch(32)
    vectorizer.adapt(text_ds)

    path = './assets/models/model.h5'
    print("trying to load model at: " + path)
    model = load_model(path)
    print("I loaded a model")

    string_input = keras.Input(shape=(1, ), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    count = 0
    Vuln = 0
    vulnLengthSum = 0
    nonVuln = 0
    nonVulnLengthSum = 0

    for message in messages:
        count = count + 1
        probabilities = end_to_end_model.predict([[message]])

        if probabilities[0][1] > 0.5:
            vulnLengthSum = vulnLengthSum + len(message)
            # print("length:",len(message))
            # print(message)
            Vuln = Vuln + 1
        if probabilities[0][0] > 0.5:
            nonVulnLengthSum = nonVulnLengthSum + len(message)
            # print("length:",len(message))
            # print(message)
            nonVuln = nonVuln + 1

    vuln = str(Vuln)
    avg_vuln = '0' if vulnLengthSum == 0 else str(vulnLengthSum / Vuln)
    isVuln = 'true' if Vuln > nonVuln else 'false'
    non_vuln = str(nonVuln)
    avg_non_vuln = '0' if nonVulnLengthSum == 0 else str(nonVulnLengthSum /
                                                         nonVuln)
    return_string = vuln + "," + non_vuln + "," + isVuln

    print("Response body: \n" + return_string)

    return return_string
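The end-to-end pattern used above (raw string input -> TextVectorization -> trained classifier) reduced to a self-contained toy; the texts, sizes, and the stand-in model below are assumptions, not the snippet's actual model.h5:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization

texts = ["select * from users", "hello world"]
vectorizer = TextVectorization(max_tokens=100, output_sequence_length=8)
vectorizer.adapt(texts)

# stand-in for the model loaded from model.h5: int token ids -> two class probabilities
core = keras.Sequential([
    keras.Input(shape=(8,), dtype="int64"),
    keras.layers.Embedding(100, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(2, activation="softmax"),
])

string_input = keras.Input(shape=(1,), dtype="string")
probs = core(vectorizer(string_input))
end_to_end = keras.Model(string_input, probs)
print(end_to_end.predict([["hello world"]]))  # shape (1, 2): class probabilities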
def build_model(train_dataset: PrefetchDataset) -> Sequential:
    """
    Initializes a Sequential model and adds text vectorization, word embedding, LSTM, and densely connected layers.
    :param train_dataset: The dataset to adapt the vocabulary on.
    :return: A Sequential object.
    """

    # Initialize the TextVectorization layer which assigns integers to each token
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)

    # Set the vocabulary for the encoding layer. This will be used to initialize a lookup table of word embeddings.
    # The code for this and subsequent layers adapted from:
    # https://www.tensorflow.org/tutorials/text/text_classification_rnn#create_the_text_encoder
    encoder.adapt(train_dataset.map(lambda text, label: text))

    model = Sequential()
    model.add(encoder)
    # Next we add our word embedding layer which converts token indices into dense vectors
    model.add(Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=8, activity_regularizer=l2(0.001),
                        mask_zero=True))
    # Bidirectional wrapper for LSTM allows data to be processed forwards and backwards and then concatenated into
    # one output
    model.add(Bidirectional(LSTM(8)))
    # Densely connected layers with L2 regularization to reduce over-fitting
    model.add(Dense(8, activation="relu", kernel_regularizer=l2(0.001), activity_regularizer=l2(0.001)))
    model.add(Dense(1, activation="sigmoid"))

    return model
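build_model returns an uncompiled model; a hedged sketch of compiling and fitting it. The loss, optimizer, and toy dataset are assumptions in the spirit of the linked tutorial, and the snippet's module-level imports plus VOCAB_SIZE are assumed to be available:

import tensorflow as tf

# toy stand-in for the real PrefetchDataset of (text, label) batches
train_dataset = tf.data.Dataset.from_tensor_slices(
    (["great movie", "terrible plot"], [1, 0])).batch(2).prefetch(tf.data.AUTOTUNE)

model = build_model(train_dataset)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_dataset, epochs=1)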
Example #6
    def get_base_embedder(self, vocab):

        # text-to-index-list layer
        encoder = TextVectorization(
            max_tokens=self.max_tokens,
            output_mode="int",
            output_sequence_length=self.max_sentence_length,
            vocabulary=vocab,
        )

        # embedder model
        # text input layer
        input_layer = Input(shape=(1, ), dtype=tfstring)

        # text to index layer
        vectorize_layer = encoder(input_layer)

        # embedding layer
        embedding_layer = Embedding(input_dim=len(encoder.get_vocabulary()),
                                    output_dim=32,
                                    mask_zero=True)(vectorize_layer)

        # bidirectional lstm layer
        bi_lstm_layer = Bidirectional(
            LSTM(32, name="lstm-layer"),
            name="bidirectional-layer")(embedding_layer)

        # normalization layer
        norm_layer = BatchNormalization()(bi_lstm_layer)

        # final embedding layer
        embedding = Dense(self.embedding_size,
                          name="embedding-layer")(norm_layer)

        return Model(inputs=input_layer, outputs=embedding)
Example #7
def create_text_vectorization_model(
        text_vectorization_filepath: str,
        dataset_all_tokens: tf.data.Dataset) -> tf.keras.models.Sequential:
    """
    create text vectorization model
    this vectorizer converts an array of strings to an array of integers
    """
    if exists(text_vectorization_filepath):
        logger.info('found text vectorization model')
        return tf.keras.models.load_model(text_vectorization_filepath,
                                          compile=False)

    vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                        output_mode='int')
    logger.success('created text vectorization layer')
    # batch the dataset to make it easier to store
    # in memory
    vectorize_layer.adapt(dataset_all_tokens.batch(batch_size))
    logger.success('adapted vectorization to training dataset')

    text_vectorization_model = tf.keras.models.Sequential(
        [tf.keras.Input(shape=(1, ), dtype=tf.string), vectorize_layer])
    # simple text vectorization test
    logger.info(text_vectorization_model.predict(["this is a test"]))
    text_vectorization_model.save(text_vectorization_filepath)
    return text_vectorization_model
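The same persistence trick (wrapping the adapted layer in a small Sequential model so it can be saved and reloaded) as a self-contained sketch; the path and toy data are illustrative:

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

layer = TextVectorization(max_tokens=1000, output_mode='int')
layer.adapt(tf.data.Dataset.from_tensor_slices(["one two", "two three"]).batch(2))

wrapper = tf.keras.Sequential([tf.keras.Input(shape=(1, ), dtype=tf.string), layer])
wrapper.save("text_vectorizer")  # SavedModel directory
restored = tf.keras.models.load_model("text_vectorizer", compile=False)
print(restored.predict([["two three four"]]))  # same ids; 'four' falls back to the OOV index 1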
Example #8
class TextVectorizer(Vectorizer):
    """Text vectorizer contains an instance of TextVectorization from TF."""

    def __init__(self, name, feature_number=10000, length=500):
        Vectorizer.__init__(self, name, feature_number)
        self.length = length

    def vectorize(self, text):
        text = tf.expand_dims(text, -1)
        return self.vectorizer(text)

    def vectorize_set(self, data):
        data = [self.vectorize(t) for t in data]
        return data

    def fit_transform(self, data):
        print(f'Training {self.name}')
        data = np.array(data)
        self.vectorizer = TextVectorization(
            max_tokens=self.feature_number,
            output_mode='int',
            output_sequence_length=self.length)

        self.vectorizer.adapt(data)

        return self.transform(data)
Example #9
class TFVectTokenizer:
    def __init__(self, seqlen, step, freq_threshold):
        self.freq_threshold = freq_threshold
        self.freq_threshold = 0  # note: the constructor argument is immediately overridden with 0
        self.seqlen = seqlen
        self.step = step
        self.vocab_size = 20000
        self.vectorize_layer = TextVectorization(
            standardize=custom_standardization,
            max_tokens=self.vocab_size - 1,
            output_mode="int",
            output_sequence_length=self.seqlen + 1,
        )

    def tokenize(self, text_ds):
        # Adapt the vectorization layer (created in __init__) to the text
        self.vectorize_layer.adapt(text_ds)
        # get_vocabulary() lets us map token indices back to words
        vocab = self.vectorize_layer.get_vocabulary()
        reverse_token_map = {t: i for i, t in enumerate(vocab)}
        return text_ds, vocab, reverse_token_map

    def get_input_sequences(self, text_ds, reverse_token_map):
        text_ds = text_ds.map(
            lambda text: prepare_lm_inputs_labels(text, self.vectorize_layer))
        text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)
        return text_ds
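prepare_lm_inputs_labels is not shown in this snippet; in the Keras text-generation examples it shifts the tokenized sequence by one position so each token is trained to predict the next one. A hedged sketch matching the call site above (not necessarily the author's exact version):

import tensorflow as tf

def prepare_lm_inputs_labels(text, vectorize_layer):
    """Vectorize a batch of strings and build (input, target) pairs for a language model."""
    text = tf.expand_dims(text, -1)
    tokenized = vectorize_layer(text)   # shape (batch, seqlen + 1)
    x = tokenized[:, :-1]               # tokens 0 .. seqlen-1
    y = tokenized[:, 1:]                # tokens 1 .. seqlen  (next-token targets)
    return x, y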
Example #10
    def load_dataset(self):
        raw_train_ds, raw_val_ds, raw_test_ds = self.load_raw_data()
        max_features = 10000
        sequence_length = 250

        vectorize_layer = TextVectorization(
            standardize=self.custom_standardization,
            max_tokens=self.max_tokens,
            output_mode='int',
            output_sequence_length=sequence_length)

        # Make a text-only dataset (without labels), then call adapt
        train_text = raw_train_ds.map(lambda x, y: x)
        vectorize_layer.adapt(train_text)
        self.vocab = vectorize_layer.get_vocabulary()

        train_ds = raw_train_ds.map(
            lambda x, y: (vectorize_layer(tf.expand_dims(x, -1)), y))
        val_ds = raw_val_ds.map(lambda x, y:
                                (vectorize_layer(tf.expand_dims(x, -1)), y))
        test_ds = raw_test_ds.map(lambda x, y:
                                  (vectorize_layer(tf.expand_dims(x, -1)), y))

        train_ds = train_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
        val_ds = val_ds.cache().prefetch(buffer_size=self.AUTOTUNE)
        test_ds = test_ds.cache().prefetch(buffer_size=self.AUTOTUNE)

        return train_ds, val_ds, test_ds
Example #11
def build_vocab(directories, batch_size, vocab_size, maxlen):
    global vectorize_layer

    # Create a list of all files
    filenames = []
    for dir in directories:
        for f in os.listdir(dir):
            filenames.append(os.path.join(dir, f))

    print(f"{len(filenames)} files")

    # Create dataset from text files
    random.shuffle(filenames)
    text_ds = tf.data.TextLineDataset(filenames)
    text_ds = text_ds.shuffle(buffer_size=256)
    text_ds = text_ds.batch(batch_size)

    # Create a vectorization layer and adapt it to the text
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size - 1,
        output_mode="int",
        output_sequence_length=maxlen + 1,
    )
    vectorize_layer.adapt(text_ds)
    vocab = vectorize_layer.get_vocabulary()

    word_to_index = {}
    for index, word in enumerate(vocab):
        word_to_index[word] = index

    text_ds = text_ds.map(prepare_lm_inputs_labels)
    text_ds = text_ds.prefetch(tf.data.experimental.AUTOTUNE)

    return (text_ds, vocab, word_to_index)
def make_model(vector_train,
               max_tokens,
               output_seq_len,
               num_hidden,
               size_hidden,
               hidden_activ='relu',
               output_activ='sigmoid',
               loss='binary_crossentropy',
               optimizer='adam',
               embed=True):

    vectorizer = TextVectorization(max_tokens=max_tokens,
                                   output_sequence_length=output_seq_len)
    vectorizer.adapt(vector_train)
    model = keras.Sequential()
    model.add(layers.Input(shape=(1, ), dtype=tf.string))
    model.add(vectorizer)  #Vectorizer Layer
    if embed:
        model.add(layers.Embedding(max_tokens + 1,
                                   size_hidden))  #Embedded Layer
    for i in range(num_hidden):
        model.add(layers.Dense(size_hidden,
                               activation=hidden_activ))  #hidden layers
    model.add(layers.Dense(1, activation=output_activ))  #output layer
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model
Example #13
 def __init__(self,
              srcLang="eng",
              tgtLang="fra",
              src_vocab_size=20000,
              src_len=200,
              tgt_vocab_size=20000,
              tgt_len=200):
     super(TextPreprocessOld, self).__init__()
     # self.batch_size = batch_size
     self.srcLang = srcLang
     self.tgtLang = tgtLang
     self.src_vocab_size = src_vocab_size
     self.src_len = src_len
     self.tgt_vocab_size = tgt_vocab_size
     self.tgt_len = tgt_len
     self.src_text_vectorizer = TextVectorization(
         standardize=self.custom_standardization,
         max_tokens=self.src_vocab_size,
         output_mode="int",
         output_sequence_length=self.src_len)
     self.tgt_text_vectorizer = TextVectorization(
         standardize=self.custom_standardization,
         max_tokens=self.tgt_vocab_size,
         output_mode="int",
         output_sequence_length=self.tgt_len)
def get_text_vec_model(train_samples):
    # Taken from: https://github.com/mlflow/mlflow/issues/3910

    # pylint: disable=no-name-in-module
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

    VOCAB_SIZE = 10
    SEQUENCE_LENGTH = 16
    EMBEDDING_DIM = 16

    vectorizer_layer = TextVectorization(
        input_shape=(1, ),
        max_tokens=VOCAB_SIZE,
        output_mode="int",
        output_sequence_length=SEQUENCE_LENGTH,
    )
    vectorizer_layer.adapt(train_samples)
    model = tf.keras.Sequential([
        vectorizer_layer,
        tf.keras.layers.Embedding(
            VOCAB_SIZE,
            EMBEDDING_DIM,
            name="embedding",
            mask_zero=True,
            input_shape=(1, ),
        ),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dense(1, activation="tanh"),
    ])
    model.compile(optimizer="adam", loss="mse", metrics="mae")
    return model
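A hedged usage sketch for get_text_vec_model; the toy samples and targets below are assumptions (targets in [-1, 1] to match the tanh output and mse loss):

import numpy as np

train_samples = np.array(["good", "bad", "great", "awful"])
train_targets = np.array([1.0, -1.0, 1.0, -1.0])

model = get_text_vec_model(train_samples)
model.fit(train_samples.reshape(-1, 1), train_targets, epochs=1, verbose=0)
print(model.predict(np.array([["good"]])))  # a single value in (-1, 1)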
def vectorizer(raw_train_ds):
    vectorizer = TextVectorization(standardize=custom_standardization,
                                   max_tokens=max_tokens,
                                   output_sequence_length=sequence_length)
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorizer.adapt(text_ds)
    np.savetxt('voc.out', vectorizer.get_vocabulary(), fmt='%s')
    return vectorizer
Example #16
def initialize_vectorizer_layer(text, pad_length, max_tokens=None):
    # Create vectorizer
    vectorizer = TextVectorization(output_sequence_length=pad_length,
                                   standardize=None,
                                   max_tokens=max_tokens)
    vectorizer.adapt(text)
    vocab = vectorizer.get_vocabulary()
    return vectorizer, vocab
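A hedged usage sketch; with standardize=None the text is only split on whitespace, so any lowercasing or punctuation handling has to happen before adapt(). The sample texts are assumptions:

texts = ["the cat sat", "the dog ran"]
vectorizer, vocab = initialize_vectorizer_layer(texts, pad_length=5)
print(vocab[:3])                # ['', '[UNK]', 'the'], followed by the rarer words
print(vectorizer(texts).shape)  # (2, 5): padded/truncated to pad_length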
Example #17
def create_encoder(list_of_texts):
    """
    Creates encoder that creates a vocabulary based on given list of texts.
    It can be used as a parameter for create_model() function.
    """
    encoder = TextVectorization(max_tokens=NUM_WORDS)
    encoder.adapt(list_of_texts)
    return encoder
Example #18
def runRNN():
    # Assumes you're in the root level of the dataset directory.
    # If you aren't, you'll need to change the relative paths here.
    train_data = prepareData('./train')
    test_data = prepareData('./test')

    for text_batch, label_batch in train_data.take(1):
        print(text_batch.numpy()[0])
        print(label_batch.numpy()[0])  # 0 = negative, 1 = positive

    model = Sequential()

    # ----- 1. INPUT
    # We need this to use the TextVectorization layer next.
    model.add(Input(shape=(1,), dtype="string"))

    # ----- 2. TEXT VECTORIZATION
    # This layer processes the input string and turns it into a sequence of
    # max_len integers, each of which maps to a certain token.
    max_tokens = 1000
    max_len = 100
    vectorize_layer = TextVectorization(
        # Max vocab size. Any words outside of the max_tokens most common ones
        # will be treated the same way: as "out of vocabulary" (OOV) tokens.
        max_tokens=max_tokens,
        # Output integer indices, one per string token
        output_mode="int",
        # Always pad or truncate to exactly this many tokens
        output_sequence_length=max_len,
    )

    # Call adapt(), which fits the TextVectorization layer to our text dataset.
    # This is when the max_tokens most common words (i.e. the vocabulary) are selected.
    train_texts = train_data.map(lambda text, label: text)
    vectorize_layer.adapt(train_texts)

    model.add(vectorize_layer)

    # ----- 3. EMBEDDING
    # This layer turns each integer (representing a token) from the previous layer
    # into an embedding. Note that we're using max_tokens + 1 here, since there's an
    # out-of-vocabulary (OOV) token that gets added to the vocab.
    model.add(Embedding(max_tokens + 1, 128))

    # ----- 4. RECURRENT LAYER
    model.add(LSTM(64))

    # ----- 5. DENSE HIDDEN LAYER
    model.add(Dense(64, activation="relu"))

    # ----- 6. OUTPUT
    model.add(Dense(1, activation="sigmoid"))

    # Compile and train the model.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.fit(train_data, epochs=1)

    model.save_weights('rnn')
Example #19
def get_vectorizer(df_train, df_test):

    # Vectorizes and pads the dataset.
    # Also lowercases text and strips punctuation (the default standardization).
    vectorizer = TextVectorization(max_tokens=7500, output_sequence_length=200)
    text_ds = tf.data.Dataset.from_tensor_slices(df_train['text']).batch(32)
    vectorizer.adapt(text_ds)

    return vectorizer
Example #20
def build_text_layer(raw_vocab):
    vocabulary = tf.data.Dataset.from_tensor_slices(list(raw_vocab))
    embed_layer = TextVectorization(
        max_tokens=100,
        #standardize=custom_standardization,
        output_mode='int',
        output_sequence_length=100)
    embed_layer.adapt(vocabulary.batch(64))
    return embed_layer
Example #21
def main():
    train_dataset, test_dataset = generate_data()
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    encoder = TextVectorization(max_tokens=VOCAB_SIZE)
    encoder.adapt(train_dataset.map(lambda text, label: text))
    vocab = np.array(encoder.get_vocabulary())
    #print(vocab[:20])
    
    LSTM_model(train_dataset, test_dataset, encoder)
Example #22
def create_encoder():
    encoder = TextVectorization(
        max_tokens=len(VOCAB) + 2,  # padding-mask + oov
        output_mode="int",
        output_sequence_length=MAX_SMILE_LEN,
        standardize=None,
        split=None,
    )
    encoder.set_vocabulary(VOCAB)
    return encoder
Example #23
 def __init__(self):
     super().__init__()
     vocab = list(set(open(fname_wordlist).readlines()))
     self.vectorizer = TextVectorization(max_tokens=len(vocab),
                                         output_mode='int')
     self.vectorizer.adapt(vocab)
     self.emb = Embedding(len(vocab) + 1, self.embed_dim)
     self.lstm1 = LSTM(self.lstm_units[0], return_sequences=True)
     self.lstm2 = LSTM(self.lstm_units[1], return_sequences=True)
     self.lstm3 = LSTM(self.lstm_units[2])
     self.out = Dense(self.target_dim, activation=sigmoid)
def reviews_encoding(df, max_features, sequence_length):
    vectorize_layer = TextVectorization(
        standardize=None,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    # vectorize_layer.adapt(np.array(df['commentaire']))
    vectorize_layer.adapt(np.array(df))

    return vectorize_layer
Example #26
    def load_model_components(self, path):
        # TextVectorizer layer
        vocab = pd.read_json(path + '/lstm_model/vectorizer_vocab.json',
                             typ='series')
        vectorizer = TextVectorization(max_tokens=20001,
                                       output_sequence_length=50)
        vectorizer.set_vocabulary(vocab.values)

        # Trained Model
        model = keras.models.load_model(path + '/lstm_model/model_weights.h5')

        return vectorizer, model
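This loader expects a vocabulary that was written to JSON at training time; a hedged sketch of the matching save step (the pandas round-trip and the helper name are assumptions mirroring the code above):

import pandas as pd

def save_vectorizer_vocab(vectorizer, path):
    # counterpart to load_model_components: persist the adapted vocabulary as a JSON series
    pd.Series(vectorizer.get_vocabulary()).to_json(path + '/lstm_model/vectorizer_vocab.json')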
Example #28
    def __init__(self, model_dir):
        # load the artifacts
        self.artifacts = pickle.load(
            open(os.path.join(model_dir, 'model_artifacts.pkl'), 'rb'))

        # create the vectorizers
        train_src = np.load(os.path.join(model_dir, 'train_src.npy'),
                            allow_pickle=True)
        train_tgt = np.load(os.path.join(model_dir, 'train_tgt.npy'),
                            allow_pickle=True)

        vectorizer_src = TextVectorization()
        vectorizer_src.adapt(train_src)
        train_seq = vectorizer_src(train_src)
        self.vectorizer_src = vectorizer_src

        vectorizer_tgt = TextVectorization()
        vectorizer_tgt.adapt(train_tgt)
        self.vectorizer_tgt = vectorizer_tgt

        # load models
        vocab_src = vectorizer_src.get_vocabulary()
        self.encoder = MyEncoder(
            len(vocab_src),
            embedding_dim=self.artifacts['embedding_size'],
            enc_units=self.artifacts['bottleneck_units'],
            batch_size=self.artifacts['batch_size'])

        # call the model first to create the variables
        sample_hidden = self.encoder.initialize_hidden_state()
        sample_output, sample_hidden = self.encoder(
            tf.zeros(
                (self.artifacts['batch_size'], train_seq.numpy().shape[1])),
            sample_hidden)
        self.encoder.load_weights(
            os.path.join(model_dir,
                         f'encoder_weights_e{self.artifacts["epochs"]}.h5'))
        self.encoder.summary()

        vocab_tgt = vectorizer_tgt.get_vocabulary()
        self.decoder = MyDecoder(
            len(vocab_tgt),
            embedding_dim=self.artifacts['embedding_size'],
            dec_units=self.artifacts['bottleneck_units'],
            batch_size=self.artifacts['batch_size'])

        # call the model first to create the variables
        _ = self.decoder(tf.random.uniform((self.artifacts['batch_size'], 1)),
                         sample_hidden, sample_output)
        self.decoder.load_weights(
            os.path.join(model_dir,
                         f'decoder_weights_e{self.artifacts["epochs"]}.h5'))
        self.decoder.summary()
    def __init__(
        self,
        batch_size,
        seed,
        vectorisation_max_features=MAX_FEATURES,
        vectorisation_sequence_length=SEQUENCE_LENGTH,
    ):
        """Constructor for one SentimentAnalyser model.

        Args:
            batch_size (int): size of the batch when loading the dataset,
            seed (int): the seed of the dataset, for reproducibility
        """
        # load training set
        raw_train_ds = keras.preprocessing.text_dataset_from_directory(
            TRAIN_DIR,
            batch_size=batch_size,
            validation_split=0.2,
            subset="training",
            seed=seed,
        )

        # load validation set
        raw_val_ds = keras.preprocessing.text_dataset_from_directory(
            TRAIN_DIR,
            batch_size=batch_size,
            validation_split=0.2,
            subset="validation",
            seed=seed,
        )

        # load test set
        raw_test_ds = keras.preprocessing.text_dataset_from_directory(
            TEST_DIR, batch_size=batch_size
        )

        # layer to standardise, tokenise, vectorise the data
        # creates unique integers for all tokens
        self.vectorise_layer = TextVectorization(
            max_tokens=vectorisation_max_features,
            standardize=self.custom_standardisation,
            output_mode="int",
            output_sequence_length=vectorisation_sequence_length,
        )

        # fit the preprocessing layer to the training data, without labels
        train_text = raw_train_ds.map(lambda x, y: x)
        self.vectorise_layer.adapt(train_text)

        self.train_ds = raw_train_ds.map(self.vectorise_text)
        self.test_ds = raw_test_ds.map(self.vectorise_text)
        self.val_ds = raw_val_ds.map(self.vectorise_text)
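self.vectorise_text is used above but not shown; a plausible standalone equivalent (an assumption, mirroring the TF text-classification tutorial) is:

import tensorflow as tf

def vectorise_text(text, label, vectorise_layer):
    # expand to shape (batch, 1) and map the strings to padded integer sequences
    text = tf.expand_dims(text, -1)
    return vectorise_layer(text), label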
Example #30
def create_vectorize_text(ds):
    vectorize_layer = TextVectorization(max_tokens=max_features,
                                        output_mode='int',
                                        output_sequence_length=sequence_length)

    text_ds = ds.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    return vectorize_text
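A hedged usage sketch; max_features and sequence_length are module-level globals in this snippet, so illustrative values are set here, and the toy dataset is an assumption:

import tensorflow as tf

max_features, sequence_length = 10000, 100

raw_ds = tf.data.Dataset.from_tensor_slices(
    (["good film", "bad film"], [1, 0])).batch(2)
vectorize_text = create_vectorize_text(raw_ds)
train_ds = raw_ds.map(vectorize_text)    # each element becomes (padded int ids, label)
for ids, labels in train_ds.take(1):
    print(ids.shape, labels.numpy())     # (2, 100) [1 0]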