Python TextVectorizationの例、tensorflow.keras.layers.experimental.preprocessing.TextVectorization Pythonの例

コード例 #1

0

ファイルを表示

ファイル: preprocessing.py プロジェクト: weichenlong-boy/autokeras

 def build(self, hp, inputs=None):
     """Apply a TF-IDF ``TextVectorization`` layer to the first input node.

     Args:
         hp: Hyperparameter container; only queried (via ``hp.Int``) when
             ``self.ngrams`` is ``None``.
         inputs: Nested structure of input nodes; only the first flattened
             element is used.

     Returns:
         The result of calling the vectorization layer on that input node.
     """
     first_input = nest.flatten(inputs)[0]

     # Prefer the explicitly configured n-gram size; otherwise let the
     # tuner pick one (declared with min_value=1, max_value=2, default=2).
     ngram_size = self.ngrams
     if ngram_size is None:
         ngram_size = hp.Int("ngrams", min_value=1, max_value=2, default=2)

     vectorizer = preprocessing.TextVectorization(
         max_tokens=self.max_tokens,
         ngrams=ngram_size,
         output_mode="tf-idf",
     )
     return vectorizer(first_input)

コード例 #2

0

ファイルを表示

ファイル: preprocessing.py プロジェクト: yeqingcheng368/autokeras

 def build(self, hp, inputs=None):
     """Apply an integer-index ``TextVectorization`` layer to the first input node.

     Args:
         hp: Hyperparameter container; only queried (via ``hp.Choice``) when
             ``self.output_sequence_length`` is ``None``.
         inputs: Nested structure of input nodes; only the first flattened
             element is used.

     Returns:
         The result of calling the vectorization layer on that input node.
     """
     first_input = nest.flatten(inputs)[0]

     # Use the configured sequence length when given; otherwise let the
     # tuner choose among 64/128/256/512 (default 64).
     seq_len = self.output_sequence_length
     if seq_len is None:
         seq_len = hp.Choice('output_sequence_length',
                             [64, 128, 256, 512], default=64)

     vectorizer = preprocessing.TextVectorization(
         max_tokens=self.max_tokens,
         output_mode='int',
         output_sequence_length=seq_len)
     return vectorizer(first_input)

コード例 #3

0

ファイルを表示

ファイル: simple.py プロジェクト: zebengberg/nine-ninety

def build_model(x_train_text, x_train_numeric, **kwargs):
    """Build and compile a Keras model over text and/or numeric features.

    Both preprocessing layers are always adapted on the training data; the
    keyword flags only decide which branch feeds the dense head.

    Args:
        x_train_text: Training text; its ``.values`` attribute is used to adapt
            the text vectorizer (presumably a pandas Series -- confirm with caller).
        x_train_numeric: Training numeric features; ``.values`` adapts the
            normalizer and ``.shape[1:]`` defines the numeric input shape.
        **kwargs: Optional flags. A truthy ``only_numeric`` builds a
            numeric-only model; otherwise a truthy ``only_text`` builds a
            text-only model; otherwise both branches are concatenated.

    Returns:
        A compiled ``tf.keras.Model`` with a single-unit output, using the
        'adam' optimizer, 'mse' loss and 'mape'/'mae' metrics.
    """
    vocab_size = 5000
    tokens_per_sample = 100

    # Fit the preprocessing layers on the training data up front.
    text_vectorizer = preprocessing.TextVectorization(
        max_tokens=vocab_size, output_sequence_length=tokens_per_sample)
    text_vectorizer.adapt(x_train_text.values)

    numeric_scaler = preprocessing.Normalization()
    numeric_scaler.adapt(x_train_numeric.values)

    # Text branch: raw strings -> token ids -> embeddings -> average pooling.
    text_input = tf.keras.Input(shape=(None, ), name='text', dtype='string')
    token_ids = text_vectorizer(text_input)
    token_vectors = layers.Embedding(input_dim=vocab_size,
                                     output_dim=128)(token_ids)
    # NOTE: an LSTM(128) was tried at this point and did not improve
    # performance, so the embeddings are simply averaged.
    text_features = layers.GlobalAveragePooling1D()(token_vectors)

    # Numeric branch: normalized raw features.
    numeric_input = tf.keras.Input(shape=x_train_numeric.shape[1:],
                                   name='numeric')
    numeric_features = numeric_scaler(numeric_input)

    # Select which branch(es) feed the dense head.
    if kwargs.get('only_numeric'):
        print('\nBuilding TF model with only numeric data ...')
        inputs, x = numeric_input, numeric_features
    elif kwargs.get('only_text'):
        print('\nBuilding TF model with only text data ...')
        inputs, x = text_input, text_features
    else:
        print('\nBuilding TF model with both numeric and text data ...')
        inputs = [text_input, numeric_input]
        x = layers.concatenate([text_features, numeric_features])
    print('#' * 65)

    # Dense head: dropout -> hidden layer -> dropout -> single output unit.
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    prediction = layers.Dense(1)(x)

    model = tf.keras.Model(inputs=inputs, outputs=prediction)
    model.compile(optimizer='adam', loss='mse', metrics=['mape', 'mae'])
    return model

コード例 #4

0

ファイルを表示

"""
The `adapt()` method takes either a Numpy array or a `tf.data.Dataset` object. In the
case of `StringLookup` and `TextVectorization`, you can also pass a list of strings:
"""

# Sample corpus: eight raw text lines held in a plain Python list of strings.
data = [
    "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυθοι",
    "γίγνοντ᾽, οὐδέ τι πάντα τελείεται ἀνθρώποισι.",
    "δοιαὶ γάρ τε πύλαι ἀμενηνῶν εἰσὶν ὀνείρων:",
    "αἱ μὲν γὰρ κεράεσσι τετεύχαται, αἱ δ᾽ ἐλέφαντι:",
    "τῶν οἳ μέν κ᾽ ἔλθωσι διὰ πριστοῦ ἐλέφαντος,",
    "οἵ ῥ᾽ ἐλεφαίρονται, ἔπε᾽ ἀκράαντα φέροντες:",
    "οἱ δὲ διὰ ξεστῶν κεράων ἔλθωσι θύραζε,",
    "οἵ ῥ᾽ ἔτυμα κραίνουσι, βροτῶν ὅτε κέν τις ἴδηται.",
]
# Layer created with no constructor arguments, i.e. its default configuration.
layer = preprocessing.TextVectorization()
# Adapt the layer directly on the list of strings (no array/Dataset wrapping).
layer.adapt(data)
# Run the adapted layer on the same strings and show the result.
vectorized_text = layer(data)
print(vectorized_text)
"""
In addition, adaptable layers always expose an option to directly set state via
constructor arguments or weight assignment. If the intended state values are known at
layer construction time, or are calculated outside of the `adapt()` call, they can be set
without relying on the layer's internal computation. For instance, if external vocabulary
files for the `TextVectorization`, `StringLookup`, or `IntegerLookup` layers already
exist, those can be loaded directly into the lookup tables by passing a path to the
vocabulary file in the layer's constructor arguments.

Here's an example where we instantiate a `StringLookup` layer with precomputed vocabulary:
"""

コード例 #5

0

ファイルを表示

 def build(self, hp, inputs=None):
     """Apply a TF-IDF ``TextVectorization`` layer to the first input node.

     Args:
         hp: Hyperparameter container; not used by this method.
         inputs: Nested structure of input nodes; only the first flattened
             element is used.

     Returns:
         The result of calling the vectorization layer on that input node.
     """
     first_input = nest.flatten(inputs)[0]
     vectorizer = preprocessing.TextVectorization(
         max_tokens=self.max_tokens,
         output_mode='tf-idf',
     )
     return vectorizer(first_input)

コード例 #6

0

ファイルを表示

ファイル: demo.py プロジェクト: EHowardHill/language-studies

  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # Strip whitespace.
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

# Vocabulary cap passed as `max_tokens` to both text processors below.
max_vocab_size = 5000

# Vectorizer for the input-side text; `tf_lower_and_split_punct` is supplied
# as the custom `standardize` callable.
input_text_processor = preprocessing.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size)

# Adapt on `inp`, which is defined outside this excerpt.
input_text_processor.adapt(inp)

# Here are the first 10 words from the vocabulary:
# NOTE(review): bare expression -- its value is discarded unless this runs in
# an interactive/notebook cell.
input_text_processor.get_vocabulary()[:10]

# Vectorizer for the target-side text, configured identically to the input one.
output_text_processor = preprocessing.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=max_vocab_size)

# Adapt on `targ` (defined outside this excerpt) and peek at its first
# 10 vocabulary entries; same bare-expression caveat as above.
output_text_processor.adapt(targ)
output_text_processor.get_vocabulary()[:10]

# Not used within this excerpt; presumably the embedding width for a model
# defined later -- confirm against the rest of the file.
embedding_dim = 256

コード例 #7

0

ファイルを表示

def create_model(dataset, model_name, filepath):
    """Train a bug-type text classifier from a labelled CSV and export it.

    The CSV must contain a ``bug_description`` column (raw text) and a
    ``bug_type`` column (class label). The text vectorization layer is part
    of the model, so the exported model accepts raw strings and encodes them
    with the vocabulary learned here.

    Side effects: writes ``multi_input_and_output_model.png`` (architecture
    plot) to the current working directory and saves the trained model to
    ``<filepath>/<model_name>/1/``.

    Args:
        dataset: Path (or any source accepted by ``pd.read_csv``) of the
            labelled CSV.
        model_name: Name of the model sub-directory created under ``filepath``.
        filepath: Base directory for the SavedModel export.

    Returns:
        None.
    """
    frame = pd.read_csv(dataset)
    labels = frame['bug_type'].values
    bug_desc = frame['bug_description'].values

    # Upper bound on the vocabulary size. The SAME value must be used for the
    # vectorizer's `max_tokens` and the embedding's `input_dim`: without the
    # cap, a corpus with more than `num_words` distinct tokens would produce
    # token ids outside the embedding table.
    num_words = 10000

    text_vectorizer = preprocessing.TextVectorization(
        max_tokens=num_words, output_mode="int")
    text_vectorizer.adapt(bug_desc)

    # One-hot encode the labels. The encoder expects a 2-D input, hence the
    # added axis: shape (n,) -> (n, 1). Each row of the result has a single 1
    # in the column of its class.
    enc = OneHotEncoder()
    target_class = enc.fit_transform(labels[:, np.newaxis]).toarray()

    # Derive the output width from the encoded labels so the model always
    # matches the number of classes actually present in the data.
    num_classes = target_class.shape[1]

    # One raw string per example; shape=(1,) is a one-element tuple.
    description_input = keras.Input(shape=(1, ),
                                    dtype="string",
                                    name="bug_description")

    # Vectorizing inside the model guarantees that strings sent at serving
    # time are mapped to ids using the model's own vocabulary.
    description_features = text_vectorizer(description_input)

    # Token ids -> dense word vectors.
    description_features = layers.Embedding(
        input_dim=num_words, output_dim=128)(description_features)

    # Sequence model over the word vectors.
    # Background: https://colah.github.io/posts/2015-08-Understanding-LSTMs/
    description_features = layers.LSTM(64)(description_features)

    # Dropout after the LSTM to reduce overfitting; 0.2 was chosen empirically.
    # https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/
    description_features = layers.Dropout(0.2)(description_features)

    # One score in [0, 1] per class; the highest score is the predicted type.
    # Sigmoid is kept deliberately: the original author reported that softmax
    # did not train for this model.
    bug_class = layers.Dense(num_classes, name="desc",
                             activation='sigmoid')(description_features)

    model = keras.Model(
        inputs=[description_input],
        outputs=[bug_class]
    )

    # Write a diagram of the architecture next to the working directory.
    keras.utils.plot_model(model,
                           "multi_input_and_output_model.png",
                           show_shapes=True)

    # Hold out 20% of the rows (fixed seed for reproducibility). The held-out
    # part is not evaluated here; only the training part is used below.
    X_train, _, y_train, _ = train_test_split(bug_desc,
                                              target_class,
                                              test_size=0.2,
                                              random_state=1)

    # Categorical cross-entropy matches the one-hot encoded label matrix.
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    # Epoch count and batch size were tuned empirically by the original author.
    model.fit(X_train, y_train, epochs=10, batch_size=64)

    # The trailing "1" is the version sub-directory of the export.
    save_path = os.path.join(filepath, f'{model_name}/1/')
    tf.saved_model.save(model, save_path)