def build(self, hp, inputs=None):
    """Apply a TF-IDF ``TextVectorization`` layer to the first input node.

    The n-gram setting comes from ``self.ngrams`` when it is not ``None``;
    otherwise an integer named ``"ngrams"`` (1 to 2, default 2) is requested
    from ``hp``.

    Args:
        hp: Object exposing ``Int(name, min_value, max_value, default)``.
        inputs: Nested structure of nodes; only the first flattened entry
            is used.

    Returns:
        The result of calling the vectorization layer on that node.
    """
    source_node = nest.flatten(inputs)[0]

    ngram_setting = self.ngrams
    if ngram_setting is None:
        # Not fixed on the instance, so ask `hp` for the value.
        ngram_setting = hp.Int("ngrams", min_value=1, max_value=2, default=2)

    vectorizer = preprocessing.TextVectorization(
        max_tokens=self.max_tokens,
        ngrams=ngram_setting,
        output_mode="tf-idf",
    )
    return vectorizer(source_node)
def build(self, hp, inputs=None):
    """Apply an integer-output ``TextVectorization`` layer to the first input node.

    The output sequence length comes from ``self.output_sequence_length``
    when it is not ``None``; otherwise one of 64, 128, 256 or 512
    (default 64) is requested from ``hp`` under the name
    ``'output_sequence_length'``.

    Args:
        hp: Object exposing ``Choice(name, values, default)``.
        inputs: Nested structure of nodes; only the first flattened entry
            is used.

    Returns:
        The result of calling the vectorization layer on that node.
    """
    source_node = nest.flatten(inputs)[0]

    sequence_length = self.output_sequence_length
    if sequence_length is None:
        # Not fixed on the instance, so ask `hp` for the value.
        sequence_length = hp.Choice(
            'output_sequence_length', [64, 128, 256, 512], default=64)

    vectorizer = preprocessing.TextVectorization(
        max_tokens=self.max_tokens,
        output_mode='int',
        output_sequence_length=sequence_length,
    )
    return vectorizer(source_node)
def build_model(x_train_text, x_train_numeric, **kwargs):
    """Build and compile a Keras model over text and/or numeric features.

    Both preprocessing layers are always adapted to the training data; the
    keyword flags only decide which branch feeds the dense head.

    Args:
        x_train_text: Training text; its ``.values`` are used to adapt the
            text vectorizer (presumably a pandas Series - confirm with caller).
        x_train_numeric: Training numeric features; ``.values`` adapt the
            normalizer and ``.shape[1:]`` defines the numeric input shape.
        **kwargs: ``only_numeric`` (truthy) uses only the numeric branch,
            ``only_text`` (truthy) uses only the text branch; with neither
            set, both branches are concatenated. ``only_numeric`` wins if
            both are given.

    Returns:
        A ``tf.keras.Model`` with a single linear output unit, compiled with
        the adam optimizer, MSE loss and MAPE/MAE metrics.
    """
    vocab_size = 5000
    tokens_per_sample = 100

    # Fit both preprocessing layers on the training data up front.
    text_vectorizer = preprocessing.TextVectorization(
        max_tokens=vocab_size, output_sequence_length=tokens_per_sample)
    text_vectorizer.adapt(x_train_text.values)

    numeric_scaler = preprocessing.Normalization()
    numeric_scaler.adapt(x_train_numeric.values)

    # Text branch: strings -> token ids -> embeddings -> mean over tokens.
    text_input = tf.keras.Input(shape=(None, ), name='text', dtype='string')
    token_ids = text_vectorizer(text_input)
    token_vectors = layers.Embedding(
        input_dim=vocab_size, output_dim=128)(token_ids)
    # An LSTM(128) over the embeddings was tried here and did not improve
    # performance, so the token embeddings are simply averaged.
    text_features = layers.GlobalAveragePooling1D()(token_vectors)

    # Numeric branch: raw features -> normalized features.
    numeric_input = tf.keras.Input(
        shape=x_train_numeric.shape[1:], name='numeric')
    numeric_features = numeric_scaler(numeric_input)

    # Choose which branch(es) feed the dense head.
    if kwargs.get('only_numeric'):
        print('\nBuilding TF model with only numeric data ...')
        inputs, x = numeric_input, numeric_features
    elif kwargs.get('only_text'):
        print('\nBuilding TF model with only text data ...')
        inputs, x = text_input, text_features
    else:
        print('\nBuilding TF model with both numeric and text data ...')
        inputs = [text_input, numeric_input]
        x = layers.concatenate([text_features, numeric_features])
    print('#' * 65)

    # Dense head: dropout -> 256 ReLU units -> dropout -> one linear output.
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    output = layers.Dense(1)(x)

    model = tf.keras.Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='mse', metrics=['mape', 'mae'])
    return model
""" The `adapt()` method takes either a Numpy array or a `tf.data.Dataset` object. In the case of `StringLookup` and `TextVectorization`, you can also pass a list of strings: """ data = [ "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυθοι", "γίγνοντ᾽, οὐδέ τι πάντα τελείεται ἀνθρώποισι.", "δοιαὶ γάρ τε πύλαι ἀμενηνῶν εἰσὶν ὀνείρων:", "αἱ μὲν γὰρ κεράεσσι τετεύχαται, αἱ δ᾽ ἐλέφαντι:", "τῶν οἳ μέν κ᾽ ἔλθωσι διὰ πριστοῦ ἐλέφαντος,", "οἵ ῥ᾽ ἐλεφαίρονται, ἔπε᾽ ἀκράαντα φέροντες:", "οἱ δὲ διὰ ξεστῶν κεράων ἔλθωσι θύραζε,", "οἵ ῥ᾽ ἔτυμα κραίνουσι, βροτῶν ὅτε κέν τις ἴδηται.", ] layer = preprocessing.TextVectorization() layer.adapt(data) vectorized_text = layer(data) print(vectorized_text) """ In addition, adaptable layers always expose an option to directly set state via constructor arguments or weight assignment. If the intended state values are known at layer construction time, or are calculated outside of the `adapt()` call, they can be set without relying on the layer's internal computation. For instance, if external vocabulary files for the `TextVectorization`, `StringLookup`, or `IntegerLookup` layers already exist, those can be loaded directly into the lookup tables by passing a path to the vocabulary file in the layer's constructor arguments. Here's an example where we instantiate a `StringLookup` layer with precomputed vocabulary: """
def build(self, hp, inputs=None):
    """Apply a TF-IDF ``TextVectorization`` layer to the first input node.

    Args:
        hp: Accepted for interface compatibility; not used by this method.
        inputs: Nested structure of nodes; only the first flattened entry
            is used.

    Returns:
        The result of calling the vectorization layer (capped at
        ``self.max_tokens`` tokens) on that node.
    """
    source_node = nest.flatten(inputs)[0]
    vectorizer = preprocessing.TextVectorization(
        max_tokens=self.max_tokens,
        output_mode='tf-idf',
    )
    return vectorizer(source_node)
# NOTE(review): the indented statements below are the tail of a function whose
# `def` line is outside this view - presumably `tf_lower_and_split_punct`,
# the callable passed as `standardize=` further down. Confirm against the
# full file.
    # Apply Unicode NFKD normalization, then lower-case the text.
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    # Keep space, a to z, and select punctuation.
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    # Add spaces around punctuation.
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    # Strip whitespace.
    text = tf.strings.strip(text)
    # Wrap the cleaned text in [START] / [END] marker tokens.
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

# Maximum number of tokens kept by each vectorizer.
max_vocab_size = 5000

# Vectorizer for the input text, using the custom standardization callable.
# `inp` is defined outside this view.
input_text_processor = preprocessing.TextVectorization(
    standardize=tf_lower_and_split_punct, max_tokens=max_vocab_size)
input_text_processor.adapt(inp)
# Here are the first 10 words from the vocabulary:
# (bare expression: the value is only shown in an interactive/notebook session)
input_text_processor.get_vocabulary()[:10]

# Same configuration for the target text. `targ` is defined outside this view.
output_text_processor = preprocessing.TextVectorization(
    standardize=tf_lower_and_split_punct, max_tokens=max_vocab_size)
output_text_processor.adapt(targ)
output_text_processor.get_vocabulary()[:10]

embedding_dim = 256
def create_model(dataset, model_name, filepath):
    """Train a bug-type text classifier and export it with ``tf.saved_model``.

    Reads a labelled CSV, fits a ``TextVectorization`` vocabulary on the
    ``bug_description`` column, one-hot encodes the ``bug_type`` column,
    trains a TextVectorization -> Embedding -> LSTM -> Dropout -> Dense
    network and saves it. Because the vectorizer is part of the model, the
    saved model accepts raw strings and maps them through the vocabulary
    learned here.

    Args:
        dataset: CSV path (or anything ``pd.read_csv`` accepts) containing
            ``bug_description`` and ``bug_type`` columns.
        model_name: Sub-directory name used for the export.
        filepath: Base directory; the model is written to
            ``<filepath>/<model_name>/1/``.

    Returns:
        None.

    Side effects:
        Writes ``multi_input_and_output_model.png`` to the current working
        directory and writes the saved model to disk.
    """
    # Vocabulary cap, shared by the vectorizer and the embedding table so
    # that every token id the vectorizer can emit has an embedding row.
    num_words = 10000
    # Number of classes for prediction outputs.
    num_classes = 3

    frame = pd.read_csv(dataset)
    target_class = frame['bug_type'].values
    bug_desc = frame['bug_description'].values

    # Learn the vocabulary from the training text. Without `max_tokens` the
    # vocabulary is unbounded and ids could exceed the embedding's input_dim.
    text_vectorizer = preprocessing.TextVectorization(
        max_tokens=num_words, output_mode="int")
    text_vectorizer.adapt(bug_desc)

    # One-hot encode the labels. OneHotEncoder needs a 2-D (n_samples, 1)
    # input, hence the added axis; the result is (n_samples, n_label_values).
    enc = OneHotEncoder()
    target_class = enc.fit_transform(target_class[:, np.newaxis]).toarray()

    # The model takes one raw string per example.
    description_input = keras.Input(shape=(1, ),
                                    dtype="string",
                                    name="bug_description")

    # Strings -> token ids, using the vocabulary adapted above.
    description_features = text_vectorizer(description_input)
    # Token ids -> 128-dimensional word vectors.
    description_features = layers.Embedding(
        input_dim=num_words, output_dim=128)(description_features)
    # Sequence of word vectors -> one 64-dimensional summary vector.
    # Background: https://colah.github.io/posts/2015-08-Understanding-LSTMs/
    description_features = layers.LSTM(64)(description_features)
    # Dropout after the LSTM to reduce overfitting; 0.2 was chosen
    # empirically by the original author.
    # https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/
    description_features = layers.Dropout(0.2)(description_features)
    # One score in [0, 1] per class; the highest score is the predicted type.
    # NOTE(review): softmax is the conventional pairing with
    # categorical_crossentropy on one-hot labels, but the original author
    # reported that accuracy did not change during training with softmax,
    # so sigmoid is kept.
    bug_class = layers.Dense(num_classes, name="desc",
                             activation='sigmoid')(description_features)

    # One input and one output; both lists can be extended later.
    model = keras.Model(
        inputs=[description_input],
        outputs=[bug_class],
    )

    # Write a diagram of the architecture to the working directory.
    keras.utils.plot_model(model,
                           "multi_input_and_output_model.png",
                           show_shapes=True)

    # Hold out 20% of the data. The held-out split is not evaluated here.
    X_train, X_test, y_train, y_test = train_test_split(bug_desc,
                                                        target_class,
                                                        test_size=0.2,
                                                        random_state=1)

    # Categorical cross-entropy matches the one-hot encoded labels.
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    # Epoch count and batch size were chosen empirically by the original
    # author, who reports training accuracy near 100% after two epochs.
    model.fit(X_train, y_train, epochs=10, batch_size=64)

    # NOTE(review): the trailing "1" is presumably a model-version directory
    # for the serving setup - confirm against the deployment.
    save_path = os.path.join(filepath, f'{model_name}/1/')
    tf.saved_model.save(model, save_path)