def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.3,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # Create model instance.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('rotten_tomatoes_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
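# `vectorize_data.sequence_vectorize` is imported from elsewhere and not shown
# in this file. The sketch below is a minimal, hypothetical reconstruction
# consistent with how it is called above (it must return padded index arrays
# for train/validation plus the tokenizer's word_index). The Keras
# preprocessing utilities and the MAX_SEQUENCE_LENGTH cap are assumptions,
# not the author's confirmed implementation.
MAX_SEQUENCE_LENGTH = 500  # Assumed cap on padded sequence length.


def _sequence_vectorize_sketch(train_texts, val_texts, top_k=TOP_K):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k)
    tokenizer.fit_on_texts(train_texts)

    # Map each text to a list of word indices.
    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)

    # Pad every sequence to the length of the longest training text,
    # capped at MAX_SEQUENCE_LENGTH.
    max_length = min(len(max(x_train, key=len)), MAX_SEQUENCE_LENGTH)
    x_train = tf.keras.preprocessing.sequence.pad_sequences(
        x_train, maxlen=max_length)
    x_val = tf.keras.preprocessing.sequence.pad_sequences(
        x_val, maxlen=max_length)
    return x_train, x_val, tokenizer.word_index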
def checkLabels(train_labels, test_labels):
    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in test_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))
    return num_classes
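# Illustrative only: `checkLabels` packages the label-range validation that
# the train_* functions in this file repeat inline, so each inline block
# could be reduced to a single call, as in this hypothetical demo:
def _check_labels_demo(data):
    (train_texts, train_labels), (val_texts, val_labels) = data
    return checkLabels(train_labels, val_labels)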
def train_mlp_model(data,
                    learning_rate=1e-3,
                    epochs=1000,
                    batch_size=128,
                    layers=2,
                    units=64,
                    dropout_rate=0.3):
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [i for i in val_labels if i not in range(num_classes)]
    if len(unexpected_labels) > 0:
        raise ValueError(
            'Unexpected label values found in validation set: '
            '{unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val = vectorize_data.tfidf_vectorize(train_texts, train_labels,
                                                    val_texts)

    # Create model instance.
    model = build_model.mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
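# `vectorize_data.tfidf_vectorize` is not shown in this file. The sketch
# below is a hypothetical reconstruction based on a common TF-IDF plus
# feature-selection recipe that matches the call signature above (the
# training labels drive the SelectKBest scoring); all parameter values are
# assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif


def _tfidf_vectorize_sketch(train_texts, train_labels, val_texts,
                            top_k=20000):
    # Unigrams and bigrams, ignoring terms seen in fewer than 2 documents.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
    x_train = vectorizer.fit_transform(train_texts)
    x_val = vectorizer.transform(val_texts)

    # Keep the top_k features most correlated with the training labels.
    selector = SelectKBest(f_classif, k=min(top_k, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val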
def treinar():  # Portuguese for 'train'.
    data, labels = load_data.load_text_dataset('/opt/mearin')
    (train_texts, train_labels), (test_texts, test_labels) = data

    # Print basic corpus statistics.
    num_classes = explore_data.get_num_classes(train_labels)
    print(num_classes)
    num_words_per_sample = explore_data.get_num_words_per_sample(train_texts)
    print(len(train_labels))
    print(num_words_per_sample)
    # Ratio of number of samples to words per sample.
    print(len(train_labels) / num_words_per_sample)

    # print(train_texts[0])
    # explore_data.plot_frequency_distribution_of_ngrams(train_texts)
    # explore_data.plot_sample_length_distribution(train_texts)
    # explore_data.plot_class_distribution(train_labels)

    return train_ngram_model(data), labels, len(train_labels)
def train_embed_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.5):
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data
    num_classes = explore_data.get_num_classes(train_labels)

    # Create model instance. The model consumes raw text strings directly.
    model = build_model.embedding_model(layers, units, num_classes,
                                        dropout_rate)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Early-stopping callback (currently disabled in the fit() call below).
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Build tf.data pipelines over the raw texts and integer labels.
    training_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(train_texts, tf.string), tf.cast(train_labels, tf.int32)))
    validation_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(val_texts, tf.string), tf.cast(val_labels, tf.int32)))

    # Train and validate model.
    history = model.fit(
        training_dataset.shuffle(1000).batch(batch_size),
        epochs=epochs,
        # callbacks=callbacks,
        validation_data=validation_dataset.batch(batch_size),
        verbose=1)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # return history['val_acc'][-1], history['val_loss'][-1]
    return history
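# `build_model.embedding_model` is not shown in this file. Since the fit()
# call above feeds raw tf.string tensors, one plausible implementation wraps
# a TensorFlow Hub sentence-embedding layer; the hub module handle below is
# an illustrative assumption, not the author's confirmed choice.
import tensorflow_hub as hub


def _embedding_model_sketch(layers, units, num_classes, dropout_rate):
    op_units = 1 if num_classes == 2 else num_classes
    op_activation = 'sigmoid' if num_classes == 2 else 'softmax'

    model = tf.keras.Sequential()
    # Maps each raw string to a fixed-size embedding vector.
    model.add(hub.KerasLayer('https://tfhub.dev/google/nnlm-en-dim50/2',
                             input_shape=[], dtype=tf.string, trainable=True))
    for _ in range(layers - 1):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(op_units, activation=op_activation))
    return model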
def train_fine_tuned_sequence_model(data,
                                    embedding_data_dir,
                                    learning_rate=1e-3,
                                    epochs=1000,
                                    batch_size=128,
                                    blocks=2,
                                    filters=64,
                                    dropout_rate=0.2,
                                    embedding_dim=200,
                                    kernel_size=3,
                                    pool_size=3):
    """Trains sequence model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        embedding_data_dir: string, path to the pre-training embeddings.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the
            model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling
            layer.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    embedding_matrix = _get_embedding_matrix(
        word_index, embedding_data_dir, embedding_dim)

    # Create model instance. In the first phase we train the rest of the
    # network while keeping the embedding layer weights frozen, so we set
    # is_embedding_trainable to False.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features,
                                     use_pretrained_embedding=True,
                                     is_embedding_trainable=False,
                                     embedding_matrix=embedding_matrix)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Train and validate model.
    model.fit(x_train,
              train_labels,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(x_val, val_labels),
              verbose=2,  # Logs once per epoch.
              batch_size=batch_size)

    # Save the first-phase weights.
    model.save_weights('sequence_model_with_pre_trained_embedding.h5')

    # Create another model instance. This time we unfreeze the embedding
    # layer and let it fine-tune to the given dataset.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features,
                                     use_pretrained_embedding=True,
                                     is_embedding_trainable=True,
                                     embedding_matrix=embedding_matrix)

    # Compile model with learning parameters.
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Load the weights saved in the first phase into this new model.
    model.load_weights('sequence_model_with_pre_trained_embedding.h5')

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('tweet_weather_sepcnn_fine_tuned_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
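# `_get_embedding_matrix` is referenced above but not defined in this file.
# Below is a minimal sketch of one common implementation, assuming
# GloVe-style text files (one token and its vector per line); the file name
# pattern is a hypothetical placeholder, not the author's confirmed format.
import os

import numpy as np


def _get_embedding_matrix_sketch(word_index, embedding_data_dir,
                                 embedding_dim):
    # Parse the embedding file into a {token: vector} lookup.
    embeddings = {}
    path = os.path.join(embedding_data_dir,
                        'glove.6B.{}d.txt'.format(embedding_dim))
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    # Row i holds the vector for the word with index i; index 0 is reserved,
    # and words without a pre-trained vector stay all-zero.
    num_features = min(len(word_index) + 1, TOP_K)
    embedding_matrix = np.zeros((num_features, embedding_dim))
    for word, i in word_index.items():
        if i >= num_features:
            continue
        vector = embeddings.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix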
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuple of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of `Dense` layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val = vectorize_data.ngram_vectorize(train_texts, train_labels,
                                                    val_texts)

    # Create model instance.
    model = build_model.mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss
    # does not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('IMDb_mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
def batch_train_sequence_model(data,
                               learning_rate=1e-3,
                               epochs=1000,
                               batch_size=128,
                               blocks=2,
                               filters=64,
                               dropout_rate=0.2,
                               embedding_dim=200,
                               kernel_size=3,
                               pool_size=3):
    """Trains sequence model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the
            model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling
            layer.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # Create model instance.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Create training and validation generators.
    training_generator = _data_generator(x_train, train_labels, num_features,
                                         batch_size)
    validation_generator = _data_generator(x_val, val_labels, num_features,
                                           batch_size)

    # Get number of training steps. This indicates the number of steps it
    # takes to cover all samples in one epoch.
    steps_per_epoch = x_train.shape[0] // batch_size
    if x_train.shape[0] % batch_size:
        steps_per_epoch += 1

    # Get number of validation steps.
    validation_steps = x_val.shape[0] // batch_size
    if x_val.shape[0] % batch_size:
        validation_steps += 1

    # Train and validate model.
    history = model.fit_generator(generator=training_generator,
                                  steps_per_epoch=steps_per_epoch,
                                  validation_data=validation_generator,
                                  validation_steps=validation_steps,
                                  callbacks=callbacks,
                                  epochs=epochs,
                                  verbose=2)  # Logs once per epoch.

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('amazon_reviews_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
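# `_data_generator` is referenced above but not defined in this file. A
# minimal sketch consistent with its call sites follows; it loops forever
# because fit_generator draws exactly steps_per_epoch batches per epoch.
# (`num_features` is kept only to match the signature used above.)
def _data_generator_sketch(x, y, num_features, batch_size):
    num_samples = x.shape[0]
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            yield x[start:end], y[start:end]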
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training an n-gram
    # model.
    data, labels = load_data.load_text_dataset('/media/eucassio/dados1/swap/')
    (train_texts, train_labels), (test_texts, test_labels) = data

    # Print basic corpus statistics.
    num_classes = explore_data.get_num_classes(train_labels)
    print(num_classes)
    num_words_per_sample = explore_data.get_num_words_per_sample(train_texts)
    print(len(train_labels))
    print(num_words_per_sample)
    print(len(train_labels) / num_words_per_sample)

    # print(train_texts[0])
    # explore_data.plot_frequency_distribution_of_ngrams(train_texts)
    # explore_data.plot_sample_length_distribution(train_texts)
    # explore_data.plot_class_distribution(train_labels)

    train_ngram_model(data)
def train_conv_model(data,
                     learning_rate=LEARNING_RATE,
                     epochs=EPOCHS,
                     batch_size=BATCH_SIZE,
                     layers=LAYERS,
                     dropout_rate=DROPOUT_RATE):
    """Trains sepCNN (sequence) model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of sepCNN/pooling blocks in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    (train_texts, train_labels), (test_texts, test_labels) = data

    # Verify that test labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in test_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Hold out the last quarter of the training data as a validation set.
    total_len = len(train_labels)
    train_len = int(total_len * 3 / 4)
    val_texts = train_texts[train_len:]
    val_labels = train_labels[train_len:]
    train_texts = train_texts[:train_len]
    train_labels = train_labels[:train_len]

    # Vectorize texts.
    x_train, x_val, x_test, word_index = sequence.vectorize(
        train_texts, val_texts, test_texts)

    # Create model instance.
    model = build_model.sepcnn_model(
        blocks=layers,
        filters=3,
        kernel_size=KERNEL_SIZE,
        embedding_dim=128,
        dropout_rate=dropout_rate,
        pool_size=POOL_SIZE,
        input_shape=x_train.shape[1:],  # shape is (rows, columns); [1:] keeps (columns,).
        num_classes=num_classes,
        num_features=sequence.TOP_K)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in ten consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    ]

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    history = history.history
    print_history(history)

    # Evaluate on the held-out test set.
    accuracy = model.evaluate(x_test, test_labels, batch_size=batch_size)
    print("\n%s : %.2f%%" % (model.metrics_names[1], accuracy[1] * 100))

    plot_history(history)
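# `print_history` and `plot_history` are not defined in this file. Minimal
# sketches follow, assuming matplotlib and the 'acc'/'val_acc' history keys
# used elsewhere in this file.
import matplotlib.pyplot as plt


def print_history_sketch(history):
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))


def plot_history_sketch(history):
    epochs_range = range(1, len(history['acc']) + 1)
    plt.plot(epochs_range, history['acc'], label='train acc')
    plt.plot(epochs_range, history['val_acc'], label='val acc')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()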
# Balance the training set: drop classes 0 and 4, keep every class-1 sample,
# and cap classes 2 and 3 at the class-1 count. (The loop header and counter
# initialisation below are reconstructed by symmetry with the identical
# validation-set block that follows.)
count = 0
for i in train_labels:
    if i == 1:
        count += 1
c2 = 0
c3 = 0
train_labels_final = []
train_texts_final = []
for i in range(len(train_labels)):
    if train_labels[i] == 0 or train_labels[i] == 4:
        continue
    elif train_labels[i] == 1:
        train_labels_final.append(1)
        train_texts_final.append(train_texts[i])
    elif train_labels[i] == 2 and c2 < count:
        c2 += 1
        train_labels_final.append(2)
        train_texts_final.append(train_texts[i])
    elif train_labels[i] == 3 and c3 < count:
        c3 += 1
        train_labels_final.append(3)
        train_texts_final.append(train_texts[i])

# Shift labels so they start at 0.
train_labels_final = [x - 1 for x in train_labels_final]

# Explore the rebalanced training data.
explore_data.get_num_classes(train_labels_final)
explore_data.get_num_words_per_sample(train_texts_final)
explore_data.plot_frequency_distribution_of_ngrams(train_texts_final)
explore_data.plot_sample_length_distribution(train_texts_final)
explore_data.plot_class_distribution(train_labels_final)

# Repeat the same rebalancing for the validation set.
count = 0
for i in validation_labels:
    if i == 1:
        count += 1
c2 = 0
c3 = 0
validation_labels_final = []
validation_texts_final = []
for i in range(len(validation_labels)):
    if validation_labels[i] == 0 or validation_labels[i] == 4:
        continue