def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.3,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
    (train_texts, train_labels), (val_texts, val_labels) = data
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)
    num_features = min(len(word_index) + 1, TOP_K)

    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,
                        batch_size=batch_size)

    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    model.save('rotten_tomatoes_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
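
# build_model.sepcnn_model is called throughout these examples but its body is
# not shown. Purely as orientation, here is a minimal sketch of such a builder
# (Embedding -> stacked SeparableConv1D blocks -> pooling -> Dense output); the
# layer choices and initializers are assumptions, not the exact helper above.
import tensorflow as tf

def sepcnn_model(blocks, filters, kernel_size, embedding_dim, dropout_rate,
                 pool_size, input_shape, num_classes, num_features,
                 use_pretrained_embedding=False, is_embedding_trainable=False,
                 embedding_matrix=None):
    # Binary problems end in a single sigmoid unit, multi-class in a softmax.
    op_units, op_activation = ((1, 'sigmoid') if num_classes == 2
                               else (num_classes, 'softmax'))
    model = tf.keras.Sequential()
    if use_pretrained_embedding:
        model.add(tf.keras.layers.Embedding(num_features, embedding_dim,
                                            input_length=input_shape[0],
                                            weights=[embedding_matrix],
                                            trainable=is_embedding_trainable))
    else:
        model.add(tf.keras.layers.Embedding(num_features, embedding_dim,
                                            input_length=input_shape[0]))
    for _ in range(blocks - 1):
        model.add(tf.keras.layers.Dropout(dropout_rate))
        model.add(tf.keras.layers.SeparableConv1D(filters, kernel_size,
                                                  activation='relu',
                                                  padding='same'))
        model.add(tf.keras.layers.SeparableConv1D(filters, kernel_size,
                                                  activation='relu',
                                                  padding='same'))
        model.add(tf.keras.layers.MaxPooling1D(pool_size))
    # Final block widens the filters and pools globally before the classifier.
    model.add(tf.keras.layers.SeparableConv1D(filters * 2, kernel_size,
                                              activation='relu',
                                              padding='same'))
    model.add(tf.keras.layers.SeparableConv1D(filters * 2, kernel_size,
                                              activation='relu',
                                              padding='same'))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(op_units, activation=op_activation))
    return model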
Example #2
def checkLabels(train_labels, test_labels):
    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in test_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))
    return num_classes
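
# A minimal usage sketch for checkLabels; the label values are made up and
# assume explore_data.get_num_classes returns max(label) + 1.
num_classes = checkLabels(train_labels=[0, 1, 2, 1], test_labels=[0, 2, 1])  # -> 3
# checkLabels([0, 1, 2], [0, 3]) would raise ValueError (label 3 unseen in training).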
Example #3
def train_mlp_model(data,
                    learning_rate=1e-3,
                    epochs=1000,
                    batch_size=128,
                    layers=2,
                    units=64,
                    dropout_rate=0.3):
    (train_texts, train_labels), (val_texts, val_labels) = data

    num_classes = explore_data.get_num_classes(train_labels)

    unexpected_labels = [i for i in val_labels if i not in range(num_classes)]
    if len(unexpected_labels) > 0:
        raise ValueError(
            'Unexpected label values found in validation set: '
            '{unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    x_train, x_val = vectorize_data.tfidf_vectorize(train_texts, train_labels,
                                                    val_texts)

    model = build_model.mlp_model(layers=layers,
                                  units=units,
                                  dropout_rate=dropout_rate,
                                  input_shape=x_train.shape[1:],
                                  num_classes=num_classes)

    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,
                        batch_size=batch_size)

    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    model.save('mlp_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
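
# build_model.mlp_model is referenced above but not shown. A minimal sketch of
# a comparable builder (Dropout-regularized Dense stack; the layer choices are
# assumptions, not the original helper) could look like:
import tensorflow as tf

def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    op_units, op_activation = ((1, 'sigmoid') if num_classes == 2
                               else (num_classes, 'softmax'))
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dropout(dropout_rate, input_shape=input_shape))
    for _ in range(layers - 1):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(op_units, activation=op_activation))
    return model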
Example #4
def treinar():
    data, labels = load_data.load_text_dataset('/opt/mearin')
    (train_texts, train_labels), (test_texts, test_labels) = data
    num_classes = explore_data.get_num_classes(train_labels)
    print(num_classes)

    num_words_persample = explore_data.get_num_words_per_sample(train_texts)
    print(len(train_labels))
    print(num_words_persample)

    print(len(train_labels) / num_words_persample)
    #print(train_texts[0])
    #explore_data.plot_frequency_distribution_of_ngrams(train_texts)
    #explore_data.plot_sample_length_distribution(train_texts)
    #explore_data.plot_class_distribution(train_labels )
    return train_ngram_model(data), labels, len(train_labels)
Example #5
def train_embed_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.5):
    (train_texts, train_labels), (val_texts, val_labels) = data

    num_classes = explore_data.get_num_classes(train_labels)

    model = build_model.embedding_model(layers, units, num_classes,
                                        dropout_rate)

    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    training_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(train_texts, tf.string), tf.cast(train_labels, tf.int32)))

    validation_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(val_texts, tf.string), tf.cast(val_labels, tf.int32)))

    history = model.fit(
        training_dataset.shuffle(1000).batch(batch_size),
        epochs=epochs,
        #                         callbacks=callbacks,
        validation_data=validation_dataset.batch(batch_size),
        verbose=1)

    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    #     return history['val_acc'][-1], history['val_loss'][-1]
    return history
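
# build_model.embedding_model is not shown. Since train_embed_model feeds the
# model raw string tensors, one plausible, purely illustrative sketch maps
# strings to sentence embeddings with a pre-trained TF Hub module; the module
# URL, sizes, and activations below are assumptions, not the original helper.
import tensorflow as tf
import tensorflow_hub as hub

def embedding_model(layers, units, num_classes, dropout_rate):
    op_units, op_activation = ((1, 'sigmoid') if num_classes == 2
                               else (num_classes, 'softmax'))
    model = tf.keras.Sequential()
    # Maps raw string tensors to fixed-size sentence embeddings.
    model.add(hub.KerasLayer('https://tfhub.dev/google/nnlm-en-dim50/2',
                             input_shape=[], dtype=tf.string, trainable=False))
    for _ in range(layers):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(op_units, activation=op_activation))
    return model
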
def train_fine_tuned_sequence_model(data,
                                    embedding_data_dir,
                                    learning_rate=1e-3,
                                    epochs=1000,
                                    batch_size=128,
                                    blocks=2,
                                    filters=64,
                                    dropout_rate=0.2,
                                    embedding_dim=200,
                                    kernel_size=3,
                                    pool_size=3):
    """Trains sequence model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        embedding_data_dir: string, path to the pre-training embeddings.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
            train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    embedding_matrix = _get_embedding_matrix(
        word_index, embedding_data_dir, embedding_dim)

    # Create model instance. First time we will train rest of network while
    # keeping embedding layer weights frozen. So, we set
    # is_embedding_trainable as False.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features,
                                     use_pretrained_embedding=True,
                                     is_embedding_trainable=False,
                                     embedding_matrix=embedding_matrix)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    # Train and validate model.
    model.fit(x_train,
              train_labels,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(x_val, val_labels),
              verbose=2,  # Logs once per epoch.
              batch_size=batch_size)

    # Save the model.
    model.save_weights('sequence_model_with_pre_trained_embedding.h5')

    # Create another model instance. This time we will unfreeze the embedding
    # layer and let it fine-tune to the given dataset.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features,
                                     use_pretrained_embedding=True,
                                     is_embedding_trainable=True,
                                     embedding_matrix=embedding_matrix)

    # Compile model with learning parameters.
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Load the weights that we had saved into this new model.
    model.load_weights('sequence_model_with_pre_trained_embedding.h5')

    # Train and validate model.
    history = model.fit(x_train,
                        train_labels,
                        epochs=epochs,
                        callbacks=callbacks,
                        validation_data=(x_val, val_labels),
                        verbose=2,  # Logs once per epoch.
                        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('tweet_weather_sepcnn_fine_tuned_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
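
# _get_embedding_matrix is called above but not shown. A minimal sketch that
# loads GloVe-style text vectors into a matrix indexed by word_index; the file
# name pattern and directory layout are assumptions, not the original helper.
import os
import numpy as np

def _get_embedding_matrix(word_index, embedding_data_dir, embedding_dim):
    # Read pre-trained vectors (one word followed by its vector per line).
    embeddings = {}
    path = os.path.join(embedding_data_dir,
                        'glove.6B.{}d.txt'.format(embedding_dim))
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    # Rows are indexed by the tokenizer's word_index; index 0 is reserved.
    num_features = min(len(word_index) + 1, TOP_K)
    embedding_matrix = np.zeros((num_features, embedding_dim))
    for word, index in word_index.items():
        if index >= num_features:
            continue
        vector = embeddings.get(word)
        if vector is not None:
            embedding_matrix[index] = vector
    return embedding_matrix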
Example #7
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.2):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuple of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of 'Dense' layers in the model.
        units: int, output dimension of Dense layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val = vectorize_data.ngram_vectorize(train_texts, train_labels,
                                                    val_texts)

    # Create model instance.
    model = mlp_model(layers=layers,
                      units=units,
                      dropout_rate=dropout_rate,
                      input_shape=x_train.shape[1:],
                      num_classes=num_classes)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss
    # does not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Train and validate model.
    history = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('IMDb_mlp_model.h5')
    return (history['val_acc'][-1], history['val_loss'][-1])
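
# vectorize_data.ngram_vectorize is referenced above but not shown. A minimal
# sketch of a comparable tf-idf n-gram vectorizer with feature selection; the
# thresholds and the SelectKBest scoring function are assumptions.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif

def ngram_vectorize(train_texts, train_labels, val_texts):
    # Unigrams and bigrams, tf-idf weighted.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word',
                                 min_df=2, strip_accents='unicode',
                                 decode_error='replace', dtype=np.float32)
    x_train = vectorizer.fit_transform(train_texts)
    x_val = vectorizer.transform(val_texts)

    # Keep only the TOP_K features most correlated with the training labels.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    x_train = selector.fit_transform(x_train, train_labels)
    x_val = selector.transform(x_val)
    return x_train.astype('float32'), x_val.astype('float32')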
Example #8
def batch_train_sequence_model(data,
                               learning_rate=1e-3,
                               epochs=1000,
                               batch_size=128,
                               blocks=2,
                               filters=64,
                               dropout_rate=0.2,
                               embedding_dim=200,
                               kernel_size=3,
                               pool_size=3):
    """Trains sequence model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of sepCNN layers in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.
        embedding_dim: int, dimension of the embedding vectors.
        kernel_size: int, length of the convolution window.
        pool_size: int, factor by which to downscale input at MaxPooling layer.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    # Get the data.
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError(
            'Unexpected label values found in the validation set:'
            ' {unexpected_labels}. Please make sure that the '
            'labels in the validation set are in the same range '
            'as training labels.'.format(unexpected_labels=unexpected_labels))

    # Vectorize texts.
    x_train, x_val, word_index = vectorize_data.sequence_vectorize(
        train_texts, val_texts)

    # Number of features will be the embedding input dimension. Add 1 for the
    # reserved index 0.
    num_features = min(len(word_index) + 1, TOP_K)

    # Create model instance.
    model = build_model.sepcnn_model(blocks=blocks,
                                     filters=filters,
                                     kernel_size=kernel_size,
                                     embedding_dim=embedding_dim,
                                     dropout_rate=dropout_rate,
                                     pool_size=pool_size,
                                     input_shape=x_train.shape[1:],
                                     num_classes=num_classes,
                                     num_features=num_features)

    # Compile model with learning parameters.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
    ]

    # Create training and validation generators.
    training_generator = _data_generator(x_train, train_labels, num_features,
                                         batch_size)
    validation_generator = _data_generator(x_val, val_labels, num_features,
                                           batch_size)

    # Get number of training steps. This indicates the number of steps it takes
    # to cover all samples in one epoch.
    steps_per_epoch = x_train.shape[0] // batch_size
    if x_train.shape[0] % batch_size:
        steps_per_epoch += 1

    # Get number of validation steps.
    validation_steps = x_val.shape[0] // batch_size
    if x_val.shape[0] % batch_size:
        validation_steps += 1

    # Train and validate model.
    history = model.fit(training_generator,
                        steps_per_epoch=steps_per_epoch,
                        validation_data=validation_generator,
                        validation_steps=validation_steps,
                        callbacks=callbacks,
                        epochs=epochs,
                        verbose=2)  # Logs once per epoch.

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('amazon_reviews_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]
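
# _data_generator is referenced above but not shown. A minimal sketch that
# yields (features, labels) batches indefinitely, as fit expects when
# steps_per_epoch is supplied; the slicing scheme is an assumption.
def _data_generator(x, y, num_features, batch_size):
    # num_features is accepted only to match the call above; this simple
    # sketch does not need it.
    num_samples = x.shape[0]
    while True:
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            yield x[start:end], y[start:end]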
Example #9
    #explore_data.plot_frequency_distribution_of_ngrams(train_texts)
    #explore_data.plot_sample_length_distribution(train_texts)
    #explore_data.plot_class_distribution(train_labels )
    return train_ngram_model(data), labels, len(train_labels)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training n-gram model
    data, labels = load_data.load_text_dataset('/media/eucassio/dados1/swap/')
    (train_texts, train_labels), (test_texts, test_labels) = data
    num_classes = explore_data.get_num_classes(train_labels)
    print(num_classes)

    num_words_persample = explore_data.get_num_words_per_sample(train_texts)
    print(len(train_labels))
    print(num_words_persample)

    print(len(train_labels) / num_words_persample)
    #print(train_texts[0])
    #explore_data.plot_frequency_distribution_of_ngrams(train_texts)
    #explore_data.plot_sample_length_distribution(train_texts)
    #explore_data.plot_class_distribution(train_labels )
    train_ngram_model(data)
def train_conv_model(data,
                     learning_rate=LEARNING_RATE,
                     epochs=EPOCHS,
                     batch_size=BATCH_SIZE,
                     layers=LAYERS,
                     dropout_rate=DROPOUT_RATE):
    """Trains n-gram model on the given dataset.

    # Arguments
        data: tuples of training and test texts and labels.
        learning_rate: float, learning rate for training model.
        epochs: int, number of epochs.
        batch_size: int, number of samples per batch.
        layers: int, number of sepCNN and pooling blocks in the model.
        dropout_rate: float, percentage of input to drop at Dropout layers.

    # Raises
        ValueError: If validation data has label values which were not seen
            in the training data.
    """
    (train_texts, train_labels), (test_texts, test_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = explore_data.get_num_classes(train_labels)
    unexpected_labels = [v for v in test_labels if v not in range(num_classes)]
    if len(unexpected_labels):
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    # Carve out a cross-validation set from the last quarter of the training data.
    total_len = len(train_labels)
    train_len = int(total_len * 3 / 4)
    val_texts = train_texts[train_len:]
    val_labels = train_labels[train_len:]
    train_texts = train_texts[:train_len]
    train_labels = train_labels[:train_len]

    # Vectorize texts.
    x_train, x_val, x_test, word_index = sequence.vectorize(train_texts, val_texts, test_texts)
    # Create model instance.
    model = build_model.sepcnn_model(blocks=layers,
                                     filters=3,
                                     kernel_size=KERNEL_SIZE,
                                     embedding_dim=128,
                                     dropout_rate=dropout_rate,
                                     pool_size=POOL_SIZE,
                                     input_shape=x_train.shape[1:],  # shape[1:] drops the sample axis, keeping (sequence_length,)
                                     num_classes=num_classes,
                                     num_features=sequence.TOP_K)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not improve for ten consecutive epochs, stop training.
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)]

    # Train and validate model.
    history = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    history = history.history
    print_history(history)

    accuracy = model.evaluate(x_test, test_labels, batch_size=batch_size)
    print("\n%s : %.2f%%" % (model.metrics_names[1], accuracy[1] * 100))
    plot_history(history)
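
# print_history and plot_history are called above but not shown. Minimal
# sketches consistent with the metrics used here (history keys 'acc',
# 'val_acc', 'val_loss'); the plotting layout is an assumption.
import matplotlib.pyplot as plt

def print_history(history):
    # Report the final validation metrics, matching the other examples.
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

def plot_history(history):
    # Training vs. validation accuracy per epoch.
    plt.plot(history['acc'], label='train')
    plt.plot(history['val_acc'], label='validation')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()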
Example #11
	if train_labels[i]==0 or train_labels[i]==4:
		continue
	elif train_labels[i]==1:
		train_labels_final.append(1)
		train_texts_final.append(train_texts[i])
	elif train_labels[i]==2 and c2<count:
		c2+=1
		train_labels_final.append(2)
		train_texts_final.append(train_texts[i])
	elif train_labels[i]==3 and c3<count:
		c3+=1
		train_labels_final.append(3)
		train_texts_final.append(train_texts[i])

train_labels_final = [x - 1 for x in train_labels_final]
explore_data.get_num_classes(train_labels_final)
explore_data.get_num_words_per_sample(train_texts_final)
explore_data.plot_frequency_distribution_of_ngrams(train_texts_final)
explore_data.plot_sample_length_distribution(train_texts_final)
explore_data.plot_class_distribution(train_labels_final)

count = 0
for i in validation_labels:
	if i == 1:
		count+=1
c2=0
c3=0
validation_labels_final = []
validation_texts_final = []
for i in range(len(validation_labels)):
	if validation_labels[i]==0 or validation_labels[i]==4: