Example #1
import pytest

import load_data
import train_ngram_model


def test_train_ngram_model():
    data_dir = './data/'
    data = load_data.load_imdb_sentiment_analysis_dataset(data_dir)
    # data = load_data.load_rotten_tomatoes_sentiment_analysis_dataset(data_dir)
    acc, loss = train_ngram_model.train_ngram_model(data)
    assert acc == pytest.approx(0.91, 0.02)
    assert loss == pytest.approx(0.24, 0.02)
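# A minimal, self-contained illustration of the tolerance semantics used above:
# pytest.approx(expected, rel) treats the second positional argument as a
# relative tolerance, so acc == pytest.approx(0.91, 0.02) passes whenever the
# measured accuracy is within ±2% of 0.91 (the value below is made up).
def test_approx_tolerance_demo():
    measured_acc = 0.905  # hypothetical value for illustration only
    assert measured_acc == pytest.approx(0.91, 0.02)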
Example #2
                                            units=units))
            params['accuracy'].append(accuracy)
    _plot_parameters(params)


def _plot_parameters(params):
    """Creates a 3D surface plot of given parameters.
    # Arguments
        params: dict, contains layers, units and accuracy value combinations.
    """
    fig = plt.figure()
    # gca(projection='3d') was removed in Matplotlib 3.7; use add_subplot instead.
    ax = fig.add_subplot(projection='3d')
    ax.plot_trisurf(params['layers'],
                    params['units'],
                    params['accuracy'],
                    cmap=cm.coolwarm,
                    antialiased=False)
    plt.show()
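
# The top of tune_ngram_model is truncated in the fragment above; below is a
# hedged sketch of the grid-search loop that fills `params` before
# _plot_parameters is called (the layer/unit grids and the keyword arguments
# accepted by train_ngram_model.train_ngram_model are assumptions).
def tune_ngram_model(data):
    num_layers = [1, 2, 3]            # assumed search grid
    num_units = [8, 16, 32, 64, 128]  # assumed search grid

    params = {'layers': [], 'units': [], 'accuracy': []}
    for layers in num_layers:
        for units in num_units:
            params['layers'].append(layers)
            params['units'].append(units)
            # Assumed to return (validation_accuracy, validation_loss).
            accuracy, _ = train_ngram_model.train_ngram_model(
                data, layers=layers, units=units)
            params['accuracy'].append(accuracy)
    _plot_parameters(params)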


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training an n-gram model.
    data = load_data.load_imdb_sentiment_analysis_dataset(FLAGS.data_dir)
    tune_ngram_model(data)
Example #3
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model.
    model.save('rotten_tomatoes_sepcnn_model.h5')
    return history['val_acc'][-1], history['val_loss'][-1]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='./data',
                        help='input data directory')
    FLAGS, unparsed = parser.parse_known_args()

    # Using the IMDb movie reviews dataset to demonstrate training a
    # sequence model.
    data = load_data.load_imdb_sentiment_analysis_dataset(FLAGS.data_dir)
    train_sequence_model(data)
    train_sequence_model(data)
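
    # Sketch: reload the model saved by train_sequence_model for a later
    # evaluation pass (a minimal illustration; any inputs passed to it must be
    # vectorized exactly as during training, which is not repeated here).
    from tensorflow.keras.models import load_model
    model = load_model('rotten_tomatoes_sepcnn_model.h5')
    model.summary()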
Example #4
# Step 1: Gather Data
# -------------------

"""
Create a /data folder in this repo
Download v1 dataset to /data folder from
https://ai.stanford.edu/~amaas/data/sentiment/
Extract contents into /data/aclImdb
"""

# Step 2: Explore Data
# --------------------

# Load the dataset
data_dir = './data/'
# NOTE: Only pulling in 5000 samples to allow running locally
data_tuple = load_data.load_imdb_sentiment_analysis_dataset(data_dir, seed=150, max_samples=5000)
# (train_texts, train_labels), (val_texts, val_labels) = data_tuple

# The two charts in the course
# explore_data.plot_frequency_distribution_of_ngrams(train_texts)
# explore_data.plot_sample_length_distribution(train_texts)

# Additional data exploring functions
# explore_data.get_num_words_per_sample(train_texts)
# explore_data.plot_class_distribution(train_labels)

# Step 3: Prepare Data
# -> Tokenization and vectorization are handled inside train_ngram_model()
# --------------------

# N-gram Tokenization into unigrams and bigrams
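
# Hedged sketch of the unigram+bigram vectorization that train_ngram_model
# performs internally (the TF-IDF settings below are assumptions meant to
# mirror the guide's defaults, not the exact implementation).
from sklearn.feature_extraction.text import TfidfVectorizer

(train_texts, train_labels), (val_texts, val_labels) = data_tuple
ngram_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams
    analyzer='word',
    min_df=2)
x_train = ngram_vectorizer.fit_transform(train_texts)  # fit vocabulary on train only
x_val = ngram_vectorizer.transform(val_texts)
print('n-gram feature matrix shape:', x_train.shape)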
Example #5
def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])


def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)  # convert to lowercase
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')  # strip HTML <br /> tags
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation), '')  # remove punctuation
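
# Note: custom_standardization above is not wired into the vectorizer defined
# below, which uses the built-in "lower_and_strip_punctuation" standardizer.
# To apply the custom function instead, TextVectorization accepts a callable
# (a sketch of the alternative, not used in this script):
# vectorizer = tf.keras.layers.TextVectorization(
#     standardize=custom_standardization,
#     max_tokens=VOCAB_SIZE,
#     output_mode='int',
#     output_sequence_length=sequence_length)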
# In[]
dataset_dir = os.path.join("../dataset", 'aclImdb')

"""1. load data"""
(train_texts, train_labels), (val_texts, val_labels) = load_data.load_imdb_sentiment_analysis_dataset("../dataset")
print("load data finished!")
# In[]

train_texts_dataset = tf.data.Dataset.from_tensor_slices(train_texts)
test_texts_dataset = tf.data.Dataset.from_tensor_slices(val_texts)
# In[]
VOCAB_SIZE = 2000
sequence_length = 100
# On TF versions older than 2.6, TextVectorization lives at
# tf.keras.layers.experimental.preprocessing.TextVectorization.
vectorizer = tf.keras.layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        split="whitespace",
        max_tokens=VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=sequence_length)
# With output_mode='int', index tensors are 0-padded to the longest sequence in
# the batch unless a fixed output_sequence_length is set (as it is here).
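
# Fit the vocabulary on the training texts and vectorize a sample sentence
# (the batch size and the demo sentence are arbitrary illustrative choices).
vectorizer.adapt(train_texts_dataset.batch(64))
print(vectorizer.get_vocabulary()[:10])  # padding/OOV tokens first, then most frequent
sample = tf.constant(["This movie was surprisingly good"])
print(vectorizer(sample))  # int sequence padded/truncated to length 100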
Example #6
import numpy as np
import os
import re
import tensorflow as tf
import string
import matplotlib.pyplot as plt
import pickle
import load_data
from tensorflow.keras.models import load_model
# In[]
dataset_dir = os.path.join("../dataset", 'aclImdb')
"""1. load data"""
(train_texts, train_labels), (
    val_texts,
    val_labels) = load_data.load_imdb_sentiment_analysis_dataset("../dataset")
print("load data finished!")

# In[]
test_texts_dataset = tf.data.Dataset.from_tensor_slices(val_texts)

with open("./models/tv_layer.pkl", "rb") as f:
    from_disk = pickle.load(f)

# Rebuild the TextVectorization layer from its pickled config and weights.
new_v = tf.keras.layers.TextVectorization.from_config(from_disk['config'])
# You have to call `adapt` with some dummy data first (known Keras quirk);
# otherwise the layer is not built and set_weights fails.
new_v.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
new_v.set_weights(from_disk['weights'])
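
# For reference, a hedged sketch of how tv_layer.pkl could have been written in
# the training script, assuming the adapted layer there was named `vectorizer`:
# pickle.dump({'config': vectorizer.get_config(),
#              'weights': vectorizer.get_weights()},
#             open("./models/tv_layer.pkl", "wb"))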

# In[]
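# Sketch: apply the restored vectorizer to the validation texts and, if a
# trained model was saved earlier, evaluate it. The model path below is a
# hypothetical placeholder, not a file produced by this snippet.
x_val = new_v(tf.constant(val_texts))
model_path = "./models/sepcnn_model.h5"
if os.path.exists(model_path):
    model = load_model(model_path)
    loss, acc = model.evaluate(x_val, np.asarray(val_labels), verbose=0)
    print("validation accuracy:", acc)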