Example #1
def get_dataset():
    (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)

    x_train = sequence.pad_sequences(x_train, maxlen=80)

    ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    ds = ds.repeat()
    ds = ds.map(lambda x, y: (x, tf.cast(y, tf.int32)))
    ds = ds.batch(32, drop_remainder=True)
    return ds
def tf2_estimator():
    
    from zoo.orca.learn.tf2.estimator import Estimator
    # import ray
    init_orca_context(cluster_mode="local", cores=4, memory="3g")

    print("running tf2 estimator")
    
    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)
    # print(train_data)
    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding='post',
                                                            maxlen=256)

    test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding='post',
                                                            maxlen=256)

    model = keras.Sequential()
    model.add(keras.layers.Embedding(1000, 16))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    model.summary()

    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
    
    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]

    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]

    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    # from_keras for the TF2 Orca Estimator expects a model_creator callable rather than a model instance
    est = Estimator.from_keras(model_creator=lambda config: model)
    est.fit(data=train_dataset, batch_size=512, epochs=100, validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.save('work/saved_model')
    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')


    stop_orca_context()
Example #3
 def prepare_data(self):
     """
     main prepare data
     :return:
     """
     (x_train, y_train), (_, _) = imdb.load_data(num_words=self.flags.vocab_size)
     word_index = self.build_word_index()
     x_train = pad_sequences(x_train, maxlen=250, value=word_index['<PAD>'], padding='post')
     (x_train, x_eval) = x_train[:20000], x_train[20000:]
     (y_train, y_eval) = y_train[:20000], y_train[20000:]
     train_data, eval_data = self.build_generator(x_train, y_train), self.build_generator(x_eval, y_eval)
     return train_data, eval_data, len(x_train), len(x_eval)
Example #4
 def prepare_data(self):
     """
     main prepare data
     :return:
     """
     (_, _), (x_test, y_test) = imdb.load_data(num_words=self.flags.vocab_size)
     # build word index and reverse word index
     self.build_word_index()
     self.build_reverse_word_index()
     self.x_test = x_test
     x_test = pad_sequences(x_test, maxlen=250, value=self.word_index['<PAD>'], padding='post')
     return x_test
Example #5
    def preprocess_data(self, data_dir):
        print('IMDB_Task preprocess_data')
        
        vocab_size = self.configs['vocab_size']
        sentence_size = self.configs['max_time']

        # we assign the first indices in the vocabulary \
        # to special tokens that we use
        # for padding, as start token, and for indicating unknown words
        pad_id = 0
        start_id = 1
        oov_id = 2
        index_offset = 2

        print("Loading data...")
        (x_train_variable, y_train), \
            (x_test_variable, y_test) = imdb.load_data(
                num_words=vocab_size,
                start_char=start_id,
                oov_char=oov_id,
                index_from=index_offset)
        print(len(y_train), "train sequences")
        print(len(y_test), "test sequences")

        print("Pad sequences (samples x time)")
        x_train = sequence.pad_sequences(x_train_variable,
                                         maxlen=sentence_size,
                                         truncating='post',
                                         padding='post',
                                         value=pad_id)
        x_test = sequence.pad_sequences(x_test_variable,
                                        maxlen=sentence_size,
                                        truncating='post',
                                        padding='post',
                                        value=pad_id)
        print("x_train shape:", x_train.shape)
        print("x_test shape:", x_test.shape)

        x_len_train = np.array([min(len(x), sentence_size)
                                for x in x_train_variable])
        x_len_test = np.array([min(len(x), sentence_size)
                               for x in x_test_variable])
        word_index = imdb.get_word_index()
        # pdb.set_trace()
        return PreProcessedData(x_train=x_train,
                                y_train=y_train,
                                x_len_train=x_len_train,
                                x_test=x_test,
                                y_test=y_test,
                                x_len_test=x_len_test,
                                vocab_size=vocab_size,
                                word_index=word_index)
Example #6
def import_data():
    '''Imports the imdb dataset and returns train and test tensors using one-hot
    encoding.
    '''
    (train_data, train_labels), (test_data, test_labels) =\
        imdb.load_data(num_words=10000)

    x_train = vectorize_sequence(train_data)
    x_test = vectorize_sequence(test_data)

    y_train = np.asarray(train_labels).astype('float32')
    y_test = np.asarray(test_labels).astype('float32')

    return (x_train, y_train), (x_test, y_test)
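
# A minimal sketch (not part of the original example) of the vectorize_sequence
# helper that import_data relies on; it is assumed to perform the usual
# multi-hot "one-hot" encoding of word-index lists into a (num_reviews, 10000)
# float matrix, mirroring the vectorize_sequences helpers shown in later examples.
import numpy as np

def vectorize_sequence(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        results[i, seq] = 1.0  # mark every word index that occurs in the review
    return results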
Example #7
    def __init__(self):
        num_classes = 1
        self.max_features = 5000
        self.maxlen = 400

        (x_train,
         y_train), (x_test,
                    y_test) = imdb.load_data(num_words=self.max_features)

        x_train = pad_sequences(x_train, maxlen=self.maxlen)
        x_test = pad_sequences(x_test, maxlen=self.maxlen)

        super().__init__(x_train, x_test, y_train, y_test, (self.maxlen, ),
                         num_classes, 'imdb')
    def __init__(self):

        self.vocab_size = 5000
        self.start_id = 1
        self.oov_id = 2
        self.index_offset = 2
        self.sentence_size = 200

        model_dir = tempfile.mkdtemp()

        print("Loading data...")
        (self.x_train_variable,
         self.y_train), (self.x_test_variable, self.y_test) = imdb.load_data(
             num_words=self.vocab_size,
             start_char=self.start_id,
             oov_char=self.oov_id,
             index_from=self.index_offset)

        self.x_train = 0
        self.x_test = 0

        print(len(self.y_train), "train sequences")
        print(len(self.y_test), "test sequences")
def test_train_ngram():
    train, val = imdb.load_data()
    acc, loss = training.train_ngram(train, val)
    assert acc == pytest.approx(0.91, 0.02)
    assert loss == pytest.approx(0.24, 0.02)
def test_fine_tuned_sequence():
    train, val = imdb.load_data()
    acc, loss = tune.fine_tune_sequence(train, val)
    assert acc == pytest.approx(0.84, 0.02)
    assert loss == pytest.approx(0.55, 0.02)
def test_fine_tune_ngram():
    train, val = imdb.load_data()
    acc, loss = tune.fine_tune_ngram(train, val)
    assert acc == pytest.approx(0.61, 0.02)
    assert loss == pytest.approx(0.89, 0.02)
    message = message.translate(str.maketrans('', '', string.punctuation))
    tmp = []
    for word in message.split(" "):
        # fall back to the <UNK> index for words missing from the vocabulary instead of raising a KeyError
        tmp.append(word_to_id.get(word, word_to_id["<UNK>"]))
    padded_message = sequence.pad_sequences([tmp], maxlen=PAD_MAX_LENGTH)
    sentiment_prediction = my_model.predict(np.array(padded_message))
    return sentiment_prediction


# Training and test data are loaded via Keras
# Alternatively, you can download the file directly as a pickle file
(x_train, y_train), (x_test,
                     y_test) = imdb.load_data(path="imdb.npz",
                                              num_words=VOCABULARY_SIZE,
                                              skip_top=0,
                                              maxlen=None,
                                              seed=113,
                                              start_char=START_CHAR,
                                              oov_char=2,
                                              index_from=INDEX_FROM)

# The file imdb_word_index.json is downloaded
word_to_id = imdb.get_word_index(path="./imdb_word_index.json")

# Here the correct indices are stored together with the matching word, because there is an index shift of +3 (see the explanation at
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
# From: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = START_CHAR  # 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}
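
# Brief usage sketch (an addition, not in the original snippet): decode the first
# training review back to words with the id_to_word map built above; ids without
# an entry fall back to <UNK>.
print(' '.join(id_to_word.get(i, "<UNK>") for i in x_train[0]))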
def test_train_sequence():
    train, val = imdb.load_data()
    acc, loss = training.train_sequence(train, val)
    assert acc == pytest.approx(0.68, 0.02)
    assert loss == pytest.approx(0.82, 0.02)
Example #14
parser.add_argument('--cluster_mode', type=str, default="local",
                    help='The mode for the Spark cluster. local or yarn.')
args = parser.parse_args()
cluster_mode = args.cluster_mode
if cluster_mode == "local":
    init_orca_context(cluster_mode="local", cores=4, memory="3g")
elif cluster_mode == "yarn":
    init_orca_context(cluster_mode="yarn-client", num_nodes=2, cores=2, driver_memory="3g", \
                      conf={"spark.executor.extraJavaOptions": "-Xss512m",
                            "spark.driver.extraJavaOptions": "-Xss512m"})

max_features = 20000
max_len = 200

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
for i in range(0, len(x_train)):
    train_pos[i, :] = np.arange(max_len)
    val_pos[i, :] = np.arange(max_len)
Example #15
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)
"""### Loading the data
Keras provides a convenient handler for importing the dataset which is also available as a serialized numpy array `.npz` file to download [here](https://s3.amazonaws.com/text-datasets/imdb.npz). Each review consists of a series of word indexes that go from $4$ (the most frequent word in the dataset, **the**) to $4999$, which corresponds to **orange**. Index $1$ represents the beginning of the sentence and the index $2$ is assigned to all unknown (also known as *out-of-vocabulary* or *OOV*) tokens. These indexes have been obtained by pre-processing the text data in a pipeline that cleans, normalizes and tokenizes each sentence first and then builds a dictionary indexing each of the tokens by frequency. We are not covering these techniques in this post, but you can take a look at [this chapter](http://www.nltk.org/book/ch03.html) of the NLTK book to learn more.
"""

vocab_size = 5000
sentence_size = 200
embedding_size = 50
model_dir = tempfile.mkdtemp()

# Should we not use keras and rewrite this logic?
print("Loading data...")
(x_train_variable, y_train), (x_test_variable,
                              y_test) = imdb.load_data(num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 padding='post',
                                 value=0)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                padding='post',
                                value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
"""We can use the word index map to inspect how the first review looks like."""
Example #16
#https://github.com/iamved/IMDB-sentiment-analysis/blob/master/IMDB_Sentiment_Analysis.ipynb
#https://github.com/balag59/imdb-sentiment-bidirectional-LSTM/blob/master/imdb_bilstm_train.py

from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
import numpy as np

max_features = 10000
max_length = 500

(train_data,
 train_label), (test_data,
                test_labels) = imdb.load_data(num_words=max_features)

train_data = sequence.pad_sequences(train_data, maxlen=max_length)
test_data = sequence.pad_sequences(test_data, maxlen=max_length)

train_data = np.concatenate((train_data, test_data[:15000]))
test_data = test_data[15000:]
train_label = np.concatenate((train_label, test_labels[:15000]))
test_labels = test_labels[15000:]

embedding_size = 128
network = Sequential()
network.add(Embedding(max_features, embedding_size, input_length=max_length))
network.add(Bidirectional(LSTM(embedding_size, return_sequences=True)))
#network.add(Dropout(0.2))
network.add(Bidirectional(LSTM(embedding_size, return_sequences=True)))
#network.add(Dropout(0.2))
def tf_estimator():
    from zoo.orca.learn.tf.estimator import Estimator
    init_orca_context(cluster_mode="local", cores=4, memory="3g")
    
    os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'

    print("running tf estimator")
    
    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)
    # print(train_data)
    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3
    
    train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding='post', 
                                                            maxlen=256)

    test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding='post',
                                                            maxlen=256)
    
    model = keras.Sequential()
    model.add(keras.layers.Embedding(1000, 16))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    model.summary()
    
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]

    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]

    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    est = Estimator.from_keras(keras_model=model)
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=512, epochs=5, validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.1)
    est.fit(data=train_dataset, batch_size=512, epochs=5, validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.save('work/saved_model')
    print("save API finished")
    # est.save_tf_checkpoint('work/checkpoint')
    # est.load_tf_checkpoint('work/checkpoint')
    print("checkpoint save and load API finished")
    est.save_keras_model('work/keras_model')
    est.save_keras_weights('work/keras_weights')
    print("keras model and weights save API finished")
    # est.load_keras_model('work/keras_model')
    # est.load_keras_weights('work')
    print("keras model and weights load API finished")
    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    # Estimator.load(est, model_path='work/') # Has not been implemented
    # resutls = est.predict(validation_dataset)
    # print(results)

    stop_orca_context()
Example #18
import tensorflow as tf

from tensorflow.python.keras.datasets import imdb
import numpy
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.layers import Conv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.preprocessing import sequence

numpy.random.seed(7)
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)


max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

"""### Loading the data
Keras provides a convenient handler for importing the dataset which is also available as a serialized numpy array `.npz` file to download [here](https://s3.amazonaws.com/text-datasets/imdb.npz). Each review consists of a series of word indexes that go from $4$ (the most frequent word in the dataset, **the**) to $4999$, which corresponds to **orange**. Index $1$ represents the beginning of the sentence and the index $2$ is assigned to all unknown (also known as *out-of-vocabulary* or *OOV*) tokens. These indexes have been obtained by pre-processing the text data in a pipeline that cleans, normalizes and tokenizes each sentence first and then builds a dictionary indexing each of the tokens by frequency. We are not covering these techniques in this post, but you can take a look at [this chapter](http://www.nltk.org/book/ch03.html) of the NLTK book to learn more.
It's standard to limit the size of the vocabulary to prevent the dataset from becoming too sparse and high-dimensional, which can cause overfitting. After we've loaded the data into memory, we pad each of the sentences with $0$ to a fixed size (here: $200$) so that we have two $2$-dimensional $25000\times200$ arrays for training and testing respectively.
"""

vocab_size = 5000
sentence_size = 200
embedding_size = 50
model_dir = tempfile.mkdtemp()

# Should we not use keras and rewrite this logic?
print("Loading data...")
(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable, 
                                 maxlen=sentence_size, 
                                 padding='post', 
                                 value=0)
x_test = sequence.pad_sequences(x_test_variable, 
                                maxlen=sentence_size, 
                                padding='post', 
                                value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
Example #20
def load_imdb(max_features=20000, maxlen=1000):
    (X_train, y_train), (X_val,
                         y_val) = _imdb.load_data(num_words=max_features)
    X_train = csr_matrix(pad_sequences(X_train, maxlen=maxlen))
    X_val = csr_matrix(pad_sequences(X_val, maxlen=maxlen))
    return (X_train, y_train), (X_val, y_val)
Example #21
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding, Flatten
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras import preprocessing
path = r"F:\5-model data\imdb.npz"
max_feature = 10000
maxlen = 20
(x_train, y_train), (x_test, y_test) = imdb.load_data(path=path,
                                                      num_words=max_feature)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
max_features = 10000
maxlen = 500
batch_size = 32
print('Loading data...')
imdb_path = r"F:\5-model data\imdb.npz"
(input_train, y_train), (input_test, y_test) = imdb.load_data(
    path=imdb_path,
    num_words=max_features
)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

print('pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)

# Train with an Embedding layer and a SimpleRNN layer
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Embedding, SimpleRNN, Dense
model = Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc']
)
    model.add(GlobalMaxPooling1D())
    model.add(Dense(1))
    model.summary()
    model.compile(optimizer=RMSprop(lr=1e-4),
                  loss='binary_crossentropy',
                  metrics=['acc'])
    return model


if __name__ == '__main__':
    max_features = 10000
    maxlen = 500
    batch_size = 32
    print('Loading data...')
    (input_train, y_train), (input_test,
                             y_test) = imdb.load_data(num_words=max_features)
    print(len(input_train), 'train sequences')
    print(len(input_test), 'test sequences')
    print('Pad sequences (samples x time)')
    input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
    input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
    print('input_train shape:', input_train.shape)
    print('input_test shape:', input_test.shape)
    # A plain RNN trains extremely slowly
    # model = build_simple_rnn_model()

    # CuDNNLSTM is needed to get a GPU speed-up; a regular LSTM is very slow
    model = build_LSTM_model()

    # Trains comparatively quickly
    # model = build_cnn_1d_model()
from input_functions import train_input_fn, eval_input_fn
from utils import load_glove_embeddings, load_data
from models import cnn_model_fn, lstm_model_fn


tf.logging.set_verbosity(tf.logging.INFO)
print(tf.__version__)

vocab_size = 5000
sentence_size = 200
embedding_size = 50
#model_dir = tempfile.mkdtemp()
model_dir = 'model'
project_path = os.path.dirname(os.path.abspath(__file__))

(x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
    path=os.path.join(project_path, 'data/imdb.npz'),
    num_words=vocab_size)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 padding='post',
                                 value=0)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                padding='post',
                                value=0)

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
Example #25
# from github susanli2016
#https://github.com/andyngo95/SA_Positive_Negative_Comments/blob/master/Sentiment_Analysis_v2.ipynb
#https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39
#https://github.com/thushv89/attention_keras
from tensorflow.python.keras.datasets import imdb
from tensorflow.python.keras.preprocessing import sequence
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
#from attention_keras.layers.attention import AttentionLayer
import numpy as np

max_features = 10000
max_length = 500

(train_data, train_label), (test_data, test_labels) = imdb.load_data(num_words=max_features)

train_data = sequence.pad_sequences(train_data, maxlen=max_length)
test_data = sequence.pad_sequences(test_data, maxlen=max_length)

train_data = np.concatenate((train_data, test_data[:15000]))
test_data = test_data[15000:]
train_label = np.concatenate((train_label, test_labels[:15000]))
test_labels = test_labels[15000:]

embedding_size = 128
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=max_length))
model.add(LSTM(embedding_size, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(embedding_size))
model.add(Dropout(0.2))
  # raise SystemError('GPU device not found')
else: print('Found GPU at: {}'.format(device_name))

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
config.log_device_placement = False
sess = tf.Session(config=config)
tf.keras.backend.set_session(sess)

# # save np.load
# np_load_old = np.load
# # modify the default parameters of np.load
# np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)
num_words = 20000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

epoch = 5
batch_size = 64

word_to_index = imdb.get_word_index()
word_to_index = {key:(value+3) for key,value in word_to_index.items()}
word_to_index["<PAD>"] = 0
word_to_index["<START>"] = 1
word_to_index["<UNK>"] = 2
index_to_word = {value:key for key,value in word_to_index.items()}

def print_sentence(id_list):
    print(' '.join([index_to_word[id] for id in id_list if id != 0]))

print("Train-set size: ", len(x_train))
Example #27
import numpy as np

from keras.layers import Dense, Input, concatenate
from keras.optimizers import SGD
from keras.models import Model

from tensorflow.python.keras.datasets import imdb

word_num = 6666

(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(num_words=word_num)


def vectorize_sequences(sequences, dimension=word_num):  # one-hot
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

y_train = np.asarray(train_labels).astype('float32')  # vectorize the label data
y_test = np.asarray(test_labels).astype('float32')

x_val = x_train[:2000]
partial_x_train = x_train[2000:]

y_val = y_train[:2000]
Example #28
from tensorflow.python.keras.datasets import imdb
(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(num_words=10000)
# print(train_data)
# word_index = imdb.get_word_index()
# reverse_word_index = dict(
#     [(value,key) for (key,value) in word_index.items()]
# )
# decoded_revied = ' '.join([reverse_word_index.get(i-3,"?") for i in train_data[0]])
# # print(reverse_word_index)
# # print(decoded_revied)
import numpy as ny


def vectorize_sequences(sequences, dimension=10000):

    results = ny.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):  # enumerate: iterate over the sequences with their indices
        results[i, sequence] = 1.
    return results


# Vectorize the training and test data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
# print(x_train,x_test)
# Vectorize the labels
y_train = ny.asarray(train_labels).astype('float32')
y_test = ny.asarray(test_labels).astype('float32')

# Hold out a validation set
print(tf.__version__)

vocab_size = 5000
sentence_size = 200
embedding_size = 50
model_dir = tempfile.mkdtemp()

pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2

print("Loading data...")
(x_train_variable, y_train), (x_test_variable,
                              y_test) = imdb.load_data(num_words=vocab_size,
                                                       start_char=start_id,
                                                       oov_char=oov_id,
                                                       index_from=index_offset)
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                truncating='post',
                                padding='post',
                                value=pad_id)
def bigdl_estimator():
    from zoo.orca.learn.bigdl.estimator import Estimator
    from tensorflow.python.keras.datasets import imdb
    from tensorflow.python.keras.preprocessing import sequence
    from zoo.pipeline.api.keras.models import Model
    # The layer and optimizer imports below are assumed (Analytics Zoo Keras API / BigDL);
    # the original snippet uses these names without showing where they come from.
    from zoo.pipeline.api.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, SelectTable, TransformerLayer
    from bigdl.optim.optimizer import Adam
    from zoo.pipeline.api.keras.objectives import SparseCategoricalCrossEntropy
    from zoo.orca.data import XShards
    from zoo.orca.learn.metrics import Accuracy
    import numpy as np

    # conf = {"spark.executor.extraJavaOptions": "-Xss512m", "spark.driver.extraJavaOptions": "-Xss512m"}

    # init_orca_context(cluster_mode="local", cores=8, memory="16g")
    init_orca_context(cluster_mode="local", cores=4, memory="16g")
    max_features = 200
    max_len = 20

    print("running bigdl estimator")

    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    x_test = x_test[-1000:]
    y_test = y_test[-1000:]
    
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    train_pos = np.zeros((len(x_train), max_len), dtype=np.int32)
    val_pos = np.zeros((len(x_test), max_len), dtype=np.int32)
    for i in range(0, len(x_train)):
        train_pos[i, :] = np.arange(max_len)
        val_pos[i, :] = np.arange(max_len)

    train_dataset = XShards.partition({"x": (x_train, train_pos), "y": np.array(y_train)})
    val_dataset = XShards.partition({"x": (x_test, val_pos), "y": np.array(y_test)})

    token_shape = (max_len,)
    position_shape = (max_len,)
    token_input = Input(shape=token_shape)
    position_input = Input(shape=position_shape)
    O_seq = TransformerLayer.init(vocab=max_features, hidden_size=128, n_head=8, seq_len=max_len)([token_input, position_input])
    # Select the first output of the Transformer. The second is the pooled output.
    O_seq = SelectTable(0)(O_seq)
    O_seq = GlobalAveragePooling1D()(O_seq)
    O_seq = Dropout(0.2)(O_seq)
    outputs = Dense(2, activation='softmax')(O_seq)

    model = Model([token_input, position_input], outputs)
    model.summary()
    batch_size = 64
    print("Train started")
    est = Estimator.from_bigdl(model=model, loss=SparseCategoricalCrossEntropy(), optimizer=Adam(), metrics=[Accuracy()])
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    result = est.evaluate(val_dataset)
    print(result)
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.5)
    est.fit(data=train_dataset, batch_size=batch_size, epochs=1)
    print("Train finished") 
    
    print("Evaluating started")
    result = est.evaluate(val_dataset)
    print(result)
    print("Evaluating finished")
    est.save('work/saved_model')
    # est.load('work/saved_model')
    print("load and save API finished")

    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    print("get summary API finished")


    stop_orca_context()