def tf2_estimator():
    from zoo.orca import init_orca_context, stop_orca_context
    from zoo.orca.learn.tf2.estimator import Estimator

    init_orca_context(cluster_mode="local", cores=4, memory="3g")

    print("running tf2 estimator")
    
    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)
    # print(train_data)
    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding='post',
                                                            maxlen=256)

    test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding='post',
                                                            maxlen=256)

    def model_creator(config):
        # Orca's TF2 Estimator expects a function that builds and compiles the model.
        model = keras.Sequential()
        model.add(keras.layers.Embedding(1000, 16))
        model.add(keras.layers.GlobalAveragePooling1D())
        model.add(keras.layers.Dense(16, activation=tf.nn.relu))
        model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

        model.summary()

        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['acc'])
        return model
    
    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]

    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]

    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    est = Estimator.from_keras(model_creator=model_creator)
    est.fit(data=train_dataset, batch_size=512, epochs=100, validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.save('work/saved_model')
    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')


    stop_orca_context()
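
# NOTE (assumption): these snippets rely on module-level imports that are not shown
# here. Below is a minimal sketch of what they likely look like; the exact
# TensorFlow/Keras import style may differ by version, and the Orca context helpers
# are imported inside the estimator functions themselves.
import os

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence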
Example #2
    def build_word_index(self):
        """
        build word index for pad, start and other symbols
        :return:
        """
        word_index = imdb.get_word_index()
        word_index = {k: (v + 3) for k, v in word_index.items()}
        word_index['<PAD>'] = 0
        word_index['<START>'] = 1
        word_index['<UNK>'] = 2
        word_index['<UNUSED>'] = 3
        self.word_index = word_index
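
    # Illustration only: a hypothetical helper (not part of the original class)
    # showing how the mapping built above is typically used to encode raw text
    # with the same <START>/<UNK> conventions before padding.
    def encode_review(self, text):
        ids = [self.word_index["<START>"]]
        ids += [self.word_index.get(w, self.word_index["<UNK>"])
                for w in text.lower().split()]
        return ids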
Example #3
    def preprocess_data(self, data_dir):
        print('IMDB_Task preprocess_data')

        vocab_size = self.configs['vocab_size']
        sentence_size = self.configs['max_time']

        # we assign the first indices in the vocabulary to special tokens that we use
        # for padding, as start token, and for indicating unknown words
        pad_id = 0
        start_id = 1
        oov_id = 2
        index_offset = 2

        print("Loading data...")
        (x_train_variable, y_train), \
            (x_test_variable, y_test) = imdb.load_data(
                num_words=vocab_size,
                start_char=start_id,
                oov_char=oov_id,
                index_from=index_offset)
        print(len(y_train), "train sequences")
        print(len(y_test), "test sequences")

        print("Pad sequences (samples x time)")
        x_train = sequence.pad_sequences(x_train_variable,
                                         maxlen=sentence_size,
                                         truncating='post',
                                         padding='post',
                                         value=pad_id)
        x_test = sequence.pad_sequences(x_test_variable,
                                        maxlen=sentence_size,
                                        truncating='post',
                                        padding='post',
                                        value=pad_id)
        print("x_train shape:", x_train.shape)
        print("x_test shape:", x_test.shape)

        x_len_train = np.array([min(len(x), sentence_size)
                                for x in x_train_variable])
        x_len_test = np.array([min(len(x), sentence_size)
                               for x in x_test_variable])
        word_index = imdb.get_word_index()
        # pdb.set_trace()
        return PreProcessedData(x_train=x_train,
                                y_train=y_train,
                                x_len_train=x_len_train,
                                x_test=x_test,
                                y_test=y_test,
                                x_len_test=x_len_test,
                                vocab_size=vocab_size,
                                word_index=word_index)
    def convert2Text(self, pad_id, oov_id, start_id, index_offset):
        '''
            Description: convert index to text
            Usage:
        '''
        word_index = imdb.get_word_index()
        word_inverted_index = {
            v + index_offset: k
            for k, v in word_index.items()
        }

        # The first indexes in the map are reserved to represent things other than tokens
        word_inverted_index[pad_id] = '<PAD>'
        word_inverted_index[start_id] = '<START>'
        word_inverted_index[oov_id] = '<OOV>'

        for i in range(0, 10):
            print(i, word_inverted_index[i])

        def index_to_text(indexes):
            return ' '.join([word_inverted_index[i] for i in indexes])

        print(index_to_text(self.x_train_variable[0]))
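
# Assumption: PreProcessedData is not defined in this excerpt. It behaves like a
# plain record of the preprocessed arrays; a hypothetical definition matching the
# keyword arguments used in preprocess_data above:
import collections

PreProcessedData = collections.namedtuple(
    'PreProcessedData',
    ['x_train', 'y_train', 'x_len_train',
     'x_test', 'y_test', 'x_len_test',
     'vocab_size', 'word_index'])
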
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                truncating='post',
                                padding='post',
                                value=pad_id)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])


def parser(x, length, y):
    features = {"x": x, "len": length}
    return features, y
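
# A sketch of how parser is typically wired up, assuming the x_train, x_len_train
# and y_train arrays built above: zip the padded sequences, their true lengths and
# the labels into a tf.data.Dataset, then map them to ({"x", "len"}, label) pairs
# for use as a tf.estimator input function.
def train_input_fn(batch_size=100):
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(x_train))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    return dataset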


word_index = imdb.get_word_index()
word_inverted_index = {v + index_offset: k for k, v in word_index.items()}

# The first indexes in the map are reserved to represent things other than tokens
word_inverted_index[pad_id] = '<PAD>'
word_inverted_index[start_id] = '<START>'
word_inverted_index[oov_id] = '<OOV>'

for i in range(0, 10):
    print(i, word_inverted_index[i])


def index_to_text(indexes):
    return ' '.join([word_inverted_index[i] for i in indexes])
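
# For example, the first (un-padded) training review can be printed back as text,
# assuming x_train_variable from the earlier imdb.load_data call:
print(index_to_text(x_train_variable[0]))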

print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 padding='post',
                                 value=0)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                padding='post',
                                value=0)

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
word_index = imdb.get_word_index(os.path.join(project_path, 'data/imdb_word_index.json'))
word_inverted_index = {v: k for k, v in word_index.items()}
# The first indexes in the map are reserved to represent things other than tokens
index_offset = 3
word_inverted_index[-1 - index_offset] = '_' # Padding at the end
word_inverted_index[ 1 - index_offset] = '>' # Start of the sentence
word_inverted_index[ 2 - index_offset] = '?' # OOV
word_inverted_index[ 3 - index_offset] = ''  # Un-used





x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])
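
# Assumption: load_glove_embeddings is not defined in this snippet. A typical
# implementation, assuming the standard whitespace-separated GloVe text format,
# might look like this sketch:
def load_glove_embeddings(path, word_index, vocab_size, embedding_size):
    # Random init so that words missing from the GloVe file still get a vector.
    embedding_matrix = np.random.uniform(
        -0.05, 0.05, size=(vocab_size, embedding_size)).astype(np.float32)
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            word, vector = values[0], np.asarray(values[1:], dtype=np.float32)
            idx = word_index.get(word)
            if idx is not None and idx < vocab_size:
                embedding_matrix[idx] = vector
    return embedding_matrix
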
embedding_matrix = load_glove_embeddings('data/glove.6B.50d.txt', word_index, vocab_size, embedding_size)
def tf_estimator():
    from zoo.orca import init_orca_context, stop_orca_context
    from zoo.orca.learn.tf.estimator import Estimator

    init_orca_context(cluster_mode="local", cores=4, memory="3g")
    
    os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'

    print("running tf estimator")
    
    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)
    # print(train_data)
    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3
    
    train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index["<PAD>"], padding='post', 
                                                            maxlen=256)

    test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index["<PAD>"], padding='post',
                                                            maxlen=256)
    
    model = keras.Sequential()
    model.add(keras.layers.Embedding(1000, 16))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    model.summary()
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]

    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]

    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    est = Estimator.from_keras(keras_model=model)
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset, batch_size=512, epochs=5, validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.1)
    est.fit(data=train_dataset, batch_size=512, epochs=5, validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)
    est.save('work/saved_model')
    print("save API finished")
    # est.save_tf_checkpoint('work/checkpoint')
    # est.load_tf_checkpoint('work/checkpoint')
    print("checkpoint save and load API finished")
    est.save_keras_model('work/keras_model')
    est.save_keras_weights('work/keras_weights')
    print("keras model and weights save API finished")
    # est.load_keras_model('work/keras_model')
    # est.load_keras_weights('work')
    print("keras model and weights load API finished")
    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    # Estimator.load(est, model_path='work/') # Has not been implemented
    # results = est.predict(validation_dataset)
    # print(results)

    stop_orca_context()
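
# Minimal entry point sketch for running either example directly as a script:
# tf2_estimator() exercises the Ray-backed TF2 estimator, tf_estimator() the
# graph-mode one.
if __name__ == "__main__":
    tf_estimator()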

# The training and test data are loaded via Keras.
# Alternatively, you can download the file directly as a pickle file.
(x_train, y_train), (x_test,
                     y_test) = imdb.load_data(path="imdb.npz",
                                              num_words=VOCABULARY_SIZE,
                                              skip_top=0,
                                              maxlen=None,
                                              seed=113,
                                              start_char=START_CHAR,
                                              oov_char=2,
                                              index_from=INDEX_FROM)

# The file imdb_word_index.json is downloaded.
word_to_id = imdb.get_word_index(path="./imdb_word_index.json")

# Here the correct indices are stored with the matching word, since there is an index shift of +3 (see the explanation in
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
# From: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = START_CHAR  # 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# Shows the content of a single review (selected by REVIEW_INDEX)
REVIEW_INDEX = 2
print(x_train[REVIEW_INDEX])
print("---- Rezensionstext --------- ")
print(' '.join(id_to_word[id] for id in x_train[REVIEW_INDEX]))