# Assumes module-level imports: tensorflow as tf, tensorflow.keras as keras,
# and from zoo.orca import init_orca_context, stop_orca_context.
def tf2_estimator():
    from zoo.orca.learn.tf2.estimator import Estimator

    init_orca_context(cluster_mode="local", cores=4, memory="3g")
    print("running tf2 estimator")

    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)

    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                            value=word_index["<PAD>"],
                                                            padding='post',
                                                            maxlen=256)
    test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                           value=word_index["<PAD>"],
                                                           padding='post',
                                                           maxlen=256)

    # The TF2 Estimator expects a function that builds and compiles the
    # model, not a model instance.
    def model_creator(config):
        model = keras.Sequential()
        model.add(keras.layers.Embedding(1000, 16))
        model.add(keras.layers.GlobalAveragePooling1D())
        model.add(keras.layers.Dense(16, activation=tf.nn.relu))
        model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
        model.summary()
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['acc'])
        return model

    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]
    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]

    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))

    est = Estimator.from_keras(model_creator=model_creator)
    est.fit(data=train_dataset,
            batch_size=512,
            epochs=100,
            validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)

    est.save('work/saved_model')
    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    stop_orca_context()
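# Sketch (not in the original): Orca's TF2 Estimator passes a config dict to
# the model_creator, so hyperparameters can be supplied through
# Estimator.from_keras(..., config={...}). The "embedding_dim" key below is a
# hypothetical example of that pattern, not something the original code uses.
def configurable_model_creator(config):
    model = keras.Sequential([
        keras.layers.Embedding(1000, config.get("embedding_dim", 16)),
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(1, activation=tf.nn.sigmoid),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    return model

# est = Estimator.from_keras(model_creator=configurable_model_creator,
#                            config={"embedding_dim": 16})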
def build_word_index(self):
    """
    Build the word index, reserving the first ids for the PAD, START,
    UNK and UNUSED symbols (hence the +3 offset on every real word).
    :return:
    """
    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index['<PAD>'] = 0
    word_index['<START>'] = 1
    word_index['<UNK>'] = 2
    word_index['<UNUSED>'] = 3
    self.word_index = word_index
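# Sketch (not part of the original class): with the index from
# build_word_index, raw text can be encoded in the opposite direction --
# <START> first, <UNK> for any out-of-vocabulary token. The tokenizer here
# (lower-casing plus whitespace split) is a simplifying assumption.
def encode_text(self, text):
    ids = [self.word_index['<START>']]
    for token in text.lower().split():
        ids.append(self.word_index.get(token, self.word_index['<UNK>']))
    return ids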
def preprocess_data(self, data_dir):
    print('IMDB_Task preprocess_data')
    vocab_size = self.configs['vocab_size']
    sentence_size = self.configs['max_time']
    # We assign the first indices in the vocabulary to special tokens used
    # for padding, as the start token, and for indicating unknown words.
    pad_id = 0
    start_id = 1
    oov_id = 2
    index_offset = 2

    print("Loading data...")
    (x_train_variable, y_train), (x_test_variable, y_test) = imdb.load_data(
        num_words=vocab_size,
        start_char=start_id,
        oov_char=oov_id,
        index_from=index_offset)
    print(len(y_train), "train sequences")
    print(len(y_test), "test sequences")

    print("Pad sequences (samples x time)")
    x_train = sequence.pad_sequences(x_train_variable,
                                     maxlen=sentence_size,
                                     truncating='post',
                                     padding='post',
                                     value=pad_id)
    x_test = sequence.pad_sequences(x_test_variable,
                                    maxlen=sentence_size,
                                    truncating='post',
                                    padding='post',
                                    value=pad_id)
    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)

    # Keep the true (un-padded) length of every sequence, capped at sentence_size.
    x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
    x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])

    word_index = imdb.get_word_index()
    return PreProcessedData(x_train=x_train,
                            y_train=y_train,
                            x_len_train=x_len_train,
                            x_test=x_test,
                            y_test=y_test,
                            x_len_test=x_len_test,
                            vocab_size=vocab_size,
                            word_index=word_index)
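# Sketch (not in the original): a quick sanity check on the returned
# PreProcessedData; "task" and "data_dir" are hypothetical stand-ins for
# however the class is instantiated.
data = task.preprocess_data(data_dir)
print("positive fraction:", data.y_train.mean())
print("mean (capped) review length:", data.x_len_train.mean())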
def convert2Text(self, pad_id, oov_id, start_id, index_offset):
    '''
    Description: convert token indices back to text
    '''
    word_index = imdb.get_word_index()
    word_inverted_index = {v + index_offset: k for k, v in word_index.items()}
    # The first indices in the map are reserved to represent things other than tokens.
    word_inverted_index[pad_id] = '<PAD>'
    word_inverted_index[start_id] = '<START>'
    word_inverted_index[oov_id] = '<OOV>'
    for i in range(0, 10):
        print(i, word_inverted_index[i])

    def index_to_text(indexes):
        return ' '.join([word_inverted_index[i] for i in indexes])

    print(index_to_text(self.x_train_variable[0]))
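# Sketch (not in the original): the arguments mirror the constants used in
# preprocess_data above (pad_id=0, start_id=1, oov_id=2, index_from=2);
# "task" is again a hypothetical instance.
task.convert2Text(pad_id=0, oov_id=2, start_id=1, index_offset=2)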
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                truncating='post',
                                padding='post',
                                value=pad_id)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])

def parser(x, length, y):
    features = {"x": x, "len": length}
    return features, y

word_index = imdb.get_word_index()
word_inverted_index = {v + index_offset: k for k, v in word_index.items()}
# The first indices in the map are reserved to represent things other than tokens.
word_inverted_index[pad_id] = '<PAD>'
word_inverted_index[start_id] = '<START>'
word_inverted_index[oov_id] = '<OOV>'
for i in range(0, 10):
    print(i, word_inverted_index[i])

def index_to_text(indexes):
    return ' '.join([word_inverted_index[i] for i in indexes])
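# Sketch (not in the original fragment): the parser above is typically used
# as the map step of an input_fn; the shuffle buffer and batch size here are
# illustrative assumptions.
def train_input_fn(batch_size=100):
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(x_train_variable))
    dataset = dataset.map(parser).batch(batch_size).repeat()
    return dataset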
print(len(y_train), "train sequences")
print(len(y_test), "test sequences")

print("Pad sequences (samples x time)")
x_train = sequence.pad_sequences(x_train_variable,
                                 maxlen=sentence_size,
                                 padding='post',
                                 value=0)
x_test = sequence.pad_sequences(x_test_variable,
                                maxlen=sentence_size,
                                padding='post',
                                value=0)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

word_index = imdb.get_word_index(os.path.join(project_path, 'data/imdb_word_index.json'))
word_inverted_index = {v: k for k, v in word_index.items()}
# The first indices in the map are reserved to represent things other than tokens.
index_offset = 3
word_inverted_index[-1 - index_offset] = '_'  # Padding at the end
word_inverted_index[1 - index_offset] = '>'   # Start of the sentence
word_inverted_index[2 - index_offset] = '?'   # OOV
word_inverted_index[3 - index_offset] = ''    # Un-used

x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])

embedding_matrix = load_glove_embeddings('data/glove.6B.50d.txt',
                                         word_index, vocab_size, embedding_size)
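# The snippet above calls load_glove_embeddings without defining it. A
# minimal sketch of such a helper, with the signature inferred from the call
# site; the implementation is an assumption, not the original code.
import numpy as np

def load_glove_embeddings(path, word_index, vocab_size, embedding_size):
    # Rows default to zero, so words absent from GloVe keep a zero vector.
    matrix = np.zeros((vocab_size, embedding_size), dtype=np.float32)
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            idx = word_index.get(parts[0])
            if idx is not None and idx < vocab_size:
                matrix[idx] = np.asarray(parts[1:], dtype=np.float32)
    return matrix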
# Assumes module-level imports: os, tensorflow as tf, tensorflow.keras as
# keras, and from zoo.orca import init_orca_context, stop_orca_context.
def tf_estimator():
    from zoo.orca.learn.tf.estimator import Estimator

    init_orca_context(cluster_mode="local", cores=4, memory="3g")
    os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'
    print("running tf estimator")

    imdb = keras.datasets.imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)

    word_index = imdb.get_word_index()
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                            value=word_index["<PAD>"],
                                                            padding='post',
                                                            maxlen=256)
    test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                           value=word_index["<PAD>"],
                                                           padding='post',
                                                           maxlen=256)

    model = keras.Sequential()
    model.add(keras.layers.Embedding(1000, 16))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(16, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
    model.summary()
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])

    x_val = train_data[:1000]
    partial_x_train = train_data[1000:]
    y_val = train_labels[:1000]
    partial_y_train = train_labels[1000:]

    train_dataset = tf.data.Dataset.from_tensor_slices((partial_x_train, partial_y_train))
    validation_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))

    est = Estimator.from_keras(keras_model=model)

    # Train once with constant gradient clipping ...
    est.set_constant_gradient_clipping(0.1, 0.2)
    est.fit(data=train_dataset,
            batch_size=512,
            epochs=5,
            validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)

    # ... then again with L2-norm gradient clipping.
    est.clear_gradient_clipping()
    est.set_l2_norm_gradient_clipping(0.1)
    est.fit(data=train_dataset,
            batch_size=512,
            epochs=5,
            validation_data=validation_dataset)
    results = est.evaluate(validation_dataset)
    print(results)

    est.save('work/saved_model')
    print("save API finished")
    # est.save_tf_checkpoint('work/checkpoint')
    # est.load_tf_checkpoint('work/checkpoint')
    print("checkpoint save and load API finished")
    est.save_keras_model('work/keras_model')
    est.save_keras_weights('work/keras_weights')
    print("keras model and weights save API finished")
    # est.load_keras_model('work/keras_model')
    # est.load_keras_weights('work')
    print("keras model and weights load API finished")

    est.get_train_summary(tag='Loss')
    est.get_validation_summary(tag='Top1Accuracy')
    # Estimator.load(est, model_path='work/')  # Has not been implemented
    # results = est.predict(validation_dataset)
    # print(results)
    stop_orca_context()
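# Sketch (assumption, not from the original): the file written by
# save_keras_model above should be loadable with plain Keras for local
# inference on the padded test_data from tf_estimator; this treats the saved
# format as standard Keras, which may not hold for every backend.
loaded_model = keras.models.load_model('work/keras_model')
print(loaded_model.predict(test_data[:4]))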
# Training and test data are loaded via Keras.
# Alternatively, you can download the file directly as a pickle file.
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=VOCABULARY_SIZE,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=START_CHAR,
                                                      oov_char=2,
                                                      index_from=INDEX_FROM)

# The file is downloaded as imdb_word_index.json.
word_to_id = imdb.get_word_index(path="./imdb_word_index.json")

# Store the correct indices with the matching word, since there is an index
# shift of +3 (see the explanation in
# https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification).
# From: https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = START_CHAR  # 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# Show the content of one review (selected via REVIEW_INDEX).
REVIEW_INDEX = 2
print(x_train[REVIEW_INDEX])
print("---- Review text --------- ")
print(' '.join(id_to_word[id] for id in x_train[REVIEW_INDEX]))
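# Optional sketch (not in the original): a defensive decode that tolerates
# ids missing from the mapping, should any id fall outside word_to_id:
def decode_review(ids):
    return ' '.join(id_to_word.get(i, '<UNK>') for i in ids)

# print(decode_review(x_train[REVIEW_INDEX]))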