def test_tfdataset_with_tf_data_dataset(self):
    """Fit, evaluate and predict a Keras model on TFDatasets built from tf.data."""
    # Training data: features plus labels, batched with batch_size for fit().
    train_ds = tf.data.Dataset.from_tensor_slices(
        (np.random.randn(100, 28, 28, 1),
         np.random.randint(0, 10, size=(100,))))
    train_ds = train_ds.map(lambda feature, label: (tf.to_float(feature), label))
    train_ds = TFDataset.from_tf_data_dataset(train_ds, batch_size=16)

    seq = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(10, activation="softmax")
    ])
    seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    model = KerasModel(seq)
    model.fit(train_ds)

    # Evaluation data: same structure, but batch_per_thread for inference.
    eval_ds = tf.data.Dataset.from_tensor_slices(
        (np.random.randn(100, 28, 28, 1),
         np.random.randint(0, 10, size=(100,))))
    eval_ds = eval_ds.map(lambda feature, label: (tf.to_float(feature), label))
    eval_ds = TFDataset.from_tf_data_dataset(eval_ds, batch_per_thread=16)
    model.evaluate(eval_ds)

    # Prediction data: features only, no labels.
    pred_ds = tf.data.Dataset.from_tensor_slices(
        np.random.randn(100, 28, 28, 1))
    pred_ds = pred_ds.map(lambda data: tf.to_float(data))
    pred_ds = TFDataset.from_tf_data_dataset(pred_ds, batch_per_thread=16)
    model.predict(pred_ds).collect()
def test_tfdataset_with_dataframe(self):
    """Fit, predict and evaluate a Keras model on TFDatasets built from a Spark DataFrame.

    Fix: ``np.float`` was a deprecated alias for the builtin ``float``
    (removed in NumPy 1.24); use ``float`` directly so the test keeps
    working on current NumPy.
    """
    rdd = self.sc.range(0, 1000)
    # Each row is a (DenseVector feature, int label in [0, 10)) pair.
    df = rdd.map(lambda x: (DenseVector(
        np.random.rand(20).astype(float)), x % 10)).toDF(
        ["feature", "label"])
    train_df, val_df = df.randomSplit([0.7, 0.3])

    dataset = TFDataset.from_dataframe(train_df,
                                       feature_cols=["feature"],
                                       labels_cols=["label"],
                                       batch_size=32,
                                       validation_df=val_df)
    seq = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(20,)),
        tf.keras.layers.Dense(10, activation="softmax")
    ])
    seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    model = KerasModel(seq)
    model.fit(dataset)

    # Prediction: features only, inference batching.
    dataset = TFDataset.from_dataframe(val_df,
                                       feature_cols=["feature"],
                                       batch_per_thread=32)
    model.predict(dataset).collect()

    # Evaluation: features plus labels, inference batching.
    dataset = TFDataset.from_dataframe(val_df,
                                       feature_cols=["feature"],
                                       labels_cols=["label"],
                                       batch_per_thread=32)
    model.evaluate(dataset)
def input_fn(mode):
    """Build a TFDataset over the cat_dog image resources for the given mode."""
    import os
    resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
    labeled = (mode == tf.estimator.ModeKeys.TRAIN
               or mode == tf.estimator.ModeKeys.EVAL)
    if labeled:
        image_folder = os.path.join(resource_path, "cat_dog")
        image_set = ImageSet.read(image_folder, with_label=True,
                                  sc=self.sc, one_based_label=False)
        transformer = ChainedPreprocessing([
            ImageResize(256, 256),
            ImageRandomCrop(224, 224, True),
            ImageMatToTensor(format="NHWC"),
            ImageSetToSample(input_keys=["imageTensor"],
                             target_keys=["label"])])
        image_set = image_set.transform(transformer)
        dataset = TFDataset.from_image_set(image_set,
                                           image=(tf.float32, [224, 224, 3]),
                                           label=(tf.int32, [1]),
                                           batch_size=8)
    else:
        # Predict: glob the images directly, no labels attached.
        image_folder = os.path.join(resource_path, "cat_dog/*/*")
        image_set = ImageSet.read(image_folder, with_label=False,
                                  sc=self.sc, one_based_label=False)
        transformer = ChainedPreprocessing([
            ImageResize(256, 256),
            ImageRandomCrop(224, 224, True),
            ImageMatToTensor(format="NHWC"),
            ImageSetToSample(input_keys=["imageTensor"])])
        image_set = image_set.transform(transformer)
        dataset = TFDataset.from_image_set(image_set,
                                           image=(tf.float32, [224, 224, 3]),
                                           batch_per_thread=8)
    return dataset
def input_fn(mode):
    """Return an ndarray-backed TFDataset matching the estimator mode."""
    x = np.random.rand(20, 10)
    y = np.random.randint(0, 10, (20,))
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Training needs batch_size and labels.
        return TFDataset.from_ndarrays((x, y), batch_size=8)
    if mode == tf.estimator.ModeKeys.EVAL:
        # Evaluation needs batch_per_thread and labels.
        return TFDataset.from_ndarrays((x, y), batch_per_thread=1)
    # Prediction: features only.
    return TFDataset.from_ndarrays(x, batch_per_thread=1)
def input_fn(mode):
    """Build an ndarray TFDataset: train/eval carry labels, predict does not."""
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = TFDataset.from_ndarrays(get_data("train"), batch_size=320)
    elif mode == tf.estimator.ModeKeys.EVAL:
        dataset = TFDataset.from_ndarrays(get_data("test"), batch_per_thread=80)
    else:
        # Prediction keeps only the images, dropping the labels.
        images, _ = get_data("test")
        dataset = TFDataset.from_ndarrays(images, batch_per_thread=80)
    return dataset
def input_fn(mode):
    """Training-only input_fn over a FeatureSet of preprocessed images."""
    if mode != tf.estimator.ModeKeys.TRAIN:
        raise NotImplementedError
    image_set = self.get_raw_image_set(with_label=True)
    feature_set = FeatureSet.image_frame(image_set.to_image_frame())
    # Standard ImageNet-style augmentation + normalization pipeline.
    pipeline = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageRandomCrop(224, 224),
        ImageRandomPreprocessing(ImageHFlip(), 0.5),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(to_RGB=True, format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"])
    ])
    feature_set = feature_set.transform(pipeline)
    feature_set = feature_set.transform(ImageFeatureToSample())
    return TFDataset.from_feature_set(feature_set,
                                      features=(tf.float32, [224, 224, 3]),
                                      labels=(tf.int32, [1]),
                                      batch_size=8)
def test_tfdataset_with_tf_data_dataset_which_requires_table(self):
    """A tf.data pipeline that needs a lookup HashTable must still train via TFDataset."""
    keys = [1, 0, -1]
    ds = tf.data.Dataset.from_tensor_slices([1, 2, -1, 5] * 40)
    # Map each key to its reversed-position value; unknown keys -> 100.
    table = tf.contrib.lookup.HashTable(
        initializer=tf.contrib.lookup.KeyValueTensorInitializer(
            keys=keys, values=list(reversed(keys))),
        default_value=100)
    ds = ds.map(table.lookup)

    def attach_label(x):
        # Constant label so the data fits a classification loss.
        return tf.to_float(x), 1

    ds = ds.map(attach_label)
    dataset = TFDataset.from_tf_data_dataset(ds, batch_size=16)

    seq = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=()),
        tf.keras.layers.Dense(10, activation="softmax")
    ])
    seq.compile(optimizer=tf.keras.optimizers.RMSprop(),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
    model = KerasModel(seq)
    model.fit(dataset)
def test_tf_optimizer_with_sparse_gradient_using_keras(self):
    """Train a Keras embedding model (sparse gradients) with TFOptimizer,
    forcing non-resource variables via a custom variable creator.

    Fix: the original ended ``model.compile(...)`` with a stray
    line-continuation backslash immediately followed by the next statement,
    which is a SyntaxError; the backslash is removed.
    """
    import tensorflow as tf
    ids = np.random.randint(0, 10, size=[40])
    labels = np.random.randint(0, 5, size=[40])
    id_rdd = self.sc.parallelize(ids)
    label_rdd = self.sc.parallelize(labels)
    training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]])
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["ids", "labels"],
                                 shapes=[[], []],
                                 types=[tf.int32, tf.int32],
                                 batch_size=8)

    from tensorflow.python.ops import variable_scope

    def variable_creator(**kwargs):
        # Force non-resource variables so the embedding gradient stays
        # sparse (IndexedSlices) instead of being densified.
        kwargs["use_resource"] = False
        return variable_scope.default_variable_creator(None, **kwargs)

    getter = lambda next_creator, **kwargs: variable_creator(**kwargs)
    with variable_scope.variable_creator_scope(getter):
        words_input = tf.keras.layers.Input(shape=(), name='words_input')
        embedding_layer = tf.keras.layers.Embedding(input_dim=10,
                                                    output_dim=5,
                                                    name='word_embedding')
        word_embeddings = embedding_layer(words_input)
        embedding = tf.keras.layers.Flatten()(word_embeddings)
        output = tf.keras.layers.Dense(5, activation="softmax")(embedding)
        model = tf.keras.models.Model(inputs=[words_input], outputs=[output])
        model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy")
        optimizer = TFOptimizer.from_keras(model, dataset)
        optimizer.optimize()
def input_fn(mode):
    """Predict-only input_fn: wrap the single image as a batch of one."""
    if mode != tf.estimator.ModeKeys.PREDICT:
        raise NotImplementedError
    # image_array[None, ...] prepends a batch dimension of size 1.
    return TFDataset.from_ndarrays(image_array[None, ...])
def test_control_inputs(self):
    """tensor_with_value must feed different values during training vs validation."""
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels),
                                          batch_size=4,
                                          val_tensors=(features, labels))
        is_training = tf.placeholder(dtype=tf.bool, shape=())
        feature_tensor, label_tensor = dataset.tensors
        hidden = tf.layers.dense(feature_tensor, 8)
        # Dropout is the op actually gated by the is_training placeholder.
        hidden = tf.layers.dropout(hidden, training=is_training)
        output = tf.layers.dense(hidden, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                   labels=label_tensor))
        optimizer = TFOptimizer.from_loss(
            loss, Adam(),
            val_outputs=[output],
            val_labels=[label_tensor],
            val_method=Accuracy(),
            # True while training, False while validating.
            tensor_with_value={is_training: (True, False)},
            metrics={"loss": loss})
        optimizer.optimize(end_trigger=MaxEpoch(1))
        optimizer.sess.close()
def test_tf_net_predict_dataset(self):
    """TFNet.predict on a TFDataset should yield one 2-d output row per sample."""
    tfnet_path = os.path.join(TestTF.resource_path, "tfnet")
    net = TFNet.from_export_folder(tfnet_path)
    dataset = TFDataset.from_ndarrays((np.random.rand(16, 4),))
    predictions = np.stack(net.predict(dataset).collect())
    assert predictions.shape == (16, 2)
def test_tf_optimizer_metrics(self):
    """Per-scope optimizers: 'dense/' (Adam 1e-3) moves, 'dense_1/' (SGD lr=0.0) stays fixed."""
    features = np.random.randn(20, 10)
    labels = np.random.randint(0, 10, size=[20])
    with tf.Graph().as_default():
        dataset = TFDataset.from_ndarrays((features, labels),
                                          batch_size=4,
                                          val_tensors=(features, labels))
        feature_tensor, label_tensor = dataset.tensors
        hidden = tf.layers.dense(feature_tensor, 8)
        output = tf.layers.dense(hidden, 10)
        loss = tf.reduce_mean(
            tf.losses.sparse_softmax_cross_entropy(logits=output,
                                                   labels=label_tensor))
        optimizer = TFOptimizer.from_loss(
            loss,
            {"dense/": Adam(1e-3), "dense_1/": SGD(0.0)},
            val_outputs=[output],
            val_labels=[label_tensor],
            val_method=Accuracy(),
            metrics={"loss": loss})
        before = optimizer.tf_model.training_helper_layer.get_weights()
        optimizer.optimize(end_trigger=MaxEpoch(1))
        after = optimizer.tf_model.training_helper_layer.get_weights()
        # Weights and bias under "dense/" must be updated ...
        for i in [0, 1]:
            assert not np.allclose(before[i], after[i])
        # ... while those under "dense_1/" must be unchanged (lr = 0.0).
        for i in [2, 3]:
            assert np.allclose(before[i], after[i])
        optimizer.sess.close()
def test_tf_optimizer_with_sparse_gradient_using_keras(self):
    """Embedding lookups produce sparse gradients; TFOptimizer must train them."""
    import tensorflow as tf
    ids = np.random.randint(0, 10, size=[40])
    labels = np.random.randint(0, 5, size=[40])
    id_rdd = self.sc.parallelize(ids)
    label_rdd = self.sc.parallelize(labels)
    training_rdd = id_rdd.zip(label_rdd).map(lambda pair: [pair[0], pair[1]])
    dataset = TFDataset.from_rdd(training_rdd,
                                 names=["ids", "labels"],
                                 shapes=[[], []],
                                 types=[tf.int32, tf.int32],
                                 batch_size=8)

    # Tiny classifier: embedding -> flatten -> softmax over 5 classes.
    words_input = tf.keras.layers.Input(shape=(), name='words_input')
    embedding_layer = tf.keras.layers.Embedding(input_dim=10,
                                                output_dim=5,
                                                name='word_embedding')
    word_embeddings = embedding_layer(words_input)
    flattened = tf.keras.layers.Flatten()(word_embeddings)
    output = tf.keras.layers.Dense(5, activation="softmax")(flattened)
    model = tf.keras.models.Model(inputs=[words_input], outputs=[output])
    model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy")

    optimizer = TFOptimizer.from_keras(model, dataset)
    optimizer.optimize()
def input_fn(mode):
    """Training-only input_fn over the demo_small cat/dog image folder.

    Expected layout (label derived from the subfolder):
        demo_small/
            cats/   cat images ...
            dogs/   dog images ...
    """
    if mode != tf.estimator.ModeKeys.TRAIN:
        raise NotImplementedError
    image_set = ImageSet.read("./datasets/cat_dog/demo_small",
                              sc=sc,
                              with_label=True,
                              one_based_label=False)
    # Standard ImageNet-style augmentation + normalization pipeline.
    pipeline = ChainedPreprocessing([
        ImageBytesToMat(),
        ImageResize(256, 256),
        ImageRandomCrop(224, 224),
        ImageRandomPreprocessing(ImageHFlip(), 0.5),
        ImageChannelNormalize(0.485, 0.456, 0.406, 0.229, 0.224, 0.225),
        ImageMatToTensor(to_RGB=True, format="NHWC"),
        ImageSetToSample(input_keys=["imageTensor"], target_keys=["label"])
    ])
    feature_set = FeatureSet.image_frame(image_set.to_image_frame())
    feature_set = feature_set.transform(pipeline)
    return TFDataset.from_feature_set(feature_set,
                                      features=(tf.float32, [224, 224, 3]),
                                      labels=(tf.int32, [1]),
                                      batch_size=16)
def main(data_num):
    """Measure a restored LeNet checkpoint's accuracy on MNIST test data via TFPredictor."""
    sc = init_nncontext()

    # Load and normalize the first `data_num` test images.
    (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", "test")
    images_data = (images_data[:data_num] - mnist.TRAIN_MEAN) / mnist.TRAIN_STD
    labels_data = labels_data[:data_num].astype(np.int32)
    dataset = TFDataset.from_ndarrays((images_data, labels_data),
                                      batch_per_thread=20)

    # Wire the model directly onto the dataset's tensors.
    images, labels = dataset.tensors
    labels = tf.squeeze(labels)
    with slim.arg_scope(lenet.lenet_arg_scope()):
        logits, end_points = lenet.lenet(images, num_classes=10,
                                         is_training=False)
    predictions = tf.to_int32(tf.argmax(logits, axis=1))
    # 1 where the prediction matches the label, kept as a column vector.
    correct = tf.expand_dims(tf.to_int32(tf.equal(predictions, labels)),
                             axis=1)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, "/tmp/lenet/model")
        predictor = TFPredictor(sess, [correct])
        # Mean of the 0/1 correctness vector is the accuracy.
        accuracy = predictor.predict().mean()
        print("predict accuracy is %s" % accuracy)
def input_function():
    """Wrap the DataFrame's columns in a tf.data Dataset, then a TFDataset."""
    tf_ds = tf.data.Dataset.from_tensor_slices((dict(data_df),))
    return TFDataset.from_tf_data_dataset(dataset=tf_ds,
                                          batch_size=batch_size,
                                          batch_per_thread=batch_per_thread)
def input_fn(mode):
    """Seeded random RDD input_fn; predict mode drops the labels."""
    np.random.seed(20)
    x = np.random.rand(20, 10)
    y = np.random.randint(0, 10, (20))
    rdd_x = self.sc.parallelize(x)
    rdd_y = self.sc.parallelize(y)
    rdd = rdd_x.zip(rdd_y)
    labeled = (mode == tf.estimator.ModeKeys.TRAIN
               or mode == tf.estimator.ModeKeys.EVAL)
    if labeled:
        return TFDataset.from_rdd(rdd,
                                  features=(tf.float32, [10]),
                                  labels=(tf.int32, []))
    return TFDataset.from_rdd(rdd_x, features=(tf.float32, [10]))
def test_training_for_feature_set(self):
    """fit() should accept a TFDataset created from a FeatureSet."""
    model = self.create_image_model()
    feature_set = self.create_train_features_Set()
    dataset = TFDataset.from_feature_set(feature_set,
                                         features=(tf.float32, [224, 224, 3]),
                                         labels=(tf.int32, [1]),
                                         batch_size=8)
    model.fit(dataset)
def test_predict_for_imageset(self):
    """predict() on an unlabeled ImageSet yields a non-None result per image."""
    model = self.create_image_model()
    image_set = self.create_image_set(with_label=False)
    dataset = TFDataset.from_image_set(image_set,
                                       image=(tf.float32, [224, 224, 3]),
                                       batch_per_thread=1)
    results = model.predict(dataset).get_predict().collect()
    assert all(r[1] is not None for r in results)
def test_training_for_imageset(self):
    """fit() should accept a labeled ImageSet-backed TFDataset."""
    model = self.create_image_model()
    image_set = self.create_image_set(with_label=True)
    dataset = TFDataset.from_image_set(image_set,
                                       image=(tf.float32, [224, 224, 3]),
                                       label=(tf.int32, [1]),
                                       batch_size=4)
    model.fit(dataset)
def test_evaluation_for_imageset(self):
    """evaluate() should accept a labeled ImageSet-backed TFDataset."""
    model = self.create_image_model()
    image_set = self.create_image_set(with_label=True)
    dataset = TFDataset.from_image_set(image_set,
                                       image=(tf.float32, [224, 224, 3]),
                                       label=(tf.int32, [1]),
                                       batch_per_thread=1)
    model.evaluate(dataset)
def main(max_epoch):
    """Train a small dense MNIST classifier distributedly, then evaluate it."""
    sc = init_nncontext()

    training_rdd = get_data_rdd("train", sc)
    testing_rdd = get_data_rdd("test", sc)
    dataset = TFDataset.from_rdd(training_rdd,
                                 features=(tf.float32, [28, 28, 1]),
                                 labels=(tf.int32, []),
                                 batch_size=320,
                                 val_rdd=testing_rdd)

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(optimizer=tf.keras.optimizers.RMSprop(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    keras_model = KerasModel(model)
    keras_model.fit(dataset, epochs=max_epoch, distributed=True)

    eval_dataset = TFDataset.from_rdd(testing_rdd,
                                      features=(tf.float32, [28, 28, 1]),
                                      labels=(tf.int32, []),
                                      batch_per_thread=80)
    result = keras_model.evaluate(eval_dataset)
    print(result)
    # >> [0.08865142822265625, 0.9722]
    # The assert below is used for internal testing.
    assert result['acc Top1Accuracy'] > 0.95

    model.save_weights("/tmp/mnist_keras.h5")
def input_fn(mode):
    """MNIST estimator input_fn: labeled data for train/eval, features only for predict."""
    if mode == tf.estimator.ModeKeys.TRAIN:
        rdd = get_data_rdd("train", sc)
        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [28, 28, 1]),
                                     labels=(tf.int32, []),
                                     batch_size=320)
    elif mode == tf.estimator.ModeKeys.EVAL:
        rdd = get_data_rdd("test", sc)
        # NOTE(review): EVAL uses batch_size here, while other input_fns in
        # this file use batch_per_thread for evaluation — confirm intended.
        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [28, 28, 1]),
                                     labels=(tf.int32, []),
                                     batch_size=320)
    else:
        # Predict: strip the labels, keep only the image arrays.
        rdd = get_data_rdd("test", sc).map(lambda x: x[0])
        dataset = TFDataset.from_rdd(rdd,
                                     features=(tf.float32, [28, 28, 1]),
                                     batch_per_thread=80)
    return dataset
def create_ds(mode):
    """Build the DataFrame-backed TFDataset appropriate for ``mode``.

    Raises:
        ValueError: if ``mode`` is not one of "train", "predict", "evaluate".
    """
    if mode == "train":
        return TFDataset.from_dataframe(train_df,
                                        feature_cols=["feature"],
                                        labels_cols=["label"],
                                        batch_size=32,
                                        validation_df=val_df)
    if mode == "predict":
        return TFDataset.from_dataframe(val_df,
                                        feature_cols=["feature"],
                                        batch_per_thread=32)
    if mode == "evaluate":
        return TFDataset.from_dataframe(val_df,
                                        feature_cols=["feature"],
                                        labels_cols=["label"],
                                        batch_per_thread=32)
    raise ValueError("unrecognized mode: {}".format(mode))
def test_dataset_without_batch(self):
    """fit/evaluate/predict must reject TFDatasets built without batch info."""
    x = np.random.rand(20, 10)
    y = np.random.randint(0, 2, (20))
    rdd_x = self.sc.parallelize(x)
    rdd_y = self.sc.parallelize(y)
    rdd = rdd_x.zip(rdd_y)

    # No batch_size -> fit must fail with a clear message.
    dataset = TFDataset.from_rdd(rdd,
                                 features=(tf.float32, [10]),
                                 labels=(tf.int32, []),
                                 names=["features", "labels"],
                                 val_rdd=rdd)
    keras_model = self.create_model()
    model = KerasModel(keras_model)
    self.intercept(
        lambda: model.fit(dataset),
        "The batch_size of TFDataset must be"
        " specified when used in KerasModel fit.")

    # No batch_per_thread -> evaluate must fail.
    dataset = TFDataset.from_rdd(rdd,
                                 features=(tf.float32, [10]),
                                 labels=(tf.int32, []),
                                 names=["features", "labels"])
    self.intercept(
        lambda: model.evaluate(dataset),
        "The batch_per_thread of TFDataset must be "
        "specified when used in KerasModel evaluate.")

    # No batch_per_thread -> predict must fail.
    dataset = TFDataset.from_rdd(rdd_x,
                                 features=(tf.float32, [10]),
                                 names=["features", "labels"])
    self.intercept(
        lambda: model.predict(dataset),
        "The batch_per_thread of TFDataset must be"
        " specified when used in KerasModel predict.")
def create_predict_dataset(self):
    """A deterministic (seeded) feature-only TFDataset for predict tests."""
    np.random.seed(20)
    samples = np.random.rand(20, 10)
    rdd = self.sc.parallelize(samples)
    # Each element becomes a single-feature list, as from_rdd expects.
    rdd = rdd.map(lambda row: [row])
    return TFDataset.from_rdd(rdd,
                              features=(tf.float32, [10]),
                              batch_per_thread=1)
def test_tfdataset_with_string_rdd(self):
    """String RDD elements should flow through TFNet via a tf.string placeholder."""
    string_rdd = self.sc.parallelize(["123", "456"], 1)
    ds = TFDataset.from_string_rdd(string_rdd, batch_per_thread=1)
    input_tensor = tf.placeholder(dtype=tf.string, shape=(None,))
    # Graph just parses each string into its numeric value.
    output_tensor = tf.string_to_number(input_tensor)
    with tf.Session() as sess:
        tfnet = TFNet.from_session(sess, inputs=[input_tensor],
                                   outputs=[output_tensor])
        result = tfnet.predict(ds).collect()
        assert result[0] == 123
        assert result[1] == 456
def create_predict_dataset(self):
    """Seeded predict dataset using the legacy names/shapes/types TFDataset API."""
    np.random.seed(20)
    samples = np.random.rand(20, 10)
    rdd = self.sc.parallelize(samples)
    # Each element becomes a single-feature list, as from_rdd expects.
    rdd = rdd.map(lambda row: [row])
    return TFDataset.from_rdd(rdd,
                              names=["features"],
                              shapes=[[10]],
                              types=[tf.float32],
                              batch_per_thread=1)
def input_fn():
    """GAN input_fn: pair (noise, one-hot label) generator inputs with scaled images."""
    def to_gan_inputs(record):
        image = record['image']
        label = record['label']
        one_hot_label = tf.one_hot(label, depth=10)
        noise = tf.random.normal(mean=0.0, stddev=1.0, shape=(NOISE_DIM,))
        generator_inputs = (noise, one_hot_label)
        # Scale pixel values from [0, 255] to [-1, 1] for the discriminator.
        discriminator_inputs = ((tf.to_float(image) / 255.0) - 0.5) * 2
        return (generator_inputs, discriminator_inputs)

    ds = tfds.load("mnist", split="train")
    ds = ds.map(to_gan_inputs)
    return TFDataset.from_tf_data_dataset(ds, batch_size=36)
def create_evaluation_dataset(self):
    """A deterministic (seeded) labeled TFDataset for evaluation tests."""
    np.random.seed(20)
    x = np.random.rand(20, 10)
    y = np.random.randint(0, 2, (20))
    rdd_x = self.sc.parallelize(x)
    rdd_y = self.sc.parallelize(y)
    zipped = rdd_x.zip(rdd_y)
    return TFDataset.from_rdd(zipped,
                              features=(tf.float32, [10]),
                              labels=(tf.int32, []),
                              batch_per_thread=1)