def test_kafka_global_configuration(self):
    """Tests for KafkaDataset global configuration properties."""
    tf.compat.v1.disable_eager_execution()
    import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

    topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    cfg_list = ["debug=generic", "enable.auto.commit=false"]
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, config_global=cfg_list
    ).repeat(num_epochs)

    iterator = tf.compat.v1.data.Iterator.from_structure(repeat_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)
def test_kafka_wrong_topic_configuration_failed(self):
    """Tests for KafkaDataset wrong topic configuration properties."""
    tf.compat.v1.disable_eager_execution()
    import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

    topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    # Add wrong configuration
    wrong_cfg = ["auto.offset.reset=arliest"]
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, config_topic=wrong_cfg
    ).repeat(num_epochs)

    iterator = tf.compat.v1.data.Iterator.from_structure(repeat_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
        with self.assertRaises(tf.errors.InternalError):
            sess.run(get_next)
def kafka_dataset(servers, topic, offset, schema, eof=True):
    print("Create: ", "{}:0:{}".format(topic, offset))
    dataset = kafka_io.KafkaDataset(
        ["{}:0:{}".format(topic, offset)],
        servers=servers,
        group="cardata-autoencoder",
        eof=eof,
        config_global=kafka_config,
    )

    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

    # deserialize avro
    dataset = dataset.map(
        lambda e: kafka_io.decode_avro(
            e,
            schema=schema,
            dtype=[
                tf.float64, tf.float64, tf.float64, tf.float64, tf.float64,
                tf.float64, tf.float64, tf.float64, tf.float64,
                tf.int32, tf.int32, tf.int32, tf.int32,
                tf.float64, tf.float64, tf.float64, tf.float64,
                tf.int32, tf.string,
            ],
        )
    )
    return dataset
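# A minimal usage sketch for the helper above, not part of the original source.
# The broker address, topic name, schema file path, and the `kafka_config`
# property list are assumptions chosen for illustration.
kafka_config = ["enable.auto.commit=false", "debug=generic"]  # assumed librdkafka properties

with open("cardata-v1.avsc") as f:  # assumed avro schema file
    cardata_schema = f.read()

train_dataset = kafka_dataset(
    servers="localhost:9092", topic="cardata-v1", offset=0, schema=cardata_schema
).batch(32)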
def test_kafka_topic_configuration(self):
    """Tests for KafkaDataset topic configuration properties."""
    tf.compat.v1.disable_eager_execution()
    import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

    topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    cfg_list = ["auto.offset.reset=earliest"]
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, config_topic=cfg_list
    ).repeat(num_epochs)

    iterator = tf.compat.v1.data.Iterator.from_structure(repeat_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Use a wrong offset 100 here to make sure
        # configuration 'auto.offset.reset=earliest' works.
        sess.run(init_op, feed_dict={topics: ["test:0:100:-1"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
def test_kafka_dataset_save_and_restore(self):
    """Tests for KafkaDataset save and restore."""
    g = tf.Graph()
    with g.as_default():
        topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
        num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
        repeat_dataset = kafka_io.KafkaDataset(
            topics, group="test", eof=True
        ).repeat(num_epochs)

        iterator = repeat_dataset.make_initializable_iterator()
        get_next = iterator.get_next()

        it = tf.data.experimental.make_saveable_from_iterator(iterator)
        g.add_to_collection(tf.compat.v1.GraphKeys.SAVEABLE_OBJECTS, it)
        saver = tf.compat.v1.train.Saver()

        model_file = "/tmp/test-kafka-model"
        with self.cached_session() as sess:
            sess.run(
                iterator.initializer,
                feed_dict={topics: ["test:0:0:4"], num_epochs: 1},
            )
            for i in range(3):
                self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
            # Save current offset which is 2
            saver.save(sess, model_file, global_step=3)

        checkpoint_file = "/tmp/test-kafka-model-3"
        with self.cached_session() as sess:
            saver.restore(sess, checkpoint_file)
            # Restore current offset to 2
            for i in [2, 3]:
                self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
def test_kafka_dataset_with_key(self):
    """Tests for KafkaDataset."""
    topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    batch_size = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, message_key=True
    ).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = data.Iterator.from_structure(batch_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic test: read from topic 0.
        sess.run(init_op, feed_dict={topics: ["key-test:0:0:4"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(
                (("D" + str(i)).encode(), ("K" + str(i % 2)).encode()),
                sess.run(get_next),
            )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)
def test_kafka_dataset_with_offset(self):
    """Tests for KafkaDataset when reading non-keyed messages
    from a single-partitioned topic"""
    tf.compat.v1.disable_eager_execution()
    import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

    topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    batch_size = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, message_offset=True
    ).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = tf.compat.v1.data.Iterator.from_structure(batch_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic offset test: read a limited number of messages from the topic.
        sess.run(init_op, feed_dict={topics: ["offset-test:0:0:4"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(
                (("D" + str(i)).encode(), ("0:" + str(i)).encode()),
                sess.run(get_next),
            )
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)
def run():
    dataset = kafka_io.KafkaDataset(
        [TRAINING_TOPIC_NAME + ':0'],
        servers=KAFKA_BOOTSTRAP,
        group="dalelane-tensorflow-train",
        config_global=[
            "api.version.request=true",
            "sasl.mechanisms=PLAIN",
            "security.protocol=sasl_ssl",
            "sasl.username=token",
            # API key is read from config rather than hard-coded
            "sasl.password=" + EVENT_STREAMS_API_KEY,
            "ssl.ca.location=" + CERT,
        ],
    )
    dataset = dataset.map(deserialize).batch(1)

    # neural net definition taken from
    # https://www.tensorflow.org/tutorials/keras/classification#set_up_the_layers
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(10, activation="softmax"),
    ])
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    model.fit(dataset, epochs=4, steps_per_epoch=1000)
    return model
def check(self, images, predictions):
    import tensorflow_io.kafka as kafka_io

    f = kafka_io.KafkaDataset(topics=[self._topic], group="test", eof=True)
    lines = list(f)
    assert np.all(lines == predictions)
    assert len(lines) == len(images)
def test_kafka_output_sequence():
    """Test case based on fashion mnist tutorial"""
    fashion_mnist = tf.keras.datasets.fashion_mnist
    ((train_images, train_labels), (test_images, _)) = fashion_mnist.load_data()

    class_names = [
        'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
        'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
    ]

    train_images = train_images / 255.0
    test_images = test_images / 255.0

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation=tf.nn.relu),
        tf.keras.layers.Dense(10, activation=tf.nn.softmax),
    ])
    model.compile(
        optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']
    )
    model.fit(train_images, train_labels, epochs=5)

    class OutputCallback(tf.keras.callbacks.Callback):
        """KafkaOutputCallback"""

        def __init__(self, batch_size, topic, servers):
            self._sequence = kafka_io.KafkaOutputSequence(topic=topic, servers=servers)
            self._batch_size = batch_size

        def on_predict_batch_end(self, batch, logs=None):
            index = batch * self._batch_size
            for outputs in logs['outputs']:
                for output in outputs:
                    self._sequence.setitem(index, class_names[np.argmax(output)])
                    index += 1

        def flush(self):
            self._sequence.flush()

    channel = "e{}e".format(time.time())
    topic = "test_" + channel

    # By default batch size is 32
    output = OutputCallback(32, topic, "localhost")
    predictions = model.predict(test_images, callbacks=[output])
    output.flush()

    predictions = [class_names[v] for v in np.argmax(predictions, axis=1)]

    # Reading from `test_e(time)e` we should get the same result
    dataset = kafka_io.KafkaDataset(topics=[topic], group="test", eof=True)
    for entry, prediction in zip(dataset, predictions):
        assert entry.numpy() == prediction.encode()
def test_write_kafka(self):
    """test_write_kafka"""
    tf.compat.v1.disable_eager_execution()
    import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

    channel = "e{}e".format(time.time())

    # Start with reading test topic, replace `D` with `e(time)e`,
    # and write to `test_e(time)e` topic.
    dataset = kafka_io.KafkaDataset(topics=["test:0:0:4"], group="test", eof=True)
    dataset = dataset.map(
        lambda x: kafka_io.write_kafka(
            tf.strings.regex_replace(x, "D", channel), topic="test_" + channel
        )
    )
    iterator = dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic test: read from topic 0.
        sess.run(init_op)
        for i in range(5):
            self.assertEqual((channel + str(i)).encode(), sess.run(get_next))
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)

    # Reading from `test_e(time)e` we should get the same result
    dataset = kafka_io.KafkaDataset(topics=["test_" + channel], group="test", eof=True)
    iterator = dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        sess.run(init_op)
        for i in range(5):
            self.assertEqual((channel + str(i)).encode(), sess.run(get_next))
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)
def test_avro_kafka_dataset():
    """test_avro_kafka_dataset"""
    schema = (
        '{"type":"record","name":"myrecord","fields":'
        '[{"name":"f1","type":"string"},{"name":"f2","type":"long"}]}'
    )
    dataset = kafka_io.KafkaDataset(["avro-test:0"], group="avro-test", eof=True)

    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

    # deserialize avro
    dataset = dataset.map(
        lambda e: kafka_io.decode_avro(e, schema=schema, dtype=[tf.string, tf.int64])
    )
    entries = [(f1.numpy(), f2.numpy()) for (f1, f2) in dataset]
    np.all(entries == [('value1', 1), ('value2', 2), ('value3', 3)])
def get_train_data(boostrap_servers, kafka_topic, group, batch, decoder):
    """Obtains the data and labels for training from Kafka

       Args:
        boostrap_servers (str): list of bootstrap servers for the connection with Kafka
        kafka_topic (str): Kafka topic
        group (str): Kafka consumer group
        batch (int): batch size for training
        decoder (class): decoder to decode the data

       Returns:
        train_kafka: training data and labels from Kafka
    """
    logging.info(
        "Starts receiving training data from Kafka servers [%s] with topics [%s]",
        boostrap_servers,
        kafka_topic,
    )
    train_data = kafka_io.KafkaDataset(
        [kafka_topic],
        servers=boostrap_servers,
        group=group,
        eof=True,
        message_key=True,
    ).map(lambda x, y: decoder.decode(x, y)).batch(batch)

    return train_data
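# A minimal usage sketch for get_train_data, not part of the original source.
# The broker address, topic name, group name, and the RawDecoder class are all
# assumptions for illustration; any object exposing decode(x, y) that maps a
# message value and key to (features, label) tensors would do.
class RawDecoder:
    def decode(self, x, y):
        # decode the message value into a float32 feature vector
        features = tf.io.decode_raw(x, out_type=tf.float32)
        # decode the message key into an int32 label
        label = tf.strings.to_number(y, out_type=tf.int32)
        return features, label

train_data = get_train_data(
    boostrap_servers="localhost:9092",
    kafka_topic="train-topic",
    group="train-group",
    batch=32,
    decoder=RawDecoder(),
)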
def test_avro_kafka_dataset():
    """test_avro_kafka_dataset"""
    schema = (
        '{"type":"record","name":"myrecord","fields":['
        '{"name":"f1","type":"string"},'
        '{"name":"f2","type":"long"},'
        '{"name":"f3","type":["null","string"],"default":null}'
        "]}"
    )
    dataset = kafka_io.KafkaDataset(["avro-test:0"], group="avro-test", eof=True)

    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

    # deserialize avro
    dataset = dataset.map(
        lambda e: tfio.experimental.serialization.decode_avro(e, schema=schema)
    )
    entries = [(e["f1"], e["f2"], e["f3"]) for e in dataset]
    np.all(entries == [("value1", 1, ""), ("value2", 2, ""), ("value3", 3, "")])
def test_avro_kafka_dataset_with_resource():
    """test_avro_kafka_dataset_with_resource"""
    schema = (
        '{"type":"record","name":"myrecord","fields":['
        '{"name":"f1","type":"string"},'
        '{"name":"f2","type":"long"},'
        '{"name":"f3","type":["null","string"],"default":null}'
        "]}"
    )
    schema_resource = kafka_io.decode_avro_init(schema)
    dataset = kafka_io.KafkaDataset(["avro-test:0"], group="avro-test", eof=True)

    # remove kafka framing
    dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

    # deserialize avro
    dataset = dataset.map(
        lambda e: kafka_io.decode_avro(
            e, schema=schema_resource, dtype=[tf.string, tf.int64, tf.string]
        )
    )
    entries = [(f1.numpy(), f2.numpy(), f3.numpy()) for (f1, f2, f3) in dataset]
    np.all(entries == [("value1", 1, ""), ("value2", 2, ""), ("value3", 3, "")])
def test_kafka_wrong_global_configuration_failed(self):
    """Tests for KafkaDataset wrong global configuration properties."""
    topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    # Add wrong configuration
    wrong_cfg = ["debug=al"]
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, config_global=wrong_cfg
    ).repeat(num_epochs)

    iterator = data.Iterator.from_structure(repeat_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
        with self.assertRaises(errors.InternalError):
            sess.run(get_next)
def test_kafka_dataset(self):
    """Tests for KafkaDataset."""
    topics = tensorflow.compat.v1.placeholder(dtypes.string, shape=[None])
    num_epochs = tensorflow.compat.v1.placeholder(dtypes.int64, shape=[])
    batch_size = tensorflow.compat.v1.placeholder(dtypes.int64, shape=[])
    repeat_dataset = kafka_io.KafkaDataset(topics, group="test", eof=True).repeat(
        num_epochs
    )
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = data.Iterator.from_structure(batch_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic test: read from topic 0.
        sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read from topic 1.
        sess.run(init_op, feed_dict={topics: ["test:0:5:-1"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(("D" + str(i + 5)).encode(), sess.run(get_next))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read from both topics.
        sess.run(
            init_op,
            feed_dict={topics: ["test:0:0:4", "test:0:5:-1"], num_epochs: 1},
        )
        for j in range(2):
            for i in range(5):
                self.assertEqual(("D" + str(i + j * 5)).encode(), sess.run(get_next))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Test repeated iteration through both files.
        sess.run(
            init_op,
            feed_dict={topics: ["test:0:0:4", "test:0:5:-1"], num_epochs: 10},
        )
        for _ in range(10):
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        ("D" + str(i + j * 5)).encode(), sess.run(get_next)
                    )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Test batched and repeated iteration through both files.
        sess.run(
            init_batch_op,
            feed_dict={
                topics: ["test:0:0:4", "test:0:5:-1"],
                num_epochs: 10,
                batch_size: 5,
            },
        )
        for _ in range(10):
            self.assertAllEqual(
                [("D" + str(i)).encode() for i in range(5)], sess.run(get_next)
            )
            self.assertAllEqual(
                [("D" + str(i + 5)).encode() for i in range(5)], sess.run(get_next)
            )
def test_kafka_dataset(self):
    """Tests for KafkaDataset when reading non-keyed messages
    from a single-partitioned topic"""
    topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    batch_size = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    repeat_dataset = kafka_io.KafkaDataset(topics, group="test", eof=True).repeat(
        num_epochs
    )
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = data.Iterator.from_structure(batch_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic test: read a limited number of messages from the topic.
        sess.run(init_op, feed_dict={topics: ["test:0:0:4"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(("D" + str(i)).encode(), sess.run(get_next))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read all the messages from the topic from offset 5.
        sess.run(init_op, feed_dict={topics: ["test:0:5:-1"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(("D" + str(i + 5)).encode(), sess.run(get_next))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read from different subscriptions of the same topic.
        sess.run(
            init_op,
            feed_dict={topics: ["test:0:0:4", "test:0:5:-1"], num_epochs: 1},
        )
        for j in range(2):
            for i in range(5):
                self.assertEqual(("D" + str(i + j * 5)).encode(), sess.run(get_next))
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Test repeated iteration through both subscriptions.
        sess.run(
            init_op,
            feed_dict={topics: ["test:0:0:4", "test:0:5:-1"], num_epochs: 10},
        )
        for _ in range(10):
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        ("D" + str(i + j * 5)).encode(), sess.run(get_next)
                    )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Test batched and repeated iteration through both subscriptions.
        sess.run(
            init_batch_op,
            feed_dict={
                topics: ["test:0:0:4", "test:0:5:-1"],
                num_epochs: 10,
                batch_size: 5,
            },
        )
        for _ in range(10):
            self.assertAllEqual(
                [("D" + str(i)).encode() for i in range(5)], sess.run(get_next)
            )
            self.assertAllEqual(
                [("D" + str(i + 5)).encode() for i in range(5)], sess.run(get_next)
            )
def test_kafka_dataset_with_partitioned_key(self):
    """Tests for KafkaDataset when reading keyed-messages
    from a multi-partitioned topic"""
    topics = tf.compat.v1.placeholder(dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    batch_size = tf.compat.v1.placeholder(dtypes.int64, shape=[])
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, message_key=True
    ).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = data.Iterator.from_structure(batch_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic test: read first 5 messages from the first partition of the topic.
        # NOTE: The key-partition mapping occurs based on the order in which the data
        # is being stored in kafka. Please check kafka_test.sh for the sample data.
        sess.run(
            init_op,
            feed_dict={topics: ["key-partition-test:0:0:5"], num_epochs: 1},
        )
        for i in range(5):
            self.assertEqual(
                (("D" + str(i * 2)).encode(), (b"K0")),
                sess.run(get_next),
            )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read first 5 messages from the second partition of the topic.
        sess.run(
            init_op,
            feed_dict={topics: ["key-partition-test:1:0:5"], num_epochs: 1},
        )
        for i in range(5):
            self.assertEqual(
                (("D" + str(i * 2 + 1)).encode(), (b"K1")),
                sess.run(get_next),
            )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read from different subscriptions to the same topic.
        sess.run(
            init_op,
            feed_dict={
                topics: ["key-partition-test:0:0:5", "key-partition-test:1:0:5"],
                num_epochs: 1,
            },
        )
        for j in range(2):
            for i in range(5):
                self.assertEqual(
                    (("D" + str(i * 2 + j)).encode(), ("K" + str(j)).encode()),
                    sess.run(get_next),
                )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Test repeated iteration through both subscriptions.
        sess.run(
            init_op,
            feed_dict={
                topics: ["key-partition-test:0:0:5", "key-partition-test:1:0:5"],
                num_epochs: 10,
            },
        )
        for _ in range(10):
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        (("D" + str(i * 2 + j)).encode(), ("K" + str(j)).encode()),
                        sess.run(get_next),
                    )
        with self.assertRaises(errors.OutOfRangeError):
            sess.run(get_next)

        # Test batched and repeated iteration through both subscriptions.
        sess.run(
            init_batch_op,
            feed_dict={
                topics: ["key-partition-test:0:0:5", "key-partition-test:1:0:5"],
                num_epochs: 10,
                batch_size: 5,
            },
        )
        for _ in range(10):
            for j in range(2):
                self.assertAllEqual(
                    [
                        [("D" + str(i * 2 + j)).encode() for i in range(5)],
                        [("K" + str(j)).encode() for i in range(5)],
                    ],
                    sess.run(get_next),
                )
def func_x(x):
    # Decode image to (28, 28)
    x = tf.io.decode_raw(x, out_type=tf.uint8)
    x = tf.reshape(x, [28, 28])
    # Convert to float32 for tf.keras
    x = tf.image.convert_image_dtype(x, tf.float32)
    return x

def func_y(y):
    # Decode label to a scalar
    y = tf.io.decode_raw(y, out_type=tf.uint8)
    y = tf.reshape(y, [])
    return y

train_images = kafka_io.KafkaDataset(['xx:0'], group='xx', eof=True).map(func_x)
train_labels = kafka_io.KafkaDataset(['yy:0'], group='yy', eof=True).map(func_y)
train_kafka = tf.data.Dataset.zip((train_images, train_labels)).batch(1)
print(train_kafka)

# 3. Keras model
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# default: steps_per_epoch=12000
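# The snippet above stops right before training. A minimal continuation (an
# assumption, not from the original source) would fit on the zipped Kafka
# dataset, with steps_per_epoch matching the default noted in the comment above:
model.fit(train_kafka, epochs=1, steps_per_epoch=12000)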
def check(self, images, predictions):
    f = kafka_io.KafkaDataset(topics=[self._topic], group="test", eof=True)
    lines = [line for line in f]
    assert np.all(lines == predictions)
    assert len(lines) == len(images)
import tensorflow_io.kafka as kafka_io

# local dependencies
import train_model

# config values
from config import EVENT_STREAMS_API_KEY, CERT
from config import KAFKA_BOOTSTRAP, TEST_STREAM_TOPIC_NAME

print("-->>> Train a machine learning model using training data from Kafka")
model = train_model.run()

print("-->>> Prepare a streaming dataset based on a Kafka topic")
dataset = kafka_io.KafkaDataset(
    [TEST_STREAM_TOPIC_NAME + ":0"],
    servers=KAFKA_BOOTSTRAP,
    group="dalelane-tensorflow-test",
    eof=False,
    config_global=[
        "api.version.request=true",
        "sasl.mechanisms=PLAIN",
        "security.protocol=sasl_ssl",
        "sasl.username=token",
        # API key is read from config rather than hard-coded
        "sasl.password=" + EVENT_STREAMS_API_KEY,
        "ssl.ca.location=" + CERT,
    ],
)
dataset = dataset.map(train_model.deserialize).batch(1)

print("-->>> Start classifying events received on the topic %s" % TEST_STREAM_TOPIC_NAME)
for image, label in dataset:
    prediction = model.predict(image)
    if prediction.argmax() == label[0]:
        print("---->>>> ✓ Event classified correctly")
    else:
        print("---->>>> ✘ Event INCORRECTLY classified")
encoder = tf.keras.layers.Dense(
    encoding_dim,
    activation="tanh",
    activity_regularizer=tf.keras.regularizers.l1(learning_rate),
)(input_layer)
encoder = tf.keras.layers.Dense(hidden_dim, activation="relu")(encoder)
decoder = tf.keras.layers.Dense(hidden_dim, activation='tanh')(encoder)
decoder = tf.keras.layers.Dense(input_dim, activation='relu')(decoder)
autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=decoder)

autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')

# NOTE: KafkaDataset processing
def process_csv(entry):
    # "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
    return tf.io.decode_csv(entry, [[0.0], *[[0.0] for i in range(28)], [0.0], [""]])

creditcard_dataset = kafka_io.KafkaDataset(
    ['creditcard:0'], group='creditcard', eof=True
).batch(batch_size).map(process_csv)

def process_x_y(*entry):
    return (tf.stack(entry[0:30], 1), tf.strings.to_number(entry[30], out_type=tf.int32))

train_dataset = creditcard_dataset.map(process_x_y)
print(train_dataset)

# NOTE: model.fit()
# NOTE: "Time" and "Amount" are not transformed with the following yet
# df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
# df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))
# Runtime transformation of the above may require all data to be available,
# which may defeat the purpose of "streaming" data
autoencoder.fit(train_dataset, epochs=nb_epoch)
import tensorflow as tf
import tensorflow_io.kafka as kafka_io

with open('cardata-v1.avsc') as f:
    schema = f.read()

dataset = kafka_io.KafkaDataset(["cardata-v1:0"], group="cardata-v1", eof=True)

# remove kafka framing
dataset = dataset.map(lambda e: tf.strings.substr(e, 5, -1))

# deserialize avro
dataset = dataset.map(
    lambda e: kafka_io.decode_avro(
        e,
        schema=schema,
        dtype=[
            tf.float32, tf.float32, tf.float32, tf.float32, tf.float32,
            tf.float32, tf.float32, tf.float32, tf.float32,
            tf.int32, tf.int32, tf.int32, tf.int32,
            tf.float32, tf.float32, tf.float32, tf.float32,
            tf.int32,
        ],
    )
)

def normalize_fn(coolant_temp, intake_air_temp, intake_air_flow_speed,
                 battery_percentage, battery_voltage, current_draw, speed,
                 engine_vibration_amplitude, throttle_pos,
                 tire_pressure_1_1, tire_pressure_1_2,
                 tire_pressure_2_1, tire_pressure_2_2,
                 accelerometer_1_1_value, accelerometer_1_2_value,
                 accelerometer_2_1_value, accelerometer_2_2_value,
                 control_unit_firmware):
    tire_pressure_1_1 = tf.cast(tire_pressure_1_1, tf.float32)
def test_kafka_dataset_with_key(self):
    """Tests for KafkaDataset when reading keyed-messages
    from a single-partitioned topic"""
    tf.compat.v1.disable_eager_execution()
    import tensorflow_io.kafka as kafka_io  # pylint: disable=wrong-import-position

    topics = tf.compat.v1.placeholder(tf.dtypes.string, shape=[None])
    num_epochs = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    batch_size = tf.compat.v1.placeholder(tf.dtypes.int64, shape=[])
    repeat_dataset = kafka_io.KafkaDataset(
        topics, group="test", eof=True, message_key=True
    ).repeat(num_epochs)
    batch_dataset = repeat_dataset.batch(batch_size)

    iterator = tf.compat.v1.data.Iterator.from_structure(batch_dataset.output_types)
    init_op = iterator.make_initializer(repeat_dataset)
    init_batch_op = iterator.make_initializer(batch_dataset)
    get_next = iterator.get_next()

    with self.cached_session() as sess:
        # Basic test: read a limited number of keyed messages from the topic.
        sess.run(init_op, feed_dict={topics: ["key-test:0:0:4"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(
                (("D" + str(i)).encode(), ("K" + str(i % 2)).encode()),
                sess.run(get_next),
            )
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read all the keyed messages from the topic from offset 5.
        sess.run(init_op, feed_dict={topics: ["key-test:0:5:-1"], num_epochs: 1})
        for i in range(5):
            self.assertEqual(
                (("D" + str(i + 5)).encode(), ("K" + str((i + 5) % 2)).encode()),
                sess.run(get_next),
            )
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)

        # Basic test: read from different subscriptions of the same topic.
        sess.run(
            init_op,
            feed_dict={
                topics: ["key-test:0:0:4", "key-test:0:5:-1"],
                num_epochs: 1,
            },
        )
        for j in range(2):
            for i in range(5):
                self.assertEqual(
                    (
                        ("D" + str(i + j * 5)).encode(),
                        ("K" + str((i + j * 5) % 2)).encode(),
                    ),
                    sess.run(get_next),
                )
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)

        # Test repeated iteration through both subscriptions.
        sess.run(
            init_op,
            feed_dict={
                topics: ["key-test:0:0:4", "key-test:0:5:-1"],
                num_epochs: 10,
            },
        )
        for _ in range(10):
            for j in range(2):
                for i in range(5):
                    self.assertEqual(
                        (
                            ("D" + str(i + j * 5)).encode(),
                            ("K" + str((i + j * 5) % 2)).encode(),
                        ),
                        sess.run(get_next),
                    )
        with self.assertRaises(tf.errors.OutOfRangeError):
            sess.run(get_next)

        # Test batched and repeated iteration through both subscriptions.
        sess.run(
            init_batch_op,
            feed_dict={
                topics: ["key-test:0:0:4", "key-test:0:5:-1"],
                num_epochs: 10,
                batch_size: 5,
            },
        )
        for _ in range(10):
            self.assertAllEqual(
                [
                    [("D" + str(i)).encode() for i in range(5)],
                    [("K" + str(i % 2)).encode() for i in range(5)],
                ],
                sess.run(get_next),
            )
            self.assertAllEqual(
                [
                    [("D" + str(i + 5)).encode() for i in range(5)],
                    [("K" + str((i + 5) % 2)).encode() for i in range(5)],
                ],
                sess.run(get_next),
            )