def _map_fun(args, ctx):
    """Square fed batches in a distributed TF session, then raise on purpose.

    The "ps" node blocks in ``server.join()``.  A "worker" node squares every
    non-empty batch pulled from the data feed, pushes the results back, and
    raises a fake exception once feeding is over.
    """
    import tensorflow as tf

    cluster, server = TFNode.start_cluster_server(ctx)
    role = ctx.job_name
    if role == "ps":
        server.join()
    elif role == "worker":
        device_fn = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % ctx.task_index,
            cluster=cluster)
        with tf.device(device_fn):
            x = tf.placeholder(tf.int32, [None, 1])
            sq = tf.square(x)
            # The initializer op is only added to the graph; its handle is
            # not used afterwards.
            tf.global_variables_initializer()

        chief = ctx.task_index == 0
        with tf.train.MonitoredTrainingSession(is_chief=chief) as sess:
            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not (sess.should_stop() or tf_feed.should_stop()):
                batch = tf_feed.next_batch(10)
                if len(batch) == 0:
                    continue
                outputs = sess.run([sq], feed_dict={x: batch})
                tf_feed.batch_results(outputs[0])

        # simulate post-feed actions that raise an exception
        time.sleep(2)
        raise Exception("FAKE exception after feeding")
def test_datafeed(self):
    """Check that DataFeed hands out batches in order and stops at the marker."""
    mgr = TFManager.start('abc', ['input', 'output'], 'local')

    # insert 10 numbers followed by an end-of-feed marker (None)
    input_queue = mgr.get_queue('input')
    for item in list(range(10)) + [None]:
        input_queue.put(item)

    feed = TFNode.DataFeed(mgr)
    self.assertFalse(feed.done_feeding)

    # (requested batch size, expected length, expected sum)
    cases = [
        (2, 2, 1),     # [0,1]
        (4, 4, 14),    # [2,3,4,5]
        (10, 4, 30),   # [6,7,8,9]
    ]
    for requested, want_len, want_sum in cases:
        batch = feed.next_batch(requested)
        self.assertEqual(want_len, len(batch))
        self.assertEqual(want_sum, sum(batch))

    # should be done
    self.assertTrue(feed.should_stop())
def test_datafeed(self):
    """TFNode.DataFeed basic operations"""
    mgr = TFManager.start('abc', ['input', 'output'], 'local')

    # insert 10 numbers followed by an end-of-feed marker (None)
    input_queue = mgr.get_queue('input')
    for item in list(range(10)) + [None]:
        input_queue.put(item)

    feed = TFNode.DataFeed(mgr)

    # (requested batch size, expected length, expected sum)
    cases = [
        (2, 2, 1),     # [0,1]
        (4, 4, 14),    # [2,3,4,5]
        (10, 4, 30),   # [6,7,8,9]; asks for more than available
    ]
    for requested, want_len, want_sum in cases:
        # the feed must not report completion before each read
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(requested)
        self.assertEqual(len(batch), want_len)
        self.assertEqual(sum(batch), want_sum)

    # should be done
    self.assertTrue(feed.should_stop())
def __call__(self, args, ctx):
    """Start the TF server for this node, then build and run the model on workers.

    Records the task index and job name on the instance, starts the
    cluster/server pair and creates the data feed.  A "ps" node blocks in
    ``server.join()``; a "worker" node calls ``build_model()`` and then
    ``execute()``.  Any other job name does nothing further.
    """
    self.task_index = ctx.task_index
    self.job_name = ctx.job_name
    self.cluster, self.server = TFNode.start_cluster_server(ctx)
    self.tf_feed = TFNode.DataFeed(ctx.mgr)

    if ctx.job_name == "ps":
        self.server.join()
        return

    if ctx.job_name == "worker":
        self.build_model()
        self.execute()
def _map_fun(args, ctx):
    """Square each batch pulled from the data feed and send the result back."""
    import tensorflow as tf

    feed = TFNode.DataFeed(ctx.mgr, False)
    while True:
        if feed.should_stop():
            break
        numbers = feed.next_batch(batch_size=10)
        print("batch: {}".format(numbers))
        squared = tf.math.square(numbers)
        print("squares: {}".format(squared))
        feed.batch_results(squared.numpy())
def _map_fun(args, ctx):
    """Return the squares of the first non-empty batch, then raise on purpose."""
    import tensorflow as tf

    feed = TFNode.DataFeed(ctx.mgr, False)
    while not feed.should_stop():
        numbers = feed.next_batch(10)
        if len(numbers) == 0:
            continue
        squared = tf.math.square(numbers)
        feed.batch_results(squared.numpy())
        # simulate a failure while data is still being fed
        raise Exception("FAKE exception during feeding")
def _spark_train(args, ctx):
    """Basic linear regression in a distributed TF cluster using InputMode.SPARK

    The "ps" node blocks in ``server.join()``.  A "worker" node builds a
    one-layer linear model, trains it on batches from the data feed, and the
    chief then either saves a checkpoint (``args.model_dir``), exports a
    saved_model (``args.export_dir``), or prints a warning.

    Args:
      args: namespace providing ``input_mapping``, ``model_dir`` and
        ``export_dir``.
      ctx: node context providing ``job_name``, ``task_index`` and ``mgr``.
    """
    import tensorflow as tf
    from tensorflowonspark import TFNode

    tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

    cluster, server = TFNode.start_cluster_server(ctx)
    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % ctx.task_index,
                cluster=cluster)):
            x = tf.placeholder(tf.float32, [None, 2], name='x')
            y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
            w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
            y = tf.matmul(x, w, name='y')
            # extra/optional output for testing multiple output tensors; the
            # op is only added to the graph under the name "y2"
            tf.square(y, name="y2")
            cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
            optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)
            init_op = tf.global_variables_initializer()
            saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0), init_op=init_op)
        with sv.managed_session(server.target) as sess:
            tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
            while not sv.should_stop() and not tf_feed.should_stop():
                batch = tf_feed.next_batch(10)
                # Only run a training step when a non-empty, mapped batch was
                # received; otherwise there is nothing valid to feed (the
                # feed dict would be undefined or left over from the
                # previous iteration).
                if args.input_mapping and len(batch['x']) > 0:
                    feed = {x: batch['x'], y_: batch['y_']}
                    sess.run(optimizer, feed_dict=feed)

            if sv.is_chief:
                if args.model_dir:
                    # manually save checkpoint
                    ckpt_name = args.model_dir + "/model.ckpt"
                    print("Saving checkpoint to: {}".format(ckpt_name))
                    saver.save(sess, ckpt_name)
                elif args.export_dir:
                    # export a saved_model
                    signatures = {
                        'test_key': {
                            'inputs': {'features': x},
                            'outputs': {'prediction': y},
                            'method_name': 'test'
                        }
                    }
                    TFNode.export_saved_model(sess,
                                              export_dir=args.export_dir,
                                              tag_set='test_tag',
                                              signatures=signatures)
                else:
                    print("WARNING: model state not saved.")

        sv.stop()
def __call__(self, args, ctx):
    """Start the TF server for this node, then run the worker pipeline.

    Records the task index and job name on the instance, starts the
    cluster/server pair and creates the data feed.  A "ps" node blocks in
    ``server.join()``; a "worker" node runs ``create_tmp_dir()``,
    ``process()`` and ``delete_tmp_dir()`` in that order.  Any other job
    name does nothing further.
    """
    self.task_index = ctx.task_index
    self.job_name = ctx.job_name
    self.cluster, self.server = TFNode.start_cluster_server(ctx)
    self.tf_feed = TFNode.DataFeed(ctx.mgr)

    if ctx.job_name == "ps":
        self.server.join()
        return

    if ctx.job_name == "worker":
        self.create_tmp_dir()
        self.process()
        self.delete_tmp_dir()
def _map_fun(args, ctx):
    """Square every fed batch, then raise a fake exception after feeding ends."""
    import tensorflow as tf

    feed = TFNode.DataFeed(ctx.mgr, False)
    while not feed.should_stop():
        numbers = feed.next_batch(10)
        if len(numbers) == 0:
            continue
        squared = tf.math.square(numbers)
        feed.batch_results(squared.numpy())

    # simulate post-feed actions that raise an exception
    time.sleep(2)
    raise Exception("FAKE exception after feeding")
def _map_fun(args, ctx):
    """Square fed batches inside a Supervisor-managed distributed TF session.

    The "ps" node blocks in ``server.join()``.  A "worker" node squares each
    non-empty batch pulled from the data feed and pushes the results back.

    Args:
      args: unused.
      ctx: node context providing ``job_name``, ``task_index`` and ``mgr``.
    """
    import tensorflow as tf

    cluster, server = TFNode.start_cluster_server(ctx)
    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % ctx.task_index,
                    cluster=cluster)):
            x = tf.placeholder(tf.int32, [None, 1])
            sq = tf.square(x)
            init_op = tf.global_variables_initializer()

        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                 init_op=init_op)
        with sv.managed_session(server.target) as sess:
            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not sv.should_stop() and not tf_feed.should_stop():
                batch = tf_feed.next_batch(10)
                # An empty batch (e.g. at end-of-feed) cannot be fed into
                # the [None, 1] placeholder, so skip it.
                if len(batch) > 0:
                    outputs = sess.run([sq], feed_dict={x: batch})
                    tf_feed.batch_results(outputs[0])

        sv.stop()
def main_fun(args, ctx):
    """Train a Keras MNIST CNN from the Spark data feed and export it.

    Uses MultiWorkerMirroredStrategy, writes weight checkpoints under
    ``args.model_dir``, exports a saved_model under a timestamped
    sub-directory of ``args.export_dir`` and finally terminates the feed.
    """
    import numpy as np
    import tensorflow as tf
    from tensorflowonspark import compat, TFNode

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def build_and_compile_cnn_model():
        layers = [
            tf.keras.layers.Conv2D(32, 3, activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax'),
        ]
        cnn = tf.keras.Sequential(layers)
        cnn.compile(
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
            metrics=['accuracy'])
        return cnn

    tf_feed = TFNode.DataFeed(ctx.mgr, False)

    def rdd_generator():
        # Yield one (image, label) pair per record until the feed is done.
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                return
            record = batch[0]
            pixels = np.array(record[0]).astype(np.float32) / 255.0
            target = np.array(record[1]).astype(np.float32)
            yield (np.reshape(pixels, (28, 28, 1)),
                   np.reshape(target, (1, )))

    output_types = (tf.float32, tf.float32)
    output_shapes = (tf.TensorShape([28, 28, 1]), tf.TensorShape([1]))
    ds = tf.data.Dataset.from_generator(rdd_generator, output_types,
                                        output_shapes)
    ds = ds.batch(args.batch_size)

    # Checkpoint weights only, one file per epoch.  (The original author
    # notes that ModelCheckpoint(filepath=args.model_dir) alone fails.)
    tf.io.gfile.makedirs(args.model_dir)
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=args.model_dir + "/weights-{epoch:04d}",
        verbose=1,
        save_weights_only=True)
    callbacks = [checkpoint_cb]

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is
    # synchronous, so all workers must complete training before any of them
    # runs out of data from the RDD.  Since Spark RDD partitions (and
    # partition sizes) can be non-evenly divisible by num_workers, training
    # simply stops at 90% of the total expected number of steps.
    total_steps = 60000 / args.batch_size
    worker_steps = total_steps / ctx.num_workers
    capped_steps = worker_steps * 0.9

    multi_worker_model.fit(x=ds,
                           epochs=args.epochs,
                           steps_per_epoch=capped_steps,
                           callbacks=callbacks)

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    is_chief = ctx.job_name == 'chief'
    compat.export_saved_model(multi_worker_model, export_dir, is_chief)

    # terminating feed tells spark to skip processing further partitions
    tf_feed.terminate()
def map_fun(args, ctx):
    """Train a one-hidden-layer MNIST classifier on a TF cluster fed by Spark.

    The "ps" node blocks in ``server.join()``.  A "worker" node builds the
    graph, trains with batches pulled from ``TFNode.DataFeed`` until the
    supervisor stops, the feed stops, or ``args.steps`` is reached, and the
    chief optionally exports a saved_model to ``args.export_dir``.

    Args:
      args: namespace read for ``batch_size``, ``protocol``, ``model_dir``,
        ``input_mapping``, ``steps`` and ``export_dir``.
      ctx: node context read for ``worker_num``, ``job_name``,
        ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    IMAGE_PIXELS = 28

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = args.batch_size

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

    def feed_dict(batch):
        # Convert from dict of named arrays to two numpy arrays of the proper type:
        # images become float32 scaled by 1/255, labels become uint8.
        images = batch['image']
        labels = batch['label']
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                                    stddev=1.0 / IMAGE_PIXELS), name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # Placeholders for input data, fed via feed_dict in the training loop
            x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Cross-entropy summed over the batch; y is clipped to avoid log(0).
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model_dir)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=global_step,
                                 stop_grace_secs=300,
                                 save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down, the feed ends, or args.steps steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {x: batch_xs, y_: batch_ys}

                # Skip the training step when the feed returned no records.
                if len(batch_xs) > 0:
                    _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
                    # print accuracy every 100 steps
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys})))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)

            # Stop the feed if training ended before the feed did.
            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

            if sv.is_chief and args.export_dir:
                print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir))
                # exported signatures defined in code
                signatures = {
                    tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
                        'inputs': {'image': x},
                        'outputs': {'prediction': prediction},
                        'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
                    },
                    'featurize': {
                        'inputs': {'image': x},
                        'outputs': {'features': hid},
                        'method_name': 'featurize'
                    }
                }
                TFNode.export_saved_model(sess,
                                          args.export_dir,
                                          tf.saved_model.tag_constants.SERVING,
                                          signatures)
            else:
                # non-chief workers should wait for chief
                # NOTE(review): a chief started without args.export_dir also takes
                # this branch and waits for sv.should_stop() -- confirm intended.
                while not sv.should_stop():
                    print("Waiting for chief")
                    time.sleep(5)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def main_fun(args, ctx):
    """Train and export a Keras MLP for MNIST through tf.estimator.

    Training data comes from memory (``args.input_mode == 'tf'``) or from
    the Spark data feed (any other mode); evaluation data always comes from
    memory.  After training, a "done" file is written and the node waits up
    to 60 seconds for the other workers' done files.
    """
    IMAGE_PIXELS = 28
    num_classes = 10

    # use Keras API to load data
    from tensorflow.python.keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784).astype('float32') / 255
    x_test = x_test.reshape(10000, 784).astype('float32') / 255

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # setup a Keras model (layers are added in the same order so that the
    # auto-generated layer and input names stay the same)
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784, )))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))
    rmsprop = tf.train.RMSPropOptimizer(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=rmsprop,
                  metrics=['accuracy'])
    model.summary()

    print("model.inputs: {}".format(model.inputs))
    print("model.outputs: {}".format(model.outputs))

    # convert Keras model to tf.estimator
    estimator = tf.keras.estimator.model_to_estimator(
        model, model_dir=args.model_dir)

    # setup train_input_fn for InputMode.TENSORFLOW or InputMode.SPARK
    if args.input_mode == 'tf':
        # For InputMode.TENSORFLOW, just use data in memory
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"dense_input": x_train},
            y=y_train,
            batch_size=128,
            num_epochs=args.epochs,
            shuffle=True)
        hooks = []
    else:  # 'spark'
        # For InputMode.SPARK, read data from RDD
        tf_feed = TFNode.DataFeed(ctx.mgr)

        def rdd_generator():
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(1)
                if len(batch) == 0:
                    return
                row = batch[0]
                pixels = numpy.array(row[0]).astype(numpy.float32) / 255.0
                target = numpy.array(row[1]).astype(numpy.float32)
                yield (pixels, target)

        def train_input_fn():
            output_types = (tf.float32, tf.float32)
            output_shapes = (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                             tf.TensorShape([10]))
            dataset = tf.data.Dataset.from_generator(
                rdd_generator, output_types, output_shapes)
            return dataset.batch(args.batch_size)

        # add a hook to terminate the RDD data feed when the session ends
        hooks = [StopFeedHook(tf_feed)]

    # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK
    # can only feed one RDD at a time
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_input": x_test}, y=y_test, num_epochs=1, shuffle=False)

    # setup tf.estimator.train_and_evaluate() w/ FinalExporter
    feature_spec = {
        'dense_input': tf.placeholder(tf.float32, shape=[None, 784])
    }
    receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        feature_spec)
    exporter = tf.estimator.FinalExporter(
        "serving", serving_input_receiver_fn=receiver_fn)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.steps,
                                        hooks=hooks)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      exporters=exporter)

    # train and export model
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
    # wait for all other nodes to complete (via done files)
    done_dir = "{}/done".format(ctx.absolute_path(args.model_dir))
    print("Writing done file to: {}".format(done_dir))
    tf.gfile.MakeDirs(done_dir)
    with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index),
                        'w') as done_file:
        done_file.write("done")

    for i in range(60):
        finished = len(tf.gfile.ListDirectory(done_dir))
        expected = len(ctx.cluster_spec['worker'])
        if finished >= expected:
            print("{} All nodes done".format(datetime.now().isoformat()))
            break
        print("{} Waiting for other nodes {}".format(
            datetime.now().isoformat(), i))
        time.sleep(1)
def main_fun(args, ctx):
    """Train a Keras MLP for MNIST through tf.estimator and optionally export it.

    Training data comes from memory (``args.input_mode == 'tf'``) or from
    the Spark data feed (any other mode); evaluation data always comes from
    memory.  When ``args.export_dir`` is set, a saved_model is exported.
    """
    IMAGE_PIXELS = 28
    num_classes = 10

    # use Keras API to load data
    from tensorflow.python.keras.datasets import mnist
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(60000, 784).astype('float32') / 255
    x_test = x_test.reshape(10000, 784).astype('float32') / 255

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # setup a Keras model (layers are added in the same order so that the
    # auto-generated layer and input names stay the same)
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784, )))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])
    model.summary()

    # convert Keras model to tf.estimator
    estimator = tf.keras.estimator.model_to_estimator(
        model, model_dir=args.model_dir)

    # setup train_input_fn for InputMode.TENSORFLOW or InputMode.SPARK
    if args.input_mode == 'tf':
        train_input_fn = tf.estimator.inputs.numpy_input_fn(
            x={"dense_1_input": x_train},
            y=y_train,
            batch_size=128,
            num_epochs=None,
            shuffle=True)
    else:  # 'spark'
        tf_feed = TFNode.DataFeed(ctx.mgr)

        def rdd_generator():
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(1)
                if len(batch) == 0:
                    continue
                row = batch[0]
                pixels = numpy.array(row[0]).astype(numpy.float32) / 255.0
                target = numpy.array(row[1]).astype(numpy.float32)
                yield (pixels, target)

        def train_input_fn():
            output_types = (tf.float32, tf.float32)
            output_shapes = (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                             tf.TensorShape([10]))
            dataset = tf.data.Dataset.from_generator(
                rdd_generator, output_types, output_shapes)
            return dataset.batch(args.batch_size)

    # eval_input_fn ALWAYS uses data loaded in memory, since InputMode.SPARK
    # can only feed one RDD at a time
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_1_input": x_test},
        y=y_test,
        num_epochs=args.epochs,
        shuffle=False)

    # setup tf.estimator.train_and_evaluate()
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # nothing left to do unless an export_dir was provided
    if not args.export_dir:
        return

    def serving_input_receiver_fn():
        """Input receiver taking a batch of serialized tf.Example strings."""
        serialized_tf_example = tf.placeholder(dtype=tf.string,
                                               shape=[args.batch_size],
                                               name='input_example_tensor')
        feature_spec = {
            'dense_1_input': tf.FixedLenFeature(784, tf.string)
        }
        parsed = tf.parse_example(serialized_tf_example, feature_spec)
        return tf.estimator.export.ServingInputReceiver(
            parsed, {'dense_1_input': serialized_tf_example})

    estimator.export_savedmodel(args.export_dir, serving_input_receiver_fn)
def main_fun(args, ctx):
    """Train an MNIST CNN with tf.estimator and MultiWorkerMirroredStrategy.

    Training data is read from the Spark data feed; evaluation data is read
    from tensorflow_datasets.  The node whose job name is 'chief' exports a
    saved_model to ``args.export_dir`` after training.

    Args:
      args: namespace read for ``batch_size``, ``learning_rate``,
        ``model_dir``, ``epochs`` and ``export_dir``.
      ctx: node context read for ``mgr``, ``num_workers`` and ``job_name``.
    """
    import numpy as np
    import tensorflow as tf
    import tensorflow_datasets as tfds
    from tensorflowonspark import TFNode

    tfds.disable_progress_bar()

    class StopFeedHook(tf.estimator.SessionRunHook):
        """SessionRunHook to terminate InputMode.SPARK RDD feeding if the training loop exits before the entire RDD is consumed."""

        def __init__(self, feed):
            self.feed = feed

        def end(self, session):
            self.feed.terminate()
            self.feed.next_batch(1)

    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        # Yield one (image, label) pair per record until the feed is done.
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    def input_fn(mode, input_context=None):
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Note: Spark is responsible for sharding/repeating/shuffling the data via RDD
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
            return ds.batch(BATCH_SIZE)
        else:
            # read evaluation data from tensorflow_datasets directly
            def scale(image, label):
                image = tf.cast(image, tf.float32) / 255.0
                return image, label

            # With with_info=True, tfds.load returns a (datasets, info)
            # tuple, so it must be unpacked before indexing by split name.
            mnist, _ = tfds.load(name='mnist',
                                 with_info=True,
                                 as_supervised=True)
            ds = mnist['test']
            if input_context:
                ds = ds.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
            return ds.map(scale).batch(BATCH_SIZE)

    def serving_input_receiver_fn():
        features = tf.compat.v1.placeholder(dtype=tf.float32,
                                            shape=[None, 28, 28, 1],
                                            name='conv2d_input')
        receiver_tensors = {'conv2d_input': features}
        return tf.estimator.export.ServingInputReceiver(
            receiver_tensors, receiver_tensors)

    def model_fn(features, labels, mode):
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        # The final layer applies softmax, so these are class probabilities.
        probabilities = model(features, training=False)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # The 'logits' key is kept so the exported signature is unchanged.
            predictions = {'logits': probabilities}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE)
        # from_logits must be False because the model output is already
        # softmax-normalized; from_logits=True would apply softmax twice.
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=False,
            reduction=tf.keras.losses.Reduction.NONE)(labels, probabilities)
        loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(
                loss, tf.compat.v1.train.get_or_create_global_step()))

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    config = tf.estimator.RunConfig(train_distribute=strategy,
                                    save_checkpoints_steps=100)

    classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=args.model_dir,
                                        config=config)

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
    # so we need to ensure that all workers complete training before any of them run out of data from the RDD.
    # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers,
    # we'll just stop training at 90% of the total expected number of steps.
    steps = 60000 * args.epochs / args.batch_size
    steps_per_worker = steps / ctx.num_workers
    max_steps_per_worker = steps_per_worker * 0.9

    tf.estimator.train_and_evaluate(
        classifier,
        train_spec=tf.estimator.TrainSpec(input_fn=input_fn,
                                          max_steps=max_steps_per_worker,
                                          hooks=[StopFeedHook(tf_feed)]),
        eval_spec=tf.estimator.EvalSpec(input_fn=input_fn))

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir,
                                      serving_input_receiver_fn)
def map_fun(args, ctx):
    """Train or run inference for an MNIST classifier using a tf.data feed.

    The "ps" node blocks in ``server.join()``.  A "worker" node wraps the
    Spark data feed in a ``tf.data.Dataset`` generator.  In "train" mode
    (``args.mode == "train"``) it runs training steps until the supervisor
    stops, the feed stops, or ``args.steps`` is reached; in any other mode
    it sends label/prediction strings back through the feed.

    Args:
      args: namespace read for ``rdma``, ``mode``, ``batch_size``,
        ``model`` and ``steps``.
      ctx: node context read for ``worker_num``, ``job_name``,
        ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    # Create generator for Spark data feed
    tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                # No record was returned (end of feed): finish the generator
                # instead of indexing into an empty list.
                return
            row = batch[0]
            image = numpy.array(row[0])
            image = image.astype(numpy.float32) / 255.0
            label = numpy.array(row[1])
            label = label.astype(numpy.int64)
            yield (image, label)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Dataset for input data
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                 tf.TensorShape([10]))).batch(args.batch_size)
            iterator = ds.make_one_shot_iterator()
            x, y_ = iterator.get_next()

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS), name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal(
                [hidden_units, 10],
                stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Cross-entropy summed over the batch; y is clipped to avoid log(0).
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)

            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num,
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            # Inference: no init_op is passed and periodic saving is
            # disabled (save_model_secs=0).
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down, the feed ends, or
            # args.steps steps have completed.
            step = 0
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and step < args.steps):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                if args.mode == "train":
                    _, summary, step = sess.run(
                        [train_op, summary_op, global_step])
                    # print accuracy every 100 steps
                    if step % 100 == 0:
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy)))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, preds, acc = sess.run(
                        [label, prediction, accuracy])

                    results = [
                        "{0} Label: {1}, Prediction: {2}".format(
                            datetime.now().isoformat(), lbl, pred)
                        for lbl, pred in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))

            # Stop the feed if the loop ended before the feed did.
            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
    """Run one node of a distributed TF 1.x CNN job under TensorFlowOnSpark.

    A "ps" node starts its server and blocks in ``server.join()``.  A "worker"
    node builds a small three-convolution classifier over flattened
    ``IMAGE_PIXELS x IMAGE_PIXELS x channels`` inputs and then either trains
    on, or runs inference over, batches pulled from the Spark data feed.

    Args:
        args: Parsed options.  This function reads ``args.rdma``,
            ``args.model``, ``args.steps`` and ``args.mode`` ("train" and
            "retrain" both train; any other value is treated as inference).
        ctx: TensorFlowOnSpark node context.  This function reads
            ``worker_num``, ``job_name``, ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    IMAGE_PIXELS = 10   # image side length (adjust to your own image size)
    channels = 3
    num_class = 2
    dropout = 0.5       # passed to tf.nn.dropout as the keep probability
    learning_rate = 1e-6
    batch_size = 200    # fixed here; args.batch_size is deliberately not used
    is_training = args.mode in ("train", "retrain")

    # Delay PS nodes a bit, since workers seem to reserve GPUs more
    # quickly/reliably (w/o conflict).
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        """Convert ``[(image, label), ...]`` into two numpy arrays.

        Images are min-max normalised per row.  An empty batch (returned by
        the feed once the data is exhausted) yields two empty arrays so the
        caller's ``len(batch_xs) > 0`` check can skip it.
        """
        if len(batch) == 0:
            # numpy.max(xs, 1) would raise on a 1-D empty array.
            return (numpy.zeros((0,), dtype=numpy.float32),
                    numpy.zeros((0,), dtype=numpy.uint8))

        numpy.random.shuffle(batch)  # shuffle samples within the batch
        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]

        xs = numpy.array(images).astype(numpy.float32)
        # Min-max normalisation per sample (row).
        min_ = numpy.min(xs, axis=1, keepdims=True)
        span = numpy.max(xs, axis=1, keepdims=True) - min_
        # A constant row has zero span; divide by 1 instead so it maps to 0
        # rather than NaN.
        span[span == 0] = 1.0
        xs = (xs - min_) / span

        ys = numpy.array(labels).astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            def conv2d(x, W, b, strides=1):
                """Conv2D wrapper with bias and relu activation."""
                x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1],
                                 padding='SAME')
                x = tf.nn.bias_add(x, b)
                return tf.nn.relu(x)

            def maxpool2d(x, k=2):
                """k x k max-pool with stride k and SAME padding."""
                return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                                      strides=[1, k, k, 1], padding='SAME')

            def maxpool2d2(x, k=2):
                """k x k max-pool with stride k and VALID padding."""
                return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                                      strides=[1, k, k, 1], padding='VALID')

            def variable(name, shape):
                # NOTE(review): the l2_loss regularizer only registers a term
                # in the REGULARIZATION_LOSSES collection; `loss` below does
                # not add that collection, so it has no effect on training.
                return tf.get_variable(
                    name, shape, dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=tf.nn.l2_loss)

            # Layer weights & biases (3x3 kernels).
            weights = {
                'wc1': variable('wc1', [3, 3, channels, 64]),
                'wc2': variable('wc2', [3, 3, 64, 128]),
                'wc4': variable('wc4', [3, 3, 128, num_class]),
            }
            biases = {
                'bc1': variable('bc1', [64]),
                'bc2': variable('bc2', [128]),
                'bc4': variable('bc4', [num_class]),
            }

            # Placeholders for input data
            x = tf.placeholder(
                tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                name="x")
            y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])

            # Convolutional model.
            conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
            conv1 = maxpool2d(conv1, k=2)

            conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
            conv2 = maxpool2d(conv2, k=2)
            if is_training:
                # Dropout must only be active while training; applying it at
                # inference time randomly perturbs the predictions.
                conv2 = tf.nn.dropout(conv2, dropout)

            conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
            conv4 = maxpool2d2(conv4, k=2)
            y = tf.reshape(conv4, [-1, num_class])

            global_step = tf.Variable(0, name="global_step", trainable=False)

            loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

            train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32), name="accuracy")

            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and
        # stores model state into HDFS.
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=1)
        elif args.mode == "retrain":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down, the feed is exhausted or
            # args.steps steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, is_training)
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and step < args.steps):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details
                # on how to perform *synchronous* training.
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0:
                    if is_training:
                        _, step = sess.run([train_op, global_step],
                                           feed_dict=feed)
                        # Print accuracy every 100 steps.
                        if step % 100 == 0:
                            print("{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(accuracy,
                                         {x: batch_xs, y_: batch_ys})))
                    else:  # args.mode == "inference"
                        labels, preds, acc = sess.run(
                            [label, prediction, accuracy], feed_dict=feed)

                        results = [
                            "{0} Label: {1}, Prediction: {2}".format(
                                datetime.now().isoformat(), l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        print("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
    """Run one node of a distributed softmax-regression job (4 features, 3 classes).

    A "ps" node starts its server and blocks in ``server.join()``.  A "worker"
    node builds ``softmax(x W + b)`` and then trains on, or runs inference
    over, batches pulled from the Spark data feed.  Each fed record is read
    as ``record[0:4]`` (features) and ``record[4]`` (integer class label).

    Args:
        args: Parsed options.  This function reads ``args.batch_size``,
            ``args.rdma``, ``args.model``, ``args.steps`` and ``args.mode``
            ("train" trains; any other value is treated as inference).
        ctx: TensorFlowOnSpark node context.  This function reads
            ``worker_num``, ``job_name``, ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Delay PS nodes a bit, since workers seem to reserve GPUs more
    # quickly/reliably (w/o conflict).
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    batch_size = args.batch_size

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def dense_to_one_hot(labels_dense, num_classes):
        """Convert class labels from scalars to one-hot vectors."""
        num_labels = labels_dense.shape[0]
        index_offset = numpy.arange(num_labels) * num_classes
        labels_one_hot = numpy.zeros((num_labels, num_classes))
        # The sum is cast back to an integer type before it is used as an
        # index.
        flat_index = (index_offset + labels_dense.ravel()).astype(numpy.int32)
        labels_one_hot.flat[flat_index] = 1
        return labels_one_hot

    def feed_dict(batch):
        """Convert fed records into (features, one-hot labels) numpy arrays."""
        images = [item[0:4] for item in batch]
        labels = [item[4] for item in batch]
        xs = numpy.array(images).astype(numpy.float32)
        ys = dense_to_one_hot(numpy.array(labels, dtype=numpy.uint), 3)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
            # Network
            x = tf.placeholder(tf.float32, [None, 4])
            W = tf.Variable(tf.zeros([4, 3]))
            b = tf.Variable(tf.zeros([3]))
            y = tf.nn.softmax(tf.matmul(x, W) + b)
            y_ = tf.placeholder(tf.float32, [None, 3])

            # Cross-entropy loss.  The probabilities are clipped away from 0
            # so that log() cannot produce -inf/NaN once an output saturates.
            cross_entropy = -tf.reduce_sum(
                y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

            global_step = tf.Variable(0)
            train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
                cross_entropy, global_step=global_step)

            # Test trained model: argmax over axis 1 gives the class index of
            # each row for both the one-hot labels and the predictions.
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32), name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and
        # stores model state into HDFS.
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter(
            "tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=1)
        else:
            # Previously `sv` was left undefined outside "train" mode, which
            # raised NameError below.  Inference never saves checkpoints.
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down, the feed is exhausted or
            # args.steps steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and step < args.steps):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details
                # on how to perform *synchronous* training.
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0:
                    if args.mode == "train":
                        _, summary, step = sess.run(
                            [train_op, summary_op, global_step],
                            feed_dict=feed)
                        # Print accuracy every 100 steps.
                        if step % 100 == 0:
                            print("{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(accuracy,
                                         {x: batch_xs, y_: batch_ys})))

                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                    else:  # args.mode == "inference"
                        # Hand one result per input record back to the feed,
                        # mirroring the sibling map_fun implementations.
                        labels, preds, acc = sess.run(
                            [label, prediction, accuracy], feed_dict=feed)

                        results = [
                            "{0} Label: {1}, Prediction: {2}".format(
                                datetime.now().isoformat(), l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        print("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def main_fun(args, ctx):
    """Train and evaluate a small Keras iris classifier as a TF Estimator.

    Training data is streamed from Spark through ``TFNode.DataFeed``; the
    evaluation data is a 20% split of scikit-learn's iris dataset loaded
    locally in this function.

    Args:
        args: Parsed options.  This function reads ``args.model_dir``,
            ``args.batch_size``, ``args.epochs`` and ``args.steps``.
        ctx: TensorFlowOnSpark node context.  Only ``ctx.mgr`` is read.
    """
    iris = datasets.load_iris()
    X = preprocessing.scale(iris.data)
    Y = iris.target

    train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)
    print(train_X.shape, test_X.shape, train_Y.shape, test_Y.shape)

    model = Sequential()
    model.add(Dense(12, input_shape=(4,), activation='relu'))
    model.add(Dense(3, input_shape=(12,), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='SGD',
                  metrics=['accuracy'])
    model.summary()

    estimator = tf.keras.estimator.model_to_estimator(
        model, model_dir=args.model_dir)

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        """Yield one ``(features, label)`` pair per record fed by Spark."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                record = batch[0]
                # astype() needs a dtype; the previous `numpy.array` argument
                # is a function and made astype() raise TypeError.
                features = np.array(record[0]).astype(np.float32)
                label = np.array(record[1]).astype(np.float32)
                yield (features, label)

    def train_input_fn():
        """Wrap the Spark feed generator in a batched tf.data.Dataset."""
        # NOTE(review): the label shape is declared as [3] while the model is
        # compiled with a *sparse* categorical loss -- confirm what shape the
        # Spark job actually feeds in record[1].
        ds = tf.data.Dataset.from_generator(
            rdd_generator,
            (tf.float32, tf.float32),  # `tf.array` is not a TensorFlow dtype
            (tf.TensorShape([4]), tf.TensorShape([3])))
        ds = ds.batch(args.batch_size)
        return ds

    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_input": test_X},
        y=test_Y,
        num_epochs=args.epochs,
        shuffle=False
    )

    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=args.steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # Single-record input function; built here but not consumed by anything
    # in this function.
    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"dense_input": test_X[:1]},
        y=test_Y[:1],
        batch_size=1,
        shuffle=False
    )
def map_fun(args, ctx):
    """Run one node of the distributed MNIST MLP job under TensorFlowOnSpark.

    A "ps" node starts its server and blocks in ``server.join()``.  A "worker"
    node builds a one-hidden-layer network over 28x28 inputs, trains or runs
    inference on batches from the Spark data feed, and finally copies the
    files found under the local ``/tmp/mnist_model`` directory to HDFS.

    Args:
        args: Parsed options.  This function reads ``args.batch_size``,
            ``args.rdma``, ``args.model``, ``args.steps`` and ``args.mode``
            ("train" trains; any other value is treated as inference).
        ctx: TensorFlowOnSpark node context.  This function reads
            ``worker_num``, ``job_name``, ``task_index``, ``cluster_spec``
            and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec  # read here, not used further below
    IMAGE_PIXELS = 28

    # PS nodes are held back a little, since workers seem to reserve GPUs
    # more quickly/reliably (w/o conflict).
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = args.batch_size

    # TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def writeFileToHDFS():
        """Copy every file under the local model directory into HDFS."""
        rootdir = '/tmp/mnist_model'
        client = HdfsClient(hosts='localhost:50070')
        client.mkdirs('/user/root/mnist_model')
        for parent, dirnames, filenames in os.walk(rootdir):
            # One progress line per sub-directory of `parent`.
            for _ in dirnames:
                print("parent is:{0}".format(parent))
            for filename in filenames:
                local_path = os.path.join(parent, filename)
                remote_path = os.path.join('/user/root/mnist_model', filename)
                client.copy_from_local(local_path, remote_path,
                                       overwrite=True)

    def feed_dict(batch):
        """Turn ``[(image, label), ...]`` into typed numpy arrays."""
        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        xs = numpy.array(images).astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels).astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        server.join()
        return
    if job_name != "worker":
        return

    # Ops are assigned to the local worker by default.
    device_setter = tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)
    with tf.device(device_setter):
        # Hidden layer variables
        hid_w = tf.Variable(
            tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
        tf.summary.histogram("hidden_weights", hid_w)

        # Softmax layer variables
        sm_w = tf.Variable(
            tf.truncated_normal([hidden_units, 10],
                                stddev=1.0 / math.sqrt(hidden_units)),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
        tf.summary.histogram("softmax_weights", sm_w)

        # Input placeholders
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS],
                           name="x")
        y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

        x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
        tf.summary.image("x_img", x_img)

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

        global_step = tf.Variable(0)

        loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
        tf.summary.scalar("loss", loss)

        optimizer = tf.train.AdagradOptimizer(0.01)
        train_op = optimizer.minimize(loss, global_step=global_step)

        # Evaluation ops
        label = tf.argmax(y_, 1, name="label")
        prediction = tf.argmax(y, 1, name="prediction")
        correct_prediction = tf.equal(prediction, label)

        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                  name="accuracy")
        tf.summary.scalar("acc", accuracy)

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()

    # The supervisor oversees training and stores model state into HDFS.
    logdir = "hdfs:///tmp/" + args.model
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                           graph=tf.get_default_graph())

    training = args.mode == "train"
    if training:
        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=global_step,
                                 summary_writer=summary_writer,
                                 stop_grace_secs=300,
                                 save_model_secs=10)
    else:
        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=global_step,
                                 stop_grace_secs=300,
                                 save_model_secs=0)

    # The supervisor handles session initialization, checkpoint restore and
    # closing when done or when an error occurs.
    with sv.managed_session(server.target) as sess:
        print("{0} session ready".format(datetime.now().isoformat()))

        # Keep going until the supervisor shuts down, the feed ends, or
        # args.steps steps have completed.
        step = 0
        tf_feed = TFNode.DataFeed(ctx.mgr, training)
        while not (sv.should_stop() or tf_feed.should_stop()
                   or step >= args.steps):
            # Training steps run asynchronously; see
            # `tf.train.SyncReplicasOptimizer` for *synchronous* training.
            batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
            feed = {x: batch_xs, y_: batch_ys}

            if len(batch_xs) == 0:
                continue

            if training:
                _, summary, step = sess.run(
                    [train_op, summary_op, global_step], feed_dict=feed)
                # Report accuracy every 100 steps.
                if step % 100 == 0:
                    batch_accuracy = sess.run(
                        accuracy, {x: batch_xs, y_: batch_ys})
                    print("{0} step: {1} accuracy: {2}".format(
                        datetime.now().isoformat(), step, batch_accuracy))

                if sv.is_chief:
                    summary_writer.add_summary(summary, step)
            else:  # args.mode == "inference"
                labels, preds, acc = sess.run(
                    [label, prediction, accuracy], feed_dict=feed)

                results = []
                for l, p in zip(labels, preds):
                    results.append("{0} Label: {1}, Prediction: {2}".format(
                        datetime.now().isoformat(), l, p))
                tf_feed.batch_results(results)
                print("acc: {0}".format(acc))

        if sv.should_stop() or step >= args.steps:
            tf_feed.terminate()
        writeFileToHDFS()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def _spark_train(args, ctx):
    """Basic linear regression in a distributed TF cluster using InputMode.SPARK.

    A "ps" node blocks in ``server.join()``.  A "worker" node fits
    ``y = x . w`` with gradient descent on batches of 10 records pulled from
    the Spark feed; when ``args.export_dir`` is set, the chief additionally
    exports a saved_model at session end.

    Args:
        args: Options; reads ``export_dir``, ``model_dir`` and
            ``input_mapping``.
        ctx: TensorFlowOnSpark node context; reads ``job_name``,
            ``task_index``, ``mgr`` and calls ``absolute_path``.
    """
    import tensorflow as tf
    from tensorflowonspark import TFNode

    class ExportHook(tf.train.SessionRunHook):
        """Session hook that exports a saved_model when the session ends."""

        def __init__(self, export_dir, input_tensor, output_tensor):
            self.export_dir = export_dir
            self.input_tensor = input_tensor
            self.output_tensor = output_tensor

        def end(self, session):
            started_at = datetime.now().isoformat()
            print("{} ======= Exporting to: {}".format(started_at,
                                                       self.export_dir))
            predict_signature = {
                'inputs': {'features': self.input_tensor},
                'outputs': {'prediction': self.output_tensor},
                'method_name':
                    tf.saved_model.signature_constants.PREDICT_METHOD_NAME,
            }
            signatures = {"test_key": predict_signature}
            TFNode.export_saved_model(session, self.export_dir, "test_tag",
                                      signatures)
            print("{} ======= Done exporting".format(
                datetime.now().isoformat()))

    # Reset the graph in case we're re-using a Spark python worker.
    tf.reset_default_graph()

    cluster, server = TFNode.start_cluster_server(ctx)
    if ctx.job_name == "ps":
        server.join()
        return
    if ctx.job_name != "worker":
        return

    device_setter = tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)
    with tf.device(device_setter):
        x = tf.placeholder(tf.float32, [None, 2], name='x')
        y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
        w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
        y = tf.matmul(x, w, name='y')
        # Extra/optional output for testing multiple output tensors.
        y2 = tf.square(y, name="y2")
        global_step = tf.train.get_or_create_global_step()
        cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
        optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(
            cost, global_step)

    # The export hook is only installed when an export directory was given.
    if args.export_dir:
        chief_hooks = [ExportHook(ctx.absolute_path(args.export_dir), x, y)]
    else:
        chief_hooks = []

    session = tf.train.MonitoredTrainingSession(
        master=server.target,
        is_chief=(ctx.task_index == 0),
        checkpoint_dir=args.model_dir,
        chief_only_hooks=chief_hooks)
    with session as sess:
        tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
        while not (sess.should_stop() or tf_feed.should_stop()):
            batch = tf_feed.next_batch(10)
            # Batches are only trained on when an input mapping is
            # configured and the batch actually carries records.
            if args.input_mapping and len(batch['x']) > 0:
                feed = {x: batch['x'], y_: batch['y_']}
                sess.run(optimizer, feed_dict=feed)
def map_fun(args, ctx):
    """Run one node of a distributed TF 1.x CNN job (4-channel 10x10 inputs).

    A "ps" node starts its server and blocks in ``server.join()``.  A "worker"
    node builds a two-convolution + fully-connected classifier and then
    trains on, or runs inference over, batches pulled from the Spark data
    feed.  Progress is written both to stdout and to the enclosing module's
    ``log`` logger.

    Args:
        args: Parsed options.  This function reads ``args.batch_size``,
            ``args.rdma``, ``args.model``, ``args.steps`` and ``args.mode``
            ("train" trains; any other value is treated as inference).
        ctx: TensorFlowOnSpark node context.  This function reads
            ``worker_num``, ``job_name``, ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    IMAGE_PIXELS = 10   # image side length (adjust to your own image size)
    channels = 4
    num_class = 2

    # Delay PS nodes a bit, since workers seem to reserve GPUs more
    # quickly/reliably (w/o conflict).
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    batch_size = args.batch_size  # samples per training batch

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        """Convert ``[(image, label), ...]`` into two typed numpy arrays."""
        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        xs = numpy.array(images).astype(numpy.float32)
        xs = xs / 255.0  # scale pixel values
        ys = numpy.array(labels).astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            # ---- Model definition (replace with your own model) ----

            def conv2d(x, W, b, strides=1):
                """Conv2D wrapper with bias and relu activation."""
                x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1],
                                 padding='SAME')
                x = tf.nn.bias_add(x, b)
                return tf.nn.relu(x)

            def maxpool2d(x, k=2):
                """k x k max-pool with stride k and SAME padding."""
                return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                                      strides=[1, k, k, 1], padding='SAME')

            # Side length after two SAME-padded 2x2 poolings.
            pooled = 1 + IMAGE_PIXELS // 4

            # Layer weights & biases
            weights = {
                # 5x5 conv, `channels` inputs, 32 outputs
                'wc1': tf.Variable(tf.random_normal([5, 5, channels, 32])),
                # 5x5 conv, 32 inputs, 64 outputs
                'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
                # fully connected, pooled*pooled*64 inputs, 1024 outputs
                'wd1': tf.Variable(
                    tf.random_normal([pooled * pooled * 64, 1024])),
                # 1024 inputs, num_class outputs (class prediction)
                'out': tf.Variable(tf.random_normal([1024, num_class])),
            }
            biases = {
                'bc1': tf.Variable(tf.random_normal([32])),
                'bc2': tf.Variable(tf.random_normal([64])),
                'bd1': tf.Variable(tf.random_normal([1024])),
                'out': tf.Variable(tf.random_normal([num_class])),
            }

            # Placeholders for input data
            x = tf.placeholder(
                tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                name="x")
            y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])

            # Convolutional model
            conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
            conv1 = maxpool2d(conv1, k=2)
            conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
            conv2 = maxpool2d(conv2, k=2)

            fc1 = tf.reshape(
                conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
            fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
            fc1 = tf.nn.relu(fc1)
            if args.mode == "train":
                fc1 = tf.nn.dropout(fc1, 0.7)
            y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

            global_step = tf.Variable(0, name="global_step", trainable=False)

            loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(
                tf.cast(correct_prediction, tf.float32), name="accuracy")

            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()

            # ---- End of model definition ----

        # Create a "supervisor", which oversees the training process and
        # stores model state into HDFS.
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            logging.basicConfig(level=logging.INFO)
            print("{0} session ready".format(datetime.now().isoformat()))
            log.info("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down, the feed is exhausted or
            # args.steps steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and step < args.steps):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details
                # on how to perform *synchronous* training.
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0:
                    if args.mode == "train":
                        _, step = sess.run([train_op, global_step],
                                           feed_dict=feed)
                        # Report accuracy every 100 steps.
                        if step % 100 == 0:
                            # Evaluate once and reuse the message: with
                            # dropout in the training graph, two separate
                            # runs can report different values to stdout
                            # and to the log.
                            message = "{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(accuracy,
                                         {x: batch_xs, y_: batch_ys}))
                            print(message)
                            log.info(message)
                    else:  # args.mode == "inference"
                        labels, preds, acc = sess.run(
                            [label, prediction, accuracy], feed_dict=feed)

                        results = [
                            "{0} Label: {1}, Prediction: {2}".format(
                                datetime.now().isoformat(), l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        print("acc: {0}".format(acc))
                        log.info("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def main_fun(args, ctx):
    """Train an MNIST CNN with ``tf.estimator`` on data streamed from Spark.

    Records are pulled one at a time from ``TFNode.DataFeed`` and exposed to
    TensorFlow as a generator-backed ``tf.data.Dataset``.  Training is
    distributed with ``ParameterServerStrategy``; once training returns, the
    node whose job name is ``'chief'`` exports a saved_model.

    Args:
        args: parsed arguments; this function reads ``batch_size``,
            ``learning_rate``, ``model_dir`` and ``export_dir``.
        ctx: node context; this function reads ``ctx.mgr`` and ``ctx.job_name``.
    """
    import numpy as np
    import tensorflow as tf
    import tensorflow_datasets as tfds
    from tensorflowonspark import TFNode

    tfds.disable_progress_bar()

    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        """Yield ``(image, label)`` pairs until the Spark feed is exhausted."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                # An empty batch means there is nothing left to consume.
                return
            example = batch[0]
            image = np.array(example[0]).astype(np.float32) / 255.0
            image = np.reshape(image, (28, 28, 1))
            label = np.array(example[1]).astype(np.float32)
            label = np.reshape(label, (1, ))
            yield (image, label)

    def input_fn(mode, input_context=None):
        """Return the training dataset; any other mode is rejected.

        Raises:
            Exception: if ``mode`` is not ``ModeKeys.TRAIN``.
        """
        if mode != tf.estimator.ModeKeys.TRAIN:
            raise Exception("I'm evaluating: mode={}, input_context={}".format(
                mode, input_context))
        # Note: Spark is responsible for feeding data via streaming RDD
        ds = tf.data.Dataset.from_generator(
            rdd_generator, (tf.float32, tf.float32),
            (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
        return ds.batch(BATCH_SIZE)

    def serving_input_receiver_fn():
        """Describe the serving signature: a float batch of 28x28x1 images."""
        features = tf.compat.v1.placeholder(dtype=tf.float32,
                                            shape=[None, 28, 28, 1],
                                            name='features')
        receiver_tensors = {'features': features}
        return tf.estimator.export.ServingInputReceiver(
            receiver_tensors, receiver_tensors)

    def model_fn(features, labels, mode):
        """Build the EstimatorSpec for PREDICT, EVAL or TRAIN."""
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            # No softmax here: the loss below is built with from_logits=True
            # and applies softmax itself.  Emitting probabilities from this
            # layer would apply softmax twice.  The layer's variables are
            # unchanged, so existing checkpoints remain loadable.
            tf.keras.layers.Dense(10),
        ])
        logits = model(features, training=False)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True,
            reduction=tf.keras.losses.Reduction.NONE)(labels, logits)
        # Per-example losses are summed and scaled by the nominal batch size.
        loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(
                loss, tf.compat.v1.train.get_or_create_global_step()))

    # Note: the original example used MultiWorkerMirroredStrategy which is a synchronous training strategy.
    # Since streaming data arrives irregularly, we must use the asynchronous ParameterServerStrategy
    # to allow data to be processed as it arrives and to avoid deadlocks.
    strategy = tf.distribute.experimental.ParameterServerStrategy()
    config = tf.estimator.RunConfig(train_distribute=strategy,
                                    save_checkpoints_steps=100)

    classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=args.model_dir,
                                        config=config)

    tf.estimator.train_and_evaluate(
        classifier,
        train_spec=tf.estimator.TrainSpec(input_fn=input_fn),
        eval_spec=tf.estimator.EvalSpec(input_fn=input_fn))

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir,
                                      serving_input_receiver_fn)
def mainFun(args, ctx):
    """Fit a Keras CNN on records streamed from Spark and export the result.

    The model is built under ``MultiWorkerMirroredStrategy`` and fed from
    ``TFNode.DataFeed`` one record at a time; after ``fit`` returns, the model
    is exported to a timestamped directory under ``args.export_dir`` and the
    feed is terminated.

    Args:
        args: parsed arguments; this function reads ``export_dir``.
        ctx: node context; this function reads ``ctx.mgr`` and ``ctx.job_name``.
    """
    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Input, BatchNormalization
    import tensorflow.keras as keras
    from tensorflowonspark import compat, TFNode

    # Distribution strategy shared by every worker.
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def buildAndCompileModel():
        """Assemble and compile the two-stage conv net (30-way softmax output)."""
        stack = (
            Input(shape=(1025, 50, 1)),
            # First convolution / pooling stage
            Conv2D(16, kernel_size=[3, 3], activation='relu', data_format='channels_last'),
            MaxPool2D(pool_size=[3, 3], data_format='channels_last'),
            Dropout(0.2),
            # Second convolution / pooling stage
            Conv2D(32, kernel_size=[3, 3], activation='relu', data_format='channels_last'),
            MaxPool2D(pool_size=[3, 3], data_format='channels_last'),
            Dropout(0.2),
            # Flatten the feature maps for the dense head
            Flatten(),
            BatchNormalization(),
            Dense(128, activation='relu'),
            BatchNormalization(),
            Dense(128, activation='relu'),
            # Output layer
            Dense(30, activation='softmax'),
        )
        net = Sequential()
        for layer in stack:
            net.add(layer)

        net.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        return net

    # Data feed used to pull records pushed by Spark (second argument: False).
    tfFeed = TFNode.DataFeed(ctx.mgr, False)

    def rddGenerator():
        """Yield ``(features, label)`` for each record until the feed stops."""
        while not tfFeed.should_stop():
            rows = tfFeed.next_batch(1)
            if len(rows) == 0:
                return
            record = rows[0]

            # Field 1 holds the features, field 0 the label.
            features = np.array(record[1]).astype(np.float32)
            label = np.array(record[0])

            # NOTE(review): np.unique is applied to this single record's label,
            # so the inverse index looks like it is always 0 regardless of the
            # label value -- confirm whether a dataset-wide encoding was meant.
            _, label = np.unique(label, return_inverse=True)
            label = label.astype(np.float32)

            features = features.reshape(-1, 50, 1)

            yield (features, label)

    # Wrap the generator in a dataset of single-example batches.
    output_types = (tf.float32, tf.float32)
    output_shapes = (tf.TensorShape([1025, 50, 1]), tf.TensorShape([1]))
    ds = tf.data.Dataset.from_generator(rddGenerator, output_types, output_shapes)
    ds = ds.batch(1)

    # Variables must be created inside the strategy scope.
    with strategy.scope():
        multiWorkerModel = buildAndCompileModel()

    # Training schedule (plain divisions, so these values are floats).
    stepsPerEpoch = 600 / 1
    stepsPerWorker = stepsPerEpoch / 1
    maxStepsPerWorker = stepsPerWorker * 0.9  # computed but not used below

    multiWorkerModel.fit(x = ds, epochs = 2, steps_per_epoch = stepsPerWorker)

    # Export the trained model into a timestamped sub-directory.
    from tensorflow_estimator.python.estimator.export import export_lib
    exportDir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multiWorkerModel, exportDir, ctx.job_name == 'chief')

    # terminating feed tells spark to skip processing further partitions
    tfFeed.terminate()
def _spark_train(args, ctx):
    """Fit a one-unit linear regression on Spark-fed data (InputMode.SPARK).

    Reads ``input_mapping``, ``batch_size``, ``model_dir``, ``epochs`` and
    ``export_dir`` from ``args``; reads ``mgr``, ``num_workers`` and
    ``job_name`` from ``ctx``.  Weights are checkpointed every epoch and the
    chief optionally exports a saved_model.
    """
    import tensorflow as tf
    from tensorflowonspark import TFNode

    tf.compat.v1.reset_default_graph()
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    with strategy.scope():
        model = Sequential()
        model.add(Dense(1, activation='linear', input_shape=[2]))
        adam = tf.keras.optimizers.Adam(lr=0.2)
        model.compile(optimizer=adam, loss='mse', metrics=['mse'])
        model.summary()

    tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)

    def rdd_generator():
        """Yield one ``(features, label)`` pair per record until the feed ends."""
        while not tf_feed.should_stop():
            columns = tf_feed.next_batch(1)
            if len(columns['x']) == 0:
                return
            yield (columns['x'][0], columns['y_'][0])

    element_types = (tf.float32, tf.float32)
    element_shapes = (tf.TensorShape([2]), tf.TensorShape([1]))
    ds = tf.data.Dataset.from_generator(rdd_generator, element_types, element_shapes)
    ds = ds.batch(args.batch_size)

    # Turn off dataset auto-sharding.
    ds_options = tf.data.Options()
    ds_options.experimental_distribute.auto_shard = False
    ds = ds.with_options(ds_options)

    # Train on only 90% of each epoch so uneven RDD partition sizes cannot starve a worker.
    examples_per_step = args.batch_size * ctx.num_workers
    steps_per_epoch = 1000 * 0.9 // examples_per_step

    tf.io.gfile.makedirs(args.model_dir)
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=args.model_dir + "/weights-{epoch:04d}",
        verbose=1,
        load_weights_on_restart=True,
        save_weights_only=True)

    # `fit_generator` is not supported for models compiled with tf.distribute.Strategy,
    # so the dataset is passed to `fit` directly.
    model.fit(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_cb])

    is_chief = ctx.job_name == 'chief'
    if is_chief and args.export_dir:
        print("exporting model to: {}".format(args.export_dir))
        tf.keras.experimental.export_saved_model(model, args.export_dir)

    tf_feed.terminate()
def map_fun(args, ctx):
    """Run one node (ps or worker) of a replicated TF1 CNN classifier job.

    A ``ps`` node starts its server and blocks in ``server.join()``.  A
    ``worker`` node builds the graph, creates a ``tf.train.Supervisor`` and
    consumes batches from ``TFNode.DataFeed`` until the supervisor or the feed
    stops, or ``args.steps`` global steps have been reached.

    Args:
        args: parsed arguments; this function reads ``dropout``,
            ``batch_size``, ``learning_rate``, ``rdma``, ``mode`` ("train",
            "retrain", "test", anything else is handled as inference),
            ``model`` and ``steps``.
        ctx: node context; this function reads ``worker_num``, ``job_name``,
            ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import tensorflow as tf
    from tensorflow.contrib.layers.python.layers import batch_norm
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    IMAGE_PIXELS = 2  # side length of the square input image
    channels = 3
    num_class = 2

    # Fed to the `keep` placeholder, i.e. used as the keep probability of
    # tf.nn.dropout during training.
    dropout = args.dropout
    batch_size = args.batch_size  # number of samples per training batch
    INITIAL_LEARNING_RATE = args.learning_rate

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        """Convert ``[(image, label), ...]`` into ``(xs, ys)`` numpy arrays.

        ``xs`` is float32 and min-max scaled per row; ``ys`` is uint8, or
        uint16 when ``args.mode == 'inference'``.  An empty batch yields two
        empty arrays so the caller's ``len(batch_xs) > 0`` guard can skip it.
        """
        if len(batch) == 0:
            # The per-row min/max below would raise on an empty array.
            return (numpy.empty((0,), dtype=numpy.float32), numpy.empty((0,)))

        if args.mode != 'inference':
            numpy.random.shuffle(batch)  # shuffles the caller's list in place

        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]

        xs = numpy.array(images).astype(numpy.float32)

        # Min-max normalization of every row to [0, 1].
        row_max = numpy.max(xs, 1).reshape(-1, 1)
        row_min = numpy.min(xs, 1).reshape(-1, 1)
        span = row_max - row_min
        # A constant row has zero range; divide by 1 so it maps to 0, not NaN.
        span[span == 0] = 1.0
        xs = (xs - row_min) / span

        ys = numpy.array(labels)
        if args.mode != 'inference':
            ys = ys.astype(numpy.uint8)
        else:
            ys = ys.astype(numpy.uint16)
        return (xs, ys)

    def batch_norm_layer(inputT, is_training=True, scope=None):
        """Batch-norm + ReLU whose train/eval branch is chosen by a bool tensor.

        Not referenced by the graph built below.
        """
        # Note: is_training is tf.placeholder(tf.bool) type
        return tf.cond(
            is_training,
            lambda: batch_norm(inputT, is_training=True, center=True, scale=True,
                               activation_fn=tf.nn.relu, decay=0.9, scope=scope),
            lambda: batch_norm(inputT, is_training=False, center=True, scale=True,
                               activation_fn=tf.nn.relu, decay=0.9, scope=scope))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            def conv2d(x, W, b, strides=1):
                """Conv2D wrapper with bias add and ReLU activation."""
                x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1],
                                 padding='SAME')
                x = tf.nn.bias_add(x, b)
                return tf.nn.relu(x)

            def maxpool2d(x, k=2):
                """Max-pool wrapper with a k x k window and stride k."""
                return tf.nn.max_pool(x, ksize=[1, k, k, 1],
                                      strides=[1, k, k, 1], padding='SAME')

            # Layer weights and biases.  'wc2'/'bc2' are not used by the
            # forward pass below, but they are still created so the set of
            # variables handled by the Saver does not change.
            weights = {
                # 3x3 conv, `channels` inputs, 128 outputs
                'wc1': tf.get_variable(
                    'wc1', [3, 3, channels, 128],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=tf.nn.l2_loss),
                # 3x3 conv, 32 inputs, 64 outputs
                'wc2': tf.get_variable(
                    'wc2', [3, 3, 32, 64],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=tf.nn.l2_loss),
                # fully connected: flattened pooled conv1 output -> 1024
                'wd1': tf.Variable(
                    tf.random_normal([
                        (IMAGE_PIXELS // 2) * (IMAGE_PIXELS // 2) * 128, 1024
                    ])),
                # 1024 inputs, num_class outputs (class prediction)
                'out': tf.Variable(tf.random_normal([1024, num_class])),
            }

            biases = {
                'bc1': tf.get_variable(
                    'bc1', [128],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=tf.nn.l2_loss),
                'bc2': tf.get_variable(
                    'bc2', [64],
                    dtype=tf.float32,
                    initializer=tf.truncated_normal_initializer,
                    regularizer=tf.nn.l2_loss),
                'bd1': tf.Variable(tf.random_normal([1024])),
                'out': tf.Variable(tf.random_normal([num_class])),
            }

            # Placeholders for input data
            x = tf.placeholder(
                tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                name="x")
            if args.mode != 'inference':
                y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
            else:
                # In inference mode the label columns are passed through as-is.
                y_ = tf.placeholder(tf.float32, [None, 4], name="y_")
                label = y_

            keep = tf.placeholder(tf.float32)
            is_training = tf.placeholder(tf.bool, name='MODE')

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])
            x_img = tf.nn.lrn(x_img, depth_radius=5, bias=2.0, alpha=1e-3,
                              beta=0.75)  # local response normalization

            # Single convolution stage: conv -> max-pool -> LRN
            conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
            conv1 = maxpool2d(conv1, k=2)
            conv1 = tf.nn.lrn(conv1, depth_radius=5, bias=2.0, alpha=1e-3,
                              beta=0.75)

            # Fully connected layer with ReLU and dropout
            fc1 = tf.reshape(conv1,
                             [-1, weights['wd1'].get_shape().as_list()[0]])
            fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
            fc1 = tf.nn.relu(fc1)
            fc1 = tf.nn.dropout(fc1, keep)

            # Unscaled class scores
            y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
            prediction = tf.argmax(y, 1, name="prediction")

            global_step = tf.Variable(0, name="global_step", trainable=False)

            if args.mode != 'inference':
                loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                            logits=y))
                learning_rate = tf.train.polynomial_decay(
                    INITIAL_LEARNING_RATE, global_step,
                    decay_steps=3000000, end_learning_rate=1e-5,
                    power=0.8, cycle=True)
                train_op = tf.train.GradientDescentOptimizer(
                    learning_rate).minimize(loss, global_step=global_step)

                # Accuracy of the model on the fed batch
                label = tf.argmax(y_, 1, name="label")
                correct_prediction = tf.equal(prediction, label)
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32), name="accuracy")

            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))

        supervisor_kwargs = dict(
            is_chief=(task_index == 0),
            logdir=logdir,
            saver=saver,
            global_step=global_step,
            stop_grace_secs=300)
        if args.mode == "train":
            # Fresh training: explicit init op, checkpoint every 10 seconds.
            sv = tf.train.Supervisor(init_op=init_op, save_model_secs=10,
                                     **supervisor_kwargs)
        elif args.mode == "retrain":
            # Same as "train" but without passing an explicit init op.
            sv = tf.train.Supervisor(save_model_secs=10, **supervisor_kwargs)
        else:
            # Evaluation-style modes never write checkpoints.
            sv = tf.train.Supervisor(save_model_secs=0, **supervisor_kwargs)

        training = args.mode == "train" or args.mode == "retrain"

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down or args.steps steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, training)
            while not sv.should_stop() and not tf_feed.should_stop(
            ) and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                if len(batch_xs) == 0:
                    # Nothing was delivered; let the loop condition decide.
                    continue

                eval_feed = {
                    x: batch_xs,
                    y_: batch_ys,
                    keep: 1.,
                    is_training: False
                }

                if training:
                    train_feed = {
                        x: batch_xs,
                        y_: batch_ys,
                        keep: dropout,
                        is_training: True
                    }
                    _, step = sess.run([train_op, global_step],
                                       feed_dict=train_feed)
                    # print accuracy every 100 steps
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy, eval_feed)))
                elif args.mode == 'test':
                    labels, preds, acc = sess.run(
                        [label, prediction, accuracy], feed_dict=eval_feed)
                    results = [
                        "{0} Label: {1}, Prediction: {2}".format(
                            datetime.now().isoformat(), l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))
                else:  # args.mode == "inference"
                    labels, preds = sess.run([label, prediction],
                                             feed_dict=eval_feed)
                    results = [
                        "Label: {0}, Prediction: {1}".format(l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
    """Run one node (ps or worker) of a replicated CNN-LSTM-CTC OCR job.

    A ``ps`` node blocks in ``server.join()``.  A ``worker`` builds the graph
    through the ``cnn_lstm_ctc_ocr`` module, reserves the first
    ``batch_size * 10`` fed samples as a validation set, then trains on
    batches from ``TFNode.DataFeed``, logging progress and periodically
    evaluating on the validation set.

    Args:
        args: parsed arguments; this function reads ``batch_size``,
            ``initial_learning_rate``, ``decay_steps``, ``decay_rate``,
            ``momentum``, ``rdma``, ``hidden_units``, ``mode``, ``model``,
            ``steps`` and ``epochs``.
        ctx: node context; this function reads ``worker_num``, ``job_name``,
            ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    import numpy
    import tensorflow as tf
    import time
    import logging
    import cnn_lstm_ctc_ocr

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    worker_name = '(worker:%s tf:%s idx:%s)' % (worker_num, job_name,
                                                task_index)

    logging.info(
        '{0} batch_size:{1} initial_learning_rate:{2} decay_steps:{3} decay_rate:{4} momentum:{5}'
        .format(worker_name, args.batch_size, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum))

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    CHANNELS = 1
    IMAGE_WIDTH = 120
    IMAGE_HEIGHT = 45

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def sparse_tuple_from_label(sequences, dtype=numpy.int32):
        """Return ``(indices, values, shape)`` describing ``sequences`` sparsely.

        ``sequences`` must contain at least one non-empty sequence, since the
        shape is derived from the maximum index.
        """
        indices = []
        values = []
        for n, seq in enumerate(sequences):
            indices.extend(zip([n] * len(seq), range(len(seq))))
            values.extend(seq)

        indices = numpy.asarray(indices, dtype=numpy.int64)
        values = numpy.asarray(values, dtype=dtype)
        shape = numpy.asarray([len(sequences), indices.max(0)[1] + 1],
                              dtype=numpy.int64)
        return indices, values, shape

    def get_input_lens(sequences):
        """Return ``sequences`` plus a length vector holding 58 per sequence."""
        lengths = numpy.asarray([58 for s in sequences], dtype=numpy.int64)
        return sequences, lengths

    def placeholder_inputs(image_width, image_height, channels):
        """Create the image, sparse-label, sequence-length and keep-prob placeholders."""
        images_placeholder = tf.placeholder(
            tf.float32, [None, image_height, image_width, channels])
        labels_placeholder = tf.sparse_placeholder(tf.int32)
        seqlen_placeholder = tf.placeholder(tf.int32, [None])
        keep_prob = tf.placeholder(tf.float32)
        return images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob

    def format_batch(data_set, batch_size, image_height, image_width,
                     channels):
        """Pull up to ``batch_size`` items and return ``(xs, ys)``.

        ``xs`` is a float32 array of shape ``[n, height, width, channels]``
        scaled to [0, 1]; ``ys`` is the list of labels.  ``n`` can be smaller
        than ``batch_size`` (even 0) when the feed delivers a short batch.
        """
        batch = data_set.next_batch(batch_size)
        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        xs = numpy.array(images)
        # [n, height * width] => [n, height, width, channels].  The leading
        # dimension is inferred so that a short final batch does not raise.
        xs = xs.reshape(-1, image_height, image_width, channels)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.
        ys = labels
        return xs, ys

    def fill_feed_dict(xs, ys, images_pl, labels_pl, seqlen_pl, keep_prob,
                       train=True):
        """Build the feed dict; dropout keep probability is 0.5 when ``train``, else 1."""
        images_feed, seqlen_feed = get_input_lens(xs)
        labels_feed = sparse_tuple_from_label(ys)
        return {
            images_pl: images_feed,
            labels_pl: labels_feed,
            seqlen_pl: seqlen_feed,
            keep_prob: 0.5 if train else 1,
        }

    def do_eval(sess, dense_decoded, lastbatch_err, learning_rate,
                images_placeholder, labels_placeholder, seqlen_placeholder,
                keep_prob, train, xs, ys):
        """Decode ``xs`` and log exact-match accuracy against ``ys``."""
        if len(ys) == 0:
            # No validation samples were collected; nothing to evaluate.
            return

        true_count = 0  # Counts the number of correct predictions.
        feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                   labels_placeholder, seqlen_placeholder,
                                   keep_prob, train)
        dd, lerr, lr = sess.run([dense_decoded, lastbatch_err, learning_rate],
                                feed_dict=feed_dict)
        # accuracy calculation: -1 entries are dropped from the decoded rows
        for i, origin_label in enumerate(ys):
            decoded_label = [j for j in dd[i] if j != -1]
            if i < 10:
                logging.info('{0} seq {1} => origin:{2} decoded:{3}'.format(
                    worker_name, i, origin_label, decoded_label))
            if origin_label == decoded_label:
                true_count += 1
        acc = true_count * 1.0 / len(ys)
        logging.info(
            "%s accuracy = %.3f, lastbatch_err = %.3f, learning_rate = %.8f" %
            (worker_name, acc, lerr, lr))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            # Generate placeholders for the images, labels and seqlens.
            images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob = placeholder_inputs(
                IMAGE_WIDTH, IMAGE_HEIGHT, CHANNELS)

            # Build a Graph that computes predictions from the inference model.
            logits = cnn_lstm_ctc_ocr.inference(images_placeholder,
                                                seqlen_placeholder, keep_prob,
                                                args.hidden_units, args.mode,
                                                args.batch_size)

            # Add to the Graph the Ops for loss calculation.
            loss = cnn_lstm_ctc_ocr.loss(logits, labels_placeholder,
                                         seqlen_placeholder)
            tf.summary.scalar('loss', loss)

            # global counter
            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Add to the Graph the Ops that calculate and apply gradients.
            train_op, learning_rate = cnn_lstm_ctc_ocr.training(
                loss, global_step, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum)

            # Add the Op to compare the logits to the labels during evaluation.
            dense_decoded, lerr = cnn_lstm_ctc_ocr.evaluation(
                logits, labels_placeholder, seqlen_placeholder)
            tf.summary.scalar('lerr', lerr)

            summary_op = tf.summary.merge_all()
            # Add the variable initializer Op.
            init_op = tf.global_variables_initializer()
            # Create a saver for writing training checkpoints.
            saver = tf.train.Saver()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        logging.info("{0} tensorflow model path: {1}".format(
            worker_name, logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" %
                                               (worker_num),
                                               graph=tf.get_default_graph())

        training = args.mode == "train"
        if training:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=60)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        validation_batchs = 10
        # The validation samples are taken out of the feed, so the step budget
        # is reduced by the number of batches they consumed.
        max_steps = args.steps * args.epochs - validation_batchs

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            logging.info("{0} session ready".format(worker_name))

            g_step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, training)

            # Reserve the first samples of the feed for do_eval.
            validation_xs, validation_ys = format_batch(
                tf_feed, args.batch_size * validation_batchs, IMAGE_HEIGHT,
                IMAGE_WIDTH, CHANNELS)

            while not sv.should_stop() and not tf_feed.should_stop(
            ) and g_step < max_steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                start_time = time.time()

                xs, ys = format_batch(tf_feed, args.batch_size, IMAGE_HEIGHT,
                                      IMAGE_WIDTH, CHANNELS)
                if len(ys) == 0:
                    # An empty batch cannot be turned into a sparse label
                    # tensor; skip it and let the loop condition decide.
                    continue

                feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                           labels_placeholder,
                                           seqlen_placeholder, keep_prob,
                                           training)

                # Run one step of the model; the train_op result is discarded.
                _, loss_value, g_step = sess.run(
                    [train_op, loss, global_step], feed_dict=feed_dict)

                duration = time.time() - start_time
                if g_step % 20 == 0:
                    # Print status.
                    logging.info(
                        '%s [g_step:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)'
                        % (worker_name, g_step, g_step / args.steps,
                           args.epochs, g_step % args.steps, args.steps,
                           loss_value, duration))

                # Write the summaries fairly often (chief only).
                if g_step % 100 == 0:
                    if sv.is_chief:
                        summary = sess.run(summary_op, feed_dict=feed_dict)
                        summary_writer.add_summary(summary, g_step)
                        summary_writer.flush()

                # Evaluate the model periodically against the validation set.
                if (g_step + 1) % 500 == 0 or (g_step + 1) == args.steps:
                    logging.info('{0} ---- Validation Data Eval: ----'.format(
                        worker_name))
                    do_eval(sess, dense_decoded, lerr, learning_rate,
                            images_placeholder, labels_placeholder,
                            seqlen_placeholder, keep_prob, training,
                            validation_xs, validation_ys)

            if sv.should_stop() or g_step >= max_steps:
                logging.info("{0} terminating tf_feed".format(worker_name))
                tf_feed.terminate()

        # Ask for all the services to stop.
        logging.info("{0} stopping supervisor".format(worker_name))
        sv.stop()
def map_fun(args, ctx):
    """Run one node (ps or worker) of a replicated MLP + CNN classifier job.

    Each fed record is ``(features, label)``.  Columns 0..518 of the features
    feed a two-layer linear branch; columns 519..718 are integer codes that
    are one-hot encoded (120 classes) for a two-layer convolutional branch.
    Both branches are concatenated into a 10-way classifier.

    Args:
        args: parsed arguments; this function reads ``batch_size``, ``model``,
            ``mode`` ("train", "inference", anything else predicts only) and
            ``steps``.
        ctx: node context; this function reads ``worker_num``, ``job_name``,
            ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Delay PS nodes a bit before they start.
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    batch_size = args.batch_size

    cluster, server = TFNode.start_cluster_server(ctx, 1)

    def feed_dict(batch):
        """Convert ``[(features, label), ...]`` into ``(x_mlp, x_cnn, ys)``.

        Returns arrays shaped ``(n, 519, 1, 1)``, ``(n, 200, 120, 1)`` and the
        labels as an array.  An empty batch yields empty arrays so the
        caller's length check can skip it.
        """
        if len(batch) == 0:
            # The 2-D slicing below would raise on an empty (1-D) array.
            return (numpy.empty((0, 519, 1, 1)),
                    numpy.empty((0, 200, 120, 1)),
                    numpy.array([]))

        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]

        x_initial = numpy.array(images)
        x_peinfo = x_initial[:, 0:519]
        codes = x_initial[:, 519:719].astype(numpy.int64)  # (n, 200)

        # One-hot encode every code position: one_hot[i, j, codes[i, j]] = 1.
        count = codes.shape[0]
        one_hot = numpy.zeros((count, 200, 120), dtype=numpy.float64)
        one_hot[numpy.arange(count)[:, None],
                numpy.arange(200)[None, :],
                codes] = 1.0
        # NOTE(review): each sample is transposed to (120, 200) and its data
        # is then reinterpreted (not transposed back) as (200, 120, 1).  This
        # layout is kept exactly as it was because trained weights depend on
        # it -- confirm whether it is intended.
        x_cnn = one_hot.transpose(0, 2, 1).reshape(-1, 200, 120, 1)

        ys = numpy.array(labels)
        return (x_peinfo.reshape(-1, 519, 1, 1), x_cnn, ys)

    def conv2d(x, W):
        """Stride-1 'SAME' convolution."""
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    def avg_pool_rows(x, size):
        """Average-pool along the row axis with window and stride ``size``."""
        return tf.nn.avg_pool(x, ksize=[1, size, 1, 1],
                              strides=[1, size, 1, 1], padding='SAME')

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            # Linear branch parameters
            W_mlp_1 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_1")
            b_mlp_1 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_1")
            tf.summary.histogram("W_mlp_1", W_mlp_1)

            W_mlp_2 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_2")
            b_mlp_2 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_2")
            tf.summary.histogram("W_mlp_2", W_mlp_2)

            # Convolutional branch parameters
            W_conv1 = tf.Variable(tf.truncated_normal([3, 120, 1, 3], stddev=0.1), name="W_conv1")
            b_conv1 = tf.Variable(tf.constant(0.1, shape=[3]), name="b_conv1")
            tf.summary.histogram("W_conv1", W_conv1)

            W_conv2 = tf.Variable(tf.truncated_normal([3, 120, 3, 6], stddev=0.1), name="W_conv2")
            b_conv2 = tf.Variable(tf.constant(0.1, shape=[6]), name="b_conv2")
            tf.summary.histogram("W_conv2", W_conv2)

            # Output layer: 519 linear features + 120 * 6 conv features = 1239
            sm_w = tf.Variable(tf.truncated_normal([1239, 10], stddev=0.1), name="sm_w")
            sm_b = tf.Variable(tf.constant(0.1, shape=[10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            x_cnn = tf.placeholder(tf.float32, [None, 200, 120, 1], name="x_cnn")
            x_mlp = tf.placeholder(tf.float32, [None, 519, 1, 1], name="x_mlp")
            y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
            tf.summary.image("x_cnn", x_cnn)
            tf.summary.image("x_mlp", x_mlp)

            x_mlp_new = tf.reshape(x_mlp, [-1, 519])
            h_mlp_1 = tf.nn.xw_plus_b(x_mlp_new, W_mlp_1, b_mlp_1)
            h_mlp_2 = tf.nn.xw_plus_b(h_mlp_1, W_mlp_2, b_mlp_2)

            h_conv1 = tf.nn.relu(conv2d(x_cnn, W_conv1) + b_conv1)
            h_pool1 = avg_pool_rows(h_conv1, 2)
            h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
            h_pool2 = avg_pool_rows(h_conv2, 100)
            h_conv2_flat = tf.reshape(h_pool2, [-1, 120 * 6])

            h_inter = tf.concat([h_mlp_2, h_conv2_flat], 1)

            # Keep the unscaled scores separate from the probabilities: the
            # cross-entropy op applies softmax internally and must not be
            # given an already-softmaxed tensor.
            logits = tf.nn.xw_plus_b(h_inter, sm_w, sm_b)
            y = tf.nn.softmax(logits)

            global_step = tf.Variable(0)

            loss = tf.reduce_sum(
                tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
            tf.summary.scalar("loss", loss)

            train_op = tf.train.AdagradOptimizer(0.001).minimize(
                loss, global_step=global_step)

            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                batch_mlp, batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                if len(batch_xs) == 0:
                    # Nothing was delivered; let the loop condition decide.
                    continue

                feed = {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys}

                if args.mode == "train":
                    _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
                    if (step % 10 == 0):
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy, {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys})))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                elif args.mode == "inference":
                    labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

                    results = ["Label: {0}, Prediction: {1}".format(l, p) for l, p in zip(labels, preds)]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))
                else:
                    # Prediction only: the second field of each record is
                    # reported back as-is next to the prediction.
                    preds = sess.run(prediction, feed_dict={x_mlp: batch_mlp, x_cnn: batch_xs})
                    results = ["Sha256: {0}, Prediction: {1}".format(l, p) for l, p in zip(batch_ys, preds)]
                    tf_feed.batch_results(results)
                    print(results)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def main_fun(args, ctx):
    """Run one TensorFlowOnSpark node that trains the project's ``Model``.

    Parameter-server nodes sleep briefly and then block in ``server.join()``.
    Worker nodes build ``Model(args)`` under a replica device setter, open a
    supervisor-managed session and run training steps on batches pulled from
    the Spark data feed until the supervisor or the feed signals stop, or
    until the global step reaches ``args.steps``.

    Args:
        args: parsed options. This function reads ``rdma``, ``save_dir``,
            ``batch_size`` and ``steps``, and hands the whole object to
            ``Model``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``worker_num``, ``job_name``, ``task_index``, ``defaultFS``,
            ``working_dir`` and ``mgr``.
    """
    import tensorflow as tf
    import time
    from model import Model
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy as np

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Delay PS nodes a bit, since workers seem to reserve GPUs more
    # quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    if job_name == "ps":
        # Parameter servers only serve variables; nothing else to do here.
        server.join()
        return

    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % task_index,
            cluster=cluster)):
        model = Model(args)

        # instrument for tensorboard
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()

    # NOTE(review): other TensorFlowOnSpark code in this file calls
    # TFNode.hdfs_path(ctx, path); confirm this three-argument form matches
    # the installed library version.
    logdir = TFNode.hdfs_path(args.save_dir, ctx.defaultFS, ctx.working_dir)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = TFNode.get_summary_writer(ctx)

    # summary_op=None: summaries are fetched in the training step below and
    # written explicitly by the chief instead of by the supervisor.
    sv = tf.train.Supervisor(is_chief=(task_index == 0),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             saver=saver,
                             global_step=model.global_step,
                             stop_grace_secs=300,
                             save_model_secs=10)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
        print("{0} session ready".format(datetime.now().isoformat()))

        state = sess.run(model.initial_state)

        # Loop until the supervisor shuts down, the feed is exhausted, or
        # args.steps global steps have completed.
        step = 0
        tf_feed = TFNode.DataFeed(ctx.mgr, True)
        while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
            # Run a training step asynchronously.
            # See `tf.train.SyncReplicasOptimizer` for additional details on
            # how to perform *synchronous* training.

            # using feed_dict
            batch = tf_feed.next_batch(args.batch_size)
            if len(batch) == 0:
                # Nothing to train on; re-check the stop conditions without
                # building arrays or a feed dict that would be thrown away.
                continue

            batch_xs = np.asarray([data[0] for data in batch])
            batch_ys = np.asarray([data[1] for data in batch])

            feed = {model.input_data: batch_xs, model.targets: batch_ys}
            # Carry the recurrent state from the previous step into this one.
            for i, (c, h) in enumerate(model.initial_state):
                feed[c] = state[i].c
                feed[h] = state[i].h

            # instrument for tensorboard
            summ, train_loss, state, _, step = sess.run(
                [summary_op, model.cost, model.final_state,
                 model.train_op, model.global_step],
                feed_dict=feed)

            # print loss
            print("Step: {}, train_loss: {}".format(step, train_loss))

            if sv.is_chief:
                summary_writer.add_summary(summ, step)

        # If training ended before the feed was drained, tell the feed to
        # terminate.
        if sv.should_stop() or step >= args.steps:
            tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def main_fun(args, ctx):
    """Train a Keras MLP on MNIST on one TensorFlowOnSpark node.

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    784 -> 512 -> 512 -> 10 dense network and train it either on the
    in-memory Keras MNIST dataset (``args.input_mode == 'tf'``) or on batches
    streamed from a Spark RDD through ``TFNode.DataFeed`` (any other mode).
    Worker 0 additionally saves checkpoints and TensorBoard events and, when
    ``args.export_dir`` is set, exports a SavedModel for inferencing.

    Args:
        args: parsed options. This function reads ``input_mode``,
            ``model_dir``, ``steps_per_epoch``, ``epochs`` and ``export_dir``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``job_name``, ``task_index`` and ``mgr``.
    """
    import numpy
    import os
    import tensorflow as tf
    import tensorflow.contrib.keras as keras
    from tensorflow.contrib.keras.api.keras import backend as K
    from tensorflow.contrib.keras.api.keras.models import Sequential, load_model, save_model
    from tensorflow.contrib.keras.api.keras.layers import Dense, Dropout
    from tensorflow.contrib.keras.api.keras.optimizers import RMSprop
    from tensorflow.contrib.keras.python.keras.callbacks import LambdaCallback, TensorBoard
    from tensorflow.python.saved_model import builder as saved_model_builder
    from tensorflow.python.saved_model import tag_constants
    from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
    from tensorflowonspark import TFNode

    cluster, server = TFNode.start_cluster_server(ctx)

    if ctx.job_name == "ps":
        server.join()
        return
    if ctx.job_name != "worker":
        return

    def generate_rdd_data(tf_feed, batch_size):
        """Yield (images, labels) float32 arrays from the Spark feed, forever."""
        print("generate_rdd_data invoked")
        while True:
            rows = tf_feed.next_batch(batch_size)
            pixel_rows = [row[0] for row in rows]
            label_rows = [row[1] for row in rows]
            # Pixel values are divided by 255; labels are only cast.
            scaled_images = numpy.array(pixel_rows).astype('float32') / 255
            float_labels = numpy.array(label_rows).astype('float32')
            yield (scaled_images, float_labels)

    use_tf_input = args.input_mode == 'tf'

    device_fn = tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % ctx.task_index,
        cluster=cluster)
    with tf.device(device_fn):
        IMAGE_PIXELS = 28
        batch_size = 100
        num_classes = 10

        if use_tf_input:
            # The data, shuffled and split between train and test sets.
            from tensorflow.contrib.keras.api.keras.datasets import mnist
            (x_train, y_train), (x_test, y_test) = mnist.load_data()
            x_train = x_train.reshape(60000, 784).astype('float32') / 255
            x_test = x_test.reshape(10000, 784).astype('float32') / 255

            # Convert class vectors to binary class matrices.
            y_train = keras.utils.to_categorical(y_train, num_classes)
            y_test = keras.utils.to_categorical(y_test, num_classes)
        else:
            # Spark input mode: placeholders are created in the graph here.
            x_train = tf.placeholder(
                tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x_train")
            y_train = tf.placeholder(tf.float32, [None, 10], name="y_train")

        model = Sequential()
        model.add(Dense(512, activation='relu', input_shape=(784, )))
        model.add(Dropout(0.2))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(10, activation='softmax'))
        model.summary()
        model.compile(loss='categorical_crossentropy',
                      optimizer=RMSprop(),
                      metrics=['accuracy'])

    saver = tf.train.Saver()

    with tf.Session(server.target) as sess:
        K.set_session(sess)

        def save_checkpoint(epoch, logs=None):
            """Save a checkpoint at epoch end; also dump the graph when epoch == 1."""
            if epoch == 1:
                graph_def = sess.graph.as_graph_def()
                tf.train.write_graph(graph_def, args.model_dir, 'graph.pbtxt')
            checkpoint_prefix = os.path.join(args.model_dir, 'model.ckpt')
            saver.save(sess, checkpoint_prefix,
                       global_step=epoch * args.steps_per_epoch)

        ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint)
        tb_callback = TensorBoard(log_dir=args.model_dir,
                                  histogram_freq=1,
                                  write_graph=True,
                                  write_images=True)

        # Callbacks that save model checkpoints and TensorBoard events are
        # attached on worker:0 only.
        if ctx.task_index == 0:
            callbacks = [ckpt_callback, tb_callback]
        else:
            callbacks = None

        if use_tf_input:
            # Train and validate on in-memory data.
            model.fit(x_train, y_train,
                      batch_size=batch_size,
                      epochs=args.epochs,
                      verbose=1,
                      validation_data=(x_test, y_test),
                      callbacks=callbacks)
        else:
            # Train on data read from a generator that is fed by a Spark RDD.
            tf_feed = TFNode.DataFeed(ctx.mgr)
            batch_source = generate_rdd_data(tf_feed, batch_size)
            model.fit_generator(generator=batch_source,
                                steps_per_epoch=args.steps_per_epoch,
                                epochs=args.epochs,
                                verbose=1,
                                callbacks=callbacks)

        should_export = (args.export_dir
                         and ctx.job_name == 'worker'
                         and ctx.task_index == 0)
        if should_export:
            # Save a local Keras model so it can be reloaded with an
            # inferencing learning_phase.
            save_model(model, "tmp_model")

            # Reload the model.
            K.set_learning_phase(False)
            new_model = load_model("tmp_model")

            # Export a saved_model for inferencing.
            signature = predict_signature_def(
                inputs={'images': new_model.input},
                outputs={'scores': new_model.output})
            builder = saved_model_builder.SavedModelBuilder(args.export_dir)
            builder.add_meta_graph_and_variables(
                sess=sess,
                tags=[tag_constants.SERVING],
                signature_def_map={'predict': signature},
                clear_devices=True)
            builder.save()

        if args.input_mode == 'spark':
            tf_feed.terminate()