def keras_mnist():
    """Train a two-layer dense Keras classifier on MNIST TFRecords and export it.

    The training-dataset location (``td_proj_name``, ``td_ds``, ``td``) and
    the export target (``model_name``, ``model_proj_name``) are not
    parameters of this function; they are read from the surrounding scope.

    Returns:
        dict: ``{'accuracy': score[1]}`` where ``score`` is the result of
        ``model.evaluate`` on a single validation batch.
    """
    import os
    import uuid

    import tensorflow as tf
    from hops import tensorboard
    from hops import model as hops_model
    from hops import hdfs

    batch_size = 32
    num_classes = 10

    def _part_files(split):
        # Glob the TFRecord part files of one split ('train' / 'validate').
        return tf.io.gfile.glob(
            hdfs.project_path(td_proj_name) + '/' + td_ds + '/' + td + '/' +
            split + '/part-r-*')

    # Provide path to train and validation datasets
    train_filenames = _part_files('train')
    validation_filenames = _part_files('validate')

    # Define input function
    def data_input(filenames,
                   batch_size=128,
                   num_classes=10,
                   shuffle=False,
                   repeat=None):
        """Build a batched, repeating dataset of (image, one-hot label)."""
        feature_spec = {
            'image': tf.io.FixedLenFeature([28 * 28], tf.float32),
            'label': tf.io.FixedLenFeature([], tf.int64),
        }

        def parser(serialized_example):
            """Parses a single tf.Example into image and label tensors."""
            parsed = tf.io.parse_single_example(serialized_example,
                                                features=feature_spec)
            pixels = tf.cast(parsed['image'], tf.float32)
            class_id = tf.cast(parsed['label'], tf.int32)
            # Labels are handed to the model as one-hot vectors.
            return pixels, tf.one_hot(class_id, num_classes)

        # Parse every record, optionally shuffle, then batch and repeat.
        records = tf.data.TFRecordDataset(filenames).map(parser)
        if shuffle:
            records = records.shuffle(buffer_size=128)
        batched = records.batch(batch_size, drop_remainder=True)
        return batched.repeat(repeat)

    # Define a Keras Model.
    model = tf.keras.Sequential()
    hidden = tf.keras.layers.Dense(128, activation='relu', input_shape=(784, ))
    model.add(hidden)
    output = tf.keras.layers.Dense(num_classes, activation='softmax')
    model.add(output)

    # Compile the model.
    model.compile(
        loss=tf.keras.losses.categorical_crossentropy,
        optimizer=tf.keras.optimizers.Adam(0.001),
        metrics=['accuracy'],
    )

    # Both callbacks write into the TensorBoard log directory.
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tensorboard.logdir())
    ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=tensorboard.logdir())
    callbacks = [tb_callback, ckpt_callback]

    training_set = data_input(train_filenames, batch_size)
    validation_set = data_input(validation_filenames, batch_size)
    model.fit(training_set,
              verbose=0,
              epochs=3,
              steps_per_epoch=5,
              validation_data=validation_set,
              validation_steps=1,
              callbacks=callbacks)

    score = model.evaluate(data_input(validation_filenames, batch_size),
                           steps=1)

    # Export model
    # WARNING(break-tutorial-inline-code): The following code snippet is
    # in-lined in tutorials, please update tutorial documents accordingly
    # whenever code changes.
    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    print('Exporting trained model to: {}'.format(export_path))
    tf.saved_model.save(model, export_path)
    print('Done exporting!')

    # score[1] is the 'accuracy' metric requested in compile().
    metrics = {'accuracy': score[1]}
    hops_model.export(export_path,
                      model_name,
                      metrics=metrics,
                      project=model_proj_name)

    return metrics
def task2():
    """Train a 5-layer sigmoid network on Fashion-MNIST with TF1 graph mode.

    Downloads the dataset into ``data/fashion``, builds the graph, trains
    with plain gradient descent, and writes train/test summaries below
    ``tensorboard.logdir()``. The per-epoch test accuracies and losses are
    printed at the end; nothing is returned.
    """
    import tensorflow as tf
    from hops import tensorboard
    from hops import hdfs
    from tensorflow.examples.tutorials.mnist import input_data

    fashion_mnist = input_data.read_data_sets(
        'data/fashion',
        one_hot=True,
        source_url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    )

    # Helpers
    def weight_var(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_var(shape, value=0.1):
        initial = tf.constant(value, shape=shape)
        return tf.Variable(initial)

    def bias_var_z(shape):
        return tf.Variable(tf.zeros(shape))

    def layer(tensor, in_dim, out_dim, name, activation=tf.nn.sigmoid):
        # NOTE: `name` is accepted but not used; every layer logs its
        # histogram under the same 'activations' tag.
        weights = weight_var([in_dim, out_dim])
        biases = bias_var_z([out_dim])
        pre = tf.matmul(tensor, weights) + biases
        post = activation(pre)
        tf.summary.histogram('activations', post)
        return post

    # Hardcoded params
    num_ch = 1
    num_classes = 10
    image_height = image_width = 28
    layer_widths = [200, 100, 60, 30, 10]

    # 1. Define variables and placeholders
    X = tf.placeholder(tf.float32,
                       shape=[None, image_height, image_width, num_ch])
    Y_ = tf.placeholder(tf.float32, shape=[None, num_classes])
    XX = tf.reshape(X, [-1, image_height * image_width])

    # 2. Define the model: four sigmoid layers followed by a linear layer.
    HSig1 = layer(XX, 784, layer_widths[0], 'sigmoid-1', tf.nn.sigmoid)
    HSig2 = layer(HSig1, layer_widths[0], layer_widths[1], 'sigmoid-2',
                  tf.nn.sigmoid)
    HSig3 = layer(HSig2, layer_widths[1], layer_widths[2], 'sigmoid-3',
                  tf.nn.sigmoid)
    HSig4 = layer(HSig3, layer_widths[2], layer_widths[3], 'sigmoid-4',
                  tf.nn.sigmoid)
    # Y holds raw logits (identity activation); the softmax is applied
    # inside softmax_cross_entropy_with_logits below.
    Y = layer(HSig4, layer_widths[3], layer_widths[4], 'identity',
              tf.identity)

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=Y, labels=Y_))
    tf.summary.scalar('cross_entropy', cross_entropy)

    # Define accuracy (defined once; the original built these ops twice).
    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)
    # Same value as 'cross_entropy'; kept so the existing 'cost' tag
    # continues to be written.
    tf.summary.scalar("cost", cross_entropy)

    with tf.name_scope('train'):
        with tf.name_scope('gradient_descent'):
            train_step_gd = tf.train.GradientDescentOptimizer(0.5).minimize(
                cross_entropy)
        with tf.name_scope('adam_optimizer'):
            # Alternative optimizer; not used by the run below.
            train_step_adam = tf.train.AdamOptimizer(0.005).minimize(
                cross_entropy)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    logdir = tensorboard.logdir()
    summary_op = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(logdir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(logdir + '/test')

    def epochs(train, test, train_step, num_epochs=100, batch_size=100):
        """Run `num_epochs` epochs of 100 training steps each.

        After every epoch the full test set is evaluated once.

        Returns:
            tuple: (accuracies, losses), one entry per epoch.
        """
        sess.run(init)
        accuracies = []
        losses = []
        for epoch in range(num_epochs):
            for it in range(100):
                batch_xs, batch_ys = train.next_batch(batch_size)
                # Batches are fed into XX (the reshaped tensor), bypassing X.
                # NOTE(review): presumably because the dataset yields
                # flattened 784-vectors - confirm.
                feed_dict = {XX: batch_xs, Y_: batch_ys}
                _, summary = sess.run([train_step, summary_op],
                                      feed_dict=feed_dict)
                train_writer.add_summary(summary, epoch * 100 + it)

            # Compute accuracy and loss every 100 rounds, in a single pass
            # over the test set.
            feed_dict = {XX: test.images, Y_: test.labels}
            summary, acc, loss = sess.run(
                [summary_op, accuracy, cross_entropy], feed_dict=feed_dict)
            accuracies.append(acc)
            losses.append(loss)
            test_writer.add_summary(summary, epoch)
        return (accuracies, losses)

    try:
        # 10000 epochs matches the loop bound that used to be hard-coded
        # inside epochs().
        acc, loss = epochs(fashion_mnist.train,
                           fashion_mnist.test,
                           train_step_gd,
                           num_epochs=10000)
    finally:
        # Flush the summary writers and release the session even if
        # training fails.
        train_writer.close()
        test_writer.close()
        sess.close()

    print("Accuracy: {}".format(acc))
    print("Loss: {}".format(loss))
def mnist_fun(args, ctx):
    """TensorFlowOnSpark map function: distributed MNIST train / inference.

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    one-hidden-layer softmax classifier fed by queue-based readers and then
    either train it or run inference, depending on ``args.mode``.

    Args:
        args: parsed arguments; this function reads ``rdma``, ``readers``,
            ``mode``, ``epochs``, ``format``, ``images``, ``labels``,
            ``steps`` and ``output``.
        ctx: node context; this function reads ``worker_num``, ``job_name``,
            ``task_index`` and ``cluster_spec``.

    Raises:
        ValueError: if ``args.format`` is neither ``"csv"`` nor ``"tfr"``.
    """

    def print_log(worker_num, arg):
        print("%d: " % worker_num)
        print(arg)

    from tensorflowonspark import TFNode
    from datetime import datetime
    import getpass
    import math
    import numpy
    import os
    import signal
    import tensorflow as tf
    import time

    # Used to get TensorBoard logdir for TensorBoard that show up in HopsWorks
    from hops import tensorboard

    IMAGE_PIXELS = 28
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more
    # quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = 100

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def read_tfr_examples(path,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        """Return a batched (image, label) pair read from TFRecord files."""
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of TFRecord filenames
        tf_record_pattern = os.path.join(path, 'part-*')
        files = tf.gfile.Glob(tf_record_pattern)
        queue_name = "file_queue"

        # split input files across workers, if specified
        if task_index is not None and num_workers is not None:
            num_files = len(files)
            files = files[task_index:num_files:num_workers]
            queue_name = "file_queue_{0}".format(task_index)

        print_log(worker_num, "files: {0}".format(files))
        file_queue = tf.train.string_input_producer(files,
                                                    shuffle=False,
                                                    capacity=1000,
                                                    num_epochs=num_epochs,
                                                    name=queue_name)

        # Setup reader for examples
        reader = tf.TFRecordReader(name="reader")
        _, serialized = reader.read(file_queue)
        feature_def = {
            'label': tf.FixedLenFeature([10], tf.int64),
            'image': tf.FixedLenFeature([784], tf.int64)
        }
        features = tf.parse_single_example(serialized, feature_def)
        # Divide pixel values by 255.
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(tf.to_float(features['image']), norm)
        print_log(worker_num, "image: {0}".format(image))
        label = tf.to_float(features['label'])
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch")

    def read_csv_examples(image_dir,
                          label_dir,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        """Return a batched (image, label) pair read from CSV part files.

        ``task_index`` and ``num_workers`` are accepted for signature parity
        with ``read_tfr_examples`` but are not used here.
        """
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of csv image filenames
        tf_record_pattern = os.path.join(image_dir, 'part-*')
        images = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "images: {0}".format(images))
        image_queue = tf.train.string_input_producer(images,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="image_queue")

        # Setup queue of csv label filenames
        tf_record_pattern = os.path.join(label_dir, 'part-*')
        labels = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "labels: {0}".format(labels))
        label_queue = tf.train.string_input_producer(labels,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="label_queue")

        # Setup reader for image queue
        img_reader = tf.TextLineReader(name="img_reader")
        _, img_csv = img_reader.read(image_queue)
        image_defaults = [[1.0] for col in range(784)]
        img = tf.stack(tf.decode_csv(img_csv, image_defaults))
        # Normalize values to [0,1]
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(img, norm)
        print_log(worker_num, "image: {0}".format(image))

        # Setup reader for label queue
        label_reader = tf.TextLineReader(name="label_reader")
        _, label_csv = label_reader.read(label_queue)
        label_defaults = [[1.0] for col in range(10)]
        label = tf.stack(tf.decode_csv(label_csv, label_defaults))
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch_csv")

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal(
                [hidden_units, 10], stddev=1.0 / math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # Placeholders or QueueRunner/Readers for input data.
            # Inference reads the data exactly once; training reads it
            # indefinitely when args.epochs == 0.
            num_epochs = 1 if args.mode == "inference" else (
                None if args.epochs == 0 else args.epochs)
            # Input files are only split across workers during inference.
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None

            if args.format == "csv":
                images = TFNode.hdfs_path(ctx, args.images)
                labels = TFNode.hdfs_path(ctx, args.labels)
                x, y_ = read_csv_examples(images, labels, batch_size,
                                          num_epochs, index, workers)
            elif args.format == "tfr":
                images = TFNode.hdfs_path(ctx, args.images)
                x, y_ = read_tfr_examples(images, batch_size, num_epochs,
                                          index, workers)
            else:
                # Raising a plain string is a TypeError in Python 3; raise a
                # real exception so the message reaches the user.
                raise ValueError(
                    "{0} format not supported for tf input mode".format(
                        args.format))

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Clip before the log so that log(0) cannot occur.
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)

            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and
        # stores model state into HDFS
        logdir = tensorboard.logdir()
        print("tensorflow model path: {0}".format(logdir))

        # Only the chief worker writes summaries.
        if task_index == 0:
            summary_writer = tf.summary.FileWriter(
                logdir, graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     summary_writer=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down or args.steps steps have
            # completed.
            step = 0
            count = 0
            while not sv.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details
                # on how to perform *synchronous* training.

                # using QueueRunners/Readers
                if args.mode == "train":
                    if step % 100 == 0:
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy)))
                    _, summary, step = sess.run(
                        [train_op, summary_op, global_step])
                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, pred, acc = sess.run(
                        [label, prediction, accuracy])
                    print("acc: {0}".format(acc))
                    # One "<label> <prediction>" line per example.
                    for true_label, predicted in zip(labels, pred):
                        count += 1
                        output_file.write("{0} {1}\n".format(
                            true_label, predicted))
                    print("count: {0}".format(count))

            if args.mode == "inference":
                output_file.close()
                # Delay chief worker from shutting down supervisor during
                # inference, since it can load model, start session, run
                # inference and request stop before the other workers even
                # start/sync their sessions.
                if task_index == 0:
                    time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def _wrapper_fun(iter):
    """Spark executor entry point for a multi-node Horovod (mpirun) job.

    Every executor registers with the coordination server and waits for the
    cluster reservations. The executor whose partition value is 0 (the chief)
    then launches ``mpirun`` and waits for it; all other executors block
    until the chief reports that ``mpirun`` has finished.

    Names such as ``server_addr``, ``nb_path``, ``app_id``, ``run_id``,
    ``print_log`` and ``cleanup`` are not parameters; they are read from the
    surrounding scope.

    Args:
        iter: partition iterator; its last element is used as the executor
            number.

    Raises:
        Exception: on the chief, if ``mpirun`` exits with a non-zero code.
    """
    for i in iter:
        executor_num = i

    client = coordination_server.Client(server_addr)

    node_meta = {
        'host': get_ip_address(),
        'executor_cwd': os.getcwd(),
        'cuda_visible_devices_ordinals':
        devices.get_minor_gpu_device_numbers()
    }

    client.register(node_meta)

    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    gpu_monitor_started = devices.get_num_gpus() > 0
    if gpu_monitor_started:
        t_gpus.start()

    try:
        # Only spark executor with index 0 should create necessary HDFS
        # directories and start mpirun. Other executors simply block until
        # index 0 reports mpirun is finished.
        clusterspec = client.await_reservations()

        gpu_str = ('\n\nChecking for GPUs in the environment\n' +
                   devices.get_gpu_info())
        print(gpu_str)

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        # The context manager guarantees the log file handle is released on
        # every executor, including on failure.
        with open(mpi_logfile_path, 'w') as mpi_logfile:
            py_runnable = localize_scripts(nb_path, clusterspec)

            # non-chief executor should not do mpirun
            if not executor_num == 0:
                client.await_mpirun_finished()
            else:
                hdfs_exec_logdir, hdfs_appid_logdir = \
                    hopshdfs.create_directories(
                        app_id, run_id, param_string='Horovod')
                tb_hdfs_path, tb_pid = tensorboard.register(
                    hdfs_exec_logdir, hdfs_appid_logdir, 0)

                mpi_cmd = ('HOROVOD_TIMELINE=' + tensorboard.logdir() +
                           '/timeline.json' +
                           ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() +
                           ' mpirun -np ' + str(get_num_ps(clusterspec)) +
                           ' --hostfile ' + get_hosts_file(clusterspec) +
                           ' -bind-to none -map-by slot ' +
                           ' -x LD_LIBRARY_PATH ' +
                           ' -x HOROVOD_TIMELINE ' +
                           ' -x TENSORBOARD_LOGDIR ' +
                           ' -x NCCL_DEBUG=INFO ' +
                           ' -mca pml ob1 -mca btl ^openib ' +
                           os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable)

                # NOTE(review): the command is a shell string built by
                # concatenation; it is only safe while every component is
                # trusted.
                mpi = subprocess.Popen(
                    mpi_cmd,
                    shell=True,
                    stdout=mpi_logfile,
                    stderr=mpi_logfile,
                    preexec_fn=util.on_executor_exit('SIGTERM'))

                t_log = threading.Thread(target=print_log)
                t_log.start()

                mpi.wait()

                client.register_mpirun_finished()

                # Same teardown for success and failure.
                cleanup(tb_hdfs_path)
                t_log.do_run = False
                t_log.join()

                if mpi.returncode != 0:
                    raise Exception(
                        'mpirun FAILED, look in the logs for the error')
    finally:
        # Stop the GPU monitor on every executor, not only on the chief, so
        # the thread does not outlive the task.
        if gpu_monitor_started:
            t_gpus.do_run = False
            t_gpus.join()
def _wrapper_fun(iter):
    """Spark executor entry point for a single-node Horovod (mpirun) job.

    Downloads the notebook at ``nb_path``, converts it to a Python script
    with ``jupyter nbconvert``, runs the script under ``mpirun`` with
    ``MPI_NP`` processes, and finally cleans up TensorBoard and the logger.

    Names such as ``app_id``, ``run_id``, ``nb_path``, ``local_logdir``,
    ``print_log`` and ``cleanup`` are not parameters; they are read from the
    surrounding scope.

    Args:
        iter: partition iterator; its last element is used as the executor
            number.

    Raises:
        Exception: if ``mpirun`` exits with a non-zero code.
    """
    for i in iter:
        executor_num = i

    hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
        app_id, run_id, None, 'horovod')

    tb_pid = 0
    tb_hdfs_path = ''

    pydoop.hdfs.dump('',
                     os.environ['EXEC_LOGFILE'],
                     user=hopshdfs.project_user())
    hopshdfs.init_logger()
    hopshdfs.log('Starting Spark executor with arguments')
    if executor_num == 0:
        tb_hdfs_path, tb_pid = tensorboard.register(hdfs_exec_logdir,
                                                    hdfs_appid_logdir,
                                                    0,
                                                    local_logdir=local_logdir)

    gpu_str = ('\n\nChecking for GPUs in the environment\n' +
               devices.get_gpu_info())
    hopshdfs.log(gpu_str)
    print(gpu_str)

    # 1. Download notebook file
    fs_handle = hopshdfs.get_fs()

    # The first call uses the `flags` keyword and the fallback uses `mode`.
    # NOTE(review): presumably two API generations of open_file - confirm.
    try:
        fd = fs_handle.open_file(nb_path, flags='r')
    except Exception:
        fd = fs_handle.open_file(nb_path, mode='r')

    try:
        notebook = ''.join(fd)
    finally:
        fd.close()

    filename = os.path.basename(nb_path)
    with open(filename, "w+") as f_nb:
        f_nb.write(notebook)

    # 2. Convert notebook to py file
    jupyter_runnable = os.path.abspath(
        os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
    conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
    conversion = subprocess.Popen(conversion_cmd,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    # communicate() drains both pipes while waiting. A separate wait() first
    # can deadlock once the child fills a pipe buffer.
    stdout, stderr = conversion.communicate()
    print(stdout)
    print(stderr)

    # 3. Make py file runnable
    # splitext strips only the final extension, so notebook names that
    # contain dots still map to the right .py file.
    py_runnable = os.getcwd() + '/' + os.path.splitext(filename)[0] + '.py'
    st = os.stat(py_runnable)
    os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

    t_gpus = threading.Thread(target=devices.print_periodic_gpu_utilization)
    gpu_monitor_started = devices.get_num_gpus() > 0
    if gpu_monitor_started:
        t_gpus.start()

    mpi_logfile_path = os.getcwd() + '/mpirun.log'
    if os.path.exists(mpi_logfile_path):
        os.remove(mpi_logfile_path)

    # 4. Run allreduce
    mpi_np = os.environ['MPI_NP']
    mpi_cmd = ('HOROVOD_TIMELINE=' + tensorboard.logdir() +
               '/timeline.json' +
               ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() +
               ' mpirun -np ' + str(mpi_np) +
               ' -bind-to none -map-by slot ' +
               ' -x HOROVOD_TIMELINE ' +
               ' -x TENSORBOARD_LOGDIR ' +
               ' -x NCCL_DEBUG=INFO ' +
               os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable)

    # The context manager guarantees the mpirun log handle is released.
    with open(mpi_logfile_path, 'w') as mpi_logfile:
        # NOTE(review): the command is a shell string built by
        # concatenation; it is only safe while every component is trusted.
        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        if gpu_monitor_started:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        # Same teardown for success and failure.
        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

    if return_code != 0:
        raise Exception('mpirun FAILED, look in the logs for the error')

    hopshdfs.kill_logger()
def test_fun(args, ctx):
    """TensorFlowOnSpark map function: distributed LSTM next-symbol model.

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    two-layer LSTM over ``n_input`` symbols and run training steps under a
    ``tf.train.Supervisor``. The inference branch is a placeholder.

    NOTE(review): the training loop reads ``training_data``, which is not
    defined anywhere in this function, and looks symbols up in
    ``dictionary``, which is created empty here. Both must be provided
    before the training branch can run - confirm against the caller.

    Args:
        args: parsed arguments; this function reads ``rdma``, ``mode``,
            ``epochs``, ``steps`` and ``output``.
        ctx: node context; this function reads ``worker_num``, ``job_name``,
            ``task_index`` and ``cluster_spec``.
    """
    # Dependencies
    from tensorflowonspark import TFNode
    from datetime import datetime
    import getpass
    import math
    # Imported under the alias the training loop actually uses (`np`).
    import numpy as np
    import os
    import random
    import signal
    import tensorflow as tf
    import time

    from tensorflow.contrib import rnn

    # Used for TensorBoard logdir
    from hops import tensorboard

    # Extract configuration
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Get TF cluster/server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    # Parameters
    batch_size = 100
    display_iter = 1000
    training_iters = 50000
    learning_rate = 0.0001
    n_input = 3
    n_hidden = 512
    n_predictions = 32

    # Utility functions
    def elapsed(sec):
        if sec < 60:
            return str(sec) + " sec"
        elif sec < (60 * 60):
            return str(sec / 60) + " min"
        else:
            return str(sec / (60 * 60)) + " hr"

    def print_log(worker_num, arg):
        print("%d: " % worker_num)
        print(arg)

    def RNN(x, weights, biases, n_input, n_hidden):
        # Reshape to [1, n_input]
        x = tf.reshape(x, [-1, n_input])

        # Generate a n_input-element sequence of inputs
        # (eg. [had] [a] [general] -> [20] [6] [33])
        x = tf.split(x, n_input, 1)

        rnn_cell = rnn.MultiRNNCell(
            [rnn.BasicLSTMCell(n_hidden),
             rnn.BasicLSTMCell(n_hidden)])

        # Generate prediction
        outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

        # There are n_input outputs but we only want the last output
        return tf.matmul(outputs[-1], weights['out']) + biases['out']

    def get_loss_fn(logits, labels):
        return tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=labels))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # TODO What does this do?
        # Assigns ops to the local worker by default
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # TODO Set up vocab_size by loading in dataset and parsing
            # through it?
            dictionary = {}
            reverse_dictionary = {}
            vocab_size = 32

            # Placeholders or QueueRunner/Readers for input data
            num_epochs = 1 if args.mode == "inference" else (
                None if args.epochs == 0 else args.epochs)
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None

            # RNN output node weights and biases
            hidden_weights = tf.Variable(tf.random_normal(
                [n_hidden, vocab_size]),
                                         name="hidden_weights")
            hidden_biases = tf.Variable(tf.random_normal([vocab_size]),
                                        name="hidden_biases")
            weights = {'out': hidden_weights}
            biases = {'out': hidden_biases}

            # Graph input placeholders
            x = tf.placeholder("float", [None, n_input, 1])
            y = tf.placeholder("float", [None, vocab_size])

            # Set up TFOS
            global_step = tf.Variable(0)

            pred = RNN(x, weights, biases, n_input, n_hidden)
            cost = get_loss_fn(logits=pred, labels=y)
            # Note that the global_step is passed in to the optimizer's min.
            # function
            optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(
                    loss=cost, global_step=global_step)

            # Model evaluation
            correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            # TF summaries
            tf.summary.scalar("cost", cost)
            tf.summary.histogram("hidden_weights", hidden_weights)
            tf.summary.scalar("acc", accuracy)

            # TODO XXX Below is copied directly from TFOS example
            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and
        # stores model state into HDFS
        logdir = tensorboard.logdir()
        print("tensorflow model path: {0}".format(logdir))

        # Only the chief worker writes summaries.
        if task_index == 0:
            summary_writer = tf.summary.FileWriter(
                logdir, graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     summary_writer=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
            # Configure output path on HDFS
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            step = 0
            count = 0
            offset = random.randint(0, n_input + 1)
            end_offset = n_input + 1
            acc_total = 0
            loss_total = 0

            # TODO writer.add_graph(session.graph)? Might be taken care of by
            # setup of summary_writer

            # TODO Set up args.steps
            # Loop until supervisor shuts down or max. iters have completed
            while not sv.should_stop() and step < args.steps:
                # TODO Determine what makes THIS asynch, and whether we need
                # synch.
                # TODO A good resource may be https://stackoverflow.com/questions/41293576/distributed-tensorflow-good-example-for-synchronous-training-on-cpus
                # Run a training step asynchronously
                # See `tf.train.SyncReplicasOptimizer` for additional details
                # on how to perform *synchronous* training.

                # Using QueueRunner/Readers
                if args.mode == "train":
                    # TODO Below is merely a copy-pasta of the local TF code,
                    # and will need refactoring
                    # Restart from a random offset when the window would run
                    # past the end of the data.
                    if offset > (len(training_data) - end_offset):
                        offset = random.randint(0, n_input + 1)

                    # Input: ids of the n_input symbols starting at offset.
                    symbols_in_keys = [[dictionary[str(training_data[i])]]
                                       for i in range(offset,
                                                      offset + n_input)]
                    symbols_in_keys = np.reshape(np.array(symbols_in_keys),
                                                 [-1, n_input, 1])

                    # Target: one-hot vector of the symbol after the window.
                    symbols_out_onehot = np.zeros([vocab_size], dtype=float)
                    symbols_out_onehot[dictionary[str(
                        training_data[offset + n_input])]] = 1.0
                    symbols_out_onehot = np.reshape(symbols_out_onehot,
                                                    [1, -1])

                    # Run iteration and increment 'step'
                    _, summary, acc, loss, onehot_pred, step = sess.run(
                        [
                            optimizer, summary_op, accuracy, cost, pred,
                            global_step
                        ],
                        feed_dict={
                            x: symbols_in_keys,
                            y: symbols_out_onehot
                        })
                    loss_total += loss
                    acc_total += acc
                    if ((step + 1) % display_iter) == 0:
                        # Report the accuracy computed for this step;
                        # `accuracy` depends on the x/y placeholders and
                        # cannot be run without a feed_dict.
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step, acc))
                        # TODO migrate over print fn from local TF code
                    offset += (n_input + 1)

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    # Inference is not implemented yet; the snippet below is
                    # kept as the porting reference.
                    # labels, pred, acc = sess.run([label, prediction, accuracy])
                    # # print("label: {0}, pred: {1}".format(labels, pred))
                    # print("acc: {0}".format(acc))
                    # for i in range(len(labels)):
                    #     count += 1
                    #     output_file.write("{0} {1}\n".format(labels[i], pred[i]))
                    # print("count: {0}".format(count))
                    pass

            if args.mode == "inference":
                output_file.close()
                # Delay chief worker from shutting down supervisor during
                # inference, since it can load model, start session, run
                # inference and request stop before the other workers even
                # start/sync their sessions.
                if task_index == 0:
                    time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()