def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK.

    Feeds the integers 0..999 (one per record) through the cluster, squares
    each batch on the executors, and checks that the sum of the returned
    values equals the sum of squares computed locally.
    """

    def _map_fun(args, ctx):
        # Executor-side function: square every batch pulled from the feed.
        import tensorflow as tf

        feed = TFNode.DataFeed(ctx.mgr, False)
        while not feed.should_stop():
            batch = feed.next_batch(batch_size=10)
            print("batch: {}".format(batch))
            squared = tf.math.square(batch)
            print("squares: {}".format(squared))
            feed.batch_results(squared.numpy())

    # set up input as tensors of shape [1] to match placeholder
    records = [[value] for value in range(1000)]
    rdd = self.sc.parallelize(records, 10)

    cluster = TFCluster.run(
        self.sc,
        _map_fun,
        tf_args={},
        num_executors=self.num_workers,
        num_ps=0,
        input_mode=TFCluster.InputMode.SPARK,
    )
    observed_total = cluster.inference(rdd).sum()
    self.assertEqual(observed_total, sum([v * v for v in range(1000)]))
    cluster.shutdown()
def train(self, data_rdd, model_rdd, batch_size, epochs, model_dir, go_on=False):
    """Train a model on the cluster and return the recorded results.

    :param data_rdd: training data; its ``.rdd`` attribute is fed to the cluster
    :param model_rdd: model description handed to ``TFTrainWorker``
    :param batch_size: records per batch, used to derive steps per epoch
    :param epochs: number of epochs passed to the worker and the feed
    :param model_dir: directory wrapped by ``ModelDir``
    :param go_on: when true the model directory is created (not rebuilt) and
        the flag is forwarded to the worker
    :return: DataFrame built from ``ModelDir.read_result()``
    """
    sample_count = data_rdd.count()
    # Round up so a trailing partial batch still gets a step.
    steps_per_epoch = math.ceil(sample_count / batch_size / self.num_workers)
    assert steps_per_epoch > 0

    md = ModelDir(model_dir, 'train*')
    if not go_on:
        md = md.rebuild_model_dir()
    else:
        md.create_model_dir()

    worker = TFTrainWorker(
        model_rdd,
        go_on=go_on,
        batch_size=batch_size,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        **md.to_dict()
    )
    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd.rdd, num_epochs=epochs, feed_timeout=60000)
    cluster.shutdown()

    return self.sqlc.createDataFrame(md.read_result())
def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK.

    Uses a ``tf.train.Supervisor`` managed session on each worker to square
    the fed batches, then compares the sum of the results with the locally
    computed sum of squares.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        cluster, server = TFNode.start_cluster_server(ctx)
        if ctx.job_name == "ps":
            server.join()
        elif ctx.job_name == "worker":
            device_fn = tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % ctx.task_index,
                cluster=cluster,
            )
            with tf.device(device_fn):
                x = tf.placeholder(tf.int32, [None, 1])
                sq = tf.square(x)
                init_op = tf.global_variables_initializer()

            is_chief = ctx.task_index == 0
            sv = tf.train.Supervisor(is_chief=is_chief, init_op=init_op)
            with sv.managed_session(server.target) as sess:
                feed = TFNode.DataFeed(ctx.mgr, False)
                # Loop until either the supervisor or the feed asks to stop.
                while not (sv.should_stop() or feed.should_stop()):
                    outputs = sess.run([sq], feed_dict={x: feed.next_batch(10)})
                    feed.batch_results(outputs[0])
            sv.stop()

    # set up input as tensors of shape [1] to match placeholder
    records = [[value] for value in range(1000)]
    rdd = self.sc.parallelize(records, 10)

    cluster = TFCluster.run(
        self.sc,
        _map_fun,
        tf_args={},
        num_executors=self.num_workers,
        num_ps=0,
        input_mode=TFCluster.InputMode.SPARK,
    )
    observed_total = cluster.inference(rdd).sum()
    self.assertEqual(sum([v * v for v in range(1000)]), observed_total)
    cluster.shutdown()
def test_port_unreleased(self):
    """Test that temporary socket/port is unreleased prior to invoking user map_fun.

    The cluster is started with ``release_port=False``; the map function
    checks that the reserved port is still bound (a second bind must fail),
    releases it explicitly, and checks that the context no longer holds the
    temporary socket.
    """

    def _map_fun(args, ctx):
        import socket

        assert ctx.tmp_socket is not None
        reserved_port = ctx.tmp_socket.getsockname()[1]

        # Only the bind is expected to fail, so the probe socket is created
        # outside the try block and always closed afterwards (the original
        # leaked it).
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            # socket bind to tmp port should fail
            probe.bind(('0.0.0.0', reserved_port))
            assert False, "should never hit this assert statement"
        except socket.error as e:
            # Expected path: the port is still held by ctx.tmp_socket.
            print(e)
        finally:
            probe.close()

        ctx.release_port()
        assert ctx.tmp_socket is None

    cluster = TFCluster.run(
        self.sc,
        _map_fun,
        tf_args={},
        num_executors=self.num_workers,
        num_ps=0,
        input_mode=TFCluster.InputMode.TENSORFLOW,
        master_node='chief',
        release_port=False,
    )
    cluster.shutdown()
def run(self, model_fn, args):
    """Launch ``self.execute`` on a TensorFlowOnSpark cluster and shut it down.

    Settings are read from ``$PROJECT_ROOT/defaults.cfg`` (sections ``gcp``
    and ``environment``) and used to build the Spark context via
    ``utils.get_context``.

    :param model_fn: passed to ``self.set_graph_modules``
    :param args: namespace providing ``app_name``, ``cluster_size``,
        ``num_ps``, ``tensorboard`` and ``job_dir``
    :raises KeyError: if ``PROJECT_ROOT`` is not set in the environment
    """
    from tensorflowonspark import TFCluster

    self.set_graph_modules(model_fn)

    config = cp.ConfigParser()
    defaults_path = '{PROJECT_ROOT}/defaults.cfg'.format(**os.environ)
    # Close the config file deterministically (the original leaked the handle).
    with open(defaults_path) as defaults_file:
        # ``read_file`` supersedes the deprecated ``readfp``; fall back to
        # ``readfp`` for parser implementations that predate ``read_file``.
        load_config = getattr(config, 'read_file', None) or config.readfp
        load_config(defaults_file)

    project = config.get('gcp', 'project')
    keyfile = "/etl/credentials/bi-service-155107.json"
    app_name = args.app_name
    submit_host = config.get('environment', 'submit_host')
    python_lib = config.get('environment', 'python_lib')
    python_files = utils.get_list(config.get('environment', 'python_files'))

    sc = utils.get_context(app_name, project, keyfile, submit_host, python_lib, python_files)

    cluster = TFCluster.run(
        sc,
        self.execute,
        args,
        args.cluster_size,
        args.num_ps,
        tensorboard=args.tensorboard,
        input_mode=TFCluster.InputMode.TENSORFLOW,
        log_dir=args.job_dir,
        master_node='master',
        reservation_timeout=1800,
    )
    cluster.shutdown()
def test_inputmode_spark_exception(self):
    """Distributed TF cluster w/ InputMode.SPARK and exception during feeding.

    The map function raises right after returning results for its first
    non-empty batch; the test expects an exception while running the cluster.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        feed = TFNode.DataFeed(ctx.mgr, False)
        while not feed.should_stop():
            batch = feed.next_batch(10)
            if len(batch) == 0:
                continue
            squared = tf.math.square(batch)
            feed.batch_results(squared.numpy())
            raise Exception("FAKE exception during feeding")

    # set up input as tensors of shape [1] to match placeholder
    records = [[value] for value in range(1000)]
    rdd = self.sc.parallelize(records, 10)

    with self.assertRaises(Exception):
        cluster = TFCluster.run(
            self.sc,
            _map_fun,
            tf_args={},
            num_executors=self.num_workers,
            num_ps=0,
            input_mode=TFCluster.InputMode.SPARK,
        )
        cluster.inference(rdd, feed_timeout=1).count()
        cluster.shutdown()
def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK.

    Each worker squares the fed batches inside a
    ``tf.train.MonitoredTrainingSession``; the summed results must equal the
    locally computed sum of squares of 0..999.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        cluster, server = TFNode.start_cluster_server(ctx)
        if ctx.job_name == "ps":
            server.join()
        elif ctx.job_name == "worker":
            device_fn = tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % ctx.task_index,
                cluster=cluster,
            )
            with tf.device(device_fn):
                x = tf.placeholder(tf.int32, [None, 1])
                sq = tf.square(x)
                # Result is not referenced again; the call itself is kept.
                init_op = tf.global_variables_initializer()

            is_chief = ctx.task_index == 0
            with tf.train.MonitoredTrainingSession(is_chief=is_chief) as sess:
                feed = TFNode.DataFeed(ctx.mgr, False)
                while not (sess.should_stop() or feed.should_stop()):
                    outputs = sess.run([sq], feed_dict={x: feed.next_batch(10)})
                    feed.batch_results(outputs[0])

    # set up input as tensors of shape [1] to match placeholder
    records = [[value] for value in range(1000)]
    rdd = self.sc.parallelize(records, 10)

    cluster = TFCluster.run(
        self.sc,
        _map_fun,
        tf_args={},
        num_executors=self.num_workers,
        num_ps=0,
        input_mode=TFCluster.InputMode.SPARK,
    )
    observed_total = cluster.inference(rdd).sum()
    self.assertEqual(observed_total, sum([v * v for v in range(1000)]))
    cluster.shutdown()
def test_inputmode_spark_late_exception(self):
    """Distributed TF cluster w/ InputMode.SPARK and exception after feeding.

    The map function consumes the whole feed, sleeps for two seconds and then
    raises; the test expects an exception while running/shutting down the
    cluster.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        feed = TFNode.DataFeed(ctx.mgr, False)
        while not feed.should_stop():
            batch = feed.next_batch(10)
            if len(batch) == 0:
                continue
            squared = tf.math.square(batch)
            feed.batch_results(squared.numpy())

        # simulate post-feed actions that raise an exception
        time.sleep(2)
        raise Exception("FAKE exception after feeding")

    # set up input as tensors of shape [1] to match placeholder
    records = [[value] for value in range(1000)]
    rdd = self.sc.parallelize(records, 10)

    with self.assertRaises(Exception):
        cluster = TFCluster.run(
            self.sc,
            _map_fun,
            tf_args={},
            num_executors=self.num_workers,
            num_ps=0,
            input_mode=TFCluster.InputMode.SPARK,
        )
        cluster.inference(rdd).count()
        # note: grace_secs must be larger than the time needed for post-feed actions
        cluster.shutdown(grace_secs=5)
def train(self, data, output_path, steps, batch_size):
    """Run a ``CGAN_MLP`` worker on the cluster.

    Ensures ``<output_path>/checkpoint`` and ``<output_path>/results`` exist,
    builds the worker with those paths, starts the cluster and shuts it down.

    :param data: passed through to ``CGAN_MLP``
    :param output_path: base directory for checkpoints and results
    :param steps: passed through to ``CGAN_MLP``
    :param batch_size: passed through to ``CGAN_MLP``
    """
    checkpoint_path = os.path.join(output_path, 'checkpoint')
    result_path = os.path.join(output_path, 'results')

    for directory in (checkpoint_path, result_path):
        if not tf.gfile.Exists(directory):
            # MakeDirs also creates missing parents, so a fresh output_path
            # no longer makes the directory creation fail (MkDir required
            # the parent to exist already).
            tf.gfile.MakeDirs(directory)

    worker = CGAN_MLP(data, result_path, checkpoint_path, steps, batch_size)
    cluster = TFCluster.run(
        self.sc, worker, None, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.shutdown()
def evaluate(self, data_rdd, steps, model_dir):
    """Evaluate on the cluster and return the recorded results.

    :param data_rdd: evaluation data; its ``.rdd`` attribute is fed to the cluster
    :param steps: total step count; when ``<= 0`` the record count of
        ``data_rdd`` is used instead
    :param model_dir: directory wrapped by ``ModelDir``
    :return: DataFrame built from ``ModelDir.read_result()``
    """
    md = ModelDir(model_dir, 'evaluate*')

    if steps <= 0:
        total_steps = data_rdd.count()
    else:
        total_steps = steps
    # Split the total evenly across workers, rounding up.
    steps_per_epoch = math.ceil(total_steps / self.num_workers)

    worker = EvaluateWorker(steps_per_epoch=steps_per_epoch, **md.to_dict())
    md.delete_result_file()

    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd.rdd, num_epochs=1)
    cluster.shutdown()

    return self.sqlc.createDataFrame(md.read_result())
def recurrent_predict(self, data_rdd, units, steps, feature_type, model_dir):
    """Run ``RecurrentPredictWorker`` on the cluster and return its results.

    :param data_rdd: input data; its ``.rdd`` attribute is fed to the cluster
    :param units: forwarded to the worker
    :param steps: forwarded to the worker
    :param feature_type: forwarded to the worker
    :param model_dir: directory wrapped by ``ModelDir``
    :return: DataFrame with one ``result`` column, one row per entry of
        ``ModelDir.read_result(True)``
    """
    md = ModelDir(model_dir, 'recurrent_predict*')
    worker = RecurrentPredictWorker(
        units=units,
        steps=steps,
        feature_type=feature_type,
        **md.to_dict()
    )
    md.delete_result_file()

    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
    cluster.shutdown()

    rows = [{"result": item} for item in md.read_result(True)]
    return self.sqlc.createDataFrame(rows)
def test_port_released(self):
    """Test that temporary socket/port is released prior to invoking user map_fun.

    The cluster is started without ``release_port``; the map function asserts
    that the context no longer holds a temporary socket.
    """

    def _map_fun(args, ctx):
        assert ctx.tmp_socket is None

    run_options = {
        'tf_args': {},
        'num_executors': self.num_workers,
        'num_ps': 0,
        'input_mode': TFCluster.InputMode.TENSORFLOW,
        'master_node': 'chief',
    }
    cluster = TFCluster.run(self.sc, _map_fun, **run_options)
    cluster.shutdown()
def test_basic_tf(self):
    """Single-node TF graph (w/ args) running independently on multiple executors.

    Every executor adds the two constants taken from ``args`` in its own
    session and asserts that the result is 3.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        lhs = tf.constant(args['x'])
        rhs = tf.constant(args['y'])
        total = tf.add(lhs, rhs)
        with tf.Session() as sess:
            fetched = sess.run([total])
            assert fetched[0] == 3

    tf_args = {'x': 1, 'y': 2}
    cluster = TFCluster.run(
        self.sc,
        _map_fun,
        tf_args=tf_args,
        num_executors=self.num_workers,
        num_ps=0,
    )
    cluster.shutdown()
def test_basic_tf(self):
    """Single-node TF graph (w/ args) running independently on multiple executors.

    The map function builds ``x + y`` from the supplied args, evaluates it in
    a ``tf.Session`` and asserts the value is 3.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        addition = tf.add(tf.constant(args['x']), tf.constant(args['y']))
        with tf.Session() as sess:
            outputs = sess.run([addition])
            assert outputs[0] == 3

    cluster = TFCluster.run(
        self.sc,
        _map_fun,
        tf_args={'x': 1, 'y': 2},
        num_executors=self.num_workers,
        num_ps=0,
    )
    cluster.shutdown()
def predict(self, data_rdd, steps, model_dir, output_prob=False):
    """Run ``PredictWorker`` on the cluster and return the recorded results.

    :param data_rdd: input data; its ``.rdd`` attribute is fed to the cluster
    :param steps: total step count; when ``<= 0`` the record count of
        ``data_rdd`` is used instead
    :param model_dir: directory wrapped by ``ModelDir``
    :param output_prob: forwarded to the worker
    :return: DataFrame built from ``ModelDir.read_result()``
    """
    md = ModelDir(model_dir, 'predict*')

    if steps <= 0:
        total_steps = data_rdd.count()
    else:
        total_steps = steps
    # Split the total evenly across workers, rounding up.
    steps_per_epoch = math.ceil(total_steps / self.num_workers)

    worker = PredictWorker(
        steps_per_epoch=steps_per_epoch,
        output_prob=output_prob,
        **md.to_dict()
    )
    md.delete_result_file()

    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
    cluster.shutdown()

    return self.sqlc.createDataFrame(md.read_result())
def yolov3_tiny_train(self, model_rdd, batch_size, epochs, classes_path, anchors_path,
                      train_path, val_path, image_size, model_dir, weights_path=None,
                      freeze_body=2, go_on=False):
    """Train with ``YOLOV3TinyModelTrainWorker`` and return the recorded results.

    The training set is the text file at ``train_path``; its line count,
    ``batch_size`` and ``self.num_workers`` determine the steps per epoch.
    Remaining parameters are forwarded to the worker unchanged.

    :param model_rdd: must expose a ``model_config`` column
    :param go_on: when true the model directory is created (not rebuilt)
    :return: DataFrame built from ``ModelDir.read_result()``
    """
    assert "model_config" in model_rdd.columns, "not exists model layer config!"
    assert tf.io.gfile.exists(train_path), "train dataset path not exists!"

    data_rdd = self.sc.textFile(train_path)
    sample_count = data_rdd.count()
    steps_per_epoch = math.ceil(sample_count / batch_size / self.num_workers)

    md = ModelDir(model_dir, 'train*')
    if not go_on:
        md = md.rebuild_model_dir()
    else:
        md.create_model_dir()

    worker = YOLOV3TinyModelTrainWorker(
        model_rdd,
        go_on=go_on,
        batch_size=batch_size,
        epochs=epochs,
        classes_path=classes_path,
        anchors_path=anchors_path,
        weights_path=weights_path,
        val_path=val_path,
        image_size=image_size,
        steps_per_epoch=steps_per_epoch,
        freeze_body=freeze_body,
        **md.to_dict()
    )
    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd, num_epochs=epochs, feed_timeout=60000)
    cluster.shutdown()

    return self.sqlc.createDataFrame(md.read_result())
def test_inputmode_spark_late_exception(self):
    """Distributed TF cluster w/ InputMode.SPARK and exception after feeding.

    Workers square the fed batches inside a
    ``tf.train.MonitoredTrainingSession``, then sleep for two seconds and
    raise; the test expects an exception while running/shutting down the
    cluster.
    """

    def _map_fun(args, ctx):
        import tensorflow as tf

        cluster, server = TFNode.start_cluster_server(ctx)
        if ctx.job_name == "ps":
            server.join()
        elif ctx.job_name == "worker":
            device_fn = tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % ctx.task_index,
                cluster=cluster,
            )
            with tf.device(device_fn):
                x = tf.placeholder(tf.int32, [None, 1])
                sq = tf.square(x)
                # Result is not referenced again; the call itself is kept.
                init_op = tf.global_variables_initializer()

            is_chief = ctx.task_index == 0
            with tf.train.MonitoredTrainingSession(is_chief=is_chief) as sess:
                feed = TFNode.DataFeed(ctx.mgr, False)
                while not (sess.should_stop() or feed.should_stop()):
                    batch = feed.next_batch(10)
                    if len(batch) == 0:
                        continue
                    outputs = sess.run([sq], feed_dict={x: batch})
                    feed.batch_results(outputs[0])

            # simulate post-feed actions that raise an exception
            time.sleep(2)
            raise Exception("FAKE exception after feeding")

    # set up input as tensors of shape [1] to match placeholder
    records = [[value] for value in range(1000)]
    rdd = self.sc.parallelize(records, 10)

    with self.assertRaises(Exception):
        cluster = TFCluster.run(
            self.sc,
            _map_fun,
            tf_args={},
            num_executors=self.num_workers,
            num_ps=0,
            input_mode=TFCluster.InputMode.SPARK,
        )
        cluster.inference(rdd).count()
        # note: grace_secs must be larger than the time needed for post-feed actions
        cluster.shutdown(grace_secs=5)
def yolov3_train(self, model_rdd, data_dir, batch_size, epochs, image_size, model_dir,
                 weights_path=None, freeze_body=2, go_on=False):
    """Train with ``YOLOV3ModelTrainWorker`` on ``<data_dir>/train.txt``.

    The line count of the training file, ``batch_size`` and
    ``self.num_workers`` determine the steps per epoch.

    :param go_on: when true the model directory is created (not rebuilt)
    :return: DataFrame built from ``ModelDir.read_result()``, or ``None``
        when that result is empty/falsy
    """
    train_path = os.path.join(data_dir, 'train.txt')
    assert tf.io.gfile.exists(train_path), "train dataset path not exists!"

    data_rdd = self.sc.textFile(train_path)
    sample_count = data_rdd.count()
    steps_per_epoch = math.ceil(sample_count / batch_size / self.num_workers)

    md = ModelDir(model_dir, 'train*')
    if not go_on:
        md = md.rebuild_model_dir()
    else:
        md.create_model_dir()

    worker = YOLOV3ModelTrainWorker(
        model_rdd,
        data_dir,
        go_on=go_on,
        batch_size=batch_size,
        epochs=epochs,
        image_size=image_size,
        steps_per_epoch=steps_per_epoch,
        freeze_body=freeze_body,
        **md.to_dict()
    )
    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd, num_epochs=epochs, feed_timeout=60000)
    cluster.shutdown()

    results = md.read_result()
    if not results:
        return None
    return self.sqlc.createDataFrame(results)
def run(self, input_dir, output_dir, *args, **kwargs):
    """Run ``MTCNNWorker`` over the dataset found in ``input_dir``.

    ``<output_dir>/text`` and ``<output_dir>/images`` are wiped if present
    and recreated; ``<output_dir>/result`` is only created. Extra positional
    and keyword arguments are forwarded to the worker.
    """
    out_text_dir, out_image_dir, out_result_dir = (
        os.path.join(output_dir, leaf) for leaf in ('text', 'images', 'result')
    )

    # Start from clean text/image directories; the result directory is kept.
    for stale_dir in (out_text_dir, out_image_dir):
        if tf.io.gfile.exists(stale_dir):
            tf.io.gfile.rmtree(stale_dir)
    for target_dir in (out_text_dir, out_image_dir, out_result_dir):
        tf.io.gfile.makedirs(target_dir)

    dataset = facenet.get_dataset(input_dir)
    class_records = [(cls.name, cls.image_paths) for cls in dataset]
    data_rdd = self.sc.parallelize(class_records)

    worker = MTCNNWorker(out_text_dir, out_image_dir, out_result_dir, *args, **kwargs)
    cluster = TFCluster.run(
        self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
        input_mode=self.input_mode,
    )
    cluster.train(data_rdd, feed_timeout=60000)
    cluster.shutdown()
for epoch in range(FLAGS.num_epoch): train_batches = train_reader.yieldBatches() print("Epoch: %d" % epoch) step = 0 for dense_x,sparse_idx,sparse_values,y in train_batches: start_time = datetime.now() _ ,train_loss,train_auc,summ,_ = sess.run([train_op,loss,auc_op,summary_op,step_update_op], feed_dict={dense_inputs:dense_x,sparse_inputs:(sparse_idx,sparse_values,shape),labels:y}) step += 1 assert not np.isnan(train_loss), 'Model diverged with loss = NaN' time_used = datetime.now() - start_time if step % FLAGS.display_step == 0: g_step, = sess.run([global_step]) print("step: " + str(step) + ", global_step: " + str(g_step)) summary_writer.add_summary(summ,g_step) print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format( g_step, g_step*FLAGS.batch_size, time_used, train_loss, train_auc)) sys.stdout.flush() total_time = datetime.now() - begin_time print("Training Done!!") print("Total time used: {}".format(total_time)) if __name__ == "__main__": sc = SparkContext(conf=SparkConf().setAppName("tfos_online_train_distributed")) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 64 tensorboard = False cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.shutdown()
import mnist_dist

# Spark driver script: parse the command line, start a TensorFlowOnSpark
# cluster running mnist_dist.map_fun in InputMode.TENSORFLOW, then shut down.
sc = SparkContext(conf=SparkConf().setAppName("mnist_tf"))
executors = sc._conf.get("spark.executor.instances")
if executors is None:
    num_executors = 1
else:
    num_executors = int(executors)
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)",
                    choices=["csv", "pickle", "tfr"], default="tfr")
parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format")
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format")
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test",
                    default="mnist_model")
parser.add_argument("-n", "--cluster_size",
                    help="number of nodes in the cluster (for Spark Standalone)",
                    type=int, default=num_executors)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output",
                    default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads",
                    type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process",
                    action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(
    sc,
    mnist_dist.map_fun,
    args,
    args.cluster_size,
    num_ps,
    args.tensorboard,
    TFCluster.InputMode.TENSORFLOW,
)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
parser.add_argument("--rdma", help="use rdma connection", default=False) parser.add_argument("--readers", help="number of reader/enqueue threads per worker", type=int, default=10) parser.add_argument("--shuffle_size", help="size of shuffle buffer", type=int, default=1000) parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) print("{0} ===== Start".format(datetime.now().isoformat())) cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, driver_ps_nodes=args.driver_ps_nodes) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
if not tf.gfile.Exists(FLAGS.train_dir): tf.gfile.MakeDirs(FLAGS.train_dir) inception_distributed_train.train(server.target, dataset, cluster_spec, ctx) if __name__ == '__main__': # parse arguments needed by the Spark driver import argparse parser = argparse.ArgumentParser() parser.add_argument("--epochs", help="number of epochs", type=int, default=0) parser.add_argument("--input_data", help="HDFS path to input dataset") parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark","tf"], default="tf") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") (args,rem) = parser.parse_known_args() input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW print("{0} ===== Start".format(datetime.now().isoformat())) sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train')) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 1 cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode) if input_mode == TFCluster.InputMode.SPARK: dataRDD = sc.newAPIHadoopFile(args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat", keyClass="org.apache.hadoop.io.BytesWritable", valueClass="org.apache.hadoop.io.NullWritable") cluster.train(dataRDD, args.epochs) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
duration = time.time() - self._start_time loss_value = run_values.results if self._step % 10 == 0: num_examples_per_step = FLAGS.batch_size examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), self._step, loss_value, examples_per_sec, sec_per_batch)) with tf.train.MonitoredTrainingSession( checkpoint_dir=FLAGS.train_dir, hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss), _LoggerHook()], config=tf.ConfigProto( log_device_placement=FLAGS.log_device_placement)) as mon_sess: while not mon_sess.should_stop(): mon_sess.run(train_op) if __name__ == '__main__': sc = SparkContext(conf=SparkConf().setAppName("cifar10_train")) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 0 cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW) cluster.shutdown()
parser.add_argument("--epochs", help="number of epochs", type=int, default=3) parser.add_argument("--model_dir", help="path to save model/checkpoint", default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=469) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, master_node='chief') cluster.shutdown()
# arguments for Spark and TFoS parser = argparse.ArgumentParser() parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=executors) parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1) (args, remainder) = parser.parse_known_args() # construct an ARGV (with script name as first element) from remaining args and pass it to the TF processes on executors remainder.insert(0, __file__) print("spark args:", args) print("tf args:", remainder) num_workers = args.cluster_size - args.num_ps print("===== num_executors={}, num_workers={}, num_ps={}".format( args.cluster_size, num_workers, args.num_ps)) cluster = TFCluster.run(sc, main_fun, remainder, args.cluster_size, args.num_ps, False, TFCluster.InputMode.TENSORFLOW, master_node='master') cluster.shutdown()
type=int, default=3) parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-4) parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model") parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief', eval_node=True) cluster.shutdown(grace_secs=60)
plt.xlabel('Epoch') plt.ylabel('Accuracy') plt.title('Accuracy per epoch train vs test') plt.legend() plt.grid(True) plt.show() plt.plot(train_cost[zoom_point:]) plt.plot(test_cost[zoom_point:]) plt.xlabel('Epoch') plt.ylabel('Loss') plt.title('Loss per epoch train vs test') plt.legend() plt.grid(True) plt.show() if __name__ == '__main__': # tf.app.run() import argparse parser = argparse.ArgumentParser() parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args, rem = parser.parse_known_args() sc = SparkContext(conf=SparkConf().setAppName("your_app_name")) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 1 tensorboard = True cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.shutdown()
plt.grid(True) plt.show() def main_fun(argv, ctx): worker_num = ctx.worker_num job_name = ctx.job_name print(f"Starting worker {worker_num} on task {job_name}") hype_random(worker_num) if __name__ == '__main__': # tf.app.run() import argparse parser = argparse.ArgumentParser() parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args, rem = parser.parse_known_args() sc = SparkContext(conf=SparkConf().setAppName("lab4_task6")) # num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 1 num_workers = 4 tensorboard = True cluster = TFCluster.run(sc, main_fun, [], num_workers, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.shutdown()
help="number of ps nodes", type=int, default=1) parser.add_argument("--task_num", help="number of worker nodes", type=int, default=1) parser.add_argument("--max_steps", help="max number of steps to train", type=int, default=20000) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) assert (args.num_ps + args.task_num == num_executors) cluster = TFCluster.run(sc, main_func, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='master') cluster.shutdown()
type=int, default=num_executors) parser.add_argument( "--model", help="HDFS path to save/load model during train/inference", default="mnist_model") parser.add_argument("--output", help="HDFS path to save test/inference output", default="predictions") parser.add_argument("--num_ps", help="number of PS nodes in cluster", type=int, default=1) parser.add_argument("--steps", help="maximum number of steps", type=int, default=1000) args = parser.parse_args() print("args:", args) cluster = TFCluster.run(sc, main, args, args.cluster_size, args.num_ps, tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model, master_node='master') cluster.shutdown()
train_tensor, logdir=FLAGS.train_dir, master=server.target, is_chief=(FLAGS.task == 0), init_fn=_get_init_fn(), summary_op=summary_op, number_of_steps=FLAGS.max_number_of_steps, log_every_n_steps=FLAGS.log_every_n_steps, save_summaries_secs=FLAGS.save_summaries_secs, save_interval_secs=FLAGS.save_interval_secs, summary_writer=summary_writer, sync_optimizer=optimizer if FLAGS.sync_replicas else None) if __name__ == '__main__': import argparse sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier")) executors = sc._conf.get("spark.executor.instances") num_executors = int(executors) if executors is not None else 1 parser = argparse.ArgumentParser() parser.add_argument("--num_ps_tasks", help="number of PS nodes", type=int, default=0) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) (args,rem) = parser.parse_known_args() assert(num_executors > args.num_ps_tasks) cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.shutdown()
default="mnist_model") parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args = parser.parse_args() print("args:", args) # create RDD of input data def parse(ln): vec = [int(x) for x in ln.split(',')] return (vec[1:], vec[0]) stream = ssc.textFileStream(args.images_labels) images_labels = stream.map(parse) cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=1, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.SPARK, log_dir=args.model_dir, master_node='chief') cluster.train( images_labels, feed_timeout=86400 ) # extend feed timeout to 24hrs for streaming data to arrive ssc.start() cluster.shutdown(ssc)
def toNumpy(bytestr): example = tf.train.Example() example.ParseFromString(bytestr) features = example.features.feature image = numpy.array(features['image'].int64_list.value) label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": images = sc.textFile( args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile( args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": images = sc.pickleFile(args.images) labels = sc.pickleFile(args.labels) print("zipping images and labels") dataRDD = images.zip(labels) cluster = TFCluster.run(sc, cifar100_dist2.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) if args.mode == "train": cluster.train(dataRDD, args.epochs) else: labelRDD = cluster.inference(dataRDD) labelRDD.saveAsTextFile(args.output) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))
def main(args=None):
    """Train or run inference for the mitosis model on a TensorFlowOnSpark cluster.

    Image paths are gathered for the mitosis (label 1) and normal (label 0)
    classes, either by listing an HDFS directory or by reading a file of
    paths, then loaded with ``read_images`` and fed to the cluster in
    ``InputMode.SPARK``.

    Fixes relative to the previous revision:

    * The ``--*_img_csv`` branches used the non-existent ``sc.read`` and never
      produced the labelled path lists used afterwards; they now read the
      paths with ``sc.textFile(...).collect()``.
    * The partition count is clamped to at least 1, since the computed value
      is 0 for fewer than 256000 samples.
    * ``--normal_img_dir`` is no longer unconditionally required, so the
      existing "at least one of" validation is reachable and a CSV alone is
      accepted (mirrors the mitosis options).

    :param args: optional argument list; ``None`` lets argparse read ``sys.argv``
    """
    spark = SparkSession.builder.appName("mitosis_spark").getOrCreate()
    sc = spark.sparkContext
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    logging.info("============= Num of executors: {0}".format(num_executors))

    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument("--appName", default="mitosis_spark", help="application name")
    parser.add_argument("--hdfs_host", help="HDFS host", type=str, default="default")
    parser.add_argument("--hdfs_port", help="HDFS port", type=int, default=8020)
    parser.add_argument("--mitosis_img_dir", help="path to the mitosis image files")
    parser.add_argument("--mitosis_img_csv",
                        help="csv file that contain all the mitosis image files")
    parser.add_argument("--normal_img_dir", help="path to the normal image files")
    parser.add_argument("--normal_img_csv",
                        help="csv file that contain all the normal image files")
    parser.add_argument("--batch_size", help="number of records per batch",
                        type=int, default=32)
    parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
    parser.add_argument("--export_dir", help="HDFS path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--format", help="example format: (csv|pickle|tfr)",
                        choices=["csv", "pickle", "tfr"], default="csv")
    parser.add_argument("--model",
                        help="HDFS path to save/load model during train/inference",
                        default="mnist_model")
    parser.add_argument("--cluster_size", help="number of nodes in the cluster",
                        type=int, default=num_executors)
    parser.add_argument("--output", help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("--readers", help="number of reader/enqueue threads",
                        type=int, default=1)
    parser.add_argument("--steps", help="maximum number of steps", type=int, default=99)
    parser.add_argument("--tensorboard", help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--mode", help="train|inference", default="train")
    parser.add_argument("--rdma", help="use rdma connection", default=False)
    args = parser.parse_args(args)

    if args.mitosis_img_dir is None and args.mitosis_img_csv is None:
        parser.error(
            "at least one of --mitosis_img_dir and --mitosis_img_csv required")
    if args.normal_img_dir is None and args.normal_img_csv is None:
        parser.error(
            "at least one of --normal_img_dir and --normal_img_csv required")

    def labeled_paths(label, img_dir, img_csv):
        # Return [(label, path), ...] from the path file when one is given,
        # otherwise from a listing of the HDFS directory.
        if img_csv is None:
            fs = get_hdfs(args.hdfs_host, args.hdfs_port)
            paths = fs.ls(img_dir)
        else:
            paths = sc.textFile(img_csv).collect()
        return [(label, path) for path in paths]

    # get the train data set with mitosis and normal images. In the output RDD,
    # each entry will be (label, img_arr)
    training_data = []
    training_data.extend(labeled_paths(1, args.mitosis_img_dir, args.mitosis_img_csv))
    training_data.extend(labeled_paths(0, args.normal_img_dir, args.normal_img_csv))
    print("+++++++++++ Training data size: {}".format(len(training_data)))

    # Spark rejects a non-positive partition count, so never go below one.
    num_partitions = max(1, int(len(training_data) / 128 / 2000))
    data_RDD = sc.parallelize(training_data) \
        .repartition(num_partitions) \
        .mapPartitions(
            lambda rows: read_images(get_hdfs(args.hdfs_host, args.hdfs_port), rows))

    cluster = TFCluster.run(sc, mitosis_dist.map_fun, args, args.cluster_size, num_ps,
                            args.tensorboard, TFCluster.InputMode.SPARK,
                            log_dir=args.model)
    if args.mode == "train":
        cluster.train(data_RDD, args.epochs)
    else:
        labelRDD = cluster.inference(data_RDD)
        labelRDD.saveAsTextFile(args.output)
    cluster.shutdown(grace_secs=30)

    print("{0} ===== Stop".format(datetime.now().isoformat()))
label = numpy.array(features['label'].int64_list.value) return (image, label) dataRDD = images.map(lambda x: toNumpy(str(x[0]))) else: if args.format == "csv": # HDFS==>numpy array images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')]) labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')]) else: # args.format == "pickle": # HDFS==>numpy array images = sc.pickleFile(args.images) labels = sc.pickleFile(args.labels) print("zipping images and labels") # print(type(labels)) # print(labels.count()) dataRDD = images.zip(labels) # image+label #cluster = TFCluster.reserve(sc, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) #cluster.start(mnist_dist.map_fun, args) cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK) if args.mode == "train" or args.mode == "retrain": cluster.train(dataRDD, args.epochs) else: labelRDD = cluster.inference(dataRDD) labelRDD.saveAsTextFile(args.output) cluster.shutdown() # 集群关闭 print("{0} ===== Stop".format(datetime.now().isoformat()))
parser.add_argument("--epochs", help="number of epochs", type=int, default=1) parser.add_argument( "--steps", help="maximum number of steps", type=int, default=1000) args=parser.parse_args() data_loader=TextLoader( sc, args.data_dir, args.batch_size, args.seq_length) args.vocab_size = data_loader.vocab_size defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS") working_dir = os.getcwd() config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'), defaultFS, working_dir) sc.parallelize([args]).saveAsPickleFile(config_file) chars_vocab_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir) sc.parallelize([data_loader.chars, data_loader.vocab]).saveAsPickleFile(chars_vocab_file) dataRDD=sc.parallelize(data_loader.get_data_for_feeder()) cluster=TFCluster.run(sc, main_fun, args, num_executors, args.num_ps_tasks, TFCluster.InputMode.SPARK) cluster.train(dataRDD, args.epochs) cluster.shutdown() print("{0} ===== Stop".format(datetime.now().isoformat()))