def _map_fun(args, ctx):
            import time
            import tensorflow as tf
            from tensorflowonspark import TFNode
            cluster, server = TFNode.start_cluster_server(ctx)
            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x = tf.placeholder(tf.int32, [None, 1])
                    sq = tf.square(x)
                    init_op = tf.global_variables_initializer()
                with tf.train.MonitoredTrainingSession(
                        is_chief=(ctx.task_index == 0)) as sess:
                    tf_feed = TFNode.DataFeed(ctx.mgr, False)
                    while not sess.should_stop() and not tf_feed.should_stop():
                        batch = tf_feed.next_batch(10)
                        if len(batch) > 0:
                            outputs = sess.run([sq], feed_dict={x: batch})
                            tf_feed.batch_results(outputs[0])

                # simulate post-feed actions that raise an exception
                time.sleep(2)
                raise Exception("FAKE exception after feeding")
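
A map function like the one above is normally launched from the Spark driver with TFCluster in InputMode.SPARK, which is what wires RDD partitions into TFNode.DataFeed. The sketch below is only an assumed driver-side snippet: `sc`, `args`, `dataRDD` and the executor counts are placeholders that do not appear in the example itself.

# Hypothetical driver-side launch (assumed; not part of the example above)
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, _map_fun, args, num_executors=3, num_ps=1,
                        tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
cluster.train(dataRDD, num_epochs=1)   # partitions are delivered to TFNode.DataFeed in each worker
cluster.shutdown()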
Example #2
        def _tf_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.TENSORFLOW"""
            import tensorflow as tf
            from tensorflowonspark import TFNode

            tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

            cluster, server = TFNode.start_cluster_server(ctx)

            def _get_examples(batch_size):
                """Generate test data (mocking a queue_runner of file inputs)"""
                features = tf.random_uniform([batch_size,
                                              2])  # (batch_size x 2)
                weights = tf.constant([[3.14], [1.618]])  # (2, 1)
                labels = tf.matmul(features, weights)
                return features, labels

            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x, y_ = _get_examples(10)  # no input placeholders; TF code reads (or in this case "generates") the input
                    w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
                    y = tf.matmul(x, w, name='y')
                    global_step = tf.Variable(0)

                    cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
                    optimizer = tf.train.GradientDescentOptimizer(
                        0.5).minimize(cost, global_step)

                    init_op = tf.global_variables_initializer()
                    saver = tf.train.Saver()

                sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                         init_op=init_op)
                step = 0
                with sv.managed_session(server.target) as sess:
                    while not sv.should_stop() and step < args.steps:
                        opt, weights, step = sess.run(
                            [optimizer, w, global_step])
                        if (step % 100 == 0):
                            print("step: {}, weights: {}".format(
                                step, weights))

                    if sv.is_chief:
                        if args.model_dir:
                            # manually save checkpoint
                            ckpt_name = args.model_dir + "/model.ckpt"
                            print("Saving checkpoint to: {}".format(ckpt_name))
                            saver.save(sess, ckpt_name)
                sv.stop()
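
Because `_tf_train` generates its own input inside the graph, the corresponding driver call would use InputMode.TENSORFLOW and simply wait for the workers to finish instead of feeding an RDD. A minimal sketch under the same assumptions as above (`sc` and `args` are placeholders):

# Hypothetical driver-side launch for InputMode.TENSORFLOW (assumed)
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, _tf_train, args, num_executors=3, num_ps=1,
                        tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()   # waits for the TF worker processes to complete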
Example #3
 def __call__(self, args, ctx):
     self.task_index = ctx.task_index
     self.job_name = ctx.job_name
     self.cluster, self.server = TFNode.start_cluster_server(ctx)
     self.tf_feed = TFNode.DataFeed(ctx.mgr)
     if ctx.job_name == "ps":
         self.server.join()
     elif ctx.job_name == "worker":
         self.build_model()
         self.execute()
Example #4
    def _spark_train(args, ctx):
      """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
      import tensorflow as tf
      from tensorflowonspark import TFNode

      tf.reset_default_graph()                          # reset graph in case we're re-using a Spark python worker

      cluster, server = TFNode.start_cluster_server(ctx)
      if ctx.job_name == "ps":
        server.join()
      elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % ctx.task_index,
          cluster=cluster)):
          x = tf.placeholder(tf.float32, [None, 2], name='x')
          y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
          w = tf.Variable(tf.truncated_normal([2,1]), name='w')
          y = tf.matmul(x, w, name='y')
          y2 = tf.square(y, name="y2")                      # extra/optional output for testing multiple output tensors
          cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
          optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)
          init_op = tf.global_variables_initializer()
          saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                init_op=init_op)
        with sv.managed_session(server.target) as sess:
          tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
          while not sv.should_stop() and not tf_feed.should_stop():
            batch = tf_feed.next_batch(10)
            if args.input_mapping:
              if len(batch['x']) > 0:
                feed = { x: batch['x'], y_: batch['y_'] }
                sess.run(optimizer, feed_dict=feed)  # only run when the batch is non-empty

          if sv.is_chief:
            if args.model_dir:
              # manually save checkpoint
              ckpt_name = args.model_dir + "/model.ckpt"
              print("Saving checkpoint to: {}".format(ckpt_name))
              saver.save(sess, ckpt_name)
            elif args.export_dir:
              # export a saved_model
              signatures = {
                'test_key': {
                  'inputs': { 'features': x },
                  'outputs': { 'prediction': y },
                  'method_name': 'test'
                }
              }
              TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures)
            else:
              print("WARNING: model state not saved.")

        sv.stop()
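
When `args.export_dir` is used in the branch above, the exported saved_model can later be reloaded with the standard TF 1.x loader. A small sketch, assuming an `export_dir` variable pointing at that directory; the tensor names 'x:0' and 'y:0' come from the placeholder and matmul names defined in the example:

# Hypothetical reload of the model exported via TFNode.export_saved_model above
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(sess, ['test_tag'], export_dir)
    x = sess.graph.get_tensor_by_name('x:0')   # 'features' input in the 'test_key' signature
    y = sess.graph.get_tensor_by_name('y:0')   # 'prediction' output in the 'test_key' signature
    print(sess.run(y, feed_dict={x: [[1.0, 2.0]]}))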
Example #5
 def __call__(self, args, ctx):
     self.task_index = ctx.task_index
     self.job_name = ctx.job_name
     self.cluster, self.server = TFNode.start_cluster_server(ctx)
     self.tf_feed = TFNode.DataFeed(ctx.mgr)
     if ctx.job_name == "ps":
         self.server.join()
     elif ctx.job_name == "worker":
         self.create_tmp_dir()
         self.process()
         self.delete_tmp_dir()
Example #6
def main_fun(argv, ctx):
    import tensorflow as tf
    from tensorflowonspark import TFNode
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    cluster_spec, server = TFNode.start_cluster_server(ctx)
    # (disabled in the original example)
    # if job_name == "ps":
    #     time.sleep((worker_num + 1) * 5)
    #
    # if job_name == "ps":
    #     server.join()
    # elif job_name == "worker":
    hello = tf.constant('Hello, TensorFlow!')
    sess = tf.Session()
    print(sess.run(hello))
def main_fun(argv, ctx):
    from src import facenet_distributed_train
    from src import vipus_distributed_train
    import sys

    job_name = ctx.job_name
    assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'
    print("argv:", argv)
    sys.argv = argv

    cluster, server = TFNode.start_cluster_server(ctx, num_gpus=1)
    if job_name == 'ps':
        server.join()
    else:
        if argv.model == 'FACENET':
            facenet_distributed_train.train(server, ctx.cluster_spec, argv,
                                            ctx)
        elif argv.model == 'VIPUS':
            vipus_distributed_train.train(server, ctx.cluster_spec, argv, ctx)
 def _map_fun(args, ctx):
   import tensorflow as tf
   from tensorflowonspark import TFNode
   cluster, server = TFNode.start_cluster_server(ctx)
   if ctx.job_name == "ps":
     server.join()
   elif ctx.job_name == "worker":
     with tf.device(tf.train.replica_device_setter(
       worker_device="/job:worker/task:%d" % ctx.task_index,
       cluster=cluster)):
       x = tf.placeholder(tf.int32, [None, 1])
       sq = tf.square(x)
       init_op = tf.global_variables_initializer()
     sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                             init_op=init_op)
     with sv.managed_session(server.target) as sess:
       tf_feed = TFNode.DataFeed(ctx.mgr, False)
       while not sv.should_stop() and not tf_feed.should_stop():
         outputs = sess.run([sq], feed_dict={ x: tf_feed.next_batch(10) })
         tf_feed.batch_results(outputs[0])
     sv.stop()
Example #9
def main_fun(argv, ctx):
  import sys
  import tensorflow as tf
  from tensorflowonspark import TFNode
  from inception import inception_eval
  from inception.imagenet_data import ImagenetData

  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS._parse_flags()
  print("FLAGS:", FLAGS.__dict__['__flags'])

  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  inception_eval.evaluate(dataset)
def main_fun(argv, ctx):

    # extract node metadata from ctx
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

    from inception import inception_distributed_train
    from inception.imagenet_data import ImagenetData
    import sys
    import tensorflow as tf
    from tensorflowonspark import TFNode

    # instantiate FLAGS on workers using argv from driver and add job_name and task_id
    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS.job_name = job_name
    FLAGS.task_id = task_index
    print("FLAGS:", FLAGS.__dict__['__flags'])

    # Get TF cluster and server instances
    cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus,
                                                       FLAGS.rdma)

    if FLAGS.job_name == 'ps':
        # `ps` jobs wait for incoming connections from the workers.
        server.join()
    else:
        # `worker` jobs will actually do the work.
        dataset = ImagenetData(subset=FLAGS.subset)
        assert dataset.data_files()
        # Only the chief checks for or creates train_dir.
        if FLAGS.task_id == 0:
            if not tf.gfile.Exists(FLAGS.train_dir):
                tf.gfile.MakeDirs(FLAGS.train_dir)
        inception_distributed_train.train(server.target, dataset, cluster_spec,
                                          ctx)
 def _map_fun(args, ctx):
     import tensorflow as tf
     from tensorflowonspark import TFNode
     cluster, server = TFNode.start_cluster_server(ctx)
     if ctx.job_name == "ps":
         server.join()
     elif ctx.job_name == "worker":
         with tf.device(
                 tf.train.replica_device_setter(
                     worker_device="/job:worker/task:%d" %
                     ctx.task_index,
                     cluster=cluster)):
             x = tf.placeholder(tf.int32, [None, 1])
             sq = tf.square(x)
             init_op = tf.global_variables_initializer()
         sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                  init_op=init_op)
         with sv.managed_session(server.target) as sess:
             tf_feed = TFNode.DataFeed(ctx.mgr, False)
             while not sv.should_stop() and not tf_feed.should_stop():
                 outputs = sess.run(
                     [sq], feed_dict={x: tf_feed.next_batch(10)})
                 tf_feed.batch_results(outputs[0])
         sv.stop()
def main_fun(argv, ctx):

  # extract node metadata from ctx
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  from inception import inception_distributed_train
  from inception.imagenet_data import ImagenetData
  import sys
  import tensorflow as tf
  from tensorflowonspark import TFNode

  # instantiate FLAGS on workers using argv from driver and add job_name and task_id
  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = job_name
  FLAGS.task_id = task_index
  print("FLAGS:", FLAGS.__dict__['__flags'])

  # Get TF cluster and server instances
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
Example #13
def map_fun(args, ctx):
	from tensorflowonspark import TFNode
	from datetime import datetime
	import math
	import numpy
	import tensorflow as tf
	import time

	worker_num = ctx.worker_num
	job_name = ctx.job_name
	task_index = ctx.task_index
	cluster_spec = ctx.cluster_spec

	IMAGE_PIXELS=28

	# Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
	if job_name == "ps":
		time.sleep((worker_num + 1) * 5)

	# Parameters
	hidden_units = 128
	batch_size   = args.batch_size

	# Get TF cluster and server instances
	cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)
	
	def writeFileToHDFS():
		# copy the locally-saved model files up to HDFS
		import os
		from pyhdfs import HdfsClient  # assumes the pyhdfs package provides the HdfsClient used here
		rootdir = '/tmp/mnist_model'
		client = HdfsClient(hosts='localhost:50070')
		client.mkdirs('/user/root/mnist_model')
		for parent, dirnames, filenames in os.walk(rootdir):
			print("parent is: {0}".format(parent))
			for filename in filenames:
				client.copy_from_local(os.path.join(parent, filename),
						os.path.join('/user/root/mnist_model', filename),
						overwrite=True)


	def feed_dict(batch):
		# Convert from [(images, labels)] to two numpy arrays of the proper type
		images = []
		labels = []
		for item in batch:
			images.append(item[0])
			labels.append(item[1])
		xs = numpy.array(images)
		xs = xs.astype(numpy.float32)
		xs = xs/255.0
		ys = numpy.array(labels)
		ys = ys.astype(numpy.uint8)
		return (xs, ys)

	if job_name == "ps":
		server.join()
	elif job_name == "worker":

		# Assigns ops to the local worker by default.
		with tf.device(tf.train.replica_device_setter(
			worker_device="/job:worker/task:%d" % task_index,
			cluster=cluster)):

			# Variables of the hidden layer
			hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
							stddev=1.0 / IMAGE_PIXELS), name="hid_w")
			hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
			tf.summary.histogram("hidden_weights", hid_w)

			# Variables of the softmax layer
			sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
							stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
			sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
			tf.summary.histogram("softmax_weights", sm_w)

			# Placeholders or QueueRunner/Readers for input data
			x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
			y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

			x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
			tf.summary.image("x_img", x_img)

			hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
			hid = tf.nn.relu(hid_lin)

			y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

			global_step = tf.Variable(0)

			loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
			tf.summary.scalar("loss", loss)

			train_op = tf.train.AdagradOptimizer(0.01).minimize(
							loss, global_step=global_step)

			# Test trained model
			label = tf.argmax(y_, 1, name="label")
			prediction = tf.argmax(y, 1,name="prediction")
			correct_prediction = tf.equal(prediction, label)

			accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
			tf.summary.scalar("acc", accuracy)

			saver = tf.train.Saver()
			summary_op = tf.summary.merge_all()
			init_op = tf.global_variables_initializer()

		# Create a "supervisor", which oversees the training process and stores model state into HDFS
#		logdir = TFNode.hdfs_path(ctx, args.model)
		logdir = "hdfs:///tmp/" + args.model
		print("tensorflow model path: {0}".format(logdir))
		summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

		if args.mode == "train":
			sv = tf.train.Supervisor(is_chief=(task_index == 0),
								logdir=logdir,
								init_op=init_op,
								summary_op=None,
								saver=saver,
								global_step=global_step,
								summary_writer=summary_writer,
								stop_grace_secs=300,
								save_model_secs=10)
		else:
			sv = tf.train.Supervisor(is_chief=(task_index == 0),
								logdir=logdir,
								summary_op=None,
								saver=saver,
								global_step=global_step,
								stop_grace_secs=300,
								save_model_secs=0)

		# The supervisor takes care of session initialization, restoring from
		# a checkpoint, and closing when done or an error occurs.
		with sv.managed_session(server.target) as sess:
			print("{0} session ready".format(datetime.now().isoformat()))

			# Loop until the supervisor shuts down or 1000000 steps have completed.
			step = 0
			tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
			while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
				# Run a training step asynchronously.
				# See `tf.train.SyncReplicasOptimizer` for additional details on how to
				# perform *synchronous* training.

				# using feed_dict
				batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
				feed = {x: batch_xs, y_: batch_ys}

				if len(batch_xs) > 0:
					if args.mode == "train":
						_, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
						# print accuracy and save model checkpoint to HDFS every 100 steps
						if (step % 100 == 0):
							print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))

						if sv.is_chief:
							summary_writer.add_summary(summary, step)
							
					else: # args.mode == "inference"
						labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

						results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
						tf_feed.batch_results(results)
						print("acc: {0}".format(acc))

			if sv.should_stop() or step >= args.steps:
				tf_feed.terminate()
				writeFileToHDFS()

		# Ask for all the services to stop.
		print("{0} stopping supervisor".format(datetime.now().isoformat()))
		sv.stop()
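
In inference mode this map function pushes its per-record result strings back through tf_feed.batch_results, and on the driver those strings come back as an RDD. A hedged driver-side sketch, where `sc`, `imagesRDD` and the output path are placeholders not present in the example:

# Hypothetical driver-side inference launch for the map_fun above (assumed)
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, map_fun, args, num_executors=3, num_ps=1,
                        tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
predictions = cluster.inference(imagesRDD)             # strings emitted via tf_feed.batch_results()
predictions.saveAsTextFile("hdfs:///tmp/predictions")  # placeholder output path
cluster.shutdown()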
Example #14
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import getpass
    import math
    import numpy
    import os
    import signal
    import tensorflow as tf
    import time

    IMAGE_PIXELS = 28
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = 100

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def print_log(worker, arg):
        # local stand-in for the module-level print_log helper this example assumes
        print("{0}: {1}".format(worker, arg))

    def read_csv_examples(image_dir,
                          label_dir,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))
        # Setup queue of csv image filenames
        tf_record_pattern = os.path.join(image_dir, 'part-*')
        images = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "images: {0}".format(images))
        image_queue = tf.train.string_input_producer(images,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="image_queue")

        # Setup queue of csv label filenames
        tf_record_pattern = os.path.join(label_dir, 'part-*')
        labels = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "labels: {0}".format(labels))
        label_queue = tf.train.string_input_producer(labels,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="label_queue")

        # Setup reader for image queue
        img_reader = tf.TextLineReader(name="img_reader")
        _, img_csv = img_reader.read(image_queue)
        image_defaults = [[1.0] for col in range(784)]
        img = tf.stack(tf.decode_csv(img_csv, image_defaults))  # tf.pack was renamed tf.stack in TF 1.0
        # Normalize values to [0,1]
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(img, norm)
        print_log(worker_num, "image: {0}".format(image))

        # Setup reader for label queue
        label_reader = tf.TextLineReader(name="label_reader")
        _, label_csv = label_reader.read(label_queue)
        label_defaults = [[1.0] for col in range(10)]
        label = tf.stack(tf.decode_csv(label_csv, label_defaults))  # tf.pack was renamed tf.stack in TF 1.0
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch_csv")

    def read_tfr_examples(path,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of TFRecord filenames
        tf_record_pattern = os.path.join(path, 'part-*')
        files = tf.gfile.Glob(tf_record_pattern)
        queue_name = "file_queue"

        # split input files across workers, if specified
        if task_index is not None and num_workers is not None:
            num_files = len(files)
            files = files[task_index:num_files:num_workers]
            queue_name = "file_queue_{0}".format(task_index)

        print_log(worker_num, "files: {0}".format(files))
        file_queue = tf.train.string_input_producer(files,
                                                    shuffle=False,
                                                    capacity=1000,
                                                    num_epochs=num_epochs,
                                                    name=queue_name)

        # Setup reader for examples
        reader = tf.TFRecordReader(name="reader")
        _, serialized = reader.read(file_queue)
        feature_def = {
            'label': tf.FixedLenFeature([10], tf.int64),
            'image': tf.FixedLenFeature([784], tf.int64)
        }
        features = tf.parse_single_example(serialized, feature_def)
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(tf.to_float(features['image']), norm)
        print_log(worker_num, "image: {0}".format(image))
        label = tf.to_float(features['label'])
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch")

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # Placeholders or QueueRunner/Readers for input data
            num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None

            if args.format == "csv":
                images = TFNode.hdfs_path(ctx, args.images)
                labels = TFNode.hdfs_path(ctx, args.labels)
                x, y_ = read_csv_examples(images, labels, 100, num_epochs,
                                          index, workers)
            elif args.format == "tfr":
                images = TFNode.hdfs_path(ctx, args.images)
                x, y_ = read_tfr_examples(images, 100, num_epochs, index,
                                          workers)
            else:
                raise Exception("{0} format not supported for tf input mode".format(args.format))

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = 0
            count = 0
            while not sv.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using QueueRunners/Readers
                if args.mode == "train":
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy)))
                    _, summary, step = sess.run(
                        [train_op, summary_op, global_step])
                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, pred, acc = sess.run([label, prediction, accuracy])
                    #print("label: {0}, pred: {1}".format(labels, pred))
                    print("acc: {0}".format(acc))
                    for i in range(len(labels)):
                        count += 1
                        output_file.write("{0} {1}\n".format(
                            labels[i], pred[i]))
                    print("count: {0}".format(count))

        if args.mode == "inference":
            output_file.close()
            # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
            # run inference and request stop before the other workers even start/sync their sessions.
            if task_index == 0:
                time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def main_fun(args, ctx):
    # ctx - node metadata like job_name, task_id

    main_path = args.main_path

    import sys
    sys.path.append(main_path + "/CatDog-CNN-Tensorflow-OnSpark/")

    import tensorflow as tf
    import tensorflowonspark
    from tensorflowonspark import TFNode
    import conv_net
    import utils
    import datetime
    from image_op import get_tensor

    tf.app.flags.DEFINE_string('train_dir', main_path + '/data_catsdogs/train',
                               """Directory with training images """)
    tf.app.flags.DEFINE_string('checkpoint_path',
                               main_path + 'checkpoints/catdog_spark',
                               """Directory with checkpoints """)
    tf.app.flags.DEFINE_string('graph_path', main_path + 'graphs/catdog_spark',
                               """Directory with graphs """)

    FLAGS = tf.app.flags.FLAGS

    cluster, server = TFNode.start_cluster_server(ctx)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    log_file = main_path + "log_spark.txt"

    n_epoch = int(args.n_epoch)
    dataset_size = int(args.dataset_size)
    batch_size = int(args.batch_size)

    model = conv_net.CatDogConvNet(FLAGS.checkpoint_path,
                                   FLAGS.graph_path,
                                   dataset_size=dataset_size,
                                   batch_size=batch_size,
                                   num_workers=num_executors,  # num_executors is expected from the enclosing driver script
                                   task_index=task_index,
                                   ctx=ctx,
                                   server=server,
                                   worker=worker_num)
    model.training_folder = FLAGS.train_dir
    model.log_file = main_path + "log_spark.txt"
    print('building a model')
    utils.write_log(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
                    'Testing the model ...', log_file)
    with tf.name_scope('data'):
        # path, train_size, test_size, batch_size, desired_shape=300
        train_data, test_data = get_tensor(
            model.training_folder,
            int(model.dataset_size * (1 - model.test_percent)),
            int(model.dataset_size * model.test_percent),
            model.batch_size,
            desired_shape=model.desired_shape,
            num_workers=model.num_workers,
            task_index=model.task_index)

        # train_data = train_data.repeat(FLAGS.n_epoch+1)

        iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                   train_data.output_shapes)
        img, model.label = iterator.get_next()

        # reshape the image to make it work with tf.nn.conv2d:
        img = tf.reshape(
            img, shape=[-1, model.desired_shape, model.desired_shape, 1])
        model.img = tf.cast(img, tf.float32)

        model.train_init = iterator.make_initializer(
            train_data)  # initializer for train_data
        model.test_init = iterator.make_initializer(
            test_data)  # initializer for train_data
    model.build()

    print('testing')
    model.eval_accuracy_spark()
Example #16
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time
  import logging
  log = logging.getLogger(__name__)  # stand-in for the module-level `log` this example assumes

  worker_num = ctx.worker_num  # number of workers
  job_name = ctx.job_name  # job name
  task_index = ctx.task_index  # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS=10 # image size (MNIST is 28x28x1; adjust to match your own image size)
  channels=4
  num_class=2

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps": # ps节点(主节点)
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128 # NN hidden layer size
  batch_size   = args.batch_size # number of samples per training batch

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs/255.0 # normalize data to [0, 1]
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      #------------- basic NN model (can be replaced with your own model) -------------#
      '''
      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      # tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      # tf.summary.histogram("softmax_weights", sm_w)
      '''

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # the two middle stride values of 1 mean no subsampling in x or y
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # the two middle stride values of 2 sample every other pixel in x and y

      # Store layers weight & bias
      weights = {
        # 5x5 conv, 32 outputs; color images have 3 input channels, grayscale images have 1
        'wc1': tf.Variable(tf.random_normal([5, 5, channels, 32])),  # 5x5 convolution kernel
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
        # fully connected, 7*7*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([(1+IMAGE_PIXELS // 4) * (1+IMAGE_PIXELS // 4) * 64, 1024])),
        # 1024 inputs, 10 outputs (class prediction)
        'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
        'bc1': tf.Variable(tf.random_normal([32])),
        'bc2': tf.Variable(tf.random_normal([64])),
        'bd1': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([num_class]))
      }


      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS*channels], name="x") # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels]) # MNIST data is 28x28x1 (grayscale, one band)
      # tf.summary.image("x_img", x_img)


      # switched to a convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      fc1 = tf.nn.relu(fc1)
      if args.mode == "train":
        fc1 = tf.nn.dropout(fc1, 0.7)
      y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

      '''
      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b) # tf.nn.add(tf.nn.matmul(x,hid_w),hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
      '''
      # global_step = tf.Variable(0)

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

      loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_,logits=y))

      # tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

      #------------- end of model (the model above can be replaced with your own) -------------#

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir)) #
    log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess: # open the session
      logging.basicConfig(level=logging.INFO)

      print("{0} session ready".format(datetime.now().isoformat()))
      log.info("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op,  global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
              log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            log.info("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
Example #17
        def _spark_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
            import tensorflow as tf
            from tensorflowonspark import TFNode
            from datetime import datetime

            class ExportHook(tf.train.SessionRunHook):
                def __init__(self, export_dir, input_tensor, output_tensor):
                    self.export_dir = export_dir
                    self.input_tensor = input_tensor
                    self.output_tensor = output_tensor

                def end(self, session):
                    print("{} ======= Exporting to: {}".format(
                        datetime.now().isoformat(), self.export_dir))
                    signatures = {
                        "test_key": {
                            'inputs': {
                                'features': self.input_tensor
                            },
                            'outputs': {
                                'prediction': self.output_tensor
                            },
                            'method_name':
                            tf.saved_model.signature_constants.
                            PREDICT_METHOD_NAME
                        }
                    }
                    TFNode.export_saved_model(session, self.export_dir,
                                              "test_tag", signatures)
                    print("{} ======= Done exporting".format(
                        datetime.now().isoformat()))

            tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

            cluster, server = TFNode.start_cluster_server(ctx)
            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x = tf.placeholder(tf.float32, [None, 2], name='x')
                    y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
                    w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
                    y = tf.matmul(x, w, name='y')
                    y2 = tf.square(
                        y, name="y2"
                    )  # extra/optional output for testing multiple output tensors
                    global_step = tf.train.get_or_create_global_step()
                    cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
                    optimizer = tf.train.GradientDescentOptimizer(
                        0.5).minimize(cost, global_step)

                chief_hooks = [
                    ExportHook(ctx.absolute_path(args.export_dir), x, y)
                ] if args.export_dir else []
                with tf.train.MonitoredTrainingSession(
                        master=server.target,
                        is_chief=(ctx.task_index == 0),
                        checkpoint_dir=args.model_dir,
                        chief_only_hooks=chief_hooks) as sess:
                    tf_feed = TFNode.DataFeed(ctx.mgr,
                                              input_mapping=args.input_mapping)
                    while not sess.should_stop() and not tf_feed.should_stop():
                        batch = tf_feed.next_batch(10)
                        if args.input_mapping:
                            if len(batch['x']) > 0:
                                feed = {x: batch['x'], y_: batch['y_']}
                                sess.run(optimizer, feed_dict=feed)  # only run when the batch is non-empty
Example #18
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time
    import logging
    import cnn_lstm_ctc_ocr
    #import redis_logger_handler
    #redis_logger_handler.logging_setup(args.redis)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    worker_name = '(worker:%s tf:%s idx:%s)' % (worker_num, job_name,
                                                task_index)

    logging.info(
        '{0} batch_size:{1} initial_learning_rate:{2} decay_steps:{3} decay_rate:{4} momentum:{5}'
        .format(worker_name, args.batch_size, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum))
    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    CHANNELS = 1
    IMAGE_WIDTH = 120
    IMAGE_HEIGHT = 45

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def sparse_tuple_from_label(sequences, dtype=numpy.int32):
        indices = []
        values = []
        for n, seq in enumerate(sequences):
            indices.extend(zip([n] * len(seq), range(len(seq))))
            values.extend(seq)
        indices = numpy.asarray(indices, dtype=numpy.int64)
        values = numpy.asarray(values, dtype=dtype)
        shape = numpy.asarray(
            [len(sequences),
             numpy.asarray(indices).max(0)[1] + 1],
            dtype=numpy.int64)
        return indices, values, shape

    def get_input_lens(sequences):
        lengths = numpy.asarray([58 for s in sequences], dtype=numpy.int64)
        return sequences, lengths

    def placeholder_inputs(image_width, image_height, channels):
        images_placeholder = tf.placeholder(
            tf.float32, [None, image_height, image_width, channels])
        labels_placeholder = tf.sparse_placeholder(tf.int32)
        seqlen_placeholder = tf.placeholder(tf.int32, [None])
        keep_prob = tf.placeholder(tf.float32)
        return images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob

    def format_batch(data_set, batch_size, image_height, image_width,
                     channels):
        batch = data_set.next_batch(batch_size)
        images = []
        labels = []
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        # [batch_size, height * width] => [batch_size, height, width, channels]
        xs = xs.reshape(batch_size, image_height, image_width, channels)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.
        ys = labels
        return xs, ys

    def fill_feed_dict(xs,
                       ys,
                       images_pl,
                       labels_pl,
                       seqlen_pl,
                       keep_prob,
                       train=True):
        images_feed, seqlen_feed = get_input_lens(xs)
        labels_feed = sparse_tuple_from_label(ys)
        if train:
            feed_dict = {
                images_pl: images_feed,
                labels_pl: labels_feed,
                seqlen_pl: seqlen_feed,
                keep_prob: 0.5,
            }
        else:
            feed_dict = {
                images_pl: images_feed,
                labels_pl: labels_feed,
                seqlen_pl: seqlen_feed,
                keep_prob: 1,
            }
        return feed_dict

    def do_eval(sess, dense_decoded, lastbatch_err, learning_rate,
                images_placeholder, labels_placeholder, seqlen_placeholder,
                keep_prob, train, xs, ys):
        true_count = 0  # Counts the number of correct predictions.
        feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                   labels_placeholder, seqlen_placeholder,
                                   keep_prob, train)
        dd, lerr, lr = sess.run([dense_decoded, lastbatch_err, learning_rate],
                                feed_dict=feed_dict)
        #accuracy calculation
        for i, origin_label in enumerate(ys):
            decoded_label = [j for j in dd[i] if j != -1]
            if i < 10:
                logging.info('{0} seq {1} => origin:{2} decoded:{3}'.format(
                    worker_name, i, origin_label, decoded_label))
            if origin_label == decoded_label:
                true_count += 1
        #accuracy
        acc = true_count * 1.0 / len(ys)
        #print subsummary
        logging.info(
            "%s accuracy = %.3f, lastbatch_err = %.3f, learning_rate = %.8f" %
            (worker_name, acc, lerr, lr))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            # Generate placeholders for the images, labels and seqlens.
            images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob = placeholder_inputs(
                IMAGE_WIDTH, IMAGE_HEIGHT, CHANNELS)
            # Build a Graph that computes predictions from the inference model.
            #images_lp, seqlen_lp, num_features, num_layers, hidden_units
            logits = cnn_lstm_ctc_ocr.inference(images_placeholder,
                                                seqlen_placeholder, keep_prob,
                                                args.hidden_units, args.mode,
                                                args.batch_size)
            # Add to the Graph the Ops for loss calculation.
            #logits, labels_lp, seqlen_lp
            loss = cnn_lstm_ctc_ocr.loss(logits, labels_placeholder,
                                         seqlen_placeholder)
            tf.summary.scalar('loss', loss)
            # global counter
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # Add to the Graph the Ops that calculate and apply gradients.
            #loss, initial_learning_rate, decay_steps, decay_rate, momentum
            train_op, learning_rate = cnn_lstm_ctc_ocr.training(
                loss, global_step, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum)
            # Add the Op to compare the logits to the labels during evaluation.
            dense_decoded, lerr = cnn_lstm_ctc_ocr.evaluation(
                logits, labels_placeholder, seqlen_placeholder)
            tf.summary.scalar('lerr', lerr)

            summary_op = tf.summary.merge_all()
            # Add the variable initializer Op.
            init_op = tf.global_variables_initializer()
            # Create a saver for writing training checkpoints.
            saver = tf.train.Saver()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        logging.info("{0} tensorflow model path: {1}".format(
            worker_name, logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=60)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        validation_xs = None
        validation_ys = None
        validation_batchs = 10
        with sv.managed_session(server.target) as sess:
            logging.info("{0} session ready".format(worker_name))
            # Loop until the supervisor shuts down or 1000000 steps have completed.
            g_step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            # for do_eval samples
            if validation_xs is None or validation_ys is None:
                validation_xs, validation_ys = format_batch(
                    tf_feed, args.batch_size * validation_batchs, IMAGE_HEIGHT,
                    IMAGE_WIDTH, CHANNELS)
            while not sv.should_stop() and not tf_feed.should_stop(
            ) and g_step < (args.steps * args.epochs - validation_batchs):
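                # (the held-out validation set above consumed validation_batchs batches
                # from the feed, hence the reduced step budget in the loop condition)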
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                start_time = time.time()
                # using feed_dict
                xs, ys = format_batch(tf_feed, args.batch_size, IMAGE_HEIGHT,
                                      IMAGE_WIDTH, CHANNELS)
                feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                           labels_placeholder,
                                           seqlen_placeholder, keep_prob,
                                           args.mode == "train")
                # Run one step of the model.  The return values are the activations
                # from the `train_op` (which is discarded) and the `loss` Op.  To
                # inspect the values of your Ops or variables, you may include them
                # in the list passed to sess.run() and the value tensors will be
                # returned in the tuple from the call.
                _, loss_value, g_step = sess.run([train_op, loss, global_step],
                                                 feed_dict=feed_dict)
                duration = time.time() - start_time
                if g_step % 20 == 0:
                    # Print status to stdout.
                    logging.info(
                        '%s [g_step:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)'
                        % (worker_name, g_step, g_step / args.steps,
                           args.epochs, g_step % args.steps, args.steps,
                           loss_value, duration))
                # Write the summaries and print an overview fairly often.
                if g_step % 100 == 0:
                    # Update the events file.
                    if sv.is_chief:
                        summary = sess.run(summary_op, feed_dict=feed_dict)
                        summary_writer.add_summary(summary, g_step)
                        summary_writer.flush()

                # Save a checkpoint and evaluate the model periodically.
                if (g_step + 1) % 500 == 0 or (g_step + 1) == args.steps:
                    # Evaluate against the validation set.
                    logging.info('{0} ---- Validation Data Eval: ----'.format(
                        worker_name))
                    do_eval(sess, dense_decoded, lerr, learning_rate,
                            images_placeholder, labels_placeholder,
                            seqlen_placeholder, keep_prob,
                            args.mode == "train", validation_xs, validation_ys)

            if sv.should_stop() or g_step >= (args.steps * args.epochs -
                                              validation_batchs):
                logging.info("{0} terminating tf_feed".format(worker_name))
                tf_feed.terminate()

        # Ask for all the services to stop.
        logging.info("{0} stopping supervisor".format(worker_name))
        sv.stop()
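
# The next example trains an MNIST model (one hidden layer plus softmax) on data
# pushed in from Spark: the RDD feed (TFNode.DataFeed) is wrapped in a Python
# generator and consumed through tf.data.Dataset.from_generator.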
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    # Create generator for Spark data feed
    tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)[0]
            image = numpy.array(batch[0])
            image = image.astype(numpy.float32) / 255.0
            label = numpy.array(batch[1])
            label = label.astype(numpy.int64)
            yield (image, label)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Dataset for input data
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                 tf.TensorShape([10]))).batch(args.batch_size)
            iterator = ds.make_one_shot_iterator()
            x, y_ = iterator.get_next()
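            # x and y_ come straight from the Dataset pipeline (one-shot iterator over
            # the generator), so no feed_dict is needed in the training loop below;
            # the commented-out placeholders further down are the feed_dict alternative.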

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # # Placeholders or QueueRunner/Readers for input data
            # x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
            # y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)

            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num,
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = 0
            while not sv.should_stop() and not tf_feed.should_stop(
            ) and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                if args.mode == "train":
                    _, summary, step = sess.run(
                        [train_op, summary_op, global_step])
                    # print accuracy and save model checkpoint to HDFS every 100 steps
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy)))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, preds, acc = sess.run(
                        [label, prediction, accuracy])

                    results = [
                        "{0} Label: {1}, Prediction: {2}".format(
                            datetime.now().isoformat(), l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
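
# The next example reads MNIST records directly from HDFS with tf.data
# (CSV text or TFRecord, selected by args.format) instead of being fed by Spark.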
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import os
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def _parse_csv(ln):
        splits = tf.string_split([ln], delimiter='|')
        lbl = splits.values[0]
        img = splits.values[1]
        image_defaults = [[0.0] for col in range(IMAGE_PIXELS * IMAGE_PIXELS)]
        image = tf.stack(tf.decode_csv(img, record_defaults=image_defaults))
        norm = tf.constant(255, dtype=tf.float32, shape=(IMAGE_PIXELS * IMAGE_PIXELS, ))
        normalized_image = tf.div(image, norm)
        label_value = tf.string_to_number(lbl, tf.int32)
        label = tf.one_hot(label_value, 10)
        return (normalized_image, label, label_value)

    def _parse_tfr(example_proto):
        print("example_proto: {}".format(example_proto))
        feature_def = {
            "label": tf.FixedLenFeature(10, tf.int64),
            "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)
        }
        features = tf.parse_single_example(example_proto, feature_def)
        norm = tf.constant(255, dtype=tf.float32, shape=(IMAGE_PIXELS * IMAGE_PIXELS, ))
        image = tf.div(tf.to_float(features['image']), norm)
        label = tf.to_float(features['label'])
        # also return a scalar class id so both parse functions yield the same
        # (image, label, label_value) structure expected by iterator.get_next()
        label_value = tf.cast(tf.argmax(label, axis=0), tf.int32)
        return (image, label, label_value)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Dataset for input data
            image_dir = TFNode.hdfs_path(ctx, args.images)
            file_pattern = os.path.join(image_dir, 'part-*')
            files = tf.gfile.Glob(file_pattern)

            # TFRecord files must be read with TFRecordDataset; TextLineDataset only parses CSV text lines
            if args.format == 'tfr':
                ds = tf.data.TFRecordDataset(files).map(_parse_tfr)
            else:
                ds = tf.data.TextLineDataset(files).map(_parse_csv)
            ds = ds.batch(args.batch_size)
            iterator = ds.make_initializable_iterator()
            x, y_, y_val = iterator.get_next()

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num,
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
            output_dir = TFNode.hdfs_path(ctx, args.output)
            tf.gfile.MkDir(output_dir)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')
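            # in inference mode, each worker writes its predictions to its own
            # part-<worker_num> file under args.output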

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor shuts down or 1000000 steps have completed.
            sess.run(iterator.initializer)
            step = 0
            count = 0
            while not sv.should_stop() and step < args.steps:

                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using QueueRunners/Readers
                if args.mode == "train":
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy)))
                    _, summary, step, yv = sess.run(
                        [train_op, summary_op, global_step, y_val])
                    # print("yval: {}".format(yv))
                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, pred, acc = sess.run([label, prediction, accuracy])
                    # print("label: {0}, pred: {1}".format(labels, pred))
                    print("acc: {0}".format(acc))
                    for i in range(len(labels)):
                        count += 1
                        output_file.write("{0} {1}\n".format(
                            labels[i], pred[i]))
                    print("count: {0}".format(count))

        if args.mode == "inference":
            output_file.close()
            # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
            # run inference and request stop before the other workers even start/sync their sessions.
            if task_index == 0:
                time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Exemple #21
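# The next example builds a small CIFAR-style CNN (two conv/pool/norm blocks, two
# fully-connected layers, and a linear softmax layer) inline in the worker function
# and trains it on batches pulled from the Spark feed via feed_dict.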
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time
    import re

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    NUM_CLASSES = 100
    IMAGE_PIXELS = 32
    NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
    NUM_EPOCHS_PER_DECAY = 350.0  # Epochs after which learning rate decays.
    LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
    INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.
    TOWER_NAME = 'tower'

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = args.batch_size

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        # Convert from [(images, labels)] to two numpy arrays of the proper type
        images = []
        labels = []
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.0
        ys = numpy.array(labels)
        ys = ys.astype(numpy.uint8)
        return (xs, ys)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            print("In a TFCluster.")
            #      global_step = tf.train.get_or_create_global_step()
            # Input placeholders
            with tf.name_scope('input'):
                x = tf.placeholder(tf.float32,
                                   [None, IMAGE_PIXELS * IMAGE_PIXELS * 3],
                                   name='x-input')
                y_ = tf.placeholder(tf.float32, [None, NUM_CLASSES], name='y-input')
                images = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 3])
                print(images.shape)
                tf.summary.image('input', images, 10)

            def _activation_summary(x):
                """Helper to create summaries for activations.
        Creates a summary that provides a histogram of activations.
        Creates a summary that measures the sparsity of activations.
        Args:
          x: Tensor
        Returns:
          nothing
        """
                # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
                # session. This helps the clarity of presentation on tensorboard.
                tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
                tf.summary.histogram(tensor_name + '/activations', x)
                tf.summary.scalar(tensor_name + '/sparsity',
                                  tf.nn.zero_fraction(x))

            def _variable_on_cpu(name, shape, initializer):
                """Helper to create a Variable stored on CPU memory.
        Args:
          name: name of the variable
          shape: list of ints
          initializer: initializer for Variable
        Returns:
          Variable Tensor
        """
                with tf.device('/cpu:0'):
                    dtype = tf.float32
                    var = tf.get_variable(name,
                                          shape,
                                          initializer=initializer,
                                          dtype=dtype)
                return var

            def _variable_with_weight_decay(name, shape, stddev, wd):
                """Helper to create an initialized Variable with weight decay.
        Note that the Variable is initialized with a truncated normal distribution.
        A weight decay is added only if one is specified.
        Args:
          name: name of the variable
          shape: list of ints
          stddev: standard deviation of a truncated Gaussian
          wd: add L2Loss weight decay multiplied by this float. If None, weight
                decay is not added for this Variable.
        Returns:
          Variable Tensor
        """
                dtype = tf.float32
                var = _variable_on_cpu(
                    name, shape,
                    tf.truncated_normal_initializer(stddev=stddev,
                                                    dtype=dtype))
                if wd is not None:
                    weight_decay = tf.multiply(tf.nn.l2_loss(var),
                                               wd,
                                               name='weight_loss')
                    tf.add_to_collection('losses', weight_decay)
                return var
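            # weight-decay terms accumulate in the 'losses' collection and are summed
            # together with the cross-entropy into total_loss further below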

            with tf.variable_scope('conv1') as scope:
                kernel = _variable_with_weight_decay('weights',
                                                     shape=[5, 5, 3, 256],
                                                     stddev=5e-2,
                                                     wd=0.0)
                conv = tf.nn.conv2d(images,
                                    kernel, [1, 1, 1, 1],
                                    padding='SAME')
                biases = _variable_on_cpu('biases', [256],
                                          tf.constant_initializer(0.0))
                pre_activation = tf.nn.bias_add(conv, biases)
                conv1 = tf.nn.relu(pre_activation, name=scope.name)
                _activation_summary(conv1)

            # pool1
            pool1 = tf.nn.max_pool(conv1,
                                   ksize=[1, 3, 3, 1],
                                   strides=[1, 2, 2, 1],
                                   padding='SAME',
                                   name='pool1')
            # norm1
            norm1 = tf.nn.lrn(pool1,
                              4,
                              bias=1.0,
                              alpha=0.001 / 9.0,
                              beta=0.75,
                              name='norm1')

            # conv2
            with tf.variable_scope('conv2') as scope:
                kernel = _variable_with_weight_decay('weights',
                                                     shape=[5, 5, 256, 128],
                                                     stddev=5e-2,
                                                     wd=0.0)
                conv = tf.nn.conv2d(norm1,
                                    kernel, [1, 1, 1, 1],
                                    padding='SAME')
                biases = _variable_on_cpu('biases', [128],
                                          tf.constant_initializer(0.1))
                pre_activation = tf.nn.bias_add(conv, biases)
                conv2 = tf.nn.relu(pre_activation, name=scope.name)
                _activation_summary(conv2)

            # norm2
            norm2 = tf.nn.lrn(conv2,
                              4,
                              bias=1.0,
                              alpha=0.001 / 9.0,
                              beta=0.75,
                              name='norm2')
            # pool2
            pool2 = tf.nn.max_pool(norm2,
                                   ksize=[1, 3, 3, 1],
                                   strides=[1, 2, 2, 1],
                                   padding='SAME',
                                   name='pool2')

            # local3
            with tf.variable_scope('local3') as scope:
                # Move everything into depth so we can perform a single matrix multiply.
                reshape = tf.contrib.layers.flatten(pool2)
                dim = reshape.get_shape()[1].value
                weights = _variable_with_weight_decay('weights',
                                                      shape=[dim, 1024],
                                                      stddev=0.04,
                                                      wd=0.004)
                biases = _variable_on_cpu('biases', [1024],
                                          tf.constant_initializer(0.1))
                local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                                    name=scope.name)
                _activation_summary(local3)

            # local4
            with tf.variable_scope('local4') as scope:
                weights = _variable_with_weight_decay('weights',
                                                      shape=[1024, 256],
                                                      stddev=0.04,
                                                      wd=0.004)
                biases = _variable_on_cpu('biases', [256],
                                          tf.constant_initializer(0.1))
                local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                                    name=scope.name)
                _activation_summary(local4)

            # linear layer(WX + b),
            # We don't apply softmax here because
            # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
            # and performs the softmax internally for efficiency.
            with tf.variable_scope('softmax_linear') as scope:
                weights = _variable_with_weight_decay('weights',
                                                      [256, NUM_CLASSES],
                                                      stddev=1 / 256.0,
                                                      wd=0.0)
                biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                          tf.constant_initializer(0.0))
                softmax_linear = tf.add(tf.matmul(local4, weights),
                                        biases,
                                        name=scope.name)
                _activation_summary(softmax_linear)

            logits = softmax_linear

            # Calculate the average cross entropy loss across the batch.
            #      labels = tf.reshape(y_, [100, 10])
            print(y_.shape)
            print(logits.shape)
            # one-hot labels must stay float to match the logits dtype for
            # softmax_cross_entropy_with_logits; an int64 round-trip is unnecessary
            labels = tf.cast(y_, tf.float32)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                labels=labels, logits=logits, name='cross_entropy_per_example')
            cross_entropy_mean = tf.reduce_mean(cross_entropy,
                                                name='cross_entropy')
            tf.add_to_collection('losses', cross_entropy_mean)

            # The total loss is defined as the cross entropy loss plus all of the weight
            # decay terms (L2 loss).
            total_loss = tf.add_n(tf.get_collection('losses'),
                                  name='total_loss')
            global_step = tf.Variable(0)
            inc = tf.assign_add(global_step, 1, name='increment')
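            # AdamOptimizer.minimize below is not given global_step, so the counter is
            # advanced explicitly by running `inc` alongside train_step in the loop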
            #      num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
            #      decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

            # Decay the learning rate exponentially based on the number of steps.
            #      lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
            #                                  global_step,
            #                                  decay_steps,
            #                                  LEARNING_RATE_DECAY_FACTOR,
            #                                  staircase=True)
            #      tf.summary.scalar('learning_rate', lr)

            train_step = tf.train.AdamOptimizer(1e-4).minimize(total_loss)
            correct_prediction = tf.equal(tf.argmax(logits, 1),
                                          tf.argmax(y_, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(logits, 1, name="prediction")

            ##########################################################

            # Merge all the summaries and write them out to
            # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
            merged = tf.summary.merge_all()

            # a Saver is still needed by the inference-mode Supervisor below
            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        # logdir = TFNode.hdfs_path(ctx, args.model)
        logdir = "/tmp/" + args.model
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                init_op=init_op,
                summary_op=None,
                summary_writer=summary_writer,
                global_step=global_step,
                stop_grace_secs=300,
                saver=None
                #                               save_model_secs=10
            )
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))
            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = -1
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            tf_feed_test = TFNode.DataFeed(ctx.mgr, args.mode != "train")
            while step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                #        print (args.steps)
                #        print (sv.should_stop())
                #        print (tf_feed.should_stop())
                step = step + 1
                #        print (step)
                temp = sess.run(global_step)
                #        print (temp)
                # using feed_dict
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                test_xs, test_ys = feed_dict(
                    tf_feed_test.next_batch(batch_size))
                feed = {x: batch_xs, y_: batch_ys}

                #        print (len(batch_xs) > 0)
                if len(batch_xs) > 0:
                    if args.mode == "train":
                        summary, _, _ = sess.run([merged, train_step, inc],
                                                 feed_dict=feed)
                        # print accuracy and save model checkpoint to HDFS every 100 steps
                        if (step % 100 == 0):
                            labels, preds, acc = sess.run(
                                [label, prediction, accuracy],
                                feed_dict={
                                    x: test_xs,
                                    y_: test_ys
                                })
                            for l, p in zip(labels, preds):
                                print(
                                    "{0} step: {1} accuracy: {2}, Label: {3}, Prediction: {4}"
                                    .format(datetime.now().isoformat(), temp,
                                            acc, l, p))


                            # results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
                            # tf_feed.batch_results(results)

                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                    else:  # args.mode == "inference"
                        labels, preds, acc = sess.run(
                            [label, prediction, accuracy], feed_dict=feed)

                        results = [
                            "{0} Label: {1}, Prediction: {2}".format(
                                datetime.now().isoformat(), l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        print("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
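
# CIFAR-10 training example: reuses the stock cifar10 model/input code and drives
# it with a MonitoredTrainingSession inside the TensorFlowOnSpark cluster.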
def main_fun(argv, ctx):
  import sys
  import time
  from datetime import datetime

  import tensorflow as tf
  import cifar10
  from tensorflowonspark import TFNode

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  # Train CIFAR-10 for a number of steps.
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
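
# CIFAR-10 evaluation counterpart: periodically restores the latest checkpoint
# from checkpoint_dir and reports precision@1 over the eval split.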
def main_fun(argv, ctx):

  import math
  import sys
  import time
  from datetime import datetime

  import numpy as np
  import tensorflow as tf
  import cifar10
  from tensorflowonspark import TFNode

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                              """Whether to run eval only once.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  def eval_once(saver, summary_writer, top_k_op, summary_op):
    """Run Eval once.

    Args:
      saver: Saver.
      summary_writer: Summary writer.
      top_k_op: Top K op.
      summary_op: Summary op.
    """
    with tf.Session() as sess:
      ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
      if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Assuming model_checkpoint_path looks something like:
        #   /my-favorite-path/cifar10_train/model.ckpt-0,
        # extract global_step from it.
        global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
      else:
        print('No checkpoint file found')
        return

      # Start the queue runners.
      coord = tf.train.Coordinator()
      try:
        threads = []
        for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
          threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
                                           start=True))

        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        step = 0
        while step < num_iter and not coord.should_stop():
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          step += 1

        # Compute precision @ 1.
        precision = true_count / total_sample_count
        print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='Precision @ 1', simple_value=precision)
        summary_writer.add_summary(summary, global_step)
      except Exception as e:  # pylint: disable=broad-except
        coord.request_stop(e)

      coord.request_stop()
      coord.join(threads, stop_grace_period_secs=10)


  def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
      # Get images and labels for CIFAR-10.
      eval_data = FLAGS.eval_data == 'test'
      images, labels = cifar10.inputs(eval_data=eval_data)

      # Build a Graph that computes the logits predictions from the
      # inference model.
      logits = cifar10.inference(images)

      # Calculate predictions.
      top_k_op = tf.nn.in_top_k(logits, labels, 1)

      # Restore the moving average version of the learned variables for eval.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY)
      variables_to_restore = variable_averages.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)

      # Build the summary operation based on the TF collection of Summaries.
      summary_op = tf.summary.merge_all()

      summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

      while True:
        eval_once(saver, summary_writer, top_k_op, summary_op)
        if FLAGS.run_once:
          break
        time.sleep(FLAGS.eval_interval_secs)

  #cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  evaluate()
Exemple #24
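# RNN training example: the graph comes from an external `Model` class; the
# recurrent state is threaded between steps through feed_dict using
# model.initial_state / model.final_state.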
def main_fun(args, ctx):
    import tensorflow as tf
    import argparse
    import time
    import os
    from six.moves import cPickle
    from model import Model
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy as np

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    if job_name == "ps":
        server.join()
    else:
        with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % task_index,
                                                    cluster=cluster)):
            model = Model(args)
            # instrument for tensorboard
            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        logdir = TFNode.hdfs_path(ctx, args.save_dir)

        print("tensorflow model path: {0}".format(logdir))

        summary_writer = TFNode.get_summary_writer(ctx)

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                logdir=logdir,
                                init_op=init_op,
                                summary_op=None,
                                saver=saver,
                                global_step=model.global_step,
                                stop_grace_secs=300, save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(
                datetime.now().isoformat()))

            state = sess.run(model.initial_state)

            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, True)
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                batch = tf_feed.next_batch(args.batch_size)
                batch_xs = np.asarray([data[0] for data in batch])
                batch_ys = np.asarray([data[1] for data in batch])

                feed = {model.input_data: batch_xs, model.targets: batch_ys}

                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h

                if len(batch_xs) > 0:
                    # instrument for tensorboard
                    summ, train_loss, state, _, step = sess.run(
                        [summary_op, model.cost, model.final_state, model.train_op, model.global_step], feed_dict=feed)

                    # print loss
                    print("Step: {}, train_loss: {}".format(step, train_loss))

                    # only write the summary when a step actually ran; otherwise
                    # `summ` is undefined for an empty batch
                    if sv.is_chief:
                        summary_writer.add_summary(summ, step)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Exemple #25
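# Wide & deep online-learning example: training files are listed from HDFS and
# sharded across workers by task index, then streamed through a custom Reader;
# sparse features are fed as (indices, values, shape) tuples. parse_files() is
# assumed to be a helper defined elsewhere in the driver script.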
def main_fun(argv, ctx):
    import pprint
    import sys
    from datetime import datetime
    import numpy as np
    import tensorflow as tf
    import online_model
    import tfos_online_data_reader
    from tensorflowonspark import TFNode

    sys.argv = argv
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    flags.DEFINE_integer('batch_size', 100, 'data batch size')
    flags.DEFINE_integer('num_epoch', 1, 'train epoches for dataset ')
    flags.DEFINE_string('mapping_data',
                        'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022_map',
                        'id mapping path')
    flags.DEFINE_string('train_data',
                        'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022',
                        'train data path')
    #flags.DEFINE_string('mapping_data',
    #                    'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022_map',
    #                    'id mapping path')
    #flags.DEFINE_string('train_data',
    #                    'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022',
    #                    'train data path')
    flags.DEFINE_string('log_dir',
                        'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/model',
                        'log directory')

    flags.DEFINE_float('linear_lr', 0.1, 'wide part learning rate. default 0.1')
    flags.DEFINE_float('dnn_lr', 0.001, 'deep part learning rate. default 0.001')
    flags.DEFINE_string('linear_optimizer', 'ftrl',
                        'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is ftrl')
    flags.DEFINE_string('dnn_optimizer', 'adagrad',
                        'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is adagrad')

    flags.DEFINE_integer('input_dim', 13, 'input dimension')
    flags.DEFINE_string("model_network", "100,20", "The neural network of model, as 100,50,20")
    flags.DEFINE_string("model_type", "wide_deep", "model type: wide | deep | wide_deep")
    flags.DEFINE_integer('display_step', 200, 'display_step')

    flags.DEFINE_integer('ps_num', 64, 'number of parameter server tasks')
    flags.DEFINE_integer('task_num', 128, 'number of worker tasks')

    pprint.PrettyPrinter().pprint(FLAGS.__flags)
    cluster_spec, server = TFNode.start_cluster_server(ctx)
    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":
        total_file_names = parse_files(FLAGS.train_data)
        print("total_file_names:")
        print(total_file_names)
        print("task_index: " + str(ctx.task_index))
        task_file_names = [name for idx, name in enumerate(total_file_names) if idx % FLAGS.task_num == ctx.task_index]
        print("task_file_names:")
        print(task_file_names)
        train_reader = tfos_online_data_reader.Reader(
            task_file_names,
            FLAGS.mapping_data,
            batch_size=FLAGS.batch_size,
            delimiter='\t')
        wide_dim = train_reader.wide_dim

        with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d"%ctx.task_index,
                                                      cluster=cluster_spec)):
            config = {}
            config['num_ps'] = FLAGS.ps_num
            dnn_model = online_model.DNNModel(FLAGS,wide_dim,config)
            dnn_model.build()
            dense_inputs = dnn_model.dense_inputs
            sparse_inputs = dnn_model.sparse_inputs
            labels = dnn_model.labels

            global_step = dnn_model.global_step
            step_update_op = dnn_model.step_update_op
            train_op = dnn_model.train_op
            loss = dnn_model.loss
            auc_op = dnn_model.auc_op
            summary_op = dnn_model.summary_op

        saver = tf.train.Saver()
        init_op = [tf.global_variables_initializer(),
                    tf.local_variables_initializer()]

        summary_writer = tf.summary.FileWriter("tensorboard_%d" % ctx.worker_num, graph=tf.get_default_graph())
        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                 logdir=FLAGS.log_dir,
                                 init_op=init_op,
                                 summary_op=None,
                                 summary_writer=summary_writer,
                                 global_step=global_step,
                                 saver=saver,
                                 save_model_secs=300)

        shape = np.array([FLAGS.batch_size, wide_dim + 1])
        begin_time = datetime.now()
        with sv.managed_session(server.target) as sess:
            if not sv.should_stop():
                for epoch in range(FLAGS.num_epoch):
                    train_batches = train_reader.yieldBatches()
                    print("Epoch: %d" % epoch)
                    step = 0
                    for dense_x, sparse_idx, sparse_values, y in train_batches:
                        start_time = datetime.now()
                        _, train_loss, train_auc, summ, _ = sess.run(
                            [train_op, loss, auc_op, summary_op, step_update_op],
                            feed_dict={dense_inputs: dense_x, labels: y,
                                       sparse_inputs: (sparse_idx, sparse_values, shape)})
                        step += 1
                        assert not np.isnan(train_loss), 'Model diverged with loss = NaN'
                        time_used = datetime.now() - start_time
                        if step % FLAGS.display_step == 0:
                            g_step, = sess.run([global_step])
                            print("step: " + str(step) + ", global_step: " + str(g_step))
                            summary_writer.add_summary(summ,g_step)
                            print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format(
                                 g_step, g_step*FLAGS.batch_size, time_used, train_loss, train_auc))
                            sys.stdout.flush()
            total_time = datetime.now() - begin_time
            print("Training Done!!")
            print("Total time used: {}".format(total_time))
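
# Minimal softmax-regression example on 4-feature / 3-class data: labels are
# one-hot encoded with dense_to_one_hot before being fed to the graph.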
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import numpy
  import tensorflow as tf
  import time
  import math

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  batch_size   = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [images_labels] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0: 4])
      labels.append(item[4])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    ys = dense_to_one_hot(numpy.array(labels, dtype=numpy.uint), 3)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = numpy.arange(num_labels) * num_classes
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    tt = index_offset + labels_dense.ravel()
    tt = tt.astype(numpy.int32)
    labels_one_hot.flat[tt] = 1
    return labels_one_hot
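  # e.g. labels [0, 2, 1] with num_classes=3 map to [[1, 0, 0], [0, 0, 1], [0, 1, 0]]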

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

        # network
        x = tf.placeholder(tf.float32, [None, 4])

        # paras
        W = tf.Variable(tf.zeros([4, 3]))
        b = tf.Variable(tf.zeros([3]))

        y = tf.nn.softmax(tf.matmul(x, W) + b)
        y_ = tf.placeholder(tf.float32, [None, 3])

        # loss func (clip the softmax output so tf.log never sees zero)
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        global_step = tf.Variable(0)

        train_op = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy, global_step=global_step)

        # Test trained model
        label = tf.argmax(y_, 1, name="label")  # argmax over axis 1 turns the one-hot row back into a class index
        prediction = tf.argmax(y, 1, name="prediction")
        correct_prediction = tf.equal(prediction, label)

        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
        tf.summary.scalar("acc", accuracy)

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        init_op = tf.global_variables_initializer()
    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    else:  # inference: reuse the checkpoint dir but never overwrite it
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                         sess.run(accuracy,{x: batch_xs, y_: batch_ys})))

            if sv.is_chief:
              summary_writer.add_summary(summary, step)


      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
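
# --- Hedged illustration (not one of the collected snippets) ---
# A map_fun like the one above is typically launched from the Spark driver via
# TFCluster.run(). This is a minimal sketch assuming a live SparkContext `sc`, an input
# RDD `data_rdd`, and an `args` namespace with cluster_size/num_ps/tensorboard/epochs
# attributes (names chosen for illustration only).
def run_feeding_example(sc, data_rdd, map_fun, args):
    from tensorflowonspark import TFCluster

    # Reserve one executor per TF node and feed RDD partitions into the cluster.
    cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, args.num_ps,
                            args.tensorboard, TFCluster.InputMode.SPARK)
    cluster.train(data_rdd, args.epochs)  # partitions arrive via TFNode.DataFeed inside map_fun
    cluster.shutdown()
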
def main_fun(argv, ctx):
  import math
  import six
  import tensorflow as tf

  from datasets import dataset_factory
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  tf.app.flags.DEFINE_integer(
      'batch_size', 100, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'max_num_batches', None,
      'Max number of batches to evaluate; by default, all batches are used.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'checkpoint_path', '/tmp/tfmodel/',
      'The directory where the model was written to or an absolute path to a '
      'checkpoint file.')

  tf.app.flags.DEFINE_string(
      'eval_dir', '/tmp/tfmodel/', 'Directory where the results are saved to.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'test', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to evaluate.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average. '
      'If left as None, then moving averages are not used.')

  tf.app.flags.DEFINE_integer(
      'eval_image_size', None, 'Eval image size')

  FLAGS = tf.app.flags.FLAGS

  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #tf_global_step = slim.get_or_create_global_step()
    tf_global_step = tf.Variable(0, name="global_step")

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label] = provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_5': slim.metrics.streaming_recall_at_k(
            logits, labels, 5),
    })

    # Print the summaries to screen.
    for name, value in six.iteritems(names_to_values):
      summary_name = 'eval/%s' % name
      op = tf.summary.scalar(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)
def main_fun(argv, ctx):
    import tensorflow as tf
    import cifar10

    sys.argv = argv
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string(
        'train_dir', '/tmp/cifar10_train',
        """Directory where to write event logs """
        """and checkpoint.""")
    tf.app.flags.DEFINE_integer('max_steps', 1000000,
                                """Number of batches to run.""")
    tf.app.flags.DEFINE_boolean('log_device_placement', False,
                                """Whether to log device placement.""")
    tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

    # cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    # Train CIFAR-10 for a number of steps.
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import getpass
  import math
  import numpy
  import os
  import signal
  import tensorflow as tf
  import time

  IMAGE_PIXELS=28
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec
  num_workers = len(cluster_spec['worker'])

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size   = 100

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def read_csv_examples(image_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))
    # Setup queue of csv image filenames
    tf_record_pattern = os.path.join(image_dir, 'part-*')
    images = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "images: {0}".format(images))
    image_queue = tf.train.string_input_producer(images, shuffle=False, capacity=1000, num_epochs=num_epochs, name="image_queue")

    # Setup queue of csv label filenames
    tf_record_pattern = os.path.join(label_dir, 'part-*')
    labels = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "labels: {0}".format(labels))
    label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000, num_epochs=num_epochs, name="label_queue")

    # Setup reader for image queue
    img_reader = tf.TextLineReader(name="img_reader")
    _, img_csv = img_reader.read(image_queue)
    image_defaults = [ [1.0] for col in range(784) ]
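    # note: tf.pack was renamed to tf.stack in TF 1.0; use tf.stack on newer TF versions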
    img = tf.pack(tf.decode_csv(img_csv, image_defaults))
    # Normalize values to [0,1]
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(img, norm)
    print_log(worker_num, "image: {0}".format(image))

    # Setup reader for label queue
    label_reader = tf.TextLineReader(name="label_reader")
    _, label_csv = label_reader.read(label_queue)
    label_defaults = [ [1.0] for col in range(10) ]
    label = tf.pack(tf.decode_csv(label_csv, label_defaults))
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch_csv")

  def read_tfr_examples(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of TFRecord filenames
    tf_record_pattern = os.path.join(path, 'part-*')
    files = tf.gfile.Glob(tf_record_pattern)
    queue_name = "file_queue"

    # split input files across workers, if specified
    if task_index is not None and num_workers is not None:
      num_files = len(files)
      files = files[task_index:num_files:num_workers]
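      # strided slice: worker k reads files k, k + num_workers, k + 2*num_workers, ...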
      queue_name = "file_queue_{0}".format(task_index)

    print_log(worker_num, "files: {0}".format(files))
    file_queue = tf.train.string_input_producer(files, shuffle=False, capacity=1000, num_epochs=num_epochs, name=queue_name)

    # Setup reader for examples
    reader = tf.TFRecordReader(name="reader")
    _, serialized = reader.read(file_queue)
    feature_def = {'label': tf.FixedLenFeature([10], tf.int64), 'image': tf.FixedLenFeature([784], tf.int64) }
    features = tf.parse_single_example(serialized, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    print_log(worker_num, "image: {0}".format(image))
    label = tf.to_float(features['label'])
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch")

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
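      # inference: read each file exactly once (num_epochs=1) and shard files per worker;
      # training: cycle through the files (num_epochs=None when args.epochs == 0)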
      num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
      index = task_index if args.mode == "inference" else None
      workers = num_workers if args.mode == "inference" else None

      if args.format == "csv":
        images = TFNode.hdfs_path(ctx, args.images)
        labels = TFNode.hdfs_path(ctx, args.labels)
        x, y_ = read_csv_examples(images, labels, 100, num_epochs, index, workers)
      elif args.format == "tfr":
        images = TFNode.hdfs_path(ctx, args.images)
        x, y_ = read_tfr_examples(images, 100, num_epochs, index, workers)
      else:
        raise("{0} format not supported for tf input mode".format(args.format))

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)
      output_dir = TFNode.hdfs_path(ctx, args.output)
      output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or args.steps steps have completed.
      step = 0
      count = 0
      while not sv.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using QueueRunners/Readers
        if args.mode == "train":
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
          _, summary, step = sess.run([train_op, summary_op, global_step])
          if sv.is_chief:
            summary_writer.add_summary(summary, step)
        else: # args.mode == "inference"
          labels, pred, acc = sess.run([label, prediction, accuracy])
          #print("label: {0}, pred: {1}".format(labels, pred))
          print("acc: {0}".format(acc))
          for i in range(len(labels)):
            count += 1
            output_file.write("{0} {1}\n".format(labels[i], pred[i]))
          print("count: {0}".format(count))

    if args.mode == "inference":
      output_file.close()
      # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
      # run inference and request stop before the other workers even start/sync their sessions.
      if task_index == 0:
        time.sleep(60)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
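
# --- Hedged illustration (not one of the collected snippets) ---
# Because the map_fun above reads HDFS files through its own QueueRunners, the Spark
# driver would launch it with InputMode.TENSORFLOW and no RDD feeding. Attribute names
# on `args` (cluster_size, num_ps, tensorboard) are assumptions for illustration only.
def run_tf_input_mode_example(sc, map_fun, args):
    from tensorflowonspark import TFCluster

    cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, args.num_ps,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()  # waits for the TensorFlow nodes to finish before shutting down
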
def main_fun(argv, ctx):
    import math
    import six
    import tensorflow as tf

    from datasets import dataset_factory
    from nets import nets_factory
    from preprocessing import preprocessing_factory

    sys.argv = argv

    slim = tf.contrib.slim

    tf.app.flags.DEFINE_integer('batch_size', 100,
                                'The number of samples in each batch.')

    tf.app.flags.DEFINE_integer(
        'max_num_batches', None,
        'Max number of batches to evaluate; by default, all batches are used.')

    tf.app.flags.DEFINE_string('master', '',
                               'The address of the TensorFlow master to use.')

    tf.app.flags.DEFINE_string(
        'checkpoint_path', '/tmp/tfmodel/',
        'The directory where the model was written to or an absolute path to a '
        'checkpoint file.')

    tf.app.flags.DEFINE_string('eval_dir', '/tmp/tfmodel/',
                               'Directory where the results are saved to.')

    tf.app.flags.DEFINE_integer(
        'num_preprocessing_threads', 4,
        'The number of threads used to create the batches.')

    tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
                               'The name of the dataset to load.')

    tf.app.flags.DEFINE_string('dataset_split_name', 'test',
                               'The name of the train/test split.')

    tf.app.flags.DEFINE_string(
        'dataset_dir', None,
        'The directory where the dataset files are stored.')

    tf.app.flags.DEFINE_integer(
        'labels_offset', 0,
        'An offset for the labels in the dataset. This flag is primarily used to '
        'evaluate the VGG and ResNet architectures which do not use a background '
        'class for the ImageNet dataset.')

    tf.app.flags.DEFINE_string('model_name', 'inception_v3',
                               'The name of the architecture to evaluate.')

    tf.app.flags.DEFINE_string(
        'preprocessing_name', None,
        'The name of the preprocessing to use. If left '
        'as `None`, then the model_name flag is used.')

    tf.app.flags.DEFINE_float(
        'moving_average_decay', None,
        'The decay to use for the moving average. '
        'If left as None, then moving averages are not used.')

    tf.app.flags.DEFINE_integer('eval_image_size', None, 'Eval image size')

    FLAGS = tf.app.flags.FLAGS

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    cluster_spec, server = TFNode.start_cluster_server(ctx)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #tf_global_step = slim.get_or_create_global_step()
        tf_global_step = tf.Variable(0, name="global_step")

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Define the metrics:
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
            'Recall_5':
            slim.metrics.streaming_recall_at_k(logits, labels, 5),
        })

        # Print the summaries to screen.
        for name, value in six.iteritems(names_to_values):
            summary_name = 'eval/%s' % name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(dataset.num_samples /
                                    float(FLAGS.batch_size))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore)
Exemple #31
0
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  batch_size = args.batch_size

  cluster, server = TFNode.start_cluster_server(ctx, 1)

  def feed_dict(batch):
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    x_initial = numpy.array(images)
    x_objdump = x_initial[:, 519:719]
    x_cnn = numpy.empty((0, 200), dtype=numpy.float64)
    for i in range(len(images)):
      # one-hot encode each of the 200 objdump values (assumed range 0-119) for sample i
      x_cnn_batch = numpy.zeros((200, 120), dtype=numpy.float64)
      for j in range(200):
        x_cnn_batch[j, int(x_objdump[i, j])] = True
      x_cnn_batch = numpy.transpose(x_cnn_batch)
      x_cnn = numpy.append(x_cnn, x_cnn_batch, axis=0)
    x_peinfo = x_initial[:, 0:519]
    ys = numpy.array(labels)
    return (x_peinfo.reshape(-1, 519, 1, 1), x_cnn.reshape(-1, 200, 120, 1), ys)

  def conv2d(x, W):
      return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

  def max_pool_1(x):
      return tf.nn.avg_pool(x, ksize=[1, 2,1, 1], strides=[1, 2, 1, 1], padding='SAME')

  def max_pool_2(x):
      return tf.nn.avg_pool(x, ksize=[1, 100,1, 1], strides=[1, 100, 1, 1], padding='SAME')

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):
      # Build NN-Network
      W_mlp_1 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_1")
      b_mlp_1 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_1")
      tf.summary.histogram("W_mlp_1", W_mlp_1)
      W_mlp_2 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_2")
      b_mlp_2 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_2")
      tf.summary.histogram("W_mlp_2", W_mlp_2)

      W_conv1 = tf.Variable(tf.truncated_normal([3, 120, 1, 3], stddev=0.1), name="W_conv1")
      b_conv1 = tf.Variable(tf.constant(0.1, shape=[3]), name="b_conv1")
      tf.summary.histogram("W_conv1", W_conv1)
      W_conv2 = tf.Variable(tf.truncated_normal([3, 120, 3, 6], stddev=0.1), name="W_conv2")
      b_conv2 = tf.Variable(tf.constant(0.1, shape=[6]), name="b_conv2")
      tf.summary.histogram("W_conv2", W_conv2)

      sm_w = tf.Variable(tf.truncated_normal([1239, 10], stddev=0.1), name="sm_w")
      sm_b = tf.Variable(tf.constant(0.1, shape=[10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      x_cnn = tf.placeholder(tf.float32, [None, 200,120,1], name="x_cnn")
      x_mlp = tf.placeholder(tf.float32, [None, 519,1,1], name="x_mlp")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
      tf.summary.image("x_cnn", x_cnn)
      tf.summary.image("x_mlp", x_mlp)

      x_mlp_new = tf.reshape(x_mlp, [-1, 519])
      h_mlp_1 = tf.nn.xw_plus_b(x_mlp_new, W_mlp_1, b_mlp_1)
      h_mlp_2 = tf.nn.xw_plus_b(h_mlp_1, W_mlp_2, b_mlp_2)
      h_conv1 = tf.nn.relu(conv2d(x_cnn, W_conv1) + b_conv1)
      h_pool1 = max_pool_1(h_conv1)
      h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
      h_pool2 = max_pool_2(h_conv2)
      h_conv2_flat = tf.reshape(h_pool2, [-1, 120*6])

      h_inter = tf.concat([h_mlp_2, h_conv2_flat],1)
      y = tf.nn.softmax(tf.nn.xw_plus_b(h_inter, sm_w, sm_b))

      global_step = tf.Variable(0)
      loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
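      # note: softmax_cross_entropy_with_logits expects unnormalized logits, but y above is already softmaxed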
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.001).minimize(
          loss, global_step=global_step)

      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        batch_mlp, batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            if (step % 10 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          
          elif args.mode == "inference": 
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["Label: {0}, Prediction: {1}".format(l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

          else:
            preds = sess.run(prediction, feed_dict={x_mlp: batch_mlp, x_cnn: batch_xs})
            results = ["Sha256: {0}, Prediction: {1}".format(l, p) for l,p in zip(batch_ys,preds)]
            tf_feed.batch_results(results)
            print(results)
            
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def main_fun(argv, ctx):
  import tensorflow as tf
  from tensorflow.python.ops import control_flow_ops
  from datasets import dataset_factory
  from deployment import model_deploy
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  tf.app.flags.DEFINE_integer(
      'num_gpus', 1, 'The number of GPUs to use per node')

  tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'train_dir', '/tmp/tfmodel/',
      'Directory where checkpoints and event logs are written to.')

  tf.app.flags.DEFINE_integer('num_clones', 1,
                              'Number of model clones to deploy.')

  tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                              'Use CPUs to deploy clones.')

  tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')

  tf.app.flags.DEFINE_integer(
      'num_ps_tasks', 0,
      'The number of parameter servers. If the value is 0, then the parameters '
      'are handled locally by the worker.')

  tf.app.flags.DEFINE_integer(
      'num_readers', 4,
      'The number of parallel readers that read data from the dataset.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_integer(
      'log_every_n_steps', 10,
      'The frequency with which logs are printed.')

  tf.app.flags.DEFINE_integer(
      'save_summaries_secs', 600,
      'The frequency with which summaries are saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'save_interval_secs', 600,
      'The frequency with which the model is saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'task', 0, 'Task id of the replica running the training.')

  ######################
  # Optimization Flags #
  ######################

  tf.app.flags.DEFINE_float(
      'weight_decay', 0.00004, 'The weight decay on the model weights.')

  tf.app.flags.DEFINE_string(
      'optimizer', 'rmsprop',
      'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
      '"ftrl", "momentum", "sgd" or "rmsprop".')

  tf.app.flags.DEFINE_float(
      'adadelta_rho', 0.95,
      'The decay rate for adadelta.')

  tf.app.flags.DEFINE_float(
      'adagrad_initial_accumulator_value', 0.1,
      'Starting value for the AdaGrad accumulators.')

  tf.app.flags.DEFINE_float(
      'adam_beta1', 0.9,
      'The exponential decay rate for the 1st moment estimates.')

  tf.app.flags.DEFINE_float(
      'adam_beta2', 0.999,
      'The exponential decay rate for the 2nd moment estimates.')

  tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')

  tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                            'The learning rate power.')

  tf.app.flags.DEFINE_float(
      'ftrl_initial_accumulator_value', 0.1,
      'Starting value for the FTRL accumulators.')

  tf.app.flags.DEFINE_float(
      'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')

  tf.app.flags.DEFINE_float(
      'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')

  tf.app.flags.DEFINE_float(
      'momentum', 0.9,
      'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

  tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

  #######################
  # Learning Rate Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'learning_rate_decay_type',
      'exponential',
      'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
      ' or "polynomial"')

  tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

  tf.app.flags.DEFINE_float(
      'end_learning_rate', 0.0001,
      'The minimal end learning rate used by a polynomial decay learning rate.')

  tf.app.flags.DEFINE_float(
      'label_smoothing', 0.0, 'The amount of label smoothing.')

  tf.app.flags.DEFINE_float(
      'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

  tf.app.flags.DEFINE_float(
      'num_epochs_per_decay', 2.0,
      'Number of epochs after which learning rate decays.')

  tf.app.flags.DEFINE_bool(
      'sync_replicas', False,
      'Whether or not to synchronize the replicas during training.')

  tf.app.flags.DEFINE_integer(
      'replicas_to_aggregate', 1,
      'The number of gradients to collect before updating params.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average. '
      'If left as None, then moving averages are not used.')

  #######################
  # Dataset Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'train', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to train.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_integer(
      'batch_size', 32, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'train_image_size', None, 'Train image size')

  tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                              'The maximum number of training steps.')

  #####################
  # Fine-Tuning Flags #
  #####################

  tf.app.flags.DEFINE_string(
      'checkpoint_path', None,
      'The path to a checkpoint from which to fine-tune.')

  tf.app.flags.DEFINE_string(
      'checkpoint_exclude_scopes', None,
      'Comma-separated list of scopes of variables to exclude when restoring '
      'from a checkpoint.')

  tf.app.flags.DEFINE_string(
      'trainable_scopes', None,
      'Comma-separated list of scopes to filter the set of variables to train. '
      'By default (None), all variables are trained.')

  tf.app.flags.DEFINE_boolean(
      'ignore_missing_vars', False,
      'When restoring a checkpoint, ignore variables that are missing from it.')

  FLAGS = tf.app.flags.FLAGS
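  # Override the slim command-line flags with values from the TensorFlowOnSpark executor context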
  FLAGS.job_name = ctx.job_name
  FLAGS.task = ctx.task_index
  FLAGS.num_clones = FLAGS.num_gpus
  FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])
  assert(FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps']) if 'ps' in ctx.cluster_spec else 0))

  def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if the learning rate decay type is not recognized.
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
      decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
      return tf.train.exponential_decay(FLAGS.learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True,
                                        name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
      return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
      return tf.train.polynomial_decay(FLAGS.learning_rate,
                                       global_step,
                                       decay_steps,
                                       FLAGS.end_learning_rate,
                                       power=1.0,
                                       cycle=False,
                                       name='polynomial_decay_learning_rate')
    else:
      raise ValueError('learning_rate_decay_type [%s] was not recognized',
                       FLAGS.learning_rate_decay_type)


  def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """
    if FLAGS.optimizer == 'adadelta':
      optimizer = tf.train.AdadeltaOptimizer(
          learning_rate,
          rho=FLAGS.adadelta_rho,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
      optimizer = tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(
          learning_rate,
          beta1=FLAGS.adam_beta1,
          beta2=FLAGS.adam_beta2,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
      optimizer = tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate,
          momentum=FLAGS.momentum,
          name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.momentum,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
      raise ValueError('Optimizer [%s] was not recognized', FLAGS.optimizer)
    return optimizer


  def _add_variables_summaries(learning_rate):
    summaries = []
    for variable in slim.get_model_variables():
      summaries.append(tf.summary.histogram(variable.op.name, variable))
    summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
    return summaries


  def _get_init_fn():
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the very
    first global step.

    Returns:
      An init function run by the supervisor.
    """
    if FLAGS.checkpoint_path is None:
      return None

    # Warn the user if a checkpoint exists in the train_dir. Then we'll be
    # ignoring the checkpoint anyway.
    if tf.train.latest_checkpoint(FLAGS.train_dir):
      tf.logging.info(
          'Ignoring --checkpoint_path because a checkpoint already exists in %s'
          % FLAGS.train_dir)
      return None

    exclusions = []
    if FLAGS.checkpoint_exclude_scopes:
      exclusions = [scope.strip()
                    for scope in FLAGS.checkpoint_exclude_scopes.split(',')]

    # TODO(sguada) variables.filter_variables()
    variables_to_restore = []
    for var in slim.get_model_variables():
      excluded = False
      for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
          excluded = True
          break
      if not excluded:
        variables_to_restore.append(var)

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Fine-tuning from %s' % checkpoint_path)

    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=FLAGS.ignore_missing_vars)


  def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer.
    """
    if FLAGS.trainable_scopes is None:
      return tf.trainable_variables()
    else:
      scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
      variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
      variables_to_train.extend(variables)
    return variables_to_train

  # main
  cluster_spec, server = TFNode.start_cluster_server(ctx=ctx, num_gpus=FLAGS.num_gpus, rdma=FLAGS.rdma)
  if ctx.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    if not FLAGS.dataset_dir:
      raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      #######################
      # Config model_deploy #
      #######################
      deploy_config = model_deploy.DeploymentConfig(
          num_clones=FLAGS.num_clones,
          clone_on_cpu=FLAGS.clone_on_cpu,
          replica_id=FLAGS.task,
          num_replicas=FLAGS.worker_replicas,
          num_ps_tasks=FLAGS.num_ps_tasks)

      # Create global_step
      #with tf.device(deploy_config.variables_device()):
      #  global_step = slim.create_global_step()
      with tf.device("/job:ps/task:0"):
        global_step = tf.Variable(0, name="global_step")

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_factory.get_dataset(
          FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

      ######################
      # Select the network #
      ######################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          weight_decay=FLAGS.weight_decay,
          is_training=True)

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_factory.get_preprocessing(
          preprocessing_name,
          is_training=True)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      with tf.device(deploy_config.inputs_device()):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        train_image_size = FLAGS.train_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, train_image_size, train_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)
        labels = slim.one_hot_encoding(
            labels, dataset.num_classes - FLAGS.labels_offset)
        batch_queue = slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * deploy_config.num_clones)

      ####################
      # Define the model #
      ####################
      def clone_fn(batch_queue):
        """Allows data parallelism by creating multiple clones of network_fn."""
        images, labels = batch_queue.dequeue()
        logits, end_points = network_fn(images)

        #############################
        # Specify the loss function #
        #############################
        if 'AuxLogits' in end_points:
          tf.losses.softmax_cross_entropy(
              logits=end_points['AuxLogits'], onehot_labels=labels,
              label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss')
        tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels,
            label_smoothing=FLAGS.label_smoothing, weights=1.0)
        return end_points

      # Gather initial summaries.
      summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

      clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
      first_clone_scope = deploy_config.clone_scope(0)
      # Gather update_ops from the first clone. These contain, for example,
      # the updates for the batch_norm variables created by network_fn.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

      # Add summaries for end_points.
      end_points = clones[0].outputs
      for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                        tf.nn.zero_fraction(x)))

      # Add summaries for losses.
      for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

      # Add summaries for variables.
      for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

      #################################
      # Configure the moving averages #
      #################################
      if FLAGS.moving_average_decay:
        moving_average_variables = slim.get_model_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
      else:
        moving_average_variables, variable_averages = None, None

      #########################################
      # Configure the optimization procedure. #
      #########################################
      with tf.device(deploy_config.optimizer_device()):
        learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
        optimizer = _configure_optimizer(learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

      if FLAGS.sync_replicas:
        # If sync_replicas is enabled, the averaging will be done in the chief
        # queue runner.
        optimizer = tf.train.SyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            variable_averages=variable_averages,
            variables_to_average=moving_average_variables,
            replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
            total_num_replicas=FLAGS.worker_replicas)
      elif FLAGS.moving_average_decay:
        # Update ops executed locally by trainer.
        update_ops.append(variable_averages.apply(moving_average_variables))

      # Variables to train.
      variables_to_train = _get_variables_to_train()

      #  and returns a train_tensor and summary_op
      total_loss, clones_gradients = model_deploy.optimize_clones(
          clones,
          optimizer,
          var_list=variables_to_train)
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

      # Create gradient updates.
      grad_updates = optimizer.apply_gradients(clones_gradients,
                                               global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                        name='train_op')

      # Add the summaries from the first clone. These contain the summaries
      # created by model_fn and either optimize_clones() or _gather_clone_loss().
      summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                         first_clone_scope))

      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')


      ###########################
      # Kicks off the training. #
      ###########################
      summary_writer = tf.summary.FileWriter("tensorboard_%d" %(ctx.worker_num), graph=tf.get_default_graph())
      slim.learning.train(
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  IMAGE_PIXELS=28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size   = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs/255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))

            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
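The map functions in this collection are not run directly; a Spark driver hands them to TensorFlowOnSpark. The following is a minimal, hypothetical driver-side sketch of how a function like the one above is typically launched; the SparkContext `sc`, the parsed `args` namespace, and the `images_labels` RDD of (image, label) pairs are assumed to exist here, and the exact arguments should be taken from the real TensorFlowOnSpark example drivers.

# Hypothetical driver-side sketch (not part of the original example).
# Assumes: a SparkContext `sc`, a parsed `args` namespace (cluster_size, epochs,
# mode, output, ...), and an `images_labels` RDD of (image, label) pairs.
from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, num_ps=1,
                        tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(images_labels, args.epochs)   # feeds TFNode.DataFeed inside map_fun
else:
    preds = cluster.inference(images_labels)    # collects tf_feed.batch_results(...)
    preds.saveAsTextFile(args.output)
cluster.shutdown()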
def main_fun(args, ctx):
    import numpy
    import os
    import tensorflow as tf
    import tensorflow.contrib.keras as keras
    from tensorflow.contrib.keras.api.keras import backend as K
    from tensorflow.contrib.keras.api.keras.models import Sequential, load_model, save_model
    from tensorflow.contrib.keras.api.keras.layers import Dense, Dropout
    from tensorflow.contrib.keras.api.keras.optimizers import RMSprop
    from tensorflow.contrib.keras.python.keras.callbacks import LambdaCallback, TensorBoard

    from tensorflow.python.saved_model import builder as saved_model_builder
    from tensorflow.python.saved_model import tag_constants
    from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def

    from tensorflowonspark import TFNode

    cluster, server = TFNode.start_cluster_server(ctx)

    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":

        def generate_rdd_data(tf_feed, batch_size):
            print("generate_rdd_data invoked")
            while True:
                batch = tf_feed.next_batch(batch_size)
                imgs = []
                lbls = []
                for item in batch:
                    imgs.append(item[0])
                    lbls.append(item[1])
                images = numpy.array(imgs).astype('float32') / 255
                labels = numpy.array(lbls).astype('float32')
                yield (images, labels)

        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % ctx.task_index,
                    cluster=cluster)):

            IMAGE_PIXELS = 28
            batch_size = 100
            num_classes = 10

            # the data, shuffled and split between train and test sets
            if args.input_mode == 'tf':
                from tensorflow.contrib.keras.api.keras.datasets import mnist
                (x_train, y_train), (x_test, y_test) = mnist.load_data()
                x_train = x_train.reshape(60000, 784)
                x_test = x_test.reshape(10000, 784)
                x_train = x_train.astype('float32') / 255
                x_test = x_test.astype('float32') / 255

                # convert class vectors to binary class matrices
                y_train = keras.utils.to_categorical(y_train, num_classes)
                y_test = keras.utils.to_categorical(y_test, num_classes)
            else:  # args.input_mode == 'spark'
                x_train = tf.placeholder(tf.float32,
                                         [None, IMAGE_PIXELS * IMAGE_PIXELS],
                                         name="x_train")
                y_train = tf.placeholder(tf.float32, [None, 10],
                                         name="y_train")

            model = Sequential()
            model.add(Dense(512, activation='relu', input_shape=(784, )))
            model.add(Dropout(0.2))
            model.add(Dense(512, activation='relu'))
            model.add(Dropout(0.2))
            model.add(Dense(10, activation='softmax'))

            model.summary()

            model.compile(loss='categorical_crossentropy',
                          optimizer=RMSprop(),
                          metrics=['accuracy'])

        saver = tf.train.Saver()

        with tf.Session(server.target) as sess:
            K.set_session(sess)

            def save_checkpoint(epoch, logs=None):
                if epoch == 1:
                    tf.train.write_graph(sess.graph.as_graph_def(),
                                         args.model_dir, 'graph.pbtxt')
                saver.save(sess,
                           os.path.join(args.model_dir, 'model.ckpt'),
                           global_step=epoch * args.steps_per_epoch)

            ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint)
            tb_callback = TensorBoard(log_dir=args.model_dir,
                                      histogram_freq=1,
                                      write_graph=True,
                                      write_images=True)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = [ckpt_callback, tb_callback
                         ] if ctx.task_index == 0 else None

            if args.input_mode == 'tf':
                # train & validate on in-memory data
                history = model.fit(x_train,
                                    y_train,
                                    batch_size=batch_size,
                                    epochs=args.epochs,
                                    verbose=1,
                                    validation_data=(x_test, y_test),
                                    callbacks=callbacks)
            else:  # args.input_mode == 'spark':
                # train on data read from a generator which is producing data from a Spark RDD
                tf_feed = TFNode.DataFeed(ctx.mgr)
                history = model.fit_generator(
                    generator=generate_rdd_data(tf_feed, batch_size),
                    steps_per_epoch=args.steps_per_epoch,
                    epochs=args.epochs,
                    verbose=1,
                    callbacks=callbacks)

            if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0:
                # save a local Keras model, so we can reload it with an inferencing learning_phase
                save_model(model, "tmp_model")

                # reload the model
                K.set_learning_phase(False)
                new_model = load_model("tmp_model")

                # export a saved_model for inferencing
                builder = saved_model_builder.SavedModelBuilder(
                    args.export_dir)
                signature = predict_signature_def(
                    inputs={'images': new_model.input},
                    outputs={'scores': new_model.output})
                builder.add_meta_graph_and_variables(
                    sess=sess,
                    tags=[tag_constants.SERVING],
                    signature_def_map={'predict': signature},
                    clear_devices=True)
                builder.save()

            if args.input_mode == 'spark':
                tf_feed.terminate()
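The chief worker above exports a SavedModel with a single 'predict' signature (input 'images', output 'scores'). Below is a minimal, hypothetical sketch of consuming that export with the TF 1.x loader API; the helper name and the assumption that inputs are [N, 784] floats scaled to [0, 1] are mine, not part of the original example.

# Hypothetical sketch: reload the SavedModel written by builder.save() above
# and run its 'predict' signature.
import tensorflow as tf

def predict_with_export(export_dir, batch):
    """batch: numpy array of shape [N, 784], scaled to [0, 1]."""
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        sig = meta_graph.signature_def['predict']
        images = sess.graph.get_tensor_by_name(sig.inputs['images'].name)
        scores = sess.graph.get_tensor_by_name(sig.outputs['scores'].name)
        return sess.run(scores, feed_dict={images: batch})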
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import os
    import tensorflow as tf

    num_workers = args.cluster_size if args.driver_ps_nodes else args.cluster_size - args.num_ps
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters (assumed to match the other MNIST examples in this collection)
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1,
                                                  args.protocol == 'rdma')

    def _parse_tfr(example_proto):
        feature_def = {
            "label": tf.FixedLenFeature(10, tf.int64),
            "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)
        }
        features = tf.parse_single_example(example_proto, feature_def)
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(tf.to_float(features['image']), norm)
        label = tf.to_float(features['label'])
        return (image, label)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # read from saved tf records
            images = TFNode.hdfs_path(ctx, args.tfrecord_dir)
            tf_record_pattern = os.path.join(images, 'part-*')
            tfr_files = tf.gfile.Glob(tf_record_pattern)
            ds = tf.data.TFRecordDataset(tfr_files)
            parse_fn = _parse_tfr
            ds = ds.shard(num_workers, task_index).repeat(args.epochs).shuffle(
                args.shuffle_size)
            ds = ds.map(parse_fn).batch(args.batch_size)
            iterator = ds.make_initializable_iterator()
            x, y_ = iterator.get_next()

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model_dir)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=global_step,
                                 stop_grace_secs=300,
                                 save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))
            sess.run(iterator.initializer)

            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = 0
            while not sv.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using QueueRunners/Readers
                if (step % 100 == 0):
                    print("{0} step: {1} accuracy: {2}".format(
                        datetime.now().isoformat(), step, sess.run(accuracy)))
                _, summary, step = sess.run(
                    [train_op, summary_op, global_step])
                if sv.is_chief:
                    summary_writer.add_summary(summary, step)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
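The _parse_tfr function above expects each TFRecord to carry an int64 'image' feature of length 784 (raw pixel values) and an int64 one-hot 'label' feature of length 10; the 'part-*' file pattern suggests the records themselves are produced elsewhere. As a hypothetical sketch of that record layout (the writer function below is illustrative only, not part of the original pipeline):

# Hypothetical sketch of writing MNIST TFRecords in the layout _parse_tfr expects.
import tensorflow as tf

def write_mnist_tfrecords(path, images, labels):
    """images: iterable of length-784 int sequences; labels: iterable of length-10 one-hot int sequences."""
    with tf.python_io.TFRecordWriter(path) as writer:
        for image, label in zip(images, labels):
            example = tf.train.Example(features=tf.train.Features(feature={
                'image': tf.train.Feature(int64_list=tf.train.Int64List(value=list(image))),
                'label': tf.train.Feature(int64_list=tf.train.Int64List(value=list(label))),
            }))
            writer.write(example.SerializeToString())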
def map_fun(args, ctx):
    # from com.yahoo.ml.tf import TFNode
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    from tensorflow.contrib.layers.python.layers import batch_norm
    import time
    import os

    worker_num = ctx.worker_num  # number of workers
    job_name = ctx.job_name  # job name
    task_index = ctx.task_index  # task index
    cluster_spec = ctx.cluster_spec  # cluster spec

    IMAGE_PIXELS = 2  # image size; MNIST is 28x28x1 (adjust for your own image size)
    channels = 3
    num_class = 2
    # global dropout
    dropout = args.dropout
    # Parameters
    # hidden_units = 128  # NN hidden layer
    # training_epochs=args.epochs
    batch_size = args.batch_size  # number of samples per training batch
    # img_nums=630000
    # global learning_rate
    # learning_rate=args.learning_rate
    INITIAL_LEARNING_RATE = args.learning_rate
    # flag=True

    # batch_size=200

    num_examples_per_epoch_for_train = (4015 - 1)**2  # number of samples per iteration
    num_batches_per_epoch = int(num_examples_per_epoch_for_train / batch_size)
    num_epochs_per_decay = 1.2
    learning_rate_decay_rate = 0.8
    learning_rate_decay_steps = int(num_batches_per_epoch *
                                    num_epochs_per_decay)
    """
  # --------- set up a dynamic learning rate
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size  # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.

  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                             global_step1,
                                             decay_steps,
                                             LEARNING_RATE_DECAY_FACTOR,
                                             staircase=True)
# set up a dynamic learning rate ----------
"""

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":  # ps节点(主节点)
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        # Convert from [(images, labels)] to two numpy arrays of the proper type
        images = []
        labels = []
        if args.mode != 'inference':
            numpy.random.shuffle(batch)  # shuffle randomly
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        xs = xs.astype(numpy.float32)
        # xs = xs/255.0  # normalize data
        # Z-score standardization
        # mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
        # std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
        # xs = (xs - mean) / std

        # min-max normalization (Min-Max Normalization)
        max_ = numpy.reshape(numpy.max(xs, 1), [numpy.shape(xs)[0], 1])
        min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])

        xs = (xs - min_) / (max_ - min_)
        ys = numpy.array(labels)
        if args.mode != 'inference':
            ys = ys.astype(numpy.uint8)
        else:
            ys = ys.astype(numpy.uint16)
        return (xs, ys)

    def batch_norm_layer(inputT, is_training=True, scope=None):
        # Note: is_training is tf.placeholder(tf.bool) type
        return tf.cond(is_training,
                       lambda: batch_norm(inputT,
                                          is_training=True,
                                          center=True,
                                          scale=True,
                                          activation_fn=tf.nn.relu,
                                          decay=0.9,
                                          scope=scope),
                       lambda: batch_norm(inputT,
                                          is_training=False,
                                          center=True,
                                          scale=True,
                                          activation_fn=tf.nn.relu,
                                          decay=0.9,
                                          scope=scope))  # , reuse = True))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Create some wrappers for simplicity
            def conv2d(x, W, b, strides=1):
                # Conv2D wrapper, with bias and relu activation
                x = tf.nn.conv2d(x,
                                 W,
                                 strides=[1, strides, strides, 1],
                                 padding='SAME')
                x = tf.nn.bias_add(x, b)  # middle stride values of 1: no skipping in the x or y direction
                return tf.nn.relu(x)

            def maxpool2d(x, k=2):
                # MaxPool2D wrapper
                return tf.nn.max_pool(
                    x,
                    ksize=[1, k, k, 1],
                    strides=[1, k, k, 1],
                    padding='SAME')  # middle stride values of 2: sample every other point in x and y

            # Store layers weight & bias
            weights = {
                # 5x5 conv, 3 input, 32 outputs; color images have 3 input channels, grayscale images have 1
                'wc1':
                tf.get_variable('wc1', [3, 3, channels, 128],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),  # 5x5 convolution kernel

                # 5x5 conv, 32 inputs, 64 outputs
                'wc2':
                tf.get_variable('wc2', [3, 3, 32, 64],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),

                # fully connected, 7*7*64 inputs, 1024 outputs
                'wd1':
                tf.Variable(
                    tf.random_normal([
                        (IMAGE_PIXELS // 2) * (IMAGE_PIXELS // 2) * 128, 1024
                    ])),
                # 1024 inputs, 10 outputs (class prediction)
                'out':
                tf.Variable(tf.random_normal([1024, num_class]))
            }

            biases = {
                'bc1':
                tf.get_variable('bc1', [128],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),
                'bc2':
                tf.get_variable('bc2', [64],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),
                'bd1':
                tf.Variable(tf.random_normal([1024])),
                'out':
                tf.Variable(tf.random_normal([num_class]))
            }

            # Placeholders or QueueRunner/Readers for input data
            x = tf.placeholder(tf.float32,
                               [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                               name="x")  # mnist 28*28*1
            if args.mode != 'inference':
                y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
            else:
                y_ = tf.placeholder(tf.float32, [None, 4], name="y_")
                label = y_
            keep = tf.placeholder(tf.float32)
            is_training = tf.placeholder(tf.bool, name='MODE')

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # MNIST data: 28x28x1 (grayscale, 1 band)

            # x_img=batch_norm_layer(x_img,is_training)
            x_img = tf.nn.lrn(x_img,
                              depth_radius=5,
                              bias=2.0,
                              alpha=1e-3,
                              beta=0.75)  # LRN layer

            # switch to a convolutional model
            conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
            conv1 = maxpool2d(conv1, k=2)  # shape [N,1,1,32]
            conv1 = tf.nn.lrn(conv1,
                              depth_radius=5,
                              bias=2.0,
                              alpha=1e-3,
                              beta=0.75)  # LRN layer
            # conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
            # conv2 = maxpool2d(conv2, k=2)  # shape [N,1,1,32]
            # conv1 = tf.nn.dropout(conv1, keep+0.1)
            fc1 = tf.reshape(conv1,
                             [-1, weights['wd1'].get_shape().as_list()[0]])
            fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
            # fc1=batch_norm_layer(fc1, is_training)
            fc1 = tf.nn.relu(fc1)
            fc1 = tf.nn.dropout(fc1, keep)
            y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
            prediction = tf.argmax(y, 1, name="prediction")
            # y = tf.sigmoid(y)  # binary classification; for multi-class use tf.nn.softmax()

            global_step = tf.Variable(0, name="global_step", trainable=False)

            # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            if args.mode != 'inference':
                loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                            logits=y))

                # learning_rate=tf.train.exponential_decay(INITIAL_LEARNING_RATE,global_step,
                #                                          learning_rate_decay_steps,learning_rate_decay_rate,
                #                                          staircase=False)

                # learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                #                                            global_step,
                #                                            10000,
                #                                            0.96,
                #                                            staircase=False)
                learning_rate = tf.train.polynomial_decay(
                    INITIAL_LEARNING_RATE, global_step, 3000000, 1e-5, 0.8,
                    True)
                # ratio of run steps to decay_steps > 1000:1
                # train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
                #     loss, global_step=global_step)

                train_op = tf.train.GradientDescentOptimizer(
                    learning_rate).minimize(loss, global_step=global_step)

                # Test trained model
                label = tf.argmax(y_, 1, name="label")
                # prediction = tf.argmax(y, 1,name="prediction")
                correct_prediction = tf.equal(prediction, label)

                accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                                  tf.float32),
                                          name="accuracy")
                # tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()

            # summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))  #
        # log.info("tensorflow model path: {0}".format(logdir))
        # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                init_op=init_op,
                # summary_op=None,
                saver=saver,
                # saver=None,  # None disables automatic model saving
                # recovery_wait_secs=1,
                global_step=global_step,
                stop_grace_secs=300,
                save_model_secs=10)
        elif args.mode == "retrain":
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                # init_op=init_op,
                # summary_op=None,
                # saver=None,  # None disables automatic model saving
                saver=saver,
                # recovery_wait_secs=1,
                global_step=global_step,
                stop_grace_secs=300,
                save_model_secs=10)
        else:
            sv = tf.train.Supervisor(
                is_chief=(task_index == 0),
                logdir=logdir,
                # summary_op=None,
                saver=saver,
                # recovery_wait_secs=1,
                global_step=global_step,
                stop_grace_secs=300,
                save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:  # open the session
            """
      # check whether a checkpoint file was already saved
      ckpt = tf.train.get_checkpoint_state(logdir)
      if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess,ckpt.model_checkpoint_path)
      """
            # global_step=int(ckpt.model_checkpoint_path.rsplit('-',1)[1])
            # else:
            #   sess.run(init_op)

            print("{0} session ready".format(datetime.now().isoformat()))
            # log.info("{0} session ready".format(datetime.now().isoformat()))
            # Loop until the supervisor shuts down or 1000000 steps have completed.
            step = 0
            # acc1=args.acc
            # n = 0
            tf_feed = TFNode.DataFeed(
                ctx.mgr, args.mode == "train" or args.mode == "retrain")
            while not sv.should_stop() and not tf_feed.should_stop(
            ) and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # using feed_dict
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                feed = {
                    x: batch_xs,
                    y_: batch_ys,
                    keep: dropout,
                    is_training: True
                }
                if len(batch_xs) > 0:
                    if args.mode == "train" or args.mode == "retrain":
                        # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
                        _, step = sess.run([train_op, global_step],
                                           feed_dict=feed)
                        '''
            if dropout > 0.2:
                if step%10000==0:dropout=dropout*0.85
            else:
                dropout=0.7
            '''
                        """
            acc=sess.run(accuracy,{x: batch_xs, y_: batch_ys,keep:1.})
            if acc>acc1:
              if flag and acc>0.9:
                os.popen('hdfs dfs -rm -r '+logdir+'/*') # delete all files under the HDFS model directory
                flag=False
              # acc1=acc # add this once training has progressed far enough
              saver.save(sess,logdir+'/'+args.model_name,global_step=step)
              n=0
              # learning_rate=1e-3
              # dropout=.7
            else:
              n += 1
              if n > 100:
                ckpt1 = tf.train.get_checkpoint_state(logdir)
                if ckpt1 and ckpt1.model_checkpoint_path:
                  saver.restore(sess, ckpt1.model_checkpoint_path)
                if learning_rate > 1e-7:
                  # learning_rate = learning_rate * .96**(step/10)
                  learning_rate = learning_rate * .8
                else:
                  learning_rate = 1e-3
                if dropout > 0.2:
                  dropout = dropout * .85
                else:
                  dropout = .7
            """

                        # print accuracy and save model checkpoint to HDFS every 100 steps
                        if (step % 100 == 0):
                            print("{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(
                                    accuracy, {
                                        x: batch_xs,
                                        y_: batch_ys,
                                        keep: 1.,
                                        is_training: False
                                    })))
                            # log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
                        if sv.is_chief:
                            pass
                            # summary_writer.add_summary(summary, step)
                    elif args.mode == 'test':
                        feed2 = {
                            x: batch_xs,
                            y_: batch_ys,
                            keep: 1.,
                            is_training: False
                        }
                        labels, preds, acc = sess.run(
                            [label, prediction, accuracy], feed_dict=feed2)
                        results = [
                            "{0} Label: {1}, Prediction: {2}".format(
                                datetime.now().isoformat(), l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        print("acc: {0}".format(acc))
                    else:  # args.mode == "inference"
                        feed2 = {
                            x: batch_xs,
                            y_: batch_ys,
                            keep: 1.,
                            is_training: False
                        }
                        # labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed2)
                        labels, preds = sess.run([label, prediction],
                                                 feed_dict=feed2)
                        # results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
                        results = [
                            "Label: {0}, Prediction: {1}".format(l, p)
                            for l, p in zip(labels, preds)
                        ]
                        tf_feed.batch_results(results)
                        # print("acc: {0}".format(acc))
                        # log.info("acc: {0}".format(acc))
            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        # log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
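The training branch above schedules its learning rate with tf.train.polynomial_decay(INITIAL_LEARNING_RATE, global_step, 3000000, 1e-5, 0.8, True), i.e. decay_steps=3000000, end_learning_rate=1e-5, power=0.8 and cycle=True. As a rough illustration of the documented decay formula (shown here without the cycle behaviour, and with a hypothetical initial rate of 0.01 since the real value comes from args.learning_rate):

# Sketch of the polynomial decay formula, cycle behaviour omitted:
#   decayed_lr = (lr0 - end_lr) * (1 - step / decay_steps) ** power + end_lr
def polynomial_decay(lr0, step, decay_steps, end_lr, power):
    step = min(step, decay_steps)
    return (lr0 - end_lr) * (1.0 - float(step) / decay_steps) ** power + end_lr

for step in (0, 300000, 1500000, 3000000):
    print(step, polynomial_decay(0.01, step, 3000000, 1e-5, 0.8))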
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  IMAGE_PIXELS = 28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

  def feed_dict(batch):
    # Convert from dict of named arrays to two numpy arrays of the proper type
    images = batch['image']
    labels = batch['label']
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs / 255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
      worker_device="/job:worker/task:%d" % task_index,
      cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                          stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                         stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model_dir)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    sv = tf.train.Supervisor(is_chief=(task_index == 0),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             saver=saver,
                             global_step=global_step,
                             stop_grace_secs=300,
                             save_model_secs=10)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
          # print accuracy and save model checkpoint to HDFS every 100 steps
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys})))

          if sv.is_chief:
            summary_writer.add_summary(summary, step)

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

      if sv.is_chief and args.export_dir:
        print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir))
        # exported signatures defined in code
        signatures = {
          tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
            'inputs': {'image': x},
            'outputs': {'prediction': prediction},
            'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
          },
          'featurize': {
            'inputs': {'image': x},
            'outputs': {'features': hid},
            'method_name': 'featurize'
          }
        }
        TFNode.export_saved_model(sess,
                                  args.export_dir,
                                  tf.saved_model.tag_constants.SERVING,
                                  signatures)
      else:
        # non-chief workers should wait for chief
        while not sv.should_stop():
          print("Waiting for chief")
          time.sleep(5)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
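Besides the default serving signature (image -> prediction), the chief above exports a 'featurize' signature that maps the 'image' input to the hidden-layer activations, so the trained network can also act as a feature extractor. Below is a minimal, hypothetical sketch of using that signature; the helper name and the [N, 784] float input assumption are mine, not part of the original example.

# Hypothetical sketch: load the export written by TFNode.export_saved_model above
# and run its 'featurize' signature.
import tensorflow as tf

def extract_features(export_dir, images_batch):
    """images_batch: numpy array of shape [N, 784], scaled to [0, 1]."""
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        sig = meta_graph.signature_def['featurize']
        image_t = sess.graph.get_tensor_by_name(sig.inputs['image'].name)
        features_t = sess.graph.get_tensor_by_name(sig.outputs['features'].name)
        return sess.run(features_t, feed_dict={image_t: images_batch})  # [N, hidden_units]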
def main_fun(argv, ctx):

    import math
    import sys
    import time
    from datetime import datetime

    import numpy as np
    import tensorflow as tf
    from tensorflowonspark import TFNode

    import cifar10

    sys.argv = argv
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                               """Directory where to write event logs.""")
    tf.app.flags.DEFINE_string('eval_data', 'test',
                               """Either 'test' or 'train_eval'.""")
    tf.app.flags.DEFINE_string(
        'checkpoint_dir', '/tmp/cifar10_train',
        """Directory where to read model checkpoints.""")
    tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                                """How often to run the eval.""")
    tf.app.flags.DEFINE_integer('num_examples', 10000,
                                """Number of examples to run.""")
    tf.app.flags.DEFINE_boolean('run_once', False,
                                """Whether to run eval only once.""")
    tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    def eval_once(saver, summary_writer, top_k_op, summary_op):
        """Run Eval once.

    Args:
      saver: Saver.
      summary_writer: Summary writer.
      top_k_op: Top K op.
      summary_op: Summary op.
    """
        with tf.Session() as sess:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
            else:
                print('No checkpoint file found')
                return

            # Start the queue runners.
            coord = tf.train.Coordinator()
            try:
                threads = []
                for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                    threads.extend(
                        qr.create_threads(sess,
                                          coord=coord,
                                          daemon=True,
                                          start=True))

                num_iter = int(math.ceil(FLAGS.num_examples /
                                         FLAGS.batch_size))
                true_count = 0  # Counts the number of correct predictions.
                total_sample_count = num_iter * FLAGS.batch_size
                step = 0
                while step < num_iter and not coord.should_stop():
                    predictions = sess.run([top_k_op])
                    true_count += np.sum(predictions)
                    step += 1

                # Compute precision @ 1.
                precision = true_count / total_sample_count
                print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

                summary = tf.Summary()
                summary.ParseFromString(sess.run(summary_op))
                summary.value.add(tag='Precision @ 1', simple_value=precision)
                summary_writer.add_summary(summary, global_step)
            except Exception as e:  # pylint: disable=broad-except
                coord.request_stop(e)

            coord.request_stop()
            coord.join(threads, stop_grace_period_secs=10)

    def evaluate():
        """Eval CIFAR-10 for a number of steps."""
        with tf.Graph().as_default() as g:
            # Get images and labels for CIFAR-10.
            eval_data = FLAGS.eval_data == 'test'
            images, labels = cifar10.inputs(eval_data=eval_data)

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate predictions.
            top_k_op = tf.nn.in_top_k(logits, labels, 1)

            # Restore the moving average version of the learned variables for eval.
            variable_averages = tf.train.ExponentialMovingAverage(
                cifar10.MOVING_AVERAGE_DECAY)
            variables_to_restore = variable_averages.variables_to_restore()
            saver = tf.train.Saver(variables_to_restore)

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.summary.merge_all()

            summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

            while True:
                eval_once(saver, summary_writer, top_k_op, summary_op)
                if FLAGS.run_once:
                    break
                time.sleep(FLAGS.eval_interval_secs)

    #cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    evaluate()
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num # number of workers
  job_name = ctx.job_name # job name
  task_index = ctx.task_index # task index
  cluster_spec = ctx.cluster_spec # cluster spec

  IMAGE_PIXELS=10 # image size; MNIST is 28x28x1 (adjust for your own image size)
  channels=3
  num_class=2
  dropout = 0.5

  learning_rate=1e-6
  # Parameters
  hidden_units = 128 # NN hidden layer
  training_epochs=args.epochs
  img_nums=630000
  #batch_size   = args.batch_size # number of samples per training batch
  batch_size=200
  """
  # --------- set up a dynamic learning rate
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size  # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.

  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                            global_step1,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
  # set up a dynamic learning rate ----------
  """
  
  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps": # ps节点(主节点)
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    numpy.random.shuffle(batch) # shuffle randomly
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    #xs = xs/255.0 # normalize data
    # Z-score standardization
    #mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
    #std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
    #xs = (xs - mean) / std

    # min-max normalization (Min-Max Normalization)
    max_=numpy.reshape(numpy.max(xs,1),[numpy.shape(xs)[0], 1])
    min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])

    xs=(xs-min_)/(max_-min_)
    
    
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle stride values of 1: no skipping in the x or y direction
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle stride values of 2: sample every other point in x and y

      def maxpool2d2(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='VALID')  # middle stride values of 2: sample every other point in x and y

      # Store layers weight & bias
      weights = {
          # 5x5 conv, 3 input, 32 outputs; color images have 3 input channels, grayscale images have 1
          'wc1': tf.get_variable('wc1',[3,3,channels,64],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),  # 5x5 convolution kernel

          # 5x5 conv, 32 inputs, 64 outputs
          'wc2': tf.get_variable('wc2',[3,3,64,128],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'wc3': tf.Variable(tf.random_normal([3, 3, 256, 128])),
          'wc4': tf.get_variable('wc4',[3,3,128,num_class],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # fully connected, 7*7*64 inputs, 1024 outputs
          # 'wd1': tf.Variable(tf.random_normal([(1+IMAGE_PIXELS // 4) * (1+IMAGE_PIXELS // 4) * 64, 1024])),
          # 1024 inputs, 10 outputs (class prediction)
          # 'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
          'bc1': tf.get_variable('bc1',[64],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          'bc2': tf.get_variable('bc2',[128],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'bc3': tf.Variable(tf.random_normal([128])),
          'bc4': tf.get_variable('bc4',[num_class],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'bd1': tf.Variable(tf.random_normal([1024])),
          # 'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")  # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      # keep=tf.placeholder(tf.float32)

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # MNIST data: 28x28x1 (grayscale, 1 band)
      # tf.summary.image("x_img", x_img)

      # switch to a convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      # conv1 = tf.nn.dropout(conv1, keep)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      conv2 = tf.nn.dropout(conv2, dropout)
      # conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
      # conv3 = tf.nn.dropout(conv3, keep)
      conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
      conv4 = maxpool2d2(conv4, k=2)
      y = tf.reshape(conv4, [-1, num_class])


      # fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      # fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      # fc1 = tf.nn.relu(fc1)
      # if args.mode == "train" or args.mode == "retrain":
      #   fc1 = tf.nn.dropout(fc1, dropout)
      # y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

 
      # global_step = tf.Variable(0)

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

      loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_,logits=y))

      # tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
          loss, global_step=global_step)


      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()


    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir)) #
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess: # open the session

      print("{0} session ready".format(datetime.now().isoformat()))
      # log.info("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train" or args.mode == "retrain")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train" or args.mode == "retrain":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op,  global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
              # log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            # log.info("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
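
# A minimal driver-side sketch (an illustrative addition, not part of the
# original example) of how a map function like the one above is typically
# launched with TensorFlowOnSpark in InputMode.SPARK. The names `map_fun`,
# `args.epochs`, `args.output`, `dataRDD`, and the executor counts are
# assumptions made for illustration only.
def _driver_sketch(sc, map_fun, args, dataRDD):
  from tensorflowonspark import TFCluster
  # Start one TF node per Spark executor (1 PS plus workers) in SPARK input mode.
  cluster = TFCluster.run(sc, map_fun, args, num_executors=4, num_ps=1,
                          tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
  if args.mode in ("train", "retrain"):
    cluster.train(dataRDD, args.epochs)      # feed RDD partitions to the workers
  else:
    labelRDD = cluster.inference(dataRDD)    # collect per-record predictions
    labelRDD.saveAsTextFile(args.output)
  cluster.shutdown()
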
def main_fun(argv, ctx):
  import tensorflow as tf
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_integer('num_gpus', 1,
                              """How many GPUs to use.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
      # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
      # session. This helps the clarity of presentation on tensorboard.
      loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
      tf.summary.scalar(loss_name, l)

    return total_loss


  def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
      # Note that each grad_and_vars looks like the following:
      #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
      grads = []
      for g, _ in grad_and_vars:
        # Add 0 dimension to the gradients to represent the tower.
        expanded_g = tf.expand_dims(g, 0)

        # Append on a 'tower' dimension which we will average over below.
        grads.append(expanded_g)

      # Average over the 'tower' dimension.
      grad = tf.concat(axis=0, values=grads)
      grad = tf.reduce_mean(grad, 0)

      # Keep in mind that the Variables are redundant because they are shared
      # across towers. So .. we will just return the first tower's pointer to
      # the Variable.
      v = grad_and_vars[0][1]
      grad_and_var = (grad, v)
      average_grads.append(grad_and_var)
    return average_grads
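
  # Worked illustration (hypothetical values): with two towers and one shared
  # variable v, tower_grads = [[(g_a, v)], [(g_b, v)]], so zip(*tower_grads)
  # yields the single group ((g_a, v), (g_b, v)); the gradients are stacked
  # along a new axis 0 and reduced with reduce_mean, giving ((g_a + g_b) / 2, v).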


  def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
      # Create a variable to count the number of train() calls. This equals the
      # number of batches processed * FLAGS.num_gpus.
      global_step = tf.get_variable(
          'global_step', [],
          initializer=tf.constant_initializer(0), trainable=False)

      # Calculate the learning rate schedule.
      num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                               FLAGS.batch_size)
      decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                      global_step,
                                      decay_steps,
                                      cifar10.LEARNING_RATE_DECAY_FACTOR,
                                      staircase=True)
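      # With staircase=True this evaluates to
      #   lr = INITIAL_LEARNING_RATE * LEARNING_RATE_DECAY_FACTOR ** (global_step // decay_steps)
      # so the rate drops by a constant factor once per decay interval.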

      # Create an optimizer that performs gradient descent.
      opt = tf.train.GradientDescentOptimizer(lr)

      # Calculate the gradients for each model tower.
      tower_grads = []
      with tf.variable_scope(tf.get_variable_scope()):
        for i in range(FLAGS.num_gpus):
          with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
              # Calculate the loss for one tower of the CIFAR model. This function
              # constructs the entire CIFAR model but shares the variables across
              # all towers.
              loss = tower_loss(scope)

              # Reuse variables for the next tower.
              tf.get_variable_scope().reuse_variables()

              # Retain the summaries from the final tower.
              summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

              # Calculate the gradients for the batch of data on this CIFAR tower.
              grads = opt.compute_gradients(loss)

              # Keep track of the gradients across all towers.
              tower_grads.append(grads)

      # We must calculate the mean of each gradient. Note that this is the
      # synchronization point across all towers.
      grads = average_gradients(tower_grads)

      # Add a summary to track the learning rate.
      summaries.append(tf.summary.scalar('learning_rate', lr))

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

      # Apply the gradients to adjust the shared variables.
      apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

      # Add histograms for trainable variables.
      for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

      # Track the moving averages of all trainable variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY, global_step)
      variables_averages_op = variable_averages.apply(tf.trainable_variables())

      # Group all updates into a single train op.
      train_op = tf.group(apply_gradient_op, variables_averages_op)

      # Create a saver.
      saver = tf.train.Saver(tf.global_variables())

      # Build the summary operation from the last tower summaries.
      summary_op = tf.summary.merge(summaries)

      # Build an initialization operation to run below.
      init = tf.global_variables_initializer()

      # Start running operations on the Graph. allow_soft_placement must be set to
      # True to build towers on GPU, as some of the ops do not have GPU
      # implementations.
      sess = tf.Session(config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement))
      sess.run(init)

      # Start the queue runners.
      tf.train.start_queue_runners(sess=sess)

      summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

      for step in range(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = duration / FLAGS.num_gpus

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print(format_str % (datetime.now(), step, loss_value,
                              examples_per_sec, sec_per_batch))

        if step % 100 == 0:
          summary_str = sess.run(summary_op)
          summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()
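
# Illustrative note: unlike the feed-based example above, this CIFAR-10 function
# reads its own input via cifar10.distorted_inputs(), so a driver would typically
# launch it with TFCluster.run(..., input_mode=TFCluster.InputMode.TENSORFLOW) and
# call cluster.shutdown() without feeding an RDD; the exact launcher arguments are
# an assumption and are not shown in the original example.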