Example #1
    def end(self, session):
        if self.mode != 'train':
            return

        print("{} ======= Exporting to: {}".format(datetime.now().isoformat(),
                                                   self.export_dir))
        signatures = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            {
                'inputs': {
                    'image': self.input_tensor
                },
                'outputs': {
                    'prediction': self.output_tensor
                },
                'method_name':
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME
            }
        }

        # Save and export the model
        TFNode.export_saved_model(session, self.export_dir,
                                  tf.saved_model.tag_constants.SERVING,
                                  signatures)
        print("{} ======= Done exporting".format(datetime.now().isoformat()))
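Note: an end(self, session) method like the one above is typically defined on a tf.train.SessionRunHook. Below is a minimal sketch of the surrounding hook class, assuming constructor fields named after the attributes used above (hypothetical, not part of the original example):

import tensorflow as tf

class ExportHook(tf.train.SessionRunHook):
    """Hypothetical hook that exports a SavedModel when the training session ends."""

    def __init__(self, mode, export_dir, input_tensor, output_tensor):
        self.mode = mode                    # export only runs when mode == 'train'
        self.export_dir = export_dir        # destination directory for the SavedModel
        self.input_tensor = input_tensor    # graph tensor fed with input images
        self.output_tensor = output_tensor  # graph tensor holding predictions

    # the end(self, session) method shown above goes here; a
    # tf.train.MonitoredTrainingSession calls it once the session finishes.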
Example #2
    def _tf_export(args):
      """Creates an inference graph w/ placeholder and loads weights from checkpoint"""
      import tensorflow as tf
      from tensorflowonspark import TFNode

      tf.reset_default_graph()                          # reset graph in case we're re-using a Spark python worker
      x = tf.placeholder(tf.float32, [None, 2], name='x')
      w = tf.Variable(tf.truncated_normal([2,1]), name='w')
      y = tf.matmul(x, w, name='y')
      y2 = tf.square(y, name="y2")                      # extra/optional output for testing multiple output tensors
      saver = tf.train.Saver()

      with tf.Session() as sess:
        # load graph from a checkpoint
        ckpt = tf.train.get_checkpoint_state(args.model_dir)
        assert ckpt and ckpt.model_checkpoint_path, "Invalid model checkpoint path: {}".format(args.model_dir)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # exported signatures defined in code
        signatures = {
          'test_key': {
            'inputs': { 'features': x },
            'outputs': { 'prediction': y, 'pred2': y2 },
            'method_name': 'test'
          }
        }
        TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures)
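As a quick sanity check (not part of the original example), the SavedModel written by _tf_export could be loaded back and its 'test_key' signature inspected, assuming the same args.export_dir, the 'test_tag' tag used above, and TensorFlow 1.x APIs:

import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
  # reload the exported model under the tag it was saved with
  meta_graph = tf.saved_model.loader.load(sess, ['test_tag'], args.export_dir)
  sig = meta_graph.signature_def['test_key']
  print('inputs:', list(sig.inputs.keys()))    # expected: ['features']
  print('outputs:', list(sig.outputs.keys()))  # expected: ['prediction', 'pred2']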
Example #3
def sample(args, sc):
    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    working_dir = os.getcwd()

    config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'),
                                   defaultFS, working_dir)
    saved_args = sc.pickleFile(config_file).collect()[0]
    chars_vocab_file = TFNode.hdfs_path(
        os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir)
    chars, vocab = sc.pickleFile(chars_vocab_file).collect()
    model = Model(saved_args, training=False)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        save_dir = TFNode.hdfs_path(os.path.join(args.save_dir, ''), defaultFS,
                                    working_dir)
        ckpt = tf.train.get_checkpoint_state(save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            sample_ = model.sample(sess, chars, vocab, args.n, args.prime,
                                   args.sample)
            with hdfs.open(
                    TFNode.hdfs_path(
                        os.path.join(args.output_dir, 'output.txt'), defaultFS,
                        working_dir), 'w') as f:
                f.write(sample_)
Example #4
        def _map_fun(args, ctx):
            import tensorflow as tf
            cluster, server = TFNode.start_cluster_server(ctx)
            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x = tf.placeholder(tf.int32, [None, 1])
                    sq = tf.square(x)
                    init_op = tf.global_variables_initializer()
                with tf.train.MonitoredTrainingSession(
                        is_chief=(ctx.task_index == 0)) as sess:
                    tf_feed = TFNode.DataFeed(ctx.mgr, False)
                    while not sess.should_stop() and not tf_feed.should_stop():
                        batch = tf_feed.next_batch(10)
                        if len(batch) > 0:
                            outputs = sess.run([sq], feed_dict={x: batch})
                            tf_feed.batch_results(outputs[0])

                # simulate post-feed actions that raise an exception
                time.sleep(2)
                raise Exception("FAKE exception after feeding")
Example #5
 def __call__(self, args, ctx):
     self.task_index = ctx.task_index
     self.job_name = ctx.job_name
     self.cluster, self.server = TFNode.start_cluster_server(ctx)
     self.tf_feed = TFNode.DataFeed(ctx.mgr)
     if ctx.job_name == "ps":
         self.server.join()
     elif ctx.job_name == "worker":
         self.build_model()
         self.execute()
Example #6
    def _spark_train(args, ctx):
      """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
      import tensorflow as tf
      from tensorflowonspark import TFNode

      tf.reset_default_graph()                          # reset graph in case we're re-using a Spark python worker

      cluster, server = TFNode.start_cluster_server(ctx)
      if ctx.job_name == "ps":
        server.join()
      elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % ctx.task_index,
          cluster=cluster)):
          x = tf.placeholder(tf.float32, [None, 2], name='x')
          y_ = tf.placeholder(tf.float32, [None, 1], name='y_')
          w = tf.Variable(tf.truncated_normal([2,1]), name='w')
          y = tf.matmul(x, w, name='y')
          y2 = tf.square(y, name="y2")                      # extra/optional output for testing multiple output tensors
          cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
          optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(cost)
          init_op = tf.global_variables_initializer()
          saver = tf.train.Saver()

        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                init_op=init_op)
        with sv.managed_session(server.target) as sess:
          tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
          while not sv.should_stop() and not tf_feed.should_stop():
            batch = tf_feed.next_batch(10)
            if args.input_mapping:
              if len(batch['x']) > 0:
                feed = { x: batch['x'], y_: batch['y_'] }
                opt = sess.run(optimizer, feed_dict=feed)

          if sv.is_chief:
            if args.model_dir:
              # manually save checkpoint
              ckpt_name = args.model_dir + "/model.ckpt"
              print("Saving checkpoint to: {}".format(ckpt_name))
              saver.save(sess, ckpt_name)
            elif args.export_dir:
              # export a saved_model
              signatures = {
                'test_key': {
                  'inputs': { 'features': x },
                  'outputs': { 'prediction': y },
                  'method_name': 'test'
                }
              }
              TFNode.export_saved_model(sess, export_dir=args.export_dir, tag_set='test_tag', signatures=signatures)
            else:
              print("WARNING: model state not saved.")

        sv.stop()
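For context, a map_fun like _spark_train above is launched from the Spark driver with TFCluster in InputMode.SPARK. A hedged sketch of that driver-side code (the names dataRDD and args.cluster_size, and the single-ps layout, are assumptions):

from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, _spark_train, args, args.cluster_size, num_ps=1,
                        tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
cluster.train(dataRDD)   # feeds Spark partitions into each worker's DataFeed
cluster.shutdown()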
Example #7
 def __call__(self, args, ctx):
     self.task_index = ctx.task_index
     self.job_name = ctx.job_name
     self.cluster, self.server = TFNode.start_cluster_server(ctx)
     self.tf_feed = TFNode.DataFeed(ctx.mgr)
     if ctx.job_name == "ps":
         self.server.join()
     elif ctx.job_name == "worker":
         self.create_tmp_dir()
         self.process()
         self.delete_tmp_dir()
Example #8
def export_fun(args):
  """Define/export a single-node TF graph for inferencing"""
  # Input placeholder for inferencing
  x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")

  # Variables of the hidden layer
  hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                      stddev=1.0 / IMAGE_PIXELS), name="hid_w")
  hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
  tf.summary.histogram("hidden_weights", hid_w)

  # Variables of the softmax layer
  sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                     stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
  sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

  hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
  hid = tf.nn.relu(hid_lin)
  y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
  prediction = tf.argmax(y, 1, name="prediction")

  saver = tf.train.Saver()

  with tf.Session() as sess:
    # load graph from a checkpoint
    logging.info("model path: {}".format(args.model_dir))
    ckpt = tf.train.get_checkpoint_state(args.model_dir)
    logging.info("ckpt: {}".format(ckpt))
    assert ckpt and ckpt.model_checkpoint_path, "Invalid model checkpoint path: {}".format(args.model_dir)
    saver.restore(sess, ckpt.model_checkpoint_path)

    logging.info("Exporting saved_model to: {}".format(args.export_dir))
    # exported signatures defined in code
    signatures = {
      tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
        'inputs': {'image': x},
        'outputs': {'prediction': prediction},
        'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
      },
      'featurize': {
        'inputs': {'image': x},
        'outputs': {'features': hid},
        'method_name': 'featurize'
      }
    }
    TFNode.export_saved_model(sess,
                              args.export_dir,
                              tf.saved_model.tag_constants.SERVING,
                              signatures)
    logging.info("Exported saved_model")
Example #9
    def test_datafeed(self):
        mgr = TFManager.start('abc', ['input', 'output'], 'local')

        # insert 10 numbers followed by an end-of-feed marker
        q = mgr.get_queue('input')
        for i in range(10):
            q.put(i)
        q.put(None)

        feed = TFNode.DataFeed(mgr)

        # [0,1]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(2)
        self.assertEqual(2, len(batch))
        self.assertEqual(1, sum(batch))

        # [2,3,4,5]
        batch = feed.next_batch(4)
        self.assertEqual(4, len(batch))
        self.assertEqual(14, sum(batch))

        # [6,7,8,9]
        batch = feed.next_batch(10)
        self.assertEqual(4, len(batch))
        self.assertEqual(30, sum(batch))

        # should be done
        self.assertTrue(feed.should_stop())
Example #10
 def feed_dict(mgr, batch_size):
     tmp = TFNode.next_batch(mgr, batch_size)
     # extract TFRecords, since tmp array is [(TFRecord, None)]
     tfrecords = []
     for elem in tmp:
         tfrecords.append(str(elem[0]))
     return tfrecords
Example #11
    def test_datafeed(self):
        """TFNode.DataFeed basic operations"""
        mgr = TFManager.start('abc', ['input', 'output'], 'local')

        # insert 10 numbers followed by an end-of-feed marker
        q = mgr.get_queue('input')
        for i in range(10):
            q.put(i)
        q.put(None)  # end-of-feed marker

        feed = TFNode.DataFeed(mgr)

        # [0,1]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(2)
        self.assertEqual(len(batch), 2)
        self.assertEqual(sum(batch), 1)

        # [2,3,4,5]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(4)
        self.assertEqual(len(batch), 4)
        self.assertEqual(sum(batch), 14)

        # [6,7,8,9]
        self.assertFalse(feed.done_feeding)
        batch = feed.next_batch(10)  # ask for more than available
        self.assertEqual(len(batch), 4)
        self.assertEqual(sum(batch), 30)

        # should be done
        self.assertTrue(feed.should_stop())
Example #12
    def test_hdfs_path(self):
        """Normalization of absolute & relative string paths depending on filesystem"""
        cwd = os.getcwd()
        user = getpass.getuser()
        fs = ["file://", "hdfs://", "viewfs://"]
        paths = {
            "hdfs://foo/bar":
            ["hdfs://foo/bar", "hdfs://foo/bar", "hdfs://foo/bar"],
            "viewfs://foo/bar":
            ["viewfs://foo/bar", "viewfs://foo/bar", "viewfs://foo/bar"],
            "file://foo/bar":
            ["file://foo/bar", "file://foo/bar", "file://foo/bar"],
            "/foo/bar":
            ["file:///foo/bar", "hdfs:///foo/bar", "viewfs:///foo/bar"],
            "foo/bar": [
                "file://{}/foo/bar".format(cwd),
                "hdfs:///user/{}/foo/bar".format(user),
                "viewfs:///user/{}/foo/bar".format(user)
            ],
        }

        for i in range(len(fs)):
            ctx = type('MockContext', (), {
                'defaultFS': fs[i],
                'working_dir': cwd
            })
            for path, expected in paths.items():
                final_path = TFNode.hdfs_path(ctx, path)
                self.assertEqual(
                    final_path, expected[i],
                    "fs({}) + path({}) => {}, expected {}".format(
                        fs[i], path, final_path, expected[i]))
Example #13
 def feed_dict(mgr, batch_size):
   tmp = TFNode.next_batch(mgr, batch_size)
   # extract TFRecords, since tmp array is [(TFRecord, None)]
   tfrecords = []
   for elem in tmp:
     tfrecords.append(str(elem[0]))
   return tfrecords
Example #14
        def _tf_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.TENSORFLOW"""
            import tensorflow as tf
            from tensorflowonspark import TFNode

            tf.reset_default_graph()  # reset graph in case we're re-using a Spark python worker

            cluster, server = TFNode.start_cluster_server(ctx)

            def _get_examples(batch_size):
                """Generate test data (mocking a queue_runner of file inputs)"""
                features = tf.random_uniform([batch_size,
                                              2])  # (batch_size x 2)
                weights = tf.constant([[3.14], [1.618]])  # (2, 1)
                labels = tf.matmul(features, weights)
                return features, labels

            if ctx.job_name == "ps":
                server.join()
            elif ctx.job_name == "worker":
                with tf.device(
                        tf.train.replica_device_setter(
                            worker_device="/job:worker/task:%d" %
                            ctx.task_index,
                            cluster=cluster)):
                    x, y_ = _get_examples(10)  # no input placeholders, TF code reads (or in this case "generates") input
                    w = tf.Variable(tf.truncated_normal([2, 1]), name='w')
                    y = tf.matmul(x, w, name='y')
                    global_step = tf.Variable(0)

                    cost = tf.reduce_mean(tf.square(y_ - y), name='cost')
                    optimizer = tf.train.GradientDescentOptimizer(
                        0.5).minimize(cost, global_step)

                    init_op = tf.global_variables_initializer()
                    saver = tf.train.Saver()

                sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                         init_op=init_op)
                step = 0
                with sv.managed_session(server.target) as sess:
                    while not sv.should_stop() and step < args.steps:
                        opt, weights, step = sess.run(
                            [optimizer, w, global_step])
                        if (step % 100 == 0):
                            print("step: {}, weights: {}".format(
                                step, weights))

                    if sv.is_chief:
                        if args.model_dir:
                            # manually save checkpoint
                            ckpt_name = args.model_dir + "/model.ckpt"
                            print("Saving checkpoint to: {}".format(ckpt_name))
                            saver.save(sess, ckpt_name)
                sv.stop()
Example #15
        def _map_fun(args, ctx):
            import tensorflow as tf

            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(10)
                if len(batch) > 0:
                    squares = tf.math.square(batch)
                    tf_feed.batch_results(squares.numpy())
                    raise Exception("FAKE exception during feeding")
Example #16
        def _map_fun(args, ctx):
            import tensorflow as tf

            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(batch_size=10)
                print("batch: {}".format(batch))
                squares = tf.math.square(batch)
                print("squares: {}".format(squares))
                tf_feed.batch_results(squares.numpy())
Example #17
 def end(self, session):
     print("{} ======= Exporting to: {}".format(
         datetime.now().isoformat(), self.export_dir))
     signatures = {
         "test_key": {
             'inputs': {
                 'features': self.input_tensor
             },
             'outputs': {
                 'prediction': self.output_tensor
             },
             'method_name':
             tf.saved_model.signature_constants.PREDICT_METHOD_NAME
         }
     }
     TFNode.export_saved_model(session, self.export_dir,
                               "test_tag", signatures)
     print("{} ======= Done exporting".format(
         datetime.now().isoformat()))
Example #18
        def _map_fun(args, ctx):
            import tensorflow as tf

            tf_feed = TFNode.DataFeed(ctx.mgr, False)
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(10)
                if len(batch) > 0:
                    squares = tf.math.square(batch)
                    tf_feed.batch_results(squares.numpy())

            # simulate post-feed actions that raise an exception
            time.sleep(2)
            raise Exception("FAKE exception after feeding")
Example #19
    def end(self, session):
        logging.info("{} ======= Exporting to: {}".format(
            datetime.now().isoformat(), self.export_dir))
        signatures = {
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            {
                'inputs': {
                    'image': self.input_tensor
                },
                'outputs': {
                    'prediction': self.output_tensor
                },
                'method_name':
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME
            }
        }

        TFNode.export_saved_model(session,
                                  self.export_dir + '_' + str(random.random()),
                                  tf.saved_model.tag_constants.SERVING,
                                  signatures)
        logging.info("{} ====== Done exporting".format(
            datetime.now().isoformat()))
Example #20
 def _map_fun(args, ctx):
     import tensorflow as tf
     cluster, server = TFNode.start_cluster_server(ctx)
     if ctx.job_name == "ps":
         server.join()
     elif ctx.job_name == "worker":
         with tf.device(
                 tf.train.replica_device_setter(
                     worker_device="/job:worker/task:%d" %
                     ctx.task_index,
                     cluster=cluster)):
             x = tf.placeholder(tf.int32, [None, 1])
             sq = tf.square(x)
             init_op = tf.global_variables_initializer()
         sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                  init_op=init_op)
         with sv.managed_session(server.target) as sess:
             tf_feed = TFNode.DataFeed(ctx.mgr, False)
             while not sv.should_stop() and not tf_feed.should_stop():
                 outputs = sess.run(
                     [sq], feed_dict={x: tf_feed.next_batch(10)})
                 tf_feed.batch_results(outputs[0])
         sv.stop()
Example #21
def main(_):
  # restore graph/session from checkpoint
  sess = tf.Session(graph=tf.get_default_graph())
  ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
  saver = tf.train.import_meta_graph(ckpt + '.meta', clear_devices=True)
  saver.restore(sess, ckpt)
  g = sess.graph

  # if --show, dump out all operations in this graph
  if FLAGS.show:
    for o in g.get_operations():
      print("{:>64}\t{}".format(o.name, o.type))

  if FLAGS.export_dir and FLAGS.signatures:
    # load/parse JSON signatures
    if ':' in FLAGS.signatures:
      # assume JSON string, since unix filenames shouldn't contain colons
      signatures = json.loads(FLAGS.signatures)
    else:
      # assume JSON file
      with open(FLAGS.signatures) as f:
        signatures = json.load(f)

    # convert string input/output values with actual tensors from graph
    for name, sig in signatures.items():
      for k, v in sig['inputs'].items():
        tensor_name = v if v.endswith(':0') else v + ':0'
        sig['inputs'][k] = g.get_tensor_by_name(tensor_name)
      for k, v in sig['outputs'].items():
        tensor_name = v if v.endswith(':0') else v + ':0'
        sig['outputs'][k] = g.get_tensor_by_name(tensor_name)

    # export a saved model
    TFNode.export_saved_model(sess,
                              FLAGS.export_dir,
                              tf.saved_model.tag_constants.SERVING,
                              signatures)
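For illustration, a --signatures value that the parsing code above would accept could look like the following (the tensor names 'x' and 'prediction' are assumptions about the restored graph; 'serving_default' and 'tensorflow/serving/predict' are the standard TF 1.x signature constants spelled out as strings):

# hypothetical contents of a signatures.json file passed via --signatures
SIGNATURES_JSON = """
{
  "serving_default": {
    "inputs":  {"image": "x"},
    "outputs": {"prediction": "prediction"},
    "method_name": "tensorflow/serving/predict"
  }
}
"""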
Example #22
def save_model(sess, args, x, prediction):
    """Save and export the model."""

    pb_folder_dir = args.export_dir + constants.PATH_SEP + constants.PB_FOLDER_NAME
    # exported signatures defined in code
    signatures = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: {
            "inputs": {constants.SIG_INPUT: x},
            "outputs": {constants.SIG_OUTPUT: prediction},
            "method_name": tf.saved_model.signature_constants.PREDICT_METHOD_NAME
        }
    }
    TFNode.export_saved_model(sess,
                              pb_folder_dir,
                              tf.saved_model.tag_constants.SERVING,
                              signatures)

    # Convert to a single .pb file
    t = Thread(target=tensorflow_utils.convert_as_single_pb,
               args=[pb_folder_dir,
                     constants.PREDICT_NODE_NAME,
                     args.export_dir + constants.PATH_SEP + constants.PB_NAME])
    t.start()
    t.join()
Example #23
def main_fun(argv, ctx):
    import tensorflow as tf
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    cluster_spec, server = TFNode.start_cluster_server(ctx)
    # if job_name == "ps":
    #   time.sleep((worker_num + 1) * 5)
    #
    # if job_name == "ps":
    #   server.join()
    # elif job_name == "worker":
    hello = tf.constant('Hello, TensorFlow!')
    sess = tf.Session()
    print(sess.run(hello))
Example #24
  def test_hdfs_path(self):
    cwd = os.getcwd()
    user = getpass.getuser()
    fs = ["file://", "hdfs://", "viewfs://"]
    paths = {
      "hdfs://foo/bar": ["hdfs://foo/bar", "hdfs://foo/bar", "hdfs://foo/bar"],
      "viewfs://foo/bar": ["viewfs://foo/bar", "viewfs://foo/bar", "viewfs://foo/bar"],
      "file://foo/bar": ["file://foo/bar", "file://foo/bar", "file://foo/bar"],
      "/foo/bar": ["file:///foo/bar", "hdfs:///foo/bar", "viewfs:///foo/bar"],
      "foo/bar": ["file://{}/foo/bar".format(cwd), "hdfs:///user/{}/foo/bar".format(user), "viewfs:///user/{}/foo/bar".format(user)],
    }

    for i in range(len(fs)):
      ctx = type('MockContext', (), {'defaultFS': fs[i], 'working_dir': cwd})
      for path, expected in paths.items():
        final_path = TFNode.hdfs_path(ctx, path)
        self.assertEqual(expected[i], final_path, "fs({}) + path({}) => {}, expected {}".format(fs[i], path, final_path, expected[i]))
Example #25
    def __init__(self, sc, data_dir, batch_size, seq_length, encoding='utf-8'):
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.encoding = encoding

        defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
        working_dir = os.getcwd()

        input_file = TFNode.hdfs_path(os.path.join(data_dir, "input.txt"),
                                      defaultFS, working_dir)

        print("reading text file")
        self.preprocess(input_file)

        self.create_batches()
        self.reset_batch_pointer()
Example #26
def main_fun(argv, ctx):
    from src import facenet_distributed_train
    from src import vipus_distributed_train
    import sys

    job_name = ctx.job_name
    assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'
    print("argv:", argv)
    sys.argv = argv

    cluster, server = TFNode.start_cluster_server(ctx, num_gpus=1)
    if job_name == 'ps':
        server.join()
    else:
        if argv.model == 'FACENET':
            facenet_distributed_train.train(server, ctx.cluster_spec, argv,
                                            ctx)
        elif argv.model == 'VIPUS':
            vipus_distributed_train.train(server, ctx.cluster_spec, argv, ctx)
Example #27
 def _map_fun(args, ctx):
   import tensorflow as tf
   cluster, server = TFNode.start_cluster_server(ctx)
   if ctx.job_name == "ps":
     server.join()
   elif ctx.job_name == "worker":
     with tf.device(tf.train.replica_device_setter(
       worker_device="/job:worker/task:%d" % ctx.task_index,
       cluster=cluster)):
       x = tf.placeholder(tf.int32, [None, 1])
       sq = tf.square(x)
       init_op = tf.global_variables_initializer()
     sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                             init_op=init_op)
     with sv.managed_session(server.target) as sess:
       tf_feed = TFNode.DataFeed(ctx.mgr, False)
       while not sv.should_stop() and not tf_feed.should_stop():
         outputs = sess.run([sq], feed_dict={ x: tf_feed.next_batch(10) })
         tf_feed.batch_results(outputs[0])
     sv.stop()
Example #28
def main_fun(argv, ctx):
  import tensorflow as tf
  from inception import inception_eval
  from inception.imagenet_data import ImagenetData

  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS._parse_flags()
  print("FLAGS:", FLAGS.__dict__['__flags'])

  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  inception_eval.evaluate(dataset)
Example #29
def main_fun(argv, ctx):

    # extract node metadata from ctx
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

    from inception import inception_distributed_train
    from inception.imagenet_data import ImagenetData
    import tensorflow as tf

    # instantiate FLAGS on workers using argv from driver and add job_name and task_id
    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS.job_name = job_name
    FLAGS.task_id = task_index
    print("FLAGS:", FLAGS.__dict__['__flags'])

    # Get TF cluster and server instances
    cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus,
                                                       FLAGS.rdma)

    if FLAGS.job_name == 'ps':
        # `ps` jobs wait for incoming connections from the workers.
        server.join()
    else:
        # `worker` jobs will actually do the work.
        dataset = ImagenetData(subset=FLAGS.subset)
        assert dataset.data_files()
        # Only the chief checks for or creates train_dir.
        if FLAGS.task_id == 0:
            if not tf.gfile.Exists(FLAGS.train_dir):
                tf.gfile.MakeDirs(FLAGS.train_dir)
        inception_distributed_train.train(server.target, dataset, cluster_spec,
                                          ctx)
Example #30
def main_fun(argv, ctx):

  # extract node metadata from ctx
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  from inception import inception_distributed_train
  from inception.imagenet_data import ImagenetData
  import tensorflow as tf

  # instantiate FLAGS on workers using argv from driver and add job_name and task_id
  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = job_name
  FLAGS.task_id = task_index
  print("FLAGS:", FLAGS.__dict__['__flags'])

  # Get TF cluster and server instances
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
Example #31
def train(target, dataset, cluster_spec, ctx):
  """Train Inception on a dataset for a number of steps."""
  # Number of workers and parameter servers are inferred from the workers and ps
  # hosts string.
  num_workers = len(cluster_spec.as_dict()['worker'])
  num_parameter_servers = len(cluster_spec.as_dict()['ps'])
  # If no value is given, num_replicas_to_aggregate defaults to the number of
  # workers.
  if FLAGS.num_replicas_to_aggregate == -1:
    num_replicas_to_aggregate = num_workers
  else:
    num_replicas_to_aggregate = FLAGS.num_replicas_to_aggregate

  # Both should be greater than 0 in a distributed training.
  assert num_workers > 0 and num_parameter_servers > 0, (' num_workers and '
                                                         'num_parameter_servers'
                                                         ' must be > 0.')

  # Choose worker 0 as the chief. Note that any worker could be the chief
  # but there should be only one chief.
  is_chief = (FLAGS.task_id == 0)

  # Ops are assigned to worker by default.
  with tf.device('/job:worker/task:%d' % FLAGS.task_id):
    # Variables and their related init/assign ops are assigned to ps.
    with slim.scopes.arg_scope(
        [slim.variables.variable, slim.variables.global_step],
        device=slim.variables.VariableDeviceChooser(num_parameter_servers)):
      # Create a variable to count the number of train() calls. This equals the
      # number of updates applied to the variables.
      global_step = slim.variables.global_step()

      # Calculate the learning rate schedule.
      num_batches_per_epoch = (dataset.num_examples_per_epoch() /
                               FLAGS.batch_size)
      # Decay steps need to be divided by the number of replicas to aggregate.
      decay_steps = int(num_batches_per_epoch * FLAGS.num_epochs_per_decay /
                        num_replicas_to_aggregate)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(FLAGS.initial_learning_rate,
                                      global_step,
                                      decay_steps,
                                      FLAGS.learning_rate_decay_factor,
                                      staircase=True)
      # Add a summary to track the learning rate.
      tf.summary.scalar('learning_rate', lr)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.RMSPropOptimizer(lr,
                                      RMSPROP_DECAY,
                                      momentum=RMSPROP_MOMENTUM,
                                      epsilon=RMSPROP_EPSILON)

      if FLAGS.input_mode == 'spark':
        def feed_dict(mgr, batch_size):
          tmp = TFNode.next_batch(mgr, batch_size)
          # extract TFRecords, since tmp array is [(TFRecord, None)]
          tfrecords = []
          for elem in tmp:
            tfrecords.append(str(elem[0]))
          return tfrecords

        batch = tf.placeholder(tf.string, [FLAGS.batch_size // FLAGS.num_preprocess_threads])

        # The following is adapted from image_processing.py to remove Readers/QueueRunners.
        # Note: this removes the RandomShuffledQueue, so the incoming data is not shuffled.
        # Presumably, this could be done on the Spark side or done in additional TF code.
        examples = tf.unpack(batch)
        images, labels = [], []
        for example_serialized in examples:
          for thread_id in range(FLAGS.num_preprocess_threads):
            # Parse a serialized Example proto to extract the image and metadata.
            image_buffer, label_index, bbox, _ = image_processing.parse_example_proto(example_serialized)
            image = image_processing.image_preprocessing(image_buffer, bbox, train, thread_id)
            images.append(image)
            labels.append(label_index)
        height = FLAGS.image_size
        width = FLAGS.image_size
        depth = 3
        images = tf.cast(images, tf.float32)
        images = tf.reshape(images, shape=[FLAGS.batch_size, height, width, depth])
        tf.summary.image('images', images)
        labels = tf.reshape(labels, [FLAGS.batch_size])
      else:
        images, labels = image_processing.distorted_inputs(
            dataset,
            batch_size=FLAGS.batch_size,
            num_preprocess_threads=FLAGS.num_preprocess_threads)

      # Number of classes in the Dataset label set plus 1.
      # Label 0 is reserved for an (unused) background class.
      num_classes = dataset.num_classes() + 1
      logits = inception.inference(images, num_classes, for_training=True)
      # Add classification loss.
      inception.loss(logits, labels)

      # Gather all of the losses including regularization losses.
      losses = tf.get_collection(slim.losses.LOSSES_COLLECTION)
      losses += tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

      total_loss = tf.add_n(losses, name='total_loss')

      if is_chief:
        # Compute the moving average of all individual losses and the
        # total loss.
        loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
        loss_averages_op = loss_averages.apply(losses + [total_loss])

        # Attach a scalar summary to all individual losses and the total loss;
        # do the same for the averaged version of the losses.
        for l in losses + [total_loss]:
          loss_name = l.op.name
          # Name each loss as '(raw)' and name the moving average version of the
          # loss as the original loss name.
          tf.summary.scalar(loss_name + ' (raw)', l)
          tf.summary.scalar(loss_name, loss_averages.average(l))

        # Add dependency to compute loss_averages.
        with tf.control_dependencies([loss_averages_op]):
          total_loss = tf.identity(total_loss)

      # Track the moving averages of all trainable variables.
      # Note that we maintain a 'double-average' of the BatchNormalization
      # global statistics.
      # This is not needed when the number of replicas is small but important
      # for synchronous distributed training with tens of workers/replicas.
      exp_moving_averager = tf.train.ExponentialMovingAverage(
          inception.MOVING_AVERAGE_DECAY, global_step)

      variables_to_average = (
          tf.trainable_variables() + tf.moving_average_variables())

      # Add histograms for model variables.
      for var in variables_to_average:
        tf.summary.histogram(var.op.name, var)

      # Create synchronous replica optimizer.
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate=num_replicas_to_aggregate,
          total_num_replicas=num_workers,
          variable_averages=exp_moving_averager,
          variables_to_average=variables_to_average)

      batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION)
      assert batchnorm_updates, 'Batchnorm updates are missing'
      batchnorm_updates_op = tf.group(*batchnorm_updates)
      # Add dependency to compute batchnorm_updates.
      with tf.control_dependencies([batchnorm_updates_op]):
        total_loss = tf.identity(total_loss)

      # Compute gradients with respect to the loss.
      grads = opt.compute_gradients(total_loss)

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          tf.summary.histogram(var.op.name + '/gradients', grad)

      apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

      with tf.control_dependencies([apply_gradients_op]):
        train_op = tf.identity(total_loss, name='train_op')

      # Get chief queue_runners and init_tokens, which are used to
      # synchronize replicas.
      # More details can be found in sync_replicas_optimizer.
      chief_queue_runners = [opt.get_chief_queue_runner()]
      init_tokens_op = opt.get_init_tokens_op()

      # Create a saver.
      saver = tf.train.Saver()

      # Build the summary operation based on the TF collection of Summaries.
      summary_op = tf.summary.merge_all()

      # Build an initialization operation to run below.
      init_op = tf.global_variables_initializer()

      # We run the summaries in the same thread as the training operations by
      # passing in None for summary_op to avoid a summary_thread being started.
      # Running summaries and training operations in parallel could run out of
      # GPU memory.
      summary_writer = tf.summary.FileWriter("tensorboard_%d" %(ctx.worker_num), graph=tf.get_default_graph())
      sv = tf.train.Supervisor(is_chief=is_chief,
                               logdir=FLAGS.train_dir,
                               init_op=init_op,
                               summary_op=None,
                               global_step=global_step,
                               summary_writer=summary_writer,
                               saver=saver,
                               save_model_secs=FLAGS.save_interval_secs)

      tf.logging.info('%s Supervisor' % datetime.now())

      sess_config = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement)

      # Get a session.
      sess = sv.prepare_or_wait_for_session(target, config=sess_config)

      # Start the queue runners.
      queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
      sv.start_queue_runners(sess, queue_runners)
      tf.logging.info('Started %d queues for processing input data.',
                      len(queue_runners))

      if is_chief:
        sv.start_queue_runners(sess, chief_queue_runners)
        sess.run(init_tokens_op)

      # Train, checking for NaNs. Concurrently run the summary operation at a
      # specified interval. Note that the summary_op and train_op never run
      # simultaneously in order to prevent running out of GPU memory.
      next_summary_time = time.time() + FLAGS.save_summaries_secs
      while not sv.should_stop():
        try:
          start_time = time.time()
          if FLAGS.input_mode == 'spark':
            tmp = feed_dict(ctx.mgr, FLAGS.batch_size // FLAGS.num_preprocess_threads)
            feed = {batch: tmp}
            loss_value, step = sess.run([train_op, global_step], feed_dict=feed)
          else:
            loss_value, step = sess.run([train_op, global_step])
          assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
          if step > FLAGS.max_steps:
            break
          duration = time.time() - start_time

          if step % 30 == 0:
            examples_per_sec = FLAGS.batch_size / float(duration)
            format_str = ('Worker %d: %s: step %d, loss = %.2f '
                          '(%.1f examples/sec; %.3f sec/batch)')
            tf.logging.info(format_str %
                            (FLAGS.task_id, datetime.now(), step, loss_value,
                             examples_per_sec, duration))

          # Determine if the summary_op should be run on the chief worker.
          if FLAGS.input_mode == 'tf' and is_chief and next_summary_time < time.time():
            tf.logging.info('Running Summary operation on the chief.')
            summary_str = sess.run(summary_op)
            sv.summary_computed(sess, summary_str)
            tf.logging.info('Finished running Summary operation.')

            # Determine the next time for running the summary.
            next_summary_time += FLAGS.save_summaries_secs
        except:
          if is_chief:
            tf.logging.info('About to execute sync_clean_up_op!')
          raise

      # Stop the TFNode data feed
      if FLAGS.input_mode == 'spark':
        TFNode.terminate(ctx.mgr)

      # Stop the supervisor.  This also waits for service threads to finish.
      sv.stop()

      # Save after the training ends.
      if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
Example #32
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import getpass
  import math
  import numpy
  import os
  import signal
  import tensorflow as tf
  import time

  IMAGE_PIXELS=28
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec
  num_workers = len(cluster_spec['worker'])

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size   = 100

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def read_csv_examples(image_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))
    # Setup queue of csv image filenames
    tf_record_pattern = os.path.join(image_dir, 'part-*')
    images = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "images: {0}".format(images))
    image_queue = tf.train.string_input_producer(images, shuffle=False, capacity=1000, num_epochs=num_epochs, name="image_queue")

    # Setup queue of csv label filenames
    tf_record_pattern = os.path.join(label_dir, 'part-*')
    labels = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "labels: {0}".format(labels))
    label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000, num_epochs=num_epochs, name="label_queue")

    # Setup reader for image queue
    img_reader = tf.TextLineReader(name="img_reader")
    _, img_csv = img_reader.read(image_queue)
    image_defaults = [ [1.0] for col in range(784) ]
    img = tf.pack(tf.decode_csv(img_csv, image_defaults))
    # Normalize values to [0,1]
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(img, norm)
    print_log(worker_num, "image: {0}".format(image))

    # Setup reader for label queue
    label_reader = tf.TextLineReader(name="label_reader")
    _, label_csv = label_reader.read(label_queue)
    label_defaults = [ [1.0] for col in range(10) ]
    label = tf.pack(tf.decode_csv(label_csv, label_defaults))
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch_csv")

  def read_tfr_examples(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of TFRecord filenames
    tf_record_pattern = os.path.join(path, 'part-*')
    files = tf.gfile.Glob(tf_record_pattern)
    queue_name = "file_queue"

    # split input files across workers, if specified
    if task_index is not None and num_workers is not None:
      num_files = len(files)
      files = files[task_index:num_files:num_workers]
      queue_name = "file_queue_{0}".format(task_index)

    print_log(worker_num, "files: {0}".format(files))
    file_queue = tf.train.string_input_producer(files, shuffle=False, capacity=1000, num_epochs=num_epochs, name=queue_name)

    # Setup reader for examples
    reader = tf.TFRecordReader(name="reader")
    _, serialized = reader.read(file_queue)
    feature_def = {'label': tf.FixedLenFeature([10], tf.int64), 'image': tf.FixedLenFeature([784], tf.int64) }
    features = tf.parse_single_example(serialized, feature_def)
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    print_log(worker_num, "image: {0}".format(image))
    label = tf.to_float(features['label'])
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch")

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
      index = task_index if args.mode == "inference" else None
      workers = num_workers if args.mode == "inference" else None

      if args.format == "csv":
        images = TFNode.hdfs_path(ctx, args.images)
        labels = TFNode.hdfs_path(ctx, args.labels)
        x, y_ = read_csv_examples(images, labels, 100, num_epochs, index, workers)
      elif args.format == "tfr":
        images = TFNode.hdfs_path(ctx, args.images)
        x, y_ = read_tfr_examples(images, 100, num_epochs, index, workers)
      else:
        raise ValueError("{0} format not supported for tf input mode".format(args.format))

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)
      output_dir = TFNode.hdfs_path(ctx, args.output)
      output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      count = 0
      while not sv.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using QueueRunners/Readers
        if args.mode == "train":
          if (step % 100 == 0):
            print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
          _, summary, step = sess.run([train_op, summary_op, global_step])
          if sv.is_chief:
            summary_writer.add_summary(summary, step)
        else: # args.mode == "inference"
          labels, pred, acc = sess.run([label, prediction, accuracy])
          #print("label: {0}, pred: {1}".format(labels, pred))
          print("acc: {0}".format(acc))
          for i in range(len(labels)):
            count += 1
            output_file.write("{0} {1}\n".format(labels[i], pred[i]))
          print("count: {0}".format(count))

    if args.mode == "inference":
      output_file.close()
      # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
      # run inference and request stop before the other workers even start/sync their sessions.
      if task_index == 0:
        time.sleep(60)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
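For context, a map_fun like the one above (InputMode.TENSORFLOW, with workers reading CSV/TFRecord files from HDFS themselves) is typically launched from the Spark driver without a feeding RDD; a hedged sketch (argument names such as args.cluster_size and num_ps are assumptions):

from tensorflowonspark import TFCluster

cluster = TFCluster.run(sc, map_fun, args, args.cluster_size, num_ps,
                        tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW)
# no cluster.train()/cluster.inference() call here: the workers read their own
# input via QueueRunners/Readers instead of a Spark DataFeed
cluster.shutdown()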
Example #33
def map_fun(args, ctx):
  # from com.yahoo.ml.tf import TFNode
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num  # number of workers
  job_name = ctx.job_name  # job name
  task_index = ctx.task_index  # task index
  cluster_spec = ctx.cluster_spec  # cluster spec

  IMAGE_PIXELS=10  # image size; mnist is 28x28x1 (adjust later to match your own image size)
  channels=3
  num_class=2
  dropout = 0.5

  learning_rate=1e-6
  # Parameters
  hidden_units = 128  # NN hidden-layer size
  training_epochs=args.epochs
  img_nums=630000
  #batch_size   = args.batch_size  # number of samples per training batch
  batch_size=200
  """
  # ---------设置动态学习效率
  # Constants describing the training process.
  # MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
  NUM_EPOCHS_PER_DECAY = batch_size  # Epochs after which learning rate decays.
  LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
  INITIAL_LEARNING_RATE = 0.1  # Initial learning rate.

  global_step1 = training_epochs * (img_nums // batch_size)  # Integer Variable counting the number of training steps
  # Variables that affect learning rate.
  num_batches_per_epoch = img_nums / batch_size
  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

  # Decay the learning rate exponentially based on the number of steps.
  learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                            global_step1,
                                            decay_steps,
                                            LEARNING_RATE_DECAY_FACTOR,
                                            staircase=True)
  # End of decaying-learning-rate setup ----------
  """
  
  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":  # ps (parameter server) node
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    numpy.random.shuffle(batch)  # shuffle randomly
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    #xs = xs/255.0  # normalize data
    # Z-score standardization
    #mean = numpy.reshape(numpy.average(xs, 1), [numpy.shape(xs)[0], 1])
    #std = numpy.reshape(numpy.std(xs, 1), [numpy.shape(xs)[0], 1])
    #xs = (xs - mean) / std

    # Min-Max normalization
    max_=numpy.reshape(numpy.max(xs,1),[numpy.shape(xs)[0], 1])
    min_ = numpy.reshape(numpy.min(xs, 1), [numpy.shape(xs)[0], 1])

    xs=(xs-min_)/(max_-min_)
    
    
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Create some wrappers for simplicity
      def conv2d(x, W, b, strides=1):
        # Conv2D wrapper, with bias and relu activation
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)  # middle two stride values of 1 mean no skipping in the x and y directions
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')  # middle two stride values of 2 mean sampling every other pixel in x and y

      def maxpool2d2(x, k=2):
        # MaxPool2D wrapper
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='VALID')  # middle two stride values of 2 mean sampling every other pixel in x and y

      # Store layers weight & bias
      weights = {
          # 3x3 conv, channels inputs, 64 outputs; color images have 3 input channels, grayscale images have 1
          'wc1': tf.get_variable('wc1',[3,3,channels,64],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),  # 3x3 convolution kernel

          # 3x3 conv, 64 inputs, 128 outputs
          'wc2': tf.get_variable('wc2',[3,3,64,128],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'wc3': tf.Variable(tf.random_normal([3, 3, 256, 128])),
          'wc4': tf.get_variable('wc4',[3,3,128,num_class],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # fully connected, 7*7*64 inputs, 1024 outputs
          # 'wd1': tf.Variable(tf.random_normal([(1+IMAGE_PIXELS // 4) * (1+IMAGE_PIXELS // 4) * 64, 1024])),
          # 1024 inputs, 10 outputs (class prediction)
          # 'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
          'bc1': tf.get_variable('bc1',[64],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          'bc2': tf.get_variable('bc2',[128],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'bc3': tf.Variable(tf.random_normal([128])),
          'bc4': tf.get_variable('bc4',[num_class],dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer,regularizer=tf.nn.l2_loss),
          # 'bd1': tf.Variable(tf.random_normal([1024])),
          # 'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")  # mnist 28*28*1
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
      # keep=tf.placeholder(tf.float32)

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])  # MNIST-style data: 28x28x1 (grayscale, single band)
      # tf.summary.image("x_img", x_img)

      # switched to a convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      # conv1 = tf.nn.dropout(conv1, keep)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      conv2 = tf.nn.dropout(conv2, dropout)
      # conv3 = conv2d(conv2, weights['wc3'], biases['bc3'])
      # conv3 = tf.nn.dropout(conv3, keep)
      conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
      conv4 = maxpool2d2(conv4, k=2)
      y = tf.reshape(conv4, [-1, num_class])
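      # Aside (not in the original code): the reshape above only yields one
      # num_class-length row per image if conv4 is 1x1 spatially after the
      # final VALID max-pool; otherwise each image contributes several rows
      # to y and the cross-entropy/label shapes will not line up.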


      # fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      # fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      # fc1 = tf.nn.relu(fc1)
      # if args.mode == "train" or args.mode == "retrain":
      #   fc1 = tf.nn.dropout(fc1, dropout)
      # y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

 
      # global_step = tf.Variable(0)

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

      # tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
          loss, global_step=global_step)


      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      # tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      # summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()


    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir)) #
    # log.info("tensorflow model path: {0}".format(logdir))
    # summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # init_op=init_op,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               # summary_op=None,
                               saver=saver,
                               # recovery_wait_secs=1,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:  # open the managed session

      print("{0} session ready".format(datetime.now().isoformat()))
      # log.info("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train" or args.mode == "retrain")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train" or args.mode == "retrain":
            # _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            _, step = sess.run([train_op,  global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
              # log.info("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              pass
              # summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            # log.info("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
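For context, a worker function like the one above is typically launched from the Spark driver through TFCluster with InputMode.SPARK, so that cluster.train(...) or cluster.inference(...) streams RDD partitions into the TFNode.DataFeed queues read by tf_feed. The following is only a rough driver-side sketch: the names map_fun, args (an argparse namespace) and images_labels_rdd are assumptions, not taken from the example above.

# Driver-side sketch (assumed names: map_fun, args, images_labels_rdd)
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from tensorflowonspark import TFCluster

sc = SparkContext(conf=SparkConf().setAppName("mnist_cnn_spark"))
num_executors = int(sc._conf.get("spark.executor.instances", "3"))
num_ps = 1

cluster = TFCluster.run(sc, map_fun, args, num_executors, num_ps,
                        tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
if args.mode in ("train", "retrain"):
    cluster.train(images_labels_rdd, args.epochs)    # feeds (image, label) pairs to the workers
else:
    preds = cluster.inference(images_labels_rdd)     # collects tf_feed.batch_results(...)
    preds.saveAsTextFile(args.output)
cluster.shutdown()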
Ejemplo n.º 34
0
        def _spark_train(args, ctx):
            """Basic linear regression in a distributed TF cluster using InputMode.SPARK"""
            import tensorflow as tf
            from tensorflow.keras import Sequential
            from tensorflow.keras.layers import Dense
            from tensorflowonspark import TFNode

            tf.compat.v1.reset_default_graph()
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

            with strategy.scope():
                model = Sequential()
                model.add(Dense(1, activation='linear', input_shape=[2]))
                model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2),
                              loss='mse',
                              metrics=['mse'])
                model.summary()

            tf_feed = TFNode.DataFeed(ctx.mgr,
                                      input_mapping=args.input_mapping)

            def rdd_generator():
                while not tf_feed.should_stop():
                    batch = tf_feed.next_batch(1)
                    if len(batch['x']) > 0:
                        features = batch['x'][0]
                        label = batch['y_'][0]
                        yield (features, label)
                    else:
                        return

            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([2]), tf.TensorShape([1])))
            ds = ds.batch(args.batch_size)

            # disable auto-sharding dataset
            options = tf.data.Options()
            options.experimental_distribute.auto_shard = False
            ds = ds.with_options(options)

            # only train 90% of each epoch to account for uneven RDD partition sizes
            steps_per_epoch = 1000 * 0.9 // (args.batch_size * ctx.num_workers)

            tf.io.gfile.makedirs(args.model_dir)
            filepath = args.model_dir + "/weights-{epoch:04d}"
            callbacks = [
                tf.keras.callbacks.ModelCheckpoint(
                    filepath=filepath,
                    verbose=1,
                    load_weights_on_restart=True,
                    save_weights_only=True)
            ]

            model.fit(ds,
                      epochs=args.epochs,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks)

            # This fails with: "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy"
            # model.fit_generator(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

            if ctx.job_name == 'chief' and args.export_dir:
                print("exporting model to: {}".format(args.export_dir))
                tf.keras.experimental.export_saved_model(
                    model, args.export_dir)

            tf_feed.terminate()
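Note that TFNode.DataFeed(ctx.mgr, input_mapping=...) hands each batch back as a dict keyed by feed name, which is why rdd_generator above indexes batch['x'] and batch['y_']. A minimal sketch of the mapping the driver might supply (the column names 'c0'/'c1' are hypothetical):

# Hypothetical driver-side mapping from input columns to feed names
args.input_mapping = {'c0': 'x', 'c1': 'y_'}

# With that mapping, each tf_feed.next_batch(1) in the worker is roughly:
#   {'x': [array([x1, x2], dtype=float32)], 'y_': [array([y], dtype=float32)]}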
Ejemplo n.º 35
0
def main_fun(args, ctx):
    import numpy as np
    import tensorflow as tf
    import tensorflow_datasets as tfds
    from tensorflowonspark import TFNode

    tfds.disable_progress_bar()

    BUFFER_SIZE = args.buffer_size
    BATCH_SIZE = args.batch_size
    LEARNING_RATE = args.learning_rate

    tf_feed = TFNode.DataFeed(ctx.mgr)

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    def input_fn(mode, input_context=None):
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Note: Spark is responsible for feeding data via streaming RDD
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
            return ds.batch(BATCH_SIZE)
        else:
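            # Note: this raise makes the tfds-based eval code below unreachable;
            # in this example only the TRAIN branch is fed (from the Spark RDD).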
            raise Exception("I'm evaluating: mode={}, input_context={}".format(
                mode, input_context))

            def scale(image, label):
                image = tf.cast(image, tf.float32) / 255.0
                return image, label

            mnist = tfds.load(name='mnist', with_info=True, as_supervised=True)
            ds = mnist['test']
            if input_context:
                ds = ds.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)
            return ds.map(scale).batch(BATCH_SIZE)

    def serving_input_receiver_fn():
        features = tf.compat.v1.placeholder(dtype=tf.float32,
                                            shape=[None, 28, 28, 1],
                                            name='features')
        receiver_tensors = {'features': features}
        return tf.estimator.export.ServingInputReceiver(
            receiver_tensors, receiver_tensors)

    def model_fn(features, labels, mode):
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        logits = model(features, training=False)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'logits': logits}
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        optimizer = tf.compat.v1.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels,
                                                                        logits)
        loss = tf.reduce_sum(input_tensor=loss) * (1. / BATCH_SIZE)
        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=optimizer.minimize(
                loss, tf.compat.v1.train.get_or_create_global_step()))

    # Note: the original example used MultiWorkerMirroredStrategy which is a synchronous training strategy.
    # Since streaming data arrives irregularly, we must use the asynchronous ParameterServerStrategy
    # to allow data to be processed as it arrives and to avoid deadlocks.
    # strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    strategy = tf.distribute.experimental.ParameterServerStrategy()
    config = tf.estimator.RunConfig(train_distribute=strategy,
                                    save_checkpoints_steps=100)

    classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=args.model_dir,
                                        config=config)

    # exporter = tf.estimator.FinalExporter("serving", serving_input_receiver_fn=serving_input_receiver_fn)

    tf.estimator.train_and_evaluate(
        classifier,
        train_spec=tf.estimator.TrainSpec(input_fn=input_fn),
        eval_spec=tf.estimator.EvalSpec(input_fn=input_fn)
        # eval_spec=tf.estimator.EvalSpec(input_fn=input_fn, exporters=exporter)
    )

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir,
                                      serving_input_receiver_fn)
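As a quick sanity check, the SavedModel written by export_saved_model above can be reloaded outside the cluster. This is only an illustrative sketch: the export path is a placeholder, and the 'features' input / 'logits' output names are inferred from serving_input_receiver_fn and the predictions dict in model_fn above.

# Sketch: reload the exported SavedModel and run one prediction (path is a placeholder)
import numpy as np
import tensorflow as tf

loaded = tf.saved_model.load("/path/to/export_dir/1600000000")
infer = loaded.signatures['serving_default']

sample = np.zeros((1, 28, 28, 1), dtype=np.float32)   # one blank 28x28 image
outputs = infer(features=tf.constant(sample))         # 'features' comes from the receiver fn
print(outputs)                                        # expected to expose the 'logits' output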
Ejemplo n.º 36
0
def main_fun(args, ctx):
    import numpy as np
    import tensorflow as tf
    from tensorflowonspark import compat, TFNode

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    def build_and_compile_cnn_model():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation='relu',
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                      optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                      metrics=['accuracy'])
        return model

    # single node
    # single_worker_model = build_and_compile_cnn_model()
    # single_worker_model.fit(x=train_datasets, epochs=3)

    tf_feed = TFNode.DataFeed(ctx.mgr, False)

    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) > 0:
                example = batch[0]
                image = np.array(example[0]).astype(np.float32) / 255.0
                image = np.reshape(image, (28, 28, 1))
                label = np.array(example[1]).astype(np.float32)
                label = np.reshape(label, (1, ))
                yield (image, label)
            else:
                return

    ds = tf.data.Dataset.from_generator(
        rdd_generator, (tf.float32, tf.float32),
        (tf.TensorShape([28, 28, 1]), tf.TensorShape([1])))
    ds = ds.batch(args.batch_size)

    # this fails
    # callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath=args.model_dir)]
    tf.io.gfile.makedirs(args.model_dir)
    filepath = args.model_dir + "/weights-{epoch:04d}"
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                           verbose=1,
                                           save_weights_only=True)
    ]

    with strategy.scope():
        multi_worker_model = build_and_compile_cnn_model()

    # Note: MultiWorkerMirroredStrategy (CollectiveAllReduceStrategy) is synchronous,
    # so we need to ensure that all workers complete training before any of them run out of data from the RDD.
    # And given that Spark RDD partitions (and partition sizes) can be non-evenly divisible by num_workers,
    # we'll just stop training at 90% of the total expected number of steps.
    steps_per_epoch = 60000 / args.batch_size
    steps_per_epoch_per_worker = steps_per_epoch / ctx.num_workers
    max_steps_per_worker = steps_per_epoch_per_worker * 0.9

    multi_worker_model.fit(x=ds,
                           epochs=args.epochs,
                           steps_per_epoch=max_steps_per_worker,
                           callbacks=callbacks)

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir,
                              ctx.job_name == 'chief')

    # terminating feed tells spark to skip processing further partitions
    tf_feed.terminate()
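To make the step cap above concrete, here is the same arithmetic with hypothetical values (60000 samples, batch_size=64, four workers); the constants below are illustrative only:

# Worked example of the 90% step cap (hypothetical: 60000 samples, batch_size=64, 4 workers)
steps_per_epoch = 60000 / 64                               # 937.5 global steps per epoch
steps_per_epoch_per_worker = steps_per_epoch / 4           # ~234.4 steps per worker
max_steps_per_worker = steps_per_epoch_per_worker * 0.9    # ~210.9 steps, headroom for uneven partitions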
def main_fun(argv, ctx):
  import tensorflow as tf
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_integer('num_gpus', 1,
                              """How many GPUs to use.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
       Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build inference Graph.
    logits = cifar10.inference(images)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
      # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
      # session. This helps the clarity of presentation on tensorboard.
      loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
      tf.summary.scalar(loss_name, l)

    return total_loss


  def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
      tower_grads: List of lists of (gradient, variable) tuples. The outer list
        is over individual gradients. The inner list is over the gradient
        calculation for each tower.
    Returns:
       List of pairs of (gradient, variable) where the gradient has been averaged
       across all towers.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
      # Note that each grad_and_vars looks like the following:
      #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
      grads = []
      for g, _ in grad_and_vars:
        # Add 0 dimension to the gradients to represent the tower.
        expanded_g = tf.expand_dims(g, 0)

        # Append on a 'tower' dimension which we will average over below.
        grads.append(expanded_g)

      # Average over the 'tower' dimension.
      grad = tf.concat(axis=0, values=grads)
      grad = tf.reduce_mean(grad, 0)

      # Keep in mind that the Variables are redundant because they are shared
      # across towers. So .. we will just return the first tower's pointer to
      # the Variable.
      v = grad_and_vars[0][1]
      grad_and_var = (grad, v)
      average_grads.append(grad_and_var)
    return average_grads


  def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
      # Create a variable to count the number of train() calls. This equals the
      # number of batches processed * FLAGS.num_gpus.
      global_step = tf.get_variable(
          'global_step', [],
          initializer=tf.constant_initializer(0), trainable=False)

      # Calculate the learning rate schedule.
      num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                               FLAGS.batch_size)
      decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

      # Decay the learning rate exponentially based on the number of steps.
      lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                      global_step,
                                      decay_steps,
                                      cifar10.LEARNING_RATE_DECAY_FACTOR,
                                      staircase=True)

      # Create an optimizer that performs gradient descent.
      opt = tf.train.GradientDescentOptimizer(lr)

      # Calculate the gradients for each model tower.
      tower_grads = []
      with tf.variable_scope(tf.get_variable_scope()):
        for i in xrange(FLAGS.num_gpus):
          with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
              # Calculate the loss for one tower of the CIFAR model. This function
              # constructs the entire CIFAR model but shares the variables across
              # all towers.
              loss = tower_loss(scope)

              # Reuse variables for the next tower.
              tf.get_variable_scope().reuse_variables()

              # Retain the summaries from the final tower.
              summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

              # Calculate the gradients for the batch of data on this CIFAR tower.
              grads = opt.compute_gradients(loss)

              # Keep track of the gradients across all towers.
              tower_grads.append(grads)

      # We must calculate the mean of each gradient. Note that this is the
      # synchronization point across all towers.
      grads = average_gradients(tower_grads)

      # Add a summary to track the learning rate.
      summaries.append(tf.summary.scalar('learning_rate', lr))

      # Add histograms for gradients.
      for grad, var in grads:
        if grad is not None:
          summaries.append(tf.summary.histogram(var.op.name + '/gradients', grad))

      # Apply the gradients to adjust the shared variables.
      apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

      # Add histograms for trainable variables.
      for var in tf.trainable_variables():
        summaries.append(tf.summary.histogram(var.op.name, var))

      # Track the moving averages of all trainable variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY, global_step)
      variables_averages_op = variable_averages.apply(tf.trainable_variables())

      # Group all updates to into a single train op.
      train_op = tf.group(apply_gradient_op, variables_averages_op)

      # Create a saver.
      saver = tf.train.Saver(tf.global_variables())

      # Build the summary operation from the last tower summaries.
      summary_op = tf.summary.merge(summaries)

      # Build an initialization operation to run below.
      init = tf.global_variables_initializer()

      # Start running operations on the Graph. allow_soft_placement must be set to
      # True to build towers on GPU, as some of the ops do not have GPU
      # implementations.
      sess = tf.Session(config=tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=FLAGS.log_device_placement))
      sess.run(init)

      # Start the queue runners.
      tf.train.start_queue_runners(sess=sess)

      summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

      for step in xrange(FLAGS.max_steps):
        start_time = time.time()
        _, loss_value = sess.run([train_op, loss])
        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = duration / FLAGS.num_gpus

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), step, loss_value,
                               examples_per_sec, sec_per_batch))

        if step % 100 == 0:
          summary_str = sess.run(summary_op)
          summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
          checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
          saver.save(sess, checkpoint_path, global_step=step)

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)
  train()
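For readers new to the multi-tower pattern, the structure that average_gradients expects looks like the toy sketch below; the values and variable names are made up. Each inner list holds one tower's (gradient, variable) pairs, and the function returns one element-wise-averaged pair per variable.

# Toy illustration of the tower_grads layout consumed by average_gradients()
import tensorflow as tf

v0 = tf.Variable([1.0, 2.0], name='toy_v0')
v1 = tf.Variable([[3.0]], name='toy_v1')

tower_grads = [
    [(tf.constant([0.1, 0.2]), v0), (tf.constant([[0.3]]), v1)],  # tower 0
    [(tf.constant([0.3, 0.4]), v0), (tf.constant([[0.5]]), v1)],  # tower 1
]
# average_gradients(tower_grads) pairs each variable with the element-wise mean
# of its gradients across towers, e.g. roughly [([0.2, 0.3], v0), ([[0.4]], v1)]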
def main_fun(argv, ctx):
  import math
  import six
  import tensorflow as tf

  from datasets import dataset_factory
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  tf.app.flags.DEFINE_integer(
      'batch_size', 100, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'max_num_batches', None,
      'Max number of batches to evaluate by default use all.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'checkpoint_path', '/tmp/tfmodel/',
      'The directory where the model was written to or an absolute path to a '
      'checkpoint file.')

  tf.app.flags.DEFINE_string(
      'eval_dir', '/tmp/tfmodel/', 'Directory where the results are saved to.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'test', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to evaluate.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average.'
      'If left as None, then moving averages are not used.')

  tf.app.flags.DEFINE_integer(
      'eval_image_size', None, 'Eval image size')

  FLAGS = tf.app.flags.FLAGS

  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #tf_global_step = slim.get_or_create_global_step()
    tf_global_step = tf.Variable(0, name="global_step")

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label] = provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_5': slim.metrics.streaming_recall_at_k(
            logits, labels, 5),
    })

    # Print the summaries to screen.
    for name, value in six.iteritems(names_to_values):
      summary_name = 'eval/%s' % name
      op = tf.summary.scalar(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)
def main_fun(argv, ctx):
  import tensorflow as tf
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                             """Directory where to write event logs """
                             """and checkpoint.""")
  tf.app.flags.DEFINE_integer('max_steps', 1000000,
                              """Number of batches to run.""")
  tf.app.flags.DEFINE_boolean('log_device_placement', False,
                              """Whether to log device placement.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  # cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
  tf.gfile.MakeDirs(FLAGS.train_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  # Train CIFAR-10 for a number of steps.
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
Ejemplo n.º 40
0
def map_fun(args, ctx):
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec

  IMAGE_PIXELS=28

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size   = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    # Convert from [(images, labels)] to two numpy arrays of the proper type
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    xs = xs/255.0
    ys = numpy.array(labels)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS], name="x")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor shuts down or 1000000 steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # print accuracy and save model checkpoint to HDFS every 100 steps
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy,{x: batch_xs, y_: batch_ys})))

            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l,p in zip(labels,preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def main_fun(argv, ctx):
  import tensorflow as tf
  from tensorflow.python.ops import control_flow_ops
  from datasets import dataset_factory
  from deployment import model_deploy
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  tf.app.flags.DEFINE_integer(
      'num_gpus', 1, 'The number of GPUs to use per node')

  tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'train_dir', '/tmp/tfmodel/',
      'Directory where checkpoints and event logs are written to.')

  tf.app.flags.DEFINE_integer('num_clones', 1,
                              'Number of model clones to deploy.')

  tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                              'Use CPUs to deploy clones.')

  tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')

  tf.app.flags.DEFINE_integer(
      'num_ps_tasks', 0,
      'The number of parameter servers. If the value is 0, then the parameters '
      'are handled locally by the worker.')

  tf.app.flags.DEFINE_integer(
      'num_readers', 4,
      'The number of parallel readers that read data from the dataset.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_integer(
      'log_every_n_steps', 10,
      'The frequency with which logs are print.')

  tf.app.flags.DEFINE_integer(
      'save_summaries_secs', 600,
      'The frequency with which summaries are saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'save_interval_secs', 600,
      'The frequency with which the model is saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'task', 0, 'Task id of the replica running the training.')

  ######################
  # Optimization Flags #
  ######################

  tf.app.flags.DEFINE_float(
      'weight_decay', 0.00004, 'The weight decay on the model weights.')

  tf.app.flags.DEFINE_string(
      'optimizer', 'rmsprop',
      'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
      '"ftrl", "momentum", "sgd" or "rmsprop".')

  tf.app.flags.DEFINE_float(
      'adadelta_rho', 0.95,
      'The decay rate for adadelta.')

  tf.app.flags.DEFINE_float(
      'adagrad_initial_accumulator_value', 0.1,
      'Starting value for the AdaGrad accumulators.')

  tf.app.flags.DEFINE_float(
      'adam_beta1', 0.9,
      'The exponential decay rate for the 1st moment estimates.')

  tf.app.flags.DEFINE_float(
      'adam_beta2', 0.999,
      'The exponential decay rate for the 2nd moment estimates.')

  tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')

  tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                            'The learning rate power.')

  tf.app.flags.DEFINE_float(
      'ftrl_initial_accumulator_value', 0.1,
      'Starting value for the FTRL accumulators.')

  tf.app.flags.DEFINE_float(
      'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')

  tf.app.flags.DEFINE_float(
      'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')

  tf.app.flags.DEFINE_float(
      'momentum', 0.9,
      'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

  tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

  #######################
  # Learning Rate Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'learning_rate_decay_type',
      'exponential',
      'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
      ' or "polynomial"')

  tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

  tf.app.flags.DEFINE_float(
      'end_learning_rate', 0.0001,
      'The minimal end learning rate used by a polynomial decay learning rate.')

  tf.app.flags.DEFINE_float(
      'label_smoothing', 0.0, 'The amount of label smoothing.')

  tf.app.flags.DEFINE_float(
      'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

  tf.app.flags.DEFINE_float(
      'num_epochs_per_decay', 2.0,
      'Number of epochs after which learning rate decays.')

  tf.app.flags.DEFINE_bool(
      'sync_replicas', False,
      'Whether or not to synchronize the replicas during training.')

  tf.app.flags.DEFINE_integer(
      'replicas_to_aggregate', 1,
      'The Number of gradients to collect before updating params.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average.'
      'If left as None, then moving averages are not used.')

  #######################
  # Dataset Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'train', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to train.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_integer(
      'batch_size', 32, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'train_image_size', None, 'Train image size')

  tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                              'The maximum number of training steps.')

  #####################
  # Fine-Tuning Flags #
  #####################

  tf.app.flags.DEFINE_string(
      'checkpoint_path', None,
      'The path to a checkpoint from which to fine-tune.')

  tf.app.flags.DEFINE_string(
      'checkpoint_exclude_scopes', None,
      'Comma-separated list of scopes of variables to exclude when restoring '
      'from a checkpoint.')

  tf.app.flags.DEFINE_string(
      'trainable_scopes', None,
      'Comma-separated list of scopes to filter the set of variables to train.'
      'By default, None would train all the variables.')

  tf.app.flags.DEFINE_boolean(
      'ignore_missing_vars', False,
      'When restoring a checkpoint would ignore missing variables.')

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = ctx.job_name
  FLAGS.task = ctx.task_index
  FLAGS.num_clones = FLAGS.num_gpus
  FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])
  assert(FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps']) if 'ps' in ctx.cluster_spec else 0))

  def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if FLAGS.learning_rate_decay_type is not recognized.
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
      decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
      return tf.train.exponential_decay(FLAGS.learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True,
                                        name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
      return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
      return tf.train.polynomial_decay(FLAGS.learning_rate,
                                       global_step,
                                       decay_steps,
                                       FLAGS.end_learning_rate,
                                       power=1.0,
                                       cycle=False,
                                       name='polynomial_decay_learning_rate')
    else:
      raise ValueError('learning_rate_decay_type [%s] was not recognized',
                       FLAGS.learning_rate_decay_type)


  def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """
    if FLAGS.optimizer == 'adadelta':
      optimizer = tf.train.AdadeltaOptimizer(
          learning_rate,
          rho=FLAGS.adadelta_rho,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
      optimizer = tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(
          learning_rate,
          beta1=FLAGS.adam_beta1,
          beta2=FLAGS.adam_beta2,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
      optimizer = tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate,
          momentum=FLAGS.momentum,
          name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.momentum,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
      raise ValueError('Optimizer [%s] was not recognized', FLAGS.optimizer)
    return optimizer


  def _add_variables_summaries(learning_rate):
    summaries = []
    for variable in slim.get_model_variables():
      summaries.append(tf.summary.histogram(variable.op.name, variable))
    summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
    return summaries


  def _get_init_fn():
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the very
    first global step.

    Returns:
      An init function run by the supervisor.
    """
    if FLAGS.checkpoint_path is None:
      return None

    # Warn the user if a checkpoint exists in the train_dir. Then we'll be
    # ignoring the checkpoint anyway.
    if tf.train.latest_checkpoint(FLAGS.train_dir):
      tf.logging.info(
          'Ignoring --checkpoint_path because a checkpoint already exists in %s'
          % FLAGS.train_dir)
      return None

    exclusions = []
    if FLAGS.checkpoint_exclude_scopes:
      exclusions = [scope.strip()
                    for scope in FLAGS.checkpoint_exclude_scopes.split(',')]

    # TODO(sguada) variables.filter_variables()
    variables_to_restore = []
    for var in slim.get_model_variables():
      excluded = False
      for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
          excluded = True
          break
      if not excluded:
        variables_to_restore.append(var)

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Fine-tuning from %s' % checkpoint_path)

    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=FLAGS.ignore_missing_vars)


  def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer.
    """
    if FLAGS.trainable_scopes is None:
      return tf.trainable_variables()
    else:
      scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
      variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
      variables_to_train.extend(variables)
    return variables_to_train

  # main
  cluster_spec, server = TFNode.start_cluster_server(ctx=ctx, num_gpus=FLAGS.num_gpus, rdma=FLAGS.rdma)
  if ctx.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    if not FLAGS.dataset_dir:
      raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      #######################
      # Config model_deploy #
      #######################
      deploy_config = model_deploy.DeploymentConfig(
          num_clones=FLAGS.num_clones,
          clone_on_cpu=FLAGS.clone_on_cpu,
          replica_id=FLAGS.task,
          num_replicas=FLAGS.worker_replicas,
          num_ps_tasks=FLAGS.num_ps_tasks)

      # Create global_step
      #with tf.device(deploy_config.variables_device()):
      #  global_step = slim.create_global_step()
      with tf.device("/job:ps/task:0"):
        global_step = tf.Variable(0, name="global_step")

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_factory.get_dataset(
          FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

      ######################
      # Select the network #
      ######################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          weight_decay=FLAGS.weight_decay,
          is_training=True)

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_factory.get_preprocessing(
          preprocessing_name,
          is_training=True)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      with tf.device(deploy_config.inputs_device()):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        train_image_size = FLAGS.train_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, train_image_size, train_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)
        labels = slim.one_hot_encoding(
            labels, dataset.num_classes - FLAGS.labels_offset)
        batch_queue = slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * deploy_config.num_clones)

      ####################
      # Define the model #
      ####################
      def clone_fn(batch_queue):
        """Allows data parallelism by creating multiple clones of network_fn."""
        images, labels = batch_queue.dequeue()
        logits, end_points = network_fn(images)

        #############################
        # Specify the loss function #
        #############################
        if 'AuxLogits' in end_points:
          tf.losses.softmax_cross_entropy(
              logits=end_points['AuxLogits'], onehot_labels=labels,
              label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss')
        tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels,
            label_smoothing=FLAGS.label_smoothing, weights=1.0)
        return end_points
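      # The losses created above are added to tf.GraphKeys.LOSSES;
      # model_deploy.optimize_clones() later sums them (plus regularization
      # losses) per clone and aggregates the gradients across clones.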

      # Gather initial summaries.
      summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

      clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
      first_clone_scope = deploy_config.clone_scope(0)
      # Gather update_ops from the first clone. These contain, for example,
      # the updates for the batch_norm variables created by network_fn.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

      # Add summaries for end_points.
      end_points = clones[0].outputs
      for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                        tf.nn.zero_fraction(x)))

      # Add summaries for losses.
      for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

      # Add summaries for variables.
      for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

      #################################
      # Configure the moving averages #
      #################################
      if FLAGS.moving_average_decay:
        moving_average_variables = slim.get_model_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
      else:
        moving_average_variables, variable_averages = None, None

      #########################################
      # Configure the optimization procedure. #
      #########################################
      with tf.device(deploy_config.optimizer_device()):
        learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
        optimizer = _configure_optimizer(learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

      if FLAGS.sync_replicas:
        # If sync_replicas is enabled, the averaging will be done in the chief
        # queue runner.
        optimizer = tf.train.SyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            variable_averages=variable_averages,
            variables_to_average=moving_average_variables,
            replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
            total_num_replicas=FLAGS.worker_replicas)
      elif FLAGS.moving_average_decay:
        # Update ops executed locally by trainer.
        update_ops.append(variable_averages.apply(moving_average_variables))

      # Variables to train.
      variables_to_train = _get_variables_to_train()

      # optimize_clones() aggregates the per-clone losses into total_loss and
      # computes the gradients with respect to variables_to_train.
      total_loss, clones_gradients = model_deploy.optimize_clones(
          clones,
          optimizer,
          var_list=variables_to_train)
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

      # Create gradient updates.
      grad_updates = optimizer.apply_gradients(clones_gradients,
                                               global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                        name='train_op')

      # Add the summaries from the first clone. These contain the summaries
      # created by model_fn and either optimize_clones() or _gather_clone_loss().
      summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                         first_clone_scope))

      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')


      ###########################
      # Kicks off the training. #
      ###########################
      summary_writer = tf.summary.FileWriter("tensorboard_%d" %(ctx.worker_num), graph=tf.get_default_graph())
      slim.learning.train(
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)
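A minimal driver-side sketch of how a main_fun like the one above is typically
submitted with TensorFlowOnSpark; the app name, executor counts and argument
passing below are assumptions, not part of the original example:

    import sys
    from pyspark import SparkConf, SparkContext
    from tensorflowonspark import TFCluster

    sc = SparkContext(conf=SparkConf().setAppName("slim_train"))
    num_executors = int(sc._conf.get("spark.executor.instances", "4"))
    num_ps = 1  # assumed: one parameter server task

    # Each executor calls main_fun(argv, ctx); TensorFlow reads its own data (InputMode.TENSORFLOW).
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()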
Ejemplo n.º 42
0
def main_fun(argv, ctx):
    import sys
    import pprint
    from datetime import datetime  # used for timing below
    import numpy as np
    import tensorflow as tf
    import online_model
    import tfos_online_data_reader

    sys.argv = argv
    flags = tf.app.flags
    FLAGS = flags.FLAGS

    flags.DEFINE_integer('batch_size', 100, 'data batch size')
    flags.DEFINE_integer('num_epoch', 1, 'number of training epochs over the dataset')
    flags.DEFINE_string('mapping_data',
                        'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022_map',
                        'id mapping path')
    flags.DEFINE_string('train_data',
                        'hdfs://appcluster-cdh/user/root/Adwin_Refactoring_Test/instance_build_txt/mix_dev_wx_interest2/20171022',
                        'train data path')
    #flags.DEFINE_string('mapping_data',
    #                    'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022_map',
    #                    'id mapping path')
    #flags.DEFINE_string('train_data',
    #                    'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/mix_dev_wx_interest2/20171022',
    #                    'train data path')
    flags.DEFINE_string('log_dir',
                        'hdfs://appcluster-cdh/user/root/tensorflow/app/online_train_distributed/model',
                        'log directory')

    flags.DEFINE_float('linear_lr', 0.1, 'wide part learning rate. default 0.1')
    flags.DEFINE_float('dnn_lr', 0.001, 'deep part learning rate. default 0.001')
    flags.DEFINE_string('linear_optimizer', 'ftrl',
                        'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is ftrl')
    flags.DEFINE_string('dnn_optimizer', 'adagrad',
                        'optimizer: adadelta | adagrad | sgd | adam | ftrl | momentum. default is adagrad')

    flags.DEFINE_integer('input_dim', 13, 'input dimension')
    flags.DEFINE_string("model_network", "100,20", "The neural network of model, as 100,50,20")
    flags.DEFINE_string("model_type", "wide_deep", "model type: wide | deep | wide_deep")
    flags.DEFINE_integer('display_step', 200, 'display_step')

    flags.DEFINE_integer('ps_num', 64, 'number of parameter server (ps) tasks')
    flags.DEFINE_integer('task_num', 128, 'number of worker tasks')

    pprint.PrettyPrinter().pprint(FLAGS.__flags)
    cluster_spec, server = TFNode.start_cluster_server(ctx)
    if ctx.job_name == "ps":
        server.join()
    elif ctx.job_name == "worker":
        total_file_names = parse_files(FLAGS.train_data)
        print("total_file_names:")
        print(total_file_names)
        print("task_index: " + str(ctx.task_index))
        task_file_names = [name for idx, name in enumerate(total_file_names) if idx % FLAGS.task_num == ctx.task_index]
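        # Shard the input files statically across workers: worker i processes the
        # files whose index is congruent to i modulo FLAGS.task_num.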
        print("task_file_names:")
        print(task_file_names)
        train_reader = tfos_online_data_reader.Reader(
            task_file_names,
            FLAGS.mapping_data,
            batch_size=FLAGS.batch_size,
            delimiter='\t')
        wide_dim = train_reader.wide_dim

        with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d"%ctx.task_index,
                                                      cluster=cluster_spec)):
            config = {}
            config['num_ps'] = FLAGS.ps_num
            dnn_model = online_model.DNNModel(FLAGS,wide_dim,config)
            dnn_model.build()
            dense_inputs = dnn_model.dense_inputs
            sparse_inputs = dnn_model.sparse_inputs
            labels = dnn_model.labels

            global_step = dnn_model.global_step
            step_update_op = dnn_model.step_update_op
            train_op = dnn_model.train_op
            loss = dnn_model.loss
            auc_op = dnn_model.auc_op
            summary_op = dnn_model.summary_op

        saver = tf.train.Saver()
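        # local_variables_initializer is needed for streaming metrics
        # (e.g. the accumulators behind auc_op), in addition to the global variables.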
        init_op = [tf.global_variables_initializer(),
                    tf.local_variables_initializer()]

        summary_writer = tf.summary.FileWriter("tensorboard_%d" % ctx.worker_num, graph=tf.get_default_graph())
        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                 logdir=FLAGS.log_dir,
                                 init_op=init_op,
                                 summary_op=None,
                                 summary_writer=summary_writer,
                                 global_step=global_step,
                                 saver=saver,
                                 save_model_secs=300)

        shape = np.array([FLAGS.batch_size, wide_dim + 1])
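        # Dense shape used when feeding the sparse wide features as
        # (indices, values, shape): batch_size rows by (wide_dim + 1) columns.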
        begin_time = datetime.now()
        with sv.managed_session(server.target) as sess:
            if not sv.should_stop():
                for epoch in range(FLAGS.num_epoch):
                    train_batches = train_reader.yieldBatches()
                    print("Epoch: %d" % epoch)
                    step = 0
                    for dense_x, sparse_idx, sparse_values, y in train_batches:
                        start_time = datetime.now()
                        _, train_loss, train_auc, summ, _ = sess.run(
                            [train_op, loss, auc_op, summary_op, step_update_op],
                            feed_dict={dense_inputs: dense_x,
                                       sparse_inputs: (sparse_idx, sparse_values, shape),
                                       labels: y})
                        step += 1
                        assert not np.isnan(train_loss), 'Model diverged with loss = NaN'
                        time_used = datetime.now() - start_time
                        if step % FLAGS.display_step == 0:
                            g_step, = sess.run([global_step])
                            print("step: " + str(step) + ", global_step: " + str(g_step))
                            summary_writer.add_summary(summ,g_step)
                            print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format(
                                 g_step, g_step*FLAGS.batch_size, time_used, train_loss, train_auc))
                            sys.stdout.flush()
            total_time = datetime.now() - begin_time
            print("Training Done!!")
            print("Total time used: {}".format(total_time))
def main_fun(argv, ctx):

  import sys
  import math
  import time
  from datetime import datetime
  import numpy as np
  import tensorflow as tf
  import cifar10

  sys.argv = argv
  FLAGS = tf.app.flags.FLAGS
  tf.app.flags.DEFINE_string('eval_dir', '/tmp/cifar10_eval',
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', '/tmp/cifar10_train',
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                           """Whether to run eval only once.""")
  tf.app.flags.DEFINE_boolean('rdma', False, """Whether to use rdma.""")

  cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

  def eval_once(saver, summary_writer, top_k_op, summary_op):
    """Run Eval once.

    Args:
      saver: Saver.
      summary_writer: Summary writer.
      top_k_op: Top K op.
      summary_op: Summary op.
    """
    with tf.Session() as sess:
      ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
      if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Assuming model_checkpoint_path looks something like:
        #   /my-favorite-path/cifar10_train/model.ckpt-0,
        # extract global_step from it.
        global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
      else:
        print('No checkpoint file found')
        return

      # Start the queue runners.
      coord = tf.train.Coordinator()
      try:
        threads = []
        for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
          threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
                                           start=True))

        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        step = 0
        while step < num_iter and not coord.should_stop():
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          step += 1

        # Compute precision @ 1.
        precision = true_count / total_sample_count
        print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='Precision @ 1', simple_value=precision)
        summary_writer.add_summary(summary, global_step)
      except Exception as e:  # pylint: disable=broad-except
        coord.request_stop(e)

      coord.request_stop()
      coord.join(threads, stop_grace_period_secs=10)


  def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    with tf.Graph().as_default() as g:
      # Get images and labels for CIFAR-10.
      eval_data = FLAGS.eval_data == 'test'
      images, labels = cifar10.inputs(eval_data=eval_data)

      # Build a Graph that computes the logits predictions from the
      # inference model.
      logits = cifar10.inference(images)

      # Calculate predictions.
      top_k_op = tf.nn.in_top_k(logits, labels, 1)

      # Restore the moving average version of the learned variables for eval.
      variable_averages = tf.train.ExponentialMovingAverage(
          cifar10.MOVING_AVERAGE_DECAY)
      variables_to_restore = variable_averages.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)

      # Build the summary operation based on the TF collection of Summaries.
      summary_op = tf.summary.merge_all()

      summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

      while True:
        eval_once(saver, summary_writer, top_k_op, summary_op)
        if FLAGS.run_once:
          break
        time.sleep(FLAGS.eval_interval_secs)

  #cifar10.maybe_download_and_extract()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  evaluate()