Exemple #1
0
def sample(args, sc):
    """Restore the latest checkpoint and write one generated sample to HDFS.

    The saved training arguments (``config.p``) and the ``(chars, vocab)``
    pair (``chars_vocab.p``) are read back from ``args.save_dir`` as pickle
    files, a non-training ``Model`` is built from them, and the text returned
    by ``model.sample`` is written to ``output.txt`` under ``args.output_dir``.
    Nothing is written when no checkpoint is found in ``args.save_dir``.

    Args:
        args: options object; reads ``save_dir``, ``output_dir``, ``n``,
            ``prime`` and ``sample``.
        sc: context object used for ``pickleFile`` and to look up
            ``fs.defaultFS`` (presumably a SparkContext -- confirm).
    """
    default_fs = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    cwd = os.getcwd()

    def resolve(base_dir, name):
        # Every path in this function is normalized the same way.
        return TFNode.hdfs_path(os.path.join(base_dir, name), default_fs, cwd)

    config_path = resolve(args.save_dir, 'config.p')
    saved_args = sc.pickleFile(config_path).collect()[0]
    vocab_path = resolve(args.save_dir, 'chars_vocab.p')
    chars, vocab = sc.pickleFile(vocab_path).collect()

    model = Model(saved_args, training=False)
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()
        # Joining with '' keeps a trailing separator on the directory path.
        ckpt = tf.train.get_checkpoint_state(resolve(args.save_dir, ''))
        if not (ckpt and ckpt.model_checkpoint_path):
            return

        saver.restore(sess, ckpt.model_checkpoint_path)
        generated = model.sample(sess, chars, vocab, args.n, args.prime,
                                 args.sample)
        output_path = resolve(args.output_dir, 'output.txt')
        with hdfs.open(output_path, 'w') as f:
            f.write(generated)
    def test_hdfs_path(self):
        """Normalization of absolute & relative string paths per default filesystem.

        Paths that already carry a scheme are expected back unchanged,
        absolute paths are expected to gain the default filesystem's scheme,
        and relative paths are expected under the working directory
        (``file://``) or under ``/user/<user>`` (``hdfs://``, ``viewfs://``).
        """
        cwd = os.getcwd()
        user = getpass.getuser()
        filesystems = ["file://", "hdfs://", "viewfs://"]
        # input path -> expected result for each entry of ``filesystems``,
        # in the same order.
        expectations = {
            "hdfs://foo/bar":
            ["hdfs://foo/bar", "hdfs://foo/bar", "hdfs://foo/bar"],
            "viewfs://foo/bar":
            ["viewfs://foo/bar", "viewfs://foo/bar", "viewfs://foo/bar"],
            "file://foo/bar":
            ["file://foo/bar", "file://foo/bar", "file://foo/bar"],
            "/foo/bar":
            ["file:///foo/bar", "hdfs:///foo/bar", "viewfs:///foo/bar"],
            "foo/bar": [
                "file://{}/foo/bar".format(cwd),
                "hdfs:///user/{}/foo/bar".format(user),
                "viewfs:///user/{}/foo/bar".format(user),
            ],
        }

        for idx, default_fs in enumerate(filesystems):
            # Minimal stand-in exposing only the two attributes set here.
            mock_attrs = {'defaultFS': default_fs, 'working_dir': cwd}
            ctx = type('MockContext', (), mock_attrs)
            for path, expected in expectations.items():
                final_path = TFNode.hdfs_path(ctx, path)
                wanted = expected[idx]
                message = "fs({}) + path({}) => {}, expected {}".format(
                    default_fs, path, final_path, wanted)
                self.assertEqual(final_path, wanted, message)
  def test_hdfs_path(self):
    """Check TFNode.hdfs_path against each default filesystem scheme."""
    cwd = os.getcwd()
    user = getpass.getuser()
    filesystems = ["file://", "hdfs://", "viewfs://"]
    # input path -> expected result per entry of ``filesystems`` (same order)
    expectations = {
      "hdfs://foo/bar": ["hdfs://foo/bar", "hdfs://foo/bar", "hdfs://foo/bar"],
      "viewfs://foo/bar": ["viewfs://foo/bar", "viewfs://foo/bar", "viewfs://foo/bar"],
      "file://foo/bar": ["file://foo/bar", "file://foo/bar", "file://foo/bar"],
      "/foo/bar": ["file:///foo/bar", "hdfs:///foo/bar", "viewfs:///foo/bar"],
      "foo/bar": [
        "file://{}/foo/bar".format(cwd),
        "hdfs:///user/{}/foo/bar".format(user),
        "viewfs:///user/{}/foo/bar".format(user),
      ],
    }

    for idx, default_fs in enumerate(filesystems):
      # Minimal stand-in exposing only the two attributes set here.
      ctx = type('MockContext', (), {'defaultFS': default_fs, 'working_dir': cwd})
      for path, expected in expectations.items():
        final_path = TFNode.hdfs_path(ctx, path)
        wanted = expected[idx]
        message = "fs({}) + path({}) => {}, expected {}".format(default_fs, path, final_path, wanted)
        self.assertEqual(wanted, final_path, message)
Exemple #4
0
    def __init__(self, sc, data_dir, batch_size, seq_length, encoding='utf-8'):
        """Store the loader settings, then preprocess ``input.txt`` and build batches.

        Args:
            sc: context object used only to look up ``fs.defaultFS``
                (presumably a SparkContext -- confirm).
            data_dir: directory that contains ``input.txt``.
            batch_size: stored on the instance as ``batch_size``.
            seq_length: stored on the instance as ``seq_length``.
            encoding: stored on the instance as ``encoding``.
        """
        self.data_dir, self.batch_size = data_dir, batch_size
        self.seq_length, self.encoding = seq_length, encoding

        # Normalize the input location against the default filesystem and
        # the current working directory.
        hadoop_conf = sc._jsc.hadoopConfiguration()
        default_fs = hadoop_conf.get("fs.defaultFS")
        cwd = os.getcwd()
        raw_input_path = os.path.join(data_dir, "input.txt")
        input_file = TFNode.hdfs_path(raw_input_path, default_fs, cwd)

        print("reading text file")
        self.preprocess(input_file)

        self.create_batches()
        self.reset_batch_pointer()
Exemple #5
0
    # Training-length options: number of epochs and a cap on the step count.
    parser.add_argument("--epochs", help="number of epochs",
                        type=int, default=1)
    parser.add_argument(
        "--steps", help="maximum number of steps", type=int, default=1000)

    args=parser.parse_args()

    # Build the text loader; its vocabulary size is attached to args so it is
    # pickled and shipped together with the other options below.
    data_loader=TextLoader(
        sc, args.data_dir, args.batch_size, args.seq_length)

    args.vocab_size = data_loader.vocab_size

    # Paths under save_dir are normalized against the default filesystem and
    # the current working directory.
    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    working_dir = os.getcwd()

    # Persist the arguments as a single-element pickle file.
    config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'), defaultFS, working_dir)
    sc.parallelize([args]).saveAsPickleFile(config_file)

    # Persist chars and vocab as a two-element pickle file.
    chars_vocab_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir)
    sc.parallelize([data_loader.chars, data_loader.vocab]).saveAsPickleFile(chars_vocab_file)

    # Training data is distributed as an RDD and fed to the cluster
    # (InputMode.SPARK).
    dataRDD=sc.parallelize(data_loader.get_data_for_feeder())

    cluster=TFCluster.run(sc, main_fun, args, num_executors,
                            args.num_ps_tasks, TFCluster.InputMode.SPARK)

    cluster.train(dataRDD, args.epochs)

    cluster.shutdown()

    print("{0} ===== Stop".format(datetime.now().isoformat()))
def map_fun(args, ctx):
    """Run one node (ps or worker) of the distributed MNIST example with ``tf.data`` input.

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    one-hidden-layer softmax classifier, read the ``part-*`` files under
    ``args.images`` through a ``tf.data`` pipeline and then either train
    (``args.mode == "train"``) or write ``"<label> <prediction>"`` lines to a
    per-worker file under ``args.output`` (any other mode).

    Args:
        args: parsed options; this function reads ``rdma``, ``images``,
            ``format``, ``batch_size``, ``model``, ``mode``, ``steps`` and
            ``output``.
        ctx: node context providing ``worker_num``, ``job_name`` and
            ``task_index``; it is also passed to the ``TFNode`` helpers.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import os
    import tensorflow as tf
    import time

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def _parse_csv(ln):
        """Parse one ``label|pixel,pixel,...`` line into (image, one-hot label, label id)."""
        splits = tf.string_split([ln], delimiter='|')
        lbl = splits.values[0]
        img = splits.values[1]
        image_defaults = [[0.0] for _ in range(IMAGE_PIXELS * IMAGE_PIXELS)]
        image = tf.stack(tf.decode_csv(img, record_defaults=image_defaults))
        # Scale pixel values by 1/255.
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        normalized_image = tf.div(image, norm)
        label_value = tf.string_to_number(lbl, tf.int32)
        label = tf.one_hot(label_value, 10)
        return (normalized_image, label, label_value)

    def _parse_tfr(example_proto):
        """Parse one serialized example into (image, label, label id)."""
        print("example_proto: {}".format(example_proto))
        feature_def = {
            "label": tf.FixedLenFeature(10, tf.int64),
            "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)
        }
        features = tf.parse_single_example(example_proto, feature_def)
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(tf.to_float(features['image']), norm)
        label = tf.to_float(features['label'])
        # Return the same 3-tuple shape as _parse_csv so the iterator can be
        # unpacked identically for both formats.
        # NOTE(review): assumes the stored label vector is one-hot -- confirm.
        label_value = tf.cast(tf.argmax(features['label'], 0), tf.int32)
        return (image, label, label_value)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Dataset for input data
            image_dir = TFNode.hdfs_path(ctx, args.images)
            file_pattern = os.path.join(image_dir, 'part-*')
            files = tf.gfile.Glob(file_pattern)

            # TFRecord files are binary and must not be read line by line;
            # only the CSV format goes through TextLineDataset.
            if args.format == 'tfr':
                ds = tf.data.TFRecordDataset(files).map(_parse_tfr)
            else:
                ds = tf.data.TextLineDataset(files).map(_parse_csv)
            ds = ds.batch(args.batch_size)
            iterator = ds.make_initializable_iterator()
            x, y_, y_val = iterator.get_next()

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Cross-entropy; clipping keeps log() away from zero.
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num,
                                               graph=tf.get_default_graph())

        # Only opened outside of training; closed in the ``finally`` below.
        output_file = None
        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
            output_dir = TFNode.hdfs_path(ctx, args.output)
            tf.gfile.MkDir(output_dir)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')

        try:
            # The supervisor takes care of session initialization, restoring from
            # a checkpoint, and closing when done or an error occurs.
            with sv.managed_session(server.target) as sess:
                print("{0} session ready".format(datetime.now().isoformat()))

                # Loop until the supervisor shuts down or args.steps steps have completed.
                # NOTE(review): in inference mode `step` is never advanced, so the
                # loop presumably ends only once the iterator is exhausted -- confirm.
                sess.run(iterator.initializer)
                step = 0
                count = 0
                while not sv.should_stop() and step < args.steps:

                    # Run a training step asynchronously.
                    # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                    # perform *synchronous* training.

                    # input comes from the Dataset iterator built above
                    if args.mode == "train":
                        if (step % 100 == 0):
                            print("{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(accuracy)))
                        _, summary, step, yv = sess.run(
                            [train_op, summary_op, global_step, y_val])
                        # print("yval: {}".format(yv))
                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                    else:  # args.mode == "inference"
                        labels, pred, acc = sess.run([label, prediction, accuracy])
                        # print("label: {0}, pred: {1}".format(labels, pred))
                        print("acc: {0}".format(acc))
                        for i in range(len(labels)):
                            count += 1
                            output_file.write("{0} {1}\n".format(
                                labels[i], pred[i]))
                        print("count: {0}".format(count))
        finally:
            # Close the predictions file even if the session block raised.
            if output_file is not None:
                output_file.close()

        if args.mode == "inference":
            # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
            # run inference and request stop before the other workers even start/sync their sessions.
            if task_index == 0:
                time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Exemple #7
0
def map_fun(args, ctx):
    """Run one node (ps or worker) of the distributed MNIST example with queue-runner input.

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    one-hidden-layer softmax classifier, read ``part-*`` files (CSV or
    TFRecord) through filename queues and then either train
    (``args.mode == "train"``) or write ``"<label> <prediction>"`` lines to a
    per-worker file under ``args.output`` (any other mode).

    Args:
        args: parsed options; this function reads ``rdma``, ``readers``,
            ``mode``, ``epochs``, ``format``, ``images``, ``labels``,
            ``model``, ``steps`` and ``output``.
        ctx: node context providing ``worker_num``, ``job_name``,
            ``task_index`` and ``cluster_spec``; it is also passed to the
            ``TFNode`` helpers.

    Raises:
        ValueError: if ``args.format`` is neither ``"csv"`` nor ``"tfr"``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import os
    import tensorflow as tf
    import time

    IMAGE_PIXELS = 28
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = 100

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def read_csv_examples(image_dir,
                          label_dir,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        """Return a batched (image, label) pair read from parallel CSV files.

        Images and labels come from separate ``part-*`` files that are read
        in lockstep (both filename queues are unshuffled).
        ``task_index`` and ``num_workers`` are accepted but not used here.
        """
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))
        # Setup queue of csv image filenames
        tf_record_pattern = os.path.join(image_dir, 'part-*')
        images = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "images: {0}".format(images))
        image_queue = tf.train.string_input_producer(images,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="image_queue")

        # Setup queue of csv label filenames
        tf_record_pattern = os.path.join(label_dir, 'part-*')
        labels = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "labels: {0}".format(labels))
        label_queue = tf.train.string_input_producer(labels,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="label_queue")

        # Setup reader for image queue
        img_reader = tf.TextLineReader(name="img_reader")
        _, img_csv = img_reader.read(image_queue)
        image_defaults = [[1.0] for _ in range(784)]
        # tf.stack replaces tf.pack, which no longer exists in TF 1.x.
        img = tf.stack(tf.decode_csv(img_csv, image_defaults))
        # Normalize values to [0,1]
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(img, norm)
        print_log(worker_num, "image: {0}".format(image))

        # Setup reader for label queue
        label_reader = tf.TextLineReader(name="label_reader")
        _, label_csv = label_reader.read(label_queue)
        label_defaults = [[1.0] for _ in range(10)]
        label = tf.stack(tf.decode_csv(label_csv, label_defaults))
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch_csv")

    def read_tfr_examples(path,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        """Return a batched (image, label) pair read from TFRecord files.

        When both ``task_index`` and ``num_workers`` are given, each worker
        reads every ``num_workers``-th file starting at ``task_index``.
        """
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of TFRecord filenames
        tf_record_pattern = os.path.join(path, 'part-*')
        files = tf.gfile.Glob(tf_record_pattern)
        queue_name = "file_queue"

        # split input files across workers, if specified
        if task_index is not None and num_workers is not None:
            num_files = len(files)
            files = files[task_index:num_files:num_workers]
            queue_name = "file_queue_{0}".format(task_index)

        print_log(worker_num, "files: {0}".format(files))
        file_queue = tf.train.string_input_producer(files,
                                                    shuffle=False,
                                                    capacity=1000,
                                                    num_epochs=num_epochs,
                                                    name=queue_name)

        # Setup reader for examples
        reader = tf.TFRecordReader(name="reader")
        _, serialized = reader.read(file_queue)
        feature_def = {
            'label': tf.FixedLenFeature([10], tf.int64),
            'image': tf.FixedLenFeature([784], tf.int64)
        }
        features = tf.parse_single_example(serialized, feature_def)
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(tf.to_float(features['image']), norm)
        print_log(worker_num, "image: {0}".format(image))
        label = tf.to_float(features['label'])
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch")

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # Placeholders or QueueRunner/Readers for input data
            # Inference reads the data exactly once and splits the files
            # across workers; training repeats args.epochs times (0 means
            # no epoch limit) and does not split files.
            num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None

            if args.format == "csv":
                images = TFNode.hdfs_path(ctx, args.images)
                labels = TFNode.hdfs_path(ctx, args.labels)
                x, y_ = read_csv_examples(images, labels, batch_size,
                                          num_epochs, index, workers)
            elif args.format == "tfr":
                images = TFNode.hdfs_path(ctx, args.images)
                x, y_ = read_tfr_examples(images, batch_size, num_epochs,
                                          index, workers)
            else:
                # Raising a bare string is itself a TypeError in Python 3.
                raise ValueError(
                    "{0} format not supported for tf input mode".format(
                        args.format))

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Cross-entropy; clipping keeps log() away from zero.
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        # Only opened outside of training; closed in the ``finally`` below.
        output_file = None
        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')

        try:
            # The supervisor takes care of session initialization, restoring from
            # a checkpoint, and closing when done or an error occurs.
            with sv.managed_session(server.target) as sess:
                print("{0} session ready".format(datetime.now().isoformat()))

                # Loop until the supervisor shuts down or args.steps steps have completed.
                # NOTE(review): in inference mode `step` is never advanced, so the
                # loop presumably ends only once the input queues are exhausted -- confirm.
                step = 0
                count = 0
                while not sv.should_stop() and step < args.steps:
                    # Run a training step asynchronously.
                    # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                    # perform *synchronous* training.

                    # using QueueRunners/Readers
                    if args.mode == "train":
                        if (step % 100 == 0):
                            print("{0} step: {1} accuracy: {2}".format(
                                datetime.now().isoformat(), step,
                                sess.run(accuracy)))
                        _, summary, step = sess.run(
                            [train_op, summary_op, global_step])
                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)
                    else:  # args.mode == "inference"
                        labels, pred, acc = sess.run([label, prediction, accuracy])
                        #print("label: {0}, pred: {1}".format(labels, pred))
                        print("acc: {0}".format(acc))
                        for i in range(len(labels)):
                            count += 1
                            output_file.write("{0} {1}\n".format(
                                labels[i], pred[i]))
                        print("count: {0}".format(count))
        finally:
            # Close the predictions file even if the session block raised.
            if output_file is not None:
                output_file.close()

        if args.mode == "inference":
            # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
            # run inference and request stop before the other workers even start/sync their sessions.
            if task_index == 0:
                time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
    """Run one node (ps or worker) of a configurable feed-dict classifier.

    Parameter-server nodes block in ``server.join()``. Worker nodes build the
    hidden layers via ``tensorflow_utils.auto_generate_hidden_layer``, add an
    output layer, and train on batches pulled from a ``TFNode.DataFeed``
    until the supervisor or the feed asks to stop or ``args.steps`` is
    reached. Afterwards the chief exports the model when ``args.export_dir``
    is set, and non-chief workers wait for a stop request.

    Args:
        args: parsed options; this function reads ``feature_count``,
            ``label_count``, ``batch_size``, ``seed``, ``protocol``,
            ``layers_active``, ``activation``, ``loss``, ``gradient``,
            ``learning_rate``, ``model_dir``, ``input_mapping``, ``steps``
            and ``export_dir``.
        ctx: node context providing ``worker_num``, ``job_name``,
            ``task_index`` and ``mgr``; it is also passed to the ``TFNode``
            helpers.
    """

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    FEATURE_COUNT = args.feature_count
    LABEL_COUNT = args.label_count
    BATCH_SIZE = args.batch_size
    SEED = args.seed

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == "rdma")

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            # Placeholders for input data
            with tf.name_scope(constants.INPUT_LAYER_NAME):
                x = tf.placeholder(tf.float32, [None, FEATURE_COUNT], name=constants.INPUT_LAYER_X)
                y_ = tf.placeholder(tf.float32, [None, LABEL_COUNT], name=constants.INPUT_LAYER_Y)

            with tf.name_scope("layer"):

                # hidden layer
                n_count, h_out = tensorflow_utils.auto_generate_hidden_layer(FEATURE_COUNT, x, args.layers_active, SEED)

                # Variables of the output layer
                with tf.name_scope("output"):

                    sm_w = tf.Variable(tf.truncated_normal(shape=[n_count, LABEL_COUNT],
                                                           stddev=0.1, seed=SEED), name="sm_w")
                    sm_b = tf.Variable(tf.zeros(shape=[LABEL_COUNT]) + 0.1, name="sm_b")
                    tf.summary.histogram("output_weights", sm_w)
                    y = tensorflow_utils.activation_fun(args.activation, tf.nn.xw_plus_b(h_out, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = tensorflow_utils.loss_fun(args.loss, y, y_)
            tf.summary.scalar("loss", loss)

            train_op = tensorflow_utils.optimize_fun(args.gradient, args.learning_rate).minimize(loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name=constants.LABEL_NODE_NAME)
            prediction = tf.argmax(y, 1, name=constants.PREDICT_NODE_NAME)
            correct_prediction = tf.equal(prediction, label)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        logdir = TFNode.hdfs_path(ctx, args.model_dir)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=global_step,
                                 stop_grace_secs=300,
                                 save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
            while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
                # Run a training step asynchronously.

                # using feed_dict
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(BATCH_SIZE), args)
                feed = {x: batch_xs, y_: batch_ys}

                if len(batch_xs) > 0:
                    _, summary, step = sess.run([train_op, summary_op, global_step],
                                                feed_dict=feed)
                    # Print accuracy on the current batch every 100 steps.
                    # Checkpoints are written by the supervisor on its own
                    # schedule (save_model_secs), not here.
                    if (step % 100 == 0):
                        print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                                   sess.run(accuracy, feed)))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

            # Save the model
            if sv.is_chief and args.export_dir:
                print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir))
                save_model(sess, args, x, prediction)

            elif not sv.is_chief:
                # non-chief workers should wait for chief; a chief without
                # an export_dir has nothing to wait for and goes straight to
                # the shutdown below instead of waiting on itself.
                while not sv.should_stop():
                    print("Waiting for chief")
                    time.sleep(5)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
  """Run one node of a distributed 784-128-10 digit classifier.

  A "ps" node starts its server and blocks in ``server.join()``.  A "worker"
  node builds the graph, trains from batches pulled off a ``TFNode.DataFeed``
  until ``args.steps`` global steps are reached (or the supervisor / feed asks
  to stop).  The chief worker (task 0) then exports a saved_model when
  ``args.export_dir`` is set; every other worker waits for the supervisor to
  stop.  Any other job name does nothing after the server is started.

  Args:
    args: run options; reads ``batch_size``, ``protocol``, ``model_dir``,
      ``input_mapping``, ``steps`` and ``export_dir``.
    ctx: node context; reads ``worker_num``, ``job_name``, ``task_index`` and
      ``mgr``, and is passed as-is to the ``TFNode`` helpers.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  IMAGE_PIXELS = 28
  is_ps = (job_name == "ps")

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if is_ps:
    time.sleep((worker_num + 1) * 5)

  # Model / batching parameters
  hidden_units = 128
  batch_size = args.batch_size

  # TF cluster and server for this node
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.protocol == 'rdma')

  def feed_dict(batch):
    """Turn a dict of named arrays into (float32 images scaled by 1/255, uint8 labels)."""
    raw_images = batch['image']
    raw_labels = batch['label']
    pixels = numpy.array(raw_images).astype(numpy.float32) / 255.0
    targets = numpy.array(raw_labels).astype(numpy.uint8)
    return (pixels, targets)

  if is_ps:
    server.join()
    return
  if job_name != "worker":
    return

  num_pixels = IMAGE_PIXELS * IMAGE_PIXELS

  # Ops are assigned to the local worker by default.
  placement = tf.train.replica_device_setter(
      worker_device="/job:worker/task:%d" % task_index,
      cluster=cluster)
  with tf.device(placement):

    # Hidden layer variables
    hid_init = tf.truncated_normal([num_pixels, hidden_units],
                                   stddev=1.0 / IMAGE_PIXELS)
    hid_w = tf.Variable(hid_init, name="hid_w")
    hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
    tf.summary.histogram("hidden_weights", hid_w)

    # Softmax layer variables
    sm_init = tf.truncated_normal([hidden_units, 10],
                                  stddev=1.0 / math.sqrt(hidden_units))
    sm_w = tf.Variable(sm_init, name="sm_w")
    sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
    tf.summary.histogram("softmax_weights", sm_w)

    # Input placeholders
    x = tf.placeholder(tf.float32, [None, num_pixels], name="x")
    y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

    # Image summary of the inputs, viewed as 28x28 single-channel pictures
    x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
    tf.summary.image("x_img", x_img)

    hid = tf.nn.relu(tf.nn.xw_plus_b(x, hid_w, hid_b))

    scores = tf.nn.xw_plus_b(hid, sm_w, sm_b)
    y = tf.nn.softmax(scores)

    global_step = tf.Variable(0)

    # Cross-entropy on clipped probabilities, summed over the batch
    clipped = tf.clip_by_value(y, 1e-10, 1.0)
    loss = -tf.reduce_sum(y_ * tf.log(clipped))
    tf.summary.scalar("loss", loss)

    optimizer = tf.train.AdagradOptimizer(0.01)
    train_op = optimizer.minimize(loss, global_step=global_step)

    # Evaluation ops
    label = tf.argmax(y_, 1, name="label")
    prediction = tf.argmax(y, 1, name="prediction")
    hits = tf.equal(prediction, label)

    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32), name="accuracy")
    tf.summary.scalar("acc", accuracy)

    saver = tf.train.Saver()
    summary_op = tf.summary.merge_all()
    init_op = tf.global_variables_initializer()

  # The supervisor oversees training and stores model state under logdir
  logdir = TFNode.hdfs_path(ctx, args.model_dir)
  print("tensorflow model path: {0}".format(logdir))
  summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

  sv = tf.train.Supervisor(
      is_chief=(task_index == 0),
      logdir=logdir,
      init_op=init_op,
      summary_op=None,
      saver=saver,
      global_step=global_step,
      stop_grace_secs=300,
      save_model_secs=10)

  # The managed session handles initialization, checkpoint restore and
  # closing when done or on error.
  with sv.managed_session(server.target) as sess:
    print("{0} session ready".format(datetime.now().isoformat()))

    # Train until the supervisor or the feed stops, or args.steps is reached.
    step = 0
    tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)
    while not (sv.should_stop() or tf_feed.should_stop() or step >= args.steps):
      # Asynchronous training step; see `tf.train.SyncReplicasOptimizer`
      # for how to perform *synchronous* training instead.
      batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
      feed = {x: batch_xs, y_: batch_ys}

      if len(batch_xs) == 0:
        continue

      _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)

      # Report accuracy on the current batch every 100 global steps
      if step % 100 == 0:
        stamp = datetime.now().isoformat()
        batch_acc = sess.run(accuracy, {x: batch_xs, y_: batch_ys})
        print("{0} step: {1} accuracy: {2}".format(stamp, step, batch_acc))

      if sv.is_chief:
        summary_writer.add_summary(summary, step)

    if sv.should_stop() or step >= args.steps:
      tf_feed.terminate()

    if not (sv.is_chief and args.export_dir):
      # non-chief workers should wait for chief
      while not sv.should_stop():
        print("Waiting for chief")
        time.sleep(5)
    else:
      print("{0} exporting saved_model to: {1}".format(datetime.now().isoformat(), args.export_dir))
      # Signatures exported with the model: class prediction and hidden-layer features
      predict_signature = {
        'inputs': {'image': x},
        'outputs': {'prediction': prediction},
        'method_name': tf.saved_model.signature_constants.PREDICT_METHOD_NAME
      }
      featurize_signature = {
        'inputs': {'image': x},
        'outputs': {'features': hid},
        'method_name': 'featurize'
      }
      signatures = {
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: predict_signature,
        'featurize': featurize_signature
      }
      TFNode.export_saved_model(sess,
                                args.export_dir,
                                tf.saved_model.tag_constants.SERVING,
                                signatures)

  # Ask for all the services to stop.
  print("{0} stopping supervisor".format(datetime.now().isoformat()))
  sv.stop()
def map_fun(args, ctx):
  """Run one node of a distributed 784-128-10 digit classifier (train or inference).

  A "ps" node starts its server and blocks in ``server.join()``.  A "worker"
  node builds the graph and consumes batches from a ``TFNode.DataFeed``:
  in "train" mode it runs the optimizer and prints accuracy every 100 global
  steps; in any other mode it evaluates each batch and hands the formatted
  label/prediction strings back through ``tf_feed.batch_results``.

  Args:
    args: run options; reads ``batch_size``, ``rdma``, ``model``, ``mode``
      and ``steps``.
    ctx: node context; reads ``worker_num``, ``job_name``, ``task_index``,
      ``cluster_spec`` and ``mgr``, and is passed as-is to ``TFNode`` helpers.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec  # read but not used below

  IMAGE_PIXELS = 28
  is_ps = (job_name == "ps")

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if is_ps:
    time.sleep((worker_num + 1) * 5)

  # Model / batching parameters
  hidden_units = 128
  batch_size = args.batch_size

  # TF cluster and server for this node
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    """Turn [(image, label), ...] into (float32 images scaled by 1/255, uint8 labels)."""
    pairs = [(item[0], item[1]) for item in batch]
    pixels = numpy.array([pair[0] for pair in pairs]).astype(numpy.float32) / 255.0
    targets = numpy.array([pair[1] for pair in pairs]).astype(numpy.uint8)
    return (pixels, targets)

  if is_ps:
    server.join()
    return
  if job_name != "worker":
    return

  num_pixels = IMAGE_PIXELS * IMAGE_PIXELS

  # Ops are assigned to the local worker by default.
  placement = tf.train.replica_device_setter(
      worker_device="/job:worker/task:%d" % task_index,
      cluster=cluster)
  with tf.device(placement):

    # Hidden layer variables
    hid_init = tf.truncated_normal([num_pixels, hidden_units],
                                   stddev=1.0 / IMAGE_PIXELS)
    hid_w = tf.Variable(hid_init, name="hid_w")
    hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
    tf.summary.histogram("hidden_weights", hid_w)

    # Softmax layer variables
    sm_init = tf.truncated_normal([hidden_units, 10],
                                  stddev=1.0 / math.sqrt(hidden_units))
    sm_w = tf.Variable(sm_init, name="sm_w")
    sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
    tf.summary.histogram("softmax_weights", sm_w)

    # Input placeholders
    x = tf.placeholder(tf.float32, [None, num_pixels], name="x")
    y_ = tf.placeholder(tf.float32, [None, 10], name="y_")

    # Image summary of the inputs, viewed as 28x28 single-channel pictures
    x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
    tf.summary.image("x_img", x_img)

    hid = tf.nn.relu(tf.nn.xw_plus_b(x, hid_w, hid_b))

    scores = tf.nn.xw_plus_b(hid, sm_w, sm_b)
    y = tf.nn.softmax(scores)

    global_step = tf.Variable(0)

    # Cross-entropy on clipped probabilities, summed over the batch
    clipped = tf.clip_by_value(y, 1e-10, 1.0)
    loss = -tf.reduce_sum(y_ * tf.log(clipped))
    tf.summary.scalar("loss", loss)

    optimizer = tf.train.AdagradOptimizer(0.01)
    train_op = optimizer.minimize(loss, global_step=global_step)

    # Evaluation ops
    label = tf.argmax(y_, 1, name="label")
    prediction = tf.argmax(y, 1, name="prediction")
    hits = tf.equal(prediction, label)

    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32), name="accuracy")
    tf.summary.scalar("acc", accuracy)

    saver = tf.train.Saver()
    summary_op = tf.summary.merge_all()
    init_op = tf.global_variables_initializer()

  # The supervisor oversees the run and stores model state under logdir
  logdir = TFNode.hdfs_path(ctx, args.model)
  print("tensorflow model path: {0}".format(logdir))
  summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

  training = (args.mode == "train")

  # Shared supervisor options; training additionally passes the explicit
  # init op and turns on periodic checkpointing (every 10 seconds).
  sv_options = dict(is_chief=(task_index == 0),
                    logdir=logdir,
                    summary_op=None,
                    saver=saver,
                    global_step=global_step,
                    stop_grace_secs=300,
                    save_model_secs=0)
  if training:
    sv_options.update(init_op=init_op, save_model_secs=10)
  sv = tf.train.Supervisor(**sv_options)

  # The managed session handles initialization, checkpoint restore and
  # closing when done or on error.
  with sv.managed_session(server.target) as sess:
    print("{0} session ready".format(datetime.now().isoformat()))

    # Run until the supervisor or the feed stops, or args.steps is reached.
    step = 0
    tf_feed = TFNode.DataFeed(ctx.mgr, training)
    while not (sv.should_stop() or tf_feed.should_stop() or step >= args.steps):
      # Asynchronous step; see `tf.train.SyncReplicasOptimizer` for how to
      # perform *synchronous* training instead.
      batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
      feed = {x: batch_xs, y_: batch_ys}

      if len(batch_xs) == 0:
        continue

      if training:
        _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)

        # Report accuracy on the current batch every 100 global steps
        if step % 100 == 0:
          stamp = datetime.now().isoformat()
          batch_acc = sess.run(accuracy, {x: batch_xs, y_: batch_ys})
          print("{0} step: {1} accuracy: {2}".format(stamp, step, batch_acc))

        if sv.is_chief:
          summary_writer.add_summary(summary, step)
      else:
        # Inference: evaluate the batch and return one line per example
        labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

        results = []
        for truth, guess in zip(labels, preds):
          results.append("{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), truth, guess))
        tf_feed.batch_results(results)
        print("acc: {0}".format(acc))

    if sv.should_stop() or step >= args.steps:
      tf_feed.terminate()

  # Ask for all the services to stop.
  print("{0} stopping supervisor".format(datetime.now().isoformat()))
  sv.stop()
Exemple #11
0
def map_fun(args, ctx):
  """Run one node of a distributed two-branch (MLP + CNN) 10-class classifier.

  Each input row carries 519 "peinfo" features (columns 0..518) followed by a
  200-long sequence of integer codes (columns 519..718).  The first part goes
  through two dense layers, the second is one-hot encoded (120 codes) and goes
  through two conv/avg-pool stages; both are concatenated into a softmax layer.

  A "ps" node blocks in ``server.join()``.  A "worker" node consumes batches
  from a ``TFNode.DataFeed`` and, depending on ``args.mode``:
    * "train": runs the optimizer and prints accuracy every 10 global steps;
    * "inference": evaluates the batch and returns label/prediction strings;
    * anything else: predicts only, treating the second tuple element of each
      item as an identifier that is echoed back next to the prediction.

  Args:
    args: run options; reads ``batch_size``, ``model``, ``mode`` and ``steps``.
    ctx: node context; reads ``worker_num``, ``job_name``, ``task_index`` and
      ``mgr``, and is passed as-is to the ``TFNode`` helpers.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Delay PS nodes so that workers start first.
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  batch_size = args.batch_size

  cluster, server = TFNode.start_cluster_server(ctx, 1)

  def feed_dict(batch):
    """Split [(features, label), ...] into (peinfo, one-hot sequence, labels) arrays.

    Returns arrays shaped (n, 519, 1, 1) and (n, 200, 120, 1) plus the labels.
    An empty batch yields empty arrays instead of raising, so the caller's
    ``len(...) > 0`` guard can skip it.
    """
    images = []
    labels = []
    for item in batch:
      images.append(item[0])
      labels.append(item[1])
    ys = numpy.array(labels)

    n = len(images)
    if n == 0:
      return (numpy.zeros((0, 519, 1, 1), dtype=numpy.float64),
              numpy.zeros((0, 200, 120, 1), dtype=numpy.float64),
              ys)

    x_initial = numpy.array(images)
    x_objdump = x_initial[:, 519:719]

    # One-hot encode every sequence position.  Each sample is stored as a
    # (120, 200) matrix [code, position], preallocated up front rather than
    # grown with numpy.append (which re-copies the whole array per sample).
    # NOTE(review): the final reshape to (200, 120) reinterprets this
    # (120, 200) buffer rather than transposing it back; kept as-is so the
    # input layout matches existing checkpoints -- confirm this is intended.
    x_cnn = numpy.zeros((n, 120, 200), dtype=numpy.float64)
    for i in range(n):
      for j in range(200):
        x_cnn[i, int(x_objdump[i, j]), j] = 1.0

    x_peinfo = x_initial[:, 0:519]
    return (x_peinfo.reshape(-1, 519, 1, 1), x_cnn.reshape(-1, 200, 120, 1), ys)

  def conv2d(x, W):
    """Stride-1, SAME-padded 2-D convolution."""
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

  def avg_pool_by_2(x):
    """Average-pool along the height axis with window and stride 2."""
    return tf.nn.avg_pool(x, ksize=[1, 2, 1, 1], strides=[1, 2, 1, 1], padding='SAME')

  def avg_pool_by_100(x):
    """Average-pool along the height axis with window and stride 100."""
    return tf.nn.avg_pool(x, ksize=[1, 100, 1, 1], strides=[1, 100, 1, 1], padding='SAME')

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):
      # Dense branch variables
      W_mlp_1 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_1")
      b_mlp_1 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_1")
      tf.summary.histogram("W_mlp_1", W_mlp_1)
      W_mlp_2 = tf.Variable(tf.truncated_normal([519, 519], stddev=0.1), name="W_mlp_2")
      b_mlp_2 = tf.Variable(tf.constant(0.1, shape=[519]), name="b_mlp_2")
      tf.summary.histogram("W_mlp_2", W_mlp_2)

      # Convolutional branch variables
      W_conv1 = tf.Variable(tf.truncated_normal([3, 120, 1, 3], stddev=0.1), name="W_conv1")
      b_conv1 = tf.Variable(tf.constant(0.1, shape=[3]), name="b_conv1")
      tf.summary.histogram("W_conv1", W_conv1)
      W_conv2 = tf.Variable(tf.truncated_normal([3, 120, 3, 6], stddev=0.1), name="W_conv2")
      b_conv2 = tf.Variable(tf.constant(0.1, shape=[6]), name="b_conv2")
      tf.summary.histogram("W_conv2", W_conv2)

      # Softmax layer over the concatenated branches (519 + 120*6 = 1239)
      sm_w = tf.Variable(tf.truncated_normal([1239, 10], stddev=0.1), name="sm_w")
      sm_b = tf.Variable(tf.constant(0.1, shape=[10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      x_cnn = tf.placeholder(tf.float32, [None, 200, 120, 1], name="x_cnn")
      x_mlp = tf.placeholder(tf.float32, [None, 519, 1, 1], name="x_mlp")
      y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
      tf.summary.image("x_cnn", x_cnn)
      tf.summary.image("x_mlp", x_mlp)

      x_mlp_new = tf.reshape(x_mlp, [-1, 519])
      h_mlp_1 = tf.nn.xw_plus_b(x_mlp_new, W_mlp_1, b_mlp_1)
      h_mlp_2 = tf.nn.xw_plus_b(h_mlp_1, W_mlp_2, b_mlp_2)
      h_conv1 = tf.nn.relu(conv2d(x_cnn, W_conv1) + b_conv1)
      h_pool1 = avg_pool_by_2(h_conv1)
      h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
      h_pool2 = avg_pool_by_100(h_conv2)
      h_conv2_flat = tf.reshape(h_pool2, [-1, 120 * 6])

      h_inter = tf.concat([h_mlp_2, h_conv2_flat], 1)
      logits = tf.nn.xw_plus_b(h_inter, sm_w, sm_b)
      y = tf.nn.softmax(logits)

      global_step = tf.Variable(0)
      # softmax_cross_entropy_with_logits applies softmax itself, so it must
      # receive the raw logits -- feeding it `y` would apply softmax twice.
      loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.001).minimize(
          loss, global_step=global_step)

      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    # Training checkpoints every 10 seconds; other modes never save.
    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        batch_mlp, batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # Report accuracy on the current batch every 10 global steps
            if (step % 10 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x_mlp: batch_mlp, x_cnn: batch_xs, y_: batch_ys})))
            if sv.is_chief:
              summary_writer.add_summary(summary, step)

          elif args.mode == "inference":
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["Label: {0}, Prediction: {1}".format(l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

          else:
            # Predict-only: batch_ys is echoed as an identifier, so it is
            # deliberately not fed to the y_ placeholder here.
            preds = sess.run(prediction, feed_dict={x_mlp: batch_mlp, x_cnn: batch_xs})
            results = ["Sha256: {0}, Prediction: {1}".format(l, p) for l, p in zip(batch_ys, preds)]
            tf_feed.batch_results(results)
            print(results)

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  """Run one node of a distributed softmax regression (4 features, 3 classes).

  A "ps" node blocks in ``server.join()``.  A "worker" node consumes batches
  from a ``TFNode.DataFeed``.  In "train" mode it runs gradient descent and
  prints accuracy every 100 global steps; in any other mode it evaluates each
  batch and returns label/prediction strings through
  ``tf_feed.batch_results``.

  Each feed item is indexed as ``item[0:4]`` (features) and ``item[4]``
  (integer class label, one-hot encoded into 3 classes).

  Args:
    args: run options; reads ``batch_size``, ``rdma``, ``model``, ``mode``
      and ``steps``.
    ctx: node context; reads ``worker_num``, ``job_name``, ``task_index`` and
      ``mgr``, and is passed as-is to the ``TFNode`` helpers.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    # Flat position of each row's "hot" cell: row_start + class index.
    index_offset = numpy.arange(num_labels) * num_classes
    labels_one_hot = numpy.zeros((num_labels, num_classes))
    hot_positions = (index_offset + labels_dense.ravel()).astype(numpy.int32)
    labels_one_hot.flat[hot_positions] = 1
    return labels_one_hot

  def feed_dict(batch):
    """Convert feed items into (float32 features, uint8 one-hot labels)."""
    images = []
    labels = []
    for item in batch:
      images.append(item[0: 4])
      labels.append(item[4])
    xs = numpy.array(images)
    xs = xs.astype(numpy.float32)
    ys = dense_to_one_hot(numpy.array(labels, dtype=numpy.uint), 3)
    ys = ys.astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Inputs
      x = tf.placeholder(tf.float32, [None, 4])

      # Model parameters
      W = tf.Variable(tf.zeros([4, 3]))
      b = tf.Variable(tf.zeros([3]))

      y = tf.nn.softmax(tf.matmul(x, W) + b)
      y_ = tf.placeholder(tf.float32, [None, 3])

      # Cross-entropy loss.  Probabilities are clipped away from 0 so that
      # log() cannot produce -inf/NaN once a class probability underflows.
      cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

      global_step = tf.Variable(0)

      train_op = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy, global_step=global_step)

      # Evaluation ops: index of the largest entry along axis 1 of the
      # one-hot labels and of the predicted probabilities.
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num), graph=tf.get_default_graph())

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    else:
      # Without this branch `sv` would be undefined outside "train" mode and
      # the managed_session call below would raise NameError.  Non-training
      # runs never write checkpoints.
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      print("{0} session ready".format(datetime.now().isoformat()))

      # Loop until the supervisor or the feed stops, or args.steps is reached.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, summary, step = sess.run([train_op, summary_op, global_step], feed_dict=feed)
            # Print accuracy on the current batch every 100 global steps
            # (checkpointing is handled by the supervisor, not here).
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step,
                                                         sess.run(accuracy, {x: batch_xs, y_: batch_ys})))

            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          else:
            # Inference: evaluate the batch and return one line per example
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)
            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p)
                       for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))

      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
Exemple #13
0
def map_fun(args, ctx):
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf
    import time
    import logging
    import cnn_lstm_ctc_ocr
    #import redis_logger_handler
    #redis_logger_handler.logging_setup(args.redis)

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    worker_name = '(worker:%s tf:%s idx:%s)' % (worker_num, job_name,
                                                task_index)

    logging.info(
        '{0} batch_size:{1} initial_learning_rate:{2} decay_steps:{3} decay_rate:{4} momentum:{5}'
        .format(worker_name, args.batch_size, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum))
    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    CHANNELS = 1
    IMAGE_WIDTH = 120
    IMAGE_HEIGHT = 45

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def sparse_tuple_from_label(sequences, dtype=numpy.int32):
        indices = []
        values = []
        for n, seq in enumerate(sequences):
            indices.extend(zip([n] * len(seq), range(len(seq))))
            values.extend(seq)
        indices = numpy.asarray(indices, dtype=numpy.int64)
        values = numpy.asarray(values, dtype=dtype)
        shape = numpy.asarray(
            [len(sequences),
             numpy.asarray(indices).max(0)[1] + 1],
            dtype=numpy.int64)
        return indices, values, shape

    def get_input_lens(sequences):
        lengths = numpy.asarray([58 for s in sequences], dtype=numpy.int64)
        return sequences, lengths

    def placeholder_inputs(image_width, image_height, channels):
        images_placeholder = tf.placeholder(
            tf.float32, [None, image_height, image_width, channels])
        labels_placeholder = tf.sparse_placeholder(tf.int32)
        seqlen_placeholder = tf.placeholder(tf.int32, [None])
        keep_prob = tf.placeholder(tf.float32)
        return images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob

    def format_batch(data_set, batch_size, image_height, image_width,
                     channels):
        batch = data_set.next_batch(batch_size)
        images = []
        labels = []
        for item in batch:
            images.append(item[0])
            labels.append(item[1])
        xs = numpy.array(images)
        # [batch_size, height * width] => [batch_size, height, width, channels]
        xs = xs.reshape(batch_size, image_height, image_width, channels)
        xs = xs.astype(numpy.float32)
        xs = xs / 255.
        ys = labels
        return xs, ys

    def fill_feed_dict(xs,
                       ys,
                       images_pl,
                       labels_pl,
                       seqlen_pl,
                       keep_prob,
                       train=True):
        images_feed, seqlen_feed = get_input_lens(xs)
        labels_feed = sparse_tuple_from_label(ys)
        if train:
            feed_dict = {
                images_pl: images_feed,
                labels_pl: labels_feed,
                seqlen_pl: seqlen_feed,
                keep_prob: 0.5,
            }
        else:
            feed_dict = {
                images_pl: images_feed,
                labels_pl: labels_feed,
                seqlen_pl: seqlen_feed,
                keep_prob: 1,
            }
        return feed_dict

    def do_eval(sess, dense_decoded, lastbatch_err, learning_rate,
                images_placeholder, labels_placeholder, seqlen_placeholder,
                keep_prob, train, xs, ys):
        true_count = 0  # Counts the number of correct predictions.
        feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                   labels_placeholder, seqlen_placeholder,
                                   keep_prob, train)
        dd, lerr, lr = sess.run([dense_decoded, lastbatch_err, learning_rate],
                                feed_dict=feed_dict)
        #accuracy calculation
        for i, origin_label in enumerate(ys):
            decoded_label = [j for j in dd[i] if j != -1]
            if i < 10:
                logging.info('{0} seq {1} => origin:{2} decoded:{3}'.format(
                    worker_name, i, origin_label, decoded_label))
            if origin_label == decoded_label:
                true_count += 1
        #accuracy
        acc = true_count * 1.0 / len(ys)
        #print subsummary
        logging.info(
            "%s accuracy = %.3f, lastbatch_err = %.3f, learning_rate = %.8f" %
            (worker_name, acc, lerr, lr))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):
            # Generate placeholders for the images, labels and seqlens.
            images_placeholder, labels_placeholder, seqlen_placeholder, keep_prob = placeholder_inputs(
                IMAGE_WIDTH, IMAGE_HEIGHT, CHANNELS)
            # Build a Graph that computes predictions from the inference model.
            #images_lp, seqlen_lp, num_features, num_layers, hidden_units
            logits = cnn_lstm_ctc_ocr.inference(images_placeholder,
                                                seqlen_placeholder, keep_prob,
                                                args.hidden_units, args.mode,
                                                args.batch_size)
            # Add to the Graph the Ops for loss calculation.
            #logits, labels_lp, seqlen_lp
            loss = cnn_lstm_ctc_ocr.loss(logits, labels_placeholder,
                                         seqlen_placeholder)
            tf.summary.scalar('loss', loss)
            # global counter
            global_step = tf.Variable(0, name='global_step', trainable=False)
            # Add to the Graph the Ops that calculate and apply gradients.
            #loss, initial_learning_rate, decay_steps, decay_rate, momentum
            train_op, learning_rate = cnn_lstm_ctc_ocr.training(
                loss, global_step, args.initial_learning_rate,
                args.decay_steps, args.decay_rate, args.momentum)
            # Add the Op to compare the logits to the labels during evaluation.
            dense_decoded, lerr = cnn_lstm_ctc_ocr.evaluation(
                logits, labels_placeholder, seqlen_placeholder)
            tf.summary.scalar('lerr', lerr)

            summary_op = tf.summary.merge_all()
            # Add the variable initializer Op.
            init_op = tf.global_variables_initializer()
            # Create a saver for writing training checkpoints.
            saver = tf.train.Saver()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        logging.info("{0} tensorflow model path: {1}".format(
            worker_name, logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=60)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        validation_xs = None
        validation_ys = None
        validation_batchs = 10
        with sv.managed_session(server.target) as sess:
            logging.info("{0} session ready".format(worker_name))
            # Loop until the supervisor shuts down or 1000000 steps have completed.
            g_step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
            # for do_eval samples
            if None == validation_xs or None == validation_ys:
                validation_xs, validation_ys = format_batch(
                    tf_feed, args.batch_size * validation_batchs, IMAGE_HEIGHT,
                    IMAGE_WIDTH, CHANNELS)
            while not sv.should_stop() and not tf_feed.should_stop(
            ) and g_step < (args.steps * args.epochs - validation_batchs):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                start_time = time.time()
                # using feed_dict
                xs, ys = format_batch(tf_feed, args.batch_size, IMAGE_HEIGHT,
                                      IMAGE_WIDTH, CHANNELS)
                feed_dict = fill_feed_dict(xs, ys, images_placeholder,
                                           labels_placeholder,
                                           seqlen_placeholder, keep_prob,
                                           args.mode == "train")
                # Run one step of the model.  The return values are the activations
                # from the `train_op` (which is discarded) and the `loss` Op.  To
                # inspect the values of your Ops or variables, you may include them
                # in the list passed to sess.run() and the value tensors will be
                # returned in the tuple from the call.
                _, loss_value, g_step = sess.run([train_op, loss, global_step],
                                                 feed_dict=feed_dict)
                duration = time.time() - start_time
                if g_step % 20 == 0:
                    # Print status to stdout.
                    logging.info(
                        '%s [g_step:%d epoch:%d/%d step:%d/%d] loss = %.2f (%.3f sec)'
                        % (worker_name, g_step, g_step / args.steps,
                           args.epochs, g_step % args.steps, args.steps,
                           loss_value, duration))
                # Write the summaries and print an overview fairly often.
                if g_step % 100 == 0:
                    # Update the events file.
                    if sv.is_chief:
                        summary = sess.run(summary_op, feed_dict=feed_dict)
                        summary_writer.add_summary(summary, g_step)
                        summary_writer.flush()

                # Save a checkpoint and evaluate the model periodically.
                if (g_step + 1) % 500 == 0 or (g_step + 1) == args.steps:
                    # Evaluate against the validation set.
                    logging.info('{0} ---- Validation Data Eval: ----'.format(
                        worker_name))
                    do_eval(sess, dense_decoded, lerr, learning_rate,
                            images_placeholder, labels_placeholder,
                            seqlen_placeholder, keep_prob,
                            args.mode == "train", validation_xs, validation_ys)

            if sv.should_stop() or g_step >= (args.steps * args.epochs -
                                              validation_batchs):
                logging.info("{0} terminating tf_feed".format(worker_name))
                tf_feed.terminate()

        # Ask for all the services to stop.
        logging.info("{0} stopping supervisor".format(worker_name))
        sv.stop()
Exemple #14
0
def test_fun(args, ctx):
    """Run one node of a distributed LSTM next-symbol model (work in progress).

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    two-layer LSTM graph under a replica device setter, create a
    ``tf.train.Supervisor`` and, in "train" mode, run one optimizer step per
    loop iteration until the supervisor stops or the global step reaches
    ``args.steps``. Inference is not implemented yet.

    Args:
        args: Job arguments. This function reads ``rdma``, ``mode``,
            ``output`` and ``steps``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``worker_num``, ``job_name`` and ``task_index``.
    """
    # Dependencies (imported here so they resolve on the executor)
    from tensorflowonspark import TFNode
    from datetime import datetime

    import numpy
    import random
    import tensorflow as tf
    import time

    from tensorflow.contrib import rnn

    # Used for TensorBoard logdir
    from hops import tensorboard

    # Extract configuration
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Get TF cluster/server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    # Parameters
    display_iter = 1000
    learning_rate = 0.0001
    n_input = 3
    n_hidden = 512

    def RNN(x, weights, biases, n_input, n_hidden):
        """Return output-layer logits computed from the last LSTM output."""
        # Flatten to [-1, n_input], then split into n_input single-column tensors
        # (eg. [had] [a] [general] -> [20] [6] [33])
        x = tf.reshape(x, [-1, n_input])
        x = tf.split(x, n_input, 1)
        rnn_cell = rnn.MultiRNNCell(
            [rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden)])

        # Generate prediction
        outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

        # There are n_input outputs but we only want the last output
        return tf.matmul(outputs[-1], weights['out']) + biases['out']

    def get_loss_fn(logits, labels):
        """Return the mean softmax cross-entropy between logits and labels."""
        return tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=labels))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):

            # TODO Set up vocab_size by loading in dataset and parsing through it?
            # NOTE(review): `dictionary` is still empty, so every lookup in the
            # training loop below raises KeyError until it is populated.
            dictionary = {}
            vocab_size = 32

            # RNN output node weights and biases
            hidden_weights = tf.Variable(
                tf.random_normal([n_hidden, vocab_size]),
                name="hidden_weights")
            hidden_biases = tf.Variable(tf.random_normal([vocab_size]),
                                        name="hidden_biases")
            weights = {'out': hidden_weights}
            biases = {'out': hidden_biases}

            # Graph input placeholders
            x = tf.placeholder("float", [None, n_input, 1])
            y = tf.placeholder("float", [None, vocab_size])

            # Set up TFOS
            global_step = tf.Variable(0)

            pred = RNN(x, weights, biases, n_input, n_hidden)
            cost = get_loss_fn(logits=pred, labels=y)
            # Note that the global_step is passed in to the optimizer's min. function
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) \
                .minimize(loss=cost, global_step=global_step)

            # Model evaluation
            correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            # TF summaries
            tf.summary.scalar("cost", cost)
            tf.summary.histogram("hidden_weights", hidden_weights)
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

            # Create a "supervisor", which oversees the training process and
            # stores model state under the TensorBoard log directory
            logdir = tensorboard.logdir()
            print("tensorflow model path: {0}".format(logdir))

            # Only the chief worker writes summaries
            if task_index == 0:
                summary_writer = tf.summary.FileWriter(
                    logdir, graph=tf.get_default_graph())

            sv_kwargs = dict(is_chief=(task_index == 0),
                             logdir=logdir,
                             summary_op=None,
                             saver=saver,
                             global_step=global_step,
                             stop_grace_secs=300)
            if args.mode == "train":
                sv = tf.train.Supervisor(init_op=init_op,
                                         summary_writer=None,
                                         save_model_secs=10,
                                         **sv_kwargs)
            else:
                sv = tf.train.Supervisor(save_model_secs=0, **sv_kwargs)

            # Configure output path on HDFS
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open(
                "{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))
            step = 0
            offset = random.randint(0, n_input + 1)
            end_offset = n_input + 1

            # Loop until supervisor shuts down or args.steps have completed
            while not sv.should_stop() and step < args.steps:
                if args.mode != "train":
                    # Inference is not implemented yet. Nothing in this branch
                    # advances `step`, so leave the loop instead of spinning
                    # forever.
                    break

                # Run a training step asynchronously
                # See `tf.train.SyncReplicasOptimizer` for additional details
                # on how to perform *synchronous* training.

                # NOTE(review): `training_data` is not defined inside this
                # function; it must be supplied by an enclosing/module scope
                # - confirm where it is meant to come from.
                if offset > (len(training_data) - end_offset):
                    offset = random.randint(0, n_input + 1)

                symbols_in_keys = [
                    [dictionary[str(training_data[i])]]
                    for i in range(offset, offset + n_input)
                ]
                # Only `numpy` is imported in this function (there is no `np`
                # alias in scope).
                symbols_in_keys = numpy.reshape(numpy.array(symbols_in_keys),
                                                [-1, n_input, 1])

                symbols_out_onehot = numpy.zeros([vocab_size], dtype=float)
                symbols_out_onehot[
                    dictionary[str(training_data[offset + n_input])]] = 1.0
                symbols_out_onehot = numpy.reshape(symbols_out_onehot,
                                                   [1, -1])

                # Run iteration and refresh 'step' from the global step
                _, summary, acc, step = sess.run(
                    [optimizer, summary_op, accuracy, global_step],
                    feed_dict={x: symbols_in_keys, y: symbols_out_onehot})

                if ((step + 1) % display_iter) == 0:
                    # Report the accuracy already computed for this step;
                    # `accuracy` depends on the placeholders and cannot be
                    # evaluated without a feed_dict.
                    print("{0} step: {1} accuracy: {2}".format(
                        datetime.now().isoformat(),
                        step,
                        acc))

                offset += (n_input + 1)

                if sv.is_chief:
                    summary_writer.add_summary(summary, step)

            # The part file is opened in every mode, so always release it.
            output_file.close()

            # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
            # run inference and request stop before the other workers even start/sync their sessions.
            if task_index == 0:
                time.sleep(60)

            # Ask for all the services to stop.
            print("{0} stopping supervisor".format(datetime.now().isoformat()))
            sv.stop()
Exemple #15
0
def main_fun(args, ctx):
    """Train the project's ``Model`` on one TensorFlowOnSpark cluster node.

    Parameter-server nodes block in ``server.join()``. Every other node builds
    the model under a replica device setter, then repeatedly pulls batches
    from the Spark data feed and runs ``model.train_op`` until the supervisor
    or the feed asks to stop, or the global step reaches ``args.steps``.

    Args:
        args: Job arguments. This function reads ``rdma``, ``save_dir``,
            ``batch_size`` and ``steps``; the whole object is also passed to
            ``Model``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``worker_num``, ``job_name``, ``task_index``, ``defaultFS``,
            ``working_dir`` and ``mgr``.
    """
    import tensorflow as tf
    import time
    from model import Model
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy as np

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    if job_name == "ps":
        server.join()
    else:
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % task_index,
                cluster=cluster)):
            model = Model(args)
            # instrument for tensorboard
            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        logdir = TFNode.hdfs_path(args.save_dir, ctx.defaultFS,
                                  ctx.working_dir)
        print("tensorflow model path: {0}".format(logdir))

        summary_writer = TFNode.get_summary_writer(ctx)

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=model.global_step,
                                 stop_grace_secs=300,
                                 save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            state = sess.run(model.initial_state)

            # Loop until the supervisor or the feed stops, or args.steps
            # global steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, True)
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and step < args.steps):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                batch = tf_feed.next_batch(args.batch_size)
                batch_xs = np.asarray([data[0] for data in batch])
                batch_ys = np.asarray([data[1] for data in batch])

                if len(batch_xs) == 0:
                    # Nothing to train on and therefore no fresh summary to
                    # write; go back and re-check the stop conditions.
                    continue

                feed = {model.input_data: batch_xs, model.targets: batch_ys}
                # Carry the recurrent state over from the previous step.
                for i, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h

                # instrument for tensorboard
                summ, train_loss, state, _, step = sess.run(
                    [summary_op, model.cost, model.final_state,
                     model.train_op, model.global_step],
                    feed_dict=feed)

                # print loss
                print("Step: {}, train_loss: {}".format(step, train_loss))

                # Only write a summary produced by this very step; `summ` does
                # not exist (or is stale) when the batch was empty.
                if sv.is_chief:
                    summary_writer.add_summary(summ, step)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
    """Run one node of a distributed CNN classifier under TensorFlowOnSpark.

    Parameter-server nodes block in ``server.join()``. Worker nodes build a
    one-convolution-layer network under a replica device setter and then pull
    batches from the Spark data feed. ``args.mode`` selects the behaviour:

    * ``"train"``   - initialise variables with ``init_op`` and train.
    * ``"retrain"`` - train without passing ``init_op`` to the supervisor.
    * ``"test"``    - report label/prediction pairs and batch accuracy.
    * anything else is treated as inference: the label placeholder is
      ``[None, 4]`` and its fed values are echoed next to each prediction.

    Args:
        args: Job arguments. This function reads ``dropout``, ``batch_size``,
            ``learning_rate``, ``rdma``, ``mode``, ``model`` and ``steps``.
        ctx: TensorFlowOnSpark node context. This function reads
            ``worker_num``, ``job_name``, ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import numpy
    import tensorflow as tf
    from tensorflow.contrib.layers.python.layers import batch_norm
    import time

    worker_num = ctx.worker_num  # used below to stagger PS start-up
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Image side length. The MNIST original used 28x28x1; adjust these three
    # values to match your own data.
    IMAGE_PIXELS = 2
    channels = 3
    num_class = 2
    # Fed to the `keep` placeholder, i.e. the keep probability of tf.nn.dropout
    dropout = args.dropout
    batch_size = args.batch_size  # samples per training batch
    INITIAL_LEARNING_RATE = args.learning_rate
    training = args.mode == "train" or args.mode == "retrain"

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def feed_dict(batch):
        """Convert ``[(image, label), ...]`` into a pair of numpy arrays.

        Images become float32 and are min-max normalised per row; labels
        become uint16 in inference mode and uint8 otherwise. Outside inference
        mode ``batch`` is shuffled in place first. An empty batch yields two
        empty arrays.
        """
        label_dtype = (numpy.uint16 if args.mode == 'inference'
                       else numpy.uint8)
        if len(batch) == 0:
            # A per-row max/min over axis 1 raises on an empty array, which
            # would bypass the caller's `len(batch_xs) > 0` guard.
            return (numpy.array([], dtype=numpy.float32),
                    numpy.array([], dtype=label_dtype))

        if args.mode != 'inference':
            numpy.random.shuffle(batch)
        images = [item[0] for item in batch]
        labels = [item[1] for item in batch]

        xs = numpy.array(images).astype(numpy.float32)
        # Min-max normalisation per sample (row).
        row_min = xs.min(axis=1, keepdims=True)
        row_range = xs.max(axis=1, keepdims=True) - row_min
        # A constant row has max == min; dividing by 0 would feed NaN into the
        # graph, so map such rows to all zeros instead.
        row_range[row_range == 0] = 1.0
        xs = (xs - row_min) / row_range

        ys = numpy.array(labels).astype(label_dtype)
        return (xs, ys)

    def batch_norm_layer(inputT, is_training=True, scope=None):
        """Apply batch norm + ReLU, switching mode on a boolean tensor.

        Not referenced by the graph built below at the moment.
        """
        # Note: is_training is tf.placeholder(tf.bool) type
        return tf.cond(is_training,
                       lambda: batch_norm(inputT,
                                          is_training=True,
                                          center=True,
                                          scale=True,
                                          activation_fn=tf.nn.relu,
                                          decay=0.9,
                                          scope=scope),
                       lambda: batch_norm(inputT,
                                          is_training=False,
                                          center=True,
                                          scale=True,
                                          activation_fn=tf.nn.relu,
                                          decay=0.9,
                                          scope=scope))

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Create some wrappers for simplicity
            def conv2d(x, W, b, strides=1):
                """Conv2D wrapper, with bias and relu activation."""
                # With strides=1 the window visits every x/y position
                x = tf.nn.conv2d(x,
                                 W,
                                 strides=[1, strides, strides, 1],
                                 padding='SAME')
                x = tf.nn.bias_add(x, b)
                return tf.nn.relu(x)

            def maxpool2d(x, k=2):
                """MaxPool2D wrapper with a k x k window and stride k."""
                return tf.nn.max_pool(x,
                                      ksize=[1, k, k, 1],
                                      strides=[1, k, k, 1],
                                      padding='SAME')

            # Store layers weight & bias.
            # NOTE: keep the creation order of these variables unchanged; the
            # unnamed tf.Variable entries get auto-generated names that end up
            # in checkpoints.
            weights = {
                # 3x3 conv, `channels` inputs, 128 outputs
                'wc1':
                tf.get_variable('wc1', [3, 3, channels, 128],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),

                # 3x3 conv, 32 inputs, 64 outputs. Not used by the graph below
                # (the second conv layer is disabled); kept so the set of
                # saved variables stays the same.
                'wc2':
                tf.get_variable('wc2', [3, 3, 32, 64],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),

                # fully connected, (IMAGE_PIXELS // 2)^2 * 128 inputs, 1024 outputs
                'wd1':
                tf.Variable(
                    tf.random_normal([
                        (IMAGE_PIXELS // 2) * (IMAGE_PIXELS // 2) * 128, 1024
                    ])),
                # 1024 inputs, num_class outputs (class prediction)
                'out':
                tf.Variable(tf.random_normal([1024, num_class]))
            }

            biases = {
                'bc1':
                tf.get_variable('bc1', [128],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),
                # Unused, see 'wc2' above.
                'bc2':
                tf.get_variable('bc2', [64],
                                dtype=tf.float32,
                                initializer=tf.truncated_normal_initializer,
                                regularizer=tf.nn.l2_loss),
                'bd1':
                tf.Variable(tf.random_normal([1024])),
                'out':
                tf.Variable(tf.random_normal([num_class]))
            }

            # Placeholders for input data
            x = tf.placeholder(tf.float32,
                               [None, IMAGE_PIXELS * IMAGE_PIXELS * channels],
                               name="x")
            if args.mode != 'inference':
                y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")
            else:
                # In inference mode the fed "labels" are passed through as-is
                # and reported next to each prediction.
                y_ = tf.placeholder(tf.float32, [None, 4], name="y_")
                label = y_
            keep = tf.placeholder(tf.float32)
            is_training = tf.placeholder(tf.bool, name='MODE')

            x_img = tf.reshape(x,
                               [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])

            # Local response normalisation on the raw input
            x_img = tf.nn.lrn(x_img,
                              depth_radius=5,
                              bias=2.0,
                              alpha=1e-3,
                              beta=0.75)

            # Convolutional model: conv -> max-pool -> LRN -> FC -> dropout -> FC
            conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
            conv1 = maxpool2d(conv1, k=2)
            conv1 = tf.nn.lrn(conv1,
                              depth_radius=5,
                              bias=2.0,
                              alpha=1e-3,
                              beta=0.75)
            fc1 = tf.reshape(conv1,
                             [-1, weights['wd1'].get_shape().as_list()[0]])
            fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
            fc1 = tf.nn.relu(fc1)
            fc1 = tf.nn.dropout(fc1, keep)
            y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])
            prediction = tf.argmax(y, 1, name="prediction")

            global_step = tf.Variable(0, name="global_step", trainable=False)

            if args.mode != 'inference':
                loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                            logits=y))

                learning_rate = tf.train.polynomial_decay(
                    INITIAL_LEARNING_RATE,
                    global_step,
                    3000000,
                    end_learning_rate=1e-5,
                    power=0.8,
                    cycle=True)

                train_op = tf.train.GradientDescentOptimizer(
                    learning_rate).minimize(loss, global_step=global_step)

                # Test trained model
                label = tf.argmax(y_, 1, name="label")
                correct_prediction = tf.equal(prediction, label)

                accuracy = tf.reduce_mean(tf.cast(correct_prediction,
                                                  tf.float32),
                                          name="accuracy")

            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))

        # Checkpoints are saved every 10 s while training and never otherwise.
        sv_kwargs = dict(is_chief=(task_index == 0),
                         logdir=logdir,
                         saver=saver,
                         global_step=global_step,
                         stop_grace_secs=300,
                         save_model_secs=10 if training else 0)
        if args.mode == "train":
            # Only a fresh training run passes the explicit initializer.
            sv_kwargs["init_op"] = init_op
        sv = tf.train.Supervisor(**sv_kwargs)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor or the feed stops, or args.steps
            # global steps have completed.
            step = 0
            tf_feed = TFNode.DataFeed(ctx.mgr, training)
            while (not sv.should_stop() and not tf_feed.should_stop()
                   and step < args.steps):
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.
                batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
                if len(batch_xs) == 0:
                    continue

                # Evaluation feed: dropout disabled (keep probability 1).
                eval_feed = {
                    x: batch_xs,
                    y_: batch_ys,
                    keep: 1.,
                    is_training: False
                }

                if training:
                    train_feed = {
                        x: batch_xs,
                        y_: batch_ys,
                        keep: dropout,
                        is_training: True
                    }
                    _, step = sess.run([train_op, global_step],
                                       feed_dict=train_feed)

                    # print accuracy on the current batch every 100 steps
                    if step % 100 == 0:
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step,
                            sess.run(accuracy, eval_feed)))
                elif args.mode == 'test':
                    labels, preds, acc = sess.run(
                        [label, prediction, accuracy], feed_dict=eval_feed)
                    results = [
                        "{0} Label: {1}, Prediction: {2}".format(
                            datetime.now().isoformat(), l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))
                else:  # args.mode == "inference"
                    labels, preds = sess.run([label, prediction],
                                             feed_dict=eval_feed)
                    results = [
                        "Label: {0}, Prediction: {1}".format(l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
def map_fun(args, ctx):
    """Run one TensorFlowOnSpark node of a distributed MNIST MLP fed by Spark.

    A "ps" node blocks in ``server.join()``.  A "worker" node builds a
    one-hidden-layer softmax classifier whose input is a ``tf.data`` pipeline
    backed by ``TFNode.DataFeed``.  With ``args.mode == "train"`` it runs
    training steps; with any other mode it pushes one "Label/Prediction"
    string per example back through the feed.

    Args:
        args: options object; this function reads ``rdma``, ``mode``,
            ``batch_size``, ``model`` and ``steps``.
        ctx: node context; this function reads ``worker_num``, ``job_name``,
            ``task_index`` and ``mgr``.
    """
    from tensorflowonspark import TFNode
    from datetime import datetime
    import math
    import numpy
    import tensorflow as tf

    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Parameters
    IMAGE_PIXELS = 28
    hidden_units = 128

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    # Create generator for Spark data feed
    tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")

    def rdd_generator():
        """Yield one (image, label) pair per record pulled from the feed."""
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            # Indexing [0] on an empty batch would raise IndexError inside
            # the dataset generator, so stop cleanly instead.
            # NOTE(review): presumably an empty batch marks the end of the
            # feed -- confirm against the TFNode.DataFeed documentation.
            if len(batch) == 0:
                return
            row = batch[0]
            image = numpy.array(row[0]).astype(numpy.float32) / 255.0
            label = numpy.array(row[1]).astype(numpy.int64)
            yield (image, label)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":

        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Dataset for input data
            ds = tf.data.Dataset.from_generator(
                rdd_generator, (tf.float32, tf.float32),
                (tf.TensorShape([IMAGE_PIXELS * IMAGE_PIXELS]),
                 tf.TensorShape([10]))).batch(args.batch_size)
            iterator = ds.make_one_shot_iterator()
            x, y_ = iterator.get_next()

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Cross-entropy on clipped probabilities to avoid log(0).
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)

            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)

            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % worker_num,
                                               graph=tf.get_default_graph())

        if args.mode == "train":
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     init_op=init_op,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=10)
        else:
            sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                     logdir=logdir,
                                     summary_op=None,
                                     saver=saver,
                                     global_step=global_step,
                                     stop_grace_secs=300,
                                     save_model_secs=0)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))

            # Loop until the supervisor or the feed asks to stop, or until
            # args.steps steps have completed.
            step = 0
            while not sv.should_stop() and not tf_feed.should_stop(
            ) and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                if args.mode == "train":
                    # Fetch accuracy in the same run as the training step:
                    # x and y_ come from iterator.get_next(), so a separate
                    # sess.run(accuracy) would pull (and discard) one more
                    # batch from the feed.
                    _, summary, step, acc = sess.run(
                        [train_op, summary_op, global_step, accuracy])
                    # print accuracy every 100 steps
                    if step % 100 == 0:
                        print("{0} step: {1} accuracy: {2}".format(
                            datetime.now().isoformat(), step, acc))

                    if sv.is_chief:
                        summary_writer.add_summary(summary, step)
                else:  # args.mode == "inference"
                    labels, preds, acc = sess.run(
                        [label, prediction, accuracy])

                    results = [
                        "{0} Label: {1}, Prediction: {2}".format(
                            datetime.now().isoformat(), l, p)
                        for l, p in zip(labels, preds)
                    ]
                    tf_feed.batch_results(results)
                    print("acc: {0}".format(acc))

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Exemple #18
0
def map_fun(args, ctx):
    """Run one TensorFlowOnSpark node that trains an MNIST MLP from TFRecords.

    A "ps" node blocks in ``server.join()``.  A "worker" node reads its shard
    of the ``part-*`` TFRecord files under ``args.tfrecord_dir`` and runs
    training steps until the supervisor stops or ``args.steps`` is reached.

    Relies on module-level names not defined here: ``TFNode``, ``tf``,
    ``os``, ``math``, ``datetime``, ``IMAGE_PIXELS`` and ``hidden_units``.

    Args:
        args: options object; this function reads ``cluster_size``,
            ``driver_ps_nodes``, ``num_ps``, ``protocol``, ``tfrecord_dir``,
            ``epochs``, ``shuffle_size``, ``batch_size``, ``model_dir`` and
            ``steps``.
        ctx: node context; this function reads ``worker_num``, ``job_name``
            and ``task_index``.
    """
    # Number of shards the dataset is split into.
    if args.driver_ps_nodes:
        num_workers = args.cluster_size
    else:
        num_workers = args.cluster_size - args.num_ps
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1,
                                                  args.protocol == 'rdma')

    def _parse_tfr(example_proto):
        """Decode one serialized example into float (image, label) tensors."""
        feature_def = {
            "label": tf.FixedLenFeature(10, tf.int64),
            "image": tf.FixedLenFeature(IMAGE_PIXELS * IMAGE_PIXELS, tf.int64)
        }
        features = tf.parse_single_example(example_proto, feature_def)
        # Scale pixel values by 1/255; a scalar divisor broadcasts over the
        # image vector regardless of IMAGE_PIXELS.
        image = tf.div(tf.to_float(features['image']), 255.0)
        label = tf.to_float(features['label'])
        return (image, label)

    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # read from saved tf records
            images = TFNode.hdfs_path(ctx, args.tfrecord_dir)
            tfr_files = tf.gfile.Glob(os.path.join(images, 'part-*'))
            ds = tf.data.TFRecordDataset(tfr_files)
            ds = ds.shard(num_workers, task_index).repeat(args.epochs).shuffle(
                args.shuffle_size)
            ds = ds.map(_parse_tfr).batch(args.batch_size)
            iterator = ds.make_initializable_iterator()
            x, y_ = iterator.get_next()

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            # Cross-entropy on clipped probabilities to avoid log(0).
            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = TFNode.hdfs_path(ctx, args.model_dir)
        print("tensorflow model path: {0}".format(logdir))
        summary_writer = tf.summary.FileWriter("tensorboard_%d" % (worker_num),
                                               graph=tf.get_default_graph())

        sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                 logdir=logdir,
                                 init_op=init_op,
                                 summary_op=None,
                                 saver=saver,
                                 global_step=global_step,
                                 stop_grace_secs=300,
                                 save_model_secs=10)

        # The supervisor takes care of session initialization, restoring from
        # a checkpoint, and closing when done or an error occurs.
        with sv.managed_session(server.target) as sess:
            print("{0} session ready".format(datetime.now().isoformat()))
            sess.run(iterator.initializer)

            # Loop until the supervisor shuts down or args.steps steps have
            # completed.
            step = 0
            while not sv.should_stop() and step < args.steps:
                # Run a training step asynchronously.
                # See `tf.train.SyncReplicasOptimizer` for additional details on how to
                # perform *synchronous* training.

                # Accuracy is fetched together with the training step: x and
                # y_ come from iterator.get_next(), so a separate
                # sess.run(accuracy) would consume a batch that is never
                # trained on.
                _, summary, step, acc = sess.run(
                    [train_op, summary_op, global_step, accuracy])
                if step % 100 == 0:
                    print("{0} step: {1} accuracy: {2}".format(
                        datetime.now().isoformat(), step, acc))
                if sv.is_chief:
                    summary_writer.add_summary(summary, step)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Exemple #19
0
def map_fun(args, ctx):
  """Run one TensorFlowOnSpark node of a small two-layer CNN classifier.

  A "ps" node sleeps briefly and then blocks in ``server.join()``.  A
  "worker" node builds a conv/pool x2 + dense network over
  ``IMAGE_PIXELS x IMAGE_PIXELS x channels`` inputs fed through placeholders
  from ``TFNode.DataFeed``.  With ``args.mode == "train"`` it runs training
  steps; with any other mode it pushes one "Label/Prediction" string per
  example back through the feed.

  Relies on the module-level names ``log`` and ``logging``, which are not
  imported here.

  Args:
    args: options object; this function reads ``batch_size``, ``rdma``,
      ``mode``, ``model`` and ``steps``.
    ctx: node context; this function reads ``worker_num``, ``job_name``,
      ``task_index`` and ``mgr``.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import math
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Image side length (MNIST would be 28x28x1); adjust to your own images.
  IMAGE_PIXELS = 10
  channels = 4
  num_class = 2

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Number of samples per training batch.
  batch_size = args.batch_size

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    """Convert [(image, label), ...] into (float32 images / 255, uint8 labels)."""
    images = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    xs = numpy.array(images).astype(numpy.float32) / 255.0
    ys = numpy.array(labels).astype(numpy.uint8)
    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # ---- Model definition (replace with your own model if needed) ----

      def conv2d(x, W, b, strides=1):
        """Conv2D with SAME padding, bias add and ReLU."""
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        """k x k max-pool with stride k and SAME padding."""
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')

      # Side length after the two SAME-padded 2x2 pools below: each pool
      # halves the side, rounding up.  (1 + IMAGE_PIXELS // 4 gives the same
      # value for 10 but over-counts for sizes divisible by 4, e.g. 28 -> 8
      # instead of 7.)
      pooled_side = int(math.ceil(math.ceil(IMAGE_PIXELS / 2.0) / 2.0))

      # Layer weights and biases.  Creation order is kept stable because the
      # variables are unnamed and get auto-generated names.
      weights = {
        # 5x5 conv, `channels` inputs, 32 outputs
        'wc1': tf.Variable(tf.random_normal([5, 5, channels, 32])),
        # 5x5 conv, 32 inputs, 64 outputs
        'wc2': tf.Variable(tf.random_normal([5, 5, 32, 64])),
        # fully connected, pooled_side*pooled_side*64 inputs, 1024 outputs
        'wd1': tf.Variable(tf.random_normal([pooled_side * pooled_side * 64, 1024])),
        # 1024 inputs, num_class outputs (class prediction)
        'out': tf.Variable(tf.random_normal([1024, num_class]))
      }

      biases = {
        'bc1': tf.Variable(tf.random_normal([32])),
        'bc2': tf.Variable(tf.random_normal([64])),
        'bd1': tf.Variable(tf.random_normal([1024])),
        'out': tf.Variable(tf.random_normal([num_class]))
      }

      # Placeholders for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])

      # Convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      fc1 = tf.reshape(conv2, [-1, weights['wd1'].get_shape().as_list()[0]])
      fc1 = tf.add(tf.matmul(fc1, weights['wd1']), biases['bd1'])
      fc1 = tf.nn.relu(fc1)
      if args.mode == "train":
        # Dropout only while training.
        fc1 = tf.nn.dropout(fc1, 0.7)
      y = tf.add(tf.matmul(fc1, weights['out']), biases['out'])

      global_step = tf.Variable(0, name="global_step", trainable=False)

      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

      saver = tf.train.Saver()
      init_op = tf.global_variables_initializer()

      # ---- End of model definition ----

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    log.info("tensorflow model path: {0}".format(logdir))

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
      logging.basicConfig(level=logging.INFO)

      print("{0} session ready".format(datetime.now().isoformat()))
      log.info("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor or the feed asks to stop, or until
      # args.steps steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, args.mode == "train")
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if args.mode == "train":
            _, step = sess.run([train_op, global_step], feed_dict=feed)
            # Report accuracy every 100 steps.  Evaluate it once and reuse
            # the value: two separate runs cost twice as much and, with
            # dropout in the graph, would report two different numbers.
            if step % 100 == 0:
              acc = sess.run(accuracy, feed_dict=feed)
              message = "{0} step: {1} accuracy: {2}".format(
                  datetime.now().isoformat(), step, acc)
              print(message)
              log.info(message)
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
            log.info("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    log.info("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
Exemple #20
0
def map_fun(args, ctx):
  """Run one TensorFlowOnSpark node of a small fully-convolutional classifier.

  A "ps" node sleeps briefly and then blocks in ``server.join()``.  A
  "worker" node builds three conv layers over
  ``IMAGE_PIXELS x IMAGE_PIXELS x channels`` inputs fed through placeholders
  from ``TFNode.DataFeed``.  Modes "train" and "retrain" run training steps
  (they differ only in the Supervisor arguments); any other mode pushes one
  "Label/Prediction" string per example back through the feed.

  Args:
    args: options object; this function reads ``rdma``, ``mode``, ``model``
      and ``steps``.  The batch size is fixed at 200 here and does not come
      from ``args``.
    ctx: node context; this function reads ``worker_num``, ``job_name``,
      ``task_index`` and ``mgr``.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import numpy
  import tensorflow as tf
  import time

  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  # Image side length (MNIST would be 28x28x1); adjust to your own images.
  IMAGE_PIXELS = 10
  channels = 3
  num_class = 2
  dropout = 0.5  # keep probability passed to tf.nn.dropout

  learning_rate = 1e-6
  batch_size = 200
  is_training = args.mode == "train" or args.mode == "retrain"

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def feed_dict(batch):
    """Convert [(image, label), ...] into (min-max scaled float32 images, uint8 labels).

    The batch list is shuffled in place before conversion.  Each image row
    is scaled to [0, 1] by its own minimum and maximum.
    """
    numpy.random.shuffle(batch)  # random in-place shuffle
    images = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    xs = numpy.array(images).astype(numpy.float32)
    ys = numpy.array(labels).astype(numpy.uint8)

    # An empty batch yields a 1-D empty array, on which the per-row
    # reductions below would fail; hand it back so the caller's length
    # check can skip it.
    if len(xs) == 0:
      return (xs, ys)

    # Per-row min-max normalization.
    rows = numpy.shape(xs)[0]
    row_max = numpy.reshape(numpy.max(xs, 1), [rows, 1])
    row_min = numpy.reshape(numpy.min(xs, 1), [rows, 1])
    span = row_max - row_min
    # A constant row has zero span; dividing by it would produce NaN.
    # Use 1 so such rows scale to all zeros.
    span[span == 0] = 1.0
    xs = (xs - row_min) / span

    return (xs, ys)

  if job_name == "ps":
    server.join()
  elif job_name == "worker":

    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      def conv2d(x, W, b, strides=1):
        """Conv2D with SAME padding, bias add and ReLU."""
        x = tf.nn.conv2d(x, W, strides=[1, strides, strides, 1], padding='SAME')
        x = tf.nn.bias_add(x, b)
        return tf.nn.relu(x)

      def maxpool2d(x, k=2):
        """k x k max-pool with stride k and SAME padding."""
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='SAME')

      def maxpool2d2(x, k=2):
        """k x k max-pool with stride k and VALID padding."""
        return tf.nn.max_pool(x, ksize=[1, k, k, 1], strides=[1, k, k, 1],
                              padding='VALID')

      # Layer weights and biases.
      weights = {
          # 3x3 conv, `channels` inputs, 64 outputs
          'wc1': tf.get_variable('wc1', [3, 3, channels, 64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
          # 3x3 conv, 64 inputs, 128 outputs
          'wc2': tf.get_variable('wc2', [3, 3, 64, 128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
          # 3x3 conv, 128 inputs, num_class outputs
          'wc4': tf.get_variable('wc4', [3, 3, 128, num_class], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
      }

      biases = {
          'bc1': tf.get_variable('bc1', [64], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
          'bc2': tf.get_variable('bc2', [128], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
          'bc4': tf.get_variable('bc4', [num_class], dtype=tf.float32,
                                 initializer=tf.truncated_normal_initializer, regularizer=tf.nn.l2_loss),
      }

      # Placeholders for input data
      x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS * channels], name="x")
      y_ = tf.placeholder(tf.float32, [None, num_class], name="y_")

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, channels])

      # Convolutional model
      conv1 = conv2d(x_img, weights['wc1'], biases['bc1'])
      conv1 = maxpool2d(conv1, k=2)
      conv2 = conv2d(conv1, weights['wc2'], biases['bc2'])
      conv2 = maxpool2d(conv2, k=2)
      if is_training:
        # Dropout only while training; applying it during inference would
        # make the predictions random from run to run.
        conv2 = tf.nn.dropout(conv2, dropout)
      conv4 = conv2d(conv2, weights['wc4'], biases['bc4'])
      conv4 = maxpool2d2(conv4, k=2)
      y = tf.reshape(conv4, [-1, num_class])

      global_step = tf.Variable(0, name="global_step", trainable=False)

      # NOTE(review): the l2_loss regularizers registered above are not added
      # to this loss -- confirm whether that is intended.
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

      train_op = tf.train.AdagradOptimizer(learning_rate).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1, name="prediction")
      correct_prediction = tf.equal(prediction, label)

      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

      saver = tf.train.Saver()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))

    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=1)
    elif args.mode == "retrain":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:

      print("{0} session ready".format(datetime.now().isoformat()))
      # Loop until the supervisor or the feed asks to stop, or until
      # args.steps steps have completed.
      step = 0
      tf_feed = TFNode.DataFeed(ctx.mgr, is_training)
      while not sv.should_stop() and not tf_feed.should_stop() and step < args.steps:
        # Run a training step asynchronously.
        # See `tf.train.SyncReplicasOptimizer` for additional details on how to
        # perform *synchronous* training.

        # using feed_dict
        batch_xs, batch_ys = feed_dict(tf_feed.next_batch(batch_size))
        feed = {x: batch_xs, y_: batch_ys}

        if len(batch_xs) > 0:
          if is_training:
            _, step = sess.run([train_op, global_step], feed_dict=feed)
            # print accuracy every 100 steps
            if step % 100 == 0:
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy, {x: batch_xs, y_: batch_ys})))
          else:  # args.mode == "inference"
            labels, preds, acc = sess.run([label, prediction, accuracy], feed_dict=feed)

            results = ["{0} Label: {1}, Prediction: {2}".format(datetime.now().isoformat(), l, p) for l, p in zip(labels, preds)]
            tf_feed.batch_results(results)
            print("acc: {0}".format(acc))
      if sv.should_stop() or step >= args.steps:
        tf_feed.terminate()

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()
def map_fun(args, ctx):
  """Build and run the MNIST graph on one node, reading input via TF queue readers.

  Parameter-server nodes start their server and block in ``server.join()``.
  Worker nodes build a one-hidden-layer softmax classifier fed by
  QueueRunner-based readers (CSV or TFRecord ``part-*`` files), then either
  train it or run inference under a ``tf.train.Supervisor``.

  Args:
    args: parsed arguments; this function reads ``rdma``, ``readers``,
      ``mode``, ``epochs``, ``format``, ``images``, ``labels``, ``model``,
      ``output`` and ``steps``.
    ctx: node context; this function reads ``worker_num``, ``job_name``,
      ``task_index`` and ``cluster_spec`` and passes ``ctx`` to ``TFNode``.

  Raises:
    ValueError: if ``args.format`` is neither ``"csv"`` nor ``"tfr"``.
  """
  from tensorflowonspark import TFNode
  from datetime import datetime
  import getpass
  import math
  import numpy
  import os
  import signal
  import tensorflow as tf
  import time

  IMAGE_PIXELS=28
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index
  cluster_spec = ctx.cluster_spec
  num_workers = len(cluster_spec['worker'])

  # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
  if job_name == "ps":
    time.sleep((worker_num + 1) * 5)

  # Parameters
  hidden_units = 128
  batch_size   = 100

  # Get TF cluster and server instances
  cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

  def read_csv_examples(image_dir, label_dir, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    """Return a batched (image, label) tensor pair read from CSV part files.

    Images and labels are read from two separate, unshuffled filename queues
    built from ``part-*`` files in ``image_dir`` and ``label_dir``.
    ``task_index`` and ``num_workers`` are accepted for signature parity with
    ``read_tfr_examples`` but are not used here.
    """
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))
    # Setup queue of csv image filenames
    tf_record_pattern = os.path.join(image_dir, 'part-*')
    images = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "images: {0}".format(images))
    image_queue = tf.train.string_input_producer(images, shuffle=False, capacity=1000, num_epochs=num_epochs, name="image_queue")

    # Setup queue of csv label filenames
    tf_record_pattern = os.path.join(label_dir, 'part-*')
    labels = tf.gfile.Glob(tf_record_pattern)
    print_log(worker_num, "labels: {0}".format(labels))
    label_queue = tf.train.string_input_producer(labels, shuffle=False, capacity=1000, num_epochs=num_epochs, name="label_queue")

    # Setup reader for image queue
    img_reader = tf.TextLineReader(name="img_reader")
    _, img_csv = img_reader.read(image_queue)
    image_defaults = [[1.0] for _ in range(784)]
    # tf.stack replaces tf.pack, which was deprecated in TF 0.12 and removed in 1.0
    img = tf.stack(tf.decode_csv(img_csv, image_defaults))
    # Normalize values to [0,1]
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(img, norm)
    print_log(worker_num, "image: {0}".format(image))

    # Setup reader for label queue
    label_reader = tf.TextLineReader(name="label_reader")
    _, label_csv = label_reader.read(label_queue)
    label_defaults = [[1.0] for _ in range(10)]
    label = tf.stack(tf.decode_csv(label_csv, label_defaults))
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch_csv")

  def read_tfr_examples(path, batch_size=100, num_epochs=None, task_index=None, num_workers=None):
    """Return a batched (image, label) tensor pair read from TFRecord part files.

    When both ``task_index`` and ``num_workers`` are given, the ``part-*``
    files under ``path`` are split round-robin so that each worker reads a
    disjoint subset.
    """
    print_log(worker_num, "num_epochs: {0}".format(num_epochs))

    # Setup queue of TFRecord filenames
    tf_record_pattern = os.path.join(path, 'part-*')
    files = tf.gfile.Glob(tf_record_pattern)
    queue_name = "file_queue"

    # split input files across workers, if specified
    if task_index is not None and num_workers is not None:
      files = files[task_index::num_workers]
      queue_name = "file_queue_{0}".format(task_index)

    print_log(worker_num, "files: {0}".format(files))
    file_queue = tf.train.string_input_producer(files, shuffle=False, capacity=1000, num_epochs=num_epochs, name=queue_name)

    # Setup reader for examples
    reader = tf.TFRecordReader(name="reader")
    _, serialized = reader.read(file_queue)
    feature_def = {'label': tf.FixedLenFeature([10], tf.int64), 'image': tf.FixedLenFeature([784], tf.int64) }
    features = tf.parse_single_example(serialized, feature_def)
    # Normalize values to [0,1]
    norm = tf.constant(255, dtype=tf.float32, shape=(784,))
    image = tf.div(tf.to_float(features['image']), norm)
    print_log(worker_num, "image: {0}".format(image))
    label = tf.to_float(features['label'])
    print_log(worker_num, "label: {0}".format(label))

    # Return a batch of examples
    return tf.train.batch([image,label], batch_size, num_threads=args.readers, name="batch")

  if job_name == "ps":
    server.join()
  elif job_name == "worker":
    # Assigns ops to the local worker by default.
    with tf.device(tf.train.replica_device_setter(
        worker_device="/job:worker/task:%d" % task_index,
        cluster=cluster)):

      # Variables of the hidden layer
      hid_w = tf.Variable(tf.truncated_normal([IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                              stddev=1.0 / IMAGE_PIXELS), name="hid_w")
      hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
      tf.summary.histogram("hidden_weights", hid_w)

      # Variables of the softmax layer
      sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                              stddev=1.0 / math.sqrt(hidden_units)), name="sm_w")
      sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
      tf.summary.histogram("softmax_weights", sm_w)

      # Placeholders or QueueRunner/Readers for input data.
      # Inference reads the data exactly once; training repeats forever when
      # args.epochs == 0, otherwise for args.epochs epochs.
      if args.mode == "inference":
        num_epochs = 1
      else:
        num_epochs = None if args.epochs == 0 else args.epochs
      # Input files are only partitioned across workers during inference.
      index = task_index if args.mode == "inference" else None
      workers = num_workers if args.mode == "inference" else None

      if args.format == "csv":
        images = TFNode.hdfs_path(ctx, args.images)
        labels = TFNode.hdfs_path(ctx, args.labels)
        x, y_ = read_csv_examples(images, labels, batch_size, num_epochs, index, workers)
      elif args.format == "tfr":
        images = TFNode.hdfs_path(ctx, args.images)
        x, y_ = read_tfr_examples(images, batch_size, num_epochs, index, workers)
      else:
        # Raising a bare string is itself a TypeError; raise a real exception
        # so the message reaches the caller.
        raise ValueError("{0} format not supported for tf input mode".format(args.format))

      x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
      tf.summary.image("x_img", x_img)

      hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
      hid = tf.nn.relu(hid_lin)

      y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

      global_step = tf.Variable(0)

      # Cross-entropy; the clip keeps tf.log away from log(0).
      loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
      tf.summary.scalar("loss", loss)
      train_op = tf.train.AdagradOptimizer(0.01).minimize(
          loss, global_step=global_step)

      # Test trained model
      label = tf.argmax(y_, 1, name="label")
      prediction = tf.argmax(y, 1,name="prediction")
      correct_prediction = tf.equal(prediction, label)
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
      tf.summary.scalar("acc", accuracy)

      saver = tf.train.Saver()
      summary_op = tf.summary.merge_all()
      init_op = tf.global_variables_initializer()

    # Create a "supervisor", which oversees the training process and stores model state into HDFS
    logdir = TFNode.hdfs_path(ctx, args.model)
    print("tensorflow model path: {0}".format(logdir))
    summary_writer = tf.summary.FileWriter("tensorboard_%d" %(worker_num), graph=tf.get_default_graph())

    # Only opened for non-training runs; tracked so it is always closed below.
    output_file = None
    if args.mode == "train":
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               init_op=init_op,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=10)
    else:
      # No init_op and no periodic saving: the model is expected to come from
      # a checkpoint in logdir.
      sv = tf.train.Supervisor(is_chief=(task_index == 0),
                               logdir=logdir,
                               summary_op=None,
                               saver=saver,
                               global_step=global_step,
                               stop_grace_secs=300,
                               save_model_secs=0)
      output_dir = TFNode.hdfs_path(ctx, args.output)
      output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

    try:
      # The supervisor takes care of session initialization, restoring from
      # a checkpoint, and closing when done or an error occurs.
      with sv.managed_session(server.target) as sess:
        print("{0} session ready".format(datetime.now().isoformat()))

        # Loop until the supervisor shuts down or args.steps steps have completed.
        step = 0
        count = 0
        while not sv.should_stop() and step < args.steps:
          # Run a training step asynchronously.
          # See `tf.train.SyncReplicasOptimizer` for additional details on how to
          # perform *synchronous* training.

          # using QueueRunners/Readers
          if args.mode == "train":
            if (step % 100 == 0):
              print("{0} step: {1} accuracy: {2}".format(datetime.now().isoformat(), step, sess.run(accuracy)))
            _, summary, step = sess.run([train_op, summary_op, global_step])
            if sv.is_chief:
              summary_writer.add_summary(summary, step)
          else: # args.mode == "inference"
            labels, pred, acc = sess.run([label, prediction, accuracy])
            #print("label: {0}, pred: {1}".format(labels, pred))
            print("acc: {0}".format(acc))
            for actual, predicted in zip(labels, pred):
              output_file.write("{0} {1}\n".format(actual, predicted))
            count += len(labels)
            print("count: {0}".format(count))
    finally:
      # Close the results file even if the session raised or the mode was
      # something other than "train"/"inference".
      if output_file is not None:
        output_file.close()

    if args.mode == "inference":
      # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
      # run inference and request stop before the other workers even start/sync their sessions.
      if task_index == 0:
        time.sleep(60)

    # Ask for all the services to stop.
    print("{0} stopping supervisor".format(datetime.now().isoformat()))
    sv.stop()