Example #1
def keras_mnist():
    import os
    import uuid

    import tensorflow as tf

    from hops import tensorboard

    from hops import model as hops_model
    from hops import hdfs

    batch_size = 32
    num_classes = 10

    # Provide path to train and validation datasets
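    # td_proj_name, td_ds and td are assumed to be provided by the enclosing
    # notebook scope (project name, dataset directory and training-dataset name)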
    train_filenames = tf.io.gfile.glob(
        hdfs.project_path(td_proj_name) + '/' + td_ds + '/' + td +
        '/train/part-r-*')
    validation_filenames = tf.io.gfile.glob(
        hdfs.project_path(td_proj_name) + '/' + td_ds + '/' + td +
        '/validate/part-r-*')

    # Define input function
    def data_input(filenames,
                   batch_size=128,
                   num_classes=10,
                   shuffle=False,
                   repeat=None):
        def parser(serialized_example):
            """Parses a single tf.Example into image and label tensors."""
            features = tf.io.parse_single_example(
                serialized_example,
                features={
                    'image': tf.io.FixedLenFeature([28 * 28], tf.float32),
                    'label': tf.io.FixedLenFeature([], tf.int64),
                })

            image = tf.cast(features['image'], tf.float32)
            label = tf.cast(features['label'], tf.int32)

            # Create a one hot array for your labels
            label = tf.one_hot(label, num_classes)

            return image, label

        # Import MNIST data
        dataset = tf.data.TFRecordDataset(filenames)

        # Map the parser over dataset, and batch results by up to batch_size
        dataset = dataset.map(parser)
        if shuffle:
            dataset = dataset.shuffle(buffer_size=128)
        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.repeat(repeat)
        return dataset

    # Define a Keras Model.
    model = tf.keras.Sequential()
    model.add(
        tf.keras.layers.Dense(128, activation='relu', input_shape=(784, )))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    # Compile the model.
    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(0.001),
                  metrics=['accuracy'])

    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=tensorboard.logdir()),
        tf.keras.callbacks.ModelCheckpoint(filepath=tensorboard.logdir()),
    ]
    model.fit(data_input(train_filenames, batch_size),
              verbose=0,
              epochs=3,
              steps_per_epoch=5,
              validation_data=data_input(validation_filenames, batch_size),
              validation_steps=1,
              callbacks=callbacks)

    score = model.evaluate(data_input(validation_filenames, batch_size),
                           steps=1)

    # Export model
    # WARNING(break-tutorial-inline-code): The following code snippet is
    # in-lined in tutorials, please update tutorial documents accordingly
    # whenever code changes.

    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    print('Exporting trained model to: {}'.format(export_path))

    tf.saved_model.save(model, export_path)

    print('Done exporting!')

    metrics = {'accuracy': score[1]}

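    # model_name and model_proj_name are likewise assumed to come from the
    # enclosing notebook scope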
    hops_model.export(export_path,
                      model_name,
                      metrics=metrics,
                      project=model_proj_name)

    return metrics
Example #2
def task2():
    import tensorflow as tf

    from hops import tensorboard
    from hops import hdfs
    from tensorflow.examples.tutorials.mnist import input_data

    fashion_mnist = input_data.read_data_sets(
        'data/fashion',
        one_hot=True,
        source_url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    )

    # Helpers
    def weight_var(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_var(shape, value=0.1):
        initial = tf.constant(value, shape=shape)
        return tf.Variable(initial)

    def bias_var_z(shape):
        return tf.Variable(tf.zeros(shape))

    def layer(tensor, in_dim, out_dim, name, activation=tf.nn.sigmoid):
        # Group each layer's ops and summaries under its own name scope
        with tf.name_scope(name):
            weights = weight_var([in_dim, out_dim])
            biases = bias_var_z([out_dim])
            pre = tf.matmul(tensor, weights) + biases
            post = activation(pre)
            tf.summary.histogram('activations', post)
            return post

    # Hardcoded params
    num_ch = 1
    num_classes = 10
    image_height = image_width = 28
    layer_widths = [200, 100, 60, 30, 10]

    # 1. Define variables and placeholders
    X = tf.placeholder(tf.float32,
                       shape=[None, image_height, image_width, num_ch])
    Y_ = tf.placeholder(tf.float32, shape=[None, 10])

    XX = tf.reshape(X, [-1, image_height * image_width])

    HSig1 = layer(XX, 784, layer_widths[0], 'sigmoid-1', tf.nn.sigmoid)
    HSig2 = layer(HSig1, layer_widths[0], layer_widths[1], 'sigmoid-2',
                  tf.nn.sigmoid)
    HSig3 = layer(HSig2, layer_widths[1], layer_widths[2], 'sigmoid-3',
                  tf.nn.sigmoid)
    HSig4 = layer(HSig3, layer_widths[2], layer_widths[3], 'sigmoid-4',
                  tf.nn.sigmoid)
    Y = layer(HSig4, layer_widths[3], layer_widths[4], 'identity', tf.identity)
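    # Y holds raw logits (identity activation); softmax_cross_entropy_with_logits
    # below applies the softmax internally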

    #     W1 = tf.Variable(tf.truncated_normal([784, 200], stddev=0.1))
    #     W2 = tf.Variable(tf.truncated_normal([200, 100], stddev=0.1))
    #     W3 = tf.Variable(tf.truncated_normal([100, 60 ], stddev=0.1))
    #     W4 = tf.Variable(tf.truncated_normal([60,  30 ], stddev=0.1))
    #     W5 = tf.Variable(tf.truncated_normal([30,  10 ], stddev=0.1))

    #     B1 = tf.Variable(tf.zeros([200]))
    #     B2 = tf.Variable(tf.zeros([100]))
    #     B3 = tf.Variable(tf.zeros([60 ]))
    #     B4 = tf.Variable(tf.zeros([30 ]))
    #     B5 = tf.Variable(tf.zeros([10 ]))

    #     #Define the model
    #     XX = tf.reshape(X, [-1, 784])
    #     Y1 = tf.nn.sigmoid(tf.matmul(XX, W1) + B1)
    #     Y2 = tf.nn.sigmoid(tf.matmul(Y1, W2) + B2)
    #     Y3 = tf.nn.sigmoid(tf.matmul(Y2, W3) + B3)
    #     Y4 = tf.nn.sigmoid(tf.matmul(Y3, W4) + B4)
    #     Ylogits = tf.matmul(Y4, W5) + B5
    #     Y = tf.nn.softmax(Ylogits)

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=Y, labels=Y_))

    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(Y_, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    with tf.name_scope('train'):
        with tf.name_scope('gradient_descent'):
            train_step_gd = tf.train.GradientDescentOptimizer(0.5).minimize(
                cross_entropy)
        with tf.name_scope('adam_optimizer'):
            train_step_adam = tf.train.AdamOptimizer(0.005).minimize(
                cross_entropy)

    init = tf.global_variables_initializer()
    sess = tf.Session()

    logdir = tensorboard.logdir()
    summary_op = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(logdir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(logdir + '/test')
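    # Separate FileWriters keep the train and test summaries as distinct runs
    # under the same TensorBoard logdir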

    def epochs(train, test, train_step, num_epochs=100, batch_size=100):
        sess.run(init)

        accuracies = []
        losses = []

        for epoch in range(num_epochs):
            for it in range(100):
                batch_xs, batch_ys = train.next_batch(batch_size)
                feed_dict = {XX: batch_xs, Y_: batch_ys}
                _, summary = sess.run([train_step, summary_op],
                                      feed_dict=feed_dict)
                train_writer.add_summary(summary, epoch * 100 + it)

            # Compute accuracy and loss every 100 rounds
            feed_dict = {XX: test.images, Y_: test.labels}
            summary, acc = sess.run([summary_op, accuracy],
                                    feed_dict=feed_dict)
            loss = sess.run(cross_entropy, feed_dict=feed_dict)

            accuracies.append(acc)
            losses.append(loss)
            test_writer.add_summary(summary, epoch)
        return (accuracies, losses)

    acc, loss = epochs(fashion_mnist.train, fashion_mnist.test, train_step_gd)

    train_writer.close()
    test_writer.close()

    print("Final accuracy: {}".format(acc[-1]))
    print("Final loss: {}".format(loss[-1]))
Example #3
def mnist_fun(args, ctx):
    def print_log(worker_num, arg):
        print("%d: " % worker_num)
        print(arg)

    from tensorflowonspark import TFNode
    from datetime import datetime
    import getpass
    import math
    import numpy
    import os
    import signal
    import tensorflow as tf
    import time

    # Used to get the TensorBoard logdir so runs show up in Hopsworks
    from hops import tensorboard

    IMAGE_PIXELS = 28
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index
    cluster_spec = ctx.cluster_spec
    num_workers = len(cluster_spec['worker'])

    # Delay PS nodes a bit, since workers seem to reserve GPUs more quickly/reliably (w/o conflict)
    if job_name == "ps":
        time.sleep((worker_num + 1) * 5)

    # Parameters
    hidden_units = 128
    batch_size = 100

    # Get TF cluster and server instances
    cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

    def read_tfr_examples(path,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))

        # Setup queue of TFRecord filenames
        tf_record_pattern = os.path.join(path, 'part-*')
        files = tf.gfile.Glob(tf_record_pattern)
        queue_name = "file_queue"

        # split input files across workers, if specified
        if task_index is not None and num_workers is not None:
            num_files = len(files)
            files = files[task_index:num_files:num_workers]
            queue_name = "file_queue_{0}".format(task_index)

        print_log(worker_num, "files: {0}".format(files))
        file_queue = tf.train.string_input_producer(files,
                                                    shuffle=False,
                                                    capacity=1000,
                                                    num_epochs=num_epochs,
                                                    name=queue_name)

        # Setup reader for examples
        reader = tf.TFRecordReader(name="reader")
        _, serialized = reader.read(file_queue)
        feature_def = {
            'label': tf.FixedLenFeature([10], tf.int64),
            'image': tf.FixedLenFeature([784], tf.int64)
        }
        features = tf.parse_single_example(serialized, feature_def)
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(tf.to_float(features['image']), norm)
        print_log(worker_num, "image: {0}".format(image))
        label = tf.to_float(features['label'])
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch")

    def read_csv_examples(image_dir,
                          label_dir,
                          batch_size=100,
                          num_epochs=None,
                          task_index=None,
                          num_workers=None):
        print_log(worker_num, "num_epochs: {0}".format(num_epochs))
        # Setup queue of csv image filenames
        tf_record_pattern = os.path.join(image_dir, 'part-*')
        images = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "images: {0}".format(images))
        image_queue = tf.train.string_input_producer(images,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="image_queue")

        # Setup queue of csv label filenames
        tf_record_pattern = os.path.join(label_dir, 'part-*')
        labels = tf.gfile.Glob(tf_record_pattern)
        print_log(worker_num, "labels: {0}".format(labels))
        label_queue = tf.train.string_input_producer(labels,
                                                     shuffle=False,
                                                     capacity=1000,
                                                     num_epochs=num_epochs,
                                                     name="label_queue")

        # Setup reader for image queue
        img_reader = tf.TextLineReader(name="img_reader")
        _, img_csv = img_reader.read(image_queue)
        image_defaults = [[1.0] for col in range(784)]
        img = tf.stack(tf.decode_csv(img_csv, image_defaults))
        # Normalize values to [0,1]
        norm = tf.constant(255, dtype=tf.float32, shape=(784, ))
        image = tf.div(img, norm)
        print_log(worker_num, "image: {0}".format(image))

        # Setup reader for label queue
        label_reader = tf.TextLineReader(name="label_reader")
        _, label_csv = label_reader.read(label_queue)
        label_defaults = [[1.0] for col in range(10)]
        label = tf.stack(tf.decode_csv(label_csv, label_defaults))
        print_log(worker_num, "label: {0}".format(label))

        # Return a batch of examples
        return tf.train.batch([image, label],
                              batch_size,
                              num_threads=args.readers,
                              name="batch_csv")

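    # Parameter servers simply block on server.join(); workers build and run
    # the replicated graph below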
    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        # Assigns ops to the local worker by default.
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device="/job:worker/task:%d" % task_index,
                    cluster=cluster)):

            # Variables of the hidden layer
            hid_w = tf.Variable(tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
                                name="hid_w")
            hid_b = tf.Variable(tf.zeros([hidden_units]), name="hid_b")
            tf.summary.histogram("hidden_weights", hid_w)

            # Variables of the softmax layer
            sm_w = tf.Variable(tf.truncated_normal([hidden_units, 10],
                                                   stddev=1.0 /
                                                   math.sqrt(hidden_units)),
                               name="sm_w")
            sm_b = tf.Variable(tf.zeros([10]), name="sm_b")
            tf.summary.histogram("softmax_weights", sm_w)

            # Placeholders or QueueRunner/Readers for input data
            num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
            index = task_index if args.mode == "inference" else None
            workers = num_workers if args.mode == "inference" else None
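            # In inference mode each worker reads its own shard of the input
            # files exactly once; in training mode all files are read for
            # args.epochs passes (indefinitely when epochs == 0)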

            if args.format == "csv":
                images = TFNode.hdfs_path(ctx, args.images)
                labels = TFNode.hdfs_path(ctx, args.labels)
                x, y_ = read_csv_examples(images, labels, 100, num_epochs,
                                          index, workers)
            elif args.format == "tfr":
                images = TFNode.hdfs_path(ctx, args.images)
                x, y_ = read_tfr_examples(images, 100, num_epochs, index,
                                          workers)
            else:
                raise ValueError(
                    "{0} format not supported for tf input mode".format(
                        args.format))

            x_img = tf.reshape(x, [-1, IMAGE_PIXELS, IMAGE_PIXELS, 1])
            tf.summary.image("x_img", x_img)

            hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
            hid = tf.nn.relu(hid_lin)

            y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))

            global_step = tf.Variable(0)

            loss = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
            tf.summary.scalar("loss", loss)
            train_op = tf.train.AdagradOptimizer(0.01).minimize(
                loss, global_step=global_step)

            # Test trained model
            label = tf.argmax(y_, 1, name="label")
            prediction = tf.argmax(y, 1, name="prediction")
            correct_prediction = tf.equal(prediction, label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32),
                                      name="accuracy")
            tf.summary.scalar("acc", accuracy)

            saver = tf.train.Saver()
            summary_op = tf.summary.merge_all()
            init_op = tf.global_variables_initializer()

            # Create a "supervisor", which oversees the training process and stores model state into HDFS
            logdir = tensorboard.logdir()
            print("tensorflow model path: {0}".format(logdir))

            if job_name == "worker" and task_index == 0:
                summary_writer = tf.summary.FileWriter(
                    logdir, graph=tf.get_default_graph())

            if args.mode == "train":
                sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                         logdir=logdir,
                                         init_op=init_op,
                                         summary_op=None,
                                         summary_writer=None,
                                         saver=saver,
                                         global_step=global_step,
                                         stop_grace_secs=300,
                                         save_model_secs=10)
            else:
                sv = tf.train.Supervisor(is_chief=(task_index == 0),
                                         logdir=logdir,
                                         summary_op=None,
                                         saver=saver,
                                         global_step=global_step,
                                         stop_grace_secs=300,
                                         save_model_secs=0)
            output_dir = TFNode.hdfs_path(ctx, args.output)
            output_file = tf.gfile.Open("{0}/part-{1:05d}".format(
                output_dir, worker_num),
                                        mode='w')

    # The supervisor takes care of session initialization, restoring from
    # a checkpoint, and closing when done or an error occurs.
    with sv.managed_session(server.target) as sess:
        print("{0} session ready".format(datetime.now().isoformat()))

        # Loop until the supervisor shuts down or 1000000 steps have completed.
        step = 0
        count = 0
        while not sv.should_stop() and step < args.steps:
            # Run a training step asynchronously.
            # See `tf.train.SyncReplicasOptimizer` for additional details on how to
            # perform *synchronous* training.

            # using QueueRunners/Readers
            if args.mode == "train":
                if (step % 100 == 0):
                    print("{0} step: {1} accuracy: {2}".format(
                        datetime.now().isoformat(), step, sess.run(accuracy)))
                _, summary, step = sess.run(
                    [train_op, summary_op, global_step])
                if sv.is_chief:
                    summary_writer.add_summary(summary, step)
            else:  # args.mode == "inference"
                labels, pred, acc = sess.run([label, prediction, accuracy])
                #print("label: {0}, pred: {1}".format(labels, pred))
                print("acc: {0}".format(acc))
                for i in range(len(labels)):
                    count += 1
                    output_file.write("{0} {1}\n".format(labels[i], pred[i]))
                print("count: {0}".format(count))

        if args.mode == "inference":
            output_file.close()
            # Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
            # run inference and request stop before the other workers even start/sync their sessions.
        if task_index == 0:
            time.sleep(60)

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()
Example #4
    def _wrapper_fun(iter):

        for i in iter:
            executor_num = i

        client = coordination_server.Client(server_addr)

        node_meta = {
            'host': get_ip_address(),
            'executor_cwd': os.getcwd(),
            'cuda_visible_devices_ordinals':
            devices.get_minor_gpu_device_numbers()
        }

        client.register(node_meta)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        # Only spark executor with index 0 should create necessary HDFS directories and start mpirun
        # Other executors simply block until index 0 reports mpirun is finished

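        # Wait until every executor has registered its metadata, then receive
        # the assembled cluster spec from the coordination server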
        clusterspec = client.await_reservations()

        #pydoop.hdfs.dump('', os.environ['EXEC_LOGFILE'], user=hopshdfs.project_user())
        #hopshdfs.init_logger()
        #hopshdfs.log('Starting Spark executor with arguments')

        gpu_str = ('\n\nChecking for GPUs in the environment\n' +
                   devices.get_gpu_info())
        #hopshdfs.log(gpu_str)
        print(gpu_str)

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        mpi_logfile = open(mpi_logfile_path, 'w')

        py_runnable = localize_scripts(nb_path, clusterspec)

        # non-chief executor should not do mpirun
        if not executor_num == 0:
            client.await_mpirun_finished()
        else:
            hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
                app_id, run_id, param_string='Horovod')
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir, hdfs_appid_logdir, 0)

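            # Chief executor builds the mpirun command, exporting the Horovod
            # timeline and TensorBoard logdir to every rank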
            mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                      ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                      ' mpirun -np ' + str(get_num_ps(clusterspec)) + ' --hostfile ' + get_hosts_file(clusterspec) + \
                      ' -bind-to none -map-by slot ' + \
                      ' -x LD_LIBRARY_PATH ' + \
                      ' -x HOROVOD_TIMELINE ' + \
                      ' -x TENSORBOARD_LOGDIR ' + \
                      ' -x NCCL_DEBUG=INFO ' + \
                      ' -mca pml ob1 -mca btl ^openib ' + \
                      os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable

            mpi = subprocess.Popen(mpi_cmd,
                                   shell=True,
                                   stdout=mpi_logfile,
                                   stderr=mpi_logfile,
                                   preexec_fn=util.on_executor_exit('SIGTERM'))

            t_log = threading.Thread(target=print_log)
            t_log.start()

            mpi.wait()

            client.register_mpirun_finished()

            if devices.get_num_gpus() > 0:
                t_gpus.do_run = False
                t_gpus.join()

            return_code = mpi.returncode

            if return_code != 0:
                cleanup(tb_hdfs_path)
                t_log.do_run = False
                t_log.join()
                raise Exception(
                    'mpirun FAILED, look in the logs for the error')

            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
Example #5
    def _wrapper_fun(iter):

        for i in iter:
            executor_num = i

        hdfs_exec_logdir, hdfs_appid_logdir = hopshdfs.create_directories(
            app_id, run_id, None, 'horovod')

        tb_pid = 0
        tb_hdfs_path = ''

        pydoop.hdfs.dump('',
                         os.environ['EXEC_LOGFILE'],
                         user=hopshdfs.project_user())
        hopshdfs.init_logger()
        hopshdfs.log('Starting Spark executor with arguments')
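        # Only the first executor registers a TensorBoard instance for this run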
        if executor_num == 0:
            tb_hdfs_path, tb_pid = tensorboard.register(
                hdfs_exec_logdir,
                hdfs_appid_logdir,
                0,
                local_logdir=local_logdir)

        gpu_str = ('\n\nChecking for GPUs in the environment\n' +
                   devices.get_gpu_info())
        hopshdfs.log(gpu_str)
        print(gpu_str)

        #1. Download notebook file
        fs_handle = hopshdfs.get_fs()

        try:
            fd = fs_handle.open_file(nb_path, flags='r')
        except:
            # fall back for file-system handles that expect 'mode' instead of 'flags'
            fd = fs_handle.open_file(nb_path, mode='r')

        notebook = ''
        for line in fd:
            notebook += line

        path, filename = os.path.split(nb_path)
        f_nb = open(filename, "w+")
        f_nb.write(notebook)
        f_nb.flush()
        f_nb.close()

        # 2. Convert notebook to py file
        jupyter_runnable = os.path.abspath(
            os.path.join(os.environ['PYSPARK_PYTHON'], os.pardir)) + '/jupyter'
        conversion_cmd = jupyter_runnable + ' nbconvert --to python ' + filename
        conversion = subprocess.Popen(conversion_cmd,
                                      shell=True,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
        conversion.wait()
        stdout, stderr = conversion.communicate()
        print(stdout)
        print(stderr)

        # 3. Make py file runnable
        py_runnable = os.getcwd() + '/' + filename.split('.')[0] + '.py'
        st = os.stat(py_runnable)
        os.chmod(py_runnable, st.st_mode | stat.S_IEXEC)

        t_gpus = threading.Thread(
            target=devices.print_periodic_gpu_utilization)
        if devices.get_num_gpus() > 0:
            t_gpus.start()

        mpi_logfile_path = os.getcwd() + '/mpirun.log'
        if os.path.exists(mpi_logfile_path):
            os.remove(mpi_logfile_path)

        mpi_logfile = open(mpi_logfile_path, 'w')

        # 4. Run allreduce
        mpi_np = os.environ['MPI_NP']
        mpi_cmd = 'HOROVOD_TIMELINE=' + tensorboard.logdir() + '/timeline.json' + \
                  ' TENSORBOARD_LOGDIR=' + tensorboard.logdir() + \
                  ' mpirun -np ' + str(mpi_np) + \
                  ' -bind-to none -map-by slot ' + \
                  ' -x HOROVOD_TIMELINE ' + \
                  ' -x TENSORBOARD_LOGDIR ' + \
                  ' -x NCCL_DEBUG=INFO ' + \
                  os.environ['PYSPARK_PYTHON'] + ' ' + py_runnable
        mpi = subprocess.Popen(mpi_cmd,
                               shell=True,
                               stdout=mpi_logfile,
                               stderr=mpi_logfile,
                               preexec_fn=util.on_executor_exit('SIGTERM'))

        t_log = threading.Thread(target=print_log)
        t_log.start()

        mpi.wait()

        if devices.get_num_gpus() > 0:
            t_gpus.do_run = False
            t_gpus.join()

        return_code = mpi.returncode

        if local_logdir:
            local_tb = tensorboard.local_logdir_path
            pydoop.hdfs.put(local_tb, hdfs_exec_logdir)

        if return_code != 0:
            cleanup(tb_hdfs_path)
            t_log.do_run = False
            t_log.join()
            raise Exception('mpirun FAILED, look in the logs for the error')

        cleanup(tb_hdfs_path)
        t_log.do_run = False
        t_log.join()

        hopshdfs.kill_logger()
Example #6
def test_fun(args, ctx):
	# Dependencies
	from tensorflowonspark import TFNode
	from datetime import datetime

	import getpass
	import math
	import numpy as np
	import os
	import random
	import signal
	import tensorflow as tf
	import time

	from tensorflow.contrib import rnn

	# Used for TensorBoard logdir
	from hops import tensorboard

	# Extract configuration
	worker_num = ctx.worker_num
	job_name = ctx.job_name
	task_index = ctx.task_index
	cluster_spec = ctx.cluster_spec
	num_workers = len(cluster_spec['worker'])

	# Get TF cluster/server instances
	cluster, server = TFNode.start_cluster_server(ctx, 1, args.rdma)

	# Parameters
	batch_size = 100
	display_iter = 1000
	training_iters = 50000

	learning_rate = 0.0001
	n_input = 3
	n_hidden = 512
	n_predictions = 32

	# Utility functions
	def elapsed(sec):
		if sec < 60:
			return str(sec) + " sec"
		elif sec < (60 * 60):
			return str(sec / 60) + " min"
		else:
			return str(sec / (60 * 60)) + " hr"

	def print_log(worker_num, arg):
		print("%d: " % worker_num)
		print(arg)

	def RNN(x, weights, biases, n_input, n_hidden):
		# Reshape to [1, n_input]
		x = tf.reshape(x, [-1, n_input])
		# Generate a n_input-element sequence of inputs
		# (eg. [had] [a] [general] -> [20] [6] [33])
		x = tf.split(x, n_input, 1)
		rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden), rnn.BasicLSTMCell(n_hidden)])

		# Generate prediction
		outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

		# There are n_input outputs but we only want the last output
		return tf.matmul(outputs[-1], weights['out']) + biases['out']

	def get_loss_fn(logits, labels):
		return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))

	if job_name == "ps":
		server.join()
	elif job_name == "worker":
		# TODO What does this do?
		# Assigns ops to the local worker by default
		with tf.device(tf.train.replica_device_setter(
				worker_device="/job:worker/task:%d" % task_index,
				cluster=cluster)):

			# TODO Set up vocab_size by loading in dataset and parsing through it?
			dictionary = {}
			reverse_dictionary = {}
			vocab_size = 32

			# Placeholders or QueueRunner/Readers for input data
			num_epochs = 1 if args.mode == "inference" else None if args.epochs == 0 else args.epochs
			index = task_index if args.mode == "inference" else None
			workers = num_workers if args.mode == "inference" else None

			# RNN output node weights and biases
			hidden_weights = tf.Variable(tf.random_normal([n_hidden, vocab_size]), name="hidden_weights")
			hidden_biases = tf.Variable(tf.random_normal([vocab_size]), name="hidden_biases")
			weights = {'out': hidden_weights}
			biases = {'out': hidden_biases}

			# Graph input placeholders
			x = tf.placeholder("float", [None, n_input, 1])
			y = tf.placeholder("float", [None, vocab_size])

			# Set up TFOS
			global_step = tf.Variable(0)

			pred = RNN(x, weights, biases, n_input, n_hidden)
			cost = get_loss_fn(logits=pred, labels=y)
			# Note that the global_step is passed in to the optimizer's min. function
			optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) \
				.minimize(loss=cost, global_step=global_step)

			# Model evaluation
			correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
			accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

			# TF summaries
			tf.summary.scalar("cost", cost)
			tf.summary.histogram("hidden_weights", hidden_weights)
			tf.summary.scalar("acc", accuracy)

			#  TODO XXX Below is copied directly from TFOS example
			saver = tf.train.Saver()
			summary_op = tf.summary.merge_all()
			init_op = tf.global_variables_initializer()

			# Create a "supervisor", which oversees the training process and stores model state into HDFS
			logdir = tensorboard.logdir()
			print("tensorflow model path: {0}".format(logdir))

			# Check if chief worker
			if job_name == "worker" and task_index == 0:
				summary_writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())

			if args.mode == "train":
				sv = tf.train.Supervisor(is_chief=(task_index == 0),
				                         logdir=logdir,
				                         init_op=init_op,
				                         summary_op=None,
				                         summary_writer=None,
				                         saver=saver,
				                         global_step=global_step,
				                         stop_grace_secs=300,
				                         save_model_secs=10)
			else:
				sv = tf.train.Supervisor(is_chief=(task_index == 0),
				                         logdir=logdir,
				                         summary_op=None,
				                         saver=saver,
				                         global_step=global_step,
				                         stop_grace_secs=300,
				                         save_model_secs=0)
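			# With save_model_secs=0 the inference Supervisor restores the latest
			# checkpoint but never writes new ones; the training Supervisor above
			# checkpoints every 10 seconds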
			# Configure output path on HDFS
			output_dir = TFNode.hdfs_path(ctx, args.output)
			output_file = tf.gfile.Open("{0}/part-{1:05d}".format(output_dir, worker_num), mode='w')

	# The supervisor takes care of session initialization, restoring from
	# a checkpoint, and closing when done or an error occurs.
	with sv.managed_session(server.target) as sess:
		print("{0} session ready".format(datetime.now().isoformat()))
		step = 0
		count = 0
		offset = random.randint(0, n_input + 1)
		end_offset = n_input + 1
		acc_total = 0
		loss_total = 0

		# TODO writer.add_graph(session.graph)? Might be taken care of by setup of summary_writer
		# TODO Set up args.steps

		# Loop until supervisor shuts down or max. iters have completed
		while not sv.should_stop() and step < args.steps:
			# TODO Determine what makes THIS asynch, and whether we need synch.
			# TODO A good resource may be https://stackoverflow.com/questions/41293576/distributed-tensorflow-good-example-for-synchronous-training-on-cpus
			# Run a training step asynchronously
			# See `tf.train.SyncReplicasOptimizer` for additional details on how to
			# perform *synchronous* training.

			# Using QueueRunner/Readers
			if args.mode == "train":
				# TODO Below is merely a copy-pasta of the local TF code, and will need refactoring
				if offset > (len(training_data) - end_offset):
					offset = random.randint(0, n_input + 1)

				symbols_in_keys = [[dictionary[str(training_data[i])]] for i in range(offset, offset + n_input)]
				symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

				symbols_out_onehot = np.zeros([vocab_size], dtype=float)
				symbols_out_onehot[dictionary[str(training_data[offset + n_input])]] = 1.0
				symbols_out_onehot = np.reshape(symbols_out_onehot, [1, -1])

				# Run iteration and increment 'step'
				_, summary, acc, loss, onehot_pred, step = sess.run(
					[optimizer, summary_op, accuracy, cost, pred, global_step],
					feed_dict={x: symbols_in_keys, y: symbols_out_onehot})

				loss_total += loss
				acc_total += acc

				if ((step + 1) % display_iter) == 0:
					# Report the accuracy computed in the training step above;
					# x and y are placeholders, so accuracy cannot be fetched
					# without a feed_dict.
					print("{0} step: {1} accuracy: {2}".format(
						datetime.now().isoformat(),
						step,
						acc))
					# TODO migrate over print fn from local TF code

				offset += (n_input + 1)

				if sv.is_chief:
					summary_writer.add_summary(summary, step)
			else:  # args.mode == "inference"
				# labels, pred, acc = sess.run([label, prediction, accuracy])
				# # print("label: {0}, pred: {1}".format(labels, pred))
				# print("acc: {0}".format(acc))
				# for i in range(len(labels)):
				# 	count += 1
				# 	output_file.write("{0} {1}\n".format(labels[i], pred[i]))
				# print("count: {0}".format(count))
				pass

		if args.mode == "inference":
			output_file.close()

		# Delay chief worker from shutting down supervisor during inference, since it can load model, start session,
		# run inference and request stop before the other workers even start/sync their sessions.
		if task_index == 0:
			time.sleep(60)

		# Ask for all the services to stop.
		print("{0} stopping supervisor".format(datetime.now().isoformat()))
		sv.stop()