import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster, TFNode


def main_fun(argv, ctx):
    # Presumed header for this truncated fragment: the imports stay inside
    # main_fun so Spark does not try to serialize them for each executor.
    import tensorflow as tf
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
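
For orientation, here is a hedged sketch (not part of the original example) of how the cluster_spec/server pair returned by TFNode.start_cluster_server is typically consumed inside main_fun in InputMode.TENSORFLOW jobs. This eval job runs with num_ps = 0, so the parameter-server branch only applies to training jobs that reserve PS tasks:

def main_fun_sketch(argv, ctx):
    # Hypothetical illustration; ctx.job_name and ctx.task_index come from
    # the TensorFlowOnSpark node context passed to main_fun.
    import tensorflow as tf
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, False)
    if ctx.job_name == "ps":
        server.join()   # PS tasks block here and serve variables
    else:
        # Pin variables to PS tasks and ops to this worker (TF1 pattern).
        with tf.device(tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % ctx.task_index,
                cluster=cluster_spec)):
            pass   # build the model graph here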
Example #2
import argparse
import sys
from datetime import datetime

import numpy
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.streaming import StreamingContext
from com.yahoo.ml.tf import TFCluster
import mnist_dist

# Presumed setup for this truncated fragment: the SparkContext, the
# StreamingContext, and the earlier parser arguments (all used below) were
# lost in extraction; the values here are placeholders.
sc = SparkContext(conf=SparkConf().setAppName("mnist_streaming"))
ssc = StreamingContext(sc, 60)   # placeholder batch interval
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("--images", help="HDFS path to images being streamed in")
parser.add_argument("--output", help="HDFS path to save inference output")
parser.add_argument("--mode", help="train|inference", default="train")
parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=1)
parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))


def parse(ln):
    """Parse a 'label|p0,p1,...' text line into (pixel list, one-hot label)."""
    lbl, img = ln.split('|')
    image = [int(x) for x in img.split(',')]
    label = numpy.zeros(10)
    label[int(lbl)] = 1.0
    return (image, label)
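
For clarity, a quick illustration of what parse returns for a hypothetical input line (real MNIST records would carry 784 comma-separated pixel values):

image, label = parse("7|0,128,255,64")   # hypothetical 4-pixel record
print(image)   # [0, 128, 255, 64]
print(label)   # one-hot numpy vector with label[7] == 1.0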


stream = ssc.textFileStream(args.images)
imageRDD = stream.map(parse)

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size,
                        num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(imageRDD)
else:
    labelRDD = cluster.inference(imageRDD)
    labelRDD.saveAsTextFiles(args.output)

ssc.start()
cluster.shutdown(ssc)

print("{0} ===== Stop".format(datetime.now().isoformat()))
                        help="method to ingest data: (spark|tf)",
                        choices=["spark", "tf"],
                        default="tf")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))
    sc = SparkContext(
        conf=SparkConf().setAppName('imagenet_distributed_train'))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard, input_mode)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            args.tensorboard, input_mode)
    if input_mode == TFCluster.InputMode.SPARK:
        dataRDD = sc.newAPIHadoopFile(
            args.input_data,
            "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
        cluster.train(dataRDD, args.epochs)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
Example #4
File: meme.py  Project: wfus/zzzz
import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster, TFNode
from datetime import datetime


def main_fun(argv, ctx):
    """Main function entrance for spark. Make sure that all imports are done here,
    or spark will try to serialize libraries when they are placed outside
    for each executor, and we don't want that! ~WFU"""
    import tensorflow as tf


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("Nacho"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_processes = 1
    use_tensorboard = False

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors,
                            num_processes, use_tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Example #5

import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster

# main_fun (the per-executor training function) is defined earlier in the
# original file; only the driver-side launcher survives in this fragment.

if __name__ == '__main__':
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_ps_tasks",
                        help="number of PS nodes",
                        type=int,
                        default=0)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    (args, rem) = parser.parse_known_args()

    assert num_executors > args.num_ps_tasks, "need at least one worker besides the PS tasks"
    #cluster = TFCluster.reserve(sc, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size,
                            args.num_ps_tasks, args.tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
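
The assert above exists because TFCluster assigns num_ps_tasks of the executors to parameter serving and the rest to training, so at least one worker must remain. In sketch form (an illustration, not code from the original):

num_workers = args.cluster_size - args.num_ps_tasks   # executors left over for training
assert num_workers >= 1, "need at least one worker executor"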