Code example #1
import sys

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from tensorflowonspark import TFCluster, TFNode  # com.yahoo.ml.tf in older releases


def main_fun(argv, ctx):
    # Imports happen inside the function so Spark ships only the closure,
    # not the TensorFlow modules, to each executor.
    import tensorflow as tf
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    # Re-parse the TensorFlow app flags from the argv forwarded by Spark.
    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Start from a clean evaluation directory.
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    # Join the TF cluster that TensorFlowOnSpark laid out across the executors.
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code example #2
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))


def parse(ln):
    lbl, img = ln.split('|')
    image = [int(x) for x in img.split(',')]
    label = numpy.zeros(10)
    label[int(lbl)] = 1.0
    return (image, label)
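
# Quick illustration of parse() (the sample line is made up): it maps one
# "label|p1,p2,..." text line to an (image, one-hot label) pair.
sample_image, sample_label = parse("7|0,0,128,255")
assert sample_image == [0, 0, 128, 255]
assert sample_label[7] == 1.0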


stream = ssc.textFileStream(args.images)
imageRDD = stream.map(lambda ln: parse(ln))

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size,
                        num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(imageRDD)
else:
    labelRDD = cluster.inference(imageRDD)
    labelRDD.saveAsTextFiles(args.output)

ssc.start()
cluster.shutdown(ssc)

print("{0} ===== Stop".format(datetime.now().isoformat()))
Code example #3
                        help="method to ingest data: (spark|tf)",
                        choices=["spark", "tf"],
                        default="tf")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))
    sc = SparkContext(
        conf=SparkConf().setAppName('imagenet_distributed_train'))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard, input_mode)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            args.tensorboard, input_mode)
    if input_mode == TFCluster.InputMode.SPARK:
        dataRDD = sc.newAPIHadoopFile(
            args.input_data,
            "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
        cluster.train(dataRDD, args.epochs)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
Code example #4
File: meme.py  Project: wfus/zzzz
import sys

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster, TFNode
from datetime import datetime


def main_fun(argv, ctx):
    """Main function entrance for spark. Make sure that all imports are done here,
    or spark will try to serialize libraries when they are placed outside
    for each executor, and we don't want that! ~WFU"""
    import tensorflow as tf


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("Nacho"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_processes = 1
    use_tensorboard = False

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors,
                            num_processes, use_tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
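
main_fun above is left as a stub that only shows where the imports belong. For reference, here is a minimal sketch of what such a worker function typically grows into, following the TFNode API used in the other examples on this page; the PS/worker split is standard TensorFlowOnSpark usage, and the training loop is a placeholder.

def main_fun(argv, ctx):
    # Per the docstring: import inside the function so Spark does not
    # serialize the TensorFlow module along with the closure.
    import tensorflow as tf
    from com.yahoo.ml.tf import TFNode

    # Join the TF cluster that TFCluster.run() laid out across the executors.
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, False)

    if ctx.job_name == "ps":
        server.join()  # parameter servers host variables and block here
    else:
        # Workers build the model graph and run the training loop (elided).
        pass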
Code example #5

import sys

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from tensorflowonspark import TFCluster  # com.yahoo.ml.tf in older releases

# main_fun(argv, ctx), the per-executor entry point, is defined earlier in
# the full script and elided from this excerpt.

if __name__ == '__main__':
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_ps_tasks",
                        help="number of PS nodes",
                        type=int,
                        default=0)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    (args, rem) = parser.parse_known_args()

    # At least one worker must remain after allocating the PS tasks.
    assert num_executors > args.num_ps_tasks
    #cluster = TFCluster.reserve(sc, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size,
                            args.num_ps_tasks, args.tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()