Example #1
if args.format == "tfr":
  # TFRecords arrive as (bytes, null) pairs; decode each into numpy arrays
  images = sc.newAPIHadoopFile(args.images,
                               "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                               keyClass="org.apache.hadoop.io.BytesWritable",
                               valueClass="org.apache.hadoop.io.NullWritable")
  def toNumpy(bytestr):
    example = tf.train.Example()
    example.ParseFromString(bytestr)
    features = example.features.feature
    image = numpy.array(features['image'].int64_list.value)
    label = numpy.array(features['label'].int64_list.value)
    return (image, label)
  dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
  if args.format == "csv":
    images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
    labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
  else: # args.format == "pickle":
    images = sc.pickleFile(args.images)
    labels = sc.pickleFile(args.labels)
  print("zipping images and labels")
  dataRDD = images.zip(labels)


cluster = TFCluster.reserve(sc, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
cluster.start(mnist_dist.map_fun, args)
if args.mode == "train":
  cluster.train(dataRDD, args.epochs)
else:
  labelRDD = cluster.inference(dataRDD)
  labelRDD.saveAsTextFile(args.output)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))



# def main():
def main_fun(argv, ctx):
    # TensorFlow worker code would go here; argv carries the command-line
    # arguments and ctx describes this node's role in the reserved cluster.
    pass


if __name__ == '__main__':
    # tf.app.run()
    import argparse
    import sys
    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("your_app_name"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    tensorboard = True

    cluster = TFCluster.reserve(sc, num_executors, num_ps, tensorboard,
                                TFCluster.InputMode.TENSORFLOW)
    cluster.start(main_fun, sys.argv)
    cluster.shutdown()
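
This example uses the older two-step API (reserve, then start). Later examples on this page migrate to the one-call TFCluster.run, visible in the commented-out lines of Examples #4 and #10; the two forms are equivalent:

# older two-step form
cluster = TFCluster.reserve(sc, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)
cluster.start(main_fun, sys.argv)

# newer one-call form
cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)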
Example #3
def main_fun(argv, ctx):
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0

    cluster = TFCluster.reserve(sc, num_executors, num_ps, False,
                                TFCluster.InputMode.TENSORFLOW)
    cluster.start(main_fun, sys.argv)
    cluster.shutdown()
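
The ctx argument consumed by TFNode.start_cluster_server carries this node's assignment within the reserved cluster. A hedged sketch of how a main_fun typically branches on it (field names as in TFoS; the 1 and False arguments mirror the num_gpus and rdma values above):

def main_fun(argv, ctx):
    from com.yahoo.ml.tf import TFNode
    print("job_name:", ctx.job_name, "task_index:", ctx.task_index)
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, False)
    if ctx.job_name == "ps":
        server.join()  # parameter servers only serve variables
    else:
        pass           # workers build the graph and drive the session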
Example #4
def main_fun(argv, ctx):
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 0

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
    parser.add_argument("--input_mode",
                        help="method to ingest data: (spark|tf)",
                        choices=["spark", "tf"],
                        default="tf")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))
    sc = SparkContext(
        conf=SparkConf().setAppName('imagenet_distributed_train'))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1

    cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard,
                                input_mode)
    cluster.start(main_fun, sys.argv)
    if input_mode == TFCluster.InputMode.SPARK:
        dataRDD = sc.newAPIHadoopFile(
            args.input_data,
            "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
        cluster.train(dataRDD, args.epochs)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
Example #6
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))


def parse(ln):
    lbl, img = ln.split('|')
    image = [int(x) for x in img.split(',')]
    label = numpy.zeros(10)
    label[int(lbl)] = 1.0  # one-hot encode the digit label
    return (image, label)


stream = ssc.textFileStream(args.images)
imageRDD = stream.map(parse)

cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size,
                        num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(imageRDD)
else:
    labelRDD = cluster.inference(imageRDD)
    labelRDD.saveAsTextFiles(args.output)

ssc.start()
cluster.shutdown(ssc)

print("{0} ===== Stop".format(datetime.now().isoformat()))
Example #7
    # (main_fun body truncated; its final training call ends with the argument
    #  sync_optimizer=optimizer if FLAGS.sync_replicas else None)


if __name__ == '__main__':
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_ps_tasks",
                        help="number of PS nodes",
                        type=int,
                        default=0)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    (args, rem) = parser.parse_known_args()

    assert num_executors > args.num_ps_tasks, "need at least one non-PS executor"
    cluster = TFCluster.reserve(sc, args.cluster_size, args.num_ps_tasks,
                                args.tensorboard,
                                TFCluster.InputMode.TENSORFLOW)
    cluster.start(main_fun, sys.argv)
    cluster.shutdown()
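
The assert guards the cluster arithmetic: TFoS carves num_ps_tasks parameter servers out of the executors and everything left over becomes a worker, so at least one executor must remain. For example:

cluster_size, num_ps_tasks = 4, 1
num_workers = cluster_size - num_ps_tasks  # 3 workers, one of them the chief
assert cluster_size > num_ps_tasks         # otherwise no workers would remain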
Example #8
File: meme.py  Project: wfus/zzzz
import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster, TFNode
from datetime import datetime


def main_fun(argv, ctx):
    """Main function entrance for spark. Make sure that all imports are done here,
    or spark will try to serialize libraries when they are placed outside
    for each executor, and we don't want that! ~WFU"""
    import tensorflow as tf


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("Nacho"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_processes = 1
    use_tensorboard = False

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors,
                            num_processes, use_tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
                        help="method to ingest data: (spark|tf)",
                        choices=["spark", "tf"],
                        default="tf")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))
    sc = SparkContext(
        conf=SparkConf().setAppName('imagenet_distributed_train'))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1

    #cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard, input_mode)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            args.tensorboard, input_mode)
    if input_mode == TFCluster.InputMode.SPARK:
        dataRDD = sc.newAPIHadoopFile(
            args.input_data,
            "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
        cluster.train(dataRDD, args.epochs)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
Example #10

if __name__ == '__main__':
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--num_ps_tasks",
                        help="number of PS nodes",
                        type=int,
                        default=0)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    (args, rem) = parser.parse_known_args()

    assert num_executors > args.num_ps_tasks, "need at least one non-PS executor"
    #cluster = TFCluster.reserve(sc, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    #cluster.start(main_fun, sys.argv)
    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size,
                            args.num_ps_tasks, args.tensorboard,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
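
Taken together, the examples above share one driver skeleton. A condensed, hedged template using the names seen throughout this page:

import sys
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from com.yahoo.ml.tf import TFCluster


def main_fun(argv, ctx):
    import tensorflow as tf
    # build and run the distributed TensorFlow graph here


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("tfos_app"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, 1,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()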