# NOTE(review): this chunk begins inside a truncated `if` branch — the dangling
# `else:` below proves the matching `if` header (presumably a check like
# `if args.format == "tfr":`) is outside this view. Indentation of the first
# section is reconstructed accordingly; confirm against the full file.
    def toNumpy(bytestr):
        """Deserialize one serialized tf.train.Example into (image, label) numpy arrays.

        Both features are read from the example's int64_list values; shapes are
        whatever the writer stored — not validated here.
        """
        example = tf.train.Example()
        example.ParseFromString(bytestr)
        features = example.features.feature
        image = numpy.array(features['image'].int64_list.value)
        label = numpy.array(features['label'].int64_list.value)
        return (image, label)

    # Each RDD record is a (bytes, _) pair; x[0] holds the serialized Example.
    dataRDD = images.map(lambda x: toNumpy(str(x[0])))
else:
    if args.format == "csv":
        # CSV: one image per line (ints), one label per line (floats),
        # images and labels in separate, line-aligned files.
        images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
    else:  # args.format == "pickle":
        images = sc.pickleFile(args.images)
        labels = sc.pickleFile(args.labels)
    print("zipping images and labels")
    # zip() pairs images with labels positionally — requires identical
    # partitioning/ordering of the two RDDs.
    dataRDD = images.zip(labels)

# Reserve one TF task per requested node, feeding data from Spark RDDs.
cluster = TFCluster.reserve(sc, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
cluster.start(mnist_dist.map_fun, args)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    # Inference: feed the data through the cluster and persist predictions.
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
# NOTE(review): this chunk begins inside a truncated function body — presumably
# the tail of a `def main_fun(argv, ctx):` executor entry point (the TFoS
# pattern of importing TF-heavy modules inside the function). Indentation of the
# first section is reconstructed accordingly; confirm against the full file.
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData
    print("argv:", argv)
    # Replace sys.argv so tf.app.flags parses the executor-supplied arguments.
    sys.argv = argv
    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Start each evaluation from a clean eval directory.
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    # Bring up this node's TF server within the reserved cluster (1 input queue).
    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)
    inception_eval.evaluate(dataset)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("grid_imagenet_eval"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    # Evaluation only: no parameter servers, no tensorboard, TF reads its own data.
    num_ps = 0
    cluster = TFCluster.reserve(sc, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
    cluster.start(main_fun, sys.argv)
    cluster.shutdown()
# NOTE(review): this chunk begins mid-argument-parsing — the
# `parser = argparse.ArgumentParser()` line and any earlier arguments are
# outside this view.
parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark", "tf"], default="tf")
parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
(args, rem) = parser.parse_known_args()

# Map the CLI choice onto TFCluster's ingestion mode: Spark-fed RDDs vs.
# TensorFlow reading its own input files.
input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

print("{0} ===== Start".format(datetime.now().isoformat()))
sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train'))
num_executors = int(sc._conf.get("spark.executor.instances"))
num_ps = 1

cluster = TFCluster.reserve(sc, num_executors, num_ps, args.tensorboard, input_mode)
cluster.start(main_fun, sys.argv)
if input_mode == TFCluster.InputMode.SPARK:
    # Spark mode: read TFRecords via the Hadoop input format and push the
    # (bytes, null) records into the cluster for `args.epochs` epochs.
    dataRDD = sc.newAPIHadoopFile(args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                  keyClass="org.apache.hadoop.io.BytesWritable",
                                  valueClass="org.apache.hadoop.io.NullWritable")
    cluster.train(dataRDD, args.epochs)
cluster.shutdown()
print("{0} ===== Stop".format(datetime.now().isoformat()))
helper.display_image_predictions(random_test_features, random_test_labels, random_test_predictions) test_model() # def main(): def main_fun(argv, ctx): pass if __name__ == '__main__': # tf.app.run() import argparse parser = argparse.ArgumentParser() parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") args, rem = parser.parse_known_args() sc = SparkContext(conf=SparkConf().setAppName("your_app_name")) num_executors = int(sc._conf.get("spark.executor.instances")) num_ps = 1 tensorboard = True cluster = TFCluster.reserve(sc, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.start(main_fun, sys.argv) cluster.shutdown()
sync_optimizer=optimizer if FLAGS.sync_replicas else None) if __name__ == '__main__': import argparse sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier")) executors = sc._conf.get("spark.executor.instances") num_executors = int(executors) if executors is not None else 1 parser = argparse.ArgumentParser() parser.add_argument("--num_ps_tasks", help="number of PS nodes", type=int, default=0) parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true") parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors) (args, rem) = parser.parse_known_args() assert (num_executors > args.num_ps_tasks) cluster = TFCluster.reserve(sc, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW) cluster.start(main_fun, sys.argv) cluster.shutdown()