Code Example #1
  def test_inputmode_spark(self):
    """Distributed TF cluster w/ InputMode.SPARK"""
    def _map_fun(args, ctx):
      import tensorflow as tf
      cluster, server = TFNode.start_cluster_server(ctx)
      if ctx.job_name == "ps":
        server.join()
      elif ctx.job_name == "worker":
        with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d" % ctx.task_index,
          cluster=cluster)):
          x = tf.placeholder(tf.int32, [None, 1])
          sq = tf.square(x)
          init_op = tf.global_variables_initializer()
        sv = tf.train.Supervisor(is_chief=(ctx.task_index == 0),
                                init_op=init_op)
        with sv.managed_session(server.target) as sess:
          tf_feed = TFNode.DataFeed(ctx.mgr, False)
          while not sv.should_stop() and not tf_feed.should_stop():
            outputs = sess.run([sq], feed_dict={ x: tf_feed.next_batch(10) })
            tf_feed.batch_results(outputs[0])
        sv.stop()

    input = [ [x] for x in range(1000) ]    # set up input as tensors of shape [1] to match placeholder
    rdd = self.sc.parallelize(input, 10)
    cluster = TFCluster.run(self.sc, _map_fun, tf_args={}, num_executors=self.num_workers, num_ps=0, input_mode=TFCluster.InputMode.SPARK)
    rdd_out = cluster.inference(rdd)
    rdd_sum = rdd_out.sum()
    self.assertEqual(sum( [x * x for x in range(1000)] ), rdd_sum)
    cluster.shutdown()
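
The worker loop above follows the fixed shape that InputMode.SPARK jobs use: build a TFNode.DataFeed from ctx.mgr, pull batches until the feed reports it is done, and push one result back per input record. A TF-free sketch of that loop, with the squaring done in plain Python as a stand-in for a real model:

from tensorflowonspark import TFNode

def _map_fun(args, ctx):
  # train_mode=False means results are expected back for every batch
  tf_feed = TFNode.DataFeed(ctx.mgr, False)
  while not tf_feed.should_stop():
    batch = tf_feed.next_batch(10)          # records from the current Spark partition
    if len(batch) > 0:
      tf_feed.batch_results([[x[0] * x[0]] for x in batch])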
Code Example #2
  def test_basic_tf(self):
    """Single-node TF graph (w/ args) running independently on multiple executors."""
    def _map_fun(args, ctx):
      import tensorflow as tf
      x = tf.constant(args['x'])
      y = tf.constant(args['y'])
      sum = tf.add(x,y)
      with tf.Session() as sess:
        result = sess.run([sum])
        assert result[0] == 3

    args = { 'x':1, 'y':2 }
    cluster = TFCluster.run(self.sc, _map_fun, tf_args=args, num_executors=self.num_workers, num_ps=0)
    cluster.shutdown()
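
For orientation, the remaining snippets all share the same driver-side skeleton. Below is a minimal sketch of that skeleton; the app name, placeholder map function, and tf_args dict are invented for illustration, and the positional layout is inferred from the calls in these examples rather than taken from an authoritative API listing:

from pyspark import SparkConf, SparkContext
from tensorflowonspark import TFCluster

def _map_fun(args, ctx):
  # trivial per-executor function; a real job would build a TF graph here
  print("node {0}:{1} got args {2}".format(ctx.job_name, ctx.task_index, args))

if __name__ == "__main__":
  sc = SparkContext(conf=SparkConf().setAppName("tfcluster_skeleton"))
  executors = sc._conf.get("spark.executor.instances")
  num_executors = int(executors) if executors is not None else 1
  # positional layout shared by the examples in this section:
  #   sc, map_fun, tf_args, num_executors, num_ps, tensorboard, input_mode
  cluster = TFCluster.run(sc, _map_fun, {"x": 1}, num_executors, 0,
                          False, TFCluster.InputMode.TENSORFLOW)
  cluster.shutdown()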
Code Example #3
File: mnist_tf.py  Project: yufengm/TensorFlowOnSpark
    classifier.export_saved_model(args.export_dir, serving_input_receiver_fn)


if __name__ == "__main__":
  # tf.app.run()

  from pyspark.context import SparkContext
  from pyspark.conf import SparkConf
  from tensorflowonspark import TFCluster
  import argparse

  sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator"))
  executors = sc._conf.get("spark.executor.instances")
  num_executors = int(executors) if executors is not None else 1

  parser = argparse.ArgumentParser()
  parser.add_argument("--batch_size", help="number of records per batch", type=int, default=64)
  parser.add_argument("--buffer_size", help="size of shuffle buffer", type=int, default=10000)
  parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
  parser.add_argument("--epochs", help="number of epochs", type=int, default=3)
  parser.add_argument("--learning_rate", help="learning rate", type=float, default=1e-4)
  parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model")
  parser.add_argument("--export_dir", help="path to export saved_model", default="mnist_export")
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

  args = parser.parse_args()
  print("args:", args)

  cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, num_ps=0, tensorboard=args.tensorboard, input_mode=TFCluster.InputMode.TENSORFLOW, log_dir=args.model_dir, master_node='chief', eval_node=True)
  cluster.shutdown(grace_secs=120)
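
The first line of this snippet exports the Estimator with a serving_input_receiver_fn defined earlier in the file (not shown here). For reference, such a function typically looks like the sketch below; the placeholder shape and feature key are assumptions for an MNIST-style model, not the file's actual definition:

import tensorflow as tf

def serving_input_receiver_fn():
  # raw input the serving signature will accept (name and shape are assumptions)
  features = tf.compat.v1.placeholder(tf.float32, [None, 28, 28, 1], name='features')
  return tf.estimator.export.ServingInputReceiver({'features': features}, features)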
Code Example #4
def main(args=None):

    spark = SparkSession \
      .builder \
      .appName("mitosis_spark") \
      .getOrCreate()
    sc = spark.sparkContext

    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1
    logging.info("============= Num of executors: {0}".format(num_executors))

    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument("--appName",
                        default="mitosis_spark",
                        help="application name")
    parser.add_argument("--hdfs_host",
                        help="HDFS host",
                        type=str,
                        default="default")
    parser.add_argument("--hdfs_port",
                        help="HDFS port",
                        type=int,
                        default=8020)
    parser.add_argument("--mitosis_img_dir",
                        help="path to the mitosis image files")
    parser.add_argument(
        "--mitosis_img_csv",
        help="csv file that contain all the mitosis image files")
    parser.add_argument("--normal_img_dir",
                        required=True,
                        help="path to the normal image files")
    parser.add_argument(
        "--normal_img_csv",
        help="csv file that contain all the normal image files")

    parser.add_argument("--batch_size",
                        help="number of records per batch",
                        type=int,
                        default=32)
    parser.add_argument("--epochs",
                        help="number of epochs",
                        type=int,
                        default=1)
    parser.add_argument("--export_dir",
                        help="HDFS path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--format",
                        help="example format: (csv|pickle|tfr)",
                        choices=["csv", "pickle", "tfr"],
                        default="csv")
    parser.add_argument(
        "--model",
        help="HDFS path to save/load model during train/inference",
        default="mnist_model")
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    parser.add_argument("--output",
                        help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("--readers",
                        help="number of reader/enqueue threads",
                        type=int,
                        default=1)
    parser.add_argument("--steps",
                        help="maximum number of steps",
                        type=int,
                        default=99)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("--mode", help="train|inference", default="train")
    parser.add_argument("--rdma", help="use rdma connection", default=False)
    args = parser.parse_args(args)

    if args.mitosis_img_dir is None and args.mitosis_img_csv is None:
        parser.error(
            "at least one of --mitosis_img_dir and --mitosis_img_csv required")

    if args.normal_img_dir is None and args.normal_img_csv is None:
        parser.error(
            "at least one of --normal_img_dir and --normal_img_csv required")

    if args.mitosis_img_csv is None:
        fs = get_hdfs(args.hdfs_host, args.hdfs_port)
        mitosis_img_pathes = fs.ls(args.mitosis_img_dir)
        mitosis_label_img_pathes = [(1, path) for path in mitosis_img_pathes]
        #mitosis_train_rdd = sc.parallelize(mitosis_img_pathes).map(lambda path : (1, path))
    else:
        # read labeled (1, path) tuples from the CSV so they can be merged into training_data below
        mitosis_label_img_pathes = sc.textFile(
            args.mitosis_img_csv).map(lambda path: (1, path)).collect()

    if args.normal_img_csv is None:
        fs = get_hdfs(args.hdfs_host, args.hdfs_port)
        normal_img_pathes = fs.ls(args.normal_img_dir)
        normal_label_img_pathes = [(0, path) for path in normal_img_pathes]
        #normal_train_rdd = sc.parallelize(normal_img_pathes).map(lambda path : (0, path))
    else:
        # read labeled (0, path) tuples from the CSV so they can be merged into training_data below
        normal_label_img_pathes = sc.textFile(
            args.normal_img_csv).map(lambda path: (0, path)).collect()

    # get the train data set with mitosis and normal images. In the output RDD,
    # each entry will be (label, img_arr)
    training_data = []
    training_data.extend(mitosis_label_img_pathes)
    training_data.extend(normal_label_img_pathes)
    print("+++++++++++ Training data size: {}".format(len(training_data)))
    data_RDD = sc.parallelize(training_data) \
      .repartition(int(len(training_data)/128/2000)) \
      .mapPartitions(lambda iter : read_images(get_hdfs(args.hdfs_host, args.hdfs_port), iter))

    cluster = TFCluster.run(sc,
                            mitosis_dist.map_fun,
                            args,
                            args.cluster_size,
                            num_ps,
                            args.tensorboard,
                            TFCluster.InputMode.SPARK,
                            log_dir=args.model)

    if args.mode == "train":
        cluster.train(data_RDD, args.epochs)
    else:
        labelRDD = cluster.inference(data_RDD)
        labelRDD.saveAsTextFile(args.output)

    cluster.shutdown(grace_secs=30)

    print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #5
parser.add_argument("--readers",
                    help="number of reader/enqueue threads",
                    type=int,
                    default=1)
parser.add_argument("--shuffle_size",
                    help="size of shuffle buffer",
                    type=int,
                    default=1000)
parser.add_argument("--steps",
                    help="maximum number of steps",
                    type=int,
                    default=1000)
parser.add_argument("--tensorboard",
                    help="launch tensorboard process",
                    action="store_true")
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc,
                        mnist_dist_dataset.map_fun,
                        args,
                        args.cluster_size,
                        args.num_ps,
                        args.tensorboard,
                        TFCluster.InputMode.TENSORFLOW,
                        driver_ps_nodes=args.driver_ps_nodes)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #6
File: softmax.py  Project: haixiaoxuan/code-python
train_df = read_train_data(hiveContext, label_name, args, feature_alias)

# Extract the number of label classes
label_type, type_count = extract_label_species(train_df, label_name)
args.label_count = type_count

# Apply one-hot encoding
dataRDD = train_df.rdd.map(label_one_hot)

trainRDD, testRDD = dataRDD.randomSplit([1 - float(args.sample_ratio), float(args.sample_ratio)], seed=args.seed)

# Build the TensorFlowOnSpark cluster
cluster = TFCluster.run(sc,
                        softmax_dist.map_fun,
                        args,
                        args.cluster_size,  # number of nodes in the cluster
                        num_ps,
                        args.tensorboard,
                        TFCluster.InputMode.SPARK,
                        log_dir=args.model)
print("{0} ===== Train Start".format(datetime.now().isoformat()))
# Train the model
cluster.train(trainRDD, args.epochs)
# Shut down the cluster
cluster.shutdown(grace_secs=30)  # shut down the TensorFlowOnSpark cluster gracefully
print("{0} ===== Train Stop".format(datetime.now().isoformat()))

# Rebuild the TensorFlowOnSpark cluster for inference
args.mode = "inference"
cluster = TFCluster.run(sc,
                        softmax_dist.map_fun,
                        args,
Code Example #7
import mnist_dist

sc = SparkContext(conf=SparkConf().setAppName("mnist_tf"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv","pickle","tfr"], default="tfr")
parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format")
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format")
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model")
parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:",args)


print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))

Code Example #8
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)


if __name__ == '__main__':
  sc = SparkContext(conf=SparkConf().setAppName("cifar10_train"))
  num_executors = int(sc._conf.get("spark.executor.instances"))
  num_ps = 0

  cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, False, TFCluster.InputMode.TENSORFLOW)
  cluster.shutdown()
Code Example #9
parser.add_argument("-o",
                    "--output",
                    help="HDFS path to save test/inference output",
                    default="predictions")
parser.add_argument("-r",
                    "--readers",
                    help="number of reader/enqueue threads",
                    type=int,
                    default=1)
parser.add_argument("-s",
                    "--steps",
                    help="maximum number of steps",
                    type=int,
                    default=1000)
parser.add_argument("-tb",
                    "--tensorboard",
                    help="launch tensorboard process",
                    action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
args = parser.parse_args()
print("args:", args)

print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc, mnist_dist_dataset.map_fun, args,
                        args.cluster_size, num_ps, args.tensorboard,
                        TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #10
import resnet_cifar_dist

if __name__ == '__main__':
  # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  # absl_app.run(main)
  from pyspark.context import SparkContext
  from pyspark.conf import SparkConf
  from tensorflowonspark import TFCluster
  import argparse

  sc = SparkContext(conf=SparkConf().setAppName("resnet_cifar"))
  executors = sc._conf.get("spark.executor.instances")
  num_executors = int(executors) if executors is not None else 1

  parser = argparse.ArgumentParser()
  parser.add_argument("--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
  parser.add_argument("--num_ps", help="number of parameter servers", type=int, default=0)
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
  args, rem = parser.parse_known_args()

  cluster = TFCluster.run(sc, resnet_cifar_dist.main_fun, rem, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW, master_node='chief')
  cluster.shutdown()
Code Example #11
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)


if __name__ == '__main__':
  import argparse

  sc = SparkContext(conf=SparkConf().setAppName("train_image_classifier"))
  executors = sc._conf.get("spark.executor.instances")
  num_executors = int(executors) if executors is not None else 1

  parser = argparse.ArgumentParser()
  parser.add_argument("--num_ps_tasks", help="number of PS nodes", type=int, default=0)
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
  parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
  (args,rem) = parser.parse_known_args()

  assert(num_executors > args.num_ps_tasks)
  cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, args.num_ps_tasks, args.tensorboard, TFCluster.InputMode.TENSORFLOW)
  cluster.shutdown()
Code Example #12
        help="HDFS path to save/load model during train/inference",
        default="mnist_model")
    parser.add_argument("--output",
                        help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("--num_ps",
                        help="number of PS nodes in cluster",
                        type=int,
                        default=1)
    parser.add_argument("--steps",
                        help="maximum number of steps",
                        type=int,
                        default=1000)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    cluster = TFCluster.run(sc,
                            main,
                            args,
                            args.cluster_size,
                            args.num_ps,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.TENSORFLOW,
                            log_dir=args.model,
                            master_node='master')
    cluster.shutdown()
Code Example #13
File: NN_spark.py  Project: shaz13/gsoc_relationship
        self.epochs = conf["NN.epochs"]
        self.cluster_size = conf["NN.cluster_size"]
        self.steps = conf["NN.steps"]


args = argsClass()

print("{0} ===== Start".format(datetime.now().isoformat()))

if args.mode == "train" or args.mode == "inference":
    datafile = sqlContext.read.format("parquet").load(args.input_file)
    labelRDD = datafile.rdd.map(lambda row: row._2._1)
    featureRDD = datafile.rdd.map(lambda row: row._2._2)
    dataRDD = featureRDD.zip(labelRDD)
else:
    datafile = sqlContext.read.format("parquet").load(args.input_file)
    sha256RDD = datafile.rdd.map(lambda row: row._1)
    featureRDD = datafile.rdd.map(lambda row: row._2._2)
    dataRDD = featureRDD.zip(sha256RDD)

cluster = TFCluster.run(sc, NN_dist.map_fun, args, args.cluster_size, num_ps,
                        False, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #14
                                sep=",",
                                inferSchema=True,
                                header=False)
    else:
        print('Wrong value for parameter mode. Allowed: all, nlp or train')
        quit()

    if (args.mode == 'all') or (args.mode == 'train'):
        # Cache the dataset before training
        train_rdd = train_df.rdd.cache()

        # Train the model in the Spark cluster
        print('{} Starting model training'.format(datetime.now()))
        num_ps = 1
        cluster = TFCluster.run(sc,
                                main_fun,
                                args,
                                num_executors,
                                num_ps,
                                args.tensorboard,
                                TFCluster.InputMode.SPARK,
                                log_dir=args.model_folder,
                                master_node='master',
                                reservation_timeout=60)
        cluster.train(train_rdd, args.epochs, feed_timeout=4800)

        print('{} End model training'.format(datetime.now()))
        cluster.shutdown()

    print('{} End program'.format(datetime.now()))
Code Example #15
                        help="number of ps nodes",
                        type=int,
                        default=1)
    parser.add_argument("--task_num",
                        help="number of worker nodes",
                        type=int,
                        default=1)
    parser.add_argument("--max_steps",
                        help="max number of steps to train",
                        type=int,
                        default=2000000)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    assert (args.num_ps + args.task_num == num_executors)

    cluster = TFCluster.run(sc,
                            main_func,
                            args,
                            args.cluster_size,
                            args.num_ps,
                            args.tensorboard,
                            TFCluster.InputMode.TENSORFLOW,
                            log_dir=args.model_dir,
                            master_node='master')
    cluster.shutdown()
Code Example #16
    # arguments for Spark and TFoS
    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=executors)
    parser.add_argument("--num_ps",
                        help="number of ps nodes",
                        type=int,
                        default=1)
    (args, remainder) = parser.parse_known_args()

    # construct an ARGV (with script name as first element) from remaining args and pass it to the TF processes on executors
    remainder.insert(0, __file__)
    print("spark args:", args)
    print("tf args:", remainder)

    num_workers = args.cluster_size - args.num_ps
    print("===== num_executors={}, num_workers={}, num_ps={}".format(
        args.cluster_size, num_workers, args.num_ps))

    cluster = TFCluster.run(sc,
                            main_fun,
                            remainder,
                            args.cluster_size,
                            args.num_ps,
                            False,
                            TFCluster.InputMode.TENSORFLOW,
                            master_node='master')
    cluster.shutdown()
Code Example #17
def unit2():
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf

    import argparse
    import os
    import numpy
    import sys
    import tensorflow as tf
    import threading
    from datetime import datetime
    from hops import util
    from hops import hdfs

    from tensorflowonspark import TFCluster

    sc = spark.sparkContext
    num_executors = util.num_executors(spark)
    num_ps = util.num_param_servers(spark)

    parser = argparse.ArgumentParser()
    parser.add_argument("-e",
                        "--epochs",
                        help="number of epochs",
                        type=int,
                        default=0)
    parser.add_argument("-f",
                        "--format",
                        help="example format: (csv|pickle|tfr)",
                        choices=["csv", "pickle", "tfr"],
                        default="csv")
    parser.add_argument(
        "-i",
        "--images",
        help="HDFS path to MNIST images in parallelized format",
        default='/Projects/' + hdfs.project_name() + '/mnist/train/images')
    parser.add_argument(
        "-l",
        "--labels",
        help="HDFS path to MNIST labels in parallelized format",
        default='/Projects/' + hdfs.project_name() + '/mnist/train/labels')
    parser.add_argument("-m",
                        "--model",
                        help="HDFS path to save/load model during train/test",
                        default="mnist_model")
    parser.add_argument(
        "-n",
        "--cluster_size",
        help="number of nodes in the cluster (for Spark Standalone)",
        type=int,
        default=num_executors)
    parser.add_argument("-o",
                        "--output",
                        help="HDFS path to save test/inference output",
                        default="predictions")
    parser.add_argument("-r",
                        "--readers",
                        help="number of reader/enqueue threads",
                        type=int,
                        default=1)
    parser.add_argument("-s",
                        "--steps",
                        help="maximum number of steps",
                        type=int,
                        default=1000)
    parser.add_argument("-tb",
                        "--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")
    parser.add_argument("-X",
                        "--mode",
                        help="train|inference",
                        default="train")
    parser.add_argument("-c",
                        "--rdma",
                        help="use rdma connection",
                        default=False)
    args = parser.parse_args()
    print("args:", args)

    print("{0} ===== Start".format(datetime.now().isoformat()))

    cluster = TFCluster.run(sc, mnist_fun, args, args.cluster_size, num_ps,
                            args.tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()

    print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #18
        args.cluster_size, num_workers, args.num_ps))
    sys.argv = args

    #generate type_constrain.txt file
    n_n()
    if args.mode == 'train':
        if is_new_batch(): feed_batch()
        try:
            os.remove(os.path.join(args.output_path, "stop.txt"))
        except:
            pass

    if args.debug: print("Launching jobs...")
    elapsed_time = time.time()
    cluster = TFCluster.run(sc, distribute_training.main_fun, args,
                            args.cluster_size, args.num_ps, True,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown(timeout=-1)
    elapsed_time = time.time() - elapsed_time
    with open(os.path.join(args.output_path, 'time.txt'), 'w') as f:
        f.write("Elapsed time: " + str(elapsed_time) + "\n")

    if args.mode == 'train':
        if is_new_batch(): remove_batch_files()
        if args.debug:
            print("Restoring the best model founded during training...")
        if path.exists(os.path.join(args.output_path, "stop.txt")):
            step = None
            with open(os.path.join(args.output_path, "stop.txt"), "r") as f:
                step = int(f.readline().strip())
Code Example #19
        "--tensorboardlogdir",
        help=
        "Tensorboard log directory. It should on hdfs. Thus, it must be prefixed with hdfs://default"
    )

    args = parser.parse_args()

    print("args:", args)

    print("{0} ===== Start".format(datetime.now().isoformat()))

    dataRDD = sc.textFile(
        args.data).map(lambda ln: [x for x in ln.split('\t')])
    cluster = TFCluster.run(sc,
                            criteo_dist.map_fun,
                            args,
                            args.cluster_size,
                            num_ps,
                            args.tensorboard,
                            TFCluster.InputMode.SPARK,
                            log_dir=args.model)

    if args.mode == "train":
        cluster.train(dataRDD, args.epochs)
    else:
        labelRDD = cluster.inference(dataRDD)
        labelRDD.saveAsTextFile(args.output)
    cluster.shutdown()

    print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #20
                    "--num_layers",
                    help="Number of LSTM hidden layers.",
                    type=int,
                    default=2)
parser.add_argument("-hu",
                    "--hidden_units",
                    help="Number of units in LSTM hidden layer.",
                    type=int,
                    default=128)
args = parser.parse_args()

redis_logger_handler.logging_setup(args.redis)
logging.info("===== Start")

images, labels = parseFile(args.images, args.labels, args.format)
dataRDD = images.zip(labels)
args.train_size = labels.count() - args.test_size

logging.info(args)

cluster = TFCluster.run(sc, lstm_ctc_ocr_dist.map_fun, args, args.cluster_size,
                        num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
    cluster.train(dataRDD, args.epochs)
else:
    labelRDD = cluster.inference(dataRDD)
    labelRDD.saveAsTextFile(args.output)
cluster.shutdown()

logging.info("===== Stop")
Code Example #21
				
		return audio

	# Trial data paths
	paths = 'data/local/bird,data/local/bed,data/local/cat'

	# Loading data into memory
	baseAudio = sc.binaryFiles(paths)

	parser = argparse.ArgumentParser()
	parser.add_argument("--batch_size", help="number of records per batch", type=int, default=1)
	parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=1)
	# parser.add_argument("--epochs", help="number of epochs", type=int, default=3)
	# parser.add_argument("--images_labels", help="path to MNIST images and labels in parallelized format")
	# parser.add_argument("--model_dir", help="path to save checkpoint", default="mnist_model")
	parser.add_argument("--export_dir", help="path to export saved_model", default="speechModel")
	# parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

	args = parser.parse_args()

	# Applying Transformations To Data
	convertedAndLabeledAudio = baseAudio.map(lambda x: [carveClassName(x[0]), binaryToNumerical(x[1])])
	transformedAudio = convertedAndLabeledAudio.map(lambda x: [x[0], fourierTransformation(x[1])])

	# Defining Cluster
	cluster = TFCluster.run(sc, mainFun, args, num_ps = 0, num_executors = 1, tensorboard = True, input_mode = TFCluster.InputMode.SPARK, master_node='chief')

	# Training on cluster
	cluster.train(transformedAudio, num_epochs=3)
	# Shutting down cluster after training is complete
	cluster.shutdown()
Code Example #22
# hooks = [tf.train.StopAtStepHook(
#     last_step=int(int(self.dataset_size * (1 - self.test_percent) * n_epochs / self.batch_size)))]) as sess:

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument("--n_epoch", help="number of current epoch", type=int)
    parser.add_argument("--main_path",
                        help="Path to '../CatDog-CNN-Tensorflow-OnSpark'",
                        required=True,
                        type=str)
    parser.add_argument("--dataset_size",
                        help="Training size to use",
                        type=str)
    parser.add_argument("--batch_size", help="batch size to use", type=int)

    args = parser.parse_args()

    sys.path.append(args.main_path + "/CatDog-CNN-Tensorflow-OnSpark")

    import conv_net

    sc = SparkContext(conf=SparkConf().setAppName("catdog_spark"))
    num_executors = 1
    num_ps = 0

    cluster = TFCluster.run(sc, main_fun, args, num_executors, num_ps, False,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code Example #23
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.plot(train_cost[zoom_point:])
    plt.plot(test_cost[zoom_point:])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    # tf.app.run()
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("lab4_task6"))
    #num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    tensorboard = True

    cluster = TFCluster.run(sc, main_fun, [], 2, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code Example #24
File: keras.py  Project: prokosna/tfos-playground
#     preds = estimator.predict(input_fn=test_input_fn)
#     for pred in preds:
#         print(pred)


from pyspark.ml.linalg import Vectors

if __name__ == '__main__':
    from tensorflowonspark import TFCluster
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 2
    num_ps = 1

    args = edict({
        "cluster_size": num_executors,
        "num_ps": num_ps,
        "tensorboard": False,
        "model_dir": "/spark/data",
        "epochs": 1,
        "steps": 2000
    })

    # iris RDD
    iris = datasets.load_iris()
    df = spark.createDataFrame([(int(target), Vectors.dense(data)) for target, data in zip(iris.target, iris.data)], ['label', 'features'])
    rdd = df.select('features', 'label').rdd.map(tuple)

    cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard, TFCluster.InputMode.SPARK, master_node='master')
    cluster.train(rdd, args.epochs)
    cluster.shutdown()
Code Example #25
        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(dataset.num_samples /
                                    float(FLAGS.batch_size))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore)


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("eval_image_classifier"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, 0, False,
                            TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code Example #26
    parser.add_argument("--input_data", help="HDFS path to input dataset")
    parser.add_argument("--input_mode",
                        help="method to ingest data: (spark|tf)",
                        choices=["spark", "tf"],
                        default="tf")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))
    sc = SparkContext(
        conf=SparkConf().setAppName('imagenet_distributed_train'))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps,
                            args.tensorboard, input_mode)
    if input_mode == TFCluster.InputMode.SPARK:
        dataRDD = sc.newAPIHadoopFile(
            args.input_data,
            "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
        cluster.train(dataRDD, args.epochs)
    cluster.shutdown()
    print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #27
sc = SparkContext(conf=SparkConf().setAppName("mnist_tf"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=0)
parser.add_argument("-f", "--format", help="example format: (csv|pickle|tfr)", choices=["csv","pickle","tfr"], default="tfr")
parser.add_argument("-i", "--images", help="HDFS path to MNIST images in parallelized format")
parser.add_argument("-l", "--labels", help="HDFS path to MNIST labels in parallelized format")
parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/test", default="mnist_model")
parser.add_argument("-n", "--cluster_size", help="number of nodes in the cluster (for Spark Standalone)", type=int, default=num_executors)
parser.add_argument("-o", "--output", help="HDFS path to save test/inference output", default="predictions")
parser.add_argument("-r", "--readers", help="number of reader/enqueue threads", type=int, default=1)
parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", action="store_true")
parser.add_argument("-X", "--mode", help="train|inference", default="train")
parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)
parser.add_argument("-p", "--driver_ps_nodes", help="run tensorflow PS node on driver locally", default=False)
args = parser.parse_args()
print("args:",args)


print("{0} ===== Start".format(datetime.now().isoformat()))
cluster = TFCluster.run(sc, mnist_dist.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.TENSORFLOW,
                        driver_ps_nodes=args.driver_ps_nodes, log_dir=args.model)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))

Code Example #28
                        default="mnist_model")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    # create RDD of input data
    def parse(ln):
        vec = [int(x) for x in ln.split(',')]
        return (vec[1:], vec[0])

    stream = ssc.textFileStream(args.images_labels)
    images_labels = stream.map(parse)

    cluster = TFCluster.run(sc,
                            main_fun,
                            args,
                            args.cluster_size,
                            num_ps=1,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.SPARK,
                            log_dir=args.model_dir,
                            master_node='chief')
    cluster.train(
        images_labels, feed_timeout=86400
    )  # extend feed timeout to 24hrs for streaming data to arrive
    ssc.start()
    cluster.shutdown(ssc)
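
The parse helper above turns one CSV line of the stream into a (pixel_vector, label) pair; it can be sanity-checked without Spark (the sample line below is made up):

sample = "7,0,0,255,128"                 # hypothetical record: label first, then pixel values
vec = [int(x) for x in sample.split(',')]
features, label = vec[1:], vec[0]
print(features, label)                   # [0, 0, 255, 128] 7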
Code Example #29
            while True:
                eval_once(saver, summary_writer, top_k_op, summary_op)
                if FLAGS.run_once:
                    break
                time.sleep(FLAGS.eval_interval_secs)

    #cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    evaluate()


if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("cifar10_eval"))

    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 2
    num_ps = 0

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    args, unknown = parser.parse_known_args()

    cluster = TFCluster.run(sc, main_fun, sys.argv, args.cluster_size, num_ps,
                            False, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code Example #30
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.plot(train_cost[zoom_point:])
    plt.plot(test_cost[zoom_point:])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per epoch train vs test')
    plt.legend()
    plt.grid(True)
    plt.show()


if __name__ == '__main__':
    # tf.app.run()
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
    args, rem = parser.parse_known_args()

    sc = SparkContext(conf=SparkConf().setAppName("your_app_name"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 1
    tensorboard = True

    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code Example #31

if __name__ == '__main__':
    sc = SparkContext(conf=SparkConf().setAppName("read hdfs save to hdfs "))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input hdfs path")
    parser.add_argument("-m", "--model", help="HDFS path to save/load model during train/inference",
                        default="mnist_model")
    parser.add_argument("-tb", "--tensorboard", help="launch tensorboard process", default=False)
    parser.add_argument("-b", "--batch_size", help="number of records per batch", type=int, default=100)
    parser.add_argument("-e", "--epochs", help="number of epochs", type=int, default=1)
    parser.add_argument("-s", "--steps", help="maximum number of steps", type=int, default=1000)
    parser.add_argument("-X", "--mode", help="train|inference", default="train")
    parser.add_argument("-c", "--rdma", help="use rdma connection", default=False)

    args = parser.parse_args()
    print("args:", args)

    # read data
    input_data = sc.textFile(args.input).map(lambda ln: [float(x) for x in ln.split(',')])

    cluster = TFCluster.run(sc, map_fun, args, num_executors,
                            num_ps, args.tensorboard,
                            TFCluster.InputMode.SPARK)
    cluster.train(input_data, 1)

    cluster.shutdown()
Code Example #32
    parser.add_argument("--epochs", help="number of epochs",
                        type=int, default=1)
    parser.add_argument(
        "--steps", help="maximum number of steps", type=int, default=1000)

    args=parser.parse_args()

    data_loader=TextLoader(
        sc, args.data_dir, args.batch_size, args.seq_length)

    args.vocab_size = data_loader.vocab_size

    defaultFS = sc._jsc.hadoopConfiguration().get("fs.defaultFS")
    working_dir = os.getcwd()

    config_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'config.p'), defaultFS, working_dir)
    sc.parallelize([args]).saveAsPickleFile(config_file)

    chars_vocab_file = TFNode.hdfs_path(os.path.join(args.save_dir, 'chars_vocab.p'), defaultFS, working_dir)
    sc.parallelize([data_loader.chars, data_loader.vocab]).saveAsPickleFile(chars_vocab_file)

    dataRDD=sc.parallelize(data_loader.get_data_for_feeder())

    # input_mode must be passed by keyword here; the positional slot after num_ps is tensorboard
    cluster = TFCluster.run(sc, main_fun, args, num_executors,
                            args.num_ps_tasks, input_mode=TFCluster.InputMode.SPARK)

    cluster.train(dataRDD, args.epochs)

    cluster.shutdown()

    print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #33
File: train.py  Project: crafet/first_demo
                for epoch in range(FLAGS.num_epoch):
                    train_batches = train_reader.yieldBatches()
                    print("Epoch: %d" % epoch)
                    step = 0
                    for dense_x,sparse_idx,sparse_values,y in train_batches:
                        start_time = datetime.now()
                        _ ,train_loss,train_auc,summ,_ = sess.run([train_op,loss,auc_op,summary_op,step_update_op],
                           feed_dict={dense_inputs:dense_x,sparse_inputs:(sparse_idx,sparse_values,shape),labels:y})
                        step += 1
                        assert not np.isnan(train_loss), 'Model diverged with loss = NaN'
                        time_used = datetime.now() - start_time
                        if step % FLAGS.display_step == 0:
                            g_step, = sess.run([global_step])
                            print("step: " + str(step) + ", global_step: " + str(g_step))
                            summary_writer.add_summary(summ,g_step)
                            print("Step = {}, Examples = {}, Time = {}, Minibatch Loss = {}, Auc = {}".format(
                                 g_step, g_step*FLAGS.batch_size, time_used, train_loss, train_auc))
                            sys.stdout.flush()
            total_time = datetime.now() - begin_time
            print("Training Done!!")
            print("Total time used: {}".format(total_time))


if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName("tfos_online_train_distributed"))
    num_executors = int(sc._conf.get("spark.executor.instances"))
    num_ps = 64
    tensorboard = False
    cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, tensorboard, TFCluster.InputMode.TENSORFLOW)
    cluster.shutdown()
Code Example #34
    parser.add_argument("--epochs",
                        help="number of epochs",
                        type=int,
                        default=3)
    parser.add_argument("--model_dir",
                        help="path to save model/checkpoint",
                        default="mnist_model")
    parser.add_argument("--export_dir",
                        help="path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--steps_per_epoch",
                        help="number of steps per epoch",
                        type=int,
                        default=469)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    cluster = TFCluster.run(sc,
                            main_fun,
                            args,
                            args.cluster_size,
                            num_ps=0,
                            tensorboard=args.tensorboard,
                            input_mode=TFCluster.InputMode.TENSORFLOW,
                            master_node='chief')
    cluster.shutdown()
Code Example #35
    parser.add_argument("--steps_per_epoch",
                        help="number of steps per epoch",
                        type=int,
                        default=300)
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    if args.input_mode == 'tf':
        cluster = TFCluster.run(sc,
                                main_fun,
                                args,
                                args.cluster_size,
                                args.num_ps,
                                args.tensorboard,
                                TFCluster.InputMode.TENSORFLOW,
                                log_dir=args.model_dir)
    else:  # args.input_mode == 'spark':
        cluster = TFCluster.run(sc,
                                main_fun,
                                args,
                                args.cluster_size,
                                args.num_ps,
                                args.tensorboard,
                                TFCluster.InputMode.SPARK,
                                log_dir=args.model_dir)
        images = sc.textFile(
            args.images).map(lambda ln: [float(x) for x in ln.split(',')])
        labels = sc.textFile(
Code Example #36
if args.format == "tfr":
  images = sc.newAPIHadoopFile(args.images, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                              keyClass="org.apache.hadoop.io.BytesWritable",
                              valueClass="org.apache.hadoop.io.NullWritable")
  def toNumpy(bytestr):
    example = tf.train.Example()
    example.ParseFromString(bytestr)
    features = example.features.feature
    image = numpy.array(features['image'].int64_list.value)
    label = numpy.array(features['label'].int64_list.value)
    return (image, label)
  dataRDD = images.map(lambda x: toNumpy(bytes(x[0])))  # x[0] holds the serialized Example record
else:
  if args.format == "csv":
    images = sc.textFile(args.images).map(lambda ln: [int(x) for x in ln.split(',')])
    labels = sc.textFile(args.labels).map(lambda ln: [float(x) for x in ln.split(',')])
  else: # args.format == "pickle":
    images = sc.pickleFile(args.images)
    labels = sc.pickleFile(args.labels)
  print("zipping images and labels")
  dataRDD = images.zip(labels)

cluster = TFCluster.run(sc, mnist_dist2.map_fun, args, args.cluster_size, num_ps, args.tensorboard, TFCluster.InputMode.SPARK)
if args.mode == "train":
  cluster.train(dataRDD, args.epochs)
else:
  labelRDD = cluster.inference(dataRDD)
  labelRDD.saveAsTextFile(args.output)
cluster.shutdown()

print("{0} ===== Stop".format(datetime.now().isoformat()))
Code Example #37
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)

if __name__ == '__main__':
  # parse arguments needed by the Spark driver
  import argparse
  parser = argparse.ArgumentParser()
  parser.add_argument("--epochs", help="number of epochs", type=int, default=0)
  parser.add_argument("--input_data", help="HDFS path to input dataset")
  parser.add_argument("--input_mode", help="method to ingest data: (spark|tf)", choices=["spark","tf"], default="tf")
  parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")

  (args,rem) = parser.parse_known_args()

  input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

  print("{0} ===== Start".format(datetime.now().isoformat()))
  sc = SparkContext(conf=SparkConf().setAppName('imagenet_distributed_train'))
  num_executors = int(sc._conf.get("spark.executor.instances"))
  num_ps = 1

  cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors, num_ps, args.tensorboard, input_mode)
  if input_mode == TFCluster.InputMode.SPARK:
    dataRDD = sc.newAPIHadoopFile(args.input_data, "org.tensorflow.hadoop.io.TFRecordFileInputFormat",
                                keyClass="org.apache.hadoop.io.BytesWritable",
                                valueClass="org.apache.hadoop.io.NullWritable")
    cluster.train(dataRDD, args.epochs)
  cluster.shutdown()
  print("{0} ===== Stop".format(datetime.now().isoformat()))