def test_spark_saved_model(self):
    """Train with TFEstimator in InputMode.SPARK, export a saved_model, and infer with the TFModel.

    The fitted model is queried twice: first through the exported signature
    ('test_key'), then with the signature key cleared and tensors mapped
    directly by name.
    """
    # Training DataFrame of (features, labels) examples.
    train_df = self.spark.createDataFrame(self.train_examples, ['col1', 'col2'])

    # Fitting the estimator trains the model and exports it.
    args = {}
    estimator = (
        TFEstimator(self.get_function('spark/train'), args)
        .setInputMapping({'col1': 'x', 'col2': 'y_'})
        .setModelDir(self.model_dir)
        .setExportDir(self.export_dir)
        .setClusterSize(self.num_workers)
        .setNumPS(1)
        .setBatchSize(10)
        .setEpochs(2)
    )
    model = estimator.fit(train_df)
    self.assertTrue(os.path.isdir(self.export_dir))

    # Test DataFrame of (features, labels) examples.
    test_df = self.spark.createDataFrame(self.test_examples, ['c1', 'c2'])

    # Pass 1: inference via the exported signature.
    (
        model.setTagSet('test_tag')
        .setSignatureDefKey('test_key')
        .setInputMapping({'c1': 'features'})
        .setOutputMapping({'prediction': 'cout'})
    )
    first_row = model.transform(test_df).head()  # first/only result
    prediction = first_row.cout[0]  # scalar wrapped in a tensor
    expected = np.sum(self.weights)
    self.assertAlmostEqual(prediction, expected, 5)

    # Pass 2: inference via custom/direct tensor mapping (no signature key).
    (
        model.setTagSet('test_tag')
        .setSignatureDefKey(None)
        .setInputMapping({'c1': 'x'})
        .setOutputMapping({'y': 'cout1', 'y2': 'cout2'})
    )
    first_row = model.transform(test_df).head()  # first/only result
    prediction = first_row.cout1[0]  # predicted scalar
    squared_prediction = first_row.cout2[0]  # squared prediction scalar
    self.assertAlmostEqual(prediction, expected, 5)
    self.assertAlmostEqual(squared_prediction, expected * expected, 5)
def get_tf_estimator(args):
    """Return a TensorFlow estimator configured from ``args``.

    The feature and label columns are each mapped to a tensor of the same
    name; all remaining settings are copied from the matching ``args``
    attributes.
    """
    # Identity mapping: column name -> identically named tensor.
    input_mapping = {
        args.feature_alias: args.feature_alias,
        args.label_name: args.label_name,
    }
    return (
        TFEstimator(softmax_dist.map_fun, args)
        .setInputMapping(input_mapping)
        .setModelDir(args.model_dir)
        .setExportDir(args.export_dir)
        .setClusterSize(args.cluster_size)
        .setNumPS(args.num_ps)
        .setProtocol(args.protocol)
        .setTensorboard(args.tensorboard)
        .setEpochs(args.epochs)
        .setBatchSize(args.batch_size)
        .setSteps(args.steps)
    )
def test_spark_saved_model(self):
    """Train a Keras model with TFEstimator in InputMode.SPARK, export a saved_model, and infer with the TFModel."""

    def _spark_train(args, ctx):
        """Fit a single-unit linear Keras model from a Spark data feed and export it from the chief."""
        import tensorflow as tf
        from tensorflowonspark import TFNode

        tf.compat.v1.reset_default_graph()
        strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

        with strategy.scope():
            model = Sequential()
            model.add(Dense(1, activation='linear', input_shape=[2]))
            model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.2), loss='mse', metrics=['mse'])
            model.summary()

        tf_feed = TFNode.DataFeed(ctx.mgr, input_mapping=args.input_mapping)

        def rdd_generator():
            # Pull one record at a time until the feed stops or returns an empty batch.
            while not tf_feed.should_stop():
                batch = tf_feed.next_batch(1)
                if len(batch['x']) == 0:
                    return
                yield (batch['x'][0], batch['y_'][0])

        output_types = (tf.float32, tf.float32)
        output_shapes = (tf.TensorShape([2]), tf.TensorShape([1]))
        ds = tf.data.Dataset.from_generator(rdd_generator, output_types, output_shapes)
        ds = ds.batch(args.batch_size)

        # Turn off dataset auto-sharding.
        options = tf.data.Options()
        options.experimental_distribute.auto_shard = False
        ds = ds.with_options(options)

        # Train on only 90% of each epoch, since RDD partition sizes can be uneven.
        global_batch_size = args.batch_size * ctx.num_workers
        steps_per_epoch = 1000 * 0.9 // global_batch_size

        tf.io.gfile.makedirs(args.model_dir)
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            filepath=args.model_dir + "/weights-{epoch:04d}",
            verbose=1,
            load_weights_on_restart=True,
            save_weights_only=True)

        # NOTE: model.fit_generator is not used here; per the original author it fails with
        # "NotImplementedError: `fit_generator` is not supported for models compiled with tf.distribute.Strategy".
        model.fit(ds, epochs=args.epochs, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint])

        if ctx.job_name == 'chief' and args.export_dir:
            print("exporting model to: {}".format(args.export_dir))
            tf.keras.experimental.export_saved_model(model, args.export_dir)

        tf_feed.terminate()

    # Training DataFrame of (features, labels) examples, built from a 2-partition RDD.
    train_rdd = self.sc.parallelize(self.train_examples, 2)
    train_df = train_rdd.toDF(['col1', 'col2'])

    # Fitting the estimator trains the model and exports it.
    args = {}
    estimator = (
        TFEstimator(_spark_train, args)
        .setInputMapping({'col1': 'x', 'col2': 'y_'})
        .setModelDir(self.model_dir)
        .setExportDir(self.export_dir)
        .setClusterSize(self.num_workers)
        .setNumPS(0)
        .setBatchSize(1)
        .setEpochs(1)
    )
    model = estimator.fit(train_df)
    self.assertTrue(os.path.isdir(self.export_dir))

    # Test DataFrame of (features, labels) examples.
    test_df = self.spark.createDataFrame(self.test_examples, ['c1', 'c2'])

    # Inference via the exported signature.
    (
        model.setTagSet('serve')
        .setSignatureDefKey('serving_default')
        .setInputMapping({'c1': 'dense_input'})
        .setOutputMapping({'dense': 'cout'})
    )
    first_row = model.transform(test_df).head()  # first/only result
    prediction = first_row.cout[0]  # scalar wrapped in a tensor
    expected = np.sum(self.weights)
    self.assertAlmostEqual(prediction, expected, 2)
# create RDD of input data def parse(ln): vec = [int(x) for x in ln.split(',')] return (vec[1:], vec[0]) images_labels = sc.textFile(args.images_labels).map(parse) df = spark.createDataFrame(images_labels, ['image', 'label']) df.show() if args.mode == 'train': estimator = TFEstimator(main_fun, args) \ .setInputMapping({'image': 'image', 'label': 'label'}) \ .setModelDir(args.model_dir) \ .setExportDir(args.export_dir) \ .setClusterSize(args.cluster_size) \ .setTensorboard(args.tensorboard) \ .setEpochs(args.epochs) \ .setBatchSize(args.batch_size) \ .setGraceSecs(60) model = estimator.fit(df) else: # args.mode == 'inference': # using a trained/exported model model = TFModel(args) \ .setInputMapping({'image': 'conv2d_input'}) \ .setOutputMapping({'dense_1': 'prediction'}) \ .setSignatureDefKey('serving_default') \ .setExportDir(args.export_dir) \ .setBatchSize(args.batch_size) def argmax_fn(l):
def test_spark_sparse_tensor(self):
    """Feed sparse tensors through InputMode.SPARK for both training and inference."""

    def sparse_train(args, ctx):
        import tensorflow as tf

        # A Spark python worker may be re-used across tests, so start from a clean graph.
        tf.reset_default_graph()

        cluster, server = ctx.start_cluster_server(ctx)
        if ctx.job_name == "ps":
            server.join()
            return
        if ctx.job_name != "worker":
            return

        device_setter = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % ctx.task_index,
            cluster=cluster)
        with tf.device(device_setter):
            label_ph = tf.placeholder(tf.float32, name='y_label')
            # The op named 'label' is referenced by the output mapping set on the fitted model.
            tf.identity(label_ph, name='label')

            rows_ph = tf.placeholder(tf.int64, name='x_row_indices')
            cols_ph = tf.placeholder(tf.int64, name='x_col_indices')
            values_ph = tf.placeholder(tf.float32, name='x_values')
            sparse_indices = tf.stack([rows_ph[0], cols_ph[0]], axis=1)
            sparse_values = values_ph[0]

            x = tf.SparseTensor(
                indices=sparse_indices,
                values=sparse_values,
                dense_shape=[args.batch_size, 10])
            w = tf.Variable(tf.truncated_normal([10, 1]), name='w')
            y = tf.sparse_tensor_dense_matmul(x, w, name='y')

            global_step = tf.train.get_or_create_global_step()
            cost = tf.reduce_mean(tf.square(label_ph - y), name='cost')
            train_op = tf.train.GradientDescentOptimizer(0.1).minimize(cost, global_step)

        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=(ctx.task_index == 0),
                checkpoint_dir=args.model_dir,
                save_checkpoint_steps=20) as sess:
            tf_feed = ctx.get_data_feed(input_mapping=args.input_mapping)
            while not (sess.should_stop() or tf_feed.should_stop()):
                batch = tf_feed.next_batch(args.batch_size)
                if len(batch['y_label']) == 0:
                    continue
                print("batch: {}".format(batch))
                feed = {
                    label_ph: batch['y_label'],
                    rows_ph: batch['x_row_indices'],
                    cols_ph: batch['x_col_indices'],
                    values_ph: batch['x_values'],
                }
                _, _pred, trained_weights = sess.run([train_op, y, w], feed_dict=feed)
                print("trained_weights: {}".format(trained_weights))

        # Give MonitoredTrainingSession time to save its last checkpoint.
        time.sleep(10)

    args = {}
    estimator = (
        TFEstimator(sparse_train, args)
        .setInputMapping({
            'labels': 'y_label',
            'row_indices': 'x_row_indices',
            'col_indices': 'x_col_indices',
            'values': 'x_values',
        })
        .setInputMode(TFCluster.InputMode.SPARK)
        .setModelDir(self.model_dir)
        .setClusterSize(self.num_workers)
        .setNumPS(1)
        .setBatchSize(1)
    )

    model_weights = np.array([[1.0, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0]]).T
    columns = ["labels", "row_indices", "col_indices", "values"]

    def to_row(e):
        # (label, row indices, column indices, values) for one sparse 1x10 example.
        return ((e * model_weights).tolist()[0][0], e.row.tolist(), e.col.tolist(), e.data.tolist())

    # Training set: 200 random sparse examples labelled by model_weights.
    examples = [scipy.sparse.random(1, 10, density=0.5) for _ in range(200)]
    df = self.sc.parallelize(examples).map(to_row).toDF(columns)
    df.show(5)
    model = estimator.fit(df)
    model.setOutputMapping({'label': 'label', 'y/SparseTensorDenseMatMul': 'predictions'})

    # Test set: 50 more random sparse examples in the same layout.
    test_examples = [scipy.sparse.random(1, 10, density=0.5) for _ in range(50)]
    test_df = self.sc.parallelize(test_examples).map(to_row).toDF(columns)
    test_df.show(5)
    preds = model.transform(test_df)
    preds.show(5)
help="HDFS path to validation data", type=str) (args, rem) = parser.parse_known_args() input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW print("{0} ===== Start".format(datetime.now().isoformat())) df = dfutil.loadTFRecords(sc, args.train_data, binary_features=['image/encoded']) estimator = TFEstimator(main_fun, sys.argv, export_fn=inception_export.export) \ .setModelDir(args.train_dir) \ .setExportDir(args.export_dir) \ .setTFRecordDir(args.tfrecord_dir) \ .setClusterSize(args.cluster_size) \ .setNumPS(args.num_ps) \ .setInputMode(TFCluster.InputMode.TENSORFLOW) \ .setTensorboard(args.tensorboard) \ print("{0} ===== Train".format(datetime.now().isoformat())) model = estimator.fit(df) print("{0} ===== Inference".format(datetime.now().isoformat())) df = dfutil.loadTFRecords(sc, args.validation_data, binary_features=['image/encoded']) preds = model.setTagSet(tf.saved_model.tag_constants.SERVING) \ .setSignatureDefKey(tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY) \ .setInputMapping({'image/encoded': 'jpegs', 'image/class/label': 'labels'}) \ .setOutputMapping({'top_5_acc': 'output'}) \
if args.train: # train a model using Spark Estimator fitted to a DataFrame print("{0} ===== Estimator.fit()".format(datetime.now().isoformat())) # dummy tf args (from imagenet/inception example) tf_args = { 'initial_learning_rate': 0.045, 'num_epochs_per_decay': 2.0, 'learning_rate_decay_factor': 0.94 } estimator = TFEstimator(mnist_dist_pipeline.map_fun, args, export_fn=mnist_dist_pipeline.export_fun) \ .setModelDir(args.model_dir) \ .setExportDir(args.export_dir) \ .setClusterSize(args.cluster_size) \ .setNumPS(args.num_ps) \ .setInputMode(TFCluster.InputMode.TENSORFLOW) \ .setTFRecordDir(args.tfrecord_dir) \ .setProtocol(args.protocol) \ .setReaders(args.readers) \ .setTensorboard(args.tensorboard) \ .setEpochs(args.epochs) \ .setBatchSize(args.batch_size) \ .setSteps(args.steps) model = estimator.fit(df) else: # use a previously trained/exported model model = TFModel(args) \ .setExportDir(args.export_dir) \ .setBatchSize(args.batch_size) # NO INFERENCING if args.inference_mode == 'none':
else: raise Exception("Unsupported format: {}".format(args.format)) # Pipeline API if args.train: # train a model using Spark Estimator fitted to a DataFrame print("{0} ===== Estimator.fit()".format(datetime.now().isoformat())) # dummy tf args (from imagenet/inception example) tf_args = {'initial_learning_rate': 0.045, 'num_epochs_per_decay': 2.0, 'learning_rate_decay_factor': 0.94} estimator = TFEstimator(mnist_dist_pipeline.map_fun, tf_args) \ .setInputMapping({'image': 'image', 'label': 'label'}) \ .setModelDir(args.model_dir) \ .setExportDir(args.export_dir) \ .setClusterSize(args.cluster_size) \ .setNumPS(args.num_ps) \ .setProtocol(args.protocol) \ .setTensorboard(args.tensorboard) \ .setEpochs(args.epochs) \ .setBatchSize(args.batch_size) \ .setSteps(args.steps) model = estimator.fit(df) else: # use a previously trained/exported model model = TFModel(args) \ .setExportDir(args.export_dir) \ .setBatchSize(args.batch_size) # NO INFERENCING if args.inference_mode == 'none': sys.exit(0)