def test_monitored_session_from_json_config(script_mode):
    """ Works as intended; the hook is configured from the SageMaker JSON config file.

    Named distinctly from test_monitored_session below so the two tests do not
    shadow each other at collection time.
    """
    smd.del_hook()
    tf.reset_default_graph()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "HookParameters" : {
            "save_interval": "100"
        }
    }
    """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.global_variables_initializer()
        mnist = get_data()

        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that the hook was created and tensors were saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

def test_monitored_session(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that the hook was created and tensors were saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

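# Sketch (assumption, not shown in this excerpt): `script_mode` is expected to
# arrive via pytest parametrization rather than a fixture, along the lines of
#
#     @pytest.mark.parametrize("script_mode", [True, False])
#     def test_monitored_session(script_mode: bool):
#         ...
#
# In script mode the test constructs smd.SessionHook itself; otherwise it
# exercises the zero-code-change path driven by SagemakerSimulator.
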
def test_new_graph(out_dir):
    # Tests that we can correctly interpret an explicitly created graph
    g1 = tf.get_default_graph()
    g = tf.Graph()
    with g.as_default():
        assert g != g1
        assert g == tf.get_default_graph()
        hook = smd.SessionHook(
            out_dir,
            include_collections=["weights", "losses", "scalars"],
            save_config=smd.SaveConfig(save_steps=[0, 1, 2, 3]),
        )
        with tf.name_scope("foobar"):
            x = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            w = tf.Variable(initial_value=[[10.0], [10.0]], name="weight1")
        with tf.name_scope("foobaz"):
            w0 = [[1], [1.0]]
            y = tf.matmul(x, w0)
        loss = tf.reduce_mean((tf.matmul(x, w) - y) ** 2, name="loss")
        hook.get_collection("losses").add(loss)
        global_step = tf.Variable(17, name="global_step", trainable=False)
        increment_global_step_op = tf.assign(global_step, global_step + 1)
        optimizer = tf.train.AdamOptimizer(0.1)
        optimizer = hook.wrap_optimizer(optimizer)
        optimizer_op = optimizer.minimize(loss, global_step=increment_global_step_op)
        sess = tf.train.MonitoredSession(hooks=[hook])
        for i in range(5):
            x_ = np.random.random((10, 2)) * 0.1
            sess.run([loss, optimizer_op, increment_global_step_op], {x: x_})
        sess.close()
        tr = create_trial(out_dir)
        assert len(tr.tensor_names())

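# Sketch only (not part of the original suite): once a trial exists, saved
# tensors can be read back per step through the smdebug trial API
# (`Tensor.value(step_num)`), e.g. to eyeball the recorded loss.
def inspect_losses_sketch(out_dir):
    tr = create_trial(out_dir)
    for name in tr.tensor_names(collection="losses"):
        for step in tr.steps():
            print(step, name, tr.tensor(name).value(step))
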
def test_shapes(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    rdnc = smd.ReductionConfig(save_shape=True, save_raw_tensor=save_raw_tensor)
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    simple_model(hook)
    verify_shapes(out_dir, 0)

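# The suite's real verify_shapes is defined elsewhere. A minimal sketch of an
# equivalent check, assuming the trial API's `Tensor.shape(step_num)` accessor
# returns the recorded shape when ReductionConfig(save_shape=True) is used:
def verify_shapes_sketch(out_dir, step):
    tr = create_trial(out_dir)
    for name in tr.tensor_names():
        # Every saved tensor should report a shape at the given step.
        assert tr.tensor(name).shape(step) is not None, name
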
def test_reductions(out_dir, save_raw_tensor=False):
    pre_test_clean_up()
    rdnc = smd.ReductionConfig(
        reductions=ALLOWED_REDUCTIONS,
        abs_reductions=ALLOWED_REDUCTIONS,
        norms=ALLOWED_NORMS,
        abs_norms=ALLOWED_NORMS,
        save_raw_tensor=save_raw_tensor,
    )
    hook = smd.SessionHook(
        out_dir=out_dir,
        save_config=smd.SaveConfig(save_interval=1),
        reduction_config=rdnc,
        include_collections=["weights", "gradients", "losses"],
    )
    helper_test_reductions(out_dir, hook, save_raw_tensor)

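# Reductions are read back through `Tensor.reduction_value(step_num,
# reduction_name, abs=...)` rather than as raw values. A sketch, assuming
# "mean" is among ALLOWED_REDUCTIONS:
def inspect_reductions_sketch(out_dir):
    tr = create_trial(out_dir)
    for name in tr.tensor_names(collection="weights"):
        print(name, tr.tensor(name).reduction_value(0, "mean", abs=False))
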
def test_uninit_sess_run(out_dir):
    train_op, X, Y = get_train_op_and_placeholders()
    init = tf.global_variables_initializer()
    mnist = get_data()
    hook = smd.SessionHook(out_dir, include_collections=["weights"])

    sess = tf.train.MonitoredSession(hooks=[hook])

    with sess:
        sess.run(init)
        for step in range(1, 101):
            batch_x, batch_y = mnist.train.next_batch(32)
            sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

    # Check that the hook was created and tensors were saved
    trial = smd.create_trial(path=out_dir)
    assert len(trial.steps()) > 0, "Nothing saved at any step."
    assert len(trial.tensor_names()) > 0, "Tensors were not saved."
    assert len(trial.tensor_names(collection="weights")) > 0

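# Sketch only: a minimal follow-up read-back using the same trial API as the
# assertions above, confirming each saved weight has a readable value at every
# recorded step.
def inspect_weights_sketch(out_dir):
    trial = smd.create_trial(path=out_dir)
    for name in trial.tensor_names(collection="weights"):
        for step in trial.steps():
            assert trial.tensor(name).value(step) is not None
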
def helper_mirrored(
    trial_dir,
    save_all=False,
    num_steps=3,
    save_config=None,
    reduction_config=None,
    include_collections=None,
    steps=None,
    zcc=False,
    eval_distributed=False,
    include_workers="all",
):
    num_gpus = get_available_gpus()
    num_devices = num_gpus if num_gpus > 0 else 1
    batch_size = 10 * num_devices

    # input_fn which serves Dataset
    input_fn_provider = InputFnProvider(per_device_batch_size(batch_size, num_devices))

    # Use multiple GPUs via MirroredStrategy.
    # All available GPUs will be used if `num_gpus` is omitted.
    # if num_devices > 1:
    distribution = tf.contrib.distribute.MirroredStrategy()
    #     print("### Doing Multi GPU Training")
    # else:
    #     distribution = None

    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        eval_distribute=distribution if eval_distributed else None,
        model_dir="/tmp/mnist_convnet_model",
    )

    if save_config is None:
        save_config = smd.SaveConfig(save_interval=2)

    if include_collections is None:
        include_collections = [
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ]

    if not zcc:
        ts_hook = smd.SessionHook(
            out_dir=trial_dir,
            save_all=save_all,
            include_collections=include_collections,
            save_config=save_config,
            reduction_config=reduction_config,
            include_workers=include_workers,
        )
    else:
        print("zcc is passed. Ignoring include_collections and save_config.")

    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, config=config)
    if steps is None:
        steps = ["train"]

    for s in steps:
        if s == "train":
            print("Starting train")
            if not zcc:
                ts_hook.set_mode(smd.modes.TRAIN)
                # Train the model
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.train(input_fn=input_fn_provider.train_input_fn, steps=num_steps)
        elif s == "eval":
            print("Starting eval")
            if not zcc:
                ts_hook.set_mode(smd.modes.EVAL)
                # Evaluate the model and print results
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps, hooks=[ts_hook]
                )
            else:
                mnist_classifier.evaluate(input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
        elif s == "predict":
            print("Starting predict")
            if not zcc:
                ts_hook.set_mode(smd.modes.PREDICT)
                # Run prediction and consume the generator's results
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook]
                )
            else:
                p = mnist_classifier.predict(input_fn=input_fn_provider.eval_input_fn)
            for i in range(num_steps):
                next(p)

    get_hook()._cleanup()
    return distribution

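# `per_device_batch_size` is not shown in this excerpt; it appears to follow
# the TensorFlow official-models convention. A minimal sketch under that
# assumption: split the global batch evenly across devices, failing loudly
# when it does not divide evenly.
def per_device_batch_size_sketch(batch_size, num_devices):
    if batch_size % num_devices != 0:
        raise ValueError(
            f"Batch size {batch_size} must be divisible by {num_devices} devices."
        )
    return batch_size // num_devices
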
def help_test_mnist(
    path,
    save_config=None,
    hook=None,
    set_modes=True,
    num_steps=10,
    num_eval_steps=None,
    steps=None,
    include_collections=None,
):
    trial_dir = path
    tf.reset_default_graph()

    def cnn_model_fn(features, labels, mode):
        """Model function for CNN."""
        # Input Layer
        input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

        # Convolutional Layer #1
        conv1 = tf.layers.conv2d(
            inputs=input_layer,
            filters=32,
            kernel_size=[5, 5],
            padding="same",
            activation=tf.nn.relu,
        )

        # Pooling Layer #1
        pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

        # Convolutional Layer #2 and Pooling Layer #2
        conv2 = tf.layers.conv2d(
            inputs=pool1, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu
        )
        pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

        # Dense Layer
        pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
        dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
        dropout = tf.layers.dropout(
            inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
        )

        # Logits Layer
        logits = tf.layers.dense(inputs=dropout, units=10)

        predictions = {
            # Generate predictions (for PREDICT and EVAL mode)
            "classes": tf.argmax(input=logits, axis=1),
            # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
            # `logging_hook`.
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
        }
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

        # Calculate Loss (for both TRAIN and EVAL modes)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

        # Configure the Training Op (for TRAIN mode)
        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
            optimizer = smd.get_hook().wrap_optimizer(optimizer)
            train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

        # Add evaluation metrics (for EVAL mode)
        eval_metric_ops = {
            "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
        }
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

    # Load training and eval data
    ((train_data, train_labels), (eval_data, eval_labels)) = tf.keras.datasets.mnist.load_data()

    train_data = train_data / np.float32(255)
    train_labels = train_labels.astype(np.int32)  # not required

    eval_data = eval_data / np.float32(255)
    eval_labels = eval_labels.astype(np.int32)  # not required

    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir="/tmp/mnist_convnet_model"
    )

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels, batch_size=2, num_epochs=None, shuffle=True
    )
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, batch_size=1, shuffle=False
    )

    if hook is None:
        if include_collections is None:
            include_collections = ["weights", "gradients", "default", "losses"]
        hook = smd.SessionHook(
            out_dir=trial_dir, save_config=save_config, include_collections=include_collections
        )

    if num_eval_steps is None:
        num_eval_steps = num_steps

    def train(num_steps):
        if set_modes:
            hook.set_mode(smd.modes.TRAIN)
        mnist_classifier.train(input_fn=train_input_fn, steps=num_steps, hooks=[hook])

    def evaluate(num_eval_steps):
        if set_modes:
            hook.set_mode(smd.modes.EVAL)
        mnist_classifier.evaluate(input_fn=eval_input_fn, steps=num_eval_steps, hooks=[hook])

    # def train_and_evaluate(num_steps, num_eval_steps):
    #     tf.estimator.train_and_evaluate(
    #         mnist_classifier,
    #         train_spec=tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_steps),  # , hooks=[hook]),
    #         eval_spec=tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=num_eval_steps),  # , hooks=[hook]),
    #     )

    if steps is None:
        steps = ["train", "eval", "train"]
    for s in steps:
        if s == "train":
            # train one step and display the probabilities
            train(num_steps)
        elif s == "eval":
            evaluate(num_eval_steps)
        # elif s == "traineval":
        #     train_and_evaluate(num_steps, num_eval_steps)
    hook.close()

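# Sketch only (the concrete callers are not shown in this excerpt): a typical
# test built on this helper fixes a save interval, runs a few steps, and then
# checks that both TRAIN and EVAL modes recorded data.
def test_mnist_local_sketch(out_dir):
    help_test_mnist(out_dir, save_config=smd.SaveConfig(save_interval=2), num_steps=4)
    tr = create_trial(out_dir)
    assert len(tr.steps(mode=smd.modes.TRAIN)) > 0
    assert len(tr.steps(mode=smd.modes.EVAL)) > 0
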
def test_only_w_g(out_dir):
    pre_test_clean_up()
    hook = smd.SessionHook(out_dir, save_all=False, save_config=smd.SaveConfig(save_interval=2))
    helper_test_only_w_g(out_dir, hook)
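
# `helper_test_only_w_g` is defined elsewhere in the suite. As the test name
# suggests, the intent is that with save_all=False and no explicit
# include_collections, only the hook's default collections (weights and
# gradients among them) end up saved. A hedged sketch of an equivalent check,
# reusing the suite's simple_model helper:
def helper_test_only_w_g_sketch(out_dir, hook):
    simple_model(hook)
    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection="weights")) > 0
    assert len(tr.tensor_names(collection="gradients")) > 0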