def test_hook_timeline_file_write(set_up_smprofiler_config_path, out_dir):
    """Exercise TimelineFileWriter through an MXNet hook.

    Records ten trace events, closes the hook, and checks that exactly one
    JSON file exists under ``out_dir + "/" + DEFAULT_PREFIX`` and that it
    loads to a truthy object.
    """
    hook = t_hook(out_dir=out_dir)

    for step in range(1, 11):
        event_name = "event{}".format(step)
        hook.record_trace_events(
            training_phase="MXNet_TimelineFileWriteTest",
            op_name=event_name,
            step_num=step,
            timestamp=time.time(),
        )

    # The hook must be closed explicitly in this test so that the JSON file is
    # written and can be read back below. Training scripts do not need this,
    # because _cleanup takes care of closing the trace file.
    hook.close()

    timeline_files = list(Path(out_dir + "/" + DEFAULT_PREFIX).rglob("*.json"))
    assert len(timeline_files) == 1

    with open(timeline_files[0]) as timeline_file:
        events_dict = json.load(timeline_file)

    assert events_dict
def test_loss_collection_with_no_other_collections():
    """Check that a loss tensor is saved when ``include_collections`` is empty.

    Trains the MNIST Gluon model with the hook registered to the loss block,
    then verifies that four steps were saved and that the first tensor whose
    name matches ``.*loss`` has a non-empty value at step 1.
    """
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config, include_collections=[])

    # Training has not run yet, so it must not be reported as ended.
    assert not has_training_ended(out_dir)

    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
    )

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    # One saved step per entry in save_steps.
    assert len(tr.steps()) == 4

    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0

    shutil.rmtree(out_dir)
def test_hook_all_zero(hook=None, out_dir=None):
    """Check that an all-zero model input is recorded as all zeros.

    Args:
        hook: Optional pre-configured hook. When None, a hook saving steps 0-3
            is created here with a "ReluActivation" collection that includes
            ``relu*`` and ``input_*``.
        out_dir: Directory the supplied hook writes to. Overwritten with a
            fresh ``/tmp/newlogsRunTest/<run_id>`` path when the hook is
            created here.
    """
    hook_created = hook is None
    if hook_created:
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + run_id
        print("Registering the hook with out_dir {0}".format(out_dir))
        # Start from a clean directory in case a previous run left data behind.
        shutil.rmtree(out_dir, ignore_errors=True)
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=["ReluActivation", "weights", "biases", "gradients"],
        )
        hook.get_collection("ReluActivation").include(["relu*", "input_*"])

    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        make_input_zero=True,
    )

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    # The first tensor matching the conv input pattern must be all zeros at
    # step 0, because the model was run with make_input_zero=True.
    tname = tr.tensor_names(regex="conv._input")[0]
    conv_tensor_value = tr.tensor(tname).value(step_num=0)
    assert np.all(conv_tensor_value == 0)

    # Only remove the directory when this test created it.
    if hook_created:
        shutil.rmtree(out_dir)
def test_hook_custom_collection():
    """Smoke-test training with a custom "ReluActivation" collection.

    Builds a hook that saves steps 0-3 for the "ReluActivation" collection
    (patterns ``relu*`` and ``input_*``), runs the MNIST Gluon model, and
    removes the output directory. No assertions are made on the saved data.
    """
    step_config = SaveConfig(save_steps=[0, 1, 2, 3])
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + ("trial_" + timestamp)

    hook = t_hook(
        out_dir=out_dir,
        save_config=step_config,
        include_collections=["ReluActivation"],
    )
    relu_collection = hook.get_collection("ReluActivation")
    relu_collection.include(["relu*", "input_*"])

    run_mnist_gluon_model(hook=hook, num_steps_train=10, num_steps_eval=10)

    shutil.rmtree(out_dir)
def test_hook():
    """Smoke-test a default hook on the MNIST Gluon model.

    Creates a hook saving steps 0-3, checks that training is not reported as
    ended before the run, trains with the hook registered to the loss block,
    and removes the output directory.
    """
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)

    # Training has not run yet, so it must not be reported as ended.
    assert not has_training_ended(out_dir)

    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
    )

    shutil.rmtree(out_dir)
def test_save_shapes(out_dir):
    """Check that shapes are saved when ``ReductionConfig(save_shape=True)`` is used.

    Runs five training steps with a save-all hook that saves steps 0 and 1,
    verifies shapes for both steps via ``verify_shapes``, then removes
    ``out_dir``.
    """
    hook = t_hook(
        out_dir=out_dir,
        save_config=SaveConfig(save_steps=[0, 1]),
        save_all=True,
        reduction_config=ReductionConfig(save_shape=True),
    )

    run_mnist_gluon_model(hook=hook, num_steps_train=5)

    for saved_step in (0, 1):
        verify_shapes(out_dir, saved_step)

    shutil.rmtree(out_dir)
def test_hook():
    """Check tensor counts when the model run saves custom tensors.

    Trains with ``save_custom_tensor=True`` and the hook registered to the
    loss block, then expects two tensor names in the DEFAULT collection and
    four tensor names in the trial overall.
    """
    # NOTE(review): another ``test_hook`` is defined earlier in this file; if
    # both really live in the same module this definition shadows it -- confirm.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
    step_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = "/tmp/newlogsRunTest/" + ("trial_" + timestamp)

    hook = t_hook(out_dir=out_dir, save_config=step_config)
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
        save_custom_tensor=True,
    )

    trial = create_trial(out_dir)
    default_names = trial.tensor_names(collection=CollectionKeys.DEFAULT)
    every_name = trial.tensor_names()

    assert len(default_names) == 2
    assert len(every_name) == 4

    shutil.rmtree(out_dir)
def test_save_config(hook=None):
    """Train with a per-collection SaveConfig next to the hook-wide one.

    When ``hook`` is None, a hook saving steps 0-3 is created and its
    "ReluActivation" collection is given its own SaveConfig for steps 4-6.
    A hook supplied by the caller is used as-is.

    Args:
        hook: Optional pre-configured hook. When given, no output directory is
            created or removed by this test.
    """
    # Remember whether this test built the hook. ``hook`` is rebound below, so
    # re-checking ``hook is None`` after training could never be true and the
    # temporary output directory would never be removed.
    hook_created = hook is None
    if hook_created:
        save_config_collection = SaveConfig(save_steps=[4, 5, 6])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + run_id
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=[
                "ReluActivation",
                "weights",
                "biases",
                "gradients",
                "default",
            ],
        )
        custom_collect = hook.get_collection("ReluActivation")
        custom_collect.save_config = save_config_collection
        custom_collect.include(["relu*", "input_*", "output*"])

    run_mnist_gluon_model(hook=hook, num_steps_train=10, num_steps_eval=10)

    # Only remove the directory when this test created it.
    if hook_created:
        shutil.rmtree(out_dir)
def test_save_all(hook=None, out_dir=None):
    """Check that a ``save_all`` hook records the expected number of tensors.

    Args:
        hook: Optional pre-configured hook. When None, a hook with
            ``save_all=True`` saving steps 0-3 is created here.
        out_dir: Directory the supplied hook writes to. Overwritten with a
            fresh ``/tmp/<run_id>`` path when the hook is created here.
    """
    owns_hook = hook is None
    if owns_hook:
        step_config = SaveConfig(save_steps=[0, 1, 2, 3])
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/" + ("trial_" + timestamp)
        print("Registering the hook with out_dir {}".format(out_dir))
        hook = t_hook(out_dir=out_dir, save_config=step_config, save_all=True)

    run_mnist_gluon_model(hook=hook, num_steps_train=7, num_steps_eval=5)

    # Assert on the number of steps and tensor names.
    print("Created the trial with out_dir {}".format(out_dir))
    tr = create_trial(out_dir)
    saved_names = tr.tensor_names()
    assert tr
    assert len(tr.steps()) == 4

    # Some tensor names, such as inputs and outputs, cannot be retrieved from
    # the training session, so only the tensor count is asserted here.
    # The value 46 was taken from the index file; if this assertion holds, the
    # script was able to save all tensors.
    assert len(saved_names) == 46

    if owns_hook:
        shutil.rmtree(out_dir)
def test_modes(hook=None, path=None):
    """Check that steps are saved per mode according to a mode-specific SaveConfig.

    Args:
        hook: Optional pre-configured hook. When None, a hook is created that
            saves every 2nd step in TRAIN and every 3rd step in EVAL for the
            "gradients" and "weights" collections.
        path: Directory the supplied hook writes to. Overwritten with a fresh
            ``/tmp/<run_id>`` path when the hook is created here.
    """
    if hook is None:
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        path = "/tmp/" + run_id
        hook = t_hook(
            out_dir=path,
            save_config=SaveConfig(
                {
                    modes.TRAIN: SaveConfigMode(save_interval=2),
                    modes.EVAL: SaveConfigMode(save_interval=3),
                }
            ),
            include_collections=["gradients", "weights"],
        )

    run_mnist_gluon_model(
        hook=hook,
        set_modes=True,
        register_to_loss_block=True,
        num_steps_train=6,
        num_steps_eval=6,
    )

    tr = create_trial(path)
    assert len(tr.modes()) == 2
    assert len(tr.steps()) == 5, tr.steps()
    assert len(tr.steps(mode=modes.TRAIN)) == 3
    assert len(tr.steps(mode=modes.EVAL)) == 2, tr.steps()

    # Ensure that the gradients are available in TRAIN mode only.
    grad_tns_name = tr.tensor_names(regex="^gradient.")[0]
    grad_tns = tr.tensor(grad_tns_name)
    grad_train_steps = grad_tns.steps(mode=modes.TRAIN)
    grad_eval_steps = grad_tns.steps(mode=modes.EVAL)
    assert len(grad_train_steps) == 3
    assert grad_eval_steps == []

    # Ensure that the weights are available in both TRAIN and EVAL modes.
    # The pattern is a raw string so that ``\d`` reaches the regex engine
    # without being treated as an (invalid) string escape sequence.
    wt_tns_name = tr.tensor_names(regex=r"conv\d+_weight")[0]
    wt_tns = tr.tensor(wt_tns_name)
    wt_train_steps = wt_tns.steps(mode=modes.TRAIN)
    wt_eval_steps = wt_tns.steps(mode=modes.EVAL)
    assert len(wt_train_steps) == 3
    assert len(wt_eval_steps) == 2
def test_spot_hook():
    """Check which steps are saved when training is run twice with checkpoints.

    The checkpoint config environment variable is set for the duration of two
    ``run_mnist`` calls that share one checkpoint directory. The first run
    (2 epochs) must not contain steps 40 or 80; the second run (4 epochs) must
    contain steps 40, 50, 60, 70 and 80 and none of steps 0, 10, 11 and 12.
    """
    checkpoint_path = "/tmp/savedParams"
    save_config = SaveConfig(save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])

    os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR] = (
        "./tests/mxnet/test_json_configs/checkpointconfig.json"
    )
    try:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(checkpoint_path, exist_ok=True)

        # Run the training for 2 epochs and save the parameters after every
        # epoch. We expect that steps 0 to 14 will be written.
        run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir_1 = "/tmp/newlogsRunTest/" + run_id_1
        hook = t_hook(
            out_dir=out_dir_1,
            save_config=save_config,
            include_collections=["weights", "gradients"],
        )
        assert not has_training_ended(out_dir_1)
        run_mnist(
            hook=hook,
            num_steps_train=10,
            num_steps_eval=10,
            epochs=2,
            save_interval=1,
            save_path=checkpoint_path,
        )

        # Run the training again for 4 epochs and save the parameters after
        # every epoch. We do NOT expect steps 0 to 14 to be written; we expect
        # to read steps 40, 50, 60, 70 and 80.
        run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir_2 = "/tmp/newlogsRunTest/" + run_id_2
        hook = t_hook(
            out_dir=out_dir_2,
            save_config=save_config,
            include_collections=["weights", "gradients"],
        )
        assert not has_training_ended(out_dir_2)
        run_mnist(
            hook=hook,
            num_steps_train=10,
            num_steps_eval=10,
            epochs=4,
            save_interval=1,
            save_path=checkpoint_path,
        )
    finally:
        # Unset the environment variable before validation, and also when a
        # training run fails, so that it cannot affect other tests running in
        # the same pytest process.
        os.environ.pop(CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR, None)

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.steps()
    for expected_step in (40, 50, 60, 70, 80):
        assert expected_step in available_steps_2
    for unexpected_step in (0, 10, 11, 12):
        assert unexpected_step not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    # Both run directories share the same parent, so removing the parent of
    # the first one removes the output of both runs.
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path, ignore_errors=True)
def test_save_config(hook=None, out_dir=None):
    """Check global and per-collection reduction configs on the MNIST Gluon model.

    Args:
        hook: Optional pre-configured hook. When None, a save-all hook is
            created with global ``max``/``mean`` reductions on steps 0-3, and
            the "ReluActivation" and "flatten" collections get their own
            reductions saved on steps 4-6.
        out_dir: Directory the supplied hook writes to. Overwritten with a
            fresh ``/tmp/newlogsRunTest/<run_id>`` path when the hook is
            created here.
    """
    # NOTE(review): another ``test_save_config`` is defined earlier in this
    # file; if both really live in the same module this definition shadows
    # it -- confirm.
    owns_hook = hook is None
    if owns_hook:
        default_reductions = ReductionConfig(reductions=["max", "mean"])
        default_steps = SaveConfig(save_steps=[0, 1, 2, 3])
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + ("trial_" + timestamp)
        print("Registering the hook with out_dir {0}".format(out_dir))
        hook = t_hook(
            out_dir=out_dir,
            save_config=default_steps,
            save_all=True,
            include_collections=[
                "weights",
                "biases",
                "gradients",
                "default",
                "ReluActivation",
                "flatten",
            ],
            reduction_config=default_reductions,
        )

        relu_collection = hook.get_collection("ReluActivation")
        relu_collection.include(["relu*"])
        relu_collection.save_config = SaveConfig(save_steps=[4, 5, 6])
        relu_collection.reduction_config = ReductionConfig(
            reductions=["min"], abs_reductions=["max"]
        )

        flatten_collection = hook.get_collection("flatten")
        flatten_collection.include(["flatten*"])
        flatten_collection.save_config = SaveConfig(save_steps=[4, 5, 6])
        flatten_collection.reduction_config = ReductionConfig(
            norms=["l1"], abs_norms=["l2"]
        )

    run_mnist_gluon_model(hook=hook, num_steps_train=10, num_steps_eval=10)

    # Testing
    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 7

    print(tr.tensor_names())

    # Global reduction with max and mean.
    weight_name = tr.tensor_names(regex=r"conv\d+_weight")[0]
    weight_tensor = tr.tensor(weight_name)
    for reduction in ("max", "mean"):
        reduced = weight_tensor.reduction_value(
            step_num=1, abs=False, reduction_name=reduction
        )
        assert reduced is not None

    # Custom reduction at step 4 with reduction = 'min' and abs reduction = 'max'.
    relu_name = tr.tensor_names(regex=r"conv\d+_relu_input_0")[0]
    relu_input = tr.tensor(relu_name)
    for reduction, use_abs in (("min", False), ("max", True)):
        reduced = relu_input.reduction_value(
            step_num=4, abs=use_abs, reduction_name=reduction
        )
        assert reduced is not None

    # Custom reduction with normalization.
    flatten_name = tr.tensor_names(regex=r"flatten\d+_input_0")[0]
    flatten_input = tr.tensor(flatten_name)
    for norm, use_abs in (("l1", False), ("l2", True)):
        reduced = flatten_input.reduction_value(
            step_num=4, abs=use_abs, reduction_name=norm
        )
        assert reduced is not None

    if owns_hook:
        shutil.rmtree(out_dir)