def test_end_local_training():
    run_id = str(uuid.uuid4())
    out_dir = "/tmp/newlogsRunTest/" + run_id
    assert has_training_ended(out_dir) == False
    subprocess.check_call([
        sys.executable,
        "examples/mxnet/scripts/mnist_gluon_basic_hook_demo.py",
        "--output-uri",
        out_dir,
        "--num_steps",
        "10",
    ])
    assert has_training_ended(out_dir)
    shutil.rmtree(out_dir)
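The snippets on this page are excerpted from the smdebug test suite and omit their imports. A minimal set they appear to rely on is sketched below; the module paths are a best guess against the smdebug package layout and may differ between versions, so treat them as assumptions rather than a verified import list.

# Assumed imports for the examples on this page (module paths are a best
# guess at the smdebug layout; verify against the version you are using).
import shutil
import subprocess
import sys
import uuid
from datetime import datetime

from smdebug.trials import create_trial
from smdebug.core.access_layer.utils import has_training_ended, training_has_ended
from smdebug.mxnet import Hook as t_hook, SaveConfig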
Example #2
 def maybe_refresh(self, name=None):
     if self.loaded_all_steps or not self.dynamic_refresh:
         return
     retry_count = 1
     training_ended = has_training_ended(self.path)
     if training_ended and self.loaded_all_steps is False:
         retry_count = 2
     while retry_count > 0:
         if name is None:
             self.refresh_data()
         else:
             self.refresh_tensor(name)
         if retry_count > 1:
             self.logger.info(
                 f"Training has ended, will refresh one final time in "
                 f"{self._training_end_delay_refresh} sec.")
             time.sleep(self._training_end_delay_refresh)
         retry_count -= 1
     if training_ended is True and self.loaded_all_steps is False:
         self.loaded_all_steps = True
         self.last_complete_step = (
             sorted(self._global_to_mode.keys())[-1]
             if len(self._global_to_mode) else self.last_complete_step
         )  # Update last_complete_step to the last step written
         self.logger.info("Loaded all steps")
         self.logger.debug(
             f"Training Has Ended : last_complete_step was: {self.last_complete_step}"
         )
         self.logger.debug(
             f"Training Has Ended : last_index_token was: {self.last_index_token}"
         )
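In practice, maybe_refresh is not called directly; the trial's accessor methods invoke it while a job is still running. A hypothetical polling loop is sketched below: create_trial, steps(), loaded_all_steps and last_complete_step all appear in the snippets on this page, but the refresh-on-access behaviour and the exact flow are assumptions, not a verified recipe.

# Hypothetical polling loop over a running job (a sketch; the assumption is
# that accessing steps() refreshes a dynamic trial until the job has ended).
import time
from smdebug.trials import create_trial

trial = create_trial("/tmp/newlogsRunTest/my-run-id")  # illustrative path
while not trial.loaded_all_steps:
    print("steps so far:", trial.steps())  # accessor triggers a refresh
    time.sleep(5)
print("job finished; last complete step:", trial.last_complete_step)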
Example #3
def test_loss_collection_with_no_other_collections():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir,
                  save_config=save_config,
                  include_collections=[])
    assert has_training_ended(out_dir) == False
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          register_to_loss_block=True)

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0

    shutil.rmtree(out_dir)
Example #4
def test_end_s3_training():
    run_id = str(uuid.uuid4())
    bucket = "smdebugcodebuildtest"
    key = "newlogsRunTest/" + run_id
    out_dir = bucket + "/" + key
    assert has_training_ended(out_dir) == False
    subprocess.check_call([
        sys.executable,
        "examples/mxnet/scripts/mnist_gluon_basic_hook_demo.py",
        "--output-uri",
        out_dir,
        "--num_steps",
        "10",
    ])
    assert has_training_ended(out_dir)
    delete_s3_prefix(bucket, key)
Example #5
def test_hook_from_json_config_for_losses(tmpdir, monkeypatch, params):
    out_dir = tmpdir.join("test_hook_from_json_config_for_losses")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_for_losses(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))
    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook, params=params)
    trial = create_trial(str(out_dir))
    eval_metric = params["eval_metric"]
    test_metric = f"test-{eval_metric}"
    train_metric = f"train-{eval_metric}"
    if eval_metric == "rmse":
        assert train_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert train_metric in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert test_metric in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
    if eval_metric == "auc" or eval_metric == "map":
        assert train_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert train_metric not in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
        assert test_metric in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        assert test_metric not in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
Example #6
def test_end_s3_training():
    run_id = str(uuid.uuid4())
    bucket = "smdebug-testing"
    key = f"outputs/{uuid.uuid4()}"
    out_dir = "s3://" + bucket + "/" + key
    assert has_training_ended(out_dir) == False
    subprocess.check_call([
        sys.executable,
        "tests/resources/mxnet/mnist_gluon_basic_hook_demo.py",
        "--output-uri",
        out_dir,
        "--num_steps",
        "10",
    ])
    assert has_training_ended(out_dir)
    delete_s3_prefix(bucket, key)
Example #7
 def event_file_present_loop(self, tensor_location: TensorLocation):
     event_file_name = tensor_location.event_file_name
     event_file_present = self._is_event_file_present(event_file_name)
     num_retry = 0
     while not event_file_present and num_retry < self.event_file_retry_limit:
         if self._has_event_file_been_skipped(event_file_name):
             raise TensorUnavailableForStep(
                 tname=tensor_location.tensorname,
                 mode=tensor_location.mode,
                 step=tensor_location.mode_step,
             )
         elif has_training_ended(self.path) is True:
             self.logger.warn(
                 f"IndexReader: Training Has Ended"
                 f"\nIndexReader: {event_file_name} was written but not found."
             )
             raise TensorUnavailableForStep(
                 tname=tensor_location.tensorname,
                 mode=tensor_location.mode,
                 step=tensor_location.mode_step,
             )
         event_file_present = self._is_event_file_present(event_file_name)
         num_retry += 1
         time.sleep(2)
     if num_retry >= self.event_file_retry_limit:
         self.logger.warn(
             f"IndexReader: {event_file_name} was written but not found. After {num_retry} retries."
         )
         raise TensorUnavailableForStep(
             tname=tensor_location.tensorname,
             mode=tensor_location.mode,
             step=tensor_location.mode_step,
         )
     return
Example #8
 def _wait_for_collection_files(number_of_collection_file_to_wait_for):
     while len(collection_files) < number_of_collection_file_to_wait_for:
         time.sleep(2)
         _fetch()
         if has_training_ended(self.path):
             """ _fetch should have returned all the collection files if the training job has ended """
             if len(collection_files) < number_of_collection_file_to_wait_for:
                 raise MissingCollectionFiles
Example #9
def test_hook_from_json_config_full(tmpdir, monkeypatch):
    out_dir = tmpdir.join("test_hook_from_json_config_full")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_full(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))
    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)
Example #10
def test_s3_training_end():
    s3dir = "s3://smdebugcodebuildtest/training_end_test_dir"
    _, bucket, key = is_s3(s3dir)
    f = TSAccessS3(bucket_name=bucket, key_name=key)
    f.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
Example #11
def test_s3_training_end():
    s3key = str(uuid.uuid4())
    s3dir = f"s3://smdebugcodebuildtest/ok_to_delete_{s3key}"
    _, bucket, key = is_s3(s3dir)
    f = TSAccessS3(bucket_name=bucket, key_name=key)
    f.close()
    training_has_ended(s3dir)
    assert has_training_ended(s3dir) is True
    delete_s3_prefixes(bucket, key)
Example #12
def test_hook():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) == False
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          register_to_loss_block=True)
    shutil.rmtree(out_dir)
Example #13
def test_hook_from_json_config_full():
    out_dir = "/tmp/newlogsRunTest2/test_hook_from_json_config_full"
    shutil.rmtree(out_dir, True)
    os.environ[
        CONFIG_FILE_PATH_ENV_STR] = "tests/mxnet/test_json_configs/test_hook_from_json_config_full.json"
    hook = t_hook.create_from_json_file()
    assert has_training_ended(out_dir) == False
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          register_to_loss_block=True)
    shutil.rmtree(out_dir, True)
Example #14
def test_training_job_has_ended(out_dir):
    tf.reset_default_graph()
    subprocess.check_call(
        [
            sys.executable,
            "examples/tensorflow/local/simple.py",
            "--out_dir",
            out_dir,
            "--steps",
            "10",
            "--save_interval",
            "5",
        ],
        env={
            "CUDA_VISIBLE_DEVICES": "-1",
            "SMDEBUG_LOG_LEVEL": "debug"
        },
    )
    assert has_training_ended(out_dir) == True
Example #15
def test_negative_s3_training_end():
    s3dir = "s3://smdebugcodebuildtest/training_end_test_dir_negative"
    assert has_training_ended(s3dir) is False
Example #16
def test_negative_local_training_end():
    localdir = "/tmp/training_end_test_dir_negative"
    assert has_training_ended(localdir) is False
Example #17
def test_local_training_end():
    localdir = "/tmp/training_end_test_dir"
    ensure_dir(localdir, is_file=False)
    training_has_ended(localdir)
    assert has_training_ended(localdir) is True
    shutil.rmtree(localdir)
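Under the hood, training_has_ended and has_training_ended simply write and check a small end-of-job marker inside the output location. A rough local-filesystem equivalent is sketched below; the marker file name and the *_local helper names are illustrative assumptions, not smdebug's actual implementation.

# Rough local-only equivalent of the two helpers used above (assumption:
# completion is signalled by a sentinel file in the output directory; the
# file name below is illustrative).
import os

_END_OF_JOB_MARKER = "training_job_end.ts"  # assumed marker name

def training_has_ended_local(out_dir: str) -> None:
    # Create the marker file that declares the job finished.
    os.makedirs(out_dir, exist_ok=True)
    open(os.path.join(out_dir, _END_OF_JOB_MARKER), "a").close()

def has_training_ended_local(out_dir: str) -> bool:
    # The job counts as finished once the marker file exists.
    return os.path.exists(os.path.join(out_dir, _END_OF_JOB_MARKER))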
Example #18
def test_spot_hook():
    os.environ[
        CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR] = "./tests/mxnet/test_json_configs/checkpointconfig.json"
    checkpoint_path = "/tmp/savedParams"
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
    save_config = SaveConfig(
        save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])
    """
    Run the training for 2 epochs and save the parameters after every epoch.
    We expect that steps 0 to 14 will be written.
    """

    run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_1 = "/tmp/newlogsRunTest/" + run_id_1
    hook = t_hook(out_dir=out_dir_1,
                  save_config=save_config,
                  include_collections=["weights", "gradients"])
    assert has_training_ended(out_dir_1) == False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=2,
        save_interval=1,
        save_path=checkpoint_path,
    )
    """
    Run the training again for 4 epochs and save the parameters after every epoch.
    We DO NOT expect that steps 0 to 14 are written.
    We expect to read steps 40, 50, 60, 70 and 80.
    """
    run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_2 = "/tmp/newlogsRunTest/" + run_id_2
    hook = t_hook(out_dir=out_dir_2,
                  save_config=save_config,
                  include_collections=["weights", "gradients"])
    assert has_training_ended(out_dir_2) == False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=4,
        save_interval=1,
        save_path=checkpoint_path,
    )
    # Unset the environment variable before validation so that it does not affect other scripts in the pytest environment.
    del os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR]

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(
        out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(
        out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.steps()
    assert 40 in available_steps_2
    assert 50 in available_steps_2
    assert 60 in available_steps_2
    assert 70 in available_steps_2
    assert 80 in available_steps_2
    assert 0 not in available_steps_2
    assert 10 not in available_steps_2
    assert 11 not in available_steps_2
    assert 12 not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path, ignore_errors=True)
Example #19
 def job_finished(self):
     training_ended = has_training_ended(
         self.path + "/system") or has_training_ended(self.path +
                                                      "/framework")
     # rule job should finish if training job has ended or rule job has been signalled
     return training_ended or is_rule_signalled_gracetime_passed(self.path)
Example #20
def test_hook(tmpdir):
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)