Example #1
0
def help_test_multiple_trials(num_steps=20, num_tensors=10):
    """Create a uniquely named S3 trial filled with generated tensors.

    A "default" collection listing ``foo_0`` .. ``foo_<num_tensors - 1>`` is
    exported under ``s3://smdebug-testing/outputs/<trial_name>``, and
    ``generate_data`` is then called once for every step in
    ``range(num_steps)``.

    Args:
        num_steps: Number of steps for which ``generate_data`` is invoked.
        num_tensors: Number of ``foo_<i>`` tensor names registered in the
            collection and requested from ``generate_data`` per step.

    Returns:
        A ``(trial_obj, trial_name)`` tuple: the ``S3Trial`` built for the
        generated location and the random (UUID4) trial name.
    """
    trial_name = str(uuid.uuid4())
    bucket = "smdebug-testing"
    path = "s3://" + os.path.join(bucket, "outputs/")

    c = CollectionManager()
    c.add("default")
    c.get("default").tensor_names = [f"foo_{i}" for i in range(num_tensors)]
    # Export exactly once: the same call used to be issued twice back to
    # back, which only repeated the identical write.
    c.export(path + trial_name, DEFAULT_COLLECTIONS_FILE_NAME)

    for step in range(num_steps):
        generate_data(
            path=path,
            trial=trial_name,
            num_tensors=num_tensors,
            step=step,
            tname_prefix="foo",
            worker="algo-1",
            shape=(3, 3, 3),
            rank=0,
        )

    # is_s3 yields a 3-tuple; only its bucket and prefix parts are needed.
    _, bucket, prefix = is_s3(os.path.join(path, trial_name))
    trial_obj = S3Trial(name=prefix, bucket_name=bucket, prefix_name=prefix)
    return trial_obj, trial_name
Example #2
0
def dummy_trial_creator(trial_dir, num_workers, job_ended):
    """Lay out a dummy trial directory with one collections file per worker.

    Args:
        trial_dir: Directory to create (parents included; may already exist).
        num_workers: Number of ``worker_<i>_collections.json`` files to export
            from an empty ``CollectionManager``.
        job_ended: When truthy, also touch ``training_job_end.ts`` inside
            ``trial_dir``.
    """
    Path(trial_dir).mkdir(parents=True, exist_ok=True)

    manager = CollectionManager()
    worker_files = [
        f"worker_{worker_idx}_collections.json"
        for worker_idx in range(num_workers)
    ]
    for file_name in worker_files:
        manager.export(trial_dir, file_name)

    if not job_ended:
        return
    end_marker = Path(os.path.join(trial_dir, "training_job_end.ts"))
    end_marker.touch()
def test_manager_export_load():
    """Check that a CollectionManager survives an export/load round trip."""
    trial_dir = "/tmp/dummy_trial"

    # Build a manager with three collections, two of which include a name.
    exported = CollectionManager()
    exported.create_collection("default")
    exported.get("default").include("loss")
    exported.add(Collection("trial1"))
    exported.add("trial2")
    exported.get("trial2").include("total_loss")
    exported.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)

    # Load the file back from the collections directory of the same trial.
    collections_file = os.path.join(
        get_path_to_collections(trial_dir), DEFAULT_COLLECTIONS_FILE_NAME
    )
    loaded = CollectionManager.load(collections_file)

    assert exported == loaded
Example #4
0
def test_mode_data():
    """Check that tensor names are reported per step and per mode.

    Ten steps are written: even steps store "arr_1" in TRAIN mode and odd
    steps store "arr_2" in EVAL mode, each with ``mode_step = step // 2``.
    The trial is then queried for tensor names globally, by step, and by mode.
    """
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    c = CollectionManager()
    c.add("default")
    # Register both tensors in one assignment. Two consecutive assignments
    # ("arr_1", then "arr_2") left only the second one in effect.
    c.get("default").tensor_names = ["arr_1", "arr_2"]
    c.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    trial = create_trial(trial_dir)
    worker = socket.gethostname()

    for s in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=s, worker=worker)
        # Even steps are TRAIN/"arr_1", odd steps are EVAL/"arr_2".
        is_train = s % 2 == 0
        fw.write_tensor(
            tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
            tname="arr_1" if is_train else "arr_2",
            mode=modes.TRAIN if is_train else modes.EVAL,
            mode_step=s // 2,
        )
        fw.close()

    assert trial.tensor_names() == ["arr_1", "arr_2"]
    assert trial.tensor_names(step=0) == ["arr_1"]
    assert trial.tensor_names(step=1) == ["arr_2"]
    assert trial.tensor_names(step=0, mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(step=0, mode=modes.EVAL) == ["arr_2"]

    assert trial.tensor_names(mode=modes.TRAIN) == ["arr_1"]
    assert trial.tensor_names(mode=modes.EVAL) == ["arr_2"]
def generate_data(
    path,
    trial,
    step,
    tname_prefix,
    num_tensors,
    worker,
    shape,
    dtype=np.float32,
    rank=None,
    mode=None,
    mode_step=None,
    export_colls=True,
    data=None,
):
    """Write ``num_tensors`` tensors for one step and optionally export collections.

    Tensors are named ``<tname_prefix>_<i>`` and written through a
    ``FileWriter`` opened on ``os.path.join(path, trial)``.

    Args:
        path: Base location that contains the trial.
        trial: Trial name, joined onto ``path``.
        step: Step passed to the writer; also the fill value of generated data.
        tname_prefix: Prefix of every tensor name.
        num_tensors: How many tensors to write.
        worker: Worker name passed to the writer.
        shape: Shape of the generated array when ``data`` is None.
        dtype: Dtype of the generated array when ``data`` is None.
        rank: Accepted but not used by this function.
        mode: Forwarded to ``write_tensor``.
        mode_step: Forwarded to ``write_tensor``.
        export_colls: When truthy, export "default" and "gradients"
            collections that both list all generated tensor names.
        data: Array written for every tensor; when None, an array of
            ``shape`` filled with ``step`` is used instead.
    """
    trial_dir = os.path.join(path, trial)
    tensor_names = [f"{tname_prefix}_{idx}" for idx in range(num_tensors)]

    with FileWriter(trial_dir=trial_dir, step=step, worker=worker) as writer:
        for tensor_name in tensor_names:
            # The default payload is built lazily, on the first tensor only,
            # and then reused for the remaining tensors.
            if data is None:
                data = np.ones(shape=shape, dtype=dtype) * step
            writer.write_tensor(
                tdata=data,
                tname=tensor_name,
                mode=mode,
                mode_step=mode_step,
            )

    if not export_colls:
        return

    manager = CollectionManager()
    for collection_name in ("default", "gradients"):
        manager.add(collection_name)
        # Each collection receives its own list object.
        manager.get(collection_name).tensor_names = list(tensor_names)
    manager.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
Example #6
0
def write_dummy_collection_file(trial):
    """Export a minimal collections file for ``trial``.

    The exported manager holds a "default" collection plus one collection
    named after ``trial``; ``trial`` is also used as the export location.
    """
    manager = CollectionManager()
    manager.create_collection("default")
    trial_collection = Collection(trial)
    manager.add(trial_collection)
    manager.export(trial, DEFAULT_COLLECTIONS_FILE_NAME)
def test_mode_data():
    """Check step/mode bookkeeping for a tensor written in alternating modes.

    Ten global steps of tensor "arr" are written: even steps in TRAIN mode
    and odd steps in EVAL mode, each with ``mode_step = step // 2``. Step
    availability is asserted after every write, followed by checks of the
    step counts, mode lookups, global-step mapping and tensor values. The
    trial directory is removed at the end.
    """
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = "/tmp/ts_outputs/" + run_id

    manager = CollectionManager()
    manager.add("default")
    manager.get("default").tensor_names = ["arr"]
    manager.export(trial_dir, DEFAULT_COLLECTIONS_FILE_NAME)
    tr = create_trial(trial_dir)
    worker = socket.gethostname()

    for step in range(0, 10):
        fw = FileWriter(trial_dir=trial_dir, step=step, worker=worker)
        is_train = step % 2 == 0
        mode_step = step // 2
        fw.write_tensor(
            tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
            tname="arr",
            mode=modes.TRAIN if is_train else modes.EVAL,
            mode_step=mode_step,
        )
        fw.close()

        if is_train:
            # The TRAIN step just written is visible; the EVAL step with the
            # same mode_step has not been written yet.
            assert (
                tr.has_passed_step(mode_step, mode=modes.TRAIN)
                == StepState.AVAILABLE
            )
            assert (
                tr.has_passed_step(mode_step, mode=modes.EVAL)
                == StepState.NOT_YET_AVAILABLE
            )
        else:
            assert (
                tr.has_passed_step(mode_step, mode=modes.EVAL)
                == StepState.AVAILABLE
            )

        next_step = step + 1
        assert tr.has_passed_step(step) == StepState.AVAILABLE
        assert tr.has_passed_step(next_step) == StepState.NOT_YET_AVAILABLE
        assert (
            tr.has_passed_step(next_step, mode=modes.TRAIN)
            == StepState.NOT_YET_AVAILABLE
        )

    assert len(tr.tensor_names()) == 1
    assert len(tr.steps()) == 10
    assert len(tr.steps(mode=modes.TRAIN)) == 5
    assert len(tr.steps(mode=modes.EVAL)) == 5
    assert len(tr.modes()) == 2

    for global_step in range(10):
        assert tr.mode(global_step) == (
            modes.TRAIN if global_step % 2 == 0 else modes.EVAL
        )
        assert tr.mode_step(global_step) == global_step // 2

    for mode_step in range(5):
        assert tr.global_step(modes.TRAIN, mode_step) == (mode_step * 2)
        assert tr.global_step(modes.EVAL, mode_step) == (mode_step * 2) + 1

    assert len(tr.tensor("arr").steps()) == 10
    assert len(tr.tensor("arr").steps(mode=modes.TRAIN)) == 5
    assert len(tr.tensor("arr").steps(mode=modes.EVAL)) == 5

    for global_step in range(10):
        assert tr.tensor("arr").value(global_step) is not None
        if global_step >= 5:
            continue
        # Only five steps exist per mode, so mode-relative lookups stop at 4.
        assert tr.tensor("arr").value(global_step, mode=modes.TRAIN) is not None
        assert tr.tensor("arr").value(global_step, mode=modes.EVAL) is not None

    shutil.rmtree("/tmp/ts_outputs/" + run_id)