# Example #1
def test_resource_monitor_store_to_file(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if storing metrics to a file works correctly.
    """
    folder = test_output_dirs.root_dir
    monitor = ResourceMonitor(interval_seconds=5,
                              tensorboard_folder=folder,
                              csv_results_folder=folder)
    # Seed the monitor with known aggregate and per-step-maximum values for a single GPU.
    monitor.gpu_aggregates = {
        1: GpuUtilization(id=1, mem_util=1, load=2, mem_reserved_gb=30.0, mem_allocated_gb=40.0, count=10),
    }
    monitor.gpu_max = {
        1: GpuUtilization(id=1, mem_util=0.4, load=0.5, mem_reserved_gb=6.0, mem_allocated_gb=7.0, count=10),
    }
    monitor.store_to_file()
    # Write a second time - we expect that to overwrite and only produce one set of metrics
    monitor.store_to_file()
    # Aggregates are averaged over `count` observations; percentages are scaled by 100.
    expected = {
        "GPU1": {
            "MemUtil_Percent": 10.0,
            "Load_Percent": 20.0,
            "MemReserved_GB": 3.0,
            "MemAllocated_GB": 4.0,
            "MaxMemUtil_Percent": 40.0,
            "MaxLoad_Percent": 50.0,
            "MaxMemReserved_GB": 6.0,
            "MaxMemAllocated_GB": 7.0,
        }}
    assert monitor.read_aggregate_metrics() == expected
# Example #2
def test_resource_monitor(test_output_dirs: OutputFolderForTests) -> None:
    """
    Test if metrics are correctly updated in the ResourceMonitor class.
    """
    results_folder = test_output_dirs.root_dir
    r = ResourceMonitor(interval_seconds=5, tensorboard_folder=results_folder, csv_results_folder=results_folder)

    def create_gpu(id: int, load: float, mem_total: float, mem_used: float) -> GPU:
        # Build a minimal GPUtil-style GPU object; only id, load, and memory fields matter here.
        return GPU(ID=id, uuid=None, load=load, memoryTotal=mem_total, memoryUsed=mem_used,
                   memoryFree=None, driver=None, gpu_name=None,
                   serial=None, display_mode=None, display_active=None, temp_gpu=None)

    # Fake objects coming from GPUtil: Two entries for GPU1, 1 entry only for GPU2
    gpu1 = create_gpu(1, 0.1, 10, 2)  # memUti=0.2
    gpu2 = create_gpu(2, 0.2, 10, 3)  # memUti=0.3
    gpu3 = create_gpu(1, 0.3, 10, 5)  # memUti=0.5
    # Mock torch calls so that we can run on CPUs. memory allocated: 2GB, reserved: 1GB
    with mock.patch("torch.cuda.memory_allocated", return_value=2 ** 31):
        with mock.patch("torch.cuda.memory_reserved", return_value=2 ** 30):
            # Update with results for both GPUs
            r.update_metrics([gpu1, gpu2])
            # Next update with data for GPU2 missing
            r.update_metrics([gpu3])
    # Element-wise maximum of metrics
    assert r.gpu_max == {
        1: GpuUtilization(id=1, load=0.3, mem_util=0.5, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=2),
        2: GpuUtilization(id=2, load=0.2, mem_util=0.3, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=1),
    }
    # Aggregates should contain the sum of metrics that were observed.
    assert r.gpu_aggregates == {
        1: GpuUtilization(id=1, load=0.4, mem_util=0.7, mem_allocated_gb=4.0, mem_reserved_gb=2.0, count=2),
        2: GpuUtilization(id=2, load=0.2, mem_util=0.3, mem_allocated_gb=2.0, mem_reserved_gb=1.0, count=1),
    }
    r.writer.flush()
    r.store_to_file()
    tb_file = list(results_folder.rglob("*tfevents*"))[0]
    assert os.path.getsize(str(tb_file)) > 100
    # Bug fix: `is_file` without parentheses asserts the bound method (always truthy);
    # call it so the assertion actually checks that the file exists.
    assert r.aggregate_metrics_file.is_file()
    assert len(r.aggregate_metrics_file.read_text().splitlines()) == 17
    parsed_metrics = r.read_aggregate_metrics()
    # There should be one entry per GPU
    assert len(parsed_metrics) == 2
    # Each GPU has 4 averages, 4 max.
    assert len(parsed_metrics["GPU1"]) == 8
    assert len(parsed_metrics["GPU2"]) == 8
def test_resource_monitor_store_to_file(
        test_output_dirs: TestOutputDirectories) -> None:
    """
    Test if storing metrics to a file works correctly.
    """
    monitor = ResourceMonitor(
        interval_seconds=5,
        tensorboard_folder=Path(test_output_dirs.root_dir))
    # Seed the monitor with known aggregate and per-step-maximum values for one GPU.
    aggregate = GpuUtilization(id=1,
                               mem_util=1,
                               load=2,
                               mem_reserved_gb=30.0,
                               mem_allocated_gb=40.0,
                               count=10)
    maximum = GpuUtilization(id=1,
                             mem_util=0.4,
                             load=0.5,
                             mem_reserved_gb=6.0,
                             mem_allocated_gb=7.0,
                             count=10)
    monitor.gpu_aggregates = {1: aggregate}
    monitor.gpu_max = {1: maximum}
    monitor.store_to_file()
    # Write a second time - we expect that to overwrite and only produce one set of metrics
    monitor.store_to_file()
    # Aggregates are averaged over `count` observations; percentages are scaled by 100.
    expected = [
        ("GPU1/MemUtil_Percent", 10.0),
        ("GPU1/Load_Percent", 20.0),
        ("GPU1/MemReserved_GB", 3.0),
        ("GPU1/MemAllocated_GB", 4.0),
        ("GPU1/MaxMemUtil_Percent", 40.0),
        ("GPU1/MaxLoad_Percent", 50.0),
        ("GPU1/MaxMemReserved_GB", 6.0),
        ("GPU1/MaxMemAllocated_GB", 7.0),
    ]
    assert monitor.read_aggregate_metrics() == expected