Example #1
0
def test_dcgm_port_standalone(handle, gpuIds):
    """
    Check that the DCGM engine works when it runs on a different port.

    Requests the device list through ``handle``. The ``gpuIds`` argument is
    accepted but never read in this body.
    """
    discovered = dcgm_agent.dcgmGetAllDevices(handle)
    # len() is never negative, so the assertion itself cannot fail; in effect
    # the test checks that the query above completes and returns a sized
    # object.
    device_count = len(discovered)
    assert device_count >= 0, \
        "Standalone host engine using different port number failed."
Example #2
0
 def GetAllGpuIds(self):
     """
     Return whatever dcgmGetAllDevices reports for this object's DCGM handle.
     """
     return dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle)
Example #3
0
                                % (x, config_values[x].mComputeMode, expected_compute_mode)

        assert config_values[x].mEccMode  == expected_ecc, "The ecc mode value for gpuID %d is incorrect."\
                                " Returned: %d Expected: %d" \
                                % (x, config_values[x].mEccMode, expected_ecc)
        pass

    ret = dcgm_agent.dcgmStatusDestroy(status_handle)
    assert (ret == dcgm_structs.DCGM_ST_OK
            ), "Failed to remove status handler, error: %s" % ret


# Load the DCGM library and initialize DCGM; the handle returned by dcgmInit()
# is used for every query below.
dcgm_structs._LoadDcgmLibrary()
handle = dcgm_agent.dcgmInit()

devices = dcgm_agent.dcgmGetAllDevices(handle)

# Keep only the devices whose latest DCGM_FI_DEV_RETIRED_DBE value is not the
# "not supported" sentinel.
validDevices = []
for x in devices:
    fvSupported = dcgm_agent_internal.dcgmGetLatestValuesForFields(
        handle, x, [
            dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
        ])
    if fvSupported[0].value.i64 != dcgmvalue.DCGM_INT64_NOT_SUPPORTED:
        validDevices.append(x)

# Nothing to do without at least one qualifying device: report and exit with a
# non-zero status.
# The print calls use the single-argument parenthesized form, which prints the
# same text under Python 2's print statement and is valid syntax on Python 3.
if not validDevices:
    print("Can only run if at least one GPU with ECC is present")
    sys.exit(1)

print("Number of valid devices: %d" % len(validDevices))