def test_parameters_kernel_regex(profile, format): data = profile(""" __global__ void addVectors(int* p) { int a = 5; *p = a; } __global__ void subtractVectors(int* p) { int a = 5; *p = a; } __global__ void generalKernel(int* p) { int a = 5; *p = a; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); addVectors<<<1, 1>>>(dptr); subtractVectors<<<1, 1>>>(dptr); generalKernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; }""", format=format, kernel_regex=".*a.*ors.*", with_metadata=True) mappings = data["mappings"] assert metadata_file("addVectors") in mappings assert metadata_file("subtractVectors") in mappings assert metadata_file("generalKernel") not in mappings assert len(mappings[kernel_file("addVectors", format=format)]["warps"]) == 1 assert len(mappings[kernel_file("subtractVectors", format=format)]["warps"]) == 1 assert kernel_file("generalKernel", format=format) not in mappings
def test_parameters_runtime_tracking_overwrite(profile): code = """ __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc((void**) &dptr, sizeof(int) * 137); // try to find unique size kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """ data = profile(code, with_metadata=True) allocations = data["mappings"][kernel_file("kernel")]["allocations"] alloc = allocations[0] data = profile(code, with_metadata=True, runtime_tracking=True) allocations = data["mappings"][kernel_file("kernel")]["allocations"] validate_allocations(allocations) for record in allocations: if record["size"] == alloc["size"]: assert record["nameString"] == "dptr" return assert False # allocation was not overwritten by static tracking
def test_allocation_global(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc((void**) &dptr, sizeof(int) * 10); printf("%p\\n", dptr); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format, with_metadata=True) allocations = data["mappings"][kernel_file("kernel", format=format)]["allocations"] assert len(allocations) == 1 assert allocations[0]["active"] assert allocations[0]["elementSize"] == 4 assert allocations[0]["size"] == 40 assert allocations[0]["typeString"] == "i32" assert allocations[0]["space"] == 0 assert pointer_matches(allocations[0]["address"], data["stdout"].strip()) assert allocations[0]["nameString"] == "dptr" assert allocations[0]["location"].endswith("input.cu:9")
def test_allocation_shared(profile, format): data = profile(""" #include <cstdio> __global__ void kernel() { __shared__ int arr[10]; printf("%p\\n", arr); } int main() { kernel<<<1, 1>>>(); return 0; } """, format=format, with_metadata=True) allocations = data["mappings"][kernel_file("kernel", format=format)]["allocations"] assert len(allocations) == 1 assert allocations[0]["active"] assert allocations[0]["elementSize"] == 4 assert allocations[0]["size"] == 40 assert "typeIndex" in allocations[0] assert allocations[0]["space"] == 1 assert pointer_matches(allocations[0]["address"], data["stdout"].strip())
def test_addrspace_shared_constant_access(profile, format): data = profile(""" __constant__ int constArr[10]; __global__ void kernel() { __shared__ int arr[10]; arr[threadIdx.x] = constArr[1]; } int main() { kernel<<<1, 1>>>(); return 0; } """, format=format) assert data[kernel_file("kernel", format=format)]["warps"][0]["space"] == 2 assert data[kernel_file("kernel", format=format)]["warps"][1]["space"] == 1
def test_trace_thread_id(profile, format): data = profile(""" __global__ void kernel(int* p) { *p = threadIdx.x; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int) * 64); kernel<<<1, 2>>>(dptr); cudaFree(dptr); return 0; } """, format=format) warp = data[kernel_file("kernel", format=format)]["warps"][0] assert warp["blockIdx"]["x"] == 0 assert warp["blockIdx"]["y"] == 0 assert warp["blockIdx"]["z"] == 0 ids = [ "{}.{}.{}".format(a["threadIdx"]["z"], a["threadIdx"]["y"], a["threadIdx"]["x"]) for a in warp["accesses"] ] assert "0.0.0" in ids assert "0.0.1" in ids
def test_trace_bank_size(profile, format): data = profile(""" __global__ void kernel() { int x = threadIdx.x; } int main() { cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte); kernel<<<1, 1>>>(); cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); kernel<<<1, 1>>>(); return 0; } """, format=format) assert data[kernel_file("kernel", format=format)]["bankSize"] == 4 assert data[kernel_file("kernel", index=1, format=format)]["bankSize"] == 8
def test_trace_multiple_time(profile, format): data = profile(""" __global__ void kernel(int* p) { int x = *p; *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; }""", format=format) info = (data[kernel_file("kernel", 0, format)], data[kernel_file("kernel", 1, format)]) assert info[0]["start"] < info[1]["start"]
def test_trace_time(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format) assert data[kernel_file("kernel", format=format)]["end"] > data[kernel_file( "kernel", format=format)]["start"]
def test_parameters_compression_content(profile, format): code = """ __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """ uncompressed = profile(code, format=format) compressed = profile(code, format=format, compress=True) assert (uncompressed[kernel_file("kernel", format=format)]["kernel"] == compressed[kernel_file("kernel", format=format, compress=True)]["kernel"])
def test_allocation_runtime_tracking_capture(profile): code = """ __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc((void**) &dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """ data = profile(code, with_metadata=True, runtime_tracking=True) allocations = data["mappings"][kernel_file("kernel")]["allocations"] assert len(allocations) > 1 data = profile(code, with_metadata=True) allocations = data["mappings"][kernel_file("kernel")]["allocations"] assert len(allocations) == 1
def test_trace_warp_size(profile, format): data = profile(""" __global__ void kernel() { int x = threadIdx.x; } int main() { kernel<<<1, 1>>>(); return 0; } """, format=format) assert data[kernel_file("kernel", format=format)]["warpSize"] == 32
def test_addrspace_constant_access(profile, format): data = profile(""" __constant__ int arr[10]; __global__ void kernel() { int x = arr[1]; } int main() { kernel<<<1, 1>>>(); return 0; } """, format=format) assert data[kernel_file("kernel", format=format)]["warps"][0]["space"] == 2
def test_trace_dimensions(profile, format): data = profile(""" __global__ void kernel() { int x = threadIdx.x; } int main() { dim3 gridDim(3, 4, 5); dim3 blockDim(6, 7, 8); kernel<<<gridDim, blockDim>>>(); return 0; } """, format=format) grid = data[kernel_file("kernel", format=format)]["gridDim"] assert grid["x"] == 3 assert grid["y"] == 4 assert grid["z"] == 5 block = data[kernel_file("kernel", format=format)]["blockDim"] assert block["x"] == 6 assert block["y"] == 7 assert block["z"] == 8
def test_general_release_mode(profile): data = profile(""" __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, release=True) assert kernel_file("kernel") in data
def test_addrspace_global_access(profile, format): data = profile(""" __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format) assert data[kernel_file("kernel", format=format)]["warps"][0]["space"] == 0
def test_parameters_disable_output(profile, format): code = """ __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """ data = profile(code, disable_output=True) assert kernel_file("kernel", format=format) not in data
def test_trace_warp_id(profile, format): data = profile(""" __global__ void kernel(int* p) { *p = threadIdx.x; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int) * 64); kernel<<<1, 64>>>(dptr); cudaFree(dptr); return 0; } """, format=format) warps = data[kernel_file("kernel", format=format)]["warps"] for warp in warps: assert len(warp["accesses"]) == 32
def test_trace_type_and_name(profile, format): data = profile(""" __global__ void kernel(int* p) { int x = *p; *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; }""", format=format) info = data[kernel_file("kernel", format=format)] assert info["type"] == "trace" assert info["kernel"] == "kernel"
def test_parameters_instrument_locals_enable(profile, format): data = profile(""" __global__ void kernel(int* p) { int a = 5; *p = a; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; }""", format=format, instrument_locals=True) warps = data[kernel_file("kernel", format=format)]["warps"] assert len(warps) == 4
def test_metadata_debug_index(profile, format): data = profile(""" __global__ void kernel(int* p) { int x = *p; *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; }""", format=format) warps = data[kernel_file("kernel", format=format)]["warps"] assert warps[0]["debugId"] == 0 assert warps[1]["debugId"] == 1
def test_trace_multiple_invocations(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format) for i in xrange(2): assert kernel_file("kernel", i, format) in data
def test_access_type(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(int* p) { *p = 5; int a = *p; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format, with_metadata=True) warps = data["mappings"][kernel_file("kernel", format=format)]["warps"] assert warps[0]["kind"] == AccessType.Write assert warps[1]["kind"] == AccessType.Read
def test_metadata_type_index_shared_variable(profile, format): data = profile(""" #include <cstdio> __global__ void kernel() { __shared__ float arr; arr = threadIdx.x; } int main() { kernel<<<1, 1>>>(); return 0; } """, format=format) types = data[metadata_file("kernel")]["typeMap"] assert len(types) > 0 warp = data[kernel_file("kernel", format=format)]["warps"][0] assert warp["typeIndex"] == types.index("float")
def test_metadata_name_index_shared_buffer(profile, format): data = profile(""" #include <cstdio> __global__ void kernel() { __shared__ float arr[10]; arr[threadIdx.x] = threadIdx.x; } int main() { kernel<<<1, 1>>>(); return 0; } """, format=format) names = data[metadata_file("kernel")]["nameMap"] assert len(names) > 0 allocations = data[kernel_file("kernel", format=format)]["allocations"][0] assert allocations["nameIndex"] == names.index("arr")
def test_metadata_type_index(profile, format): data = profile(""" __global__ void kernel(int* p) { int x = *p; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; }""", format=format) types = data[metadata_file("kernel")]["typeMap"] assert len(types) > 0 warp = data[kernel_file("kernel", format=format)]["warps"][0] assert warp["typeIndex"] == types.index("i32")
def test_access_local(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(float* p) { int a = 5; int b = a; *p = 5; } int main() { float* dptr; cudaMalloc(&dptr, sizeof(float)); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format, with_metadata=True) accesses = data["mappings"][kernel_file("kernel", format=format)]["warps"][0]["accesses"] assert len(accesses) == 1
def test_access_value_float(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(float* p) { p[threadIdx.x] = *p; } int main() { float data = 1337; float* dptr; cudaMalloc(&dptr, sizeof(data)); cudaMemcpy(dptr, &data, sizeof(data), cudaMemcpyHostToDevice); kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format, with_metadata=True) warps = data["mappings"][kernel_file("kernel", format=format)]["warps"] assert warps[0]["accesses"][0]["value"] == "0x0000000000000539" assert warps[1]["accesses"][0]["value"] == "0x0000000000000539"
def test_access_address_match(profile, format): data = profile(""" #include <iostream> #include <iomanip> __global__ void kernel(int* p) { *p = 5; } int main() { int* dptr; cudaMalloc(&dptr, sizeof(int)); size_t value = reinterpret_cast<size_t>(dptr); std::cout << std::uppercase << std::hex << std::setfill('0') << "0x" << std::setw(16) << value << std::endl; kernel<<<1, 1>>>(dptr); cudaFree(dptr); return 0; } """, format=format, with_metadata=True) assert data["stdout"].strip() == data["mappings"][kernel_file("kernel", format=format)]["warps"][0]["accesses"][0]["address"]
def test_access_complex0(profile, format): data = profile(""" #include <cstdio> __global__ void kernel(float* a, float* b, float* c) { int tid = threadIdx.x; a[tid] = b[tid] + c[tid]; } int main() { float* dptr; cudaMalloc(&dptr, sizeof(float)); kernel<<<1, 1>>>(dptr, dptr, dptr); cudaFree(dptr); return 0; } """, format=format, with_metadata=True) warps = data["mappings"][kernel_file("kernel", format=format)]["warps"] assert warps[0]["kind"] == AccessType.Read assert warps[1]["kind"] == AccessType.Read assert warps[2]["kind"] == AccessType.Write assert warps[0]["size"] == 4 assert warps[1]["size"] == 4 assert warps[2]["size"] == 4