def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Recompute the reference result on the CPU and compare it with the GPU result.

    The CPU result is recomputed only on the first iteration or when ``reinit``
    is set; otherwise the value cached in ``self.cpu_result`` is reused.
    The elapsed time (in seconds) and the absolute CPU/GPU difference are
    stored in the benchmark under ``cpu_time_sec`` and ``cpu_gpu_res_difference``.

    Args:
        gpu_result: value produced by the GPU computation.
        reinit: if True, force the CPU result to be recomputed.
    """
    start = System.nanoTime()
    # Recompute the CPU result only if necessary;
    if self.current_iter == 0 or reinit:
        # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
        seed(self.random_seed)
        if self.benchmark.random_init:
            x_g = np.zeros(self.size)
            y_g = np.zeros(self.size)
            # x and y are drawn alternately: the order of the RNG calls matters;
            for i in range(self.size):
                x_g[i] = randint(0, 10)
                y_g[i] = randint(0, 10)
        else:
            x_g = 1 / np.linspace(1, self.size, self.size)
            y_g = 1 / np.linspace(1, self.size, self.size)
        x_g += 1
        y_g += 1
        self.cpu_result = x_g[0] + y_g[0]
    # System.nanoTime() returns nanoseconds: convert to seconds, as the
    # "cpu_time_sec" key and the log message below expect;
    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    difference = np.abs(self.cpu_result - gpu_result)
    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", difference)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tcpu result: {self.cpu_result:.4f}, " +
                                    f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Validate the GPU result against a NumPy computation of the same expression.

    The reference value is ``sum((x^2 - y^2) + (a^2 + 2))``; it is refreshed on
    the first iteration or when ``reinit`` is True, and reused otherwise.
    The elapsed wall-clock time and the absolute difference with ``gpu_result``
    are recorded in the benchmark.

    Args:
        gpu_result: value produced by the GPU computation.
        reinit: if True, force the CPU result to be recomputed.
    """
    t_begin = time.time()

    must_recompute = self.current_iter == 0 or reinit
    if must_recompute:
        # Seed the generator with the benchmark seed so that the random
        # values are reproducible;
        seed(self.random_seed)
        n = self.size
        if self.benchmark.random_init:
            first, second, third = (np.zeros(n) for _ in range(3))
            # One draw per vector at each index, always in x, y, a order;
            for k in range(n):
                first[k], second[k], third[k] = random(), 2 * random(), 4 * random()
        else:
            first = 1 / np.linspace(1, n, n)
            second = 2 / np.linspace(1, n, n)
            third = 4 / np.linspace(1, n, n)

        # Square every vector, then combine the two branches;
        left_branch = first ** 2 - second ** 2
        right_branch = third ** 2 + 2
        self.cpu_result = np.sum(left_branch + right_branch)

    cpu_time = time.time() - t_begin
    difference = np.abs(self.cpu_result - gpu_result)

    record = self.benchmark.add_to_benchmark
    record("cpu_time_sec", cpu_time)
    record("cpu_gpu_res_difference", difference)

    if self.benchmark.debug:
        message = (f"\tcpu result: {self.cpu_result:.4f}, "
                   f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
        BenchmarkResult.log_message(message)
def execute(self) -> object:
    """Launch the two square kernels and the reduction, and return ``self.res[0]``.

    The total computation time is always recorded; when ``self.time_phases`` is
    set, the time spent reading the result back is also recorded as a "sync" phase.

    Returns:
        The first element of ``self.res`` after the reduction.
    """
    nanos_per_sec = 1_000_000_000
    self.block_size = self._block_size["block_size_1d"]
    comp_begin = System.nanoTime()

    # A, B. The 2 square computations are independent, and can be done in parallel;
    self.execute_phase(
        "square_1",
        self.square_kernel(self.num_blocks, self.block_size),
        self.x, self.x1, self.size)
    self.execute_phase(
        "square_2",
        self.square_kernel(self.num_blocks, self.block_size),
        self.y, self.y1, self.size)

    # C. Sum of the two squared vectors;
    self.execute_phase(
        "reduce",
        self.reduce_kernel(self.num_blocks, self.block_size),
        self.x1, self.y1, self.res, self.size)

    # Reading the result acts as the final sync step used to measure the real computation time;
    sync_begin = System.nanoTime() if self.time_phases else 0
    gpu_value = self.res[0]
    sync_end = System.nanoTime()

    if self.time_phases:
        sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
        self.benchmark.add_phase(sync_phase)

    self.benchmark.add_computation_time((sync_end - comp_begin) / nanos_per_sec)
    self.benchmark.add_to_benchmark("gpu_result", gpu_value)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tgpu result: {gpu_value:.4f}")

    return gpu_value
def execute(self) -> object:
    """Square ``self.x`` and ``self.y`` ``self.num_iter`` times, then reduce them.

    Each squaring round is recorded as a ``square_<i>`` phase and the final
    reduction as a ``reduce`` phase, timed with ``time.time()``.

    Returns:
        The first element of ``self.res`` after the reduction.
    """
    # The accumulator must be reset at every execution;
    self.res[0] = 0

    # A. B. The 2 square kernels are independent, and can be done in parallel;
    for round_idx in range(self.num_iter):
        t0 = time.time()
        self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.x, self.size)
        self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.y, self.size)
        t1 = time.time()
        square_phase = {"name": f"square_{round_idx}", "time_sec": t1 - t0}
        self.benchmark.add_phase(square_phase)

    # C. Compute the sum of the result;
    t0 = time.time()
    reduce_launch = self.reduce_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
    reduce_launch(self.x, self.y, self.res, self.size)
    t1 = time.time()
    self.benchmark.add_phase({"name": "reduce", "time_sec": t1 - t0})

    gpu_value = self.res[0]
    self.benchmark.add_to_benchmark("gpu_result", gpu_value)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tgpu result: {gpu_value:.4f}")

    return gpu_value
def execute(self) -> object:
    """Launch ``self.K`` ``bs_kernel`` phases and return the first output value.

    One ``bs_<i>`` phase is executed for each pair ``self.x[i]``/``self.y[i]``;
    the first element of every output vector is then read back, which is
    recorded as a "sync" phase when ``self.time_phases`` is set.

    Returns:
        ``self.y[0][0]`` as read after the kernels have been scheduled.
    """
    nanos_per_sec = 1_000_000_000
    self.block_size = self._block_size["block_size_1d"]

    # Call the kernels;
    comp_begin = System.nanoTime()
    sync_begin = System.nanoTime()
    for idx in range(self.K):
        self.execute_phase(
            f"bs_{idx}",
            self.bs_kernel(self.num_blocks, self.block_size),
            self.x[idx], self.y[idx], self.size, R, V, T, K)

    if self.time_phases:
        sync_begin = System.nanoTime()
    # Read the first value of each output vector;
    outputs = [self.y[idx][0] for idx in range(self.K)]
    sync_end = System.nanoTime()

    if self.time_phases:
        sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
        self.benchmark.add_phase(sync_phase)
    self.benchmark.add_computation_time((sync_end - comp_begin) / nanos_per_sec)

    first_output = outputs[0]
    self.benchmark.add_to_benchmark("gpu_result", first_output)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tgpu result: {first_output}")

    return first_output
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Run a conjugate-gradient style iteration on the CPU and compare with the GPU result.

    The CPU solution vector is recomputed only on the first iteration or when
    ``reinit`` is set. The elapsed time (in seconds) and the sum of absolute
    element-wise differences are stored in the benchmark.

    Args:
        gpu_result: indexable GPU solution, with at least ``self.size`` elements.
        reinit: if True, force the CPU result to be recomputed.
    """

    def spmv(ptr, idx, val, vec):
        """Multiply a sparse matrix, given as (ptr, idx, val) arrays, by ``vec``."""
        res = np.zeros(len(ptr) - 1)
        for i in range(len(ptr) - 1):
            curr_sum = 0
            start = int(ptr[i])
            end = int(ptr[i + 1])
            for j in range(start, end):
                curr_sum += val[j] * vec[idx[j]]
            res[i] = curr_sum
        return res

    start = System.nanoTime()
    # Recompute the CPU result only if necessary;
    if self.current_iter == 0 or reinit:
        # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
        seed(self.random_seed)
        # Initialize the support arrays;
        N = self.size
        x = np.ones(N)
        # r = b - A * x
        r = np.array(self.b_cpu) - np.array(spmv(self.ptr_cpu, self.idx_cpu, self.val_cpu, x))
        p = r.copy()
        t1 = r.T.dot(r)

        # Main iteration;
        for _ in range(self.num_iterations):
            y = spmv(self.ptr_cpu, self.idx_cpu, self.val_cpu, p)
            t2 = p.dot(y)
            alpha = t1 / t2
            t1_old = t1
            x += alpha * p
            r -= alpha * y
            t1 = r.T.dot(r)
            beta = t1 / t1_old
            p = r + beta * p

        self.cpu_result = x

    # System.nanoTime() returns nanoseconds: convert to seconds, as the
    # "cpu_time_sec" key and the log message below expect;
    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    # Compare GPU and CPU results, indexing element by element since
    # gpu_result is not necessarily a NumPy array;
    difference = sum(np.abs(self.cpu_result[i] - gpu_result[i]) for i in range(self.size))

    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tcpu result: [" + ", ".join([f"{v:.4f}" for v in self.cpu_result[:10]]) + "...]; " +
                                    f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Run the hub/authority iteration on the CPU and compare it with the GPU result.

    The CPU result (``hub + auth`` after ``self.num_iterations`` normalized
    sparse products) is recomputed only on the first iteration or when
    ``reinit`` is set. The elapsed time (in seconds) and the sum of absolute
    element-wise differences are stored in the benchmark.

    Args:
        gpu_result: indexable GPU result, with at least ``self.size`` elements.
        reinit: if True, force the CPU result to be recomputed.
    """

    def spmv(ptr, idx, val, vec):
        """Multiply a sparse matrix, given as (ptr, idx, val) arrays, by ``vec``."""
        res = np.zeros(len(ptr) - 1)
        for i in range(len(ptr) - 1):
            curr_sum = 0
            start = int(ptr[i])
            end = int(ptr[i + 1])
            for j in range(start, end):
                curr_sum += val[j] * vec[idx[j]]
            res[i] = curr_sum
        return res

    start = System.nanoTime()
    # Recompute the CPU result only if necessary;
    if self.current_iter == 0 or reinit:
        # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
        seed(self.random_seed)
        # Initialize the support arrays;
        N = self.size
        auth1 = np.ones(N)
        hub1 = np.ones(N)

        # Main iteration;
        for _ in range(self.num_iterations):
            # Authority;
            auth2 = spmv(self.ptr2_cpu, self.idx2_cpu, self.val2_cpu, hub1)
            auth2 = auth2 / np.sum(auth2)
            # Hubs;
            hub2 = spmv(self.ptr_cpu, self.idx_cpu, self.val_cpu, auth1)
            hub2 = hub2 / np.sum(hub2)

            auth1 = auth2
            hub1 = hub2
        self.cpu_result = hub1 + auth1

    # System.nanoTime() returns nanoseconds: convert to seconds, as the
    # "cpu_time_sec" key and the log message below expect;
    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    # Compare GPU and CPU results, indexing element by element since
    # gpu_result is not necessarily a NumPy array;
    difference = sum(np.abs(self.cpu_result[i] - gpu_result[i]) for i in range(self.size))

    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
    if self.benchmark.debug:
        BenchmarkResult.log_message(
            f"\tcpu result: [" + ", ".join([f"{v:.4f}" for v in self.cpu_result[:10]]) + "...]; " +
            f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Compute the Naive Bayes + ridge ensemble prediction on the CPU and compare with the GPU.

    The CPU prediction (argmax of the sum of the two softmax-normalized score
    matrices) is recomputed only on the first iteration or when ``reinit`` is
    set. The elapsed time (in seconds) and the sum of absolute element-wise
    differences are stored in the benchmark.

    Args:
        gpu_result: indexable GPU prediction, with at least ``self.size`` elements.
        reinit: if True, force the CPU result to be recomputed.
    """

    def softmax(X):
        """Row-wise softmax of a 2D array."""
        return np.exp(X) / np.sum(np.exp(X), axis=1).reshape(X.shape[0], 1)

    def logsumexp(X):
        """Log of the sum of exponentials over all the elements of ``X``."""
        return np.log(np.sum(np.exp(X)))

    def naive_bayes_predict(X, feature_log_prob, log_class_prior):
        """Naive Bayes scores computed from log-probabilities and class priors."""
        jll = X.dot(feature_log_prob.T) + log_class_prior
        amax = np.amax(jll, axis=1)
        log_norm = logsumexp(jll - np.atleast_2d(amax).T) + amax
        return np.exp(jll - np.atleast_2d(log_norm).T)

    def normalize(X):
        """Column-wise standardization (zero mean, unit standard deviation)."""
        return (X - np.mean(X, axis=0)) / np.std(X, axis=0)

    def ridge_pred(X, coef, intercept):
        """Linear scores ``X . coef^T + intercept``."""
        return np.dot(X, coef.T) + intercept

    start = System.nanoTime()
    # Recompute the CPU result only if necessary;
    if self.current_iter == 0 or reinit:
        # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
        seed(self.random_seed)

        r1_g = naive_bayes_predict(self.x_cpu, self.nb_feat_log_prob_cpu, self.nb_class_log_prior_cpu)
        r2_g = ridge_pred(normalize(self.x_cpu), self.ridge_coeff_cpu, self.ridge_intercept_cpu)
        r_g = np.argmax(softmax(r1_g) + softmax(r2_g), axis=1)
        self.cpu_result = r_g

    # System.nanoTime() returns nanoseconds: convert to seconds, as the
    # "cpu_time_sec" key and the log message below expect;
    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    # Compare GPU and CPU results, indexing element by element since
    # gpu_result is not necessarily a NumPy array;
    difference = sum(np.abs(self.cpu_result[i] - gpu_result[i]) for i in range(self.size))

    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
    if self.benchmark.debug:
        BenchmarkResult.log_message(
            f"\tcpu result: [" + ", ".join([f"{v:.4f}" for v in self.cpu_result[:10]]) + "...]; " +
            f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Evaluate the ``BS`` formula on the CPU and compare its first value with the GPU result.

    The CPU value is recomputed from ``self.x_tmp`` only on the first iteration
    or when ``reinit`` is set. The elapsed time (in seconds) and the absolute
    difference are stored in the benchmark.

    Args:
        gpu_result: value produced by the GPU computation.
        reinit: if True, force the CPU result to be recomputed.
    """

    def CND(X):
        """Cumulative normal distribution. Helper function used by BS(...)."""
        (a1, a2, a3, a4, a5) = (0.31938153, -0.356563782, 1.781477937, -1.821255978, 1.330274429)
        L = np.absolute(X)
        K = np.float64(1.0) / (1.0 + 0.2316419 * L)
        w = 1.0 - 1.0 / math.sqrt(2 * np.pi) * np.exp(-L * L / 2.) * \
            (a1 * K + a2 * (K ** 2) + a3 * (K ** 3) + a4 * (K ** 4) + a5 * (K ** 5))

        # Use the symmetric value (1 - w) where the input is negative;
        mask = X < 0
        w = w * ~mask + (1.0 - w) * mask
        return w

    def BS(X, R, V, T, K):
        """Black Scholes Function."""
        d1_arr = (np.log(X / K) + (R + V * V / 2.) * T) / (V * math.sqrt(T))
        d2_arr = d1_arr - V * math.sqrt(T)
        w_arr = CND(d1_arr)
        w2_arr = CND(d2_arr)
        return X * w_arr - X * math.exp(-R * T) * w2_arr

    start = System.nanoTime()
    # Recompute the CPU result only if necessary;
    if self.current_iter == 0 or reinit:
        res = BS(np.array(self.x_tmp), R, V, T, K)
        self.cpu_result = res[0]

    # System.nanoTime() returns nanoseconds: convert to seconds, as the
    # "cpu_time_sec" key and the log message below expect;
    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    difference = np.abs(self.cpu_result - gpu_result)
    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", difference)
    if self.benchmark.debug:
        BenchmarkResult.log_message(
            f"\tcpu result: {self.cpu_result:.4f}, " +
            f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def execute(self) -> object:
    """Run the square/diff branch and the square/add-two branch, then reduce both.

    Every step is timed with ``time.time()`` and recorded as a benchmark phase
    ("square", "diff", "square_other_branch", "add_two_other_branch", "reduce").

    Returns:
        The first element of ``self.res`` after the reduction.
    """

    def record_phase(phase_name, began_at):
        # Store the wall-clock time elapsed since ``began_at`` under ``phase_name``;
        finished_at = time.time()
        self.benchmark.add_phase({"name": phase_name, "time_sec": finished_at - began_at})

    # The accumulator must be reset at every execution;
    self.res[0] = 0

    # A, B. The 2 square computations are independent, and can be done in parallel;
    t0 = time.time()
    self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.x, self.size)
    self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.y, self.size)
    record_phase("square", t0)

    # C. Difference of the 2 vectors. This must be done after the 2 previous computations;
    t0 = time.time()
    self.diff_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.x, self.y, self.z, self.size)
    record_phase("diff", t0)

    # D. Other branch of the computation;
    t0 = time.time()
    self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.a, self.size)
    record_phase("square_other_branch", t0)

    # E. Continue computing the other branch;
    t0 = time.time()
    self.addtwo_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.a, self.b, self.size)
    record_phase("add_two_other_branch", t0)

    # F. Sum of the result of the two branches;
    t0 = time.time()
    self.reduce_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)(self.z, self.b, self.res, self.size)
    record_phase("reduce", t0)

    gpu_value = self.res[0]
    self.benchmark.add_to_benchmark("gpu_result", gpu_value)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tgpu result: {gpu_value:.4f}")

    return gpu_value
def execute(self) -> object:
    """Run ``sum_kernel`` on ``self.x`` and ``self.y`` and return the sum of their first elements.

    Three phases are recorded: "sum_1", "sum_2" and "read_result".

    Returns:
        ``self.x[0] + self.y[0]`` as read after both kernels have been launched.
    """
    nanos_per_sec = 1_000_000_000

    # A. B. The 2 kernel launches are independent, and can be done in parallel;
    for phase_name, vector in (("sum_1", self.x), ("sum_2", self.y)):
        began = System.nanoTime()
        self.sum_kernel(self.num_blocks, self.block_size)(vector, self.size)
        finished = System.nanoTime()
        self.benchmark.add_phase({"name": phase_name, "time_sec": (finished - began) / nanos_per_sec})

    # Read back the first value of each vector;
    began = System.nanoTime()
    result_1 = self.x[0]
    result_2 = self.y[0]
    finished = System.nanoTime()
    self.benchmark.add_phase({"name": "read_result", "time_sec": (finished - began) / nanos_per_sec})

    self.benchmark.add_to_benchmark("gpu_result", result_1 + result_2)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tgpu result: {result_1} {result_2}")

    return result_1 + result_2
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Run the two-branch convolutional network on the CPU and compare with the GPU result.

    Each branch applies convolution -> pooling -> convolution; the two branch
    outputs are concatenated and multiplied by the dense weights. All tensors
    are handled as flat 1D arrays. The CPU result is recomputed only on the
    first iteration or when ``reinit`` is set. The elapsed time (in seconds)
    and the absolute difference are stored in the benchmark.

    Args:
        gpu_result: value produced by the GPU computation.
        reinit: if True, force the CPU result to be recomputed.
    """

    def relu(x):
        return np.maximum(x, 0)

    def conv3d2(x, kernels, shape, K, k_out, stride=1, operator=relu):
        """Strided convolution of the flat (N, M, L) input with ``k_out`` K x K x L kernels."""
        N, M, L = shape
        out = np.zeros((N // stride) * (M // stride) * k_out)
        radius = K // 2

        for m in range(k_out):
            for i in range(0, int(np.ceil(N / stride)) - radius):
                for j in range(0, int(np.ceil(M / stride)) - radius):
                    res = 0
                    i_f = i * stride + radius
                    j_f = j * stride + radius
                    for k_i in range(-radius, radius + 1):
                        for k_j in range(-radius, radius + 1):
                            for l in range(L):
                                ni = i_f + k_i
                                nj = j_f + k_j
                                res += kernels[l + L * (k_j + radius + K * (k_i + radius + K * m))] * \
                                    x[((ni * M) + nj) * L + l]
                    out[m + k_out * (j + M * i // stride)] = operator(res)
        return out

    def pooling(x, shape, K, stride):
        """Strided K x K average pooling of the flat (N, M, L) input."""
        N, M, L = shape
        # The output is written (and later read by conv3d2) with flat indices,
        # so it must be a flat buffer sized like the one in conv3d2. The
        # previous allocation divided by the name `pooling`, i.e. by this
        # function itself, which raises a TypeError;
        out = np.zeros((N // stride) * (M // stride) * L)
        radius = K // 2
        for i in range(0, int(np.ceil(N / stride)) - radius):
            for j in range(0, int(np.ceil(M / stride)) - radius):
                for l in range(L):
                    res = 0
                    i_f = i * stride + radius
                    j_f = j * stride + radius
                    for k_i in range(-radius, radius + 1):
                        for k_j in range(-radius, radius + 1):
                            ni = i_f + k_i
                            nj = j_f + k_j
                            res += x[((ni * M) + nj) * L + l]
                    out[l + L * (j + M * i // stride)] = res / K**2
        return out

    def gap2(x, shape):
        """Global average pooling over the first two dimensions (currently not used below)."""
        N, M, L = shape
        out = np.zeros(L)
        for n in range(N):
            for m in range(M):
                for i in range(L):
                    out[i] += x[i + L * (m + M * n)] / (N * M)
        return out

    def concat(x, y):
        """Concatenate two vectors; x and y have the same length."""
        out = np.zeros(2 * len(x))
        for i in range(len(x)):
            out[i] = x[i]
            out[i + len(x)] = y[i]
        return out

    start = System.nanoTime()
    # Recompute the CPU result only if necessary;
    if self.current_iter == 0 or reinit:
        # Initialize weights;
        N = self.size
        kernel_1 = np.zeros(len(self.kernel_1))
        kernel_2 = np.zeros(len(self.kernel_2))
        kernel_3 = np.zeros(len(self.kernel_3))
        kernel_4 = np.zeros(len(self.kernel_4))
        dense_weights = np.zeros(len(self.dense_weights))
        # Copy the weights element by element into NumPy arrays;
        for i in range(len(self.kernel_1)):
            kernel_1[i] = self.kernel_1[i]
            kernel_3[i] = self.kernel_3[i]
        for i in range(len(self.kernel_2)):
            kernel_2[i] = self.kernel_2[i]
            kernel_4[i] = self.kernel_4[i]
        for i in range(len(self.dense_weights)):
            dense_weights[i] = self.dense_weights[i]

        pooled_side = N // self.stride // self.pooling

        # First convolution (N,N,1) -> (N/stride,N/stride,kn1)
        x_1 = conv3d2(np.array(self.x_cpu), kernel_1, (N, N, self.channels), self.K, self.kn1, stride=self.stride)
        x_11 = pooling(x_1, (N // self.stride, N // self.stride, self.kn1), self.pooling, self.pooling)
        # Second convolution (N/stride,N/stride,kn1) -> (N/stride^2,N/stride^2,kn2)
        x_2 = conv3d2(x_11, kernel_2, (pooled_side, pooled_side, self.kn1), self.K, self.kn2, stride=self.stride)

        # First convolution (N,N,1) -> (N/stride,N/stride,kn1)
        y_1 = conv3d2(np.array(self.y_cpu), kernel_3, (N, N, self.channels), self.K, self.kn1, stride=self.stride)
        y_11 = pooling(y_1, (N // self.stride, N // self.stride, self.kn1), self.pooling, self.pooling)
        # Second convolution (N/stride,N/stride,kn1) -> (N/stride^2,N/stride^2,kn2)
        y_2 = conv3d2(y_11, kernel_4, (pooled_side, pooled_side, self.kn1), self.K, self.kn2, stride=self.stride)

        # Global average pooling 2D is currently disabled (see gap2);

        # Concatenate;
        out = concat(x_2, y_2)

        # Final dense layer;
        self.cpu_result = out.dot(dense_weights[:len(out)])

    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    # Compare GPU and CPU results;
    difference = np.abs(self.cpu_result - gpu_result)

    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
    if self.benchmark.debug:
        BenchmarkResult.log_message(
            f"\tcpu result: {self.cpu_result:.4f}; " +
            f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def execute(self) -> object:
    """Schedule the two-branch convolutional network and return ``self.res[0]``.

    Each branch runs convolution -> pooling -> convolution; the two outputs are
    then concatenated and multiplied by the dense weights. The total computation
    time is always recorded; the final read of the result is recorded as a
    "sync" phase when ``self.time_phases`` is set.

    Returns:
        The value stored in ``self.gpu_result`` (the first element of ``self.res``).
    """
    nanos_per_sec = 1_000_000_000
    self.num_blocks_per_processor = self.num_blocks
    self.block_size_1d = self._block_size["block_size_1d"]
    self.block_size_2d = self._block_size["block_size_2d"]

    comp_begin = System.nanoTime()
    a = self.num_blocks_per_processor / 2

    # Launch configurations shared by the phases below;
    conv_grid = (a, a)
    conv_block = (self.block_size_2d, self.block_size_2d)
    pool_grid = (a / 2, a / 2, a / 2)
    pool_block = (self.block_size_2d / 2, self.block_size_2d / 2, self.block_size_2d / 2)
    # Third launch argument of the convolution kernels;
    conv1_extra = 4 * (self.K**2) * self.kn1 * self.channels
    conv2_extra = 4 * (self.K**2) * self.kn1 * self.kn2
    # Side of the feature maps after the first convolution and after pooling;
    conv_side = self.size // self.stride
    pooled_side = self.size // self.stride // self.pooling

    # Convolutions;
    self.execute_phase(
        "conv_x1",
        self.conv2d_kernel(conv_grid, conv_block, conv1_extra),
        self.x1, self.x, self.kernel_1,
        self.size, self.size, self.channels, self.K, self.kn1, self.stride)
    self.execute_phase(
        "conv_y1",
        self.conv2d_kernel(conv_grid, conv_block, conv1_extra),
        self.y1, self.y, self.kernel_3,
        self.size, self.size, self.channels, self.K, self.kn1, self.stride)

    # Pooling;
    self.execute_phase(
        "pool_x1",
        self.pooling_kernel(pool_grid, pool_block),
        self.x11, self.x1,
        conv_side, conv_side, self.kn1, self.pooling, self.pooling)
    self.execute_phase(
        "pool_y1",
        self.pooling_kernel(pool_grid, pool_block),
        self.y11, self.y1,
        conv_side, conv_side, self.kn1, self.pooling, self.pooling)

    # Other convolutions;
    self.execute_phase(
        "conv_x2",
        self.conv2d_kernel(conv_grid, conv_block, conv2_extra),
        self.x2, self.x11, self.kernel_2,
        pooled_side, pooled_side, self.kn1, self.K, self.kn2, self.stride)
    self.execute_phase(
        "conv_y2",
        self.conv2d_kernel(conv_grid, conv_block, conv2_extra),
        self.y2, self.y11, self.kernel_4,
        pooled_side, pooled_side, self.kn1, self.K, self.kn2, self.stride)

    # Global average pooling is currently disabled;

    # Dense layer;
    self.execute_phase(
        "concat",
        self.concat_kernel(self.num_blocks_per_processor, self.block_size_1d),
        self.z, self.x2, self.y2, len(self.x2))
    self.execute_phase(
        "dot_product",
        self.dp_kernel(self.num_blocks_per_processor, self.block_size_1d),
        self.z, self.dense_weights, self.res, len(self.z))

    # Reading the result acts as the final sync step used to measure the real computation time;
    sync_begin = System.nanoTime() if self.time_phases else 0
    self.gpu_result = self.res[0]
    sync_end = System.nanoTime()

    if self.time_phases:
        sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
        self.benchmark.add_phase(sync_phase)
    self.benchmark.add_computation_time((sync_end - comp_begin) / nanos_per_sec)
    self.benchmark.add_to_benchmark("gpu_result", self.gpu_result)
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tgpu result: {self.gpu_result:.4f}")

    return self.gpu_result
def execute(self) -> object:
    """Schedule the Naive Bayes and ridge regression kernels and the final ensemble.

    The phases are launched in the order rr_1, nb_1, rr_2, nb_2, nb_3, rr_3,
    nb_4, softmax_1, softmax_2, argmax. The total computation time is always
    recorded; the final read of ``self.r[0]`` is recorded as a "sync" phase
    when ``self.time_phases`` is set.

    Returns:
        ``self.r``, the array written by the argmax phase.
    """
    nanos_per_sec = 1_000_000_000
    self.num_blocks_size = self.num_blocks
    self.num_blocks_feat = self.num_blocks
    self.block_size = self._block_size["block_size_1d"]

    # Launch configurations: (number of blocks, block size);
    size_cfg = (self.num_blocks_size, self.block_size)
    feat_cfg = (self.num_blocks_feat, self.block_size)
    run_phase = self.execute_phase

    # Schedule the categorical Naive Bayes and Ridge Regression kernels;
    comp_begin = System.nanoTime()

    # RR - 1.
    run_phase("rr_1", self.rr_1(*feat_cfg),
              self.x, self.z, self.size, self.num_features)
    # NB - 1.
    run_phase("nb_1", self.nb_1(*size_cfg),
              self.x, self.nb_feat_log_prob, self.r1, self.size, self.num_features, self.num_classes)
    # RR - 2.
    run_phase("rr_2", self.rr_2(*size_cfg),
              self.z, self.ridge_coeff, self.r2, self.size, self.num_features, self.num_classes)
    # NB - 2.
    run_phase("nb_2", self.nb_2(*size_cfg),
              self.r1, self.nb_amax, self.size, self.num_classes)
    # NB - 3.
    run_phase("nb_3", self.nb_3(*size_cfg),
              self.r1, self.nb_amax, self.nb_l, self.size, self.num_classes)
    # RR - 3.
    run_phase("rr_3", self.rr_3(*size_cfg),
              self.r2, self.ridge_intercept, self.size, self.num_classes)
    # NB - 4.
    run_phase("nb_4", self.nb_4(*size_cfg),
              self.r1, self.nb_l, self.size, self.num_classes)

    # Ensemble results;
    # Softmax normalization;
    run_phase("softmax_1", self.softmax(*size_cfg), self.r1, self.size, self.num_classes)
    run_phase("softmax_2", self.softmax(*size_cfg), self.r2, self.size, self.num_classes)
    # Prediction;
    run_phase("argmax", self.argmax(*size_cfg),
              self.r1, self.r2, self.r, self.size, self.num_classes)

    # Reading one element acts as the final sync step used to measure the real computation time;
    sync_begin = System.nanoTime() if self.time_phases else 0
    _ = self.r[0]
    sync_end = System.nanoTime()

    if self.time_phases:
        sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
        self.benchmark.add_phase(sync_phase)
    self.benchmark.add_computation_time((sync_end - comp_begin) / nanos_per_sec)
    self.benchmark.add_to_benchmark("gpu_result", 0)
    if self.benchmark.debug:
        preview = ", ".join([f"{x:.4f}" for x in self.r[:10]])
        BenchmarkResult.log_message(f"\tgpu result: [" + preview + "...]")

    return self.r
def execute(self) -> object:
    """Schedule the image-processing pipeline and return ``self.gpu_result``.

    Phases, in order: three gaussian blurs, two sobel filters, maximum/minimum/extend
    on the large mask, unsharpen, and two mask combinations writing ``self.image3``.
    The total computation time is always recorded; the final read of
    ``self.image3[0][0]`` is recorded as a "sync" phase when ``self.time_phases`` is set.

    Returns:
        ``self.gpu_result`` (this method does not assign it).
    """
    nanos_per_sec = 1_000_000_000
    self.block_size_1d = self._block_size["block_size_1d"]
    self.block_size_2d = self._block_size["block_size_2d"]
    self.num_blocks_per_processor = self.num_blocks
    a = self.num_blocks_per_processor / 2

    # Launch configurations shared by the phases below;
    grid_2d = (a, a)
    block_2d = (self.block_size_2d, self.block_size_2d)
    cfg_1d = (self.num_blocks_per_processor, self.block_size_1d)
    pixel_count = self.size * self.size
    run_phase = self.execute_phase

    comp_begin = System.nanoTime()

    # Reset the output image with both launch configurations;
    self.reset_kernel(*cfg_1d)(self.image3, 0)
    self.reset_kernel(grid_2d, block_2d)(self.image3, 0)

    # Blur - Small;
    run_phase("blur_small",
              self.gaussian_blur_kernel(grid_2d, block_2d, 4 * self.kernel_small_diameter**2),
              self.image, self.blurred_small, self.size, self.size,
              self.kernel_small, self.kernel_small_diameter)
    # Blur - Large;
    run_phase("blur_large",
              self.gaussian_blur_kernel(grid_2d, block_2d, 4 * self.kernel_large_diameter**2),
              self.image, self.blurred_large, self.size, self.size,
              self.kernel_large, self.kernel_large_diameter)
    # Blur - Unsharpen;
    run_phase("blur_unsharpen",
              self.gaussian_blur_kernel(grid_2d, block_2d, 4 * self.kernel_unsharpen_diameter**2),
              self.image, self.blurred_unsharpen, self.size, self.size,
              self.kernel_unsharpen, self.kernel_unsharpen_diameter)

    # Sobel filter (edge detection);
    run_phase("sobel_small", self.sobel_kernel(grid_2d, block_2d),
              self.blurred_small, self.mask_small, self.size, self.size)
    run_phase("sobel_large", self.sobel_kernel(grid_2d, block_2d),
              self.blurred_large, self.mask_large, self.size, self.size)

    # Extend large edge detection mask;
    run_phase("maximum", self.maximum_kernel(*cfg_1d),
              self.mask_large, self.maximum, pixel_count)
    run_phase("minimum", self.minimum_kernel(*cfg_1d),
              self.mask_large, self.minimum, pixel_count)
    run_phase("extend", self.extend_kernel(*cfg_1d),
              self.mask_large, self.minimum, self.maximum, pixel_count)

    # Unsharpen;
    run_phase("unsharpen", self.unsharpen_kernel(*cfg_1d),
              self.image, self.blurred_unsharpen, self.image_unsharpen,
              self.unsharpen_amount, pixel_count)

    # Combine results;
    run_phase("combine", self.combine_mask_kernel(*cfg_1d),
              self.image_unsharpen, self.blurred_large, self.mask_large, self.image2, pixel_count)
    run_phase("combine_2", self.combine_mask_kernel(*cfg_1d),
              self.image2, self.blurred_small, self.mask_small, self.image3, pixel_count)

    # Reading one pixel acts as the final sync step used to measure the real computation time;
    sync_begin = System.nanoTime() if self.time_phases else 0
    _ = self.image3[0][0]
    sync_end = System.nanoTime()

    if self.time_phases:
        self.benchmark.add_phase({"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec})
    self.benchmark.add_computation_time((sync_end - comp_begin) / nanos_per_sec)

    # The GPU image is not copied into self.gpu_result here;
    self.benchmark.add_to_benchmark("gpu_result", 0)
    if self.benchmark.debug:
        preview = ", ".join([f"{x:.4f}" for x in self.gpu_result[0, :10]])
        BenchmarkResult.log_message(f"\tgpu result: [" + preview + "...]")

    return self.gpu_result
BenchmarkResult.DEFAULT_REALLOC ] reinit = args.reinit if args.reinit else [BenchmarkResult.DEFAULT_REINIT] random_init = args.random if args.random else BenchmarkResult.DEFAULT_RANDOM_INIT cpu_validation = args.cpu_validation time_phases = args.time_phases nvprof_profile = args.nvprof # Create a new benchmark result instance; benchmark_res = BenchmarkResult(debug=debug, num_iterations=num_iter, output_path=output_path, cpu_validation=cpu_validation, random_init=random_init) if benchmark_res.debug: BenchmarkResult.log_message(f"using CPU validation: {cpu_validation}") if args.benchmark: if benchmark_res.debug: BenchmarkResult.log_message( f"using only benchmark: {args.benchmark}") benchmarks = {b: benchmarks[b] for b in args.benchmark} if args.policy: if benchmark_res.debug: BenchmarkResult.log_message(f"using only type: {args.policy}") policies = {n: [args.policy] for n in policies.keys()} if args.size: if benchmark_res.debug: BenchmarkResult.log_message(f"using only size: {args.size}")
def execute_grcuda_benchmark(benchmark, size, exec_policy, new_stream_policy, parent_stream_policy,
                             dependency_policy, num_iter, debug, time_phases,
                             num_blocks=DEFAULT_NUM_BLOCKS, prefetch=False):
    """Run one benchmark configuration in a subprocess, once for each entry of ``use_metrics``.

    For every entry the output folder ``LOG_FOLDER/<YYYY_MM_DD>`` is created if
    missing, the command line is built from one of the ``GRAALPYTHON_CMD*``
    templates, and the command is executed through the shell.

    Args:
        benchmark: benchmark name; also the key used to look up the block sizes.
        size: input size passed to the benchmark.
        exec_policy: execution policy passed to the benchmark.
        new_stream_policy: new-stream policy passed to the benchmark.
        parent_stream_policy: parent-stream policy passed to the benchmark.
        dependency_policy: dependency policy passed to the benchmark.
        num_iter: number of iterations passed to the benchmark.
        debug: if True, log messages and pass ``-d`` to the benchmark.
        time_phases: if True, pass ``-p`` to the benchmark.
        num_blocks: number of blocks passed to the benchmark.
        prefetch: if True, pass ``--grcuda.InputPrefetch``.

    Raises:
        subprocess.CalledProcessError: if the benchmark exits with a non-zero code.
    """
    # Use this function's own parameters instead of the caller's loop variables;
    block_size = (block_sizes_1d_dict[benchmark], block_sizes_2d_dict[benchmark])

    for m in use_metrics:
        if debug:
            BenchmarkResult.log_message("")
            BenchmarkResult.log_message("")
            BenchmarkResult.log_message("#" * 30)
            # `i` and `tot_benchmarks` are progress counters defined by the calling script;
            BenchmarkResult.log_message(f"Benchmark {i + 1}/{tot_benchmarks}")
            BenchmarkResult.log_message(
                f"benchmark={benchmark}, size={size},"
                f"block size={block_size}, "
                f"num blocks={num_blocks}, "
                f"exec policy={exec_policy}, "
                f"new stream policy={new_stream_policy}, "
                f"parent stream policy={parent_stream_policy}, "
                f"dependency policy={dependency_policy}, "
                f"prefetch={prefetch}, "
                f"time_phases={time_phases}, "
                f"collect metrics={m}")
            BenchmarkResult.log_message("#" * 30)
            BenchmarkResult.log_message("")
            BenchmarkResult.log_message("")

        log_folder = f"{datetime.now().strftime('%Y_%m_%d')}"
        # Create a folder if it doesn't exist;
        output_folder_path = os.path.join(LOG_FOLDER, log_folder)
        if not os.path.exists(output_folder_path):
            if debug:
                BenchmarkResult.log_message(f"creating result folder: {output_folder_path}")
            # makedirs also creates missing parents, and exist_ok tolerates the
            # folder being created between the check above and this call;
            os.makedirs(output_folder_path, exist_ok=True)
        file_name = f"{benchmark}_{exec_policy}_{'metric' if m else 'nometric'}_{prefetch}{'' if (POST_TURING and m) else '_%p'}.csv"
        output_path = os.path.join(output_folder_path, file_name)

        # Pick the command template; the arguments are the same for all of them;
        if POST_TURING:
            cmd_template = GRAALPYTHON_CMD_METRICS if m else GRAALPYTHON_CMD_TRACE
        else:
            cmd_template = GRAALPYTHON_CMD
        benchmark_cmd = cmd_template.format(
            output_path, METRICS if m else "", GRAALPYTHON_FOLDER, HEAP_SIZE,
            new_stream_policy, "--grcuda.InputPrefetch" if prefetch else "",
            exec_policy, dependency_policy, parent_stream_policy,
            num_iter, size, num_blocks, benchmark, block_size[0], block_size[1],
            "-d" if debug else "", "-p" if time_phases else "")

        start = System.nanoTime()
        # The child inherits this process' stdout by default. subprocess.STDOUT
        # is only meaningful as the `stderr` argument, so it is not passed as `stdout`.
        # NOTE: the command is a formatted string run through the shell;
        # its arguments must come from trusted configuration only;
        result = subprocess.run(
            benchmark_cmd,
            shell=True,
            cwd=f"{GRCUDA_HOME}/projects/resources/python/benchmark")
        result.check_returncode()
        end = System.nanoTime()
        if debug:
            BenchmarkResult.log_message(
                f"Benchmark total execution time: {(end - start) / 1_000_000_000:.2f} seconds"
            )
def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
    """Recompute the image pipeline on the CPU and compare it with the GPU output.

    The CPU result is recomputed only on the first iteration or when `reinit`
    is set; otherwise the previously stored `self.cpu_result` is reused. The
    elapsed time (in seconds) and the sum of absolute element-wise differences
    are stored in the benchmark under "cpu_time_sec" and
    "cpu_gpu_res_difference".

    :param gpu_result: 2D-indexable GPU output, read as `gpu_result[i, j]`
        for `i, j` in `range(self.size)`
    :param reinit: if True, force the CPU result to be recomputed
    """
    sobel_filter_diameter = 3
    sobel_filter_x = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])
    sobel_filter_y = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])

    def sobel_filter(image):
        """Return the gradient magnitude of `image` using the two 3x3 filters above."""
        out = np.zeros(image.shape)
        rows, cols = image.shape
        radius = sobel_filter_diameter // 2

        for i in range(rows):
            for j in range(cols):
                sum_gradient_x = 0
                sum_gradient_y = 0
                for x in range(-radius, radius + 1):
                    for y in range(-radius, radius + 1):
                        nx = x + i
                        ny = y + j
                        # Neighbours outside the image are skipped;
                        if (nx >= 0 and ny >= 0 and nx < rows and ny < cols):
                            gray_value_neigh = image[nx, ny]
                            gradient_x = sobel_filter_x[x + radius][y + radius]
                            gradient_y = sobel_filter_y[x + radius][y + radius]
                            sum_gradient_x += gray_value_neigh * gradient_x
                            sum_gradient_y += gray_value_neigh * gradient_y
                out[i, j] = np.sqrt(sum_gradient_x ** 2 + sum_gradient_y ** 2)
        return out

    def gaussian_blur(image, kernel):
        """Return `image` filtered with the square `kernel`; out-of-image neighbours are skipped."""
        out = np.zeros(image.shape)
        rows, cols = image.shape

        # Blur radius;
        diameter = kernel.shape[0]
        radius = diameter // 2

        # Flatten image and kernel;
        image_1d = image.reshape(-1)
        kernel_1d = kernel.reshape(-1)

        for i in range(rows):
            for j in range(cols):
                sum_tmp = 0
                for x in range(-radius, radius + 1):
                    for y in range(-radius, radius + 1):
                        nx = x + i
                        ny = y + j
                        if (nx >= 0 and ny >= 0 and nx < rows and ny < cols):
                            sum_tmp += kernel_1d[(x + radius) * diameter + (y + radius)] * image_1d[nx * cols + ny]
                out[i, j] = sum_tmp
        return out

    def normalize(image):
        """Rescale `image` linearly so that its minimum maps to 0 and its maximum to 1."""
        return (image - np.min(image)) / (np.max(image) - np.min(image))

    def truncate(image, minimum=0, maximum=1):
        """Return a copy of `image` with values limited to [minimum, maximum]."""
        out = image.copy()
        out[out < minimum] = minimum
        out[out > maximum] = maximum
        return out

    # Recompute the CPU result only if necessary;
    start = System.nanoTime()
    if self.current_iter == 0 or reinit:
        # Part 1: Small blur on medium frequencies;
        blurred_small = gaussian_blur(self.image_cpu, self.kernel_small_cpu)
        edges_small = sobel_filter(blurred_small)

        # Part 2: High blur on low frequencies;
        blurred_large = gaussian_blur(self.image_cpu, self.kernel_large_cpu)
        edges_large = sobel_filter(blurred_large)
        # Extend mask to cover a larger area;
        edges_large = normalize(edges_large) * 5
        edges_large[edges_large > 1] = 1

        # Part 3: Sharpen image;
        unsharpen = gaussian_blur(self.image_cpu, self.kernel_unsharpen_cpu)
        amount = 0.5
        sharpened = truncate(self.image_cpu * (1 + amount) - unsharpen * amount)

        # Part 4: Merge sharpened image and low frequencies;
        image2 = normalize(sharpened * edges_large + blurred_large * (1 - edges_large))

        # Part 5: Merge image and medium frequencies;
        self.cpu_result = image2 * edges_small + blurred_small * (1 - edges_small)

    # System.nanoTime() counts nanoseconds: convert to seconds, since the value
    # is stored as "cpu_time_sec" and logged with a "sec" suffix;
    cpu_time = (System.nanoTime() - start) / 1_000_000_000

    # Compare GPU and CPU results;
    difference = 0
    for i in range(self.size):
        for j in range(self.size):
            difference += np.abs(self.cpu_result[i, j] - gpu_result[i, j])

    self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
    self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
    if self.benchmark.debug:
        BenchmarkResult.log_message(f"\tcpu result: [" + ", ".join([f"{x:.4f}" for x in self.cpu_result[0, :10]])
                                    + "...]; " + f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
def execute(self) -> object:
    """Launch the solver kernels for `self.num_iterations` iterations and return the result.

    The per-step comments give the vector updates performed (they follow the
    conjugate-gradient recurrences). Host-side scalar work and the final
    synchronizing read are recorded as phases when `self.time_phases` is set;
    the overall computation time is always recorded.

    :return: `self.gpu_result`, filled with a copy of `self.x`
    """
    ns_per_sec = 1_000_000_000
    run_phase = self.execute_phase

    def log_phase(name, t0, t1):
        # Store the duration of a host-side section, converted to seconds;
        self.benchmark.add_phase({"name": name, "time_sec": (t1 - t0) / ns_per_sec})

    # Launch parameters for the SpMV kernels and for the vector kernels;
    spmv_cfg = (int(np.ceil(self.size / self.block_size)), self.block_size, 4 * self.block_size)
    vec_cfg = (self.num_blocks_size, self.block_size)

    t_begin = System.nanoTime()
    t_phase = 0

    # Initialization phase;
    # r = b - A * x
    run_phase("spmv_init", self.spmv_full_kernel(*spmv_cfg),
              self.row_cnt_1, self.ptr, self.idx, self.val, self.x, self.r, self.size, -1, self.b)
    # p = r
    run_phase("cpy_init", self.cpy_kernel(*vec_cfg), self.p, self.r, self.size)
    # t1 = r^t * r
    run_phase("norm_init", self.norm_kernel(*vec_cfg), self.r, self.t1, self.size)

    for it in range(self.num_iterations):
        # t2 = p^t * A * p
        run_phase(f"spmv_{it}", self.spmv_kernel(*spmv_cfg),
                  self.row_cnt_2, self.ptr, self.idx, self.val, self.p, self.y, self.size)
        self.t2[0] = 0
        run_phase(f"dp_{it}", self.dp_kernel(*vec_cfg), self.p, self.y, self.t2, self.size)

        # Host-side scalar step: compute alpha and reset the accumulators;
        if self.time_phases:
            t_phase = System.nanoTime()
        alpha = self.t1[0] / self.t2[0]
        prev_r_norm_sq = self.t1[0]
        self.t1[0] = 0
        self.row_cnt_1[0] = 0.0
        self.row_cnt_2[0] = 0.0
        if self.time_phases:
            log_phase(f"alpha_{it}", t_phase, System.nanoTime())

        # Update x: x = x + alpha * p
        run_phase(f"saxpy_x_{it}", self.saxpy_kernel(*vec_cfg),
                  self.x, self.x, self.p, alpha, self.size)
        # r = r - alpha * y
        run_phase(f"saxpy_r_{it}", self.saxpy_kernel(*vec_cfg),
                  self.r, self.r, self.y, -1 * alpha, self.size)
        # t1 = r^t * r
        run_phase(f"norm_{it}", self.norm_kernel(*vec_cfg), self.r, self.t1, self.size)

        # Host-side scalar step: compute beta;
        if self.time_phases:
            t_phase = System.nanoTime()
        beta = self.t1[0] / prev_r_norm_sq
        if self.time_phases:
            log_phase(f"beta_{it}", t_phase, System.nanoTime())

        run_phase(f"saxpy_p_{it}", self.saxpy_kernel(*vec_cfg),
                  self.p, self.r, self.p, beta, self.size)

    # Add a final sync step to measure the real computation time;
    if self.time_phases:
        t_phase = System.nanoTime()
    _ = self.x[0]
    t_end = System.nanoTime()
    if self.time_phases:
        log_phase("sync", t_phase, t_end)
    self.benchmark.add_computation_time((t_end - t_begin) / ns_per_sec)

    # Compute GPU result;
    for k in range(self.size):
        self.gpu_result[k] = self.x[k]

    self.benchmark.add_to_benchmark("gpu_result", 0)
    if self.benchmark.debug:
        preview = ", ".join(f"{v:.4f}" for v in self.gpu_result[:10])
        BenchmarkResult.log_message("\tgpu result: [" + preview + "...]")

    return self.gpu_result
def execute_cuda_benchmark(benchmark, size, block_size, exec_policy, num_iter, debug,
                           prefetch=False, num_blocks=DEFAULT_NUM_BLOCKS, output_date=None):
    """Run one native CUDA benchmark in a subprocess and time it.

    The command line is built from the `CUDA_CMD` template and executed through
    the shell, with the CSV output path placed in a "<output_date>_cuda" folder
    under `BenchmarkResult.DEFAULT_RES_FOLDER`.

    The debug banner reads the module-level names `i` and `tot_benchmarks`.

    :param benchmark: benchmark name forwarded to the command
    :param size: input size forwarded to the command
    :param block_size: mapping with keys "block_size_1d" and "block_size_2d"
    :param exec_policy: execution policy forwarded to the command
    :param num_iter: number of iterations forwarded to the command
    :param debug: if truthy, log progress information
    :param prefetch: if truthy, pass "-r" to the command
    :param num_blocks: number of blocks forwarded to the command
    :param output_date: timestamp string used in folder and file names;
        the current time is used when it is falsy
    :raises subprocess.CalledProcessError: if the command exits with non-zero status
    """
    if debug:
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message(f"Benchmark {i + 1}/{tot_benchmarks}")
        # Report the arguments of this call, not the caller's loop variables;
        BenchmarkResult.log_message(f"benchmark={benchmark}, size={size},"
                                    f" block size={block_size}, "
                                    f" prefetch={prefetch}, "
                                    f" num blocks={num_blocks}, "
                                    f" exec policy={exec_policy}")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")

    if not output_date:
        output_date = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    file_name = f"cuda_{output_date}_{benchmark}_{exec_policy}_{size}_{block_size['block_size_1d']}_{block_size['block_size_2d']}_{prefetch}_{num_iter}_{num_blocks}.csv"
    # Create a folder if it doesn't exist;
    output_folder_path = os.path.join(BenchmarkResult.DEFAULT_RES_FOLDER, output_date + "_cuda")
    if not os.path.exists(output_folder_path):
        if debug:
            BenchmarkResult.log_message(
                f"creating result folder: {output_folder_path}")
        # exist_ok avoids failing if the folder appears between the check and the creation;
        os.makedirs(output_folder_path, exist_ok=True)
    output_path = os.path.join(output_folder_path, file_name)

    benchmark_cmd = CUDA_CMD.format(benchmark, exec_policy, size,
                                    block_size["block_size_1d"],
                                    block_size["block_size_2d"], num_iter,
                                    num_blocks, "-r" if prefetch else "", "-a",
                                    output_path)
    start = System.nanoTime()
    # subprocess.STDOUT is only meaningful as the `stderr` argument, so no
    # `stdout` is passed and the child inherits this process' stdout.
    # NOTE(review): arguments are interpolated into a shell command line;
    # this assumes they come from trusted benchmark configuration.
    subprocess.run(
        benchmark_cmd,
        shell=True,
        check=True,
        cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/cuda/bin")
    end = System.nanoTime()
    if debug:
        BenchmarkResult.log_message(
            f"Benchmark total execution time: {(end - start) / 1_000_000_000:.2f} seconds"
        )
def execute_grcuda_benchmark(benchmark, size, block_sizes, exec_policy, new_stream_policy,
                             parent_stream_policy, dependency_policy, num_iter, debug,
                             time_phases, num_blocks=DEFAULT_NUM_BLOCKS, prefetch=False,
                             output_date=None):
    """Run one GrCUDA benchmark in a subprocess and time it.

    The command line is built from the `GRAALPYTHON_CMD` template and executed
    through the shell, with the JSON output path placed in a
    "<output_date>_grcuda" folder under `BenchmarkResult.DEFAULT_RES_FOLDER`.

    The debug banner reads the module-level names `i` and `tot_benchmarks`.

    :param benchmark: benchmark name forwarded to the command
    :param size: input size forwarded to the command
    :param block_sizes: iterable of mappings with keys "block_size_1d" and
        "block_size_2d"; all values are passed to the command, space-separated
    :param exec_policy: execution policy forwarded to the command
    :param new_stream_policy: new-stream policy forwarded to the command
    :param parent_stream_policy: parent-stream policy forwarded to the command
    :param dependency_policy: dependency policy forwarded to the command
    :param num_iter: number of iterations forwarded to the command
    :param debug: if truthy, log progress and pass "-d" to the command
    :param time_phases: if truthy, pass "-p" to the command
    :param num_blocks: number of blocks forwarded to the command
    :param prefetch: if truthy, pass "--grcuda.InputPrefetch" to the command
    :param output_date: timestamp string used in folder and file names;
        the current time is used when it is falsy
    :raises subprocess.CalledProcessError: if the command exits with non-zero status
    """
    if debug:
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message(f"Benchmark {i + 1}/{tot_benchmarks}")
        # Report the `size` argument of this call, not the caller's loop variable;
        BenchmarkResult.log_message(
            f"benchmark={benchmark}, size={size},"
            f"block sizes={block_sizes}, "
            f"num blocks={num_blocks}, "
            f"exec policy={exec_policy}, "
            f"new stream policy={new_stream_policy}, "
            f"parent stream policy={parent_stream_policy}, "
            f"dependency policy={dependency_policy}, "
            f"prefetch={prefetch}, "
            f"time_phases={time_phases}")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")

    if not output_date:
        output_date = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    file_name = f"{output_date}_{benchmark}_{exec_policy}_{new_stream_policy}_{parent_stream_policy}_" \
                f"{dependency_policy}_{prefetch}_{size}_{num_iter}_{num_blocks}.json"
    # Create a folder if it doesn't exist;
    output_folder_path = os.path.join(BenchmarkResult.DEFAULT_RES_FOLDER, output_date + "_grcuda")
    if not os.path.exists(output_folder_path):
        if debug:
            BenchmarkResult.log_message(
                f"creating result folder: {output_folder_path}")
        # exist_ok avoids failing if the folder appears between the check and the creation;
        os.makedirs(output_folder_path, exist_ok=True)
    output_path = os.path.join(output_folder_path, file_name)

    # Each list of block sizes is passed as one space-separated argument group;
    b1d_size = " ".join(str(bs['block_size_1d']) for bs in block_sizes)
    b2d_size = " ".join(str(bs['block_size_2d']) for bs in block_sizes)

    benchmark_cmd = GRAALPYTHON_CMD.format(
        HEAP_SIZE, new_stream_policy,
        "--grcuda.InputPrefetch" if prefetch else "", exec_policy,
        dependency_policy, parent_stream_policy, num_iter, size, num_blocks,
        benchmark, b1d_size, b2d_size, "-d" if debug else "",
        "-p" if time_phases else "", output_path)
    start = System.nanoTime()
    # subprocess.STDOUT is only meaningful as the `stderr` argument, so no
    # `stdout` is passed and the child inherits this process' stdout.
    # NOTE(review): arguments are interpolated into a shell command line;
    # this assumes they come from trusted benchmark configuration.
    subprocess.run(
        benchmark_cmd,
        shell=True,
        check=True,
        cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/python/benchmark")
    end = System.nanoTime()
    if debug:
        BenchmarkResult.log_message(
            f"Benchmark total execution time: {(end - start) / 1_000_000_000:.2f} seconds"
        )
def execute(self) -> object:
    """Launch the hub/authority kernels for `self.num_iterations` iterations.

    Every iteration runs two SpMV phases, two sum phases and two divide
    phases, then resets the two norm accumulators on the host. The reset and
    the final synchronizing reads are recorded as phases when
    `self.time_phases` is set; the overall computation time is always recorded.

    :return: `self.gpu_result`, whose i-th entry is `self.auth1[i] + self.hub1[i]`
    """
    ns_per_sec = 1_000_000_000

    def launch(phase_name, kernel, *kernel_args):
        # Configure `kernel` with the common launch parameters and run it as a named phase;
        self.execute_phase(
            phase_name, kernel(self.num_blocks_size, self.block_size), *kernel_args)

    def log_phase(phase_name, t0, t1):
        # Store the duration of a host-side section, converted to seconds;
        self.benchmark.add_phase({
            "name": phase_name,
            "time_sec": (t1 - t0) / ns_per_sec
        })

    t_begin = System.nanoTime()
    t_phase = 0

    for it in range(self.num_iterations):
        # Authorities;
        launch(f"spmv_a_{it}", self.spmv_kernel,
               self.ptr2, self.idx2, self.val2, self.hub1, self.auth2, self.size, self.num_nnz)
        # Hubs;
        launch(f"spmv_h_{it}", self.spmv_kernel,
               self.ptr, self.idx, self.val, self.auth1, self.hub2, self.size, self.num_nnz)
        # Normalize authorities;
        launch(f"sum_a_{it}", self.sum_kernel, self.auth2, self.auth_norm, self.size)
        # Normalize hubs;
        launch(f"sum_h_{it}", self.sum_kernel, self.hub2, self.hub_norm, self.size)
        launch(f"divide_a_{it}", self.divide_kernel,
               self.auth2, self.auth1, self.auth_norm, self.size)
        launch(f"divide_h_{it}", self.divide_kernel,
               self.hub2, self.hub1, self.hub_norm, self.size)

        # Reset the norm accumulators on the host;
        if self.time_phases:
            t_phase = System.nanoTime()
        self.auth_norm[0] = 0.0
        self.hub_norm[0] = 0.0
        if self.time_phases:
            log_phase(f"norm_reset_{it}", t_phase, System.nanoTime())

    # Add a final sync step to measure the real computation time;
    if self.time_phases:
        t_phase = System.nanoTime()
    _ = self.auth1[0]
    _ = self.hub1[0]
    t_end = System.nanoTime()
    if self.time_phases:
        log_phase("sync", t_phase, t_end)
    self.benchmark.add_computation_time((t_end - t_begin) / ns_per_sec)

    # Compute GPU result;
    for k in range(self.size):
        self.gpu_result[k] = self.auth1[k] + self.hub1[k]

    self.benchmark.add_to_benchmark("gpu_result", 0)
    if self.benchmark.debug:
        preview = ", ".join(f"{v:.4f}" for v in self.gpu_result[:10])
        BenchmarkResult.log_message("\tgpu result: [" + preview + "...]")

    return self.gpu_result
) # Parse the input arguments; args = parser.parse_args() debug = args.debug if args.debug else BenchmarkResult.DEFAULT_DEBUG num_iter = args.num_iter if args.num_iter else BenchmarkResult.DEFAULT_NUM_ITER use_cuda = args.cuda_test time_phases = args.time_phases num_blocks = args.num_blocks # Setup the block size for each benchmark; block_sizes = create_block_size_list(block_sizes_1d, block_sizes_2d) if debug: BenchmarkResult.log_message( f"using block sizes: {block_sizes}; using low-level CUDA benchmarks: {use_cuda}" ) def tot_benchmark_count(): tot = 0 if use_cuda: for b in benchmarks: tot += len(num_elem[b]) * len(block_sizes) * len( cuda_exec_policies) * len(new_stream_policies) * len( parent_stream_policies) * len( dependency_policies) * len(prefetch) else: for b in benchmarks: tot += len(num_elem[b]) * len(exec_policies) * len(prefetch) return tot