Ejemplo n.º 1
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Recompute the reference result on the CPU and record its distance from the GPU result.

        The CPU value is rebuilt only on the first iteration or when `reinit` is set;
        otherwise the value cached in `self.cpu_result` is reused. The elapsed time
        (in seconds) and the absolute CPU/GPU difference are added to the benchmark.

        :param gpu_result: value produced by the GPU computation
        :param reinit: if True, force the CPU result to be recomputed
        """
        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:
            # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
            seed(self.random_seed)
            if self.benchmark.random_init:
                x_g = np.zeros(self.size)
                y_g = np.zeros(self.size)
                # The two vectors are filled in lockstep, so the order of the random draws matters;
                for i in range(self.size):
                    x_g[i] = randint(0, 10)
                    y_g[i] = randint(0, 10)
            else:
                x_g = 1 / np.linspace(1, self.size, self.size)
                y_g = 1 / np.linspace(1, self.size, self.size)

            x_g += 1
            y_g += 1
            self.cpu_result = x_g[0] + y_g[0]
        # System.nanoTime() counts nanoseconds, while the value is stored and logged as seconds;
        cpu_time = (System.nanoTime() - start) / 1_000_000_000
        difference = np.abs(self.cpu_result - gpu_result)
        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference", difference)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tcpu result: {self.cpu_result:.4f}, " +
                                        f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 2
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Check the GPU output against a reference value computed with NumPy.

        The reference is sum((x^2 - y^2) + (a^2 + 2)) over three vectors that are either
        drawn from the seeded random generator or built from reciprocal ramps. It is
        rebuilt on the first iteration or when `reinit` is set; otherwise the cached
        `self.cpu_result` is reused.

        :param gpu_result: value produced by the GPU computation
        :param reinit: if True, force the CPU result to be recomputed
        """
        t_begin = time.time()
        must_recompute = self.current_iter == 0 or reinit
        if must_recompute:
            # Use the same seed as the GPU side, so that both sides generate the same values;
            seed(self.random_seed)
            if not self.benchmark.random_init:
                x_vals = 1 / np.linspace(1, self.size, self.size)
                y_vals = 2 / np.linspace(1, self.size, self.size)
                a_vals = 4 / np.linspace(1, self.size, self.size)
            else:
                x_vals = np.zeros(self.size)
                y_vals = np.zeros(self.size)
                a_vals = np.zeros(self.size)
                # One draw per vector per index, in x, y, a order;
                for k in range(self.size):
                    x_vals[k], y_vals[k], a_vals[k] = random(), 2 * random(), 4 * random()

            x_sq = x_vals ** 2
            y_sq = y_vals ** 2
            a_sq = a_vals ** 2
            self.cpu_result = np.sum((x_sq - y_sq) + (a_sq + 2))
        cpu_time = time.time() - t_begin
        difference = np.abs(self.cpu_result - gpu_result)

        bench = self.benchmark
        bench.add_to_benchmark("cpu_time_sec", cpu_time)
        bench.add_to_benchmark("cpu_gpu_res_difference", difference)
        if bench.debug:
            summary = (f"\tcpu result: {self.cpu_result:.4f}, " +
                       f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
            BenchmarkResult.log_message(summary)
Ejemplo n.º 3
0
    def execute(self) -> object:
        """
        Run the benchmark: two square kernels followed by a reduce kernel.

        Each kernel is issued through `self.execute_phase`. The total computation time
        is always recorded; the final "sync" phase is recorded only when
        `self.time_phases` is set.

        :return: the first element of `self.res`, read after all kernels have been issued
        """
        self.block_size = self._block_size["block_size_1d"]
        nanos_per_sec = 1_000_000_000
        computation_begin = System.nanoTime()
        sync_begin = 0

        # A, B. Call the kernel. The 2 computations are independent, and can be done in parallel;
        square_x = self.square_kernel(self.num_blocks, self.block_size)
        self.execute_phase("square_1", square_x, self.x, self.x1, self.size)
        square_y = self.square_kernel(self.num_blocks, self.block_size)
        self.execute_phase("square_2", square_y, self.y, self.y1, self.size)

        # C. Compute the sum of the result;
        reducer = self.reduce_kernel(self.num_blocks, self.block_size)
        self.execute_phase("reduce", reducer, self.x1, self.y1, self.res, self.size)

        # Add a final sync step to measure the real computation time;
        if self.time_phases:
            sync_begin = System.nanoTime()
        gpu_value = self.res[0]
        sync_end = System.nanoTime()
        if self.time_phases:
            sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
            self.benchmark.add_phase(sync_phase)
        self.benchmark.add_computation_time((sync_end - computation_begin) / nanos_per_sec)
        self.benchmark.add_to_benchmark("gpu_result", gpu_value)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tgpu result: {gpu_value:.4f}")

        return gpu_value
Ejemplo n.º 4
0
    def execute(self) -> object:
        """
        Run `self.num_iter` rounds of the square kernel on `self.x` and `self.y`, then reduce them.

        Every round and the final reduction are recorded as separate benchmark phases,
        timed with `time.time()`.

        :return: the first element of `self.res` after the reduction has been issued
        """
        # This must be reset at every execution;
        self.res[0] = 0

        # A. B. Call the kernels. The 2 computations are independent, and can be done in parallel;
        for step in range(self.num_iter):
            t_begin = time.time()
            square_x = self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
            square_x(self.x, self.size)
            square_y = self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
            square_y(self.y, self.size)
            t_end = time.time()
            self.benchmark.add_phase({"name": f"square_{step}", "time_sec": t_end - t_begin})

        # C. Compute the sum of the result;
        t_begin = time.time()
        reducer = self.reduce_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        reducer(self.x, self.y, self.res, self.size)
        t_end = time.time()
        self.benchmark.add_phase({"name": "reduce", "time_sec": t_end - t_begin})

        gpu_value = self.res[0]
        self.benchmark.add_to_benchmark("gpu_result", gpu_value)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tgpu result: {gpu_value:.4f}")

        return gpu_value
Ejemplo n.º 5
0
    def execute(self) -> object:
        """
        Issue `self.K` independent `bs_kernel` launches, one per (x[i], y[i]) pair.

        After the launches, the first element of every `self.y[i]` is read back;
        only the value read from `self.y[0]` is reported and returned.

        :return: the first element of `self.y[0]`
        """
        self.block_size = self._block_size["block_size_1d"]
        nanos_per_sec = 1_000_000_000

        # Call the kernels;
        computation_begin = System.nanoTime()
        sync_begin = System.nanoTime()
        for i in range(self.K):
            kernel = self.bs_kernel(self.num_blocks, self.block_size)
            self.execute_phase(f"bs_{i}", kernel, self.x[i], self.y[i], self.size, R, V, T, K)

        if self.time_phases:
            sync_begin = System.nanoTime()
        # Read one value from each output array;
        outputs = [self.y[i][0] for i in range(self.K)]
        sync_end = System.nanoTime()
        if self.time_phases:
            sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
            self.benchmark.add_phase(sync_phase)
        self.benchmark.add_computation_time((sync_end - computation_begin) / nanos_per_sec)

        first_output = outputs[0]
        self.benchmark.add_to_benchmark("gpu_result", first_output)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tgpu result: {first_output}")

        return first_output
Ejemplo n.º 6
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Solve the linear system on the CPU with a conjugate-gradient style iteration and compare it to the GPU.

        The sparse matrix is read from `self.ptr_cpu`, `self.idx_cpu`, `self.val_cpu`, the
        right-hand side from `self.b_cpu`. The solution is rebuilt on the first iteration
        or when `reinit` is set; otherwise the cached `self.cpu_result` is reused. The
        elapsed time (in seconds) and the summed absolute element-wise difference are
        added to the benchmark.

        :param gpu_result: indexable GPU solution, compared element by element
        :param reinit: if True, force the CPU result to be recomputed
        """

        def spmv(ptr, idx, val, vec):
            """Sparse matrix-vector product; row i owns the entries in [ptr[i], ptr[i + 1])."""
            res = np.zeros(len(ptr) - 1)
            for i in range(len(ptr) - 1):
                curr_sum = 0
                start = int(ptr[i])
                end = int(ptr[i + 1])
                for j in range(start, end):
                    curr_sum += val[j] * vec[idx[j]]
                res[i] = curr_sum
            return res

        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:
            # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
            seed(self.random_seed)
            # Initialize the support device arrays;
            N = self.size

            x = np.ones(N)
            # r = b - A * x
            r = np.array(self.b_cpu) - np.array(spmv(self.ptr_cpu, self.idx_cpu, self.val_cpu, x))
            p = r.copy()
            t1 = r.T.dot(r)

            # Main iteration;
            for i in range(self.num_iterations):
                y = spmv(self.ptr_cpu, self.idx_cpu, self.val_cpu, p)
                t2 = p.dot(y)
                alpha = t1 / t2
                t1_old = t1
                x += alpha * p
                r -= alpha * y
                t1 = r.T.dot(r)
                beta = t1 / t1_old
                p = r + beta * p

            self.cpu_result = x

        # System.nanoTime() counts nanoseconds, while the value is stored and logged as seconds;
        cpu_time = (System.nanoTime() - start) / 1_000_000_000

        # Compare GPU and CPU results;
        difference = 0
        for i in range(self.size):
            difference += np.abs(self.cpu_result[i] - gpu_result[i])

        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tcpu result: [" + ", ".join([f"{x:.4f}" for x in self.cpu_result[:10]])
                                        + "...]; " +
                                        f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 7
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Recompute hub and authority scores on the CPU and compare their sum to the GPU result.

        Each iteration multiplies the two sparse matrices (`ptr2/idx2/val2` and
        `ptr/idx/val`) by the previous hub/authority vectors and normalizes each product
        by its sum. The result is rebuilt on the first iteration or when `reinit` is set;
        otherwise the cached `self.cpu_result` is reused. The elapsed time (in seconds)
        and the summed absolute element-wise difference are added to the benchmark.

        :param gpu_result: indexable GPU result, compared element by element
        :param reinit: if True, force the CPU result to be recomputed
        """

        def spmv(ptr, idx, val, vec):
            """Sparse matrix-vector product; row i owns the entries in [ptr[i], ptr[i + 1])."""
            res = np.zeros(len(ptr) - 1)
            for i in range(len(ptr) - 1):
                curr_sum = 0
                start = int(ptr[i])
                end = int(ptr[i + 1])
                for j in range(start, end):
                    curr_sum += val[j] * vec[idx[j]]
                res[i] = curr_sum
            return res

        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:
            # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
            seed(self.random_seed)
            # Initialize the support device arrays;
            N = self.size

            auth1 = np.ones(N)
            hub1 = np.ones(N)

            # Main iteration;
            for i in range(self.num_iterations):
                # Authority;
                auth2 = spmv(self.ptr2_cpu, self.idx2_cpu, self.val2_cpu, hub1)
                auth2 = auth2 / np.sum(auth2)
                # Hubs
                hub2 = spmv(self.ptr_cpu, self.idx_cpu, self.val_cpu, auth1)
                hub2 = hub2 / np.sum(hub2)

                # Both updates above use the values of the previous iteration;
                auth1 = auth2
                hub1 = hub2
            self.cpu_result = hub1 + auth1

        # System.nanoTime() counts nanoseconds, while the value is stored and logged as seconds;
        cpu_time = (System.nanoTime() - start) / 1_000_000_000

        # Compare GPU and CPU results;
        difference = 0
        for i in range(self.size):
            difference += np.abs(self.cpu_result[i] - gpu_result[i])

        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference",
                                        str(difference))
        if self.benchmark.debug:
            BenchmarkResult.log_message(
                f"\tcpu result: [" +
                ", ".join([f"{x:.4f}"
                           for x in self.cpu_result[:10]]) + "...]; " +
                f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 8
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Recompute the ensemble prediction (Naive Bayes + Ridge Regression) on the CPU and compare it to the GPU.

        The two model outputs are passed through a softmax, summed, and reduced with an
        argmax along axis 1. The prediction is rebuilt on the first iteration or when
        `reinit` is set; otherwise the cached `self.cpu_result` is reused. The elapsed
        time (in seconds) and the summed absolute element-wise difference are added to
        the benchmark.

        :param gpu_result: indexable GPU prediction, compared element by element
        :param reinit: if True, force the CPU result to be recomputed
        """

        def softmax(X):
            """Row-wise softmax of a 2D array."""
            return np.exp(X) / np.sum(np.exp(X), axis=1).reshape(X.shape[0], 1)

        def logsumexp(X):
            # NOTE(review): the sum runs over the whole array, not per row - confirm this matches the GPU kernels;
            return np.log(np.sum(np.exp(X)))

        def naive_bayes_predict(X, feature_log_prob, log_class_prior):
            """Joint log-likelihood of X, shifted by the row-wise maximum and exponentiated."""
            jll = X.dot(feature_log_prob.T) + log_class_prior
            amax = np.amax(jll, axis=1)
            l = logsumexp(jll - np.atleast_2d(amax).T) + amax

            return np.exp(jll - np.atleast_2d(l).T)

        def normalize(X):
            """Column-wise standardization: subtract the mean, divide by the standard deviation."""
            return (X - np.mean(X, axis=0)) / np.std(X, axis=0)

        def ridge_pred(X, coef, intercept):
            """Linear prediction X * coef^T + intercept."""
            return np.dot(X, coef.T) + intercept

        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:
            # Re-initialize the random number generator with the same seed as the GPU to generate the same values;
            seed(self.random_seed)

            r1_g = naive_bayes_predict(self.x_cpu, self.nb_feat_log_prob_cpu,
                                       self.nb_class_log_prior_cpu)
            r2_g = ridge_pred(normalize(self.x_cpu), self.ridge_coeff_cpu,
                              self.ridge_intercept_cpu)
            r_g = np.argmax(softmax(r1_g) + softmax(r2_g), axis=1)
            self.cpu_result = r_g

        # System.nanoTime() counts nanoseconds, while the value is stored and logged as seconds;
        cpu_time = (System.nanoTime() - start) / 1_000_000_000

        # Compare GPU and CPU results;
        difference = 0
        for i in range(self.size):
            difference += np.abs(self.cpu_result[i] - gpu_result[i])

        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference",
                                        str(difference))
        if self.benchmark.debug:
            BenchmarkResult.log_message(
                f"\tcpu result: [" +
                ", ".join([f"{x:.4f}"
                           for x in self.cpu_result[:10]]) + "...]; " +
                f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 9
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Recompute the Black-Scholes value on the CPU and record its distance from the GPU result.

        The formula is evaluated over `self.x_tmp` and only the first element is kept.
        The value is rebuilt on the first iteration or when `reinit` is set; otherwise
        the cached `self.cpu_result` is reused. The elapsed time (in seconds) and the
        absolute CPU/GPU difference are added to the benchmark.

        :param gpu_result: value produced by the GPU computation
        :param reinit: if True, force the CPU result to be recomputed
        """

        def CND(X):
            """
            Cumulative normal distribution.
            Helper function used by BS(...).
            """

            (a1, a2, a3, a4, a5) = (0.31938153, -0.356563782, 1.781477937,
                                    -1.821255978, 1.330274429)
            L = np.absolute(X)
            K = np.float64(1.0) / (1.0 + 0.2316419 * L)
            w = 1.0 - 1.0 / math.sqrt(2 * np.pi) * np.exp(-L * L / 2.) * \
                (a1 * K +
                 a2 * (K ** 2) +
                 a3 * (K ** 3) +
                 a4 * (K ** 4) +
                 a5 * (K ** 5))

            # The polynomial is evaluated on |X|; negative inputs take the complement 1 - w;
            mask = X < 0
            w = w * ~mask + (1.0 - w) * mask

            return w

        def BS(X, R, V, T, K):
            """Black Scholes Function."""
            d1_arr = (np.log(X / K) +
                      (R + V * V / 2.) * T) / (V * math.sqrt(T))
            d2_arr = d1_arr - V * math.sqrt(T)
            w_arr = CND(d1_arr)
            w2_arr = CND(d2_arr)
            return X * w_arr - X * math.exp(-R * T) * w2_arr

        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:
            res = BS(np.array(self.x_tmp), R, V, T, K)
            self.cpu_result = res[0]
        # System.nanoTime() counts nanoseconds, while the value is stored and logged as seconds;
        cpu_time = (System.nanoTime() - start) / 1_000_000_000
        difference = np.abs(self.cpu_result - gpu_result)
        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference", difference)
        if self.benchmark.debug:
            BenchmarkResult.log_message(
                f"\tcpu result: {self.cpu_result:.4f}, " +
                f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 10
0
    def execute(self) -> object:
        """
        Run the benchmark as five timed phases: square, diff, square (other branch), add-two, reduce.

        Every phase is timed with `time.time()` and added to the benchmark.

        :return: the first element of `self.res` after the reduction has been issued
        """
        # This must be reset at every execution;
        self.res[0] = 0

        # Call the kernel. The 2 computations are independent, and can be done in parallel;
        t_begin = time.time()
        square_x = self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        square_x(self.x, self.size)
        square_y = self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        square_y(self.y, self.size)
        t_end = time.time()
        self.benchmark.add_phase({"name": "square", "time_sec": t_end - t_begin})

        # C. Compute the difference of the 2 vectors. This must be done after the 2 previous computations;
        t_begin = time.time()
        diff = self.diff_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        diff(self.x, self.y, self.z, self.size)
        t_end = time.time()
        self.benchmark.add_phase({"name": "diff", "time_sec": t_end - t_begin})

        # D. Compute the other branch of the computation;
        t_begin = time.time()
        square_a = self.square_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        square_a(self.a, self.size)
        t_end = time.time()
        self.benchmark.add_phase({"name": "square_other_branch", "time_sec": t_end - t_begin})

        # E. Continue computing the other branch;
        t_begin = time.time()
        add_two = self.addtwo_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        add_two(self.a, self.b, self.size)
        t_end = time.time()
        self.benchmark.add_phase({"name": "add_two_other_branch", "time_sec": t_end - t_begin})

        # F. Compute the sum of the result;
        t_begin = time.time()
        reducer = self.reduce_kernel(self.num_blocks, NUM_THREADS_PER_BLOCK)
        reducer(self.z, self.b, self.res, self.size)
        t_end = time.time()
        self.benchmark.add_phase({"name": "reduce", "time_sec": t_end - t_begin})

        gpu_value = self.res[0]
        self.benchmark.add_to_benchmark("gpu_result", gpu_value)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tgpu result: {gpu_value:.4f}")

        return gpu_value
Ejemplo n.º 11
0
    def execute(self) -> object:
        """
        Run `sum_kernel` on `self.x` and on `self.y`, then read back the first element of each.

        The two launches and the read-back are recorded as three separate phases.

        :return: the sum of the two values read back
        """
        nanos_per_sec = 1_000_000_000

        # A. B. Call the kernels. The 2 computations are independent, and can be done in parallel;
        t_begin = System.nanoTime()
        sum_x = self.sum_kernel(self.num_blocks, self.block_size)
        sum_x(self.x, self.size)
        t_end = System.nanoTime()
        self.benchmark.add_phase({"name": "sum_1", "time_sec": (t_end - t_begin) / nanos_per_sec})

        t_begin = System.nanoTime()
        sum_y = self.sum_kernel(self.num_blocks, self.block_size)
        sum_y(self.y, self.size)
        t_end = System.nanoTime()
        self.benchmark.add_phase({"name": "sum_2", "time_sec": (t_end - t_begin) / nanos_per_sec})

        # Reading the outputs is timed as its own phase;
        t_begin = System.nanoTime()
        result_1, result_2 = self.x[0], self.y[0]
        t_end = System.nanoTime()
        self.benchmark.add_phase({"name": "read_result", "time_sec": (t_end - t_begin) / nanos_per_sec})

        total = result_1 + result_2
        self.benchmark.add_to_benchmark("gpu_result", total)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tgpu result: {result_1} {result_2}")

        return total
Ejemplo n.º 12
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """
        Recompute the two-branch convolutional network on the CPU and compare it to the GPU result.

        Each branch (x and y) runs convolution -> pooling -> convolution on flat arrays;
        the two outputs are concatenated and reduced with a dot product against the dense
        weights. The value is rebuilt on the first iteration or when `reinit` is set;
        otherwise the cached `self.cpu_result` is reused. The elapsed time (in seconds)
        and the absolute CPU/GPU difference are added to the benchmark.

        :param gpu_result: value produced by the GPU computation
        :param reinit: if True, force the CPU result to be recomputed
        """

        def relu(x):
            return np.maximum(x, 0)

        def conv3d2(x, kernels, shape, K, k_out, stride=1, operator=relu):
            """Strided convolution of the flat (N, M, L) input `x` with `k_out` K x K x L kernels."""
            N, M, L = shape
            out = np.zeros((N // stride) * (M // stride) * k_out)
            radius = K // 2

            for m in range(k_out):
                for i in range(0, int(np.ceil(N / stride)) - radius):
                    for j in range(0, int(np.ceil(M / stride)) - radius):
                        res = 0
                        i_f = i * stride + radius
                        j_f = j * stride + radius
                        for k_i in range(-radius, radius + 1):
                            for k_j in range(-radius, radius + 1):
                                for l in range(L):
                                    ni = i_f + k_i
                                    nj = j_f + k_j
                                    res += kernels[
                                        l + L * (k_j + radius + K *
                                                 (k_i + radius + K * m))] * x[(
                                                     (ni * M) + nj) * L + l]
                        out[m + k_out * (j + M * i // stride)] = operator(res)
            return out

        def pooling(x, shape, K, stride):
            """Average pooling with a K x K window over the flat (N, M, L) input `x`."""
            N, M, L = shape
            # The output is written with a flat index below, so it is allocated flat like in conv3d2.
            # The size uses `stride`: the bare name `pooling` is this function, not a pooling factor;
            out = np.zeros((N // stride) * (M // stride) * L)
            radius = K // 2
            for i in range(0, int(np.ceil(N / stride)) - radius):
                for j in range(0, int(np.ceil(M / stride)) - radius):
                    for l in range(L):
                        res = 0
                        i_f = i * stride + radius
                        j_f = j * stride + radius
                        for k_i in range(-radius, radius + 1):
                            for k_j in range(-radius, radius + 1):
                                ni = i_f + k_i
                                nj = j_f + k_j
                                res += x[((ni * M) + nj) * L + l]
                        out[l + L * (j + M * i // stride)] = res / K**2
            return out

        def gap2(x, shape):
            """Average of the flat (N, M, L) input `x` over its first two dimensions."""
            N, M, L = shape
            out = np.zeros(L)
            for n in range(N):
                for m in range(M):
                    for i in range(L):
                        out[i] += x[i + L * (m + M * n)] / (N * M)
            return out

        def concat(x, y):
            # x and y have the same length;
            out = np.zeros(2 * len(x))
            for i in range(len(x)):
                out[i] = x[i]
                out[i + len(x)] = y[i]
            return out

        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:

            # Initialize weights;
            N = self.size
            kernel_1 = np.zeros(len(self.kernel_1))
            kernel_2 = np.zeros(len(self.kernel_2))
            kernel_3 = np.zeros(len(self.kernel_3))
            kernel_4 = np.zeros(len(self.kernel_4))
            dense_weights = np.zeros(len(self.dense_weights))
            # Copy the weights element by element into NumPy arrays;
            for i in range(len(self.kernel_1)):
                kernel_1[i] = self.kernel_1[i]
                kernel_3[i] = self.kernel_3[i]
            for i in range(len(self.kernel_2)):
                kernel_2[i] = self.kernel_2[i]
                kernel_4[i] = self.kernel_4[i]

            for i in range(len(self.dense_weights)):
                dense_weights[i] = self.dense_weights[i]

            # First convolution (N,N,1) -> (N/stride,N/stride,kn1)
            x_1 = conv3d2(np.array(self.x_cpu),
                          kernel_1, (N, N, self.channels),
                          self.K,
                          self.kn1,
                          stride=self.stride)
            x_11 = pooling(x_1, (N // self.stride, N // self.stride, self.kn1),
                           self.pooling, self.pooling)
            # Second convolution (N/stride,N/stride,kn1) -> (N/stride^2,N/stride^2,kn2)
            x_2 = conv3d2(x_11,
                          kernel_2,
                          (N // self.stride // self.pooling,
                           N // self.stride // self.pooling, self.kn1),
                          self.K,
                          self.kn2,
                          stride=self.stride)

            # First convolution (N,N,1) -> (N/stride,N/stride,kn1)
            y_1 = conv3d2(np.array(self.y_cpu),
                          kernel_3, (N, N, self.channels),
                          self.K,
                          self.kn1,
                          stride=self.stride)
            y_11 = pooling(y_1, (N // self.stride, N // self.stride, self.kn1),
                           self.pooling, self.pooling)
            # Second convolution (N/stride,N/stride,kn1) -> (N/stride^2,N/stride^2,kn2)
            y_2 = conv3d2(y_11,
                          kernel_4,
                          (N // self.stride // self.pooling,
                           N // self.stride // self.pooling, self.kn1),
                          self.K,
                          self.kn2,
                          stride=self.stride)

            # Global average pooling 2D;
            # x_3 = gap2(x_2, (N // (self.stride * self.stride), N // (self.stride * self.stride), self.kn2))
            # y_3 = gap2(y_2, (N // (self.stride * self.stride), N // (self.stride * self.stride), self.kn2))

            # Concatenate;
            out = concat(x_2, y_2)

            # Final dense layer;
            self.cpu_result = out.dot(dense_weights[:len(out)])
            # self.cpu_result = x_1[:100]

        cpu_time = (System.nanoTime() - start) / 1_000_000_000

        # Compare GPU and CPU results;
        difference = np.abs(self.cpu_result - gpu_result)

        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference",
                                        str(difference))
        if self.benchmark.debug:
            # BenchmarkResult.log_message(
            #     f"\tcpu result: [" + ", ".join([f"{x:.2f}" for x in self.cpu_result[:100]]) + "...]"+
            #                             f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
            BenchmarkResult.log_message(
                f"\tcpu result: {self.cpu_result:.4f}; " +
                f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 13
0
    def execute(self) -> object:
        """
        Issue the kernels of the two-branch network: convolution, pooling, convolution, concat, dot product.

        The x and y branches use the same kernels with their own buffers and weights
        (kernel_1/kernel_2 for x, kernel_3/kernel_4 for y). Every kernel goes through
        `self.execute_phase`. The total computation time is always recorded; the final
        "sync" phase is recorded only when `self.time_phases` is set.

        :return: the first element of `self.res`, also stored in `self.gpu_result`
        """
        self.num_blocks_per_processor = self.num_blocks
        self.block_size_1d = self._block_size["block_size_1d"]
        self.block_size_2d = self._block_size["block_size_2d"]
        start_comp = System.nanoTime()
        # Placeholder, overwritten below only when phases are timed;
        start = 0

        # NOTE(review): "/" yields floats here and in the pooling launch sizes below -
        # confirm that the kernel launcher accepts non-integer grid/block sizes;
        a = self.num_blocks_per_processor / 2
        # Convolutions;
        # The third launch argument is 4 * (number of kernel weights) - presumably a size in bytes, TODO confirm;
        self.execute_phase(
            "conv_x1",
            self.conv2d_kernel(
                (a, a), (self.block_size_2d, self.block_size_2d),
                4 * (self.K**2) * self.kn1 * self.channels), self.x1, self.x,
            self.kernel_1, self.size, self.size, self.channels, self.K,
            self.kn1, self.stride)
        self.execute_phase(
            "conv_y1",
            self.conv2d_kernel(
                (a, a), (self.block_size_2d, self.block_size_2d),
                4 * (self.K**2) * self.kn1 * self.channels), self.y1, self.y,
            self.kernel_3, self.size, self.size, self.channels, self.K,
            self.kn1, self.stride)
        # Pooling;
        # Launched with 3 dimensions, each half of the size used by the convolutions;
        self.execute_phase(
            "pool_x1",
            self.pooling_kernel(
                (a / 2, a / 2, a / 2),
                (self.block_size_2d / 2, self.block_size_2d / 2,
                 self.block_size_2d / 2)), self.x11, self.x1,
            self.size // self.stride, self.size // self.stride, self.kn1,
            self.pooling, self.pooling)
        self.execute_phase(
            "pool_y1",
            self.pooling_kernel(
                (a / 2, a / 2, a / 2),
                (self.block_size_2d / 2, self.block_size_2d / 2,
                 self.block_size_2d / 2)), self.y11, self.y1,
            self.size // self.stride, self.size // self.stride, self.kn1,
            self.pooling, self.pooling)
        # Other convolutions;
        # The input size passed here is the original size divided by the stride and by the pooling factor;
        self.execute_phase(
            "conv_x2",
            self.conv2d_kernel(
                (a, a), (self.block_size_2d, self.block_size_2d),
                4 * (self.K**2) * self.kn1 * self.kn2), self.x2, self.x11,
            self.kernel_2, self.size // self.stride // self.pooling,
            self.size // self.stride // self.pooling, self.kn1, self.K,
            self.kn2, self.stride)
        self.execute_phase(
            "conv_y2",
            self.conv2d_kernel(
                (a, a), (self.block_size_2d, self.block_size_2d),
                4 * (self.K**2) * self.kn1 * self.kn2), self.y2, self.y11,
            self.kernel_4, self.size // self.stride // self.pooling,
            self.size // self.stride // self.pooling, self.kn1, self.K,
            self.kn2, self.stride)

        # Global average pooling (currently disabled);
        # self.execute_phase("gap_x",
        #                    self.gap_kernel((a, a), (self.block_size_2d, self.block_size_2d), 4 * self.kn2),
        #                    self.x3, self.x2, self.size // self.stride**2, self.size // self.stride**2, self.kn2)
        # self.execute_phase("gap_y",
        #                    self.gap_kernel((a, a), (self.block_size_2d, self.block_size_2d), 4 * self.kn2),
        #                    self.y3, self.y2, self.size // self.stride ** 2, self.size // self.stride ** 2, self.kn2)

        # Dense layer;
        # The two branch outputs are concatenated into self.z, then combined with the dense weights into self.res;
        self.execute_phase(
            "concat",
            self.concat_kernel(self.num_blocks_per_processor,
                               self.block_size_1d), self.z, self.x2, self.y2,
            len(self.x2))
        self.execute_phase(
            "dot_product",
            self.dp_kernel(self.num_blocks_per_processor, self.block_size_1d),
            self.z, self.dense_weights, self.res, len(self.z))

        # Add a final sync step to measure the real computation time;
        if self.time_phases:
            start = System.nanoTime()
        # self.gpu_result = sigmoid(self.res[0])
        self.gpu_result = self.res[0]
        # self.gpu_result = [self.x1[i] for i in range(100)]
        end = System.nanoTime()
        if self.time_phases:
            self.benchmark.add_phase({
                "name": "sync",
                "time_sec": (end - start) / 1_000_000_000
            })
        # The total time spans from before the first launch to after the result has been read;
        self.benchmark.add_computation_time((end - start_comp) / 1_000_000_000)

        self.benchmark.add_to_benchmark("gpu_result", self.gpu_result)
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tgpu result: {self.gpu_result:.4f}")
            # BenchmarkResult.log_message(
            #     f"\tgpu result: [" + ", ".join([f"{x:.2f}" for x in self.gpu_result[:100]]) + "...]")

        return self.gpu_result
Ejemplo n.º 14
0
    def execute(self) -> object:
        """
        Issue the Naive Bayes (nb_*) and Ridge Regression (rr_*) kernels, then the softmax and argmax kernels.

        Every kernel goes through `self.execute_phase`. The total computation time is
        always recorded; the final "sync" phase is recorded only when `self.time_phases`
        is set. The value stored under "gpu_result" is always 0.

        :return: `self.r`, the array passed to the final argmax kernel
        """
        self.num_blocks_size = self.num_blocks  # 64  # DEFAULT_NUM_BLOCKS
        self.num_blocks_feat = self.num_blocks  # 64  # DEFAULT_NUM_BLOCKS
        self.block_size = self._block_size["block_size_1d"]
        nanos_per_sec = 1_000_000_000
        # Schedule the categorical Naive Bayes and Ridge Regression kernels
        computation_begin = System.nanoTime()
        sync_begin = 0

        # RR - 1.
        rr_1 = self.rr_1(self.num_blocks_feat, self.block_size)
        self.execute_phase("rr_1", rr_1, self.x, self.z, self.size, self.num_features)

        # NB - 1.
        nb_1 = self.nb_1(self.num_blocks_size, self.block_size)
        self.execute_phase("nb_1", nb_1, self.x, self.nb_feat_log_prob, self.r1,
                           self.size, self.num_features, self.num_classes)

        # RR - 2.
        rr_2 = self.rr_2(self.num_blocks_size, self.block_size)
        self.execute_phase("rr_2", rr_2, self.z, self.ridge_coeff, self.r2,
                           self.size, self.num_features, self.num_classes)

        # NB - 2.
        nb_2 = self.nb_2(self.num_blocks_size, self.block_size)
        self.execute_phase("nb_2", nb_2, self.r1, self.nb_amax, self.size, self.num_classes)

        # NB - 3.
        nb_3 = self.nb_3(self.num_blocks_size, self.block_size)
        self.execute_phase("nb_3", nb_3, self.r1, self.nb_amax, self.nb_l,
                           self.size, self.num_classes)

        # RR - 3.
        rr_3 = self.rr_3(self.num_blocks_size, self.block_size)
        self.execute_phase("rr_3", rr_3, self.r2, self.ridge_intercept,
                           self.size, self.num_classes)

        # NB - 4.
        nb_4 = self.nb_4(self.num_blocks_size, self.block_size)
        self.execute_phase("nb_4", nb_4, self.r1, self.nb_l, self.size, self.num_classes)

        # Ensemble results;

        # Softmax normalization;
        softmax_r1 = self.softmax(self.num_blocks_size, self.block_size)
        self.execute_phase("softmax_1", softmax_r1, self.r1, self.size, self.num_classes)
        softmax_r2 = self.softmax(self.num_blocks_size, self.block_size)
        self.execute_phase("softmax_2", softmax_r2, self.r2, self.size, self.num_classes)

        # Prediction;
        argmax = self.argmax(self.num_blocks_size, self.block_size)
        self.execute_phase("argmax", argmax, self.r1, self.r2, self.r,
                           self.size, self.num_classes)

        # Add a final sync step to measure the real computation time;
        if self.time_phases:
            sync_begin = System.nanoTime()
        # The value itself is not used, only the read matters;
        _ = self.r[0]
        sync_end = System.nanoTime()
        if self.time_phases:
            sync_phase = {"name": "sync", "time_sec": (sync_end - sync_begin) / nanos_per_sec}
            self.benchmark.add_phase(sync_phase)
        self.benchmark.add_computation_time((sync_end - computation_begin) / nanos_per_sec)
        self.benchmark.add_to_benchmark("gpu_result", 0)
        if self.benchmark.debug:
            preview = ", ".join([f"{x:.4f}" for x in self.r[:10]])
            BenchmarkResult.log_message(f"\tgpu result: [" + preview + "...]")

        return self.r
Ejemplo n.º 15
0
    def execute(self) -> object:
        """Submit every phase of the image pipeline and record its timing.

        The phases go through ``self.execute_phase`` in a fixed order: three
        gaussian blurs, two sobel filters, the maximum/minimum/extend steps on
        the large mask, the unsharpen step and two mask combinations. One
        element of ``self.image3`` is read at the end as the final sync step,
        and the elapsed time since the start is stored as computation time.

        :return: ``self.gpu_result``
        """
        self.block_size_1d = self._block_size["block_size_1d"]
        self.block_size_2d = self._block_size["block_size_2d"]
        self.num_blocks_per_processor = self.num_blocks
        # NOTE(review): true division produces a float here; confirm that the
        # kernel launch configuration accepts it (``//`` may have been meant).
        half_blocks = self.num_blocks_per_processor / 2

        # Launch configurations shared by the kernels below;
        config_1d = (self.num_blocks_per_processor, self.block_size_1d)
        config_2d = ((half_blocks, half_blocks),
                     (self.block_size_2d, self.block_size_2d))
        run = self.execute_phase

        start_comp = System.nanoTime()
        start = 0

        # Run the reset kernel on image3, once with each configuration;
        self.reset_kernel(*config_1d)(self.image3, 0)
        self.reset_kernel(*config_2d)(self.image3, 0)

        # Blur - Small, Large, Unsharpen (all of them read self.image);
        blur_phases = (
            ("blur_small", self.blurred_small,
             self.kernel_small, self.kernel_small_diameter),
            ("blur_large", self.blurred_large,
             self.kernel_large, self.kernel_large_diameter),
            ("blur_unsharpen", self.blurred_unsharpen,
             self.kernel_unsharpen, self.kernel_unsharpen_diameter),
        )
        for phase_name, blurred, kernel, diameter in blur_phases:
            run(phase_name,
                self.gaussian_blur_kernel(*config_2d, 4 * diameter**2),
                self.image, blurred, self.size, self.size, kernel, diameter)

        # Sobel filter (edge detection) on the small and large blurs;
        sobel_phases = (
            ("sobel_small", self.blurred_small, self.mask_small),
            ("sobel_large", self.blurred_large, self.mask_large),
        )
        for phase_name, blurred, mask in sobel_phases:
            run(phase_name, self.sobel_kernel(*config_2d),
                blurred, mask, self.size, self.size)

        # Extend large edge detection mask;
        run("maximum", self.maximum_kernel(*config_1d),
            self.mask_large, self.maximum, self.size**2)
        run("minimum", self.minimum_kernel(*config_1d),
            self.mask_large, self.minimum, self.size**2)
        run("extend", self.extend_kernel(*config_1d),
            self.mask_large, self.minimum, self.maximum, self.size**2)

        # Unsharpen;
        run("unsharpen", self.unsharpen_kernel(*config_1d),
            self.image, self.blurred_unsharpen, self.image_unsharpen,
            self.unsharpen_amount, self.size * self.size)

        # Combine results;
        run("combine", self.combine_mask_kernel(*config_1d),
            self.image_unsharpen, self.blurred_large, self.mask_large,
            self.image2, self.size * self.size)
        run("combine_2", self.combine_mask_kernel(*config_1d),
            self.image2, self.blurred_small, self.mask_small,
            self.image3, self.size * self.size)

        # Add a final sync step to measure the real computation time:
        # the value read from image3 is discarded on purpose;
        if self.time_phases:
            start = System.nanoTime()
        _ = self.image3[0][0]
        end = System.nanoTime()
        if self.time_phases:
            sync_phase = {"name": "sync", "time_sec": (end - start) / 1_000_000_000}
            self.benchmark.add_phase(sync_phase)
        self.benchmark.add_computation_time((end - start_comp) / 1_000_000_000)

        # NOTE(review): this method never copies image3 into self.gpu_result,
        # so the values logged and returned below are whatever gpu_result
        # already held; confirm that this is intended.
        self.benchmark.add_to_benchmark("gpu_result", 0)
        if self.benchmark.debug:
            preview = ", ".join(f"{x:.4f}" for x in self.gpu_result[0, :10])
            BenchmarkResult.log_message("\tgpu result: [" + preview + "...]")

        return self.gpu_result
Ejemplo n.º 16
0
    # Use the BenchmarkResult defaults for every option that is missing or falsy;
    debug = args.debug or BenchmarkResult.DEFAULT_DEBUG
    num_iter = args.num_iter or BenchmarkResult.DEFAULT_NUM_ITER
    output_path = args.output_path or ""
    realloc = args.realloc or [BenchmarkResult.DEFAULT_REALLOC]
    reinit = args.reinit or [BenchmarkResult.DEFAULT_REINIT]
    random_init = args.random or BenchmarkResult.DEFAULT_RANDOM_INIT
    # These options are taken from the command line as they are;
    cpu_validation = args.cpu_validation
    time_phases = args.time_phases
    nvprof_profile = args.nvprof

    # Create a new benchmark result instance;
    benchmark_res = BenchmarkResult(
        debug=debug,
        num_iterations=num_iter,
        output_path=output_path,
        cpu_validation=cpu_validation,
        random_init=random_init,
    )
    if benchmark_res.debug:
        BenchmarkResult.log_message(f"using CPU validation: {cpu_validation}")

    # Keep only the benchmarks requested on the command line, if any;
    if args.benchmark:
        if benchmark_res.debug:
            BenchmarkResult.log_message(f"using only benchmark: {args.benchmark}")
        benchmarks = {name: benchmarks[name] for name in args.benchmark}

    # Replace the policy list of every entry with the single requested policy, if any;
    if args.policy:
        if benchmark_res.debug:
            BenchmarkResult.log_message(f"using only type: {args.policy}")
        policies = {name: [args.policy] for name in policies}
Ejemplo n.º 17
0
def execute_grcuda_benchmark(benchmark,
                             size,
                             exec_policy,
                             new_stream_policy,
                             parent_stream_policy,
                             dependency_policy,
                             num_iter,
                             debug,
                             time_phases,
                             num_blocks=DEFAULT_NUM_BLOCKS,
                             prefetch=False):
    """Run one benchmark command in a subprocess for each entry of ``use_metrics``.

    For every value ``m`` of the module-level ``use_metrics`` the function
    builds an output path under ``LOG_FOLDER/<current date>``, fills one of
    the ``GRAALPYTHON_CMD*`` templates and executes it through the shell.

    The debug banner still reads the module-level names ``i`` and
    ``tot_benchmarks``; they have to be defined by the calling module.

    :param benchmark: benchmark name; also the key used to look up the 1D and
        2D block sizes in ``block_sizes_1d_dict`` / ``block_sizes_2d_dict``
    :param size: input size passed to the benchmark command
    :param exec_policy: execution policy passed to the command
    :param new_stream_policy: new-stream policy passed to the command
    :param parent_stream_policy: parent-stream policy passed to the command
    :param dependency_policy: dependency policy passed to the command
    :param num_iter: number of iterations passed to the command
    :param debug: if true, log progress messages and pass ``-d`` to the command
    :param time_phases: if true, pass ``-p`` to the command
    :param num_blocks: number of blocks passed to the command
    :param prefetch: if true, pass ``--grcuda.InputPrefetch`` to the command
    :raises subprocess.CalledProcessError: if the command exits with a
        non-zero return code
    """
    # Use the function argument instead of a loop variable leaked by the caller;
    block_size = (block_sizes_1d_dict[benchmark], block_sizes_2d_dict[benchmark])
    for m in use_metrics:
        if debug:
            BenchmarkResult.log_message("")
            BenchmarkResult.log_message("")
            BenchmarkResult.log_message("#" * 30)
            BenchmarkResult.log_message(f"Benchmark {i + 1}/{tot_benchmarks}")
            BenchmarkResult.log_message(
                f"benchmark={benchmark}, size={size},"
                f"block size={block_size}, "
                f"num blocks={num_blocks}, "
                f"exec policy={exec_policy}, "
                f"new stream policy={new_stream_policy}, "
                f"parent stream policy={parent_stream_policy}, "
                f"dependency policy={dependency_policy}, "
                f"prefetch={prefetch}, "
                f"time_phases={time_phases}, "
                f"collect metrics={m}")
            BenchmarkResult.log_message("#" * 30)
            BenchmarkResult.log_message("")
            BenchmarkResult.log_message("")

        log_folder = datetime.now().strftime("%Y_%m_%d")
        # Create the folder if it doesn't exist;
        output_folder_path = os.path.join(LOG_FOLDER, log_folder)
        if debug and not os.path.exists(output_folder_path):
            BenchmarkResult.log_message(
                f"creating result folder: {output_folder_path}")
        # makedirs(exist_ok=True) also creates missing parents and does not
        # fail if the folder appears between the check and the creation;
        os.makedirs(output_folder_path, exist_ok=True)
        file_name = f"{benchmark}_{exec_policy}_{'metric' if m else 'nometric'}_{prefetch}{'' if (POST_TURING and m) else '_%p'}.csv"
        output_path = os.path.join(output_folder_path, file_name)

        # Pick the command template; the arguments are the same for all of them;
        if POST_TURING:
            cmd_template = GRAALPYTHON_CMD_METRICS if m else GRAALPYTHON_CMD_TRACE
        else:
            cmd_template = GRAALPYTHON_CMD
        benchmark_cmd = cmd_template.format(
            output_path, METRICS if m else "", GRAALPYTHON_FOLDER, HEAP_SIZE,
            new_stream_policy,
            "--grcuda.InputPrefetch" if prefetch else "", exec_policy,
            dependency_policy, parent_stream_policy, num_iter, size,
            num_blocks, benchmark, block_size[0], block_size[1],
            "-d" if debug else "", "-p" if time_phases else "")

        start = System.nanoTime()
        # subprocess.STDOUT is only meaningful as the `stderr` argument, so no
        # `stdout` is passed: the child simply inherits the parent's stdout.
        # check=True raises CalledProcessError on a non-zero return code;
        subprocess.run(
            benchmark_cmd,
            shell=True,
            check=True,
            cwd=f"{GRCUDA_HOME}/projects/resources/python/benchmark")
        end = System.nanoTime()
        if debug:
            BenchmarkResult.log_message(
                f"Benchmark total execution time: {(end - start) / 1_000_000_000:.2f} seconds"
            )
Ejemplo n.º 18
0
    def cpu_validation(self, gpu_result: object, reinit: bool) -> None:
        """Recompute the image pipeline on the CPU and compare it with the GPU output.

        The CPU result is recomputed only on the first iteration or when
        ``reinit`` is true; otherwise the cached ``self.cpu_result`` is reused.
        The elapsed CPU time (in seconds) and the sum of the absolute
        element-wise differences are stored in ``self.benchmark``.

        :param gpu_result: 2D result of the GPU computation, indexed as
            ``gpu_result[i, j]`` for ``i, j`` in ``range(self.size)``
        :param reinit: if true, recompute the CPU result even after the first
            iteration
        """

        sobel_filter_diameter = 3
        sobel_filter_x = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]])
        sobel_filter_y = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])

        def sobel_filter(image):
            """Return the gradient magnitude of ``image`` using the two 3x3 filters above."""
            out = np.zeros(image.shape)
            rows, cols = image.shape
            radius = sobel_filter_diameter // 2

            for i in range(rows):
                for j in range(cols):
                    sum_gradient_x = 0
                    sum_gradient_y = 0
                    for x in range(-radius, radius + 1):
                        for y in range(-radius, radius + 1):
                            nx = x + i
                            ny = y + j
                            # Neighbours outside of the image are skipped;
                            if 0 <= nx < rows and 0 <= ny < cols:
                                gray_value_neigh = image[nx, ny]
                                gradient_x = sobel_filter_x[x + radius][y + radius]
                                gradient_y = sobel_filter_y[x + radius][y + radius]
                                sum_gradient_x += gray_value_neigh * gradient_x
                                sum_gradient_y += gray_value_neigh * gradient_y
                    out[i, j] = np.sqrt(sum_gradient_x ** 2 + sum_gradient_y ** 2)
            return out

        def gaussian_blur(image, kernel):
            """Return ``image`` filtered with the square ``kernel``, skipping out-of-image neighbours."""
            out = np.zeros(image.shape)
            rows, cols = image.shape

            # Blur radius;
            diameter = kernel.shape[0]
            radius = diameter // 2

            # Flatten image and kernel;
            image_1d = image.reshape(-1)
            kernel_1d = kernel.reshape(-1)

            for i in range(rows):
                for j in range(cols):
                    sum_tmp = 0
                    for x in range(-radius, radius + 1):
                        for y in range(-radius, radius + 1):
                            nx = x + i
                            ny = y + j
                            if 0 <= nx < rows and 0 <= ny < cols:
                                sum_tmp += kernel_1d[(x + radius) * diameter + (y + radius)] * image_1d[nx * cols + ny]
                    out[i, j] = sum_tmp
            return out

        def normalize(image):
            """Rescale ``image`` linearly so that its minimum maps to 0 and its maximum to 1."""
            return (image - np.min(image)) / (np.max(image) - np.min(image))

        def truncate(image, minimum=0, maximum=1):
            """Return a copy of ``image`` with its values limited to [minimum, maximum]."""
            out = image.copy()
            out[out < minimum] = minimum
            out[out > maximum] = maximum
            return out

        # Recompute the CPU result only if necessary;
        start = System.nanoTime()
        if self.current_iter == 0 or reinit:
            # Part 1: Small blur on medium frequencies;
            blurred_small = gaussian_blur(self.image_cpu, self.kernel_small_cpu)
            edges_small = sobel_filter(blurred_small)

            # Part 2: High blur on low frequencies;
            blurred_large = gaussian_blur(self.image_cpu, self.kernel_large_cpu)
            edges_large = sobel_filter(blurred_large)
            # Extend mask to cover a larger area;
            edges_large = normalize(edges_large) * 5
            edges_large[edges_large > 1] = 1

            # Part 3: Sharpen image;
            unsharpen = gaussian_blur(self.image_cpu, self.kernel_unsharpen_cpu)
            # NOTE(review): hard-coded amount; confirm that it matches the
            # amount used by the GPU computation.
            amount = 0.5
            sharpened = truncate(self.image_cpu * (1 + amount) - unsharpen * amount)

            # Part 4: Merge sharpened image and low frequencies;
            image2 = normalize(sharpened * edges_large + blurred_large * (1 - edges_large))

            # Part 5: Merge image and medium frequencies;
            self.cpu_result = image2 * edges_small + blurred_small * (1 - edges_small)

        # System.nanoTime() counts nanoseconds: convert the elapsed time to
        # seconds, as it is stored as "cpu_time_sec" and logged as "sec";
        cpu_time = (System.nanoTime() - start) / 1_000_000_000

        # Compare GPU and CPU results;
        difference = 0
        for i in range(self.size):
            for j in range(self.size):
                difference += np.abs(self.cpu_result[i, j] - gpu_result[i, j])

        self.benchmark.add_to_benchmark("cpu_time_sec", cpu_time)
        self.benchmark.add_to_benchmark("cpu_gpu_res_difference", str(difference))
        if self.benchmark.debug:
            BenchmarkResult.log_message(f"\tcpu result: [" + ", ".join([f"{x:.4f}" for x in self.cpu_result[0, :10]])
                                        + "...]; " +
                                        f"difference: {difference:.4f}, time: {cpu_time:.4f} sec")
Ejemplo n.º 19
0
    def execute(self) -> object:
        """Run the iterative solver phases and record their timing.

        After an initialization phase (r = b - A * x, p = r, t1 = r^t * r),
        each of the ``self.num_iterations`` iterations submits the spmv, dot
        product, saxpy and norm kernels through ``self.execute_phase``. The
        scalars alpha and beta are computed on the host from ``self.t1`` and
        ``self.t2`` between the kernel launches. One element of ``self.x`` is
        read at the end as the final sync step.

        :return: ``self.gpu_result``, filled with the content of ``self.x``
        """
        run = self.execute_phase
        grid = self.num_blocks_size
        threads = self.block_size
        n = self.size
        # Launch configuration shared by the two spmv kernels;
        spmv_config = (int(np.ceil(n / threads)), threads, 4 * threads)

        start_comp = System.nanoTime()
        start = 0

        # Initialization phase;
        # r = b - A * x
        run("spmv_init", self.spmv_full_kernel(*spmv_config),
            self.row_cnt_1, self.ptr, self.idx, self.val, self.x, self.r, n, -1, self.b)
        # p = r
        run("cpy_init", self.cpy_kernel(grid, threads), self.p, self.r, n)
        # t1 = r^t * r
        run("norm_init", self.norm_kernel(grid, threads), self.r, self.t1, n)

        for step in range(self.num_iterations):
            # t2 = p^t * A * p
            run(f"spmv_{step}", self.spmv_kernel(*spmv_config),
                self.row_cnt_2, self.ptr, self.idx, self.val, self.p, self.y, n)
            self.t2[0] = 0
            run(f"dp_{step}", self.dp_kernel(grid, threads), self.p, self.y, self.t2, n)

            # Host-side step: compute alpha, then reset t1 and the row counters;
            if self.time_phases:
                start = System.nanoTime()
            alpha = self.t1[0] / self.t2[0]
            old_r_norm_squared = self.t1[0]
            self.t1[0] = 0
            self.row_cnt_1[0] = 0.0
            self.row_cnt_2[0] = 0.0
            if self.time_phases:
                end = System.nanoTime()
                alpha_phase = {"name": f"alpha_{step}", "time_sec": (end - start) / 1_000_000_000}
                self.benchmark.add_phase(alpha_phase)

            # Update x: x = x + alpha * p
            run(f"saxpy_x_{step}", self.saxpy_kernel(grid, threads),
                self.x, self.x, self.p, alpha, n)
            # r = r - alpha * y
            run(f"saxpy_r_{step}", self.saxpy_kernel(grid, threads),
                self.r, self.r, self.y, -1 * alpha, n)
            # t1 = r^t * r
            run(f"norm_{step}", self.norm_kernel(grid, threads), self.r, self.t1, n)

            # Host-side step: beta is the ratio between the new and the old t1;
            if self.time_phases:
                start = System.nanoTime()
            beta = self.t1[0] / old_r_norm_squared
            if self.time_phases:
                end = System.nanoTime()
                beta_phase = {"name": f"beta_{step}", "time_sec": (end - start) / 1_000_000_000}
                self.benchmark.add_phase(beta_phase)

            run(f"saxpy_p_{step}", self.saxpy_kernel(grid, threads),
                self.p, self.r, self.p, beta, n)

        # Add a final sync step to measure the real computation time:
        # the value read from x is discarded on purpose;
        if self.time_phases:
            start = System.nanoTime()
        _ = self.x[0]
        end = System.nanoTime()
        if self.time_phases:
            sync_phase = {"name": "sync", "time_sec": (end - start) / 1_000_000_000}
            self.benchmark.add_phase(sync_phase)
        self.benchmark.add_computation_time((end - start_comp) / 1_000_000_000)

        # Compute GPU result: copy x element by element;
        for k in range(self.size):
            self.gpu_result[k] = self.x[k]

        self.benchmark.add_to_benchmark("gpu_result", 0)
        if self.benchmark.debug:
            preview = ", ".join(f"{x:.4f}" for x in self.gpu_result[:10])
            BenchmarkResult.log_message("\tgpu result: [" + preview + "...]")

        return self.gpu_result
Ejemplo n.º 20
0
def execute_cuda_benchmark(benchmark,
                           size,
                           block_size,
                           exec_policy,
                           num_iter,
                           debug,
                           prefetch=False,
                           num_blocks=DEFAULT_NUM_BLOCKS,
                           output_date=None):
    """Run one low-level CUDA benchmark command in a subprocess.

    The result file is placed in ``<DEFAULT_RES_FOLDER>/<output_date>_cuda``;
    the folder is created if it does not exist. The command is built from the
    ``CUDA_CMD`` template and executed through the shell, from the
    ``projects/resources/cuda/bin`` folder under ``$GRCUDA_HOME``.

    The debug banner still reads the module-level names ``i`` and
    ``tot_benchmarks``; they have to be defined by the calling module.

    :param benchmark: benchmark name passed to the command
    :param size: input size passed to the command
    :param block_size: mapping with the keys ``"block_size_1d"`` and
        ``"block_size_2d"``
    :param exec_policy: execution policy passed to the command
    :param num_iter: number of iterations passed to the command
    :param debug: if true, log progress messages
    :param prefetch: if true, pass ``-r`` to the command
    :param num_blocks: number of blocks passed to the command
    :param output_date: timestamp used in the folder and file names; the
        current date and time are used when it is not given
    :raises subprocess.CalledProcessError: if the command exits with a
        non-zero return code
    """
    if debug:
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message(f"Benchmark {i + 1}/{tot_benchmarks}")
        # Log the function arguments instead of loop variables leaked by the caller;
        BenchmarkResult.log_message(f"benchmark={benchmark}, size={size},"
                                    f" block size={block_size}, "
                                    f" prefetch={prefetch}, "
                                    f" num blocks={num_blocks}, "
                                    f" exec policy={exec_policy}")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")

    if not output_date:
        output_date = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    file_name = f"cuda_{output_date}_{benchmark}_{exec_policy}_{size}_{block_size['block_size_1d']}_{block_size['block_size_2d']}_{prefetch}_{num_iter}_{num_blocks}.csv"
    # Create the folder if it doesn't exist;
    output_folder_path = os.path.join(BenchmarkResult.DEFAULT_RES_FOLDER,
                                      output_date + "_cuda")
    if debug and not os.path.exists(output_folder_path):
        BenchmarkResult.log_message(
            f"creating result folder: {output_folder_path}")
    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # if the folder appears between the check and the creation;
    os.makedirs(output_folder_path, exist_ok=True)
    output_path = os.path.join(output_folder_path, file_name)

    benchmark_cmd = CUDA_CMD.format(benchmark, exec_policy, size,
                                    block_size["block_size_1d"],
                                    block_size["block_size_2d"], num_iter,
                                    num_blocks, "-r" if prefetch else "", "-a",
                                    output_path)
    start = System.nanoTime()
    # subprocess.STDOUT is only meaningful as the `stderr` argument, so no
    # `stdout` is passed: the child simply inherits the parent's stdout.
    # check=True raises CalledProcessError on a non-zero return code;
    subprocess.run(
        benchmark_cmd,
        shell=True,
        check=True,
        cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/cuda/bin")
    end = System.nanoTime()
    if debug:
        BenchmarkResult.log_message(
            f"Benchmark total execution time: {(end - start) / 1_000_000_000:.2f} seconds"
        )
Ejemplo n.º 21
0
def execute_grcuda_benchmark(benchmark,
                             size,
                             block_sizes,
                             exec_policy,
                             new_stream_policy,
                             parent_stream_policy,
                             dependency_policy,
                             num_iter,
                             debug,
                             time_phases,
                             num_blocks=DEFAULT_NUM_BLOCKS,
                             prefetch=False,
                             output_date=None):
    """Run one GrCUDA benchmark command in a subprocess.

    The result file is placed in ``<DEFAULT_RES_FOLDER>/<output_date>_grcuda``;
    the folder is created if it does not exist. The command is built from the
    ``GRAALPYTHON_CMD`` template and executed through the shell, from the
    ``projects/resources/python/benchmark`` folder under ``$GRCUDA_HOME``.

    The debug banner still reads the module-level names ``i`` and
    ``tot_benchmarks``; they have to be defined by the calling module.

    :param benchmark: benchmark name passed to the command
    :param size: input size passed to the command
    :param block_sizes: iterable of mappings with the keys ``"block_size_1d"``
        and ``"block_size_2d"``; all the values are passed to the command,
        separated by spaces
    :param exec_policy: execution policy passed to the command
    :param new_stream_policy: new-stream policy passed to the command
    :param parent_stream_policy: parent-stream policy passed to the command
    :param dependency_policy: dependency policy passed to the command
    :param num_iter: number of iterations passed to the command
    :param debug: if true, log progress messages and pass ``-d`` to the command
    :param time_phases: if true, pass ``-p`` to the command
    :param num_blocks: number of blocks passed to the command
    :param prefetch: if true, pass ``--grcuda.InputPrefetch`` to the command
    :param output_date: timestamp used in the folder and file names; the
        current date and time are used when it is not given
    :raises subprocess.CalledProcessError: if the command exits with a
        non-zero return code
    """
    if debug:
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message(f"Benchmark {i + 1}/{tot_benchmarks}")
        # Log the `size` argument instead of a loop variable leaked by the caller;
        BenchmarkResult.log_message(
            f"benchmark={benchmark}, size={size},"
            f"block sizes={block_sizes}, "
            f"num blocks={num_blocks}, "
            f"exec policy={exec_policy}, "
            f"new stream policy={new_stream_policy}, "
            f"parent stream policy={parent_stream_policy}, "
            f"dependency policy={dependency_policy}, "
            f"prefetch={prefetch}, "
            f"time_phases={time_phases}")
        BenchmarkResult.log_message("#" * 30)
        BenchmarkResult.log_message("")
        BenchmarkResult.log_message("")

    if not output_date:
        output_date = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    file_name = f"{output_date}_{benchmark}_{exec_policy}_{new_stream_policy}_{parent_stream_policy}_" \
                f"{dependency_policy}_{prefetch}_{size}_{num_iter}_{num_blocks}.json"
    # Create the folder if it doesn't exist;
    output_folder_path = os.path.join(BenchmarkResult.DEFAULT_RES_FOLDER,
                                      output_date + "_grcuda")
    if debug and not os.path.exists(output_folder_path):
        BenchmarkResult.log_message(
            f"creating result folder: {output_folder_path}")
    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # if the folder appears between the check and the creation;
    os.makedirs(output_folder_path, exist_ok=True)
    output_path = os.path.join(output_folder_path, file_name)
    b1d_size = " ".join(str(bs["block_size_1d"]) for bs in block_sizes)
    b2d_size = " ".join(str(bs["block_size_2d"]) for bs in block_sizes)

    benchmark_cmd = GRAALPYTHON_CMD.format(
        HEAP_SIZE, new_stream_policy,
        "--grcuda.InputPrefetch" if prefetch else "", exec_policy,
        dependency_policy, parent_stream_policy, num_iter, size, num_blocks,
        benchmark, b1d_size, b2d_size, "-d" if debug else "",
        "-p" if time_phases else "", output_path)
    start = System.nanoTime()
    # subprocess.STDOUT is only meaningful as the `stderr` argument, so no
    # `stdout` is passed: the child simply inherits the parent's stdout.
    # check=True raises CalledProcessError on a non-zero return code;
    subprocess.run(
        benchmark_cmd,
        shell=True,
        check=True,
        cwd=f"{os.getenv('GRCUDA_HOME')}/projects/resources/python/benchmark")
    end = System.nanoTime()
    if debug:
        BenchmarkResult.log_message(
            f"Benchmark total execution time: {(end - start) / 1_000_000_000:.2f} seconds"
        )
Ejemplo n.º 22
0
    def execute(self) -> object:
        """Run the authority/hub iterations and record their timing.

        Each of the ``self.num_iterations`` iterations submits, through
        ``self.execute_phase``, the spmv, sum and divide kernels for the
        authorities and for the hubs (authorities first in each pair), then
        resets ``self.auth_norm`` and ``self.hub_norm`` on the host. One
        element of ``self.auth1`` and of ``self.hub1`` is read at the end as
        the final sync step.

        :return: ``self.gpu_result``, holding ``auth1[k] + hub1[k]`` for each ``k``
        """
        run = self.execute_phase
        grid = self.num_blocks_size
        threads = self.block_size
        n = self.size

        start_comp = System.nanoTime()
        start = 0

        for step in range(self.num_iterations):
            # Authorities, then hubs;
            spmv_phases = (
                ("a", self.ptr2, self.idx2, self.val2, self.hub1, self.auth2),
                ("h", self.ptr, self.idx, self.val, self.auth1, self.hub2),
            )
            for tag, ptr, idx, val, vec_in, vec_out in spmv_phases:
                run(f"spmv_{tag}_{step}", self.spmv_kernel(grid, threads),
                    ptr, idx, val, vec_in, vec_out, n, self.num_nnz)

            # Normalize authorities, then hubs: first the two sums ...
            sum_phases = (
                ("a", self.auth2, self.auth_norm),
                ("h", self.hub2, self.hub_norm),
            )
            for tag, vec, norm in sum_phases:
                run(f"sum_{tag}_{step}", self.sum_kernel(grid, threads), vec, norm, n)

            # ... then the two divisions;
            divide_phases = (
                ("a", self.auth2, self.auth1, self.auth_norm),
                ("h", self.hub2, self.hub1, self.hub_norm),
            )
            for tag, vec_in, vec_out, norm in divide_phases:
                run(f"divide_{tag}_{step}", self.divide_kernel(grid, threads),
                    vec_in, vec_out, norm, n)

            # Host-side step: reset both norms before the next iteration;
            if self.time_phases:
                start = System.nanoTime()
            self.auth_norm[0] = 0.0
            self.hub_norm[0] = 0.0
            if self.time_phases:
                end = System.nanoTime()
                reset_phase = {
                    "name": f"norm_reset_{step}",
                    "time_sec": (end - start) / 1_000_000_000,
                }
                self.benchmark.add_phase(reset_phase)

        # Add a final sync step to measure the real computation time:
        # the values read from auth1 and hub1 are discarded on purpose;
        if self.time_phases:
            start = System.nanoTime()
        _ = self.auth1[0]
        _ = self.hub1[0]
        end = System.nanoTime()
        if self.time_phases:
            sync_phase = {"name": "sync", "time_sec": (end - start) / 1_000_000_000}
            self.benchmark.add_phase(sync_phase)
        self.benchmark.add_computation_time((end - start_comp) / 1_000_000_000)

        # Compute GPU result: element-wise sum of authorities and hubs;
        for k in range(self.size):
            self.gpu_result[k] = self.auth1[k] + self.hub1[k]

        self.benchmark.add_to_benchmark("gpu_result", 0)
        if self.benchmark.debug:
            preview = ", ".join(f"{x:.4f}" for x in self.gpu_result[:10])
            BenchmarkResult.log_message("\tgpu result: [" + preview + "...]")

        return self.gpu_result
Ejemplo n.º 23
0
    )

    # Read the command-line options;
    args = parser.parse_args()

    # Use the BenchmarkResult defaults when these two options are missing or falsy;
    debug = args.debug or BenchmarkResult.DEFAULT_DEBUG
    num_iter = args.num_iter or BenchmarkResult.DEFAULT_NUM_ITER
    # These options are taken from the command line as they are;
    use_cuda = args.cuda_test
    time_phases = args.time_phases
    num_blocks = args.num_blocks

    # Build the list of block sizes from the 1D and 2D block sizes;
    block_sizes = create_block_size_list(block_sizes_1d, block_sizes_2d)
    if debug:
        BenchmarkResult.log_message(
            f"using block sizes: {block_sizes}; using low-level CUDA benchmarks: {use_cuda}")

    def tot_benchmark_count():
        """Return the total number of benchmark configurations.

        For each benchmark, the number of entries in ``num_elem`` is
        multiplied by the lengths of the option lists; which lists are used
        depends on ``use_cuda``.
        """
        if use_cuda:
            return sum(
                len(num_elem[b]) * len(block_sizes) * len(cuda_exec_policies)
                * len(new_stream_policies) * len(parent_stream_policies)
                * len(dependency_policies) * len(prefetch)
                for b in benchmarks)
        return sum(
            len(num_elem[b]) * len(exec_policies) * len(prefetch)
            for b in benchmarks)