def test_occupancy_32_9_21_threads(self):
    """Check the occupancy figures the generator reports for this operand set.

    After ``generate`` runs, the generator is expected to report 32 active
    threads and 2 multiplications per block.
    """
    # All three operands are strided and non-transposed.
    layout = {"addressing": "strided", "transpose": False}

    lhs = DenseMatrix(num_rows=32, num_cols=56, bbox=[0, 0, 31, 20], **layout)
    rhs = DenseMatrix(num_rows=32, num_cols=56, bbox=[0, 0, 20, 8], **layout)
    result = DenseMatrix(num_rows=32, num_cols=9, bbox=[0, 0, 31, 8], **layout)

    generator = self.gen
    generator.generate(lhs, rhs, result, alpha=1.1, beta=1.1)

    # NOTE(review): the comment that used to sit here said "one
    # multiplication per block because the kernel requires too much of
    # shared memory", yet the assertion below expects 2 -- confirm which
    # of the two is intended.
    self.assertEqual(generator.num_active_threads, 32)
    self.assertEqual(generator.num_mult_per_block, 2)
def test_exact_transpose_loader(self):
    """The factory must hand back an ExactTransposePatchLoader in both cases.

    Each case is a transposed matrix with ``addressing='none'`` and 56
    columns, requested with 32 active threads and ``load_and_transpose=True``.
    """
    # (num_rows, bbox) per scenario, checked in this order.
    scenarios = (
        # load a column in one go
        (33, [0, 0, 15, 20]),
        # multiple hops to load a column
        (65, [0, 0, 34, 20]),
    )

    for total_rows, patch in scenarios:
        loader = shm_mem_factory(
            vm=self._vm,
            matrix=DenseMatrix(
                num_rows=total_rows,
                num_cols=56,
                addressing='none',
                bbox=patch,
                transpose=True,
            ),
            num_active_threads=32,
            load_and_transpose=True,
        )
        self.assertIsInstance(loader, ExactTransposePatchLoader)
def test_occupancy_56_9_9_threads(self):
    """Check the occupancy figures the generator reports for this operand set.

    After ``generate`` runs, the generator is expected to report 64 active
    threads and 1 multiplication per block.
    """
    # All three operands are strided and non-transposed.
    layout = {"addressing": "strided", "transpose": False}

    lhs = DenseMatrix(num_rows=56, num_cols=9, bbox=[0, 0, 55, 8], **layout)
    rhs = DenseMatrix(num_rows=9, num_cols=9, bbox=[0, 0, 8, 8], **layout)
    result = DenseMatrix(num_rows=56, num_cols=9, bbox=[0, 0, 55, 8], **layout)

    generator = self.gen
    generator.generate(lhs, rhs, result, alpha=1.1, beta=1.1)

    self.assertEqual(generator.num_active_threads, 64)
    self.assertEqual(generator.num_mult_per_block, 1)
def produce_matrix(spec):
    """Build a DenseMatrix from a specification mapping.

    Reads the keys "num_rows", "num_cols", "addressing", "bbox" and "trans"
    from ``spec``; the "trans" entry is forwarded as the ``transpose``
    argument. A missing key raises ``KeyError`` from the lookup.
    """
    # (constructor keyword, spec key) pairs, listed in lookup order.
    field_map = (
        ("num_rows", "num_rows"),
        ("num_cols", "num_cols"),
        ("addressing", "addressing"),
        ("bbox", "bbox"),
        ("transpose", "trans"),
    )
    return DenseMatrix(**{keyword: spec[key] for keyword, key in field_map})
def _produce_matrix(self, matrix_spec):
    """Build a DenseMatrix from a specification mapping.

    Reads the keys "rows", "cols", "addressing", "bbox" and "trans" from
    ``matrix_spec``; "rows"/"cols" are forwarded as ``num_rows``/``num_cols``
    and "trans" as ``transpose``. A missing key raises ``KeyError`` from the
    lookup.
    """
    # (constructor keyword, spec key) pairs, listed in lookup order.
    field_map = (
        ("num_rows", "rows"),
        ("num_cols", "cols"),
        ("addressing", "addressing"),
        ("bbox", "bbox"),
        ("transpose", "trans"),
    )
    return DenseMatrix(
        **{keyword: matrix_spec[key] for keyword, key in field_map}
    )
"--manufacturer", action="store", help="Name of the Manufacturer, currently nvidia and amd are supported", default="nvidia") parser.add_argument( "-s", "--sub_arch", action="store", help="Sub_arch of the GPU, e.g sm_60 for Nvidia or gfx906 for AMD", default="sm_60") args = parser.parse_args() mat_a = DenseMatrix(num_rows=9, num_cols=9, addressing="strided", bbox=[0, 0, 8, 8], transpose=False) mat_b = DenseMatrix(num_rows=9, num_cols=9, addressing="strided", bbox=[0, 0, 8, 8], transpose=False) try: vm = vm_factory(name=args.manufacturer, sub_name=args.sub_arch, fp_type="float") gen = CsaGenerator(vm)