def __translate_define_expr(self, expr, end=True): ''' Get a single parsed DEFINE expression and return the equivalent C++ and CUDA code. If 'end' is false, then the final characters of the expression, like semi-colons and newlines, are not added. ''' cpp = '' cuda = '' return_type = Type.enum_to_c_type(expr.loc, expr.type) cpp = f'{return_type} {expr.name}(' # Add the parameters. for (arg_type, arg_name) in expr.args[:-1]: cpp += f'{Type.enum_to_c_type(expr.loc, arg_type)} {arg_name}, ' # The last parameter is a little different. if len(expr.args) > 0: (arg_type, arg_name) = expr.args[-1] cpp += f'{Type.enum_to_c_type(expr.loc, arg_type)} {arg_name}' cpp += ')' # Add this prototype for use in a header file. self.cpp_prototypes.append(cpp + ';\n') cpp += ' {\n' # Add the body of the function. body_cpp = '' for e in expr.body[:-1]: (c, cu) = self.__translate_expr(e, True) body_cpp += c cuda += cu # The last expression should be returned. if len(expr.body) == 0: error_str = 'expected one or more body expressions' raise error.Syntax(expr.loc, error_str) e = expr.body[-1] if e.exprClass != ExprEnum.LITERAL and \ e.exprClass != ExprEnum.GET_VAR and \ e.exprClass != ExprEnum.CALL: error_str = 'expected literal, get, or call as last body expression' raise error.Syntax(expr.loc, error_str) (c, cu) = self.__translate_expr(e, False) body_cpp += f'return {c};\n' cuda += cu self._increase_indent() body_cpp = self._make_indented(body_cpp) self._decrease_indent() cpp = self._make_indented(cpp) cpp = cpp + body_cpp + self._make_indented('}\n\n') return (cpp, cuda)
def __translate_parallel_loop_expr(self, expr, end=True): ''' Get a single parsed PARA_LOOP expression and return the equivalent C++ and CUDA code. If 'end' is false, then the final characters of the expression, like semi-colons and newlines, are not added. ''' def sub_expr_str(expr): return f'{self.__translate_expr(expr, end=False)[0]}' cpp = '' cuda = '' # Get a unique name for the cuda kernel. self._para_loop_ind += 1 cuda_kernel_name = f'cuda_loop{self._para_loop_ind}_kernel' # Determine the number of blocks and threads to use. iters_str = f'({sub_expr_str(expr.end_index)} - ' + \ f'{sub_expr_str(expr.start_index)})' threads_per_block = f'min(512, {iters_str})' blocks = f'min(32, 1 + {iters_str} / {threads_per_block})' # Setup the function to call the kernel. args = [] for var_name in expr.used_vars: var_type = expr.env.lookup_variable(expr.loc, var_name) c_type = Type.enum_to_c_type(expr.loc, var_type) args.append(f'{c_type} {var_name}') cuda += f'void call_{cuda_kernel_name}' cuda += f'({", ".join(args)})' self.cuda_prototypes.append(cuda + ';\n') # Call this kernel-calling fucntion in the cpp code. cpp += f'call_{cuda_kernel_name}' cpp += f'({", ".join(expr.used_vars)});\n' cuda += ' {\n' cuda_body = '' # Make device variables if necessary. dev_vars = [] for var_name in expr.used_vars: dev_name = f'dev_{var_name}' data_name = f'{dev_name}_data' # Only used for lists. var_type = expr.env.lookup_variable(expr.loc, var_name) if var_type == Type.INT or var_type == Type.FLOAT: dev_vars.append(var_name) elif var_type == Type.LIST_INT: dev_vars.append(dev_name) # Allocate memory. size = f'{var_name}.size * sizeof(int)' cuda_body += f'int *{data_name};\n' cuda_body += f'cudaMalloc((void **) &{data_name}, {size});\n' # Copy the data from host to device. cuda_body += f'cudaMemcpy({data_name}, {var_name}.data, ' + \ f'{size}, cudaMemcpyHostToDevice);\n' cuda_body += f'int_list {dev_name} = {"{"}{var_name}.size, ' + \ f'{data_name}{"}"};\n\n' elif var_type == Type.LIST_FLOAT: dev_vars.append(dev_name) # Allocate memory. size = f'{var_name}.size * sizeof(float)' cuda_body += f'float *{data_name};\n' cuda_body += f'cudaMalloc((void **) &{data_name}, {size});\n' # Copy the data from host to device. cuda_body += f'cudaMemcpy({data_name}, {var_name}.data, ' + \ f'{size}, cudaMemcpyHostToDevice);\n' cuda_body += f'float_list {dev_name} = {"{"}' + \ f'{var_name}.size, {data_name}{"}"};\n\n' elif var_type == Type.STRING or var_type == Type.LIST_STRING: error_str = 'strings not yet allowed in parallelization' raise error.InternalError(expr.loc, error_str) # Call the kernel. cuda_body += f'{cuda_kernel_name}<<<{blocks}, {threads_per_block}>>>' cuda_body += f'({", ".join(dev_vars)});\n\n' # Copy the data back from device to host. for var_name in expr.used_vars: dev_name = f'dev_{var_name}' data_name = f'{dev_name}_data' var_type = expr.env.lookup_variable(expr.loc, var_name) if var_type == Type.INT or var_type == Type.FLOAT: # There is no need to copy the variable back, because C++ # passes it by value and so the value did not change. pass elif var_type == Type.LIST_INT: size = f'{var_name}.size * sizeof(int)' # Copy the data from host to device. cuda_body += f'cudaMemcpy({var_name}.data, {data_name}, {size}, ' cuda_body += f'cudaMemcpyDeviceToHost);\n' elif var_type == Type.LIST_FLOAT: size = f'{var_name}.size * sizeof(float)' # Copy the data from host to device. cuda_body += f'cudaMemcpy({var_name}.data, {data_name}, {size}, ' cuda_body += f'cudaMemcpyDeviceToHost);\n' elif var_type == Type.STRING or var_type == Type.LIST_STRING: error_str = 'strings not yet allowed in parallelization' raise error.InternalError(expr.loc, error_str) self._increase_indent() cuda_body = self._make_indented(cuda_body) self._decrease_indent() cuda += cuda_body + '}\n\n' # Setup the cuda code. # No return value is needed because results are returned via the input # lists. cuda_kernel = f'__global__ void {cuda_kernel_name}' # Add the arguments. cuda_kernel += f'({", ".join(args)})' # Add this function prototype for use in a header file. self.cuda_prototypes.append(cuda_kernel + ';\n') cuda_kernel += ' {\n' # Determine the index in the loop. index = expr.index_name cuda_kernel += f' int {index} = blockIdx.x * blockDim.x + ' cuda_kernel += f'threadIdx.x + {sub_expr_str(expr.start_index)};\n\n' # Loop over all indices that this thread is responsible for. max_index = sub_expr_str(expr.end_index) cuda_kernel += f' while ({index} < {max_index}) {"{"}\n' for e in expr.body: (c, _) = self.__translate_expr(e); cuda_kernel += c cuda_kernel += f' {index} += gridDim.x * blockDim.x;\n' cuda_kernel += f' {"}"}\n' cuda_kernel += f'{"}"}\n\n' cuda += cuda_kernel return (cpp, cuda)