Ejemplo n.º 1
0
    def __translate_define_expr(self, expr, end=True):
        ''' Get a single parsed DEFINE expression and return the equivalent
            C++ and CUDA code.

            If 'end' is false, then the final characters of the expression,
            like semi-colons and newlines, are not added.
        '''

        cpp = ''
        cuda = ''

        return_type = Type.enum_to_c_type(expr.loc, expr.type)
        cpp = f'{return_type} {expr.name}('

        # Add the parameters.
        for (arg_type, arg_name) in expr.args[:-1]:
            cpp += f'{Type.enum_to_c_type(expr.loc, arg_type)} {arg_name}, '

        # The last parameter is a little different.
        if len(expr.args) > 0:
            (arg_type, arg_name) = expr.args[-1]
            cpp += f'{Type.enum_to_c_type(expr.loc, arg_type)} {arg_name}'
        cpp += ')'

        # Add this prototype for use in a header file.
        self.cpp_prototypes.append(cpp + ';\n')

        cpp += ' {\n'

        # Add the body of the function.
        body_cpp = ''
        for e in expr.body[:-1]:
            (c, cu) = self.__translate_expr(e, True)
            body_cpp += c
            cuda += cu

        # The last expression should be returned.
        if len(expr.body) == 0:
            error_str = 'expected one or more body expressions'
            raise error.Syntax(expr.loc, error_str)

        e = expr.body[-1]
        if e.exprClass != ExprEnum.LITERAL and \
           e.exprClass != ExprEnum.GET_VAR and \
           e.exprClass != ExprEnum.CALL:
            error_str = 'expected literal, get, or call as last body expression'
            raise error.Syntax(expr.loc, error_str)

        (c, cu) = self.__translate_expr(e, False)
        body_cpp += f'return {c};\n'
        cuda += cu

        self._increase_indent()
        body_cpp = self._make_indented(body_cpp)
        self._decrease_indent()

        cpp = self._make_indented(cpp)
        cpp = cpp + body_cpp + self._make_indented('}\n\n')

        return (cpp, cuda)
Ejemplo n.º 2
0
    def __translate_parallel_loop_expr(self, expr, end=True):
        ''' Get a single parsed PARA_LOOP expression and return the equivalent
            C++ and CUDA code.

            If 'end' is false, then the final characters of the expression,
            like semi-colons and newlines, are not added.
        '''

        def sub_expr_str(expr):
            return f'{self.__translate_expr(expr, end=False)[0]}'

        cpp = ''
        cuda = ''

        # Get a unique name for the cuda kernel.
        self._para_loop_ind += 1
        cuda_kernel_name = f'cuda_loop{self._para_loop_ind}_kernel'

        # Determine the number of blocks and threads to use.
        iters_str = f'({sub_expr_str(expr.end_index)} - ' + \
                    f'{sub_expr_str(expr.start_index)})'
        threads_per_block = f'min(512, {iters_str})'
        blocks = f'min(32, 1 + {iters_str} / {threads_per_block})'

        # Setup the function to call the kernel.
        args = []
        for var_name in expr.used_vars:
            var_type = expr.env.lookup_variable(expr.loc, var_name)
            c_type = Type.enum_to_c_type(expr.loc, var_type)

            args.append(f'{c_type} {var_name}')

        cuda += f'void call_{cuda_kernel_name}'
        cuda += f'({", ".join(args)})'
        self.cuda_prototypes.append(cuda + ';\n')

        # Call this kernel-calling fucntion in the cpp code.
        cpp += f'call_{cuda_kernel_name}'
        cpp += f'({", ".join(expr.used_vars)});\n'

        cuda += ' {\n'
        cuda_body = ''

        # Make device variables if necessary.
        dev_vars = []
        for var_name in expr.used_vars:
            dev_name = f'dev_{var_name}'
            data_name = f'{dev_name}_data'  # Only used for lists.
            var_type = expr.env.lookup_variable(expr.loc, var_name)

            if var_type == Type.INT or var_type == Type.FLOAT:
                dev_vars.append(var_name)
            elif var_type == Type.LIST_INT:
                dev_vars.append(dev_name)

                # Allocate memory.
                size = f'{var_name}.size * sizeof(int)'
                cuda_body += f'int *{data_name};\n'
                cuda_body += f'cudaMalloc((void **) &{data_name}, {size});\n'

                # Copy the data from host to device.
                cuda_body += f'cudaMemcpy({data_name}, {var_name}.data, ' + \
                             f'{size}, cudaMemcpyHostToDevice);\n'
                cuda_body += f'int_list {dev_name} = {"{"}{var_name}.size, ' + \
                             f'{data_name}{"}"};\n\n'
            elif var_type == Type.LIST_FLOAT:
                dev_vars.append(dev_name)

                # Allocate memory.
                size = f'{var_name}.size * sizeof(float)'
                cuda_body += f'float *{data_name};\n'
                cuda_body += f'cudaMalloc((void **) &{data_name}, {size});\n'

                # Copy the data from host to device.
                cuda_body += f'cudaMemcpy({data_name}, {var_name}.data, ' + \
                             f'{size}, cudaMemcpyHostToDevice);\n'
                cuda_body += f'float_list {dev_name} = {"{"}' + \
                             f'{var_name}.size, {data_name}{"}"};\n\n'
            elif var_type == Type.STRING or var_type == Type.LIST_STRING:
                error_str = 'strings not yet allowed in parallelization'
                raise error.InternalError(expr.loc, error_str)

        # Call the kernel.
        cuda_body += f'{cuda_kernel_name}<<<{blocks}, {threads_per_block}>>>'
        cuda_body += f'({", ".join(dev_vars)});\n\n'

        # Copy the data back from device to host.
        for var_name in expr.used_vars:
            dev_name = f'dev_{var_name}'
            data_name = f'{dev_name}_data'
            var_type = expr.env.lookup_variable(expr.loc, var_name)

            if var_type == Type.INT or var_type == Type.FLOAT:
                # There is no need to copy the variable back, because C++
                # passes it by value and so the value did not change.
                pass
            elif var_type == Type.LIST_INT:
                size = f'{var_name}.size * sizeof(int)'

                # Copy the data from host to device.
                cuda_body += f'cudaMemcpy({var_name}.data, {data_name}, {size}, '
                cuda_body += f'cudaMemcpyDeviceToHost);\n'
            elif var_type == Type.LIST_FLOAT:
                size = f'{var_name}.size * sizeof(float)'

                # Copy the data from host to device.
                cuda_body += f'cudaMemcpy({var_name}.data, {data_name}, {size}, '
                cuda_body += f'cudaMemcpyDeviceToHost);\n'
            elif var_type == Type.STRING or var_type == Type.LIST_STRING:
                error_str = 'strings not yet allowed in parallelization'
                raise error.InternalError(expr.loc, error_str)

        self._increase_indent()
        cuda_body = self._make_indented(cuda_body)
        self._decrease_indent()

        cuda += cuda_body + '}\n\n'

        # Setup the cuda code.
        # No return value is needed because results are returned via the input
        # lists.
        cuda_kernel = f'__global__ void {cuda_kernel_name}'

        # Add the arguments.
        cuda_kernel += f'({", ".join(args)})'

        # Add this function prototype for use in a header file.
        self.cuda_prototypes.append(cuda_kernel + ';\n')

        cuda_kernel += ' {\n'

        # Determine the index in the loop.
        index = expr.index_name
        cuda_kernel += f'    int {index} = blockIdx.x * blockDim.x + '
        cuda_kernel += f'threadIdx.x + {sub_expr_str(expr.start_index)};\n\n'

        # Loop over all indices that this thread is responsible for.
        max_index = sub_expr_str(expr.end_index)
        cuda_kernel += f'    while ({index} < {max_index}) {"{"}\n'

        for e in expr.body:
            (c, _) = self.__translate_expr(e);
            cuda_kernel += c

        cuda_kernel += f'        {index} += gridDim.x * blockDim.x;\n'
        cuda_kernel += f'    {"}"}\n'
        cuda_kernel += f'{"}"}\n\n'

        cuda += cuda_kernel

        return (cpp, cuda)