Exemple #1
0
    def transform(self, tree, program_cfg):
        arg_cfg, tune_cfg = program_cfg
        channels, height, width = arg_cfg[0]
        cfg = {
            'pad_h': C.Constant(self.pad_h),
            'pad_w': C.Constant(self.pad_w),
            'stride_h': C.Constant(self.stride_h),
            'stride_w': C.Constant(self.stride_w),
            'kernel_h': C.Constant(self.kernel_h),
            'kernel_w': C.Constant(self.kernel_w),
            'channels': C.Constant(channels),
            'height': C.Constant(height),
            'width': C.Constant(width),
        }
        im2col = C.FunctionDecl(
            None,
            C.SymbolRef("im2col"),
            [C.SymbolRef("data_im", arg_cfg[1]()),
             C.SymbolRef("data_col", arg_cfg[1]())],
            [StringTemplate("""
int stride_h = $stride_h;
int stride_w = $stride_w;
int pad_h = $pad_h;
int pad_w = $pad_w;
int kernel_h = $kernel_h;
int kernel_w = $kernel_w;
int channels = $channels;
int height = $height;
int width = $width;
int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
int channels_col = channels * kernel_h * kernel_w;
for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
        for (int w = 0; w < width_col; ++w) {
            int h_pad = h * stride_h - pad_h + h_offset;
            int w_pad = w * stride_w - pad_w + w_offset;
            if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
            data_col[(c * height_col + h) * width_col + w] =
                data_im[(c_im * height + h_pad) * width + w_pad];
            else
                data_col[(c * height_col + h) * width_col + w] = 0;
        }
    }
} """, cfg)])
        return [C.CFile('im2col', [im2col])]
Exemple #2
0
    def build_kernel(self, kernel_src, kernel_name, kernel_args):
        kernel_src = C.CFile('generated', [StringTemplate(
"""
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
"""
            ), kernel_src])
        try:
            program = cl.clCreateProgramWithSource(
                latte.config.cl_ctx, kernel_src.codegen()).build()
            kernel = program[kernel_name]
        except cl.BuildProgramFailureError as e:
            logger.error("Failed build program:\n %s", kernel_src.codegen())
            raise e
        self.kernels[kernel_name] = kernel
        for index, arg in enumerate(kernel_args):
            kernel.setarg(index, self.cl_buffers[arg], ctypes.sizeof(cl.cl_mem))
        logger.debug(kernel_src)
Exemple #3
0
def mpi_compile(project):
    """Return ``project.codegen()``, generating on MPI rank 0 only.

    When ``MPI_ENABLED`` is false the project is generated locally.
    Otherwise rank 0 of ``MPI.COMM_WORLD`` runs ``project.codegen()`` and
    broadcasts the result; every other rank returns what it receives from
    that broadcast instead of generating anything itself.
    """
    if not MPI_ENABLED:
        return project.codegen()

    if MPI.COMM_WORLD.Get_rank() != 0:
        # Non-root ranks take part in the broadcast purely as receivers.
        return MPI.COMM_WORLD.bcast(None, root=0)

    compiled = project.codegen()
    MPI.COMM_WORLD.bcast(compiled, root=0)
    return compiled


# Load templates/utils.c, located relative to this module's own directory,
# and generate it when this module is executed.
_file = FileTemplate(
    os.path.dirname(os.path.abspath(__file__)) + "/templates/utils.c")

c_file = C.CFile("util", [_file])
# codegen() is called directly here rather than going through mpi_compile.
module = ctree.nodes.Project([c_file]).codegen()
# C signature bound below: no arguments, returns a C double.
get_cpu_freq = module.get_callable("get_cpu_freq",
                                   ctypes.CFUNCTYPE(ctypes.c_double))

#
# def aligned(shape, dtype, alignment=64, init=np.empty):
#     if isinstance(shape, list):
#         shape = tuple(shape)
#     pointer = aligned_malloc(np.prod(shape) * np.dtype(dtype).itemsize)
#     typ = np.ctypeslib.ndpointer(dtype=dtype, ndim=len(shape), shape=shape)
#     arr = np.ctypeslib.as_array(typ(pointer), shape)
#     if init == np.empty:
#         return arr
#     elif init == np.zeros:
#         arr.fill(0.0)
Exemple #4
0
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
from ctree.templates.nodes import FileTemplate
import os
import ctree
import ctree.c.nodes as C
import ctypes
import latte
import latte.util as util

# Load templates/sgd.c, located relative to this module's own directory,
# and generate it through util.mpi_compile when this module is executed.
_file = FileTemplate(
    os.path.dirname(os.path.abspath(__file__)) + "/templates/sgd.c")

c_file = C.CFile("sgd", [_file])
module = util.mpi_compile(ctree.nodes.Project([c_file]))
# C signature bound below: returns nothing; takes three float pointers,
# then two floats, then two ints (seven arguments in total).
_sgd_update = module.get_callable(
    "sgd_update",
    ctypes.CFUNCTYPE(None, ctypes.POINTER(ctypes.c_float),
                     ctypes.POINTER(ctypes.c_float),
                     ctypes.POINTER(ctypes.c_float), ctypes.c_float,
                     ctypes.c_float, ctypes.c_int, ctypes.c_int))


def sgd_update(param, grad, hist, lr, mom, batch_size):
    _sgd_update(
        param.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        hist.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        ctypes.c_float(lr),
Exemple #5
0
                dest = dest.reshape(tiled_shape)
                dest[_slice] = tiled
            else:
                dest[_slice] = value
            if cl_buffer is not None:
                _, evt = cl.buffer_from_ndarray(latte.config.cl_queue, dest, buf=cl_buffer)
                evt.wait()
        setattr(self, "set_" + field, set)
        if self.parent_group is not None:
            setattr(self.parent_group, "get_" + field, get)
            setattr(self.parent_group, "get_" + field + "_view", get_view)
            setattr(self.parent_group, "set_" + field, set)

# Load templates/reorder_storage.c, located relative to this module's own
# directory, and generate it through util.mpi_compile when this module is
# executed.  The resulting `module` is what DataEnsemble.__init__ queries for
# the "reorder_4d_5d" callable.
reorder_storage_file = FileTemplate(os.path.dirname(os.path.abspath(__file__)) + "/templates/reorder_storage.c")

c_file = C.CFile("reorder_storage", [reorder_storage_file])
module = util.mpi_compile(ctree.nodes.Project([c_file]))

class DataEnsemble(Ensemble):
    def __init__(self, batch_size, shape):
        """Allocate the value buffer, one neuron per element, and bind the C helper.

        ``self.value`` is a zero-filled float32 array of shape
        ``(batch_size,) + shape`` (so ``shape`` must be a tuple).  An object
        array of the given ``shape`` is filled with a fresh ``DataNeuron`` per
        element and handed to the parent constructor.  ``self.reorder_4d_5d``
        is the ``"reorder_4d_5d"`` callable from the compiled module, typed to
        take two float32 arrays matching ``self.value`` followed by four ints.
        """
        self.value = np.zeros((batch_size, ) + shape, np.float32)

        neurons = np.empty(shape, dtype='object')
        for position in np.ndindex(neurons.shape):
            neurons[position] = DataNeuron()

        # Both array arguments share the exact dtype, rank and shape of value.
        value_array_type = np.ctypeslib.ndpointer(
            np.float32, self.value.ndim, self.value.shape)
        c_signature = ctypes.CFUNCTYPE(
            None, value_array_type, value_array_type,
            ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int)
        self.reorder_4d_5d = module.get_callable("reorder_4d_5d", c_signature)

        super().__init__(neurons)

    def forward(self, value):
Exemple #6
0
    def transform(self, tree, program_cfg):
        """Assemble the C source for a blocked double-precision matrix multiply.

        ``program_cfg`` is an ``(arg_cfg, tune_cfg)`` pair.  ``arg_cfg`` is
        indexed by ``'n'``, ``'dtype'`` and ``'transA'``; ``tune_cfg`` is
        indexed by ``'rx'``, ``'ry'``, ``'cx'`` and ``'cy'``.  Each tuning
        value is multiplied by 4 before being substituted into the templates.

        The incoming ``tree`` argument is never read; the name is rebound to
        the generated file at the end.

        Returns a one-element list holding a ``C.CFile`` named ``"generated"``
        that contains, in order: the preamble, ``wall_time``,
        ``register_dgemm``, ``fast_dgemm``, ``fringe_dgemm`` and ``dgemm``.
        """
        arg_cfg, tune_cfg = program_cfg
        # TODO: These should be tunables
        rx, ry = tune_cfg['rx'] * 4, tune_cfg['ry'] * 4
        cx, cy = tune_cfg['cx'] * 4, tune_cfg['cy'] * 4
        # NOTE(review): unroll is derived from tune_cfg['ry'] (so it always
        # equals ry), not from a key of its own -- confirm this is intended.
        unroll = tune_cfg['ry'] * 4
        n, dtype = arg_cfg['n'], arg_cfg['dtype']
        # A single instance of a 2-D (n, n) ndpointer type, shared by the
        # A, B and C symbol references below.
        array_type = np.ctypeslib.ndpointer(dtype, 2, (n, n))()

        A = C.SymbolRef("A", array_type)
        B = C.SymbolRef("B", array_type)
        _C = C.SymbolRef("C", array_type)

        N = C.Constant(n)
        RX, RY = C.Constant(rx), C.Constant(ry)
        CX, CY = C.Constant(cx), C.Constant(cy)
        UNROLL = C.Constant(unroll)

        # Substitutions common to the templates below.  Every template is
        # given its own deep copy (presumably so that nodes are not shared
        # between templates -- TODO confirm against ctree's requirements).
        template_args = {
            "A_decl": A.copy(declare=True),
            "B_decl": B.copy(declare=True),
            "C_decl": _C.copy(declare=True),
            "RX": RX,
            "RY": RY,
            "CX": CX,
            "CY": CY,
            "UNROLL": UNROLL,
            "lda": N,
        }

        # The preamble text contains no $placeholders; the substitution dict
        # is passed anyway.
        preamble = StringTemplate(
            """
        #include <immintrin.h>
        #define min(x,y) (((x)<(y))?(x):(y))
        """, copy.deepcopy(template_args))

        # Extra substitutions for register_dgemm, produced by helper methods
        # of this class that are not shown here.
        reg_template_args = {
            'load_c_block':
            self._gen_load_c_block(rx, ry, n),
            'store_c_block':
            self._gen_store_c_block(rx, ry, n),
            'k_rank1_updates':
            self._gen_k_rank1_updates(rx, ry, cx, cy, unroll, n),
        }
        reg_template_args.update(copy.deepcopy(template_args))

        register_dgemm = StringTemplate(
            """
        void register_dgemm( $A_decl, $B_decl, $C_decl, int K )  {
            __m256d c[$RY/4][$RX];
            $load_c_block
            while ( K >= $UNROLL ) {
              $k_rank1_updates
              A += $UNROLL*$CY;
              B += $UNROLL;
              K -= $UNROLL;
            }
            $store_c_block
        }
        """, reg_template_args)

        # get_load_a_block receives template_args itself (not a copy); the
        # deep copy is only merged in afterwards.
        fast_dgemm_args = {
            "LOAD_A_BLOCK": self.get_load_a_block(arg_cfg['transA'],
                                                  template_args)
        }
        fast_dgemm_args.update(copy.deepcopy(template_args))

        fast_dgemm = StringTemplate(
            """
        void fast_dgemm( int M, int N, int K, $A_decl, $B_decl, $C_decl ) {
            static double a[$CX*$CY] __attribute__ ((aligned (16)));
            $LOAD_A_BLOCK
            //  multiply using the copy
            for( int j = 0; j < N; j += $RX )
                for( int i = 0; i < M; i += $RY )
                    register_dgemm( a + i, B + j*$lda, C + i + j*$lda, K );
        }""", fast_dgemm_args)

        # Plain triple-loop fallback; dgemm calls it for blocks whose sizes
        # are not multiples of RY / RX / UNROLL.
        fringe_dgemm = StringTemplate(
            """
        void fringe_dgemm( int M, int N, int K, $A_decl, $B_decl, $C_decl )
        {
            for( int j = 0; j < N; j++ )
               for( int i = 0; i < M; i++ )
                    for( int k = 0; k < K; k++ )
                         C[i+j*$lda] += A[i+k*$lda] * B[k+j*$lda];
        }
        """, copy.deepcopy(template_args))

        # Timer helper used by dgemm; it needs no substitutions.
        wall_time = StringTemplate(
            """
        #include <sys/time.h>
        double wall_time () {
          struct timeval t;
          gettimeofday (&t, NULL);
          return 1.*t.tv_sec + 1.e-6*t.tv_usec;
        }
        """, {})

        # Entry point: walks the matrices block by block, dispatching each
        # block to fast_dgemm or fringe_dgemm, and writes the elapsed wall
        # time to *duration.
        dgemm = StringTemplate(
            """
        int align( int x, int y ) { return x <= y ? x : (x/y)*y; }
        void dgemm($C_decl, $A_decl, $B_decl, double *duration) {
            double start_time = wall_time();
            for( int i = 0; i < $lda; ) {
                int I = align( min( $lda-i, $CY ), $RY );
                for( int j = 0; j < $lda; ) {
                    int J = align( $lda-j, $RX );
                    for( int k = 0; k < $lda; ) {
                        int K = align( min( $lda-k, $CX ), $UNROLL );
                        if( (I%$RY) == 0 && (J%$RX) == 0 && (K%$UNROLL) == 0 )
                            fast_dgemm ( I, J, K, A + i + k*$lda, B + k +
                               j*$lda, C + i + j*$lda );
                        else
                            fringe_dgemm( I, J, K, A + i + k*$lda, B + k +
                               j*$lda, C + i + j*$lda );
                        k += K;
                    }
                    j += J;
                }
                i += I;
            }
            // report time back for tuner
            *duration = wall_time() - start_time;
        }
        """, copy.deepcopy(template_args))

        # Each function is listed before the functions that call it.
        tree = C.CFile("generated", [
            preamble,
            wall_time,
            register_dgemm,
            fast_dgemm,
            fringe_dgemm,
            dgemm,
        ])
        return [tree]
Exemple #7
0
                parallel_strategy)
    parallel_strategy = "OPENMP"

# Optional thread-count override; os.getenv returns a string, or None when
# LATTE_NUM_THREADS is unset.
nthreads = os.getenv("LATTE_NUM_THREADS", None)
if parallel_strategy == "OPENCL_SIMPLE_LOOP":
    # pycl is imported only for this strategy; a context and a command queue
    # are created as module-level globals.
    import pycl as cl
    cl_ctx = cl.clCreateContext()
    cl_queue = cl.clCreateCommandQueue(cl_ctx)
elif parallel_strategy in ["SIMPLE_LOOP"
                           ] or parallel_strategy in ["FLOWGRAPH_LOOP"]:
    # Both loop strategies generate runtime/runtime.cpp, with the directory
    # of this module substituted for LATTE_PACKAGE_PATH in the template.
    package_path = os.path.dirname(os.path.abspath(__file__))
    _file = FileTemplate(
        os.path.dirname(os.path.abspath(__file__)) + "/runtime/runtime.cpp",
        {"LATTE_PACKAGE_PATH": StringTemplate(package_path)})

    c_file = C.CFile("runtime", [_file])
    module = util.mpi_compile(ctree.nodes.Project([c_file]))
    # init_nthreads takes one C int; init_default takes no arguments.
    # Neither returns a value.
    init_nthreads = module.get_callable("init_nthreads",
                                        ctypes.CFUNCTYPE(None, ctypes.c_int))
    init_default = module.get_callable("init_default", ctypes.CFUNCTYPE(None))
    if nthreads is not None:
        init_nthreads(int(nthreads))
    else:
        init_default()
    if parallel_strategy in ["FLOWGRAPH_LOOP"]:
        # The environment value is a string while the fallback 16 is an int;
        # the int() call below normalises both cases.
        img_block_size = os.getenv("LATTE_PIPELINE_BLOCK_SIZE", 16)
        img_block_size = int(img_block_size)
elif parallel_strategy == "OPENMP":
    if nthreads is not None:
        # nthreads is still the raw environment string here, which is the
        # type os.environ requires.
        os.environ["OMP_NUM_THREADS"] = nthreads