Example #1
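A unit test for SoftmaxForward: for every row of a (6, 13, 1) input, the 13 outputs must sum to one and match exp(x) / sum(exp(x)) elementwise to within 1e-4.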
    def test_forward(self):
        @compose
        def fn(bottom, label, top):
            top = SoftmaxForward(bottom, label)
            return top

        bottom = hm.random((6, 13, 1), _range=(-5, 5))
        label = hmarray((6, ))
        for i in range(6):
            label[i] = i % 6
        top = hmarray((6, 13, 1))
        fn(bottom, label, top)

        top.sync_host()
        for i in range(6):
            sum = 0
            for c in range(13):
                sum += top[i, c, 0]
            self.assertTrue(sum > .999)
            self.assertTrue(sum < 1.001)
            scale = 0
            for c in range(13):
                scale += exp(bottom[i, c, 0])
            for c in range(13):
                self.assertGreaterEqual(top[i, c, 0] + 1e-4,
                                        exp(bottom[i, c, 0]) / scale)
                self.assertLessEqual(top[i, c, 0] - 1e-4,
                                     exp(bottom[i, c, 0]) / scale)
Example #3
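The set_up method of a data layer: it parses one protobuf Datum from the cursor to discover the blob dimensions, applies the optional square crop, and allocates the batch data and label arrays.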
 def set_up(self):
     datum = pb.Datum()
     datum.ParseFromString(next(self.cursor)[1])
     self.mean = self.mean.reshape(datum.channels, datum.height,
                                   datum.width)
     height, width = datum.height, datum.width
     if self.crop_size:
         height, width = self.crop_size, self.crop_size
     self.data = hmarray((self.batch_size, datum.channels, height, width))
     self.label = hmarray((self.batch_size, ))
     return [(self.data, None), (self.label, None)]
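The next fragment tests LrnForward against the pure-Python reference_lrn shown in Example #13.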
    def test_simple(self):
        a = hm.random((3, 16, 27, 27))
        scale = hmarray((3, 16, 27, 27))
        actual = hmarray((3, 16, 27, 27))

        @compose
        def fn(bottom, scale, top):
            top, scale = LrnForward(bottom, alpha=alpha, beta=beta,
                                    local_size=local_size, k=1)
            return top, scale

        fn(a, scale, actual)
        actual.sync_host()
        expected = reference_lrn(a)
        self._check(actual, expected)
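A test of ReluForward and ReluBackward against NumPy: the forward pass clamps negative inputs to zero, and the backward pass zeroes the gradient wherever the corresponding input was negative.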
    def test_relu(self):
        top = hm.zeros((4, 12, 15, 15))
        bottom = hm.random((4, 12, 15, 15), _range=(-1, 1))

        @compose
        def fn(bottom, top):
            top = ReluForward(bottom)
            return top

        fn(bottom, top)
        top.sync_host()

        expected = np.copy(bottom)
        expected[expected < 0] = 0
        self._check(top, expected)

        top_diff = hm.random(top.shape)
        bottom_diff = hmarray(top.shape)

        @compose
        def fn(top_diff, bottom, bottom_diff):
            bottom_diff = ReluBackward(bottom, top_diff)
            return bottom_diff

        fn(top_diff, bottom, bottom_diff)
        bottom_diff.sync_host()

        expected = np.copy(top_diff)
        expected[bottom < 0] = 0
        self._check(bottom_diff, expected)
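A launcher for the inner-product forward pass: the first sgemm computes top = bottom x weights^T, and the second adds the bias to every row via an outer product with a vector of ones.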
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            top = symbol_table[sinks[0].name]
            bias_multiplier = hmarray((1, bottom.shape[0]))
            bias_multiplier.fill(1)
            bias_multiplier.sync_ocl()
            N = top.shape[1]
            K = np.prod(bottom.shape[1:])
            M = bottom.shape[0]
            class InnerProductLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    evt = sgemm(False, True, 1.0, bottom, 0, K, weights,
                                0, K, 0.0, top, 0, N, M, N, K, wait_for=wait_for)
                    evt = sgemm(False, False, 1.0, bias_multiplier, 0,
                                1, bias, 0, N, 1.0, top, 0, N, M, N,
                                1, wait_for=evt)
                    return [evt]
            return InnerProductLauncher(sources, sinks)
Example #8
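Unit tests for 2x2, stride-2 pooling: the average- and max-pooling forward passes (and the max-pooling backward pass) are checked against NumPy reference implementations.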
    def test_avg(self):
        shape = (3, 16, 24, 24)
        a = hm.random(shape, _range=(0, 255))
        actual_mask = hmarray((3, 16, 12, 12))
        actual = hmarray((3, 16, 12, 12))
        expected = hmarray((3, 16, 12, 12))
        expected.fill(float('-inf'))

        @compose
        def fn(bottom, mask, top):
            top = AvePoolForward(bottom, kernel_size=(2, 2),
                                 padding=(0, 0), stride=(2, 2))
            return top

        fn(a, actual_mask, actual)
        actual.sync_host()
        reference_ave_pool(a, expected, (2, 2), (2, 2), (0, 0))
        self._check(actual, expected)
    def test_pool(self):
        shape = (3, 16, 24, 24)
        a = hm.random(shape, _range=(0, 255))
        actual_mask = hmarray((3, 16, 12, 12))
        actual = hmarray((3, 16, 12, 12))
        expected_mask = hmarray((3, 16, 12, 12))
        expected = hmarray((3, 16, 12, 12))
        expected.fill(float('-inf'))

        @compose
        def fn(bottom, mask, top):
            top, mask = PoolForward(bottom, kernel_size=(2, 2),
                                    padding=(0, 0), stride=(2, 2))
            return top, mask

        fn(a, actual_mask, actual)
        actual.sync_host()
        actual_mask.sync_host()
        reference_pool(a, expected, expected_mask, (2, 2), (2, 2), (0, 0))
        self._check(actual, expected)
        self._check(actual_mask, expected_mask)
        bottom_diff = hm.zeros(shape)
        expected_bottom_diff = hm.zeros(shape)
        mask = actual_mask
        top_diff = hm.random((3, 16, 12, 12))

        @compose
        def fn(top_diff, mask, bottom_diff):
            bottom_diff = PoolBackward(top_diff, mask,
                                       kernel_size=(2, 2),
                                       padding=(0, 0),
                                       stride=(2, 2))
            return bottom_diff

        fn(top_diff, mask, bottom_diff)
        bottom_diff.sync_host()
        reference_pool_backward(top_diff, mask, expected_bottom_diff,
                                (2, 2), (2, 2), (0, 0))
        self._check(bottom_diff, expected_bottom_diff)
Example #13
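A pure-Python reference implementation of local response normalization, used to validate the LrnForward kernel; alpha, beta, and local_size are free variables supplied by the surrounding test module.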
def reference_lrn(blob):
    output = hmarray(blob.shape)
    shape = blob.shape
    for n in range(shape[0]):
        for c in range(shape[1]):
            for h in range(shape[2]):
                for w in range(shape[3]):
                    c_start = c - (local_size - 1) // 2
                    c_end = min(c_start + local_size, blob.shape[1])
                    c_start = max(c_start, 0)
                    scale = 1.0
                    for i in range(c_start, c_end):
                        value = blob[n, i, h, w]
                        scale += value * value * alpha * local_size
                    output[n, c, h, w] = \
                        blob[n, c, h, w] / pow(scale, beta)
    return output
Example #15
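The set_up method of an inner-product layer: weights are initialized uniformly in (-scale, scale) with scale = 1 / sqrt(num_output), and the diff and history buffers used by the solver are allocated alongside the top blobs.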
 def set_up(self, bottom, bottom_diff):
     self.bottom, self.bottom_diff = bottom, bottom_diff
     N = self.num_output
     K = np.prod(bottom.shape[1:])
     scale = 1.0 / np.sqrt(self.num_output)
     if self.weights is None:
         self.weights = hmarray.random((N, K), _range=(-scale, scale))
     self.weights_diff = hmarray.zeros((N, K))
     self.weights_history = hmarray.zeros((N, K))
     self.bias_diff = hmarray.zeros((self.num_output, ))
     self.bias_history = hmarray.zeros((self.num_output, ))
     self.bias_multiplier = hmarray((1, self.bottom.shape[0]))
     self.bias_multiplier.fill(1)
     self.bias_multiplier.sync_ocl()
     top_shape = (bottom.shape[0], N)
     self.top = hmarray.zeros(top_shape)
     self.top_diff = hmarray.zeros(top_shape)
     return [(self.top, self.top_diff)]
Example #16
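A launcher for the convolution backward pass: templated im2col/col2im OpenCL kernels bracket three BLAS calls that accumulate, one image at a time, the bias gradient (sgemv), the weight gradient (top_diff x col^T), and the bottom gradient (weights^T x top_diff, scattered back by col2im).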
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        kernel_h, kernel_w = keywords['kernel_size']
        pad_h, pad_w = keywords['padding']
        stride_h, stride_w = keywords['stride']
        num, channels, height, width = symbol_table[sources[0]].shape
        channels_col = channels * kernel_h * kernel_w
        height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
        width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
        col_data = hmarray((channels_col, height_col * width_col))
        bias_multiplier = hmarray(
            (1, np.prod(symbol_table[sinks[0]].shape[2:])))
        bias_multiplier.fill(1.0)
        bias_multiplier.sync_ocl()

        im2col_global_size = channels * height_col * width_col
        col2im_global_size = channels * height * width

        kernels = Template("""
__kernel void col2im(global float* data_col, global float* data_im,
                     int im_offset) {
  if (get_global_id(0) < $col2im_global_size) {
    int index = get_global_id(0);
    float val = 0;
    int w = index % $width + $pad_w;
    int h = (index / $width) % $height + $pad_h;
    int c = index / ($width * $height);
    // compute the start and end of the output
    int w_col_start = (w < $kernel_w) ? 0 : (w - $kernel_w) / $stride_w + 1;
    int w_col_end = min(w / $stride_w + 1, $width_col);
    int h_col_start = (h < $kernel_h) ? 0 : (h - $kernel_h) / $stride_h + 1;
    int h_col_end = min(h / $stride_h + 1, $height_col);
    // equivalent implementation
    int offset = (c * $kernel_h * $kernel_w + h * $kernel_w + w) * \
          $height_col * $width_col;
    int coeff_h_col = (1 - $stride_h * $kernel_w * $height_col) * \
          $width_col;
    int coeff_w_col = (1 - $stride_w * $height_col * $width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[im_offset + index] = val;
  }
}
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int w_out = index % $width_col;
    int h_index = index / $width_col;
    int h_out = h_index % $height_col;
    int channel_in = h_index / $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size,
                stride_h=stride_h,
                stride_w=stride_w,
                pad_h=pad_h,
                pad_w=pad_w,
                kernel_h=kernel_h,
                kernel_w=kernel_w,
                width=width,
                height=height,
                height_col=height_col,
                width_col=width_col,
                col2im_global_size=col2im_global_size)

        program = cl.clCreateProgramWithSource(context, kernels).build()
        im2col = program['im2col']
        im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
        col2im = program['col2im']
        col2im.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

        class ConvLauncher(object):
            def __init__(self, sources, sinks):
                self.sources = [ast.Name(s, ast.Load()) for s in sources]
                self.sinks = [ast.Name(s, ast.Load()) for s in sinks]

            def compile(self):
                pass

            def launch(self, symbol_table, wait_for):
                queue = queues[0]
                bottom = symbol_table[sources[0]]
                bot_offset = np.prod(bottom.shape[1:])
                top_diff = symbol_table[sources[1]]
                top_offset = np.prod(top_diff.shape[1:])
                weights = symbol_table[sources[2]]
                bottom_diff = symbol_table[sinks[0]]
                bottom_diff.fill(0)
                bottom_diff.sync_ocl()
                weights_diff = symbol_table[sinks[1]]
                weights_diff.fill(0)
                weights_diff.sync_ocl()
                bias_diff = symbol_table[sinks[2]]
                bias_diff.fill(0)
                bias_diff.sync_ocl()
                for i in range(bottom.shape[0]):
                    n = np.prod(top_diff.shape[2:])
                    sgemv(False, top_diff.shape[1], n, 1.0, top_diff,
                          i * top_offset, n, bias_multiplier, 0, 1, 1.0,
                          bias_diff, 0, 1)
                    im2col(bottom.ocl_buf, col_data.ocl_buf,
                           i * bot_offset).on(queue, im2col_global_size)
                    m = top_diff.shape[1]
                    n = col_data.shape[0]
                    k = col_data.shape[1]

                    sgemm(False, True, 1.0, top_diff, i * top_offset, k,
                          col_data, 0, k, 1.0, weights_diff, 0, n, m, n, k)

                    m = weights.shape[1]
                    n = col_data.shape[1]
                    k = weights.shape[0]

                    sgemm(True, False, 1.0, weights, 0, m, top_diff,
                          i * top_offset, n, 0.0, col_data, 0, n, m, n, k)
                    col2im(col_data.ocl_buf, bottom_diff.ocl_buf,
                           i * bot_offset).on(queue, col2im_global_size)

        return ConvLauncher(sources, sinks)
Example #17
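A launcher for SoftmaxWithLoss: a chain of OpenCL kernels computes a numerically stable softmax (copy, channel max, subtract, exp, channel sum, divide), SoftmaxLossForward takes -log of the probability at each label, and the per-position losses are summed on the host and averaged over the batch.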
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        bottom = symbol_table[sources[0]]
        num = bottom.shape[0]
        channels = bottom.shape[1]
        scale_shape = list(bottom.shape)
        scale_shape[1] = 1
        scale = hmarray(tuple(scale_shape))
        loss = hmarray(scale_shape)
        spatial_dim = int(np.prod(bottom.shape[2:]))
        count = np.prod(bottom.shape)

        kernels = Template("""
// @begin=cl@
__kernel void kernel_copy(global const float* data,
                          global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = data[index];
  }
}
__kernel void kernel_channel_max(global const float* data,
                                 global float* out) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float maxval = -FLT_MAX;
    for (int c = 0; c < $channels; ++c) {
      maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
    }
    out[index] = maxval;
  }
}
__kernel void kernel_channel_subtract(global const float* channel_max,
                                      global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] -= channel_max[n * $spatial_dim + s];
  }
}
__kernel void kernel_exp(global const float* data, global float* out) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    out[index] = exp(data[index]);
  }
}
__kernel void kernel_channel_sum(global const float* data,
                                 global float* channel_sum) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    int n = index / $spatial_dim;
    int s = index % $spatial_dim;
    float sum = 0;
    for (int c = 0; c < $channels; ++c) {
      sum += data[(n * $channels + c) * $spatial_dim + s];
    }
    channel_sum[index] = sum;
  }
}
__kernel void kernel_channel_div(global const float* channel_sum,
                                 global float* data) {
  if (get_global_id(0) < $count) {
    int index = get_global_id(0);
    int n = index / $channels / $spatial_dim;
    int s = index % $spatial_dim;
    data[index] /= channel_sum[n * $spatial_dim + s];
  }
}
__kernel void SoftmaxLossForward(global const float* prob_data,
    global const float* label, global float* loss) {
  if (get_global_id(0) < $num_times_spatial) {
    int index = get_global_id(0);
    const int n = index / $spatial_dim;
    const int s = index % $spatial_dim;
    const int label_value = (int) label[n * $spatial_dim + s];
    loss[index] = -log(
        max(prob_data[n * $dim + label_value * $spatial_dim + s],
            FLT_MIN));
  }
}
// @end=cl@
""").substitute(count=count, num_times_spatial=num * spatial_dim,
                channels=channels, spatial_dim=spatial_dim,
                dim=np.prod(bottom.shape[1:]))

        program = cl.clCreateProgramWithSource(context, kernels).build()
        copy_kern = program['kernel_copy']
        copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        max_kern = program['kernel_channel_max']
        max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        sub_kern = program['kernel_channel_subtract']
        sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        exp_kern = program['kernel_exp']
        exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        sum_kern = program['kernel_channel_sum']
        sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        div_kern = program['kernel_channel_div']
        div_kern.argtypes = (cl.cl_mem, cl.cl_mem)
        loss_forward = program['SoftmaxLossForward']
        loss_forward.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_mem)

        class SoftmaxLauncher(object):
            def compile(self):
                pass

            def launch(self, symbol_table):
                bottom = symbol_table[sources[0]]
                label = symbol_table[sources[1]]
                prob = symbol_table[sources[2]]
                top = symbol_table[sinks[0]]
                copy_kern(bottom.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                max_kern(prob.ocl_buf, scale.ocl_buf).on(queue,
                                                         (num * spatial_dim, ))
                sub_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                exp_kern(prob.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                sum_kern(prob.ocl_buf, scale.ocl_buf).on(queue,
                                                         (num * spatial_dim,))
                div_kern(scale.ocl_buf, prob.ocl_buf).on(queue, (count, ))
                loss_forward(prob.ocl_buf, label.ocl_buf,
                             loss.ocl_buf).on(queue, (num * spatial_dim, ))
                loss.sync_host()
                top[0] = np.sum(loss) / np.float32(num)
                top.sync_ocl()

        return SoftmaxLauncher()
Example #18
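A softmax launcher in pure OpenCL. Global work sizes are rounded up to a multiple of 16, so every kernel guards its body with a bounds check on get_global_id(0).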
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            num = bottom.shape[0]
            channels = bottom.shape[1]
            scale_shape = list(bottom.shape)
            scale_shape[1] = 1
            scale = hmarray(tuple(scale_shape))
            spatial_dim = int(np.prod(bottom.shape[2:]))
            count = np.prod(bottom.shape)

            kernels = Template("""
    __kernel void kernel_copy(global const float* data,
                              global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = data[index];
      }
    }
    __kernel void kernel_channel_max(global const float* data,
                                     global float* out) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float maxval = -FLT_MAX;
        for (int c = 0; c < $channels; ++c) {
          maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
        }
        out[index] = maxval;
      }
    }
    __kernel void kernel_channel_subtract(global const float* channel_max,
                                          global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] -= channel_max[n * $spatial_dim + s];
      }
    }
    __kernel void kernel_exp(global const float* data, global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = exp(data[index]);
      }
    }
    __kernel void kernel_channel_sum(global const float* data,
                                     global float* channel_sum) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float sum = 0;
        for (int c = 0; c < $channels; ++c) {
          sum += data[(n * $channels + c) * $spatial_dim + s];
        }
        channel_sum[index] = sum;
      }
    }
    __kernel void kernel_channel_div(global const float* channel_sum,
                                     global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] /= channel_sum[n * $spatial_dim + s];
      }
    }
    """).substitute(count=count,
                    num_times_spatial=num * spatial_dim,
                    channels=channels,
                    spatial_dim=spatial_dim,
                    dim=np.prod(bottom.shape[1:]))

            program = cl.clCreateProgramWithSource(context, kernels).build()
            copy_kern = program['kernel_copy']
            copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            max_kern = program['kernel_channel_max']
            max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sub_kern = program['kernel_channel_subtract']
            sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            exp_kern = program['kernel_exp']
            exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sum_kern = program['kernel_channel_sum']
            sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            div_kern = program['kernel_channel_div']
            div_kern.argtypes = (cl.cl_mem, cl.cl_mem)

            class SoftmaxLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    if count % 16:
                        padded_count = (count + 15) & (~15)
                    else:
                        padded_count = count
                    num_times_spatial = num * spatial_dim
                    if num_times_spatial % 16:
                        padded_num_times_spatial = (num_times_spatial +
                                                    15) & (~15)
                    else:
                        padded_num_times_spatial = num_times_spatial
                    evt = copy_kern(bottom.ocl_buf,
                                    top.ocl_buf).on(queue, (padded_count, ),
                                                    wait_for=wait_for)
                    evt = max_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = sub_kern(scale.ocl_buf,
                                   top.ocl_buf).on(queue, (padded_count, ),
                                                   wait_for=evt)
                    evt = exp_kern(top.ocl_buf,
                                   top.ocl_buf).on(queue, (padded_count, ),
                                                   wait_for=evt)
                    evt = sum_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = div_kern(scale.ocl_buf,
                                   top.ocl_buf).on(queue, (padded_count, ),
                                                   wait_for=evt)
                    return [evt]

            return SoftmaxLauncher(sources, sinks)
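For reference, the kernel chain above implements a numerically stable softmax over the channel axis. A minimal NumPy sketch of the same pipeline (softmax_reference is a hypothetical helper, assuming a (num, channels, height, width) blob):

import numpy as np

def softmax_reference(bottom):
    # kernel_channel_max + kernel_channel_subtract: shift by the channel max
    shifted = bottom - bottom.max(axis=1, keepdims=True)
    # kernel_exp
    e = np.exp(shifted)
    # kernel_channel_sum + kernel_channel_div: normalize over channels
    return e / e.sum(axis=1, keepdims=True)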
Example #19
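A launcher for the convolution forward pass. For 1x1 kernels with unit stride and no padding the im2col lowering is skipped and sgemm runs directly on the input; otherwise each image is first lowered to a (channels_col, height_col * width_col) column matrix. Work is round-robined across the available command queues.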
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[
                sinks[0].name].shape[1:]
            is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
                     stride_w == 1 and pad_h == 0 and pad_w == 0
            if not is_1x1:
                col_datas = [
                    hmarray((channels_col, height_col * width_col))
                    for _ in range(len(queues))
                ]
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    __kernel void im2col(global const float* data_im, global float* data_col,
                         int bot_offset) {
      if (get_global_id(0) < $global_size) {
        int index = get_global_id(0);
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        global float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        global const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        #pragma unroll
        for (int i = 0; i < $kernel_h; ++i) {
          #pragma unroll
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size,
                    stride_h=stride_h,
                    stride_w=stride_w,
                    pad_h=pad_h,
                    pad_w=pad_w,
                    kernel_h=kernel_h,
                    kernel_w=kernel_w,
                    width=width,
                    height=height,
                    height_col=height_col,
                    width_col=width_col)

            im2col = cl.clCreateProgramWithSource(context,
                                                  im2col).build()['im2col']
            im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

            if im2col_global_size % 64:
                padded = (im2col_global_size + 63) & (~63)
            else:
                padded = im2col_global_size

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    top_offset = np.prod(top.shape[1:])
                    m = weights.shape[0]
                    n = np.prod(top.shape[2:])
                    k = np.prod(weights.shape[1:])
                    # cl.clFinish(queues[0])
                    evts = []
                    if is_1x1:
                        for i in range(bottom.shape[0]):
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        weights,
                                        0,
                                        k,
                                        bottom,
                                        i * bot_offset,
                                        n,
                                        0.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        k,
                                        queues[i % len(queues)],
                                        wait_for=wait_for)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        bias,
                                        0,
                                        1,
                                        bias_multiplier,
                                        0,
                                        n,
                                        1.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        1,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evts.append(evt)
                    else:
                        for i in range(bottom.shape[0]):
                            evt = im2col(bottom.ocl_buf,
                                         col_datas[i % len(queues)].ocl_buf,
                                         i * bot_offset).on(
                                             queues[i % len(queues)],
                                             (padded, ),
                                             wait_for=wait_for)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        weights,
                                        0,
                                        k,
                                        col_datas[i % len(queues)],
                                        0,
                                        n,
                                        0.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        k,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evt = sgemm(False,
                                        False,
                                        1.0,
                                        bias,
                                        0,
                                        1,
                                        bias_multiplier,
                                        0,
                                        n,
                                        1.0,
                                        top,
                                        i * top_offset,
                                        n,
                                        m,
                                        n,
                                        1,
                                        queues[i % len(queues)],
                                        wait_for=evt)
                            evts.append(evt)
                    return evts
                    # for q in queues:
                    #     cl.clFinish(q)

            return ConvLauncher(sources, sinks)
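A minimal NumPy sketch of the lowering this launcher performs per image; im2col and conv_forward_image are hypothetical helpers, assuming weights of shape (M, C * kh * kw) and bias of shape (M,):

import numpy as np

def im2col(image, kh, kw, stride, pad):
    # image: (C, H, W) -> columns: (C * kh * kw, h_col * w_col)
    C, H, W = image.shape
    h_col = (H + 2 * pad - kh) // stride + 1
    w_col = (W + 2 * pad - kw) // stride + 1
    padded = np.pad(image, ((0, 0), (pad, pad), (pad, pad)))
    cols = np.empty((C * kh * kw, h_col * w_col), dtype=image.dtype)
    row = 0
    for c in range(C):
        for i in range(kh):
            for j in range(kw):
                patch = padded[c, i:i + stride * h_col:stride,
                               j:j + stride * w_col:stride]
                cols[row] = patch.ravel()
                row += 1
    return cols

def conv_forward_image(image, weights, bias, kh, kw, stride, pad):
    cols = im2col(image, kh, kw, stride, pad)
    # the same GEMM the launcher issues: top = weights @ cols + bias
    return weights @ cols + bias[:, None]

The following fragment wires up the top layers of an AlexNet-style network from a pretrained Caffe model, viewing each parameter blob as an hmarray and allocating output blobs to match the Caffe net's shapes.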
conv3 = hmarray.zeros(caffe_net.blobs["conv3"].data.shape)

conv4_filters = caffe_net.params["conv4"][0].data.view(hmarray)
conv4_bias = caffe_net.params["conv4"][1].data.view(hmarray)
conv4 = hmarray.zeros(caffe_net.blobs["conv4"].data.shape)

conv5_filters = caffe_net.params["conv5"][0].data.view(hmarray)
conv5_bias = caffe_net.params["conv5"][1].data.view(hmarray)
conv5 = hmarray.zeros(caffe_net.blobs["conv5"].data.shape)

pool5 = hmarray.zeros(caffe_net.blobs["pool5"].data.shape)
pool5_mask = hmarray.zeros(pool5.shape)

fc6_filters = caffe_net.params["fc6"][0].data.view(hmarray)
fc6_bias = caffe_net.params["fc6"][1].data.view(hmarray)
fc6_bias_multiplier = hmarray((1, pool5.shape[0]))
fc6_bias_multiplier.fill(1)
fc6_bias_multiplier.sync_ocl()
fc6 = hmarray.zeros(caffe_net.blobs["fc6"].data.shape)

fc7_filters = caffe_net.params["fc7"][0].data.view(hmarray)
fc7_bias = caffe_net.params["fc7"][1].data.view(hmarray)
fc7_bias_multiplier = hmarray((1, fc6.shape[0]))
fc7_bias_multiplier.fill(1)
fc7_bias_multiplier.sync_ocl()
fc7 = hmarray.zeros(caffe_net.blobs["fc7"].data.shape)

fc8_filters = caffe_net.params["fc8"][0].data.view(hmarray)
fc8_bias = caffe_net.params["fc8"][1].data.view(hmarray)
fc8_bias_multiplier = hmarray((1, fc7.shape[0]))
fc8_bias_multiplier.fill(1)
Example #21
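The same softmax pipeline lowered to C with OpenMP: each OpenCL kernel becomes a parallel for loop over the flattened index, the source is built with hm_compile_and_load, and ctypes argtypes are set from each array's dtype and shape before the call.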
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            num = bottom.shape[0]
            channels = bottom.shape[1]
            scale_shape = list(bottom.shape)
            scale_shape[1] = 1
            scale = hmarray(tuple(scale_shape))
            spatial_dim = int(np.prod(bottom.shape[2:]))
            count = np.prod(bottom.shape)

            kernels = Template("""
    #include <float.h>
    #include <math.h>
    void kernel_copy(float* data, float* out) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        out[index] = data[index];
      }
    }
    void kernel_channel_max(float* data, float* out) {
      #pragma omp parallel for
      for (int index = 0; index < $num_times_spatial; index++) {
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float maxval = -FLT_MAX;
        for (int c = 0; c < $channels; ++c) {
          maxval = fmax(data[(n * $channels + c) * $spatial_dim + s], maxval);
        }
        out[index] = maxval;
      }
    }
    void kernel_channel_subtract(float* channel_max, float* data) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] -= channel_max[n * $spatial_dim + s];
      }
    }
    void kernel_exp(float* data, float* out) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        out[index] = exp(data[index]);
      }
    }
    void kernel_channel_sum(float* data, float* channel_sum) {
      #pragma omp parallel for
      for (int index = 0; index < $num_times_spatial; index++) {
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float sum = 0;
        for (int c = 0; c < $channels; ++c) {
          sum += data[(n * $channels + c) * $spatial_dim + s];
        }
        channel_sum[index] = sum;
      }
    }
    void kernel_channel_div(float* channel_sum, float* data) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] /= channel_sum[n * $spatial_dim + s];
      }
    }
    """).substitute(count=count,
                    num_times_spatial=num * spatial_dim,
                    channels=channels,
                    spatial_dim=spatial_dim,
                    dim=np.prod(bottom.shape[1:]))
            lib = hm_compile_and_load(kernels)

            copy_kern = lib.kernel_copy
            max_kern = lib.kernel_channel_max
            sub_kern = lib.kernel_channel_subtract
            exp_kern = lib.kernel_exp
            sum_kern = lib.kernel_channel_sum
            div_kern = lib.kernel_channel_div

            class SoftmaxLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    copy_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [bottom, top])
                    copy_kern(bottom, top)
                    max_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [top, scale])
                    max_kern(top, scale)
                    sub_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [scale, top])
                    sub_kern(scale, top)
                    exp_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [top, top])
                    exp_kern(top, top)
                    sum_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [top, scale])
                    sum_kern(top, scale)
                    div_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [scale, top])
                    div_kern(scale, top)

            return SoftmaxLauncher(sources, sinks)
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            num = bottom.shape[0]
            channels = bottom.shape[1]
            scale_shape = list(bottom.shape)
            scale_shape[1] = 1
            scale = hmarray(tuple(scale_shape))
            spatial_dim = int(np.prod(bottom.shape[2:]))
            count = np.prod(bottom.shape)

            kernels = Template("""
    __kernel void kernel_copy(global const float* data,
                              global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = data[index];
      }
    }
    __kernel void kernel_channel_max(global const float* data,
                                     global float* out) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float maxval = -FLT_MAX;
        for (int c = 0; c < $channels; ++c) {
          maxval = max(data[(n * $channels + c) * $spatial_dim + s], maxval);
        }
        out[index] = maxval;
      }
    }
    __kernel void kernel_channel_subtract(global const float* channel_max,
                                          global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] -= channel_max[n * $spatial_dim + s];
      }
    }
    __kernel void kernel_exp(global const float* data, global float* out) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        out[index] = exp(data[index]);
      }
    }
    __kernel void kernel_channel_sum(global const float* data,
                                     global float* channel_sum) {
      if (get_global_id(0) < $num_times_spatial) {
        int index = get_global_id(0);
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float sum = 0;
        for (int c = 0; c < $channels; ++c) {
          sum += data[(n * $channels + c) * $spatial_dim + s];
        }
        channel_sum[index] = sum;
      }
    }
    __kernel void kernel_channel_div(global const float* channel_sum,
                                     global float* data) {
      if (get_global_id(0) < $count) {
        int index = get_global_id(0);
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] /= channel_sum[n * $spatial_dim + s];
      }
    }
    """).substitute(count=count, num_times_spatial=num * spatial_dim,
                    channels=channels, spatial_dim=spatial_dim,
                    dim=np.prod(bottom.shape[1:]))

            program = cl.clCreateProgramWithSource(context, kernels).build()
            copy_kern = program['kernel_copy']
            copy_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            max_kern = program['kernel_channel_max']
            max_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sub_kern = program['kernel_channel_subtract']
            sub_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            exp_kern = program['kernel_exp']
            exp_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            sum_kern = program['kernel_channel_sum']
            sum_kern.argtypes = (cl.cl_mem, cl.cl_mem)
            div_kern = program['kernel_channel_div']
            div_kern.argtypes = (cl.cl_mem, cl.cl_mem)

            class SoftmaxLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sources

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    if count % 16:
                        padded_count = (count + 15) & (~15)
                    else:
                        padded_count = count
                    num_times_spatial = num * spatial_dim
                    if num_times_spatial % 16:
                        padded_num_times_spatial = (num_times_spatial + 15) & (~15)
                    else:
                        padded_num_times_spatial = num_times_spatial
                    evt = copy_kern(bottom.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count,), wait_for=wait_for)
                    evt = max_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = sub_kern(scale.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count, ), wait_for=evt)
                    evt = exp_kern(top.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count, ), wait_for=evt)
                    evt = sum_kern(top.ocl_buf, scale.ocl_buf).on(
                        queue, (padded_num_times_spatial, ), wait_for=evt)
                    evt = div_kern(scale.ocl_buf, top.ocl_buf).on(
                        queue, (padded_count, ), wait_for=evt)
                    return [evt]

            return SoftmaxLauncher(sources, sinks)
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            bottom = symbol_table[sources[0].name]
            num = bottom.shape[0]
            channels = bottom.shape[1]
            scale_shape = list(bottom.shape)
            scale_shape[1] = 1
            scale = hmarray(tuple(scale_shape))
            spatial_dim = int(np.prod(bottom.shape[2:]))
            count = np.prod(bottom.shape)

            kernels = Template("""
    #include <float.h>
    #include <math.h>
    void kernel_copy(float* data, float* out) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        out[index] = data[index];
      }
    }
    void kernel_channel_max(float* data, float* out) {
      #pragma omp parallel for
      for (int index = 0; index < $num_times_spatial; index++) {
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float maxval = -FLT_MAX;
        for (int c = 0; c < $channels; ++c) {
          maxval = fmax(data[(n * $channels + c) * $spatial_dim + s], maxval);
        }
        out[index] = maxval;
      }
    }
    void kernel_channel_subtract(float* channel_max, float* data) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] -= channel_max[n * $spatial_dim + s];
      }
    }
    void kernel_exp(float* data, float* out) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        out[index] = exp(data[index]);
      }
    }
    void kernel_channel_sum(float* data, float* channel_sum) {
      #pragma omp parallel for
      for (int index = 0; index < $num_times_spatial; index++) {
        int n = index / $spatial_dim;
        int s = index % $spatial_dim;
        float sum = 0;
        for (int c = 0; c < $channels; ++c) {
          sum += data[(n * $channels + c) * $spatial_dim + s];
        }
        channel_sum[index] = sum;
      }
    }
    void kernel_channel_div(float* channel_sum, float* data) {
      #pragma omp parallel for
      for (int index = 0; index < $count; index++) {
        int n = index / $channels / $spatial_dim;
        int s = index % $spatial_dim;
        data[index] /= channel_sum[n * $spatial_dim + s];
      }
    }
    """).substitute(count=count, num_times_spatial=num * spatial_dim,
                    channels=channels, spatial_dim=spatial_dim,
                    dim=np.prod(bottom.shape[1:]))
            lib = hm_compile_and_load(kernels)

            copy_kern = lib.kernel_copy
            max_kern = lib.kernel_channel_max
            sub_kern = lib.kernel_channel_subtract
            exp_kern = lib.kernel_exp
            sum_kern = lib.kernel_channel_sum
            div_kern = lib.kernel_channel_div

            class SoftmaxLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    top = symbol_table[sinks[0].name]
                    copy_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [bottom, top])
                    copy_kern(bottom, top)
                    max_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [top, scale])
                    max_kern(top, scale)
                    sub_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [scale, top])
                    sub_kern(scale, top)
                    exp_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [top, top])
                    exp_kern(top, top)
                    sum_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [top, scale])
                    sum_kern(top, scale)
                    div_kern.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [scale, top])
                    div_kern(scale, top)

            return SoftmaxLauncher(sources, sinks)
Example #24
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[sinks[0].name].shape[1:]
            is_1x1 = kernel_w == 1 and kernel_h == 1 and stride_h == 1 and \
                     stride_w == 1 and pad_h == 0 and pad_w == 0
            if not is_1x1:
                col_datas = [hmarray((channels_col, height_col * width_col))
                            for _ in range(len(queues))]
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    __kernel void im2col(global const float* data_im, global float* data_col,
                         int bot_offset) {
      if (get_global_id(0) < $global_size) {
        int index = get_global_id(0);
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        global float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        global const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        #pragma unroll
        for (int i = 0; i < $kernel_h; ++i) {
          #pragma unroll
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size, stride_h=stride_h,
                    stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                    kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                    height=height, height_col=height_col,
                    width_col=width_col)

            im2col = cl.clCreateProgramWithSource(
                context, im2col
            ).build()['im2col']
            im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

            # Round the launch size up to a multiple of 64 so it divides
            # evenly into work-groups; the kernel's bounds check discards
            # the extra work-items.
            if im2col_global_size % 64:
                padded = (im2col_global_size + 63) & (~63)
            else:
                padded = im2col_global_size

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    top_offset = np.prod(top.shape[1:])
                    m = weights.shape[0]
                    n = np.prod(top.shape[2:])
                    k = np.prod(weights.shape[1:])
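                    # GEMM shapes per image: top[i] (m x n) =
                    # weights (m x k) . col (k x n), plus a rank-1 bias
                    # update via the ones-filled bias_multiplier.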
                    # cl.clFinish(queues[0])
                    evts = []
                    if is_1x1:
                        for i in range(bottom.shape[0]):
                            queue = queues[i % len(queues)]
                            evt = sgemm(False, False, 1.0, weights, 0, k,
                                        bottom, i * bot_offset, n, 0.0,
                                        top, i * top_offset, n, m, n, k,
                                        queue, wait_for=wait_for)
                            evt = sgemm(False, False, 1.0, bias, 0, 1,
                                        bias_multiplier, 0, n, 1.0,
                                        top, i * top_offset, n, m, n, 1,
                                        queue, wait_for=evt)
                            evts.append(evt)
                    else:
                        for i in range(bottom.shape[0]):
                            queue = queues[i % len(queues)]
                            col_data = col_datas[i % len(queues)]
                            evt = im2col(bottom.ocl_buf, col_data.ocl_buf,
                                         i * bot_offset
                                         ).on(queue, (padded, ),
                                              wait_for=wait_for)
                            evt = sgemm(False, False, 1.0, weights, 0, k,
                                        col_data, 0, n, 0.0,
                                        top, i * top_offset, n, m, n, k,
                                        queue, wait_for=evt)
                            evt = sgemm(False, False, 1.0, bias, 0, 1,
                                        bias_multiplier, 0, n, 1.0,
                                        top, i * top_offset, n, m, n, 1,
                                        queue, wait_for=evt)
                            evts.append(evt)
                    return evts
                    # for q in queues:
                    #     cl.clFinish(q)
            return ConvLauncher(sources, sinks)
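The launcher above lowers convolution to im2col followed by two sgemm calls per image (the weight GEMM, then a rank-1 bias update against the ones-filled bias_multiplier), with a fast path that skips im2col entirely for 1x1 / stride-1 / unpadded layers. A hedged NumPy sketch of the same lowering, assuming NCHW layout; conv_forward_im2col is an illustrative reference, not the snippet's API:

import numpy as np

def conv_forward_im2col(bottom, weights, bias, stride=(1, 1), pad=(0, 0)):
    # bottom:  (num, channels, height, width), NCHW
    # weights: (out_channels, channels, kernel_h, kernel_w)
    # bias:    (out_channels,)
    num, channels, height, width = bottom.shape
    out_c, _, kernel_h, kernel_w = weights.shape
    stride_h, stride_w = stride
    pad_h, pad_w = pad
    height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    padded = np.pad(bottom, ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
                    mode='constant')
    top = np.empty((num, out_c, height_col, width_col), dtype=bottom.dtype)
    w2d = weights.reshape(out_c, -1)  # the m x k operand of the first sgemm
    for i in range(num):
        # Build the exact layout the im2col kernel writes:
        # (channels * kernel_h * kernel_w, height_col * width_col)
        rows = []
        for c in range(channels):
            for kh in range(kernel_h):
                for kw in range(kernel_w):
                    patch = padded[i, c,
                                   kh:kh + stride_h * height_col:stride_h,
                                   kw:kw + stride_w * width_col:stride_w]
                    rows.append(patch.ravel())
        col = np.stack(rows)
        top[i] = w2d.dot(col).reshape(out_c, height_col, width_col)
        top[i] += bias[:, None, None]  # the rank-1 bias sgemm
    return top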
Example #25
0
    def get_launcher(cls, sources, sinks, keywords, symbol_table):
        kernel_h, kernel_w = keywords['kernel_size']
        pad_h, pad_w = keywords['padding']
        stride_h, stride_w = keywords['stride']
        num, channels, height, width = symbol_table[sources[0]].shape
        channels_col = channels * kernel_h * kernel_w
        height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
        width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
        col_data = hmarray((channels_col, height_col * width_col))
        bias_multiplier = hmarray(
            (1, np.prod(symbol_table[sinks[0]].shape[2:])))
        bias_multiplier.fill(1.0)
        bias_multiplier.sync_ocl()

        im2col_global_size = channels * height_col * width_col
        col2im_global_size = channels * height * width

        kernels = Template("""
__kernel void col2im(global float* data_col, global float* data_im,
                     int im_offset) {
  if (get_global_id(0) < $col2im_global_size) {
    int index = get_global_id(0);
    float val = 0;
    int w = index % $width + $pad_w;
    int h = (index / $width) % $height + $pad_h;
    int c = index / ($width * $height);
    // compute the start and end of the output
    int w_col_start = (w < $kernel_w) ? 0 : (w - $kernel_w) / $stride_w + 1;
    int w_col_end = min(w / $stride_w + 1, $width_col);
    int h_col_start = (h < $kernel_h) ? 0 : (h - $kernel_h) / $stride_h + 1;
    int h_col_end = min(h / $stride_h + 1, $height_col);
    // equivalent implementation
    int offset = (c * $kernel_h * $kernel_w + h * $kernel_w + w) * \
          $height_col * $width_col;
    int coeff_h_col = (1 - $stride_h * $kernel_w * $height_col) * \
          $width_col;
    int coeff_w_col = (1 - $stride_w * $height_col * $width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[im_offset + index] = val;
  }
}
__kernel void im2col(global const float* data_im, global float* data_col,
                     int bot_offset) {
  if (get_global_id(0) < $global_size) {
    int index = get_global_id(0);
    int w_out = index % $width_col;
    int h_index = index / $width_col;
    int h_out = h_index % $height_col;
    int channel_in = h_index / $height_col;
    int channel_out = channel_in * $kernel_h * $kernel_w;
    int h_in = h_out * $stride_h - $pad_h;
    int w_in = w_out * $stride_w - $pad_w;
    global float* data_col_ptr = data_col;
    data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
    global const float* data_im_ptr = data_im + bot_offset;
    data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
    for (int i = 0; i < $kernel_h; ++i) {
      for (int j = 0; j < $kernel_w; ++j) {
        int h = h_in + i;
        int w = w_in + j;
        *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
            data_im_ptr[i * $width + j] : 0;
        data_col_ptr += $height_col * $width_col;
      }
    }
  }
}
""").substitute(global_size=im2col_global_size, stride_h=stride_h,
                stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                height=height, height_col=height_col,
                width_col=width_col, col2im_global_size=col2im_global_size)

        program = cl.clCreateProgramWithSource(context, kernels).build()
        im2col = program['im2col']
        im2col.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)
        col2im = program['col2im']
        col2im.argtypes = (cl.cl_mem, cl.cl_mem, cl.cl_int)

        class ConvLauncher(object):
            def __init__(self, sources, sinks):
                self.sources = [ast.Name(s, ast.Load()) for s in sources]
                self.sinks = [ast.Name(s, ast.Load()) for s in sinks]

            def compile(self):
                pass

            def launch(self, symbol_table, wait_for):
                queue = queues[0]
                bottom = symbol_table[sources[0]]
                bot_offset = np.prod(bottom.shape[1:])
                top_diff = symbol_table[sources[1]]
                top_offset = np.prod(top_diff.shape[1:])
                weights = symbol_table[sources[2]]
                bottom_diff = symbol_table[sinks[0]]
                bottom_diff.fill(0)
                bottom_diff.sync_ocl()
                weights_diff = symbol_table[sinks[1]]
                weights_diff.fill(0)
                weights_diff.sync_ocl()
                bias_diff = symbol_table[sinks[2]]
                bias_diff.fill(0)
                bias_diff.sync_ocl()
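                # Per image: (1) bias grad by GEMV against the ones vector,
                # (2) weight grad = top_diff . col_data^T, (3) bottom grad =
                # weights^T . top_diff, scattered back with col2im.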
                for i in range(bottom.shape[0]):
                    n = np.prod(top_diff.shape[2:])
                    sgemv(False, top_diff.shape[1], n, 1.0,
                          top_diff, i * top_offset, n,
                          bias_multiplier, 0, 1, 1.0,
                          bias_diff, 0, 1)
                    im2col(bottom.ocl_buf, col_data.ocl_buf,
                           i * bot_offset).on(queue, im2col_global_size)
                    m = top_diff.shape[1]
                    n = col_data.shape[0]
                    k = col_data.shape[1]

                    sgemm(False, True, 1.0, top_diff, i * top_offset, k,
                          col_data, 0, k, 1.0,
                          weights_diff, 0, n, m, n, k)

                    m = weights.shape[1]
                    n = col_data.shape[1]
                    k = weights.shape[0]

                    sgemm(True, False, 1.0, weights, 0, m,
                          top_diff, i * top_offset, n, 0.0,
                          col_data, 0, n, m, n, k)
                    col2im(col_data.ocl_buf, bottom_diff.ocl_buf,
                           i * bot_offset).on(queue, col2im_global_size)

        return ConvLauncher(sources, sinks)
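Example #25 is the corresponding backward pass: the bias gradient comes from a GEMV against the ones vector, the weight gradient from top_diff times the im2col matrix transposed, and the bottom gradient from weights^T times top_diff scattered back by col2im. col2im is the adjoint of im2col; a hedged NumPy sketch (col2im_reference is a hypothetical helper, not the snippet's API):

import numpy as np

def col2im_reference(col, channels, height, width,
                     kernel=(3, 3), stride=(1, 1), pad=(0, 0)):
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w = pad
    height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
    width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
    padded = np.zeros((channels, height + 2 * pad_h, width + 2 * pad_w),
                      dtype=col.dtype)
    col = col.reshape(channels, kernel_h, kernel_w, height_col, width_col)
    for kh in range(kernel_h):
        for kw in range(kernel_w):
            # Scatter-add each kernel tap back onto its input window.
            padded[:, kh:kh + stride_h * height_col:stride_h,
                   kw:kw + stride_w * width_col:stride_w] += col[:, kh, kw]
    return padded[:, pad_h:pad_h + height, pad_w:pad_w + width]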
Example #26
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[
                sinks[0].name].shape[1:]
            col_data = hmarray((channels_col, height_col * width_col))
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    void im2col(float* data_im, float* data_col, int bot_offset) {
      #pragma omp parallel for
      for (int index = 0; index < $global_size; index++) {
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        for (int i = 0; i < $kernel_h; ++i) {
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size, stride_h=stride_h,
                    stride_w=stride_w, pad_h=pad_h, pad_w=pad_w,
                    kernel_h=kernel_h, kernel_w=kernel_w, width=width,
                    height=height, height_col=height_col,
                    width_col=width_col)
            lib = hm_compile_and_load(im2col)
            im2col = lib.im2col

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    im2col.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape) for p in
                        [bottom, col_data]) + (ct.c_int, )
                    if len(weights.shape) > 2:
                        weights = weights.reshape(weights.shape[0],
                                                  np.prod(weights.shape[1:]))
                    for i in range(bottom.shape[0]):
                        im2col(bottom, col_data, i * bot_offset)
                        top[i] = weights.dot(col_data).reshape(
                            weights.shape[0], height_col, width_col)
                        top[i] += bias[:, np.newaxis, np.newaxis]
            return ConvLauncher(sources, sinks)
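Example #26 swaps the OpenCL path for a CPU variant: only im2col is generated C (parallelized with OpenMP and built via hm_compile_and_load), while the GEMM stage falls back on NumPy's BLAS-backed dot. A hedged equivalence check, reusing the hypothetical conv_forward_im2col sketch from the note after Example #24:

import numpy as np

num, channels, height, width = 2, 3, 8, 8
out_c, kernel_h, kernel_w = 4, 3, 3
bottom = np.random.rand(num, channels, height, width).astype(np.float32)
weights = np.random.rand(out_c, channels,
                         kernel_h, kernel_w).astype(np.float32)
bias = np.random.rand(out_c).astype(np.float32)

expected = conv_forward_im2col(bottom, weights, bias,
                               stride=(1, 1), pad=(1, 1))
# After driving the launcher above with the same inputs into `top`:
# np.testing.assert_allclose(top, expected, rtol=1e-4, atol=1e-5)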
Example #27
0
conv3 = hmarray.zeros(caffe_net.blobs['conv3'].data.shape)

conv4_filters = caffe_net.params['conv4'][0].data.view(hmarray)
conv4_bias = caffe_net.params['conv4'][1].data.view(hmarray)
conv4 = hmarray.zeros(caffe_net.blobs['conv4'].data.shape)

conv5_filters = caffe_net.params['conv5'][0].data.view(hmarray)
conv5_bias = caffe_net.params['conv5'][1].data.view(hmarray)
conv5 = hmarray.zeros(caffe_net.blobs['conv5'].data.shape)

pool5 = hmarray.zeros(caffe_net.blobs['pool5'].data.shape)
pool5_mask = hmarray.zeros(pool5.shape)

fc6_filters = caffe_net.params['fc6'][0].data.view(hmarray)
fc6_bias = caffe_net.params['fc6'][1].data.view(hmarray)
fc6_bias_multiplier = hmarray((1, pool5.shape[0]))
fc6_bias_multiplier.fill(1)
fc6 = hmarray.zeros(caffe_net.blobs['fc6'].data.shape)

fc7_filters = caffe_net.params['fc7'][0].data.view(hmarray)
fc7_bias = caffe_net.params['fc7'][1].data.view(hmarray)
fc7_bias_multiplier = hmarray((1, fc6.shape[0]))
fc7_bias_multiplier.fill(1)
fc7 = hmarray.zeros(caffe_net.blobs['fc7'].data.shape)

fc8_filters = caffe_net.params['fc8'][0].data.view(hmarray)
fc8_bias = caffe_net.params['fc8'][1].data.view(hmarray)
fc8_bias_multiplier = hmarray((1, fc7.shape[0]))
fc8_bias_multiplier.fill(1)
fc8 = hmarray.zeros(caffe_net.blobs['fc8'].data.shape)
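The setup above stages an AlexNet-style tail: Caffe parameters are viewed as hmarrays without copying, output blobs are pre-allocated, and each fully connected layer gets a ones vector (bias_multiplier) sized to the batch for the rank-1 bias GEMM. A hedged NumPy sketch of how such buffers are typically consumed (fc_forward_reference is illustrative; the actual fc launcher is not part of this snippet):

import numpy as np

def fc_forward_reference(bottom, filters, bias, bias_multiplier):
    # bottom: (batch, ...) flattened; filters: (out_features, in_features)
    flat = bottom.reshape(bottom.shape[0], -1)
    top = flat.dot(filters.T)                     # the main GEMM
    top += bias_multiplier.reshape(-1, 1) * bias  # ones-vector bias update
    return top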
Example #28
0
        def get_launcher(cls, sources, sinks, keywords, symbol_table):
            kernel_h, kernel_w = keywords['kernel_size']
            pad_h, pad_w = keywords['padding']
            stride_h, stride_w = keywords['stride']
            num, channels, height, width = symbol_table[sources[0].name].shape
            channels_col = channels * kernel_h * kernel_w
            # height_col = (height + 2 * pad_h - kernel_h) // stride_h + 1
            # width_col = (width + 2 * pad_w - kernel_w) // stride_w + 1
            out_channels, height_col, width_col = symbol_table[
                sinks[0].name].shape[1:]
            col_data = hmarray((channels_col, height_col * width_col))
            bias_multiplier = hmarray(
                (1, np.prod(symbol_table[sinks[0].name].shape[2:])))
            bias_multiplier.fill(1.0)
            bias_multiplier.sync_ocl()

            im2col_global_size = channels * height_col * width_col

            im2col = Template("""
    void im2col(float* data_im, float* data_col, int bot_offset) {
      #pragma omp parallel for
      for (int index = 0; index < $global_size; index++) {
        int h_index = index / $width_col;
        int w_out = index - h_index * $width_col;
        int channel_in = h_index / $height_col;
        int h_out = h_index - channel_in * $height_col;
        int channel_out = channel_in * $kernel_h * $kernel_w;
        int h_in = h_out * $stride_h - $pad_h;
        int w_in = w_out * $stride_w - $pad_w;
        float* data_col_ptr = data_col;
        data_col_ptr += (channel_out * $height_col + h_out) * $width_col + w_out;
        const float* data_im_ptr = data_im + bot_offset;
        data_im_ptr += (channel_in * $height + h_in) * $width + w_in;
        for (int i = 0; i < $kernel_h; ++i) {
          for (int j = 0; j < $kernel_w; ++j) {
            int h = h_in + i;
            int w = w_in + j;
            *data_col_ptr = (h >= 0 && w >= 0 && h < $height && w < $width) ?
                data_im_ptr[i * $width + j] : 0;
            data_col_ptr += $height_col * $width_col;
          }
        }
      }
    }
    """).substitute(global_size=im2col_global_size,
                    stride_h=stride_h,
                    stride_w=stride_w,
                    pad_h=pad_h,
                    pad_w=pad_w,
                    kernel_h=kernel_h,
                    kernel_w=kernel_w,
                    width=width,
                    height=height,
                    height_col=height_col,
                    width_col=width_col)
            lib = hm_compile_and_load(im2col)
            im2col = lib.im2col

            class ConvLauncher(object):
                def __init__(self, sources, sinks):
                    self.sources = sources
                    self.sinks = sinks

                def compile(self):
                    pass

                def launch(self, symbol_table, wait_for):
                    bottom = symbol_table[sources[0].name]
                    bot_offset = np.prod(bottom.shape[1:])
                    weights = symbol_table[sources[1].name]
                    bias = symbol_table[sources[2].name]
                    top = symbol_table[sinks[0].name]
                    im2col.argtypes = tuple(
                        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
                        for p in [bottom, col_data]) + (ct.c_int, )
                    if len(weights.shape) > 2:
                        weights = weights.reshape(weights.shape[0],
                                                  np.prod(weights.shape[1:]))
                    for i in range(bottom.shape[0]):
                        im2col(bottom, col_data, i * bot_offset)
                        top[i] = weights.dot(col_data).reshape(
                            weights.shape[0], height_col, width_col)
                        top[i] += bias[:, np.newaxis, np.newaxis]

            return ConvLauncher(sources, sinks)
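Both CPU launchers (Examples #26 and #28) rebind im2col.argtypes from the live buffers on every launch. np.ctypeslib.ndpointer(dtype, ndim, shape) builds a ctypes argument type that validates dtype, rank, and shape at call time, so a stale or mis-shaped buffer raises a TypeError instead of silently corrupting memory. A self-contained sketch of that binding pattern (bind_im2col and the lib handle are illustrative stand-ins):

import numpy as np
import ctypes as ct

def bind_im2col(lib, data_im, data_col):
    # Attach dtype/shape-checked argtypes to the compiled im2col symbol.
    fn = lib.im2col
    fn.argtypes = tuple(
        np.ctypeslib.ndpointer(p.dtype, p.ndim, p.shape)
        for p in (data_im, data_col)) + (ct.c_int, )
    fn.restype = None
    return fn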